Currently I am working on porting a molecular dynamics simulation program, which was written in plain CPU-only C++, to CUDA. In short, the program initialises a list of atoms, transfers control to an object of class CCalc which calculates atomic forces, velocities and positions for 100 (or some other number of) iterations, and finally returns to draw the atoms on the screen.
My goal is to have all compute-heavy functions in CCalc run on the GPU. To avoid copying all the calculation constants in CCalc one by one, I decided to copy the whole class to device memory, pointed to by this__d. Since the drawing function is called from the CPU, the atom list needs to be copied between CPU and GPU every 100 iterations, and as such is not a member of CCalc.
In the function CCalc::refreshCellLists(), I want to rearrange atom__d (the atom list residing in device memory) such that all atoms in the same cell are grouped together. In other words, atom__d needs to be sorted with cellId as keys.
As I don't want to waste time implementing my own sorting algorithm, I tried using thrust::sort_by_key(). And here's where I got stuck. The function thrust::sort_by_key() requires device_ptr objects as arguments; however, I cannot access cellId since I can only cast this__d to device_ptr, which I can't dereference on the CPU.
Is there a way to do this without having to break down the "class on gpu" structure?
Here is (an excerpt of) my code:
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "device_functions.h"
#include <vector>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#define REFRESH_CELL_LISTS 20
struct Atom
{
float pos[3];
float vel[3];
float force[3];
// others
};
std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);
class CCalc
{
private:
CCalc *this__d;
public:
const int nAtoms = noOfAtoms;
int *cellId;
const int nCellX = 4, nCellY = 3;
// many force calculation constants
CCalc()
{
cudaMalloc((void**)&cellId, nAtoms*sizeof(int));
// some other stuff
cudaMalloc((void**)&this__d, sizeof(CCalc));
cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
}
// destructor
void relaxStructure(int numOfIterations)
{
cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);
for(int iter = 0; iter < numOfIterations; iter++)
{
// stuff
if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
// calculate forces; update velocities and positions
}
cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
cudaFree(atom__d);
}
// functions for force, velocity and position calculation
void refreshCellLists()
{
makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
cudaDeviceSynchronize();
// sort atom__d array using cellId as keys;
// here is where I would like to use thrust::sort_by_key()
}
};
__global__ void makeCells(CCalc *C, Atom *A)
{
int index = blockDim.x*blockIdx.x + threadIdx.x;
if(index < C->nAtoms)
{
// determine cell x, y based on position
// for now let's use an arbitrary mapping to obtain x, y
int X = (index * index) % C->nCellX;
int Y = (index * index) % C->nCellY;
C->cellId[index] = X + Y * C->nCellX;
}
}
int main()
{
cudaSetDevice(0);
noOfAtoms = 1000; // normally defined by input file
atom.resize(noOfAtoms);
// initialise atom positions, velocities and forces
CCalc calcObject;
while(true) // as long as we need
{
// draw atoms on screen
calcObject.relaxStructure(100);
}
}
Thank you very much.
In other words, atom__d needs to be sorted with cellId as keys.
It should be possible to do that, at the point you indicated in the refreshCellLists method. For simplicity, I have chosen to use the raw device pointers directly (although we could just as easily wrap these raw device pointers in thrust::device_ptr) combined with the thrust::device execution policy. Here is a worked example:
$ cat t1156.cu
#include <vector>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#define REFRESH_CELL_LISTS 20
struct Atom
{
float pos[3];
float vel[3];
float force[3];
// others
};
std::vector<Atom> atom;
Atom *atom__d;
int noOfAtoms = 0;
class CCalc;
__global__ void makeCells(CCalc *C, Atom *A);
class CCalc
{
private:
CCalc *this__d;
public:
const int nAtoms = noOfAtoms;
int *cellId;
const int nCellX = 4, nCellY = 3;
// many force calculation constants
CCalc()
{
cudaMalloc((void**)&cellId, nAtoms*sizeof(int));
// some other stuff
cudaMalloc((void**)&this__d, sizeof(CCalc));
cudaMemcpy(this__d, this, sizeof(CCalc), cudaMemcpyHostToDevice);
}
// destructor
void relaxStructure(int numOfIterations)
{
cudaMalloc((void**)&atom__d, nAtoms*sizeof(Atom));
cudaMemcpy(atom__d, &atom[0], nAtoms*sizeof(Atom), cudaMemcpyHostToDevice);
for(int iter = 0; iter < numOfIterations; iter++)
{
// stuff
if(!(iter % REFRESH_CELL_LISTS)) refreshCellLists();
// calculate forces; update velocities and positions
}
cudaMemcpy(&atom[0], atom__d, nAtoms*sizeof(Atom), cudaMemcpyDeviceToHost);
cudaFree(atom__d);
}
// functions for force, velocity and position calculation
void refreshCellLists()
{
makeCells<<<(nAtoms + 31) / 32, 32>>>(this__d, atom__d);
cudaDeviceSynchronize();
// sort atom__d array using cellId as keys;
thrust::sort_by_key(thrust::device, cellId, cellId+nAtoms, atom__d);
}
};
__global__ void makeCells(CCalc *C, Atom *A)
{
int index = blockDim.x*blockIdx.x + threadIdx.x;
if(index < C->nAtoms)
{
// determine cell x, y based on position
// for now let's use an arbitrary mapping to obtain x, y
int X = (index * index) % C->nCellX;
int Y = (index * index) % C->nCellY;
C->cellId[index] = X + Y * C->nCellX;
}
}
int main()
{
cudaSetDevice(0);
noOfAtoms = 1000; // normally defined by input file
atom.resize(noOfAtoms);
// initialise atom positions, velocities and forces
CCalc calcObject;
for (int i = 0; i < 100; i++) // as long as we need
{
// draw atoms on screen
calcObject.relaxStructure(100);
}
}
$ nvcc -std=c++11 -o t1156 t1156.cu
$ cuda-memcheck ./t1156
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$
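For completeness, the thrust::device_ptr wrapping mentioned above would amount to a small change inside refreshCellLists(); a sketch of the equivalent sort call:
// Equivalent sort, wrapping the raw device pointers in thrust::device_ptr
thrust::device_ptr<int>  d_keys(cellId);
thrust::device_ptr<Atom> d_vals(atom__d);
thrust::sort_by_key(d_keys, d_keys + nAtoms, d_vals);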
When building Thrust code, especially on Windows, I usually make a set of recommendations, as summarized here.
Related
I am rewriting a library that performs calculations and other operations on data stored in contiguous chunks of memory, so that it can work on GPUs using the CUDA framework. The data represents information that lives on a 4-dimensional grid. The total size of the grid can range from thousands to millions of grid points. Along each direction, the grid may have as few as 8 or as many as several hundred points. My question is about the best way to implement operations on a subset of the grid. For example, suppose that my grid is [0,nx)x[0,ny)x[0,nz)x[0,nq), and I want to implement a transformation that multiplies all the points whose indices belong to [1,nx-1)x[1,ny-1)x[1,nz-1)x[0,nq-1) by minus 1.
Right now, I do this via nested loops. Here is a skeleton of the code:
{
int nx,ny,nz,nq;
nx=10,ny=10,nz=10,nq=10;
typedef thrust::device_vector<double> Array;
Array A(nx*ny*nz*nq);
thrust::fill(A.begin(), A.end(), (double) 1);
for (auto q=1; q<nq-1; ++q){
for (auto k=1; k<nz-1; ++k){
for (auto j=1; j<ny-1; ++j){
int offset1=+1+j*nx+k*nx*ny+q*nx*ny*nz;
int offset2=offset1+nx-2;
thrust::transform(A.begin()+offset1,
A.begin()+offset2,
A.begin()+offset1,
thrust::negate<double>());
}
}
}
}
However, I wonder if this is the most efficient way, because it seems to me that in this case at most nx-2 threads can run simultaneously. So I was thinking that perhaps a better way would be to generate a sequence iterator (returning the linear position along the array), zip it to the array with a zip iterator, and define a functor that examines the second element of the tuple (the position value) and, if that value falls into the accepted range, modifies the first element of the tuple. However, there may be a better way to do that. I am new to CUDA, and to make matters worse I really cut my teeth on Fortran, so it is hard for me to think outside the for-loop box...
I'm not sure what the most efficient way is. I can suggest what I think will be more efficient than your skeleton code.
Your proposal in the text is headed in the right direction. Rather than use a set of nested for-loops that will iterate potentially quite a few times, we should seek to get everything done in one thrust call. But we still need to have that one thrust call only modify the array values at the indices within the "cubic" volume to be operated on.
We don't want to use a method involving testing of a generated index against the valid index volume, however, as you seem to be suggesting. This would require us to launch a grid as large as our array, even if we only wanted to modify a small volume of it.
Instead, we launch an operation that is just large enough to cover the needed number of elements to modify, and we create a functor which does a linear index -> 4D index -> adjusted linear index conversion. That functor then operates within a transform iterator to convert an ordinary linear sequence starting at 0, 1, 2, etc. to a sequence that starts and stays within the volume to be modified. A permutation iterator is then used with this modified sequence to select the values of the array to modify.
Here's an example showing the difference in timing for your nested loop method (1) vs. mine (2) for an array of 64x64x64x64 and a modified volume of 62x62x62x62:
$ cat t39.cu
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/functional.h>
#include <thrust/equal.h>
#include <cassert>
#include <iostream>
struct my_idx
{
int nx, ny, nz, nq, lx, ly, lz, lq, dx, dy, dz, dq;
my_idx(int _nx, int _ny, int _nz, int _nq, int _lx, int _ly, int _lz, int _lq, int _hx, int _hy, int _hz, int _hq) {
nx = _nx;
ny = _ny;
nz = _nz;
nq = _nq;
lx = _lx;
ly = _ly;
lz = _lz;
lq = _lq;
dx = _hx - lx;
dy = _hy - ly;
dz = _hz - lz;
dq = _hq - lq;
// could do a lot of assert checking here
}
__host__ __device__
int operator()(int idx){
int rx = idx / dx;
int ix = idx - (rx * dx);
int ry = rx / dy;
int iy = rx - (ry * dy);
int rz = ry / dz;
int iz = ry - (rz * dz);
int rq = rz / dq;
int iq = rz - (rq * dq);
return (((iq+lq)*nz+iz+lz)*ny+iy+ly)*nx+ix+lx;
}
};
#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL
unsigned long long dtime_usec(unsigned long long start){
timeval tv;
gettimeofday(&tv, 0);
return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}
int main()
{
int nx,ny,nz,nq,lx,ly,lz,lq,hx,hy,hz,hq;
nx=64,ny=64,nz=64,nq=64;
lx=1,ly=1,lz=1,lq=1;
hx=nx-1,hy=ny-1,hz=nz-1,hq=nq-1;
thrust::device_vector<double> A(nx*ny*nz*nq);
thrust::device_vector<double> B(nx*ny*nz*nq);
thrust::fill(A.begin(), A.end(), (double) 1);
thrust::fill(B.begin(), B.end(), (double) 1);
// method 1
unsigned long long m1_time = dtime_usec(0);
for (auto q=lq; q<hq; ++q){
for (auto k=lz; k<hz; ++k){
for (auto j=ly; j<hy; ++j){
int offset1=lx+j*nx+k*nx*ny+q*nx*ny*nz;
int offset2=offset1+(hx-lx);
thrust::transform(A.begin()+offset1,
A.begin()+offset2, A.begin()+offset1,
thrust::negate<double>());
}
}
}
cudaDeviceSynchronize();
m1_time = dtime_usec(m1_time);
// method 2
unsigned long long m2_time = dtime_usec(0);
auto p = thrust::make_permutation_iterator(B.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), my_idx(nx, ny, nz, nq, lx, ly, lz, lq, hx, hy, hz, hq)));
thrust::transform(p, p+(hx-lx)*(hy-ly)*(hz-lz)*(hq-lq), p, thrust::negate<double>());
cudaDeviceSynchronize();
m2_time = dtime_usec(m2_time);
if (thrust::equal(A.begin(), A.end(), B.begin()))
std::cout << "method 1 time: " << m1_time/(float)USECPSEC << "s method 2 time: " << m2_time/(float)USECPSEC << "s" << std::endl;
else
std::cout << "mismatch error" << std::endl;
}
$ nvcc -std=c++11 t39.cu -o t39
$ ./t39
method 1 time: 1.6005s method 2 time: 0.013182s
$
I'm working on programming my own little game which should have a visibility effect as described here. My world consists of Polygons which each have a list of Edges (sorted CW). I now want (as described in the article) to cast Rays towards the Edges of the polygons, find the intersections and retrieve a Polygon that defines the visible area.
So I wrote classes for Vectors, Points, Edges and Polygons and adjusted the intersection algorithm so it works with my code.
I then tested it and everything worked fine, but as I ran the intersection algorithm in a for-loop to simulate a large number of Edges being processed (starting with 100, up to 1000), the fps dropped drastically: with 100 Edges it was "only" 300 fps (3000 before), and with 300 it dropped below 60, I think. This seems like far too much of a drop to me, as I want to reuse this code for my light sources, and then I think I would quickly end up processing far more than 300 Edges, and it should run fast on much less powerful processors (I have a Xeon e1230v3).
I figured out that the program runs many times faster when I only call the single edge intersection (rather than looping over all Edges), but I definitely need to loop through the Edges in my polygons, so this is not an option.
My Source-Code:
Vector.h/.cpp: Basic Vector class with two floats (X, Y), getters & setters, rotation
Vertex.h/.cpp: Basic Point class with a position Vector, getters & setters, and a boolean that indicates whether it is an intersection Vertex
Edge.h/.cpp: Basic Edge class with start/end vertices, getters & setters, and a rotation function (uses Vector.rotate())
Polygon.h:
#pragma once
#include <vector>
#include "Edge.h"
namespace geo
{
class Polygon
{
private:
std::vector<Edge> edges;
public:
Polygon();
Polygon(std::vector<Edge> edges);
~Polygon();
std::vector<Edge> getEdges();
Edge getEdge(int index);
int getEdgeCount();
void setEdges(std::vector<Edge> edges);
void setEdge(Edge e, int index);
void addEdge(Edge e);
void removeEdge(int index);
};
}
Ray.h:
#pragma once
#include "Vertex.h"
class Ray
{
private:
geo::Vertex origin;
geo::Vector dir;
public:
Ray();
Ray(geo::Vertex origin, geo::Vector dir);
~Ray();
geo::Vertex getOrigin();
geo::Vector getDirection();
void setOrigin(geo::Vertex origin);
void setDirection(geo::Vector dir);
};
LightModule.h:
#pragma once
#include "Polygon.h"
#include "Ray.h"
class LightModule
{
private:
//List of blocking Polygons
std::vector<geo::Polygon>* blockingPolygons;
std::vector<Ray> rays;
geo::Polygon bounds;
geo::Polygon visible;
/*geo::Polygon blocked;*/
//HitDetection Class later
geo::Vertex getIntersection(Ray r, geo::Edge* e);
geo::Vertex getClosestIntersection(Ray r, geo::Polygon *p);
public:
LightModule();
LightModule(std::vector<geo::Polygon>* blockingPolygons);
~LightModule();
//Set the Blocking Polygons
void setBlockingPolygons(std::vector<geo::Polygon>* blockingPolygons);
geo::Vertex callCI(Ray r, geo::Polygon* p);
geo::Vertex callI(Ray r, geo::Edge* e);
//Cast Rays towards vertices and store them in rays
void updateRays();
//Update Visibility Polygon
void updateVisible();
//Return Visibility Polygon
geo::Polygon* getVisible();
};
LightModule.cpp:
#include "LightModule.h"
LightModule::LightModule()
{
rays.clear();
}
LightModule::LightModule(std::vector<geo::Polygon>* blockingPolygons)
{
this->blockingPolygons = blockingPolygons;
rays.clear();
}
LightModule::~LightModule()
{
}
void LightModule::setBlockingPolygons(std::vector<geo::Polygon>* blockingPolygons)
{
this->blockingPolygons = blockingPolygons;
}
//Test-cast a Ray (will follow mouse in the Test)
void LightModule::updateRays()
{
Ray r(geo::Vertex(geo::Vector(200, 100)), geo::Vector(-100, 0));
rays.push_back(r);
}
void LightModule::updateVisible()
{
}
//Both for testing; will later be part of a separate class
geo::Vertex LightModule::callCI(Ray r, geo::Polygon *p)
{
return this->getClosestIntersection(r, p);
}
geo::Vertex LightModule::callI(Ray r, geo::Edge* e)
{
return this->getIntersection(r, e);
}
//TEST
geo::Vertex LightModule::getIntersection(Ray r, geo::Edge* e)
{
geo::Vertex v;
v.setIntersectVert(false);
float r_px = r.getOrigin().getPosition().getX();
float r_py = r.getOrigin().getPosition().getY();
float r_dx = r.getDirection().getX();
float r_dy = r.getDirection().getY();
float s_px = e->getOrigin().getPosition().getX();
float s_py = e->getOrigin().getPosition().getY();
float s_dx = e->getDirection().getX();
float s_dy = e->getDirection().getY();
float r_mag = sqrt(r_dx*r_dx + r_dy*r_dy);
float s_mag = sqrt(s_dx*s_dx + s_dy*s_dy);
if (r_dx / r_mag == s_dx / s_mag && r_dy / r_mag == s_dy / s_mag)
{
return v;
}
float T2 = (r_dx*(s_py - r_py) + r_dy*(r_px - s_px)) / (s_dx*r_dy - s_dy*r_dx);
float T1 = (s_px + s_dx*T2 - r_px) / r_dx;
if (T1 < 0 /*|| T1 > 1 For Lines*/)
{
return v;
}
if (T2 < 0 || T2 > 1)
{
return v;
}
v.setIntersectVert(true);
v.setPosition(geo::Vector(r_px + r_dx*T1, r_py + r_dy*T1));
return v;
}
geo::Vertex LightModule::getClosestIntersection(Ray r, geo::Polygon *p)
{
geo::Vertex v;
v.setIntersectVert(false);
geo::Vertex v_nearest(geo::Vector(0, 0));
v_nearest.setIntersectVert(false);
geo::Vector h1;
geo::Vector h2;
for (int i = 0; i < p->getEdges().size(); i++)
{
v = this->getIntersection(r, &p->getEdges().at(i));
h1.setX(v.getPosition().getX() - r.getOrigin().getPosition().getX());
h1.setY(v.getPosition().getY() - r.getOrigin().getPosition().getY());
h2.setX(v_nearest.getPosition().getX() - r.getOrigin().getPosition().getX());
h2.setY(v_nearest.getPosition().getY() - r.getOrigin().getPosition().getY());
if (i < 1)
v_nearest = v;
else if (v.isIntersectVert() == true && h1.getLength() < h2.getLength())
{
v_nearest = v;
}
}
return v_nearest;
}
For testing I create a Polygon and a LightModule, call updateRays(), and then call the helper function callCI().
I know my code gets pretty messy when I have to cascade my getters and setters; I'll have to fix that, but for the rest I hope everything is understandable, and if not, feel free to ask. And just to have mentioned it, I test-draw my objects with vertex arrays, but I don't need graphical output of the intersection process; I just need the visible polygon.
Just to point it out again: I need a faster way of finding the intersection point between a Ray and a Polygon, and as I didn't know whether I did something wrong in my code, I posted it all here so someone can maybe help me make my code more efficient or show me a different method to solve my problem.
Have a nice day and thank you for your answers :)
Paul
EDIT: Would it be meaningfully faster to first triangulate my polygons and then do a Ray-Triangle intersection Test?
I can't speak to the algorithm (which is possibly what you need), but here are some immediate thoughts on speeding up what you have.
First off, you can define all your getters and setters inline (put them in the class in the header, not in the separate source file) so the compiler can optimize the function calls away.
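For example, a header-only geo::Vector might look roughly like this (a sketch only; the member names m_x/m_y and the constructors are assumptions, not your actual code):
#include <cmath>
namespace geo
{
// Vector.h: defining the accessors in the class body lets the compiler inline them
class Vector
{
public:
    Vector() : m_x(0.0f), m_y(0.0f) {}
    Vector(float x, float y) : m_x(x), m_y(y) {}
    float getX() const { return m_x; }
    float getY() const { return m_y; }
    void setX(float x) { m_x = x; }
    void setY(float y) { m_y = y; }
    float getLength() const { return std::sqrt(m_x * m_x + m_y * m_y); }
private:
    float m_x, m_y; // assumed member names
};
}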
Then these changes might buy you a few frames:
// make sure your getters and setters are inline so the compiler
// can optimize them away
geo::Vertex LightModule::getClosestIntersection(Ray r, geo::Polygon* p)
{
geo::Vertex v;
v.setIntersectVert(false);
geo::Vector h1;
geo::Vector h2;
// cache these
geo::Vector ray_position = r.getOrigin().getPosition();
geo::Vertex v_nearest(geo::Vector(0, 0));
v_nearest.setIntersectVert(false);
// cache size (don't dereference each time)
size_t size = p->getEdges().size();
// avoid acces violation
if(!size)
return v_nearest;
// preset item 0
v_nearest = this->getIntersection(r, &p->getEdges()[0]);
// start from 1 not 0
for(size_t i = 1; i < size; i++)
{
// don't use at() its slower
// v = this->getIntersection(r, &p->getEdges().at(i));
v = this->getIntersection(r, &p->getEdges()[i]);
// used cached ray position rather than call functions
h1.setX(v.getPosition().getX() - ray_position.getX());
h1.setY(v.getPosition().getY() - ray_position.getY());
h2.setX(v_nearest.getPosition().getX() - ray_position.getX());
h2.setY(v_nearest.getPosition().getY() - ray_position.getY());
// this if not needed because presetting item 0
//if(i < 1)
// v_nearest = v;
if(v.isIntersectVert() == true && h1.getLength() < h2.getLength())
{
v_nearest = v;
}
}
return v_nearest;
}
I removed one of the if statements by calculating item 0 before the loop and starting the loop from 1; the rest is just caching a much-used value and avoiding at(), which is slower because it does bounds checking.
I wrote an OpenCL kernel in which I initialise all the elements of a 3D array to i*i*i + j*j*j. I'm now having problems creating a grid of threads to do the initialisation of the elements (concurrently). I know that the code I have now only uses 3 threads; how can I expand on that?
Please help. I'm new to OpenCL, so any suggestion or explanation might be handy. Thanks!
This is the code:
__kernel void initialize (
int X;
int Y;
int Z;
__global float *A) {
// Get global position in X direction
int dirX = get_global_id(0);
// Get global position in Y direction
int dirY = get_global_id(1);
// Get global position in Z direction
int dirZ = get_global_id(2);
int A[2000][100][4];
int i,j,k;
for (i=0;i<2000;i++)
{
for (j=0;j<100;j++)
{
for (k=0;k<4;k++)
{
A[dirX*X+i][dirY*Y+j][dirZ*Z+k] = i*i*i + j*j*j;
}
}
}
}
You create the buffer to store your output 'A' in the calling (host) code. This is passed to your kernel as a pointer, which is correct in your function definition above. However, you don't need to declare it again inside your kernel function, so remove the line int A[2000][100][4];.
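For reference, a host-side sketch of that buffer creation and binding (the names context and kernel are assumptions, and error handling is omitted); this binds the buffer as argument 0 of the simplified kernel shown below:
// Host code: create the output buffer for A and bind it to the kernel.
// Assumes a valid cl_context 'context' and cl_kernel 'kernel' already exist.
cl_int err;
size_t bytes = 2000 * 100 * 4 * sizeof(float);
cl_mem bufA = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, &err);
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufA);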
You can simplify the code greatly. Using the 3D global ID to indicate the 3D index into the array for each work-item, you could change the loop as follows (assuming that for a given i and j, all elements along Z should have the same value):
__kernel void initialize (__global float* A) {
// cast required so that kernel compiler knows the array dimensions
__global float (*a)[2000][100][4] = (__global float (*)[2000][100][4])A;
// Get global position in X direction
int i = get_global_id(0);
// Get global position in Y direction
int j = get_global_id(1);
// Get global position in Z direction
int k = get_global_id(2);
(*a)[i][j][k] = i*i*i + j*j*j;
}
In your calling code you would then enqueue the kernel with a global work-size of 2000x100x4.
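A sketch of the corresponding launch (assuming a cl_command_queue named queue; the local work-size is left to the runtime):
// Enqueue the kernel over a 3D global range of 2000 x 100 x 4 work-items
size_t global_work_size[3] = {2000, 100, 4};
cl_int err = clEnqueueNDRangeKernel(queue, kernel,
                                    3,                 // work_dim
                                    NULL,              // no global offset
                                    global_work_size,
                                    NULL,              // local size chosen by the runtime
                                    0, NULL, NULL);    // no event wait list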
In practice this is a lot of work-items to schedule, so you would likely get better performance from a one-dimensional global work-size of 2000 and a loop inside the kernel, e.g.:
__kernel void initialize (__global float* A) {
// cast required so that kernel compiler knows the array dimensions
__global float (*a)[2000][100][4] = (__global float (*)[2000][100][4])A;
// Get global position in X direction
int i = get_global_id(0);
for (int j = 0; j < 100; j++) {
for (int k = 0; k < 4; k++) {
(*a)[i][j][k] = i*i*i + j*j*j;
}
}
}
My code works in my purely GLUT-based implementation, but I am trying to get it to work in Qt.
I have a vector of masspoints for a wire mesh system:
std::vector<masspoint> m_particles;
The problem is that in my Qt version none of what I write really sticks, and I am left with an array of zeros. Basically, I am confused about why the GLUT version has correct values but the Qt one does not, given that it is basically identical code. What is wrong with the Qt code?
Yes, I only see zeros when using qDebug. When I call my drawing function in the Qt version, all vertex points turn out to be 0 in all components, so nothing is drawn.
int myboog = 1;
int county = 0;
// Constructors
Cloth::Cloth(float width, float height, int particles_in_width, int particles_in_height):
m_width(particles_in_width),
m_height(particles_in_height),
m_dimensionWidth(width),
m_dimensionHeight(height),
m_distanceX(width/(float)particles_in_width),
m_distanceY(height/(float)particles_in_height)
{
//Set the particle array to the given size
//Height by width
//m_particles is the name of our vector
m_particles.resize(m_width*m_height);
qDebug() << m_particles.size();
// Create the point masses to simulate the cloth
for (int x = 0; x < m_width; ++x)
{
for (int y=0; y < m_height; ++y)
{
// Place the pointmass of the cloth, lift the edges to give the wind more effect as the cloth falls
Vector3f position = Vector3f(m_dimensionWidth * (x / (float)m_width),
((x==0)||(x==m_width-1)||(y==0)||(y==m_height-1)) ? m_distanceY/2.0f:0,
m_dimensionHeight * (y / (float)m_height));
// The gravity effect is applied to new pmasspoints
m_particles[y * m_width + x] = masspoint(position,Vector3f(0,-0.06,0));
}
}
int num = (int)m_particles.size();
for (int i=0; i<num; ++i)
{
masspoint* p = &m_particles[i];
if(myboog)
{
qDebug() << "test " << *p->getPosition().getXLocation() << county;
county++;
}
}
myboog = 0;
// Calculate the normals for the first time so the initial draw is correctly lit
calculateClothNormals();
}
Code for masspoint involved in the constructor for Cloth:
#ifndef MASSPOINT_H
#define MASSPOINT_H
#include <QGLWidget>
#include "vector3f.h"
class masspoint
{
private:
Vector3f m_position; // Current Location of the pointmass
Vector3f m_velocity; // Direction and speed the pointmass is traveling in
Vector3f m_acceleration; // Speed at which the pointmass is accelerating (used for gravity)
Vector3f m_forceAccumulated; // Force that has been accumulated since the last update
Vector3f m_normal; // Normal of this pointmass, used to light the cloth when drawing
float m_damping; // Amount of velocity lost per update
bool m_stationary; // Whether this pointmass is currently capible of movement
public:
masspoint& operator= (const masspoint& particle);
//Some constructors
masspoint();
masspoint(const masspoint& particle);
masspoint(Vector3f position, Vector3f acceleration);
//Like Euler integration
void integrate(float duration);
// Accessor functions
//Get the position of the point mass
inline Vector3f getPosition() const {return m_position;}
Vector stuff involved in the constructor for Cloth:
#ifndef VECTOR3F_H
#define VECTOR3F_H
#include <math.h>
// Vector library to be used
class Vector3f
{
private:
float m_x, m_y, m_z;
public:
const float* getXLocation() const { return &m_x; }
I need to create a 2D int array of size 800x800. But doing so creates a stack overflow (ha ha).
I'm new to C++, so should I do something like a vector of vectors? And just encapsulate the 2d array into a class?
Specifically, this array is my zbuffer in a graphics program. I need to store a z value for every pixel on the screen (hence the large size of 800x800).
Thanks!
You need about 2.5 MB, so just using the heap should be fine. You don't need a vector unless you need to resize it. See the C++ FAQ Lite for an example of using a "2D" heap array.
int *array = new int[800*800];
(Don't forget to delete[] it when you're done.)
Every post so far leaves the memory management to the programmer. This can and should be avoided. ReaperUnreal is darn close to what I'd do, except I'd use a vector rather than an array, make the dimensions template parameters, and change the access functions -- and, oh, just IMNSHO clean things up a bit:
template <class T, size_t W, size_t H>
class Array2D
{
public:
const int width = W;
const int height = H;
typedef T type;
Array2D()
: buffer(width*height)
{
}
inline type& at(unsigned int x, unsigned int y)
{
return buffer[y*width + x];
}
inline const type& at(unsigned int x, unsigned int y) const
{
return buffer[y*width + x];
}
private:
std::vector<T> buffer;
};
Now you can allocate this 2-D array on the stack just fine:
void foo()
{
Array2D<int, 800, 800> zbuffer;
// Do something with zbuffer...
}
I hope this helps!
EDIT: Removed array specification from Array2D::buffer. Thanks to Andreas for catching that!
Kevin's example is good, however:
std::vector<T> buffer[width * height];
Should be
std::vector<T> buffer;
Expanding it a bit, you could of course add operator overloads instead of the at() functions:
const T &operator()(int x, int y) const
{
return buffer[y * width + x];
}
and
T &operator()(int x, int y)
{
return buffer[y * width + x];
}
Example:
int main()
{
Array2D<int, 800, 800> a;
a(10, 10) = 50;
std::cout << "A(10, 10)=" << a(10, 10) << std::endl;
return 0;
}
You could do a vector of vectors, but that would have some overhead. For a z-buffer the more typical method would be to create an array of size 800*800=640000.
const int width = 800;
const int height = 800;
unsigned int* z_buffer = new unsigned int[width*height];
Then access the pixels as follows:
unsigned int z = z_buffer[y*width+x];
I might create a single-dimension array of 800*800. It is probably more efficient to use a single allocation like this, rather than allocating 800 separate vectors.
int *ary=new int[800*800];
Then I'd probably encapsulate that in a class that acts like a 2D array.
class _2DArray
{
public:
_2DArray(int *a) : ary(a) {}
int *operator[](const size_t &idx)
{
return &ary[idx*800];
}
const int *operator[](const size_t &idx) const
{
return &ary[idx*800];
}
private:
int *ary;
};
The abstraction shown here has a lot of holes, e.g., what happens if you access past the end of a "row"? The book "Effective C++" has a pretty good discussion of writing good multidimensional arrays in C++.
One thing you can do is change the stack size (if you really want the array on the stack); with VC the flag to do this is /F (http://msdn.microsoft.com/en-us/library/tdkhxaks(VS.80).aspx).
But the solution you probably want is to put the memory in the heap rather than on the stack, for that you should use a vector of vectors.
The following line declares a vector of 800 elements, each of which is a vector of 800 ints, and saves you from managing the memory manually.
std::vector<std::vector<int> > arr(800, std::vector<int>(800));
Note the space between the two closing angle brackets (> >), which is required in order to disambiguate it from the right-shift operator (the space will no longer be needed in C++0x).
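Access then reads like an ordinary 2-D array; a minimal usage sketch:
arr[10][20] = 5;        // arr[row][column]
int z = arr[10][20];    // z == 5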
Or you could try something like:
boost::shared_array<int> zbuffer(new int[width*height]);
You should still be able to do this too:
++zbuffer[0];
No more worries about managing the memory, no custom classes to take care of, and it's easy to throw around.
There's the C-like way of doing it:
const int xwidth = 800;
const int ywidth = 800;
int* array = (int*) new int[xwidth * ywidth];
// Check array is not NULL here and handle the allocation error if it is
// Then do stuff with the array, such as zero initialize it
for(int x = 0; x < xwidth; ++x)
{
for(int y = 0; y < ywidth; ++y)
{
array[y * xwidth + x] = 0;
}
}
// Just use array[y * xwidth + x] when you want to access your array.
// When you're done with it, free the memory you allocated with
delete[] array;
You could encapsulate the y * xwidth + x inside a class with easy get and set methods (possibly with overloading of the [] operator if you want to start getting into more advanced C++). I'd recommend getting to this slowly, though, if you're just starting with C++, rather than starting out by creating reusable, fully templated classes for n-dimensional arrays, which will just confuse you when you're starting off.
As soon as you get into graphics work, you might find that the overhead of the extra class calls slows down your code. However, don't worry about this until your application isn't fast enough and you can profile it to show where the time is lost, rather than making it harder to use from the start with possibly unnecessary complexity.
I found the C++ FAQ Lite great for information such as this. In particular, your question is answered by:
http://www.parashift.com/c++-faq-lite/freestore-mgmt.html#faq-16.16
You can allocate the array in static storage (at file scope, or with the static qualifier at function scope) if you only need one instance.
int array[800][800];
void fn()
{
static int array[800][800];
}
This way it will not go on the stack, and you don't have to deal with dynamic memory.
Well, building on what Niall Ryan started, if performance is an issue, you can take this one step further by optimizing the math and encapsulating this into a class.
So we'll start with a bit of math. Recall that 800 can be written in powers of 2 as:
800 = 512 + 256 + 32 = 2^9 + 2^8 + 2^5
So we can write our addressing function as:
int index = (y << 9) + (y << 8) + (y << 5) + x;
So if we encapsulate everything into a nice class we get:
class ZBuffer
{
public:
static const int width = 800;
static const int height = 800;
ZBuffer()
{
for(unsigned int i = 0, *pBuff = zbuff; i < width * height; i++, pBuff++)
*pBuff = 0;
}
inline unsigned int getZAt(unsigned int x, unsigned int y)
{
return *(zbuff + (y << 9) + (y << 8) + (y << 5) + x);
}
inline void setZAt(unsigned int x, unsigned int y, unsigned int z)
{
*(zbuff + (y << 9) + (y << 8) + (y << 5) + x) = z;
}
private:
unsigned int zbuff[width * height];
};
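Usage is then straightforward. One caveat: the 800x800 buffer lives inside the object (about 2.5 MB), so a local ZBuffer variable would run into the same stack-overflow problem as the original array; a small sketch:
// Allocate the ZBuffer itself on the heap (or make it static), since the
// pixel buffer is embedded in the object.
ZBuffer *zb = new ZBuffer();
zb->setZAt(10, 20, 42);
unsigned int z = zb->getZAt(10, 20);   // z == 42
delete zb;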