Variable gets lost after allocating array of structs in cuda - c++

I have a structure containing arrays of structures in C, and I need a copy of it on the GPU. For that I am writing a function that performs a series of cudaMalloc and cudaMemcpy calls for the variables in the struct, from host to device.
A simple version (the real one has various structs and variables/arrays inside) of the struct is:
struct Node {
float* position;
};
struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
My problem is that I must be doing something wrong in the memory allocation and copy of the struct. When I copy the variables within Graph, I can see that they are properly copied (by accessing them in a kernel as in the example below). For example, I can check that graph.nBoundary=3.
However, I only see this if I do not allocate and copy the memory of Node *. If I do, I get -858993460 instead of 3. Interestingly, Node * is not wrongly allocated: I can inspect the value of, say, graph.node[0].position[0] and it has the correct value.
This only happens with graph.nBoundary. All the other variables keep their correct numerical values, but this one gets corrupted when running the cudaMemcpy of the Node*.
What am I doing wrong and why does this happen? How do I fix it?
Let me know if you need more information.
MCVE:
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include <cuda.h>
// A point, part of some elements
struct Node {
float* position;
};
struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void testKernel(Graph* graph,unsigned int * d_res){
d_res[0] = graph->nBoundary;
};
int main()
{
// Generate some fake data on the CPU
Graph graph;
graph.node = (Node*)malloc(2 * sizeof(Node));
graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
for (int i = 0; i < 3; i++){
graph.boundary[i] = i + 10;
}
graph.nBoundary = 3;
graph.nNode = 2;
for (int i = 0; i < 2; i++){
// They can have different sizes in the original code
graph.node[i].position = (float*)malloc(3 * sizeof(float));
graph.node[i].position[0] = 45;
graph.node[i].position[1] = 1;
graph.node[i].position[2] = 2;
}
// allocate GPU memory
Graph * d_graph = cudaGraphMalloc(&graph);
// some dummy variables to test on GPU.
unsigned int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(unsigned int));
h_res = (unsigned int*)malloc(sizeof(unsigned int));
//Run kernel
testKernel << <1, 1 >> >(d_graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));
printf("%u\n", graph.nBoundary);
printf("%d", h_res[0]);
return 0;
}
Graph* cudaGraphMalloc(const Graph* inGraph){
Graph* outGraph;
gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
//copy constants
gpuErrchk(cudaMemcpy(&outGraph->nNode, &inGraph->nNode, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->nBoundary, &inGraph->nBoundary, sizeof(unsigned int), cudaMemcpyHostToDevice));
// copy boundary
unsigned int * d_auxboundary, *h_auxboundary;
h_auxboundary = inGraph->boundary;
gpuErrchk(cudaMalloc((void**)&d_auxboundary, inGraph->nBoundary*sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(d_auxboundary, h_auxboundary, inGraph->nBoundary*sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->boundary, d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));
//Create nodes
Node * auxnode;
gpuErrchk(cudaMalloc((void**)&auxnode, inGraph->nNode*sizeof(Node)));
// Create auxiliary pointers to grab them from host and pass them to device
float ** d_position, ** h_position;
d_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
h_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
for (int i = 0; i < inGraph->nNode; i++){
// Positions
h_position[i] = inGraph->node[i].position;
gpuErrchk(cudaMalloc((void**)&d_position[i], 3 * sizeof(float)));
gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&auxnode[i].position, d_position[i], sizeof(float *), cudaMemcpyDeviceToDevice));
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////// If I comment the following section, nBoundary can be read by the kernel
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
gpuErrchk(cudaMemcpy(&outGraph->node, auxnode, inGraph->nNode*sizeof(Node *), cudaMemcpyDeviceToDevice));
return outGraph;
}

The problem is in the function cudaGraphMalloc, where you are trying to allocate device memory to the members of outGraph, which has already been allocated on the device. In the process of doing so, you are dereferencing a device pointer on the host, which is illegal.
To allocate device memory for the members of a struct that itself lives on the device, we first have to create a temporary host variable of that struct type, allocate device memory for its members, and then copy the temporary struct to the one residing on the device.
I have answered a similar question here. Please take a look at it.
The fixed code may look like this:
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <cuda.h>
// A point, part of some elements
struct Node {
float* position;
};
struct Graph {
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void testKernel(Graph* graph, unsigned int * d_res) {
d_res[0] = graph->nBoundary;
};
int main()
{
// Generate some fake data on the CPU
Graph graph;
graph.node = (Node*)malloc(2 * sizeof(Node));
graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
for (int i = 0; i < 3; i++) {
graph.boundary[i] = i + 10;
}
graph.nBoundary = 3;
graph.nNode = 2;
for (int i = 0; i < 2; i++) {
// They can have different sizes in the original code
graph.node[i].position = (float*)malloc(3 * sizeof(float));
graph.node[i].position[0] = 45;
graph.node[i].position[1] = 1;
graph.node[i].position[2] = 2;
}
// allocate GPU memory
Graph * d_graph = cudaGraphMalloc(&graph);
// some dummy variables to test on GPU.
unsigned int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(unsigned int));
h_res = (unsigned int*)malloc(sizeof(unsigned int));
//Run kernel
testKernel << <1, 1 >> >(d_graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));
printf("%u\n", graph.nBoundary);
printf("%u\n", h_res[0]);
return 0;
}
Graph* cudaGraphMalloc(const Graph* inGraph)
{
//Create auxiliary Graph variable on host
Graph temp;
//copy constants
temp.nNode = inGraph->nNode;
temp.nBoundary = inGraph->nBoundary;
// copy boundary
gpuErrchk(cudaMalloc((void**)&(temp.boundary), inGraph->nBoundary * sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(temp.boundary, inGraph->boundary, inGraph->nBoundary * sizeof(unsigned int), cudaMemcpyHostToDevice));
//Create nodes
size_t nodeBytesTotal = temp.nNode * sizeof(Node);
gpuErrchk(cudaMalloc((void**)&(temp.node), nodeBytesTotal));
for (int i = 0; i < temp.nNode; i++)
{
//Create auxiliary node on host
Node auxNodeHost;
//Allocate device memory to position member of auxillary node
size_t nodeBytes = 3 * sizeof(float);
gpuErrchk(cudaMalloc((void**)&(auxNodeHost.position), nodeBytes));
gpuErrchk(cudaMemcpy(auxNodeHost.position, inGraph->node[i].position, nodeBytes, cudaMemcpyHostToDevice));
//Copy auxillary host node to device
Node* dPtr = temp.node + i;
gpuErrchk(cudaMemcpy(dPtr, &auxNodeHost, sizeof(Node), cudaMemcpyHostToDevice));
}
Graph* outGraph;
gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
gpuErrchk(cudaMemcpy(outGraph, &temp, sizeof(Graph), cudaMemcpyHostToDevice));
return outGraph;
}
Be advised that you will have to keep the host copies of the internal device pointers (i.e. the auxiliary host variables). This is because you will have to free the device memory later, and since you will only have a device copy of Graph in the main code, you won't be able to access its members from the host to call cudaFree on them. In this case, the variables Node auxNodeHost (created in each iteration) and Graph temp are those host copies.
The above code does not do that and is just for demonstration purposes.
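For illustration, a cleanup along these lines could work, assuming cudaGraphMalloc is extended to also hand back its host-side mirrors; the names cudaGraphFree, hostMirror (corresponding to temp) and hostNodes (holding the per-iteration auxNodeHost copies) are placeholders:
// Hypothetical cleanup helper: hostMirror is the host-side copy of the device Graph,
// hostNodes[i] is the host-side copy of node i, whose position member is a device pointer.
void cudaGraphFree(Graph* d_graph, const Graph* hostMirror, const Node* hostNodes)
{
    for (unsigned int i = 0; i < hostMirror->nNode; i++) {
        gpuErrchk(cudaFree(hostNodes[i].position)); // device buffer of node i
    }
    gpuErrchk(cudaFree(hostMirror->node));     // device array of Node
    gpuErrchk(cudaFree(hostMirror->boundary)); // device array of unsigned int
    gpuErrchk(cudaFree(d_graph));              // the Graph object itself
}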
Tested on Windows 10, Visual Studio 2015, CUDA 9.2, NVIDIA Driver 397.44.

Related

CUDA pointer inside kernel becomes null

I'm trying to pass a pointer to triangle data to a kernel, but when debugging I find that the pointer becomes null. d_list contains the triangles, and both d_list and d_world are members of the main window class; the error checking also returns "no error".
d_list is of type hittable* and d_world is of type hittable_list*.
__global__ void create_world(hittable* d_list, hittable_list* d_world, int num_triangles) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
// the class hittable_list contains a counter for the list size, which no matter the
// scene size it always becomes zero
d_world = new hittable_list(&d_list, num_triangles);
}
}
checkCudaErrors(cudaMalloc((void**)&d_list, num_hittables * sizeof(triangle)));
checkCudaErrors(cudaMalloc((void**)&d_world, sizeof(hittable_list)));
cudaMemcpy(d_list, m_triangles.data(), num_hittables * sizeof(triangle), cudaMemcpyHostToDevice);
create_world << <1, 1 >> > (d_list, d_world, num_hittables);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
I tried initializing the "world" on the host and then cudaMemcpy'ing it to d_world, but that also fails.
EDIT: minimal example
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
struct make_list {
__device__ make_list(float** list, int n) { contents = list; size = n; };
float** contents;
int size;
};
__global__ void render(make_list** world) {
int size = (*world)->size; // set a breakpoint here, the size is 0
}
__global__ void create_world(float* d_list, make_list* d_world, int num_triangles) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
// the class hittable_list contains a counter for the list size, which no matter the
// scene size it always becomes zero
d_world = new make_list(&d_list, num_triangles);
}
}
int main () {
float* d_list;
make_list* d_world;
int size = 8;
std::vector<float> m_triangles(size);
cudaMalloc((void**)&d_list, size * sizeof(float));
cudaMalloc((void**)&d_world, sizeof(make_list));
cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);
create_world << <1, 1 >> > (d_list, d_world, size);
cudaDeviceSynchronize();
render << <1, 1 >> > (&d_world);
cudaDeviceSynchronize();
return 0;
}
EDIT 2: updated with virtual function call, it's causing crashes
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
#include <cstdio>
class hittable {
public:
__device__ virtual int hit() const = 0;
};
struct make_list : public hittable {
__device__ make_list(float** list, int n) { contents = list; size = n; };
__device__ virtual int hit() const {
return size;
}
float** contents;
int size;
};
__global__ void render(make_list** world) {
int size = (*world)->size; // set a breakpoint here, the size is 0
printf("size = %d\n", size);
int new_size = (*world)->hit();
printf("new size = %d\n", new_size);
}
__global__ void create_world(float* d_list, make_list** d_world, int num_triangles) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
// the class hittable_list contains a counter for the list size, which no matter the
// scene size it always becomes zero
*d_world = new make_list(&d_list, num_triangles);
}
}
int main() {
float* d_list;
make_list** d_world;
cudaMalloc(&d_world, sizeof(make_list*));
int size = 8;
std::vector<float> m_triangles(size);
cudaMalloc((void**)&d_list, size * sizeof(float));
cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);
create_world << <1, 1 >> > (d_list, d_world, size);
cudaDeviceSynchronize();
render << <1, 1 >> > (d_world);
cudaDeviceSynchronize();
return 0;
}
There are at least a few issues.
In C++, when you pass a variable to a function via the function parameters, a copy of that variable is made for local use by the function. Any modifications made to that variable will not show up globally, i.e. in the calling environment, because the function is operating on a copy of the variable. Therefore this could never do what you want:
d_world = new make_list(&d_list, num_triangles);
There is nothing illegal about it, per se, but it will not have the desired effect. The global copy of d_world is unchanged by that assignment. This is a C++ concept, not unique or specific to CUDA, and it trips people up from time to time.
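A minimal host-only illustration of that pass-by-value point (nothing CUDA-specific about it; the function names are made up):
void f(int* p)   { p = new int(42); }   // changes only the local copy of the pointer (and leaks it)
void g(int** pp) { *pp = new int(42); } // changes the caller's pointer through its address

int main() {
    int* a = nullptr;
    int* b = nullptr;
    f(a);    // a is still nullptr here
    g(&b);   // b now points at a valid int
    delete b;
    return 0;
}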
This is almost never legal in CUDA:
render << <1, 1 >> > (&d_world);
^
In typical usage, it is not possible to pass the address of a host location to device code via a kernel call parameter. Any attempt to dereference that parameter (the address of d_world, a host variable) in device code dereferences a host address, which is illegal in CUDA device code.
While not necessarily a problem at this point, you should be aware of the fact that in-kernel new operates against the device heap which has a default limit of 8MB, and furthermore allocations created this way cannot take part in host-issued cudaMemcpy* calls. These topics are covered in the programming guide.
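If in-kernel new were needed for larger allocations, the device heap can be enlarged before any allocating kernel runs; a minimal sketch (the 64 MB figure is just an arbitrary example):
// Raise the limit used by in-kernel new/malloc from the 8 MB default.
// This must be called before the first kernel that allocates on the device heap.
cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64 * 1024 * 1024);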
When I make changes to address those first 2 items, I get what appear to be sensible results:
$ cat t2190.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
#include <cstdio>
struct make_list {
__device__ make_list(float** list, int n) { contents = list; size = n; };
float** contents;
int size;
};
__global__ void render(make_list** world) {
int size = (*world)->size; // set a breakpoint here, the size is 0
printf("size = %d\n", size);
}
__global__ void create_world(float* d_list, make_list** d_world, int num_triangles) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
// the class hittable_list contains a counter for the list size, which no matter the
// scene size it always becomes zero
*d_world = new make_list(&d_list, num_triangles);
}
}
int main () {
float* d_list;
make_list** d_world;
cudaMalloc(&d_world, sizeof(make_list*));
int size = 8;
std::vector<float> m_triangles(size);
cudaMalloc((void**)&d_list, size * sizeof(float));
cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);
create_world << <1, 1 >> > (d_list, d_world, size);
cudaDeviceSynchronize();
render << <1, 1 >> > (d_world);
cudaDeviceSynchronize();
return 0;
}
$ nvcc -o t2190 t2190.cu
$ compute-sanitizer ./t2190
========= COMPUTE-SANITIZER
size = 8
========= ERROR SUMMARY: 0 errors
$
Although you don't show how you are using the contents member of the make_list object, I'm doubtful that this could possibly do anything useful for you, for the same reason as I have indicated in item 1 above:
*d_world = new make_list(&d_list,
^^^^^^^
The address you are using there is the address of a temporary local variable made by the function. My guess is you probably want d_list there, or possibly *d_list, and this might necessitate changes to your contents member or to the handling of that member. Whatever you are doing there will almost certainly require changes not unlike the refactoring I have done to address items 1 and 2.
For now, without knowing anything further about your intent, something that seems sensible to me would be like this:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
#include <cstdio>
struct make_list {
__device__ make_list(float* list, int n) { contents = list; size = n; };
float* contents;
int size;
};
__global__ void render(make_list** world) {
int size = (*world)->size; // set a breakpoint here, the size is 0
printf("size = %d\n", size);
}
__global__ void create_world(float* d_list, make_list** d_world, int num_triangles) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
// the class hittable_list contains a counter for the list size, which no matter the
// scene size it always becomes zero
*d_world = new make_list(d_list, num_triangles);
}
}
int main () {
float* d_list;
make_list** d_world;
cudaMalloc(&d_world, sizeof(make_list*));
int size = 8;
std::vector<float> m_triangles(size);
cudaMalloc((void**)&d_list, size * sizeof(float));
cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);
create_world << <1, 1 >> > (d_list, d_world, size);
cudaDeviceSynchronize();
render << <1, 1 >> > (d_world);
cudaDeviceSynchronize();
return 0;
}

C++ Deep Copy Object

I am trying to deep copy objects back and forth. When I run gdb, I get the following error after one iteration of the loop.
Program received signal SIGSEGV, Segmentation fault.
0x0804ab96 in DGCPM::DGCPM (this=0x844b760, cur=0x1) at DGCPM.C:27
27 memcpy(vRCells, cur->vRCells,sizeof(float)*nThetaCells);
I suspect the problem has to do with creating the "new class," but I'm not sure. Any suggestions?
(Note: The "_initialize" code calls a FORTRAN subroutine that sets the values in the program.)
Here is the run.C main file:
#include "../include/DGCPM.h"
#define particle_num 5
class DGCPM **mallocModels(int n);
int main(int argc, char *argv[]){
class DGCPM **m;
class DGCPM **cur;
m=mallocModels(particle_num);//update
for(int t = 0; t < 48; t++){
//Update m, and then...
cur = m;
m = (DGCPM**)malloc(sizeof(class DGCPM *)*particle_num);
for(int i=0;i<particle_num;i++){
double randomidx = ((double)rand() / ((double)RAND_MAX + 1));
int currentidx = find(cumPw, randomidx, particle_num);
m[i] = new class DGCPM(cur[currentidx]);
}
for(int i=0;i<particle_num;i++){
delete cur[i];
}
free(cur);
}
return 0;
}
/*============================================================================
mallocModels - allocate the ensemble of models
============================================================================*/
class DGCPM **mallocModels(int n){
class DGCPM **m;
m=(class DGCPM **)amjSafeMalloc(sizeof(class DGCPM *)*n,
(char *)"mallocModels:m");
for(int i=0;i<n;i++)
m[i]=new class DGCPM();
return m;
}
/*============================================================================
Find - Return a particle index that has a high probability of having a high weight.
============================================================================*/
int find(float *cumPw, double randomidx, int nM){
/*Wrong implementation*/
int index = 0;
flag = 0;
while(flag == 0){
if(cumPw[i] >= randomidx){
flag = 1;
i++;
}
else{
index ++;
}
}
return index; //Sometimes, index was going to number of models, or number of models + 1, which are out of bounds.
/*Correct implementation*/
int index = 0;
for(int i = 0; i < nM-1; i++){
if(cumPw[i] >= randomidx){
index = i;
break;
}
}
if(index >= nM){
index = nM-1;
printf("Error: random index exceeds bounds");
}
return index;
}
Here is the DGCPM.h header file:
class DGCPM{
public:
DGCPM(); /* Initialized with defaults setup */
DGCPM(class DGCPM *cur); //Copy constructor
DGCPM(int nThetaCells, int nPhiCells, float thetaMin, float thetaMax);
~DGCPM(); /* Free memory */
private:
int internal; /* 1=memory allocated internally and should be deallocated when ~DGCPM is called, 2=memory is internal except for mGridN which is external */
int nThetaCells,nRCells,nPhiCells;
float thetaMin,thetaMax;
float rMin,rMax;
float delR,delPhi;
float deltMax;
float *vRCells; /* [nThetaCells] */
float *vThetaCells; /* [nThetaCells] */
float *vPhiCells; /* [nPhiCells] */
float **mGridB; /* [nPhiCells][nThetaCells] */
float **mGridBi; /* [nPhiCells][nThetaCells] */
float **mGridPot; /* [nPhiCells][nThetaCells] */
float **mGridEr; /* [nPhiCells][nThetaCells] */
float **mGridEp; /* [nPhiCells][nThetaCells] */
float **mGridVr; /* [nPhiCells][nThetaCells] */
float **mGridVp; /* [nPhiCells][nThetaCells] */
float **mGridN; /* [nPhiCells][nThetaCells] */
float **mGridHalf; /* [nPhiCells][nThetaCells] Particles / weber (workspace for upwind and superbee) */
float **mGridDen; /* [nPhiCells][nThetaCells] */
float **mGridVol; /* [nPhiCells][nThetaCells] */
float **mGridX; /* [nPhiCells][nThetaCells] */
float **mGridY; /* [nPhiCells][nThetaCells] */
float **mGridOc; /* [nPhiCells][nThetaCells] */
float **std; /* [nPhiCells][nThetaCells] */
float parI[2];
float delTMax;
float Re;
void initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax);
};
And finally the DGCPM.C object wrapper:
/******************************************************************************
* DGCPM.C - This implements the DGCPM plasmasphere model class *
******************************************************************************/
#define TWO_PI 6.2831853071795864769252866
#include "../include/DGCPM.h"
# include <cstdlib>
# include <cmath>
/*============================================================================
DGCPM::DGCPM()
Initialize with default setup
============================================================================*/
DGCPM::DGCPM(){
internal=1;
initialize(200,200,14.963217,60.0);/*(180,200,14.963217,60.0);*/
}
//Copy Constructor
DGCPM::DGCPM(class DGCPM *cur){
internal=1;
initialize(200,200,14.963217,60.0);/*(180,200,14.963217,60.0);*/
memcpy(vRCells, cur->vRCells,sizeof(float)*nThetaCells);
memcpy(vPhiCells, cur->vPhiCells,sizeof(float)*nPhiCells);
memcpy(vThetaCells, cur->vThetaCells,sizeof(float)*nThetaCells);
memcpy(mGridB[0], cur->mGridB[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridBi[0], cur->mGridBi[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridPot[0], cur->mGridPot[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridEr[0], cur->mGridEr[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridEp[0], cur->mGridEp[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridVr[0], cur->mGridVr[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridVp[0], cur->mGridVp[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridN[0], cur->mGridN[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridHalf[0], cur->mGridHalf[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridDen[0], cur->mGridDen[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridVol[0], cur->mGridVol[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridOc[0], cur->mGridOc[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridX[0], cur->mGridX[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(mGridY[0], cur->mGridY[0],sizeof(float)*nThetaCells*nPhiCells);
memcpy(std[0], cur->std[0],sizeof(float)*nThetaCells*nPhiCells);
}
/*============================================================================
DGCPM::~DGCPM()
Free allocated memory
============================================================================*/
DGCPM::~DGCPM(){
if(internal>=1){
amjFree1dFloat(vRCells);
amjFree1dFloat(vThetaCells);
amjFree1dFloat(vPhiCells);
amjFree2dFloat(mGridB);
amjFree2dFloat(mGridBi);
amjFree2dFloat(mGridEr);
amjFree2dFloat(mGridEp);
amjFree2dFloat(mGridVr);
amjFree2dFloat(mGridVp);
if(internal==1) amjFree2dFloat(mGridN);
amjFree2dFloat(mGridHalf);
amjFree2dFloat(mGridDen);
amjFree2dFloat(mGridVol);
amjFree2dFloat(mGridX);
amjFree2dFloat(mGridY);
amjFree2dFloat(mGridOc);
amjFree2dFloat(std);
}
}
/******************************************************************************
******************************************************************************
** Private functions **
******************************************************************************
******************************************************************************/
/*============================================================================
DGCPM::initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax);
This is the initialization function used when all memory should be
allocated internally.
============================================================================*/
void DGCPM::initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax){
initialize(nThetaCells,nPhiCells,thetaMin,thetaMax,
amjMalloc1dFloat(nThetaCells,(char *)"DGCPM::DGCPM:vRCells"),
amjMalloc1dFloat(nThetaCells,(char *)"DGCPM::DGCPM:vThetaCells"),
amjMalloc1dFloat(nPhiCells,(char *)"DGCPM::DGCPM:vPhiCells"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridB"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridBi"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridPot"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridEr"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridEp"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVr"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVp"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridN"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridHalf"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridDen"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVol"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridX"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridY"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridOc"),
//Added by J.Wise
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:std"));
}
/*============================================================================
DGCPM::initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax);
This is the initialization function used when mGridN is passed from
the outside but all other memory is allocated internally.
============================================================================*/
void DGCPM::initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax, float **mGridN){
initialize(nThetaCells,nPhiCells,thetaMin,thetaMax,
amjMalloc1dFloat(nThetaCells,(char *)"DGCPM::DGCPM:vRCells"),
amjMalloc1dFloat(nThetaCells,(char *)"DGCPM::DGCPM:vThetaCells"),
amjMalloc1dFloat(nPhiCells,(char *)"DGCPM::DGCPM:vPhiCells"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridB"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridBi"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridPot"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridEr"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridEp"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVr"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVp"),
mGridN,
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridHalf"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridDen"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridVol"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridX"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridY"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:mGridOc"),
amjMalloc2dFloat(nPhiCells,nThetaCells,
(char *)"DGCPM::DGCPM:std"));
}
/*
initialize() - this initialization function uses pre-allocated
memory areas passed in from the outside. This function is used both
when DGCPM allocates memory itself and when it receives
pre-allocated memory from the outside in order to eliminate
duplication of code with the associated risk of errors.
============================================================================*/
void DGCPM::initialize(int nThetaCells, int nPhiCells, float thetaMin,
float thetaMax, float *vRCells, float *vThetaCells,
float *vPhiCells, float **mGridB, float **mGridBi,
float **mGridPot, float **mGridEr, float **mGridEp,
float **mGridVr, float **mGridVp, float **mGridN,
float **mGridHalf, float **mGridDen, float **mGridVol,
float **mGridX, float **mGridY, float **mGridOc, float **std){
DGCPM::nThetaCells=nThetaCells;
DGCPM::nPhiCells=nPhiCells;
DGCPM::thetaMin=thetaMin;
DGCPM::thetaMax=thetaMax;
DGCPM::vRCells=vRCells;
DGCPM::vThetaCells=vThetaCells;
DGCPM::vPhiCells=vPhiCells;
DGCPM::mGridB=mGridB;
DGCPM::mGridBi=mGridBi;
DGCPM::mGridPot=mGridPot;
DGCPM::mGridEr=mGridEr;
DGCPM::mGridEp=mGridEp;
DGCPM::mGridVr=mGridVr;
DGCPM::mGridVp=mGridVp;
DGCPM::mGridN=mGridN;
DGCPM::mGridHalf=mGridHalf;
DGCPM::mGridDen=mGridDen;
DGCPM::mGridVol=mGridVol;
DGCPM::mGridX=mGridX;
DGCPM::mGridY=mGridY;
DGCPM::mGridOc=mGridOc;
DGCPM::std=std;
Re=6.378e6;
initialize_(&nThetaCells,&nRCells,&nPhiCells,&thetaMin,&thetaMax,&rMin,&rMax,
&delR,&delPhi,vRCells,vThetaCells,vPhiCells,mGridB[0],mGridBi[0],
mGridN[0],mGridDen[0],mGridVol[0],mGridX[0],mGridY[0],mGridOc[0],std[0]);
}
Here's a sample custom memory function, which takes care of initialization and allocation:
void *amjSafeMalloc(int n, char *message){
void *d;
d=malloc(n);
if(d==NULL){
fprintf(stderr,"amjSafeMalloc error: Could not allocate %d bytes "
"for %s. Exiting.\n",n,message);
exit(1);
}
return d;
}
float *amjMalloc1dFloat(int a, char *message){
float *d;
char msg[256];
sprintf(msg,"%s:amjMalloc1DFloat:d",message);
d=(float *)amjSafeMalloc(sizeof(float)*a,msg);
return d;
}
float **amjMalloc2dFloat(int a, int b, char *message){
float **d;
int i;
char msg[256];
sprintf(msg,"%s:amjMalloc2DFloat:d",message);
d=(float **)amjSafeMalloc(sizeof(float *)*a,msg);
sprintf(msg,"%s:amjMalloc2DFloat:d[0]",message);
d[0]=(float *)amjSafeMalloc(sizeof(float)*a*b,msg);
for(i=1;i<a;i++) d[i]=d[i-1]+b;
return d;
}
#include <vector>
using std::vector;
class DGCPM
{
public:
DGCPM(int nThetaCells, int nPhiCells)
: nThetaCells(nThetaCells)
, nPhiCells(nPhiCells)
, mGridB(nThetaCells, vector<float>(nPhiCells)) // first Y then X
{
}
private:
int nThetaCells, nPhiCells;
vector<vector<float>> mGridB;
};
Deep copies for free. Deletes memory for free.
By "free" I mean you don't have to write the code yourself.
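A brief hypothetical usage example: copying the object copies the nested vectors, so the deep copy really does come for free:
DGCPM a(180, 200);
DGCPM b = a; // compiler-generated copy constructor deep-copies mGridB
b = a;       // compiler-generated copy assignment does the same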
From your comment /* [nPhiCells][nThetaCells] */ in your class definition, I take it that you intend the float** members to be 2D arrays. However, while you can use them like 2D arrays, they are actually arrays of pointers to arrays. That is a huge difference: it means you have to copy nPhiCells individual arrays of nThetaCells elements, and you have to set up the pointer array itself. Now, when you do
memcpy(mGridHalf[0], cur->mGridHalf[0],sizeof(float)*nThetaCells*nPhiCells);
in your copy constructor, you assume that there is no pointer array and that all line arrays are sequential in memory. Either this copy exceeds the bounds of the pointer array (segfaulting), or accessing your array via mGridHalf[i][j] simply does the wrong thing, reinterpreting float data as pointers (and segfaulting).
Unfortunately, C++ is a horrible language for interacting with Fortran multidimensional arrays because it has no notion of variable-sized arrays. So the following is C code, not C++ code. In C, you can tackle the issue like this:
float (*mGridHalf)[nThetaCells] = malloc(nPhiCells*sizeof(*mGridHalf));
will correctly allocate and type a 2D array (i. e. an array of arrays) that can be accessed with
mGridHalf[phi][theta] = 7.3;
Since all elements are consecutive in memory, the entire thing can correctly be copied with
memcpy(mGridHalf, cur->mGridHalf, nPhiCells*sizeof(*mGridHalf));
and freed with
free(mGridHalf);
Technically, mGridHalf is now a pointer to an array, the pointer arithmetic that is invoked by the array access effectively does the same computation as if you had written:
float* foo = malloc(nPhiCells*nThetaCells*sizeof(*foo));
foo[phi*nThetaCells + theta] = 7.3;
However, using the correct pointer type float (*)[nThetaCells] allows you to avoid doing the index computation yourself.
The issue is more than likely that you're assuming the data behind a float** is one contiguous chunk of memory. If so, here is one way of accomplishing this. First, I show the wrong way (but one that is often used):
float** createFloat2D(int nRows, int nCols)
{
float** p1 = new float*[nRows];
for (int i = 0; i < nRows; ++i )
p1[i] = new float[nCols];
return p1;
}
void destroyFloat2D(float**f, int nRows, int nCols)
{
for (int i = 0; i < nRows; ++i )
delete [] f[i];
delete [] f;
}
Looks simple, and works for most purposes, but will fail if the assumption is made that the data is in a contiguous chunk of memory.
The other way to create a 2D array is to make the data contiguous.
float** createFloat2D(int nRows, int nCols)
{
float** p1 = new float*[nRows]; // allocate row pointers
float* p2 = new float[nRows * nCols]; // allocate data in one chunk
for (int i = 0; i < nRows; ++i, p2 += nCols )
p1[i] = p2; // point the row pointers into the pool of memory
return p1;
}
void destroyFloat2D(float**f)
{
delete [] f[0];
delete [] f;
}
Note above that the data is created in one contiguous "pool". Now, using yourArray[0] actually points to the beginning of this memory. Also note that destruction is done without having to know the number of rows or columns, since f[0] points to the pool of memory.
So now, code like this should work
float** mGridB = createFloat2D(nThetaCells, nPhiCells);
//...
memcpy(mGridB[0], cur->mGridB[0], sizeof(float)*nThetaCells*nPhiCells);
The code above now works correctly, if we use the second method of creating the 2d array.
I would still stick with the vector for 1-d float arrays, as you have the pointer to the data (see my earlier comment). For the code above, I would wrap it in a class that handles creation and destruction easily.
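Here is a minimal sketch of such a wrapper (the name Float2D is made up), keeping the storage contiguous so whole-grid memcpy-style copies keep working:
#include <algorithm> // std::copy

class Float2D
{
public:
    Float2D(int nRows, int nCols)
        : rows(nRows), cols(nCols), data(new float[nRows * nCols]()) {}
    Float2D(const Float2D& o)
        : rows(o.rows), cols(o.cols), data(new float[o.rows * o.cols])
    { std::copy(o.data, o.data + rows * cols, data); }
    Float2D& operator=(const Float2D& o)
    {
        if (this != &o) {
            float* nd = new float[o.rows * o.cols];
            std::copy(o.data, o.data + o.rows * o.cols, nd);
            delete [] data;
            data = nd; rows = o.rows; cols = o.cols;
        }
        return *this;
    }
    ~Float2D() { delete [] data; }
    float*       operator[](int r)       { return data + r * cols; } // grid[phi][theta] access
    const float* operator[](int r) const { return data + r * cols; }
    float*       raw()                   { return data; }            // contiguous pool
private:
    int rows, cols;
    float* data;
};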
The last thing is the copy constructor. A copy constructor in C++ has one of the following possible signatures:
DGCPM(const DGCPM&);
DGCPM(DGCPM&);
DGCPM(volatile DGCPM&);
I may have missed one, but the signature should be one of those above, more than likely the first one (you can also have additional arguments after the reference argument, but they all must have default values).
Note that a DGCPM* (which is what your code uses) is not a valid argument for a copy constructor -- remember that a copy constructor is not only for your own use; the compiler will also use it to make copies. So to signal the compiler that "yes, this function is used to make copies", your function must match one of the signatures above.
In addition, you need an assignment operator, in other words, the class needs to implement the "rule of 3".
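For reference, a minimal sketch of the shape the rule of 3 takes here (the bodies would have to deep-copy or free every array member, just like the corrected copy constructor):
class DGCPM {
public:
    DGCPM(const DGCPM& other);            // copy constructor: deep-copy all grids
    DGCPM& operator=(const DGCPM& other); // copy assignment: free own grids, then deep-copy
    ~DGCPM();                             // destructor: free all owned grids
    // ... remaining members as in the header above ...
};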
This is going to sound so stupid (elementary programming error): my index "i" was going beyond (number of models - 1), so I was getting a segmentation fault from accessing memory that didn't exist.

Why does this reverse function not work

In the constructor I fill the array on the device side, but now I want to execute the reverse function on the array.
using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
__global__ void generateVector(int *data,int count){
int tid = blockIdx.x;
data[tid] = -tid;
}
__global__ void reverseArray(int *data,int count){
int tid = blockIdx.x;
data[tid] = tid;
}
class FData{
private:
int *data;
int size;
public:
FData(int sizeP){
size = sizeP;
data = new int[size];
int *devA;
cudaMalloc((void**) &devA, size * sizeof(int));
generateVector<<<size,1>>>(devA,size);
cudaMemcpy(data,devA, size * sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(devA);
}
~FData(){
delete [] data;
}
int getSize(){
return size;
}
int elementAt(int i){
return data[i];
}
void reverse(){
int *devA;
cudaMalloc((void**) &devA, sizeof(int));
reverseArray<<<size,1>>>(devA,size);
cudaMemcpy(data,devA,size * sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(devA);
}
};
int main(void) {
FData arr(30);
cout << arr.elementAt(1);
arr.reverse();
cout << arr.elementAt(1);
return 0;
}
It still prints the values which I filled in the constructor. What is the problem here? How can I solve it? What is going wrong?
Your kernels aren't reversing anything. They're just negating the values, so if anything I would be quite surprised if you saw anything get reversed. With that said, if you add error checking to your code (see this other SO post on how best to do the error checking) then you'll see that your code will fail on the call to cudaMalloc in your reverse function. You can fix this by changing devA to be a plain pointer (it doesn't really make sense for you to be allocating it as a host-array anyways, as you're not using it on the host to begin with).
void reverse(){
int *devA;
cudaMalloc((void**) &devA, size * sizeof(int));
reverseArray<<<size,1>>>(devA,size);
cudaMemcpy(data,devA,size * sizeof(int), cudaMemcpyDeviceToHost);
cudaFree(devA);
}
Also, you should free your memory too; you have both host-side and device-side memory leaks. Whenever you have a cudaMalloc call, you should have a corresponding cudaFree. Also, consider adding a destructor to free your host-side data member, as you have a memory leak there too.
~FData()
{
delete [] data;
}
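As an aside, a kernel that actually reverses the array (rather than overwriting it with new values) could look like this minimal sketch, keeping the one-thread-per-block launch style used above; reverse() would also need to copy the current host data into devA before launching it:
__global__ void reverseArrayInPlace(int *data, int count){
    int tid = blockIdx.x;
    if (tid < count / 2) {                // only the first half of the blocks swap
        int tmp = data[tid];
        data[tid] = data[count - 1 - tid];
        data[count - 1 - tid] = tmp;
    }
}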

Assigning and retrieving bit-wise memory value for Genetic Algo

I came across this code for developing a class for GA/GP but failed to understand it, and hence I am unable to debug the program.
typedef struct {
void *dataPointer;
int length;
} binary_data;
typedef struct {
organism *organisms; //This must be malloc'ed
int organismsCount;
int (*fitnessTest)(organism org);
int orgDnaLength;
unsigned int desiredFitness;
void (*progress)(unsigned int fitness);
} evolutionary_algorithm;
The above is straightforward. Then we try to initialize organisms before testing their fitness, etc.
int main(int argc, char *argv[])
{
srand(time(NULL));
int i;
evolutionary_algorithm ea;
ea.progress = progressDisplayer;
ea.organismsCount = 50;
ea.orgDnaLength = sizeof(unsigned int);
organism *orgs =(organism *) malloc(sizeof(organism) * ea.organismsCount);
for (i = 0; i < 50; i++)
{
organism newOrg;
binary_data newOrgDna;
newOrgDna.dataPointer = malloc(sizeof(unsigned int));
memset(newOrgDna.dataPointer, i, 1);
newOrgDna.length = sizeof(unsigned int);
newOrg.dna = newOrgDna;
orgs[i] = newOrg;
}
As far as I understand, memset() writes a byte value into the memory location pointed to by the void pointer (newOrgDna.dataPointer), and so on. But I can't figure out how to reassemble those bytes to get the integer value assigned to the "dna" variable of newOrg, so that I can check the integer value assigned to an individual organism, and eventually the entire population residing in the memory assigned to "orgs".
As you can guess from the above, I am not very familiar with memory management at this level of detail, so your help is very much appreciated.
Thank you so much
This code looks a bit strange. This line:
newOrgDna.dataPointer = malloc(sizeof(unsigned int));
will allocate sizeof(unsigned int) bytes (typically 4). The strange part is that the memset on the line just below sets only the first byte.
To get actual value you might do:
char val = *((char*) newOrgDna.dataPointer);
But, as I said, this code looks a bit off. I would rewrite it as:
for (i = 0; i < 50; i++)
{
organism newOrg;
binary_data newOrgDna;
unsigned int * data = (unsigned int*) malloc(sizeof(unsigned int));
*data = i;
newOrgDna.length = sizeof(*data);
newOrgDna.dataPointer = (void*) data; // I think that cast can be dropped
newOrg.dna = newOrgDna;
orgs[i] = newOrg;
}
Then everywhere you want to get data from organism * you can do:
void f( organism * o )
{
assert( sizeof(unsigned int) == o->dna.length );
unsigned int data = *((unsigned int*) o->dna.dataPointer);
}
Also this is rather a C question not C++.

CUDA - copy to array within array of Objects

I have a CUDA application I'm working on with an array of Objects; each object has a pointer to an array of std::pair<int, double>. I'm trying to cudaMemcpy the array of objects over, then cudaMemcpy the array of pairs to each of the objects, however this is giving me all kinds of grief. It crashes attempting to copy to the inner array; I don't understand how to move this over...
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
using namespace std;
class Object
{
public:
int id;
float something;
std::pair<int, float> *somePairs;
};
Object *objects;
void initObjects()
{
objects = new Object[10];
for( int idx = 0; idx < 10; idx++ )
{
objects[idx].id = idx;
objects[idx].something = (float) idx;
objects[idx].somePairs = new std::pair<int, float>[10];
for ( int jdx = 0; jdx < 10; jdx++ )
{
objects[idx].somePairs[jdx] = std::pair<int, float>( jdx, (float) jdx );
}
}
}
void cudaMemcpyObjects()
{
Object *devObjects;
cudaMalloc( &devObjects, sizeof(Object) * 10 );
cudaMemcpy( devObjects, objects, sizeof(Object) * 10, cudaMemcpyHostToDevice );
for ( int idx = 0; idx < 10; idx++ )
{
size_t pairSetSize = sizeof(std::pair<int, float>) * 10;
// CRASH HERE ... v
cudaMalloc( &(devObjects[idx].somePairs), pairSetSize );
cudaMemcpy( devObjects[idx].somePairs, objects[idx].somePairs,
sizeof( std::pair<int, float> ) * 10, cudaMemcpyHostToDevice );
}
}
int main()
{
initObjects();
cudaMemcpyObjects();
return 0;
}
My CUDA experience is only in its infancy, but I believe the error is like this:
cudaMalloc is a host function that wants to write the pointer into host memory. However, you are passing to it a pointer in device memory!
To fix this, you should first create the device pointers and fill them into your host object structure, and only then copy the whole thing over to the device, and also copy the individual pairs over to the device as well.
Schematically:
struct Bar;
struct Foo
{
int tag;
Bar * bp;
};
void setup()
{
Foo * hFoo = new Foo[10];
Foo * dFoo;
cudaMalloc(&dFoo, sizeof(Foo) * 10);
for (size_t i = 0; i != 10; ++i)
{
Bar * dBar;
cudaMalloc(&dBar, sizeof(Bar));
Bar b; // automatic temporary -- we never keep a host copy of this
cudaMemcpy(dBar, &b, sizeof(Bar), cudaMemcpyHostToDevice);
hFoo[i].bp = dBar; // this is already a device pointer!
}
cudaMemcpy(dFoo, hFoo, sizeof(Foo) * 10, cudaMemcpyHostToDevice);
}
On the return, don't forget that the Foo::bp are device pointers that you still need to copy back one by one!
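A minimal sketch of that copy-back, under the same assumptions as setup() above (hFoo still holds the device pointers in its bp members; the function name is made up):
void copyBack(Foo * hFoo, Foo * dFoo)
{
    // Refresh the host Foo array; this overwrites hFoo[i].bp with the same
    // device pointers it already held, so nothing is lost.
    cudaMemcpy(hFoo, dFoo, sizeof(Foo) * 10, cudaMemcpyDeviceToHost);
    for (size_t i = 0; i != 10; ++i)
    {
        Bar b; // host destination
        cudaMemcpy(&b, hFoo[i].bp, sizeof(Bar), cudaMemcpyDeviceToHost);
        // ... use b on the host ...
    }
}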
It would probably be easier to just have one self-contained class that you can move in one go, but that may not be practical, or desirable for reasons of memory locality. You have to think carefully about this. If the member is just a pair, why not put the two items in the main class directly?
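For instance, a minimal sketch of a self-contained variant that can be moved in a single cudaMemcpy, assuming the per-object pair count has a known upper bound (MAX_PAIRS and the member names are hypothetical); storing the two halves of each pair in parallel inline arrays also avoids putting std::pair itself on the device:
#define MAX_PAIRS 10

struct DeviceObject
{
    int   id;
    float something;
    int   pairCount;
    int   pairFirst[MAX_PAIRS];   // the int half of each pair, stored inline
    float pairSecond[MAX_PAIRS];  // the float half of each pair, stored inline
};

// The whole array of objects, pairs included, now moves with one copy:
// cudaMemcpy(devObjects, hostObjects, sizeof(DeviceObject) * 10, cudaMemcpyHostToDevice);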