CUDA pointer inside kernel becomes null - c++

I'm trying to pass a pointer to triangle data to a kernel, but when debugging I find that the pointer becomes null. d_list contains the triangles, and both d_list and d_world are members of the main window class. The error checking returns "no error".
d_list is of type hittable* and d_world is hittable_list*
// Kernel intended to construct the scene's hittable_list on the device.
// Only thread 0 of block 0 performs the construction; all other threads do nothing.
// NOTE(review): d_list and d_world are passed by value, so each thread works on its
// own copies of these pointers. The assignment to d_world below changes only that
// local copy, and &d_list is the address of the local parameter copy, so neither
// result is visible to the host or to a later kernel.
__global__ void create_world(hittable* d_list, hittable_list* d_world, int num_triangles) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
// the class hittable_list contains a counter for the list size, which no matter the
// scene size it always becomes zero
d_world = new hittable_list(&d_list, num_triangles);
}
}
checkCudaErrors(cudaMalloc((void**)&d_list, num_hittables * sizeof(triangle)));
checkCudaErrors(cudaMalloc((void**)&d_world, sizeof(hittable_list)));
cudaMemcpy(d_list, m_triangles.data(), num_hittables * sizeof(triangle), cudaMemcpyHostToDevice);
create_world << <1, 1 >> > (d_list, d_world, num_hittables);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
I tried initializing the "world" in the host then cudaMemcpy'ing to the d_world, but it also fails
EDIT: minimal example
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
struct make_list {
__device__ make_list(float** list, int n) { contents = list; size = n; };
float** contents;
int size;
};
__global__ void render(make_list** world) {
int size = (*world)->size; // set a breakpoint here, the size is 0
}
__global__ void create_world(float* d_list, make_list* d_world, int num_triangles) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
// the class hittable_list contains a counter for the list size, which no matter the
// scene size it always becomes zero
d_world = new make_list(&d_list, num_triangles);
}
}
int main () {
float* d_list;
make_list* d_world;
int size = 8;
std::vector<float> m_triangles(size);
cudaMalloc((void**)&d_list, size * sizeof(float));
cudaMalloc((void**)&d_world, sizeof(make_list));
cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);
create_world << <1, 1 >> > (d_list, d_world, size);
cudaDeviceSynchronize();
render << <1, 1 >> > (&d_world);
cudaDeviceSynchronize();
return 0;
}
EDIT 2: updated with virtual function call, it's causing crashes
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
#include <cstdio>
class hittable {
public:
__device__ virtual int hit() const = 0;
};
struct make_list : public hittable {
__device__ make_list(float** list, int n) { contents = list; size = n; };
__device__ virtual int hit() const {
return size;
}
float** contents;
int size;
};
__global__ void render(make_list** world) {
int size = (*world)->size; // set a breakpoint here, the size is 0
printf("size = %d\n", size);
int new_size = (*world)->hit();
printf("new size = %d\n", new_size);
}
__global__ void create_world(float* d_list, make_list** d_world, int num_triangles) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
// the class hittable_list contains a counter for the list size, which no matter the
// scene size it always becomes zero
*d_world = new make_list(&d_list, num_triangles);
}
}
int main() {
float* d_list;
make_list** d_world;
cudaMalloc(&d_world, sizeof(make_list*));
int size = 8;
std::vector<float> m_triangles(size);
cudaMalloc((void**)&d_list, size * sizeof(float));
cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);
create_world << <1, 1 >> > (d_list, d_world, size);
cudaDeviceSynchronize();
render << <1, 1 >> > (d_world);
cudaDeviceSynchronize();
return 0;
}

There are at least a few issues.
In C++, when you pass a variable to a function via the function parameters, a copy of that variable is made for local use by the function. Any modifications made to that variable will not show up globally, i.e. in the calling environment, because the function is operating on a copy of the variable. Therefore this could never do what you want:
d_world = new make_list(&d_list, num_triangles);
There is nothing illegal about it, per se, but it will not have the desired effect. The global copy of d_world is unchanged by that assignment. This is a C++ concept, not unique or specific to CUDA, and it trips people up from time to time.
This is almost never legal in CUDA:
render << <1, 1 >> > (&d_world);
^
In typical usage, it is not possible to pass the address of a host location to device code via a kernel call parameter. Any attempt to dereference that pointer &d_world will result in dereferencing the address of a host location. That is illegal in CUDA device code.
While not necessarily a problem at this point, you should be aware of the fact that in-kernel new operates against the device heap which has a default limit of 8MB, and furthermore allocations created this way cannot take part in host-issued cudaMemcpy* calls. These topics are covered in the programming guide.
When I make changes to address those first 2 items, I get what appear to be sensible results:
$ cat t2190.cu
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
#include <cstdio>
struct make_list {
__device__ make_list(float** list, int n) { contents = list; size = n; };
float** contents;
int size;
};
__global__ void render(make_list** world) {
int size = (*world)->size; // set a breakpoint here, the size is 0
printf("size = %d\n", size);
}
__global__ void create_world(float* d_list, make_list** d_world, int num_triangles) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
// the class hittable_list contains a counter for the list size, which no matter the
// scene size it always becomes zero
*d_world = new make_list(&d_list, num_triangles);
}
}
int main () {
float* d_list;
make_list** d_world;
cudaMalloc(&d_world, sizeof(make_list*));
int size = 8;
std::vector<float> m_triangles(size);
cudaMalloc((void**)&d_list, size * sizeof(float));
cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);
create_world << <1, 1 >> > (d_list, d_world, size);
cudaDeviceSynchronize();
render << <1, 1 >> > (d_world);
cudaDeviceSynchronize();
return 0;
}
$ nvcc -o t2190 t2190.cu
$ compute-sanitizer ./t2190
========= COMPUTE-SANITIZER
size = 8
========= ERROR SUMMARY: 0 errors
$
Although you don't show how you are using the contents member of the make_list object, I'm doubtful that this could possibly do anything useful for you, for the same reason as I have indicated in item 1 above:
*d_world = new make_list(&d_list,
^^^^^^^
The address you are using there is the address of a temporary local variable made by the function. My guess is you probably want d_list there or possibly *d_list, and this might necessitate changes to your contents object member or to the handling of that object member. Whatever you are doing there will almost certainly require changes not unlike the refactoring I have done to address items 1 and 2.
For now, without knowing anything further about your intent, something that seems sensible to me would be like this:
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <vector>
#include <cstdio>
struct make_list {
__device__ make_list(float* list, int n) { contents = list; size = n; };
float* contents;
int size;
};
__global__ void render(make_list** world) {
int size = (*world)->size; // set a breakpoint here, the size is 0
printf("size = %d\n", size);
}
__global__ void create_world(float* d_list, make_list** d_world, int num_triangles) {
if (threadIdx.x == 0 && blockIdx.x == 0) {
// the class hittable_list contains a counter for the list size, which no matter the
// scene size it always becomes zero
*d_world = new make_list(d_list, num_triangles);
}
}
int main () {
float* d_list;
make_list** d_world;
cudaMalloc(&d_world, sizeof(make_list*));
int size = 8;
std::vector<float> m_triangles(size);
cudaMalloc((void**)&d_list, size * sizeof(float));
cudaMemcpy(d_list, m_triangles.data(), size * sizeof(float), cudaMemcpyHostToDevice);
create_world << <1, 1 >> > (d_list, d_world, size);
cudaDeviceSynchronize();
render << <1, 1 >> > (d_world);
cudaDeviceSynchronize();
return 0;
}

Related

Variable gets lost after allocating array of structs in cuda

I have a structure with arrays of structures inside in C, and I need a copy of that in the GPU. For that I am writing a function that makes some cudaMalloc and cudaMemcpys of the variables in the struct from host to device.
A simple version (the real one has various structs and variables/arrays inside) of the struct is:
struct Node {
float* position;
};
struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
My problem is that I must be doing something wrong in the memory allocation and copy of the struct. When I copy the variables within Graph, I can see that they are properly copied (by accessing it in a kernel as in the example below). For example, I can check that graph.nBoundary=3.
However, I can only see this if I do not allocate and copy the memory of Node *. If I do, I get -858993460 instead of 3. Interestingly, Node * is not wrongly allocated, as I can inspect the value of, say, graph.node[0].position[0] and it has the correct value.
This only happens with the graph.nBoundary. All the other variables remain with the correct numerical values, but this one gets "wronged" when running the cudaMemcpy of the Node*.
What am I doing wrong and why does this happen? How do I fix it?
Let me know if you need more information.
MCVE:
#include <algorithm>
#include <cuda_runtime_api.h>
#include <cuda.h>
// A point, part of some elements
struct Node {
float* position;
};
struct Graph{
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
__global__ void testKernel(Graph* graph,unsigned int * d_res){
d_res[0] = graph->nBoundary;
};
int main()
{
// Generate some fake data on the CPU
Graph graph;
graph.node = (Node*)malloc(2 * sizeof(Node));
graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
for (int i = 0; i < 3; i++){
graph.boundary[i] = i + 10;
}
graph.nBoundary = 3;
graph.nNode = 2;
for (int i = 0; i < 2; i++){
// They can have different sizes in the original code
graph.node[i].position = (float*)malloc(3 * sizeof(float));
graph.node[i].position[0] = 45;
graph.node[i].position[1] = 1;
graph.node[i].position[2] = 2;
}
// allocate GPU memory
Graph * d_graph = cudaGraphMalloc(&graph);
// some dummy variables to test on GPU.
unsigned int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(unsigned int));
h_res = (unsigned int*)malloc(sizeof(unsigned int));
//Run kernel
testKernel << <1, 1 >> >(d_graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));
printf("%u\n", graph.nBoundary);
printf("%d", h_res[0]);
return 0;
}
Graph* cudaGraphMalloc(const Graph* inGraph){
Graph* outGraph;
gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
//copy constants
gpuErrchk(cudaMemcpy(&outGraph->nNode, &inGraph->nNode, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->nBoundary, &inGraph->nBoundary, sizeof(unsigned int), cudaMemcpyHostToDevice));
// copy boundary
unsigned int * d_auxboundary, *h_auxboundary;
h_auxboundary = inGraph->boundary;
gpuErrchk(cudaMalloc((void**)&d_auxboundary, inGraph->nBoundary*sizeof(unsigned int)));
gpuErrchk(cudaMemcpy(d_auxboundary, h_auxboundary, inGraph->nBoundary*sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&outGraph->boundary, d_auxboundary, sizeof(unsigned int *), cudaMemcpyDeviceToDevice));
//Create nodes
Node * auxnode;
gpuErrchk(cudaMalloc((void**)&auxnode, inGraph->nNode*sizeof(Node)));
// Crate auxiliary pointers to grab them from host and pass them to device
float ** d_position, ** h_position;
d_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
h_position = static_cast<float **>(malloc(inGraph->nNode*sizeof(float*)));
for (int i = 0; i < inGraph->nNode; i++){
// Positions
h_position[i] = inGraph->node[i].position;
gpuErrchk(cudaMalloc((void**)&d_position[i], 3 * sizeof(float)));
gpuErrchk(cudaMemcpy(d_position[i], h_position[i], 3 * sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(&auxnode[i].position, d_position[i], sizeof(float *), cudaMemcpyDeviceToDevice));
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
////////////// If I comment the following section, nBoundary can be read by the kernel
///////////////////////////////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////////////////////////////////
gpuErrchk(cudaMemcpy(&outGraph->node, auxnode, inGraph->nNode*sizeof(Node *), cudaMemcpyDeviceToDevice));
return outGraph;
}
The problem is in the function cudaGraphMalloc where you are trying to allocate device memory to the members of outGraph which has already been allocated on the device. In process of doing so, you are de-referencing a device pointer on host which is illegal.
To allocate device memory to members of struct type variable which exists on the device, we first have to create a temporary host variable of that struct type, then allocate device memory to its members, and then copy it to the struct which exists on the device.
I have answered a similar question here. Please take a look at it.
The fixed code may look like this:
#include <algorithm>
#include <cuda_runtime.h>
#include <cuda.h>
// A point, part of some elements
struct Node {
float* position;
};
struct Graph {
unsigned int nNode;
Node* node;
unsigned int nBoundary;
unsigned int* boundary;
};
Graph* cudaGraphMalloc(const Graph* inGraph);
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Reports a failed CUDA runtime call.
//   code  - status returned by the CUDA call being checked
//   file  - source file of the call site (supplied by the gpuErrchk macro)
//   line  - source line of the call site (supplied by the gpuErrchk macro)
//   abort - when true (the default), terminate the process with `code` as exit status
// Does nothing when code is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code == cudaSuccess)
        return;

    const char *description = cudaGetErrorString(code);
    fprintf(stderr, "GPUassert: %s %s %d\n", description, file, line);

    if (!abort)
        return;
    exit(code);
}
__global__ void testKernel(Graph* graph, unsigned int * d_res) {
d_res[0] = graph->nBoundary;
};
int main()
{
// Generate some fake data on the CPU
Graph graph;
graph.node = (Node*)malloc(2 * sizeof(Node));
graph.boundary = (unsigned int*)malloc(3 * sizeof(unsigned int));
for (int i = 0; i < 3; i++) {
graph.boundary[i] = i + 10;
}
graph.nBoundary = 3;
graph.nNode = 2;
for (int i = 0; i < 2; i++) {
// They can have different sizes in the original code
graph.node[i].position = (float*)malloc(3 * sizeof(float));
graph.node[i].position[0] = 45;
graph.node[i].position[1] = 1;
graph.node[i].position[2] = 2;
}
// allocate GPU memory
Graph * d_graph = cudaGraphMalloc(&graph);
// some dummy variables to test on GPU.
unsigned int * d_res, *h_res;
cudaMalloc((void **)&d_res, sizeof(unsigned int));
h_res = (unsigned int*)malloc(sizeof(unsigned int));
//Run kernel
testKernel << <1, 1 >> >(d_graph, d_res);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(h_res, d_res, sizeof(unsigned int), cudaMemcpyDeviceToHost));
printf("%u\n", graph.nBoundary);
printf("%u\n", h_res[0]);
return 0;
}
// Builds a deep copy of *inGraph in device memory and returns a device pointer to it.
//
// The copy is assembled in a host-side staging Graph (`temp`) whose pointer members
// hold DEVICE addresses; the finished staging struct is then uploaded in one piece.
// This avoids dereferencing a device pointer on the host.
//
//   inGraph - host Graph; boundary must hold nBoundary entries and node must hold
//             nNode entries, each with a position array of at least 3 floats.
// Returns a device pointer to the new Graph. Any CUDA failure is reported through
// gpuErrchk (which terminates the process by default).
//
// NOTE: the device addresses held in `temp` and `auxNodeHost` are discarded when this
// function returns, so the nested device allocations cannot be freed by the caller.
// This is for demonstration only; real code must keep those host-side copies.
Graph* cudaGraphMalloc(const Graph* inGraph)
{
    //Create auxiliary Graph variable on host
    Graph temp;

    //copy constants
    temp.nNode = inGraph->nNode;
    temp.nBoundary = inGraph->nBoundary;

    // copy boundary
    const size_t boundaryBytes = inGraph->nBoundary * sizeof(unsigned int);
    gpuErrchk(cudaMalloc((void**)&(temp.boundary), boundaryBytes));
    gpuErrchk(cudaMemcpy(temp.boundary, inGraph->boundary, boundaryBytes, cudaMemcpyHostToDevice));

    //Create nodes
    const size_t nodeBytesTotal = temp.nNode * sizeof(Node);
    gpuErrchk(cudaMalloc((void**)&(temp.node), nodeBytesTotal));

    // Every node carries 3 floats in this example; hoisted out of the loop.
    const size_t nodeBytes = 3 * sizeof(float);

    // The index is unsigned to match nNode, avoiding a signed/unsigned comparison.
    for (unsigned int i = 0; i < temp.nNode; i++)
    {
        //Create auxiliary node on host
        Node auxNodeHost;

        //Allocate device memory to position member of auxiliary node
        gpuErrchk(cudaMalloc((void**)&(auxNodeHost.position), nodeBytes));
        gpuErrchk(cudaMemcpy(auxNodeHost.position, inGraph->node[i].position, nodeBytes, cudaMemcpyHostToDevice));

        //Copy auxiliary host node (which now holds a device pointer) to device
        Node* dPtr = temp.node + i;
        gpuErrchk(cudaMemcpy(dPtr, &auxNodeHost, sizeof(Node), cudaMemcpyHostToDevice));
    }

    // Upload the fully populated staging struct.
    Graph* outGraph;
    gpuErrchk(cudaMalloc((void**)&outGraph, sizeof(Graph)));
    gpuErrchk(cudaMemcpy(outGraph, &temp, sizeof(Graph), cudaMemcpyHostToDevice));
    return outGraph;
}
Be advised that you will have to keep the host copies of internal device pointers (i.e. the auxiliary host variables). This is because you will have to free the device memory later and since you will only have a device copy of Graph in the main code, you won't be able to access its members from the host to call cudaFree on them. In this case the variable Node auxNodeHost (created in each iteration) and Graph temp are those variables.
The above code does not do that and is just for demonstration purpose.
Tested on Windows 10, Visual Studio 2015, CUDA 9.2, NVIDIA Driver 397.44.

Transferring an array pointer into CUDA memory via separate class

I have a class named "Coordinate" which consist of an int array pointer and a bool variable. I want to send this pointer into CUDA, modify it and then use it back in CPU memory.
Here is Coordinate.h :
#ifndef __COORDINATE_H
#define __COORDINATE_H
#include <stdlib.h>
#include <cuda.h>
using namespace std;
class Coordinate {
public:
int *array_pointer;
bool flag;
Coordinate() { flag = false; }
Coordinate(int array_length) {
flag = false;
array_pointer = new int[array_length];
for (int i = 0; i < array_length; i++) {
array_pointer[i] = -1;
}
}
};
#endif
I have made 2 global functions in cudamain.cu, check1 and check2; both take a pointer to a Coordinate as argument. check1 changes only the boolean flag, while check2 changes the boolean flag and also modifies the array.
Here is cudamain.cu :
#include <iostream>
#include <cuda.h>
#include "Coordinate.h"
using namespace std;
// Kernel: sets the flag of the Coordinate that c points to.
// The parameter is named c to match the body (the original declared it as `ptr`
// while dereferencing `c`, which does not compile).
// Every launched thread writes the same value, so the launch shape does not matter.
__global__ void check1(Coordinate *c) {
    c->flag = true;
}
__global__ void check2(Coordinate *c) {
c->flag = true;
for (int i = 0; i < 10; i++) {
c->array_pointer[i] = i;
}
}
int main() {
Coordinate *d_a, *d_b, a, b;
a = Coordinate(10); b = Coordinate(10);
size_t size = sizeof(Coordinate);
cudaMalloc((void**)&d_a, size); cudaMalloc((void**)&d_b, size);
cudaMemcpy(d_a, &a, size, cudaMemcpyHostToDevice); cudaMemcpy(d_b, &b, size, cudaMemcpyHostToDevice);
check1 << <1, 1 >> > (d_a);
cudaMemcpy(&a, d_a, size, cudaMemcpyDeviceToHost);
cout <<"d_a result-> " <<a.flag <<" " <<a.array_pointer[9] << endl;
check2 << <1, 1 >> > (d_b);
cudaMemcpy(&b, d_b, size, cudaMemcpyDeviceToHost);
cout << "d_b result-> " << b.flag << " " << b.array_pointer[9] << endl;
return 0;
}
I made 2 separate coordinate objects a and b, a will go with check1 and b will go with check2. Both a and b are initialized in same way.
The result I get is
d_a result-> 1 -1
d_b result-> 0 -1
Expected result:
d_a result-> 1 -1
d_b result-> 1 9
Different Coordinate objects may have different array length so I can't initialize the array pointer in the coordinate class.
You cannot access host memory from a CUDA kernel by dereferencing a pointer to it, unless that piece of memory was specially allocated to allow this, e.g. using cudaMallocManaged(). So your program cannot work. Read this Parallel4All post on accessing the same memory both from the host and the device. Another alternative is the one @RobertCrovella linked to, involving allocating device-side memory.
But, frankly, I doubt any of these two options are what you should go for in this case, since a class named Coordinate does not seem to be something which would need a variable-size array of integers. Are you sure something like
// Coordinate with a compile-time number of dimensions: the components live
// inside the object itself, so no dynamic allocation is involved.
// This is the primary template, so the class name carries no <...> argument list.
template <unsigned NumDimensions>
class Coordinate {
    std::array<int, NumDimensions> a;
    // etc. etc.
};
won't do?
(Note that the std::array class itself cannot really be used in device code, like most of the standard library. But you can easily clone std::array and then use your cuda::array class on both the host and the device side.)
Even if dynamic allocation of memory is required for some reason, it is not a good idea to have a class which, it seems, would be used many times, allocate its own memory. Consider using some pre-allocated buffer and have your Coordinates just advance an offset into it (although this would require synchronization for thread safety, or making the buffer thread-local).

Why does this reverse function not work

In the constructor I fill the array on the device side.
but now I want to execute reverse function on the array.
using namespace std;
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
__global__ void generateVector(int *data,int count){
int tid = blockIdx.x;
data[tid] = -tid;
}
__global__ void reverseArray(int *data,int count){
int tid = blockIdx.x;
data[tid] = tid;
}
class FData{
private:
int *data;
int size;
public:
FData(int sizeP){
size = sizeP;
data = new int[size];
int *devA;
cudaMalloc((void**) &devA, size * sizeof(int));
generateVector<<<size,1>>>(devA,size);
cudaMemcpy(data,devA, size * sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(devA);
}
~FData(){
delete [] data;
}
int getSize(){
return size;
}
int elementAt(int i){
return data[i];
}
void reverse(){
int *devA;
cudaMalloc((void**) &devA, sizeof(int));
reverseArray<<<size,1>>>(devA,size);
cudaMemcpy(data,devA,size * sizeof(int),cudaMemcpyDeviceToHost);
cudaFree(devA);
}
};
int main(void) {
FData arr(30);
cout << arr.elementAt(1);
arr.reverse();
cout << arr.elementAt(1);
return 0;
}
It still prints the values which I filled in the constructor. What is the problem here? How can i solve it? What is going wrong?
Your kernels aren't reversing anything. They're just negating the values, so if anything I would be quite surprised if you saw anything get reversed. With that said, if you add error checking to your code (see this other SO post on how best to do the error checking) then you'll see that your code will fail on the call to cudaMalloc in your reverse function. You can fix this by changing devA to be a plain pointer (it doesn't really make sense for you to be allocating it as a host-array anyways, as you're not using it on the host to begin with).
// Overwrites the host buffer `data` with the output of the reverseArray kernel.
// A scratch device buffer of `size` ints is allocated, filled by the kernel
// (one single-thread block per element), copied back into `data`, and released.
// If any CUDA call fails, a message is written to stderr and the remaining steps
// are skipped; the scratch buffer is still released.
void reverse(){
    // A grid of zero blocks is an invalid launch configuration, so there is nothing to do.
    if (size <= 0) {
        return;
    }
    const size_t bytes = size * sizeof(int);
    int *devA = NULL;

    cudaError_t err = cudaMalloc((void**) &devA, bytes);
    if (err == cudaSuccess) {
        reverseArray<<<size,1>>>(devA,size);
        // Launches do not return a status; launch-time errors are read here.
        err = cudaGetLastError();
    }
    if (err == cudaSuccess) {
        // Blocking copy: also reports errors raised while the kernel was running.
        err = cudaMemcpy(data,devA,bytes, cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess) {
        fprintf(stderr, "FData::reverse failed: %s\n", cudaGetErrorString(err));
    }
    // Safe when the allocation failed: cudaFree on a null pointer performs no operation.
    cudaFree(devA);
}
Also, you should free your memory too; you have both host-side and device-side memory leaks. Whenever you have a cudaMalloc call, you should have a corresponding cudaFree. Also, consider adding a destructor to free your host-side data member, as you have a memory leak there too.
// Destructor: releases the host-side array held in `data`.
// array-form delete, for a buffer obtained with new[]
~FData()
{
delete [] data;
}

C++ abstraction of libpng causes Crash in malloc -- While deleting a "Jagged" 2D Array

The class corresponding to this crash is:
#ifndef IMAGE_DATA_
#define IMAGE_DATA_
#include <stdexcept>
template <typename data_type>
class ImageData
{
public:
ImageData(unsigned long width, unsigned long height);
~ImageData();
data_type **&get_data();
unsigned long int get_width() const
{
return _m_Width;
}
unsigned long int get_height() const
{
return _m_Height;
}
protected:
ImageData(ImageData &copy);
ImageData& operator= (ImageData &copy);
private:
data_type **_m_rData;
unsigned long _m_Width;
unsigned long _m_Height;
};
// Allocates a height x width image as a table of `height` row pointers, each
// pointing at its own array of `width` elements.
//
//   width  - number of elements per row; must be non-zero
//   height - number of rows; must be non-zero
// Throws std::runtime_error if either dimension is zero or if memory runs out.
// On allocation failure everything allocated so far is released before throwing,
// because the destructor does not run for an object whose constructor threw.
template <typename data_type>
ImageData<data_type>::ImageData(unsigned long width, unsigned long height) :
    _m_rData(NULL),
    _m_Width(width),
    _m_Height(height)
{
    if (width == 0 || height == 0)
        throw std::runtime_error("Invalid width or height");
    try {
        // The trailing () value-initializes the table, so every row pointer starts
        // out null and the cleanup below can delete[] each entry unconditionally.
        _m_rData = new data_type*[_m_Height]();
        for (unsigned long int i = 0; i < _m_Height; ++i) {
            _m_rData[i] = new data_type[_m_Width];
        }
    }
    catch (const std::bad_alloc &) {
        if (_m_rData != NULL) {
            for (unsigned long int i = 0; i < _m_Height; ++i) {
                delete [] _m_rData[i];   // delete[] on a null pointer is a no-op
            }
            delete [] _m_rData;
            _m_rData = NULL;
        }
        throw std::runtime_error("Failure to create space for Image");
    }
}
template <typename data_type>
ImageData<data_type>::~ImageData()
{
for (unsigned long i = 0; i < _m_Height; ++i) {
delete [] _m_rData[i];
_m_rData[i] = NULL;
}
delete [] _m_rData;
_m_rData = NULL;
}
template <typename data_type>
data_type **&ImageData<data_type>::get_data()
{
return _m_rData;
}
#endif
And it is used in the following manner:
PNGFileReader::PNGFileReader(const std::string &path) :
_m_Image(NULL),
_m_pPNG(NULL),
_m_pPNGInfo(NULL)
{
...
/*
* Read Image in all at once into users data
*/
_m_Image = new ImageData<unsigned char>(width, height);
png_read_image(_m_pPNG, _m_Image->get_data());
png_read_end(_m_pPNG, NULL);
fclose(_m_CFilePointer);
_m_CFilePointer = NULL;
}
PNGFileReader::~PNGFileReader()
{
if (_m_CFilePointer) {
fclose(_m_CFilePointer);
}
png_destroy_read_struct(&_m_pPNG, &_m_pPNGInfo, NULL);
delete _m_Image;
}
When stepping through with the debugger the _m_rData in the ImageData class is the same pointer as when I used new on it. I have even tried to wrap the delete statement inside ImageData destructor with if == NULL statments. However, I still get a sigabrt while running my code. The stack trace from gdb is:
0 __GI_raise raise.c 64 0x3512a36285
1 __GI_abort abort.c 91 0x3512a37b9b
2 __libc_message libc_fatal.c 198 0x3512a77a7e
3 malloc_printerr malloc.c 5021 0x3512a7dda6
4 _int_free malloc.c 3942 0x3512a7f08e
5 ImageData<unsigned char>::~ImageData imagedata.h 57 0x40236d
6 PNGFileReader::~PNGFileReader pngfilereader.cpp 59 0x401ed3
7 main main.cpp 8 0x40246a
UPDATE
For anyone who is curious, the following now works. Apparently it is an issue with how libpng aligns its data. I guess this forces you to use libpng's own allocation calls, which internally use malloc and free, not new. What I had before was essentially the same thing as calling free(data) where data was created with data = new type[N]. The code below depicts how to correctly use libpng.
#ifndef PNG_FILE_READER_H_
#define PNG_FILE_READER_H_
#include "imagedata.h"
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <png.h>
#include <iostream>
#include <vector>
#include <string>
template <typename data_type>
class ImageData;
class PNGFileReader
{
public:
// Ctor and Dtor
PNGFileReader(const std::string &path);
~PNGFileReader();
// For testing purposes
friend std::ostream &operator<< (std::ostream &out,
PNGFileReader *object)
{
for (unsigned long i = 0; i < object->get_image_height(); ++i) {
for (unsigned long j = 0; j < object->get_image_width(); ++j) {
png_byte c = object->_m_ImageData[i][j];
out << c;
}
}
return out;
}
// Getters
long unsigned int get_image_width() const;
long unsigned int get_image_height() const;
private:
// Helper functions:
bool _create_png_structs();
// Member variables:
FILE *_m_CFilePointer;
unsigned long int _m_ImageWidth;
unsigned long int _m_ImageHeight;
png_bytepp _m_ImageData;
png_structp _m_pPNG;
png_infop _m_pPNGInfo;
// Enums
enum PNGBOOL {NOT_PNG, PNG};
enum PNGERRORS {ERROR, SUCCESS};
};
#endif /* PNG_FILE_READER_H_ */
#include "pngfilereader.h"
#include "filereader.h"
#include <stdexcept>
PNGFileReader::PNGFileReader(const std::string &path) :
_m_ImageData(NULL),
_m_pPNG(NULL),
_m_pPNGInfo(NULL)
{
/*
* Check if first 8 bytes are the correct PNG header
*/
enum {BYTES_TO_READ = 8};
unsigned char sig[BYTES_TO_READ];
FileReader(path, sig, BYTES_TO_READ);
bool not_png = png_sig_cmp(sig, 0, BYTES_TO_READ);
if (not_png) {
throw std::runtime_error("Your file is not of PNG format");
}
/*
* Create the png structs using a FILE *. libpng requires
* this type and will not take a C++ stream
*/
_m_CFilePointer = fopen(path.c_str(), "rb");
if (!_m_CFilePointer) {
throw std::runtime_error("Failure to open PNG file");
}
if (!_create_png_structs()) {
throw std::runtime_error("Failure to create PNG structs");
}
/*
* Initialize PNG io and read data into PNG structs
*/
png_init_io(_m_pPNG, _m_CFilePointer);
png_read_info(_m_pPNG, _m_pPNGInfo);
_m_ImageHeight = png_get_image_height(_m_pPNG, _m_pPNGInfo);
_m_ImageWidth = png_get_rowbytes(_m_pPNG, _m_pPNGInfo);
/*
* Create sufficient PNG Space and Read Image in all at
* once into users data. Note that you have to use png's
* types to prevent sigabrt (6) while freeing memory.
*/
_m_ImageData = (png_bytepp)png_malloc(_m_pPNG,
sizeof(png_bytep)*_m_ImageHeight);
if (_m_ImageData == NULL) {
throw std::runtime_error("Memory allocation failure");
}
for (unsigned long int i = 0; i < _m_ImageHeight; ++i) {
_m_ImageData[i] = NULL;
}
for (unsigned long int i = 0; i < _m_ImageHeight; ++i) {
_m_ImageData[i] = (png_bytep)png_malloc(_m_pPNG,
sizeof(png_byte)*_m_ImageWidth);
if (_m_ImageData[i] == NULL) {
throw std::runtime_error("Memory allocation failure.");
}
}
png_read_image(_m_pPNG, _m_ImageData);
png_read_end(_m_pPNG, NULL);
fclose(_m_CFilePointer);
_m_CFilePointer = NULL;
}
// Releases everything the reader owns: the still-open file (if any), the image
// rows, the row-pointer table, and finally the libpng read/info structs.
// Memory obtained from png_malloc is returned with png_free, which needs the
// png struct, so all image memory is released before the structs are destroyed.
PNGFileReader::~PNGFileReader()
{
    if (_m_CFilePointer) {
        fclose(_m_CFilePointer);
        _m_CFilePointer = NULL;
    }
    /*
     * Free all resources (-1)
     */
    png_free_data(_m_pPNG, _m_pPNGInfo, PNG_FREE_ALL, -1);
    if (_m_ImageData != NULL) {
        for (unsigned long int i = 0; i < _m_ImageHeight; ++i) {
            png_free(_m_pPNG, _m_ImageData[i]);
        }
        // The table itself came from png_malloc as well, so it must go back
        // through png_free rather than free().
        png_free(_m_pPNG, _m_ImageData);
        _m_ImageData = NULL;
    }
    png_destroy_read_struct(&_m_pPNG, &_m_pPNGInfo, NULL);
}
// Getters
long unsigned int PNGFileReader::get_image_width() const
{
return _m_ImageWidth;
}
long unsigned int PNGFileReader::get_image_height() const
{
return _m_ImageHeight;
}
// Private helper functions
bool PNGFileReader::_create_png_structs()
{
/*
* Create the pointer to main libpng struct, as well as
* two info structs to maintain information after, and
* prior to all operations on png m_Data. Only necessary
* to release resource after function succeeds.
*/
_m_pPNG = png_create_read_struct(PNG_LIBPNG_VER_STRING, (png_voidp)NULL,
NULL, NULL);
if (!_m_pPNG){
return PNGFileReader::ERROR;
}
_m_pPNGInfo = png_create_info_struct(_m_pPNG);
if (!_m_pPNGInfo) {
return PNGFileReader::ERROR;
}
return PNGFileReader::SUCCESS;
}
If you need a really 2D array to pass to a library, but want to have the flexibility of a jagged array, what you do is
Allocate the first level pointer block as usual
Instead of allocating m separate rows of n cells (one for each pointer in the first level block) you allocate a single set of n*m cells and then set the first level pointers to point at every nth location. This way the main allocation is sized and laid out in memory just as a 2D array, but you can still use the two-pointer-dereference [][] syntax to get to the cells.
Pass the start of the second level allocation to the library.
This works because there are strict requirements on how multidimensional arrays are laid out in memory (i.e. they must be contiguous at every level of interpretation).

CUDA - copy to array within array of Objects

I have a CUDA application I'm working on with an array of Objects; each object has a pointer to an array of std::pair<int, float>. I'm trying to cudaMemcpy the array of objects over, then cudaMemcpy the array of pairs to each of the objects; however, this is giving me all kinds of grief. It crashes attempting to copy to the inner array; I don't understand how to move this over...
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
using namespace std;
class Object
{
public:
int id;
float something;
std::pair<int, float> *somePairs;
};
Object *objects;
// Builds the host-side test data: allocates the global `objects` array with 10
// entries and gives each entry its id, a float copy of that id, and a freshly
// allocated array of 10 (index, index-as-float) pairs.
void initObjects()
{
    objects = new Object[10];
    for( int idx = 0; idx < 10; idx++ )
    {
        objects[idx].id = idx;
        objects[idx].something = (float) idx;
        objects[idx].somePairs = new std::pair<int, float>[10];
        // Start at 0: the original started at 10, so the loop body never ran
        // and the pairs were never filled in.
        for ( int jdx = 0; jdx < 10; jdx++ )
        {
            objects[idx].somePairs[jdx] = std::pair<int, float>( jdx, (float) jdx );
        }
    }
}
// Attempts to copy the global host array `objects` (10 entries) to the device,
// then to give each device-side Object its own device array of 10 pairs.
void cudaMemcpyObjects()
{
Object *devObjects;
cudaMalloc( &devObjects, sizeof(Object) * 10 );
// Bitwise copy of the Objects: the somePairs members copied here still hold host addresses.
cudaMemcpy( devObjects, objects, sizeof(Object) * 10, cudaMemcpyHostToDevice );
for ( int idx = 0; idx < 10; idx++ )
{
size_t pairSetSize = sizeof(std::pair<int, float>) * 10;
// CRASH HERE ... v
// NOTE(review): devObjects points to device memory, so &(devObjects[idx].somePairs)
// is a device address handed to cudaMalloc, which writes the new pointer from the
// host; devObjects[idx].somePairs in the next call is likewise a host-side read
// through a device pointer.
cudaMalloc( &(devObjects[idx].somePairs), pairSetSize );
cudaMemcpy( devObjects[idx].somePairs, objects[idx].somePairs,
sizeof( std::pair<int, float> ) * 10, cudaMemcpyHostToDevice );
}
}
int main()
{
initObjects();
cudaMemcpyObjects();
return 0;
}
My CUDA experience is only in its infancy, but I believe the error is like this:
cudaMalloc is a host function that wants to write the pointer into host memory. However, you are passing to it a pointer in device memory!
To fix this, you should first create the device pointers and fill them into your host object structure, and only then copy the whole thing over to the device, and also copy the individual pairs over to the device as well.
Schematically:
struct Bar;
struct Foo
{
int tag;
Bar * bp;
};
// Schematic: build an array of 10 Foo on the device, each pointing at its own
// device-side Bar.
//   1. Allocate the device array of Foo.
//   2. For each element, allocate a Bar on the device, upload its contents, and
//      store the resulting DEVICE pointer in the HOST staging entry hFoo[i].bp.
//   3. Upload the whole staging array in a single copy.
// Bar must be a complete type by this point; the forward declaration alone does
// not allow sizeof(Bar) or a local Bar object.
// Return codes are not checked and nothing is released here. Real code should
// check every call and keep hFoo (or the dBar values) so the nested device
// allocations can be freed later.
void setup()
{
    Foo * hFoo = new Foo[10];
    Foo * dFoo;
    cudaMalloc(&dFoo, sizeof(Foo) * 10);   // pass the ADDRESS of the pointer to be filled in
    for (size_t i = 0; i != 10; ++i)
    {
        Bar * dBar;
        cudaMalloc(&dBar, sizeof(Bar));
        Bar b;                             // automatic temporary -- we never keep a host copy of this
        cudaMemcpy(dBar, &b, sizeof(Bar), cudaMemcpyHostToDevice);
        hFoo[i].bp = dBar;                 // this is already a device pointer!
    }
    cudaMemcpy(dFoo, hFoo, sizeof(Foo) * 10, cudaMemcpyHostToDevice);
}
On the return, don't forget that the Foo::bp are device pointers that you still need to copy back one by one!
It would probably be easier to just have one self-contained class that you can move in one go, but that may not be practical, or desirable for reasons of memory locality. You have to think carefully about this. If the member is just a pair, why not put the two items in the main class directly?