Initialize an unsigned char using cudaMemcpy - c++

I allocated the memory for an array of unsigned char using cudaMalloc and then initialized it using cudaMemset.
unsigned char *device_channel_data;
cudaMalloc( device_channel_data, sizeof(unsigned char) * image_size);
cudaMemset( *device_channel_data, 0, sizeof(unsigned char) * image_size);
After that I'm checking whether it is really set to 0 by copying the data back to the host. I'm printing some of the elements to check the data, but the values printed are random.
unsigned char *host_channel_channel;
cudaMemcpy(host_channel_channel, device_channel_data, sizeof(unsigned char) * image_size, cudaMemcpyDeviceToHost);
for(int i = 0; i < 10; i ++)
{
std::cout<< (int)host_channel_channel[i] << std::endl;
}
I want to initialize device_channel_data to 0.
My knowledge of pointers and CUDA programming is very limited; I'm just starting with CUDA programming. Thanks in advance for the help.

The answer to your question is in the CUDA documentation: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
For error handling (see "Use of cudamalloc(). Why the double pointer?"), the CUDA API needs a pointer to a pointer as the input parameter when allocating memory, while cudaMemset takes a plain device pointer. Change your code to:
unsigned char *device_channel_data;
cudaMalloc( (void**)&device_channel_data, sizeof(unsigned char) * image_size);
cudaMemset( device_channel_data, 0, sizeof(unsigned char) * image_size);
and everything should work fine.
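For completeness, here is a minimal end-to-end sketch of the corrected round trip. It assumes image_size is defined elsewhere; note that the host buffer also needs backing memory before copying into it (in the snippet shown, host_channel_channel is declared but never allocated):
unsigned char *device_channel_data;
cudaMalloc((void**)&device_channel_data, sizeof(unsigned char) * image_size);
cudaMemset(device_channel_data, 0, sizeof(unsigned char) * image_size);
// allocate the host buffer before copying into it
unsigned char *host_channel_channel = new unsigned char[image_size];
cudaMemcpy(host_channel_channel, device_channel_data, sizeof(unsigned char) * image_size, cudaMemcpyDeviceToHost);
for (int i = 0; i < 10; i++)
    std::cout << (int)host_channel_channel[i] << std::endl;   // prints 0 for every element
delete[] host_channel_channel;
cudaFree(device_channel_data);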

Related

Assigning Values to unsigned short* buffer

I am having issues assigning values to unsigned short* and unsigned char* buffers. The code looks something like this:
// Header File
unsigned short* short_buff;
unsigned char* char_buff;
// Implementation File
short_buff = new unsigned short[10];
memset(&short_buff, 0, 10);
char_buff = new unsigned char[10];
memset(&char_buff, 0, 10);
unsigned short sVal = 0;
unsigned char cVal = 0x0;
// All of these cause core dumps
short_buff[0] = sVal;
memcpy(&short_buff[0], &sVal, 2); // 2 bytes per unsigned short
std::cout << short_buff[0] << std::endl;
// All of these also cause core dumps
char_buff[0] = cVal;
memcpy(&char_buff[0], &cVal, 1); // 1 byte per unsigned char
std::cout << char_buff[0] << std::endl;
// Yet strangely these cause no issues
unsigned short s2Val = short_buff[0];
unsigned char c2Val = char_buff[0];
I am completely at a loss as to what is going on here and why. Any help would be greatly appreciated!
memset(short_buff, 0, 10*sizeof(short));
and
memset(char_buff, 0, 10*sizeof(char));
Two mistakes. First, the & is wrong: you should pass the value of the pointer to memset, not the address of the pointer variable. (The form memset(&short_buff[0], ...); also works.)
Secondly, memset counts bytes, not elements, so you need to multiply the array size by the element size; use sizeof for that.
Strangely you got it more or less right with memcpy later on. Why not the same thing for memset?
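Putting both corrections together, a minimal sketch of the fixed sequence (the values written are arbitrary):
unsigned short* short_buff = new unsigned short[10];
memset(short_buff, 0, 10 * sizeof(unsigned short));   // 20 bytes, pointer value passed
unsigned char* char_buff = new unsigned char[10];
memset(char_buff, 0, 10 * sizeof(unsigned char));     // 10 bytes
short_buff[0] = 42;   // safe now: the buffers (and the pointers to them) are intact
char_buff[0] = 'a';
delete[] short_buff;
delete[] char_buff;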

Access an array of pointers to objects on device CUDA

New to CUDA and GPU programming, and having trouble with copying an array of object pointers to the device.
I have a vector of object pointers that I will be working with in device code; each object contains two vectors.
I need to somehow copy that array into device memory, but even after reading similar solutions I still can't figure it out.
This is the structure of the object I'm working with:
std::vector<int> retVals;
std::vector<int> children{4};
So not only do I need to make a copy of the array, I also need to convert these vectors to int arrays in each object.
EDIT:
This is what I have come up with so far:
auto **nodesPtr = ( aho_corasick::Node**)malloc(a->nodes.size() * sizeof(aho_corasick::Node *));
int i = 0;
for (auto &node: a->nodes){
auto *newNode = new aho_corasick::Node(' ');
cudaMalloc((void**)&(newNode->cudaChildren), sizeof(int) * node->children.size());
cudaMemcpy(newNode->cudaChildren, node->children.data(), sizeof(int) * node->children.size(), cudaMemcpyHostToDevice);
cudaMalloc((void**)&(newNode->cudaRets), sizeof(int) * node->retVals.size());
cudaMemcpy(newNode->cudaRets, node->children.data(), sizeof(int) * node->retVals.size(), cudaMemcpyHostToDevice);
aho_corasick::Node* devNode;
cudaMalloc((void**)&devNode, sizeof(aho_corasick::Node));
cudaMemcpy(devNode, newNode, sizeof(aho_corasick::Node), cudaMemcpyHostToDevice);
nodesPtr[i++] = devNode;
}
aho_corasick::Node **devNodes;
cudaMalloc((void **)&devNodes, a->nodes.size() * sizeof(aho_corasick::Node *));
cudaMemcpy(devNodes, nodesPtr, a->nodes.size() * sizeof(aho_corasick::Node *), cudaMemcpyHostToDevice);
Still does not seem to work, though.
Also, how bad is such code in CUDA terms, and how would I get around using the array of pointers?
EDIT2:
Forgot to point out: I added two additional fields to my objects, two int arrays. In the for loop I create a new object, copy the two corresponding vectors into its int array fields, and then create a new object in device memory with those fields.
Then, after the loop, I allocate the array of object pointers in device memory.
auto **nodesPtr = (aho_corasick::Node **) malloc(a->nodes.size() * sizeof(aho_corasick::Node *));
int i = 0;
for (auto &node: a->nodes) {
auto *newNode = new aho_corasick::Node(' ');
cudaMalloc((void **) &(newNode->cudaChildren), sizeof(int) * node->children.size());
cudaMemcpy(newNode->cudaChildren, node->children.data(), sizeof(int) * node->children.size(),
cudaMemcpyHostToDevice);
cudaMalloc((void **) &(newNode->cudaRets), sizeof(int) * node->retVals.size());
cudaMemcpy(newNode->cudaRets, node->retVals.data(), sizeof(int) * node->retVals.size(),
cudaMemcpyHostToDevice);
newNode->retsCount = node->retVals.size();
aho_corasick::Node *devNode;
cudaMalloc((void **) &devNode, sizeof(aho_corasick::Node));
cudaMemcpy(devNode, newNode, sizeof(aho_corasick::Node), cudaMemcpyHostToDevice);
nodesPtr[i++] = devNode;
}
aho_corasick::Node **devNodes;
cudaMalloc((void **) &devNodes, a->nodes.size() * sizeof(aho_corasick::Node *));
cudaMemcpy(devNodes, nodesPtr, a->nodes.size() * sizeof(aho_corasick::Node *), cudaMemcpyHostToDevice);
This is how I copied my array of objects to the device. However, as it turns out, this approach is not acceptable for my task: it takes a few hours to copy all the objects.
Closing this thread, but if someone knows how to avoid using an array of pointers and therefore avoid all this copying, please let me know.
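For anyone landing here later: one common way to avoid the array of device pointers (and the thousands of tiny transfers) is to flatten the per-node vectors into a few contiguous host arrays and copy each of those once. A rough sketch under the assumptions above (a->nodes as in the question; all other names are made up for illustration):
std::vector<int> all_children, all_rets;
std::vector<int> child_off, child_cnt, ret_off, ret_cnt;
for (auto &node : a->nodes) {
    child_off.push_back((int)all_children.size());
    child_cnt.push_back((int)node->children.size());
    all_children.insert(all_children.end(), node->children.begin(), node->children.end());
    ret_off.push_back((int)all_rets.size());
    ret_cnt.push_back((int)node->retVals.size());
    all_rets.insert(all_rets.end(), node->retVals.begin(), node->retVals.end());
}
// each flattened host vector now goes to the device with a single cudaMalloc/cudaMemcpy pair
int *d_children;
cudaMalloc((void**)&d_children, all_children.size() * sizeof(int));
cudaMemcpy(d_children, all_children.data(), all_children.size() * sizeof(int), cudaMemcpyHostToDevice);
// ...repeat for all_rets, child_off, child_cnt, ret_off, ret_cnt
A kernel then reads node i's children as d_children[child_off[i] + j], so there is no device-side pointer chasing, and the total copy time scales with the amount of data rather than the number of nodes.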

SIGSEGV in CUDA allocation

I have a host array of uint64_t of size spectrum_size that I need to allocate and copy to my GPU.
But when I try to allocate it in GPU memory, I keep receiving SIGSEGV... Any ideas?
uint64_t * gpu_hashed_spectrum;
uint64_t * gpu_hashed_spectrum_h = new uint64_t [spectrum_size];
HANDLE_ERROR(cudaMalloc((void **)&gpu_hashed_spectrum, sizeof(uint64_t *) * spectrum_size));
for(i=0; i<spectrum_size; i++) {
HANDLE_ERROR(cudaMalloc((void **)&gpu_hashed_spectrum_h[i], sizeof(uint64_t)));
}
printf("\t\t...Copying\n");
for(i=0; i<spectrum_size; i++) {
HANDLE_ERROR(cudaMemcpy((void *)gpu_hashed_spectrum_h[i], (const void *)hashed_spectrum[i], sizeof(uint64_t), cudaMemcpyHostToDevice));
}
HANDLE_ERROR(cudaMemcpy(gpu_hashed_spectrum, gpu_hashed_spectrum_h, spectrum_size * sizeof(uint64_t *), cudaMemcpyHostToDevice));
Full code available here
UPDATE:
I tried to do it this way; now I get SIGSEGV in other parts of the code (in the kernel, when using this array). Maybe it is due to other errors.
uint64_t * gpu_hashed_spectrum;
HANDLE_ERROR(cudaMalloc((void **)&gpu_hashed_spectrum, sizeof(uint64_t) * spectrum_size));
HANDLE_ERROR(cudaMemcpy(gpu_hashed_spectrum, hashed_spectrum, spectrum_size * sizeof(uint64_t), cudaMemcpyHostToDevice));
At the very least, you are confused about uint64_t** and uint64_t*.
At line 1, you define gpu_hashed_spectrum as a pointer to data of type uint64_t, but at line 3
HANDLE_ERROR(cudaMalloc((void **)&gpu_hashed_spectrum, sizeof(uint64_t *) * spectrum_size));
you use gpu_hashed_spectrum as a pointer to data of type uint64_t*.
Maybe you should change your definition to
uint64_t** gpu_hashed_spectrum;
As well as some other lines.
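For illustration, here is roughly what the consistently double-pointer variant could look like if you keep the per-element allocations (a sketch, assuming hashed_spectrum is a host uint64_t array of length spectrum_size); the flat single-pointer version in your UPDATE is simpler and usually the better choice:
uint64_t **gpu_hashed_spectrum;                                   // device array of device pointers
uint64_t **gpu_hashed_spectrum_h = new uint64_t*[spectrum_size];  // host staging array
HANDLE_ERROR(cudaMalloc((void **)&gpu_hashed_spectrum, spectrum_size * sizeof(uint64_t *)));
for (size_t i = 0; i < spectrum_size; i++) {
    HANDLE_ERROR(cudaMalloc((void **)&gpu_hashed_spectrum_h[i], sizeof(uint64_t)));
    HANDLE_ERROR(cudaMemcpy(gpu_hashed_spectrum_h[i], &hashed_spectrum[i], sizeof(uint64_t), cudaMemcpyHostToDevice));
}
HANDLE_ERROR(cudaMemcpy(gpu_hashed_spectrum, gpu_hashed_spectrum_h, spectrum_size * sizeof(uint64_t *), cudaMemcpyHostToDevice));
delete[] gpu_hashed_spectrum_h;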

How to copy data from unsigned int to ulong4 in CUDA

.h file:
#define VECTOR_SIZE 1024
.cpp file:
int main ()
{
unsigned int* A;
A = new unsigned int [VECTOR_SIZE];
CopyToDevice (A);
}
.cu file:
void CopyToDevice (unsigned int *A)
{
ulong4 *UA;
unsigned int VectorSizeUlong4 = VECTOR_SIZE / 4;
unsigned int VectorSizeBytesUlong4 = VectorSizeUlong4 * sizeof(ulong4);
cudaMalloc( (void**)&UA, VectorSizeBytesUlong4 );
// how to use cudaMemcpy to copy data from A to UA?
// I tried to do the following but it gave access violation error:
for (int i=0; i<VectorSizeUlong4; ++i)
{
UA[i].x = A[i*4 + 0];
UA[i].y = A[i*4 + 1];
UA[i].z = A[i*4 + 2];
UA[i].w = A[i*4 + 3];
}
// I also tried to copy *A to device and then work on it instead going back to CPU to access *A every time but this did not work again
}
The CUDA ulong4 is a 16 byte aligned structure defined as
struct __builtin_align__(16) ulong4
{
unsigned long int x, y, z, w;
};
this means that four consecutive 32-bit unsigned source integers occupy the same amount of memory as the ulong4 you want to populate with them. The simplest solution is contained right in the text of the image you posted: just cast (either implicitly or explicitly) the unsigned int pointer to a ulong4 pointer, use cudaMemcpy directly on the host and device memory, and pass the resulting device pointer to whatever kernel function you have that requires a ulong4 input. Your device transfer function could look something like:
ulong4* CopyToDevice (unsigned int* A)
{
ulong4 *UA, *UA_h;
size_t VectorSizeUlong4 = VECTOR_SIZE / 4;
size_t VectorSizeBytesUlong4 = VectorSizeUlong4 * sizeof(ulong4);
cudaMalloc( (void**)&UA, VectorSizeBytesUlong4);
UA_h = reinterpret_cast<ulong4*>(A); // not necessary but increases transparency
cudaMemcpy(UA, UA_h, VectorSizeBytesUlong4, cudaMemcpyHostToDevice);
return UA;
}
[Usual disclaimer: written in browser, not tested or compiled, use at own risk]
This should raise all alarm bells:
cudaMalloc( (void**)&UA, VectorSizeBytesUlong4 );
// ...
UA[i].x = A[i*4 + 0];
You are allocating UA on the device and then using it in host code. Don't ever do that. You will need to use cudaMemcpy to copy arrays to the device. This tutorial shows you a basic program that uses cudaMemcpy to copy things over. The length argument to cudaMemcpy is the length of your array in bytes, which in your case is VECTOR_SIZE * sizeof(unsigned int).
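In code, the copy this answer describes is a single call (a sketch, assuming UA was allocated with the cudaMalloc above and A holds VECTOR_SIZE unsigned ints on the host):
cudaMemcpy(UA, A, VECTOR_SIZE * sizeof(unsigned int), cudaMemcpyHostToDevice);   // host -> device, size in bytes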

2d char array to CUDA kernel

I need help with transferring a char[][] to a CUDA kernel. This is my code:
__global__
void kernel(char** BiExponent){
for(int i=0; i<500; i++)
printf("%c",BiExponent[1][i]); // I want print line 1
}
int main(){
char (*Bi2dChar)[500] = new char [5000][500];
char **dev_Bi2dChar;
...//HERE I INPUT DATA TO Bi2dChar
size_t host_orig_pitch = 500 * sizeof(char);
size_t pitch;
cudaMallocPitch((void**)&dev_Bi2dChar, &pitch, 500 * sizeof(char), 5000);
cudaMemcpy2D(dev_Bi2dChar, pitch, Bi2dChar, host_orig_pitch, 500 * sizeof(char), 5000, cudaMemcpyHostToDevice);
kernel <<< 1, 512 >>> (dev_Bi2dChar);
free(Bi2dChar); cudaFree(dev_Bi2dChar);
}
I use:
nvcc.exe" -gencode=arch=compute_20,code=\"sm_20,compute_20\" --use-local-env --cl-version 2012 -ccbin
Thanks for help.
cudaMemcpy2D doesn't actually handle 2-dimensional (i.e. double pointer, **) arrays in C.
Note that the documentation indicates it expects single pointers, not double pointers.
Generally speaking, moving arbitrary double-pointer C arrays between the host and the device is more complicated than moving a single-pointer array.
If you really want to handle the double-pointer array, then search on "CUDA 2D Array" in the upper right hand corner of this page, and you'll find various examples of how to do it. (For example, the answer given by @talonmies here.)
Often, an easier approach is simply to "flatten" the array so it can be referenced by a single pointer, i.e. char[] instead of char[][], and then use index arithmetic to simulate 2-dimensional access.
Your flattened code would look something like this:
(the code you provided is an uncompilable, incomplete snippet, so mine is also)
#define XDIM 5000
#define YDIM 500
__global__
void kernel(char* BiExponent){
for(int i=0; i<500; i++)
printf("%c",BiExponent[(1*XDIM)+i]); // I want print line 1
}
int main(){
char (*Bi2dChar)[YDIM] = new char [XDIM][YDIM];
char *dev_Bi2dChar;
...//HERE I INPUT DATA TO Bi2dChar
cudaMalloc((void**)&dev_Bi2dChar,XDIM*YDIM * sizeof(char));
cudaMemcpy(dev_Bi2dChar, &(Bi2dChar[0][0]), XDIM*YDIM * sizeof(char), cudaMemcpyHostToDevice);
kernel <<< 1, 512 >>> (dev_Bi2dChar);
delete [] Bi2dChar; cudaFree(dev_Bi2dChar);  // Bi2dChar came from new[], so delete[] rather than free
}
If you want a pitched array, you can create it similarly, but you will still do so as single pointer arrays, not double pointer arrays.
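For reference, a sketch of that pitched variant (still a single pointer; the names mirror the flattened example above):
size_t pitch;
char *dev_pitched;
cudaMallocPitch((void**)&dev_pitched, &pitch, YDIM * sizeof(char), XDIM);
cudaMemcpy2D(dev_pitched, pitch,                      // destination and its pitch
             &(Bi2dChar[0][0]), YDIM * sizeof(char),  // source and its (packed) pitch
             YDIM * sizeof(char), XDIM,               // width in bytes, height in rows
             cudaMemcpyHostToDevice);
// In the kernel, row r / column c is then addressed as:
//   char *row = (char *)((char *)dev_pitched + r * pitch);  char val = row[c];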
You can't use printf in a CUDA kernel on devices of compute capability below 2.0, because on those devices the GPU has no device-side printf support.
You can, however, use cuPrintf
How do we use cuPrintf()?
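For completeness, on compute capability 2.0 and newer (which the sm_20 flags above target) you can simply call printf from device code; a minimal sketch:
#include <cstdio>
__global__ void hello()
{
    printf("thread %d says hello\n", threadIdx.x);   // device-side printf, CC >= 2.0
}
// launched as: hello<<<1, 4>>>();  cudaDeviceSynchronize();   // the sync flushes the device printf buffer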