I have a host array of uint64_t of size spectrum_size, and I need to allocate memory for it on my GPU and copy it there.
But when I try to allocate this in GPU memory, I keep receiving SIGSEGV... Any ideas?
uint64_t * gpu_hashed_spectrum;
uint64_t * gpu_hashed_spectrum_h = new uint64_t [spectrum_size];
HANDLE_ERROR(cudaMalloc((void **)&gpu_hashed_spectrum, sizeof(uint64_t *) * spectrum_size));
for(i=0; i<spectrum_size; i++) {
HANDLE_ERROR(cudaMalloc((void **)&gpu_hashed_spectrum_h[i], sizeof(uint64_t)));
}
printf("\t\t...Copying\n");
for(i=0; i<spectrum_size; i++) {
HANDLE_ERROR(cudaMemcpy((void *)gpu_hashed_spectrum_h[i], (const void *)hashed_spectrum[i], sizeof(uint64_t), cudaMemcpyHostToDevice));
}
HANDLE_ERROR(cudaMemcpy(gpu_hashed_spectrum, gpu_hashed_spectrum_h, spectrum_size * sizeof(uint64_t *), cudaMemcpyHostToDevice));
Full code available here
UPDATE:
I tried to do it this way; now I get a SIGSEGV in other parts of the code (in the kernel, when using this array). Maybe it is due to other errors.
uint64_t * gpu_hashed_spectrum;
HANDLE_ERROR(cudaMalloc((void **)&gpu_hashed_spectrum, sizeof(uint64_t) * spectrum_size));
HANDLE_ERROR(cudaMemcpy(gpu_hashed_spectrum, hashed_spectrum, spectrum_size * sizeof(uint64_t), cudaMemcpyHostToDevice));
At the least, you are confusing uint64_t** and uint64_t*.
At line 1, you define gpu_hashed_spectrum as a pointer to data of type uint64_t, but at line 3
HANDLE_ERROR(cudaMalloc((void **)&gpu_hashed_spectrum, sizeof(uint64_t *) * spectrum_size));
you use gpu_hashed_spectrum as a pointer to data of type uint64_t*.
Maybe you should change your definition to
uint64_t** gpu_hashed_spectrum;
As well as some other lines.
Related
New to CUDA and gpu programming, having trouble with copying array of object pointers to device.
I have a vector of object pointers, each object contains two vectors, that I will be working with in device code.
I need to somehow copy that array into the device memory, however after reading similar solutions, still can't figure it out.
This is the structure of an object, I'm working with:
std::vector<int> retVals;
std::vector<int> children{4};
So, not only I need to make copy of the array, I also need to convert these vectors to the int array in each object.
EDIT:
This is what I have come up so far with:
auto **nodesPtr = ( aho_corasick::Node**)malloc(a->nodes.size() * sizeof(aho_corasick::Node *));
int i = 0;
for (auto &node: a->nodes){
auto *newNode = new aho_corasick::Node(' ');
cudaMalloc((void**)&(newNode->cudaChildren), sizeof(int) * node->children.size());
cudaMemcpy(newNode->cudaChildren, node->children.data(), sizeof(int) * node->children.size(), cudaMemcpyHostToDevice);
cudaMalloc((void**)&(newNode->cudaRets), sizeof(int) * node->retVals.size());
cudaMemcpy(newNode->cudaRets, node->children.data(), sizeof(int) * node->retVals.size(), cudaMemcpyHostToDevice);
aho_corasick::Node* devNode;
cudaMalloc((void**)&devNode, sizeof(aho_corasick::Node));
cudaMemcpy(devNode, newNode, sizeof(aho_corasick::Node), cudaMemcpyHostToDevice);
nodesPtr[i++] = devNode;
}
aho_corasick::Node **devNodes;
cudaMalloc((void **)&devNodes, a->nodes.size() * sizeof(aho_corasick::Node *));
cudaMemcpy(devNodes, nodesPtr, a->nodes.size() * sizeof(aho_corasick::Node *), cudaMemcpyHostToDevice);
Still does not seem to work tho.
Also, how bad is such code in CUDA terms and how would I go around the array of pointers?
EDIT2:
Forgot to point out: I added two additional fields in my objects: two int arrays and in the for loop I am creating a new object, into which I am copying the two corresponding vectors (into the int array fields) and after that I am creating a new object in device memory with those fields.
Then, after the loop, I am allocating the array of object pointers in device memory.
auto **nodesPtr = (aho_corasick::Node **) malloc(a->nodes.size() * sizeof(aho_corasick::Node *));
int i = 0;
for (auto &node: a->nodes) {
auto *newNode = new aho_corasick::Node(' ');
cudaMalloc((void **) &(newNode->cudaChildren), sizeof(int) * node->children.size());
cudaMemcpy(newNode->cudaChildren, node->children.data(), sizeof(int) * node->children.size(),
cudaMemcpyHostToDevice);
cudaMalloc((void **) &(newNode->cudaRets), sizeof(int) * node->retVals.size());
cudaMemcpy(newNode->cudaRets, node->retVals.data(), sizeof(int) * node->retVals.size(),
cudaMemcpyHostToDevice);
newNode->retsCount = node->retVals.size();
aho_corasick::Node *devNode;
cudaMalloc((void **) &devNode, sizeof(aho_corasick::Node));
cudaMemcpy(devNode, newNode, sizeof(aho_corasick::Node), cudaMemcpyHostToDevice);
nodesPtr[i++] = devNode;
}
aho_corasick::Node **devNodes;
cudaMalloc((void ***) &devNodes, a->nodes.size() * sizeof(aho_corasick::Node *));
cudaMemcpy(devNodes, nodesPtr, a->nodes.size() * sizeof(aho_corasick::Node *), cudaMemcpyHostToDevice);
This is how I copied my array of objects to the device. However, as it turns out, this approach is not acceptable for my task: it takes a few hours to copy all the objects.
Closing this thread, but if someone knows how to avoid using array of pointers and therefore avoid copying, please let me know.
I allocated the memory for an array of unsigned char using cudaMalloc and the initialized using cudaMemset.
unsigned char *device_channel_data;
cudaMalloc( device_channel_data, sizeof(unsigned char) * image_size);
cudaMemset( *device_channel_data, 0, sizeof(unsigned char) * image_size);
After that I'm checking whether it is really set to 0 by copying the data back to host. I'm printing some of the elements to check the data but the values printed are random.
unsigned char *host_channel_channel;
cudaMemcpy(host_channel_channel, device_channel_data, sizeof(unsigned char) * image_size, cudaMemcpyDeviceToHost);
for(int i = 0; i < 10; i ++)
{
std::cout<< (int)host_channel_channel[i] << std::endl;
}
I want to initialize the device_channel data to 0.
My knowledge with pointers and CUDA programming is very limited. I'm just starting with CUDA Programming. Thanks in advance for the help.
The answer of your question is in the documentation of CUDA: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html
For error handling (see: Use of cudaMalloc(). Why the double pointer?) the CUDA API needs a pointer to a pointer as the input parameter when allocating memory, while for cudaMemset a simple pointer is needed. Change your code to:
unsigned char *device_channel_data;
cudaMalloc( (void**)&device_channel_data, sizeof(unsigned char) * image_size);
cudaMemset( device_channel_data, 0, sizeof(unsigned char) * image_size);
and everything should work fine.
I tried the code in this link Is CUDA pinned memory zero-copy?
The one who asked claims the program worked fine for him
But does not work the same way on mine
the values does not change if I manipulate them in the kernel.
Basically my problem is that my GPU memory is not enough, but I want to do calculations which require more memory. I want my program to use RAM (host memory) and still be able to use CUDA for the calculations. The program in the link seemed to solve my problem, but the code does not give the output shown by that poster.
Any help or any working example on Zero copy memory would be useful.
Thank you
// Kernel: each thread reads one double from `mem`, prints it, and adds 10 in place.
// Indexes directly with threadIdx.x and has no bounds check, so the launch must not
// use more threads than there are elements in `mem`.
__global__ void testPinnedMemory(double * mem)
{
double currentValue = mem[threadIdx.x];
// Device-side printf (requires compute capability 2.0+); debugging aid only.
printf("Thread id: %d, memory content: %f\n", threadIdx.x, currentValue);
mem[threadIdx.x] = currentValue+10;
}
// Host driver for testPinnedMemory (the asker's original, broken version).
// NOTE(review): defects, all identified in the answer below:
//   1. cudaHostAlloc is given THREADS bytes, not THREADS * sizeof(double),
//      so the initialization loop writes past the allocation.
//   2. cudaHostAllocDefault gives pinned but UNMAPPED memory; zero-copy device
//      access needs cudaHostAllocMapped plus cudaHostGetDevicePointer.
//   3. The kernel launch is asynchronous and there is no synchronization
//      before the host reads the results back.
void test()
{
const size_t THREADS = 8;
double * pinnedHostPtr;
// BUG: size should be THREADS * sizeof(double); flag should be cudaHostAllocMapped.
cudaHostAlloc((void **)&pinnedHostPtr, THREADS, cudaHostAllocDefault);
//set memory values
for (size_t i = 0; i < THREADS; ++i)
pinnedHostPtr[i] = i;
//call kernel
dim3 threadsPerBlock(THREADS);
dim3 numBlocks(1);
// Host pointer passed straight to the kernel: only valid with mapped memory/UVA.
testPinnedMemory<<< numBlocks, threadsPerBlock>>>(pinnedHostPtr);
//read output
// BUG: missing cudaDeviceSynchronize() before reading the kernel's results.
printf("Data after kernel execution: ");
for (int i = 0; i < THREADS; ++i)
printf("%f ", pinnedHostPtr[i]);
printf("\n");
}
First of all, to allocate ZeroCopy memory, you have to specify cudaHostAllocMapped flag as an argument to cudaHostAlloc.
cudaHostAlloc((void **)&pinnedHostPtr, THREADS * sizeof(double), cudaHostAllocMapped);
Still the pinnedHostPointer will be used to access the mapped memory from the host side only. To access the same memory from device, you have to get the device side pointer to the memory like this:
double* dPtr;
cudaHostGetDevicePointer(&dPtr, pinnedHostPtr, 0);
Pass this pointer as kernel argument.
testPinnedMemory<<< numBlocks, threadsPerBlock>>>(dPtr);
Also, you have to synchronize the kernel execution with the host to read the updated values. Just add cudaDeviceSynchronize after the kernel call.
The code in the linked question is working, because the person who asked the question is running the code on a 64 bit OS with a GPU of Compute Capability 2.0 and TCC enabled. This configuration automatically enables the Unified Virtual Addressing feature of the GPU in which the device sees host + device memory as a single large memory instead of separate ones and host pointers allocated using cudaHostAlloc can be passed directly to the kernel.
In your case, the final code will look like this:
#include <cstdio>
// Kernel: each thread reads one double from `mem`, prints it, and adds 10 in place.
// No bounds check: the launch must use exactly as many threads as elements.
__global__ void testPinnedMemory(double * mem)
{
double currentValue = mem[threadIdx.x];
// Device-side printf (requires compute capability 2.0+); debugging aid only.
printf("Thread id: %d, memory content: %f\n", threadIdx.x, currentValue);
mem[threadIdx.x] = currentValue+10;
}
// Corrected host driver: allocates mapped (zero-copy) pinned memory, obtains
// the device-side alias for it, launches the kernel, and synchronizes before
// the host reads the updated values back.
int main()
{
const size_t THREADS = 8;
double * pinnedHostPtr;
// cudaHostAllocMapped makes this pinned allocation addressable from the device.
cudaHostAlloc((void **)&pinnedHostPtr, THREADS * sizeof(double), cudaHostAllocMapped);
//set memory values
for (size_t i = 0; i < THREADS; ++i)
pinnedHostPtr[i] = i;
// Device-side pointer aliasing the same mapped pinned host memory.
double* dPtr;
cudaHostGetDevicePointer(&dPtr, pinnedHostPtr, 0);
//call kernel
dim3 threadsPerBlock(THREADS);
dim3 numBlocks(1);
testPinnedMemory<<< numBlocks, threadsPerBlock>>>(dPtr);
// Kernel launches are asynchronous: wait for completion before reading results.
cudaDeviceSynchronize();
//read output
printf("Data after kernel execution: ");
for (int i = 0; i < THREADS; ++i)
printf("%f ", pinnedHostPtr[i]);
printf("\n");
// NOTE(review): pinnedHostPtr is never released with cudaFreeHost and no CUDA
// call is error-checked; acceptable for a demo, not for production code.
return 0;
}
.h file:
#define VECTOR_SIZE 1024
.cpp file:
// Minimal host driver: allocates the source array of VECTOR_SIZE unsigned ints
// and hands it to the device-transfer helper defined in the .cu file.
int main ()
{
unsigned int* A;
A = new unsigned int [VECTOR_SIZE];
CopyToDevice (A);
// NOTE(review): A is never delete[]'d -- fine for a snippet, a leak in real code.
}
.cu file:
// Asker's broken transfer helper: allocates device storage for VECTOR_SIZE/4
// ulong4 elements, then tries to fill it by dereferencing the DEVICE pointer
// from HOST code -- the cause of the access violation (see the answer below).
void CopyToDevice (unsigned int *A)
{
// NOTE(review): the declaration below is missing its ';' in the original post.
ulong4 *UA
unsigned int VectorSizeUlong4 = VECTOR_SIZE / 4;
unsigned int VectorSizeBytesUlong4 = VectorSizeUlong4 * sizeof(ulong4);
cudaMalloc( (void**)&UA, VectorSizeBytesUlong4 );
// how to use cudaMemcpy to copy data from A to UA?
// I tried to do the following but it gave access violation error:
// BUG: UA points to device memory and must not be dereferenced on the host;
// this whole loop should be replaced by a single cudaMemcpy.
for (int i=0; i<VectorSizeUlong4; ++i)
{
UA[i].x = A[i*4 + 0];
UA[i].y = A[i*4 + 1];
UA[i].z = A[i*4 + 2];
UA[i].w = A[i*4 + 3];
}
// I also tried to copy *A to device and then work on it instead going back to CPU to access *A every time but this did not work again
}
The CUDA ulong4 is a 16 byte aligned structure defined as
// CUDA built-in vector type: four unsigned long components, with the whole
// struct aligned to a 16-byte boundary via __builtin_align__(16).
struct __builtin_align__(16) ulong4
{
unsigned long int x, y, z, w;
};
this means that a stream of four consecutive 32 bit unsigned source integers is the same size as one ulong4 (on platforms where unsigned long is 32 bits). The simplest solution is contained right in the text of the image you posted - just cast (either implicitly or explicitly) the unsigned int pointer to a ulong4 pointer, use cudaMemcpy directly on the host and device memory, and pass the resulting device pointer to whatever kernel function you have that requires a ulong4 input. Your device transfer function could look something like:
// Copies VECTOR_SIZE unsigned ints from host array A to the device,
// reinterpreted as VECTOR_SIZE/4 ulong4 elements. NOTE(review): this assumes
// 4 * sizeof(unsigned int) == sizeof(ulong4), i.e. unsigned long is 32 bits
// (true on Windows / 32-bit ABIs; on LP64 Linux sizes differ -- confirm).
// Returns the device pointer; the caller owns it and must cudaFree it.
ulong4* CopyToDevice (unsigned int* A)
{
ulong4 *UA, *UA_h;
size_t VectorSizeUlong4 = VECTOR_SIZE / 4;
size_t VectorSizeBytesUlong4 = VectorSizeUlong4 * sizeof(ulong4);
cudaMalloc( (void**)&UA, VectorSizeBytesUlong4);
UA_h = reinterpret_cast<ulong4*>(A); // not necessary but increases transparency
// FIX: cudaMemcpy takes a mandatory fourth argument, the transfer direction;
// the original snippet omitted it and would not compile.
cudaMemcpy(UA, UA_h, VectorSizeBytesUlong4, cudaMemcpyHostToDevice);
return UA;
}
[Usual disclaimer: written in browser, not tested or compiled, use at own risk]
This should raise all alarm bells:
cudaMalloc( (void**)&UA, VectorSizeBytesUlong4 );
// ...
UA[i].x = A[i*4 + 0];
You are allocating UA on the device and then use it in host code. Don't ever do that. You will need to use cudaMemcpy to copy arrays to the device. This tutorial shows you a basic program that uses cudaMemcpy to copy things over. The length argument to cudaMemcpy is the length of your array in bytes. And in your case that is VECTOR_SIZE * sizeof(unsigned int).
I need help with transfer char[][] to Cuda kernel. This is my code:
// Asker's kernel: declared to take a double pointer (char**) and indexed as a
// 2D array, but the host code allocates with cudaMallocPitch, which returns a
// single flat (pitched) allocation -- the mismatch the answer below explains.
__global__
void kernel(char** BiExponent){
for(int i=0; i<500; i++)
printf("%c",BiExponent[1][i]); // I want print line 1
}
// Asker's host code: copies a host char[5000][500] to a pitched device
// allocation, then passes it to a kernel that indexes it as char**.
// cudaMemcpy2D operates on single-pointer (flat, pitched) arrays, not on
// double-pointer arrays -- see the answer below.
int main(){
char (*Bi2dChar)[500] = new char [5000][500];
char **dev_Bi2dChar;
...//HERE I INPUT DATA TO Bi2dChar
size_t host_orig_pitch = 500 * sizeof(char);
size_t pitch;
cudaMallocPitch((void**)&dev_Bi2dChar, &pitch, 500 * sizeof(char), 5000);
cudaMemcpy2D(dev_Bi2dChar, pitch, Bi2dChar, host_orig_pitch, 500 * sizeof(char), 5000, cudaMemcpyHostToDevice);
kernel <<< 1, 512 >>> (dev_Bi2dChar);
// NOTE(review): memory from new[] should be released with delete[], not free().
free(Bi2dChar); cudaFree(dev_Bi2dChar);
}
I use:
nvcc.exe" -gencode=arch=compute_20,code=\"sm_20,compute_20\" --use-local-env --cl-version 2012 -ccbin
Thanks for help.
cudaMemcpy2D doesn't actually handle 2-dimensional (i.e. double pointer, **) arrays in C.
Note that the documentation indicates it expects single pointers, not double pointers.
Generally speaking, moving arbitrary double pointer C arrays between the host and the device is more complicated than a single pointer array.
If you really want to handle the double-pointer array, then search on "CUDA 2D Array" in the upper right hand corner of this page, and you'll find various examples of how to do it. (For example, the answer given by #talonmies here)
Often, an easier approach is simply to "flatten" the array so it can be referenced by a single pointer, i.e. char[] instead of char[][], and then use index arithmetic to simulate 2-dimensional access.
Your flattened code would look something like this:
(the code you provided is an uncompilable, incomplete snippet, so mine is also)
#define XDIM 5000
#define YDIM 500
// Prints row 1 of the flattened XDIM x YDIM char array.
// In a char[XDIM][YDIM] array each row is YDIM chars long, so element
// (row, col) lives at offset row*YDIM + col in the flat buffer.
__global__
void kernel(char* BiExponent){
for(int i=0; i<YDIM; i++)
// FIX: row stride is YDIM (row length), not XDIM (row count); the original
// (1*XDIM)+i indexed into row 10, not row 1.
printf("%c",BiExponent[(1*YDIM)+i]); // I want print line 1
}
// Host side of the flattened approach: one contiguous XDIM*YDIM device
// buffer, filled with a plain cudaMemcpy (no pitch bookkeeping required).
int main(){
char (*Bi2dChar)[YDIM] = new char [XDIM][YDIM];
char *dev_Bi2dChar;
...//HERE I INPUT DATA TO Bi2dChar
cudaMalloc((void**)&dev_Bi2dChar,XDIM*YDIM * sizeof(char));
// FIX: cudaMemcpy takes (dst, src, bytes, kind); the original snippet also
// passed a stray pitch argument, giving five arguments and a compile error.
cudaMemcpy(dev_Bi2dChar, &(Bi2dChar[0][0]), XDIM*YDIM * sizeof(char), cudaMemcpyHostToDevice);
kernel <<< 1, 512 >>> (dev_Bi2dChar);
// FIX: memory from new[] must be released with delete[], not free().
delete[] Bi2dChar; cudaFree(dev_Bi2dChar);
}
If you want a pitched array, you can create it similarly, but you will still do so as single pointer arrays, not double pointer arrays.
Note: on devices of compute capability 2.0 and later, printf is supported directly in a CUDA kernel (compile with -arch=sm_20 or higher). Only on older, compute capability 1.x devices is kernel printf unavailable;
there you would use cuPrintf instead.
How do we use cuPrintf()?