CUDA deep copy with other data - c++

I'm trying to copy my struct Test to the GPU, change the data, and copy it back to the CPU. This is what I've tried so far; note that my code crashes on the last (commented-out) line:
struct Test {
    int x, y;
    int* data;
};

// Test kernel
static __global__ void TestKernel(Test* d) {
    const uint32_t index = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
    // increment some values
    ++d->data[0];
    ++d->data[1];
    ++d->data[2];
    ++d->x;
    ++d->y;
}
// Test snippet:
Test* host = new Test{ 10, 20,new int[3]{1, 2, 3} };
Test* device = nullptr;
int* deviceData;
COMPUTE_SAFE(cudaMalloc(&device, sizeof(Test)));
COMPUTE_SAFE(cudaMalloc(&deviceData, 3 * sizeof(int)));
COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(deviceData, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
COMPUTE_SAFE(cudaMemcpy(&(device->data), &deviceData, sizeof(float*), cudaMemcpyHostToDevice));
TestKernel <<< 1, 1 >>> (device);
COMPUTE_SAFE(cudaDeviceSynchronize());
COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(float), cudaMemcpyDeviceToHost));
printf("\nhost:\n");
printf("%d %d\n", host->x, host->y); // works
// printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]); // crashes
Note that I've seen multiple related questions, but none of them also copy additional data alongside the deep-copied data pointer.
My error message:
Exception thrown at 0x00007FF7A2C5297D in VFD.exe: 0xC0000005: Access
violation reading location 0x0000000B01600208.
Note that I'm probably copying the memory incorrectly, or something along those lines. If I remove the COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost)); line, I'm able to access the host->data array, but the x and y values stay unincremented for obvious reasons.

After you cudaMemcpy the struct back from the GPU into the host struct, you overwrite its data pointer with a device pointer, which is invalid on the host.
To fix it you need to restore the original host data pointer (and then copy the actual data back).
Working version:
struct Test
{
    int x, y;
    int* data;
};

static __global__ void TestKernel(Test* d)
{
    ++(d->data[0]);
    ++(d->data[1]);
    ++(d->data[2]);
    ++(d->x);
    ++(d->y);
}

int main()
{
    int* hostData = new int[3]{ 1, 2, 3 };
    Test* host = new Test{ 10, 20, hostData };
    int* deviceData = nullptr;
    Test* device = nullptr;
    COMPUTE_SAFE(cudaMalloc(&device, sizeof(Test)));
    COMPUTE_SAFE(cudaMalloc(&deviceData, 3 * sizeof(int)));
    COMPUTE_SAFE(cudaMemcpy(device, host, sizeof(Test), cudaMemcpyHostToDevice));
    COMPUTE_SAFE(cudaMemcpy(deviceData, host->data, 3 * sizeof(int), cudaMemcpyHostToDevice));
    COMPUTE_SAFE(cudaMemcpy(&(device->data), &deviceData, sizeof(int*), cudaMemcpyHostToDevice));
    TestKernel<<<1, 1>>>(device);
    COMPUTE_SAFE(cudaDeviceSynchronize());
    COMPUTE_SAFE(cudaMemcpy(host, device, sizeof(Test), cudaMemcpyDeviceToHost));
    host->data = hostData; // Restore host data pointer
    COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(int), cudaMemcpyDeviceToHost));
    printf("\nhost:\n");
    printf("%d %d\n", host->x, host->y);
    printf("%d %d %d\n", host->data[0], host->data[1], host->data[2]);
    return 0;
}
Output:
host:
11 21
2 3 4
Some notes:
For clarity I added () in the kernel increment statements.
You used sizeof(float*) for the data pointer, although it is an int* (of course the size is the same).
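As an additional illustration (not from the original answer, just a hedged sketch reusing the same COMPUTE_SAFE macro and variables as above), the clobbering can also be avoided by copying the returned struct into a temporary, so host->data is never overwritten by the device pointer:
Test tmp;
COMPUTE_SAFE(cudaMemcpy(&tmp, device, sizeof(Test), cudaMemcpyDeviceToHost));
host->x = tmp.x; // take only the scalar fields from the temporary
host->y = tmp.y;
COMPUTE_SAFE(cudaMemcpy(host->data, deviceData, 3 * sizeof(int), cudaMemcpyDeviceToHost));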

Related

cudaMemcpyAsync from page-locked host memory to device memory returns reading access violation error

I am coding a memory-heavy multi-GPU CUDA program. I found that my cudaMemcpyAsync calls weren't actually performed asynchronously. After some research I found out that I would have to copy from page-locked host memory to the device. So what I now attempt to do is to copy a part of the whole host input data array into a chunk of page-locked host memory, and then copy that to the device. The H2H cudaMemcpyAsync works fine without reporting any errors; the H2D afterwards gives me this error: Access violation reading address 0xWHATEVER. Additionally, in the sample code I am providing there is an identical error with the cudaMallocHost call (this works fine in the main project).
I have tried to make a much simpler sample project (the one below). This still gives me errors, so I don't really know what to do.
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <string>
#ifndef KERNEL_H
#define KERNEL_H
typedef struct
{
    int device = 0;
    double *d_array,    //device array ptr
           *h_array_pl; //page locked array ptr
} IOdataPtr;
#endif

void printCudaError(cudaError_t error, char err_src[]) { //error printing function to reduce line count
    if (error != cudaSuccess) {
        printf("Error: %i while performing %s \n", error, err_src);
    }
}

int main() {
    const int GPU_N = 2;
    const int CALC_N = 1024*1024*1024;
    cudaError_t error;
    cudaStream_t stream[GPU_N];
    double *h_array;
    h_array = (double*)malloc(sizeof(double) * CALC_N);
    for (int i = 0; i < CALC_N; i++) {
        h_array[i] = 2;
    }
    IOdataPtr ptr[GPU_N];
    for (int i = 0; i < GPU_N; i++) {
        //normal host alloc
        ptr[i].device = i;
        error = cudaSetDevice(ptr[i].device); //select device
        printCudaError(error, "cudaSetDevice");
        cudaStreamCreate(&stream[i]);
        printCudaError(error, "cudaStreamCreate");
        error = cudaMalloc((void**)&(ptr[i].d_array),
                           CALC_N / GPU_N * sizeof(double));
        printCudaError(error, "cudaMalloc");
        error = cudaMallocHost((void **)&ptr[i].h_array_pl,
                               CALC_N / GPU_N * sizeof(double));
        printCudaError(error, "cudaMallocHost");
        //data -> pl
        error = cudaMemcpyAsync(ptr[i].h_array_pl,               //dst
                                &h_array[i * CALC_N / GPU_N],    //src
                                CALC_N / GPU_N * sizeof(double), //amt
                                cudaMemcpyHostToHost,            //kind
                                stream[i]);                      //stream
        printCudaError(error, "cudaMemcpyAsync H2H");
        //pl -> dev
        error = cudaMemcpyAsync(ptr[i].d_array,                  //dst
                                ptr[i].h_array_pl,               //src
                                CALC_N / GPU_N * sizeof(double), //amt
                                cudaMemcpyHostToDevice,          //kind
                                stream[i]);                      //stream
        printCudaError(error, "cudaMemcpyAsync H2D");
        cudaStreamDestroy(stream[i]);
        error = cudaFree(ptr[i].d_array);
        printCudaError(error, "cudaFree");
    }
    printf("Well it worked");
    free(h_array);
    getchar();
}
The output my code gives me:
Error: 2 while performing cudaMallocHost
Error: 2 while performing cudaMemcpyAsync H2H
Error: 2 while performing cudaMemcpyAsync H2D
Error: 2 while performing cudaFree
Well it worked
Error 2 is cudaErrorMemoryAllocation
In the code you currently have posted, this line of code is wrong:
error = cudaMemcpyAsync(ptr[i].d_array, &ptr[i].h_array_pl, CALC_N / GPU_N * sizeof(double), cudaMemcpyHostToDevice, stream[i]);
                                        ^
That ampersand doesn't belong there. ptr[i].h_array_pl is already a pointer to the source of the data transfer; you should not be taking the address of that pointer.
Using the address of this pointer as the data source of the copy operation would result in incorrect and illegal host memory accesses for the size of the transfer indicated in this code. Whether or not this would be detected depends on a number of factors, but it is quite possibly the reason for the Access violation reading location... report, which generally refers to an illegal access to host memory.
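For reference, the corrected call simply drops the stray ampersand (same parameters as in the quoted line):
//pl -> dev, corrected: pass the pinned host pointer itself as the source,
//not the address of that pointer
error = cudaMemcpyAsync(ptr[i].d_array,                  //dst (device)
                        ptr[i].h_array_pl,               //src (page-locked host)
                        CALC_N / GPU_N * sizeof(double), //amt
                        cudaMemcpyHostToDevice,          //kind
                        stream[i]);                      //stream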

Problem feeding Thrust vector into getrf/getri

Continuing on my CUDA beginner's adventure, I've been introduced to Thrust, which seems like a convenient library that saves me the hassle of explicit memory (de-)allocation.
I've already tried combining it with a few cuBLAS routines, e.g. gemv, by generating a raw pointer to the underlying storage with thrust::raw_pointer_cast(array.data()) and then feeding this to the routines, and it works just fine.
The current task is to get the inverse of a matrix, and for that I'm using getrfBatched and getriBatched. From the documentation:
cublasStatus_t cublasDgetrfBatched(cublasHandle_t handle,
                                   int n,
                                   double *Aarray[],
                                   int lda,
                                   int *PivotArray,
                                   int *infoArray,
                                   int batchSize);
where
Aarray - device - array of pointers to <type> array
Naturally I thought I could use another layer of Thrust vector to express this array of pointers and again feed its raw pointer to cuBLAS, so here's what I did:
void test()
{
    thrust::device_vector<double> in(4);
    in[0] = 1;
    in[1] = 3;
    in[2] = 2;
    in[3] = 4;
    cublasStatus_t stat;
    cublasHandle_t handle;
    stat = cublasCreate(&handle);
    thrust::device_vector<double> out(4, 0);
    thrust::device_vector<int> pivot(2, 0);
    int info = 0;
    thrust::device_vector<double*> in_array(1);
    in_array[0] = thrust::raw_pointer_cast(in.data());
    thrust::device_vector<double*> out_array(1);
    out_array[0] = thrust::raw_pointer_cast(out.data());
    stat = cublasDgetrfBatched(handle, 2,
        (double**)thrust::raw_pointer_cast(in_array.data()), 2,
        thrust::raw_pointer_cast(pivot.data()), &info, 1);
    stat = cublasDgetriBatched(handle, 2,
        (const double**)thrust::raw_pointer_cast(in_array.data()), 2,
        thrust::raw_pointer_cast(pivot.data()),
        (double**)thrust::raw_pointer_cast(out_array.data()), 2, &info, 1);
}
When executed, stat says CUBLAS_STATUS_SUCCESS (0) and info says 0 (execution successful), yet if I try to access the elements of in, pivot or out with standard bracket notation, I hit a thrust::system::system_error. Seems to me that the corresponding memory got corrupted somehow.
Anything obvious that I'm missing here?
The documentation for cublas<t>getrfBatched indicates that the infoArray parameter is expected to be a pointer to device memory.
Instead you have passed a pointer to host memory:
int info = 0;
...
stat = cublasDgetrfBatched(handle, 2,
    (double**)thrust::raw_pointer_cast(in_array.data()), 2,
    thrust::raw_pointer_cast(pivot.data()), &info, 1);
                                            ^^^^^
If you run your code with cuda-memcheck (always a good practice, in my opinion, any time you are having trouble with a CUDA code, before asking others for help) you will receive an error of "invalid global write of size 4". This is because a kernel launched by cublasDgetrfBatched() is attempting to write the info data through the ordinary host pointer you provided, which is always illegal in CUDA device code.
CUBLAS itself does not trap errors like this for performance reasons. However, the thrust API uses more rigorous synchronization and error checking in some cases. Therefore, the use of thrust code after this error reports the error, even though the error had nothing to do with thrust (it was an asynchronously reported error from a previous kernel launch).
The solution is straightforward; provide device storage for info:
$ cat t329.cu
#include <thrust/device_vector.h>
#include <cublas_v2.h>
#include <iostream>
void test()
{
    thrust::device_vector<double> in(4);
    in[0] = 1;
    in[1] = 3;
    in[2] = 2;
    in[3] = 4;
    cublasStatus_t stat;
    cublasHandle_t handle;
    stat = cublasCreate(&handle);
    thrust::device_vector<double> out(4, 0);
    thrust::device_vector<int> pivot(2, 0);
    thrust::device_vector<int> info(1, 0);
    thrust::device_vector<double*> in_array(1);
    in_array[0] = thrust::raw_pointer_cast(in.data());
    thrust::device_vector<double*> out_array(1);
    out_array[0] = thrust::raw_pointer_cast(out.data());
    stat = cublasDgetrfBatched(handle, 2,
        (double**)thrust::raw_pointer_cast(in_array.data()), 2,
        thrust::raw_pointer_cast(pivot.data()), thrust::raw_pointer_cast(info.data()), 1);
    stat = cublasDgetriBatched(handle, 2,
        (const double**)thrust::raw_pointer_cast(in_array.data()), 2,
        thrust::raw_pointer_cast(pivot.data()),
        (double**)thrust::raw_pointer_cast(out_array.data()), 2, thrust::raw_pointer_cast(info.data()), 1);
    for (int i = 0; i < 4; i++) {
        double test = in[i];
        std::cout << test << std::endl;
    }
}

int main(){
    test();
}
$ nvcc -o t329 t329.cu -lcublas
t329.cu(12): warning: variable "stat" was set but never used
$ cuda-memcheck ./t329
========= CUDA-MEMCHECK
3
0.333333
4
0.666667
========= ERROR SUMMARY: 0 errors
$
You'll note this change in the above code is applied to usage for both cublas calls, as the infoArray parameter has the same expectations for both.
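If you also want to inspect the factorization status on the host, the info value can be read back through the device_vector. A hedged usage sketch (not part of the original answer; operator[] on a thrust::device_vector performs an implicit device-to-host copy):
int h_info = info[0]; // thrust copies the value back from device memory
if (h_info != 0)
    std::cout << "getrf reported info = " << h_info << std::endl;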

Segmentation fault when sending struct having std::vector member

Why do I get the following error for the following code with the mpirun -np 2 ./out command? I called make_layout() after resizing the std::vector, so normally I should not get this error. It works if I do not resize. What is the reason?
main.cpp:
#include <iostream>
#include <vector>
#include "mpi.h"
MPI_Datatype MPI_CHILD;
struct Child
{
    std::vector<int> age;
    void make_layout();
};

void Child::make_layout()
{
    int nblock = 1;
    int age_size = age.size();
    int block_count[nblock] = {age_size};
    MPI_Datatype block_type[nblock] = {MPI_INT};
    MPI_Aint offset[nblock] = {0};
    MPI_Type_struct(nblock, block_count, offset, block_type, &MPI_CHILD);
    MPI_Type_commit(&MPI_CHILD);
}

int main()
{
    int rank, size;
    MPI_Init(NULL, NULL);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    Child kid;
    kid.age.resize(5);
    kid.make_layout();
    int datasize;
    MPI_Type_size(MPI_CHILD, &datasize);
    std::cout << datasize << std::endl; // output: 20 (5x4 seems OK).
    if (rank == 0)
    {
        MPI_Send(&kid, 1, MPI_CHILD, 1, 0, MPI_COMM_WORLD);
    }
    if (rank == 1)
    {
        MPI_Recv(&kid, 1, MPI_CHILD, 0, 0, MPI_COMM_WORLD, NULL);
    }
    MPI_Finalize();
    return 0;
}
Error message:
*** Process received signal ***
Signal: Segmentation fault (11)
Signal code: Address not mapped (1)
Failing at address: 0x14ae7b8
[ 0] /lib/x86_64-linux-gnu/libpthread.so.0(+0x113d0)[0x7fe1ad91c3d0]
[ 1] /lib/x86_64-linux-gnu/libc.so.6(cfree+0x22)[0x7fe1ad5c5a92]
[ 2] ./out[0x400de4]
[ 3] /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf0)[0x7fe1ad562830]
[ 4] ./out[0x400ec9]
*** End of error message ***
Here is an example with several std::vector members that uses MPI datatypes with absolute addresses:
struct Child
{
    int foo;
    std::vector<float> bar;
    std::vector<int> baz;

    Child() : dtype(MPI_DATATYPE_NULL) {}
    ~Child() { if (dtype != MPI_DATATYPE_NULL) MPI_Type_free(&dtype); }

    const MPI_Datatype mpi_dtype();
    void invalidate_dtype();

private:
    MPI_Datatype dtype;
    void make_dtype();
};

const MPI_Datatype Child::mpi_dtype()
{
    if (dtype == MPI_DATATYPE_NULL)
        make_dtype();
    return dtype;
}

void Child::invalidate_dtype()
{
    if (dtype != MPI_DATATYPE_NULL)
        MPI_Type_free(&dtype); // MPI_Type_free takes the address and resets it to MPI_DATATYPE_NULL
}

void Child::make_dtype()
{
    const int nblock = 3;
    int block_count[nblock] = {1, (int)bar.size(), (int)baz.size()};
    MPI_Datatype block_type[nblock] = {MPI_INT, MPI_FLOAT, MPI_INT};
    MPI_Aint offset[nblock];
    MPI_Get_address(&foo, &offset[0]);
    MPI_Get_address(&bar[0], &offset[1]);
    MPI_Get_address(&baz[0], &offset[2]);
    MPI_Type_struct(nblock, block_count, offset, block_type, &dtype);
    MPI_Type_commit(&dtype);
}
Sample use of that class:
Child kid;
kid.foo = 5;
kid.bar.resize(5);
kid.baz.resize(10);

if (rank == 0)
{
    MPI_Send(MPI_BOTTOM, 1, kid.mpi_dtype(), 1, 0, MPI_COMM_WORLD);
}
if (rank == 1)
{
    MPI_Recv(MPI_BOTTOM, 1, kid.mpi_dtype(), 0, 0, MPI_COMM_WORLD, NULL);
}
Notice the use of MPI_BOTTOM as the buffer address. MPI_BOTTOM specifies the bottom of the address space, which is 0 on architectures with flat address space. Since the offsets passed to MPI_Type_create_struct are the absolute addresses of the structure members, when those are added to 0, the result is again the absolute address of each structure member. Child::mpi_dtype() returns a lazily constructed MPI datatype specific to that instance.
Since resize() reallocates memory, which could result in the data being moved to a different location in memory, the invalidate_dtype() method should be used to force the recreation of the MPI datatype after resize() or any other operation that might trigger memory reallocation:
// ...
kid.bar.resize(100);
kid.invalidate_dtype();
// MPI_Send / MPI_Recv
Please excuse any sloppy C++ code above.
The problem here is that you're telling MPI to send a block of integers starting at &kid, but that's not where your data is. &kid points to the Child object, whose std::vector member holds an internal pointer to your block of integers allocated somewhere on the heap.
Replace &kid with kid.age.data() and it should work. The reason it "works" when you don't resize is that the vectors will be of 0 size, so MPI will try to send an empty message and no actual memory access takes place.
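A minimal sketch of that change in the posted main() (only the buffer argument changes; MPI_STATUS_IGNORE is used here instead of NULL for the status):
if (rank == 0)
{
    MPI_Send(kid.age.data(), 1, MPI_CHILD, 1, 0, MPI_COMM_WORLD); // send the vector's buffer, not &kid
}
if (rank == 1)
{
    MPI_Recv(kid.age.data(), 1, MPI_CHILD, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}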
Be careful, you are facing several problems.
First, std::vector stores its elements on the heap, so the data is not really stored inside your struct.
Second, you cannot portably send STL containers between dynamic libraries, or between application instances, because they may be compiled with different STL versions and behave differently on different architectures.
Here is a good answer about this part of the question: https://stackoverflow.com/a/22797419/440168

Segmentation Fault(Core dump), Do not know why

I was trying to reproduce some code that I saw for a distributed application in order to see how it works, but I am getting an error. I am trying to understand how the messages work.
The code is
class Address {
public:
    char addr[6];
    Address() {}
    Address(string address) {
        size_t pos = address.find(":");
        int id = stoi(address.substr(0, pos));
        short port = (short)stoi(address.substr(pos + 1, address.size()-pos-1));
        memcpy(&addr[0], &id, sizeof(int));
        memcpy(&addr[4], &port, sizeof(short));
    }
};

enum MsgTypes{
    JOINREQ,
    JOINREPLY,
    DUMMYLASTMSGTYPE,
    HEARTBEAT
};

/**
 * STRUCT NAME: MessageHdr
 *
 * DESCRIPTION: Header and content of a message
 */
typedef struct MessageHdr {
    enum MsgTypes msgType;
} MessageHdr;

typedef struct en_msg {
    // Number of bytes after the class
    int size;
    // Source node
    Address from;
    // Destination node
    Address to;
} en_msg;

void send(Address *myaddr, Address *toaddr, char *data, int size);

int main()
{
    MessageHdr *msg;
    size_t msgsize = sizeof(MessageHdr) + sizeof(Address) + sizeof(long) + 1;
    int id=233;
    short port =22;
    long heartbeat=1;
    string s=to_string(id)+to_string(port);
    string s1=to_string(id+1)+to_string(port+1);
    Address *addr= new Address(s);
    Address *toaddr= new Address(s);
    msg->msgType = JOINREQ;
    memcpy((char *)(msg+1), addr, sizeof(addr));
    memcpy((char *)(msg+1) + 1 + sizeof(addr), (char *)heartbeat, sizeof(long));
    send(addr, toaddr, (char *)msg, msgsize);
}

void send(Address *myaddr, Address *toaddr, char *data, int size) {
    en_msg *em;
    static char temp[2048];
    em = (en_msg *)malloc(sizeof(en_msg) + size);
    em->size = size;
    memcpy(&(em->from), &(myaddr), sizeof(em->from));
    memcpy(&(em->to), &(toaddr), sizeof(em->from));
    memcpy(em + 1, data, size);
    cout<<em;
}
The error is just this line:
Segmentation fault (core dumped)
1) As Retired Ninja said in the comments, the first line of main must be something like
MessageHdr *msg = new MessageHdr();
because msg->msgType = JOINREQ; dereferences the uninitialized pointer msg.
2) The first fix alone will not help, because of expressions like
(char *)(msg+1)
Here msg, of type MessageHdr*, is used as a char* in address arithmetic. What I mean is that it is dangerous to calculate addresses with expressions of the form
(char *)(msg+1) + 1
While msg is of type MessageHdr*, (msg+1) means a shift to the next MessageHdr structure, and the additional +1 after casting to char* means a shift of one more byte. Personally I cannot follow the logic: the MessageHdr structure has only one enum field, yet with this strange address manipulation you are trying to fit an instance of the Address class (and a long value) after that structure with memcpy.
Conclusion:
A very substantial redesign of the program is needed, with comments and clear logic in the operations.
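For what it's worth, here is a hedged sketch of what the allocation and packing might look like; the layout (header, then the Address, then the heartbeat) is an assumption reconstructed from the msgsize computation in the question, not code taken from this answer:
// Hypothetical repacking sketch, under the layout assumption stated above
MessageHdr *msg = (MessageHdr *) malloc(msgsize);             // allocate the buffer before writing to it
msg->msgType = JOINREQ;
char *payload = (char *)(msg + 1);                            // bytes directly after the header
memcpy(payload, addr, sizeof(Address));                       // copy the Address object, not the pointer value
memcpy(payload + sizeof(Address), &heartbeat, sizeof(long));  // pass the address of heartbeat, not its value cast to char*
send(addr, toaddr, (char *)msg, msgsize);
free(msg);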

CUDA error when handling large input

So I have a rather strange error that is happening. I have a kernel that is supposed to alter the value of every element in an array. As of right now I only test with launching one thread.
__global__ void kernel(int* data) {
    for (int var = 0; var < SIZE; ++var) {
        data[var] = data[var] + 1;
    }
}
Here is the whole code:
#include "stdint.h"
#include "stdio.h"
#include "kernelLauncher.cuh"
#include <cuda_runtime.h>
#define SIZE 10485760
typedef uint64_t POLY_64;
typedef unsigned char BYTE;
__global__ void kernel(int* data) {
    for (int var = 0; var < SIZE; ++var) {
        data[var] = data[var] + 1;
    }
}

int main() {
    int* data = (int*) malloc(sizeof(int) * SIZE);
    int* data_d;
    for (int var = 0; var < SIZE; ++var) {
        data[var] = 1;
    }
    //allocate device memory for the fingerprinting data
    cudaMalloc((void**) &data_d, sizeof(int) * SIZE);
    //copy the data to device
    CUDA_CHECK_RETURN(
        cudaMemcpy(data_d, data, sizeof(int) * SIZE, cudaMemcpyHostToDevice));
    kernel<<<1, 1>>>(data_d);
    cudaThreadSynchronize();
    CUDA_CHECK_RETURN(cudaMemcpy(data, data_d, sizeof(int) * SIZE, cudaMemcpyDeviceToHost));
    //try to print the result
    for (int var = 0; var < SIZE; ++var) {
        printf("%d\n", data[var]);
    }
    CUDA_CHECK_RETURN(cudaFree(data_d));
    return 0;
}
When my SIZE is defined as 1048576, I get my data back just fine. Unfortunately, when I define it as 10485760 (10 times more), I get:
Error unspecified launch failure at line 40 in file ../src/runTest.cu
Can somebody point me in the right direction? Why is this problem happening? Thank you in advance.
EDIT: So yes, it is the size definition. I changed my code so there are no discrepancies between the hard-coded loop value in the kernel and the defined constant. However, if I use 10485760 instead of 1048576 it simply does not work. Why is that? This is not too much allocation in one go. My card is a Quadro FX 770M with compute capability 1.1.
So, here is what actually seemed to be happening. As some of you suggested, the kernel was indeed taking too long and timing out (although I had read from various sources that this does not happen on Linux systems). Separating the work like this in fact solves the issue and avoids the watchdog killing the kernel:
kernel<<<1, 1>>>(data_d, 0, 1048576);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 1048576, 2097152);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 2097152, 3145728);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 3145728, 4194304);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 4194304, 5242880);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 5242880, 6291456);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 6291456, 7340032);
cudaDeviceSynchronize();
kernel<<<1, 1>>>(data_d, 7340032, 8388608);
cudaDeviceSynchronize();
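Note that the three-argument launches above assume the kernel was changed to take a start and end index; that modified kernel isn't shown in the post, but it presumably looks something like this:
// Assumed modification of the kernel so each launch only touches [start, end)
__global__ void kernel(int* data, int start, int end) {
    for (int var = start; var < end; ++var) {
        data[var] = data[var] + 1;
    }
}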
Now I wonder: what is the way to avoid hitting this threshold? I tried adding
Section "Device"
Identifier "Device0"
Driver "nvidia"
VendorName "NVIDIA Corporation"
Option "Interactive" "0" #<<--- added to avoid kernel time-out
EndSection
into the device section in my Xorg.conf, but this did not really help.