cudaMemcpy invalid argument, no clues about what is wrong - C++

I wrote some code and can't get it to work for some reason, even though it is largely copied and pasted from the working test program generated by Visual Studio:
__device__ int pseudoRandomFunction(int seed)
{
    unsigned int m_w = 150;
    unsigned int m_z = 40;
    m_z = 36969 * (m_z & 65535) + (m_z >> 16);
    m_w = 18000 * (m_w & 65535) + (m_w >> 16);
    return (m_z << 16) + m_w;
}

__global__ void fillArrayWithRandom(int* vector, int seed = 0)
{
    int i = threadIdx.x;
    vector[i] = pseudoRandomFunction(seed ^ i);
}
void Lab1(int* array, int size)
{
    int* gpuArray = 0;
    gpuErrorCheck(cudaSetDevice(0));
    gpuErrorCheck(cudaMalloc(&gpuArray, size));
    gpuErrorCheck(cudaMemcpy(gpuArray, array, size * sizeof(int), cudaMemcpyHostToDevice)); // <--- here is the invalid argument error!
    fillArrayWithRandom<<<1, size>>>(gpuArray, 0);
    gpuErrorCheck(cudaGetLastError());
    gpuErrorCheck(cudaDeviceSynchronize());
    gpuErrorCheck(cudaMemcpy(array, gpuArray, size * sizeof(int), cudaMemcpyDeviceToHost));
    cudaFree(gpuArray);
    gpuErrorCheck(cudaDeviceReset());
}
I even tried changing the argument types, but no progress... please help! My GPU is a GeForce 9600GT, with CUDA 6.5.

The only problem I can see is that this line is not correct:
gpuErrorCheck(cudaMalloc(&gpuArray, size));
it should be:
gpuErrorCheck(cudaMalloc(&gpuArray, size*sizeof(int)));
With that change, I can run your code without error when size = 10.
Note that your pseudoRandomFunction routine never uses its seed parameter, so every value it produces will be the same.
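A minimal sketch of how the routine could actually consume its seed; folding the seed into the initial state is my assumption, not part of the original generator:
__device__ int pseudoRandomFunction(int seed)
{
    // Fold the seed into the initial state so different seeds give
    // different outputs (assumed mixing; constants are from the original code)
    unsigned int m_w = 150u + (unsigned int)seed;
    unsigned int m_z = 40u ^ (unsigned int)seed;
    m_z = 36969 * (m_z & 65535) + (m_z >> 16);
    m_w = 18000 * (m_w & 65535) + (m_w >> 16);
    return (m_z << 16) + m_w;
}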


Printing array from unified memory on CUDA device doesn't work

I am trying to create some hashes on a CUDA device and print them on the host. But at the printf on the host I get a read error at position 0x000000000100002F.
The relevant lines look like this:
int main() {
    const int block_size = 2;
    const int num_blocks = 256;
    const int N = block_size * num_blocks;
    unsigned char** hashes;
    cudaMallocManaged(&hashes, N * (32 * sizeof(unsigned char)));
    cudaMemset(hashes, 0, N * (32 * sizeof(unsigned char)));
On the device:
__global__ void sha256_kernel(unsigned char **dhashes) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    sha256_final(&ctx, sha);
    dhashes[idx] = sha;
    // printf("%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x\n", dhashes[idx][0], dhashes[idx][1], dhashes[idx][2], dhashes[idx][3], dhashes[idx][4], dhashes[idx][5], dhashes[idx][6], dhashes[idx][7], dhashes[idx][8], dhashes[idx][9], dhashes[idx][10], dhashes[idx][11], dhashes[idx][12], dhashes[idx][13], dhashes[idx][14], dhashes[idx][15],
    //        dhashes[idx][16], dhashes[idx][17], dhashes[idx][18], dhashes[idx][19], dhashes[idx][20], dhashes[idx][21], dhashes[idx][22], dhashes[idx][23], dhashes[idx][24], dhashes[idx][25], dhashes[idx][26], dhashes[idx][27], dhashes[idx][28], dhashes[idx][29], dhashes[idx][30], dhashes[idx][31]);
    // printing here is correct
}
And back on the host side...
sha256_kernel<<<num_blocks, block_size>>>(hashes);
cudaDeviceSynchronize();
for (int i = 0; i < N; i++) {
    printf("%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x\n", hashes[i][0], hashes[i][1], hashes[i][2], hashes[i][3], hashes[i][4], hashes[i][5], hashes[i][6], hashes[i][7], hashes[i][8], hashes[i][9], hashes[i][10], hashes[i][11], hashes[i][12], hashes[i][13], hashes[i][14], hashes[i][15],
           hashes[i][16], hashes[i][17], hashes[i][18], hashes[i][19], hashes[i][20], hashes[i][21], hashes[i][22], hashes[i][23], hashes[i][24], hashes[i][25], hashes[i][26], hashes[i][27], hashes[i][28], hashes[i][29], hashes[i][30], hashes[i][31]);
} // printing here doesn't work
It seems correct to me, but when I try to print the hashes on the host, I get a read error. Why?
The memory allocation you are using to hold the hashes is incorrect. To have an array of pointers to the memory for each hash, you require memory allocated both for the array of pointers and for the hashes themselves, so something like:
unsigned char** hashes;
unsigned char* buff;
cudaMallocManaged(&hashes, N * sizeof(unsigned char*));
cudaMallocManaged(&buff, N * (32 * sizeof(unsigned char)));
cudaMemset(buff, 0, N * (32 * sizeof(unsigned char)));
for (int i = 0; i < N; i++) hashes[i] = &buff[i * 32];
[ Disclaimer: written in browser, never compiled or tested, use at own risk ]
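Note also that the kernel as posted stores a pointer (dhashes[idx] = sha;), which looks like the address of a thread-local buffer. Once the pointers in hashes are pre-wired as above, the kernel should copy the digest bytes into its slot instead; a minimal sketch, assuming sha is a thread-local 32-byte digest buffer:
__global__ void sha256_kernel(unsigned char **dhashes) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned char sha[32];            // per-thread digest buffer
    // ... compute the digest into sha, e.g. sha256_final(&ctx, sha) ...
    for (int b = 0; b < 32; b++)
        dhashes[idx][b] = sha[b];     // copy the bytes out; storing the local
                                      // pointer leaves a dangling address for
                                      // the host to read
}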

How does cudaLaunchKernel know the array size of "void **args"?

I know the size of an array can be obtained with the following code:
int a = 12;
float b = 12.0f;
char c = 'c';
void *param[] = { (void*)&a, (void*)&b, (void*)&c };
// the number of elements in param
size_t size = sizeof(param) / sizeof(void*);
But now I want param to be passed to a function named TryToGetTheSize, and to get the size as the return value.
size_t TryToGetTheSize(void **array)
{
    // return the number of elements in the void* array
}
...
size_t size = TryToGetTheSize(param);
I've tried an idea from the implementation of strlen, which moves a char* pointer forward through contiguous memory, counting until the value at the current position is '\0'.
But that method does not work with void**: there is no way to check whether the address a given void* points to is valid.
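To make the strlen idea work at all, one would have to impose a sentinel convention, analogous to the '\0' that strlen relies on. A minimal sketch, assuming the caller agrees to append a NULL entry (which is not something cudaLaunchKernel requires):
size_t TryToGetTheSize(void **array)
{
    // Only valid if the array is terminated by a NULL entry,
    // playing the same role '\0' plays for strlen
    size_t n = 0;
    while (array[n] != NULL) ++n;
    return n;
}
// usage: void *param[] = { (void*)&a, (void*)&b, (void*)&c, NULL };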
So it seems impossible to know the size given only the void** array itself. But when I looked through the CUDA API, I found this:
cudaLaunchKernel(const void* func, dim3 gridDim, dim3 blockDim, void** args, size_t sharedMem, cudaStream_t stream)
In CUDA we usually use <<<>>> to launch a kernel, but it is equivalent to setting up the arguments manually and calling cudaLaunchKernel directly.
In the cudaLaunchKernel API, I notice that the fourth parameter, args, is used to pass the parameters of the kernel function func, and there is no other parameter describing the size of args.
So, I have two questions:
1) How does cudaLaunchKernel know the size of void** args?
2) If cudaLaunchKernel doesn't need to know the size of void** args, how does it work?
Here is my sample code, which uses cudaLaunchKernel instead of <<<>>> for the kernel launch.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

__global__
void saxpy(int n, float a, float *x, float *y)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) y[i] = a * x[i] + y[i];
}

int main(void)
{
    int N = 1 << 20;
    float *hx, *hy, *dx, *dy;
    hx = (float*)malloc(N * sizeof(float));
    hy = (float*)malloc(N * sizeof(float));
    cudaMalloc(&dx, N * sizeof(float));
    cudaMalloc(&dy, N * sizeof(float));
    for (int idx = 0; idx < N; idx++)
    {
        hx[idx] = 1.0f;
        hy[idx] = 2.0f;
    }
    cudaMemcpy(dx, hx, N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(dy, hy, N * sizeof(float), cudaMemcpyHostToDevice);
    unsigned int threads = 256;
    unsigned int blocks = (N + 255) / threads;
    float ratio = 2.0f;
    //saxpy<<<blocks, threads>>>(N, ratio, dx, dy);
    void *args[] = { &N, &ratio, &dx, &dy };
    cudaLaunchKernel((void*)saxpy, dim3(blocks), dim3(threads), args, 0, NULL);
    cudaMemcpy(hy, dy, N * sizeof(float), cudaMemcpyDeviceToHost);
    float max_error = 0.0f;
    for (int jdx = 0; jdx < N; jdx++)
    {
        max_error = fmaxf(max_error, fabsf(hy[jdx] - 4.0f));
    }
    printf("Max Error: %f\n", max_error);
    cudaFree(dx);
    cudaFree(dy);
    free(hx);
    free(hy);
    return 0;
}
Quoting from the related documentation:
The number of kernel parameters and their offsets and sizes do not need to be specified, as that information is retrieved directly from the kernel's image.
Every CUDA device function has its argument list stored with the statically compiled function code. The API, therefore, knows exactly how many argument entries a call to cudaLaunchKernel requires. You will get a segfault or undefined behaviour if you supply too few to the launch call.
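For contrast, the lower-level driver API shows where an explicit size does appear. With cuLaunchKernel's kernelParams argument the count likewise comes from the kernel image, but if you pack the argument buffer yourself, you state its size through the extra parameter. A fragment sketch, assuming f is a valid CUfunction obtained via cuModuleGetFunction:
#include <cuda.h>
// Arguments packed manually into one buffer at their ABI offsets;
// here the size *is* stated explicitly, unlike with kernelParams.
char argbuf[256];
size_t argsize = sizeof(argbuf);
void *extra[] = {
    CU_LAUNCH_PARAM_BUFFER_POINTER, argbuf,
    CU_LAUNCH_PARAM_BUFFER_SIZE,    &argsize,
    CU_LAUNCH_PARAM_END
};
// cuLaunchKernel(f, gridX, 1, 1, blockX, 1, 1,
//                0 /*sharedMem*/, NULL /*stream*/,
//                NULL /*kernelParams*/, extra);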

CUDA - Parallel Reduction Sum

I am trying to implement a parallel reduction sum in CUDA 7.5. I have been following the NVIDIA PDF that walks you through the initial algorithm and then steadily more optimised versions. To check that the output is correct, I fill an array with 1 in every position, but for an array of size 64 I get -842159451 instead of 64. I expect the kernel code is correct, as I have followed NVIDIA's exact code for it, but here is my kernel:
__global__ void reduce0(int *input, int *output) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = input[i];
    __syncthreads();
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2 * s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) output[blockIdx.x] = sdata[0];
}
Here is my code calling the kernel, which is where I expect my problem to be:
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

int main()
{
    int numThreadsPerBlock = 1024;
    int *hostInput;
    int *hostOutput;
    int *deviceInput;
    int *deviceOutput;
    int numInputElements = 64;
    int numOutputElements; // number of elements in the output list, initialised below
    numOutputElements = numInputElements / (numThreadsPerBlock / 2);
    if (numInputElements % (numThreadsPerBlock / 2)) {
        numOutputElements++;
    }
    hostInput = (int *)malloc(numInputElements * sizeof(int));
    hostOutput = (int *)malloc(numOutputElements * sizeof(int));
    for (int i = 0; i < numInputElements; ++i) {
        hostInput[i] = 1;
    }
    const dim3 blockSize(numThreadsPerBlock, 1, 1);
    const dim3 gridSize(numOutputElements, 1, 1);
    cudaMalloc((void **)&deviceInput, numInputElements * sizeof(int));
    cudaMalloc((void **)&deviceOutput, numOutputElements * sizeof(int));
    cudaMemcpy(deviceInput, hostInput, numInputElements * sizeof(int), cudaMemcpyHostToDevice);
    reduce0<<<gridSize, blockSize>>>(deviceInput, deviceOutput);
    cudaMemcpy(hostOutput, deviceOutput, numOutputElements * sizeof(int), cudaMemcpyDeviceToHost);
    for (int ii = 1; ii < numOutputElements; ii++) {
        hostOutput[0] += hostOutput[ii]; // accumulate the sum in the first element
    }
    int sumGPU = hostOutput[0];
    printf("GPU Result: %d\n", sumGPU);
    std::string wait;
    std::cin >> wait;
    return 0;
}
I have also tried bigger and smaller input array sizes, and I get the same kind of very large negative value no matter the size of the array.
It seems you are using a dynamically allocated shared memory array:
extern __shared__ int sdata[];
but you are not allocating it in the kernel invocation:
reduce0<<<gridSize, blockSize>>>(deviceInput, deviceOutput);
You have two options:
Option 1
Allocate the shared memory statically in the kernel, e.g.
constexpr int threadsPerBlock = 1024;
__shared__ int sdata[threadsPerBlock];
More often than not I find this the cleanest approach, as it works without a problem when you have multiple arrays in shared memory. The drawback is that, while the size usually depends on the number of threads in the block, it needs to be known at compile time.
Option 2
Specify the amount of dynamically allocated shared memory in the kernel invocation.
reduce0<<<gridSize, blockSize, numThreadsPerBlock * sizeof(int)>>>(deviceInput, deviceOutput);
This will work for any value of numThreadsPerBlock (provided it is within the allowed range, of course). The drawback is that if you have multiple extern shared arrays, you need to figure out how to lay them out in memory yourself so that one does not overwrite another; a sketch of the usual approach follows.
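For illustration, a common way to carve several arrays out of the single dynamic allocation; the array names and the int/float split here are hypothetical:
__global__ void kernel_with_two_arrays(/* ... */)
{
    extern __shared__ char smem[];
    // First blockDim.x ints, then blockDim.x floats, from one allocation
    int*   idata = (int*)smem;
    float* fdata = (float*)(smem + blockDim.x * sizeof(int));
    // ... use idata and fdata as two separate shared arrays ...
}
// launched with:
// kernel_with_two_arrays<<<grid, block, block.x * (sizeof(int) + sizeof(float))>>>(/* ... */);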
Note, there may be other problems in your code. I didn't test it. This is something I spotted immediately upon glancing over your code.
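For instance, one likely candidate (my observation, also untested): with numInputElements = 64 and 1024 threads per block, sdata[tid] = input[i] reads far past the end of the 64-element input array for most threads. A common remedy is a guarded load that pads with zero, which requires passing the element count into the kernel; a minimal sketch:
__global__ void reduce0(int *input, int *output, int n) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = (i < (unsigned int)n) ? input[i] : 0; // out-of-range threads contribute 0
    __syncthreads();
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2 * s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0) output[blockIdx.x] = sdata[0];
}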

Atomic Add on CUDA not working

My problem is to find the number of integer points in an n-dimensional sphere using CUDA. I don't understand what is wrong with the code below, but it gives 0 as output every time. The CUDA compute capability is 2.0 and the toolkit version is 3.10.
Thanks for all the help.
__global__ void count_in(int pow_rad, int ndim, int *digit, int w, unsigned int *count, double radius)
{
    long int i, j;
    int rem, idx, sq, num;
    int iy = blockDim.y * blockIdx.y + threadIdx.y;
    int ix = blockDim.x * blockIdx.x + threadIdx.x;
    int width = gridDim.x * blockDim.x;
    int h = 2 * w + 1;
    i = iy * width + ix;
    if (i > pow_rad) return;
    sq = 0;
    idx = 0;
    num = i;
    for (j = 0; j < ndim; j++)
        digit[j] = 0;
    while (num != 0)
    {
        rem = num % w;
        num /= w;
        digit[idx] = rem;
        idx++;
    }
    for (j = 0; j < ndim; j++)
        sq += (digit[j] - h) * (digit[j] - h);
    if (sq < (radius * radius))
        atomicInc(count, (unsigned int)1);
    __syncthreads();
}
int main(int argc, char* argv[])
{
    const long ntrials = 5;
    int i;
    for (int n = 0; n < ntrials; ++n) {
        int *digit;
        unsigned int *count;
        std::cout << n << std::endl;
        int pow_rad;
        unsigned int num;
        // Select radius and number of dimensions at random
        const double r = drand48() * (RMAX - RMIN) + RMIN;
        const int nd = lrand48() % (MAXDIM - 1) + 1;
        cudaMalloc((void**)&digit, sizeof(int) * nd);
        cudaMalloc((void**)&count, sizeof(unsigned int));
        cudaMemset(count, 0, sizeof(unsigned int));
        int h = (int)floor(r);
        int w = 2 * h + 1;
        std::cout << "###" << r << " " << nd << std::endl;
        for (i = 1; i <= nd; i++)
            pow_rad *= w;
        int width = (int)sqrt(pow_rad);
        // Call your function
        dim3 dimBlock(32, 32);
        dim3 dimGrid((width / 32) + 1, (width / 32) + 1);
        count_in<<<dimGrid, dimBlock>>>(pow_rad, nd, digit, w, count, r);
        cudaMemcpy(&num, count, sizeof(unsigned int), cudaMemcpyDeviceToHost);
        std::cout << "-->" << num << std::endl;
    }
}
I didn't look at all of your code, but the line
atomicInc(count, (unsigned int)1);
seems to show a common misunderstanding of the atomicInc function. The second argument is not the amount to increment by, but the modulus; when the variable reaches that value, it resets to zero. With the value you specified, each time the statement executes, count is reset to 0.
If you change atomicInc to atomicAdd, or if you change the modulus to something large enough that it will never be reached, it should work better.
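Either fix is a one-line change; a minimal sketch of both variants:
// Variant 1: unconditional atomic increment by one
atomicAdd(count, 1u);
// Variant 2: keep atomicInc, with a modulus the counter can never reach,
// so it wraps to zero only at UINT_MAX
atomicInc(count, 0xFFFFFFFFu);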

Numerical error in cuda/cublas simple kernel using particular input

I am working with CUDA and cuBLAS, and I was trying to implement simple operations like element-wise matrix multiplication/division. I am using only float for my experiments. I know the most obvious way to do it is to write a kernel like this one:
__global__ void mul_elementwise(const unsigned int n, float* source, float* dest, const float value)
{
    const unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int stride = blockDim.x * gridDim.x;
    for (unsigned int i = offset; i < n; i += stride)
    {
        dest[i] = source[i] * value;
    }
}
This kernel works both for multiplication and division (just use 1/x as value). But the same thing can be achieved with the cuBLAS library: suppose we have a matrix A of size m x n stored in column-major style, a scalar x, and d_ones, a vector of m*n ones; then, setting alpha = x or alpha = 1/x, we can invoke the following and obtain the same result:
cublasSaxpy(cublas_handle, m * n, &alpha, d_ones, 1, A_dev, 1);
Both methods work just fine, but I am facing problems with one particular matrix, for which both methods do not work. I isolated this big matrix and built an MCVE, available here (you can compile it with nvcc mcve.cu -lcublas). As you can see, the results in both cases are totally wrong: the host result is totally different, and I am trying to figure out what's going on. I do not see any error in the code, but maybe I should try using double instead of float and see what happens.
Any opinions about this situation? Thanks in advance!
EDIT #1: I tried using doubles, but nothing changes if I use cublasDaxpy, while it works perfectly with the custom kernel. I think the values are too small, so single floating-point precision is not enough.
Interesting MCVE. Wouldn't it have been possible to shrink your vector down to just a few elements? Isn't it possible to show the calculation discrepancy based on just 1 vector element?
Anyway I see several problems.
Your kernel implements the following function: y = alpha*x. But SAXPY implements y = alpha*x + y. Now, if y started out as (all) zero, then these two would be the same. But that's not what you have:

         CUBLAS   Your kernel
         --------------------
alpha:   alpha    alpha
x:       1        ahost   (ahost is your huge data array)
y:       ahost    -

So your kernel is computing y = alpha*ahost, but your CUBLAS call is computing y = alpha*1 + ahost. I wouldn't expect the same result from these, in general.
Your analysis of the error seems flawed in a few ways. First, you are computing the absolute error in a float variable (a number which will always be positive, since it's an absolute value), but then you're comparing it against a negative number:
float diff = abs(host[i]-dev[i]);
...
if (diff > (-1e12))
Won't that if test always be true? Perhaps you meant 1e-12, although that would still be flawed: a fixed error threshold for a floating-point comparison should be scaled to the size of the numbers being compared, and float quantities only carry about 6-7 accurate decimal digits. (Summing these errors is also troublesome.)
Here is a complete code that has the above issues fixed and produces zero sum error for all the comparisons (host<->kernel and host<->cublas):
static float array[] = {0x00000000,
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0xB58DA1CF,0xB50D2FEC,0x34A48536,0xB4A1D5BC,0x358E1345,0x35943AAC,0xB5983F40,0xB43628BB,0xB4A95348,0xB4DB751C,0xB50C8D1A,0xB3EFCBB5,0x3552B8CD,0x3538A167,0x358FDE0D,0xB4D54CE9,0xB5D29BB7,0xB4A234EE,0x346EF2F4,0x35B5D9F2,0xB40F1487,0x3554BC20,0x33FD9466,0xB536D37D,0xB3C2E594,0xB59DA581,0x3584FC87,0x34438F09,0x35D293CB,0xB4FBB002,0xB59F41E9};
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cublas_v2.h>
#include <assert.h>

#define TOL 0.0001

typedef unsigned int u32;
#define GET_STRIDE() u32(blockDim.x * gridDim.x)
#define GET_OFFSET() u32(blockIdx.x * blockDim.x + threadIdx.x)

inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
        assert(result == cudaSuccess);
    }
#endif
    return result;
}

__global__ void div_elementwise(const u32 n, float* source, float* dest, const float value)
{
    for (u32 i = GET_OFFSET(); i < n; i += GET_STRIDE())
    {
        dest[i] = source[i] * value;
    }
}

float check_eq(float* dev, float* host, u32 len)
{
    float sum = 0.0f;
    for (u32 i = 0; i < len; ++i)
    {
        if (dev[i] != host[i])
        {
            //printf("diff %u %f %f\n", i, dev[i], host[i]);
            //break;
            float diff = fabsf((host[i] - dev[i]) / host[i]); // relative, not absolute, error
            sum += diff;
            if (diff > (TOL))
                printf("diff %u %f\n", i, diff);
        }
    }
    printf("%f\n", sum);
    return sum;
}

void div_host(float* a, float v, u32 len)
{
    for (u32 i = 0; i < len; ++i)
    {
        a[i] = a[i] * v;
    }
}

int main()
{
    u32 len = sizeof(array) / sizeof(float);
    printf("array len = %u\n", len);
    for (u32 i = 0; i < len; i++)
        if (isnan(array[i])) { printf("nan value at %u\n", i); return -1; }
    float *adev, *adevcublas, *d_zero;
    float *ahost = (float*)malloc(len * sizeof(float));
    checkCuda(cudaMalloc(&adev, len * sizeof(float)));
    checkCuda(cudaMalloc(&adevcublas, len * sizeof(float)));
    checkCuda(cudaMalloc(&d_zero, len * sizeof(float)));
    memcpy(ahost, &array[0], len * sizeof(float));
    checkCuda(cudaMemcpy(adev, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
    checkCuda(cudaMemcpy(adevcublas, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
    checkCuda(cudaMemset(d_zero, 0, len * sizeof(float)));
    float alpha = 1 / 2494.f;
    printf("%f\n", alpha);
    div_host(ahost, alpha, len);
    u32 tb = 256;
    div_elementwise<<<((len + tb - 1) / tb), tb>>>(len, adev, adev, alpha);
    float* r = (float*)malloc(len * sizeof(float));
    checkCuda(cudaMemcpy(r, adev, len * sizeof(float), cudaMemcpyDeviceToHost));
    check_eq(r, ahost, len);
    cublasHandle_t ch;
    cublasCreate(&ch);
    float* r0 = (float*)malloc(len * sizeof(float));
    // y (d_zero) starts as all zeros, so y = alpha*x + y reduces to y = alpha*x
    cublasStatus_t stat = cublasSaxpy(ch, len, &alpha, adevcublas, 1, d_zero, 1);
    if (stat != CUBLAS_STATUS_SUCCESS) { std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1; }
    checkCuda(cudaMemcpy(r0, d_zero, len * sizeof(float), cudaMemcpyDeviceToHost));
    check_eq(r0, ahost, len);
    cublasDestroy(ch);
    free(r);
    free(r0);
    free(ahost);
    cudaFree(adev);
    cudaFree(adevcublas);
    cudaFree(d_zero);
    return 0;
}
}