Managing a 2D CUDA Array - C++

I'm trying to pass a 2D array to a kernel so that each thread can access index = threadIdx.x + (blockIdx.x * blockDim.x), but I'm having trouble figuring out how to do this and how to copy the data back.
size_t pitch;
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks * sizeof(int));
kernel<<<grid_size, block_size>>>(d_array, pitch);
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size, num_blocks, cudaMemcpyDeviceToHost);
for (int block = 0; block < num_blocks; block++)
    for (int thread = 0; thread < block_size; thread++)
        // h_array[block][thread] should be 1
__global__ void kernel(int *array, int pitch) {
int *row = (int*)((char*)array + blockIdx.x * pitch);
row[threadIdx.x] = 1;
return;
}
What am I doing wrong here?

Your cudaMemset2D is accessing a bigger memory area than you previously allocated with cudaMallocPitch, and your cudaMemcpy2D is copying only a small portion of that memory.
You should use the functions in the following way:
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks); // not num_blocks * sizeof(int): the height is a row count, and that size is bigger than what was allocated
kernel<<<grid_size, block_size>>>(d_array, pitch);
cudaMemcpy2D(h_array, pitch, d_array, pitch, block_size * sizeof(int) /* the width in bytes was missing */, num_blocks, cudaMemcpyDeviceToHost);
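One detail worth noting beyond the original answer: the destination pitch for the device-to-host copy should be the host array's own row stride, and since h_array is an ordinary tightly packed allocation, that stride is just block_size * sizeof(int). A minimal sketch of the host side (my addition, assuming h_array, d_array, and pitch as above):
int *h_array = (int *)malloc(block_size * num_blocks * sizeof(int)); // tightly packed rows
cudaMemcpy2D(h_array, block_size * sizeof(int),  // host pitch: packed row width in bytes
             d_array, pitch,                     // device pitch: value from cudaMallocPitch
             block_size * sizeof(int), num_blocks, cudaMemcpyDeviceToHost);
This is the form the complete code below uses.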

Here's a complete code that passes a basic test, with the errors mentioned by @hidrargyro fixed:
$ cat t236.cu
#include <stdio.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
__global__ void kernel(int *array, int pitch) {
int *row = (int*)((char*)array + blockIdx.x * pitch);
row[threadIdx.x] = 1;
return;
}
int main(){
int *d_array, *h_array;
int block_size = 256;
int num_blocks = 256;
int grid_size = num_blocks;
h_array=(int *)malloc(block_size*num_blocks*sizeof(int));
if (h_array==0) {printf("malloc fail\n"); return 1;}
size_t pitch;
cudaMallocPitch(&d_array, &pitch, block_size * sizeof(int), num_blocks);
cudaCheckErrors("cudaMallocPitch fail");
cudaMemset2D(d_array, pitch, 0, block_size * sizeof(int), num_blocks);
cudaCheckErrors("cudaMemset2D fail");
kernel<<<grid_size, block_size>>>(d_array, pitch);
cudaDeviceSynchronize();
cudaCheckErrors("kernel fail");
cudaMemcpy2D(h_array, block_size*sizeof(int), d_array, pitch, block_size*sizeof(int), num_blocks, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2D fail");
for (int i = 0; i<num_blocks; i++)
for(int j = 0; j<block_size; j++)
if (h_array[i*block_size+j] != 1) {printf("mismatch at i=%d, j=%d, should be 1, was %d\n", i,j,h_array[i*block_size+j]); return 1;}
printf("success\n");
return 0;
}
$ nvcc -arch=sm_20 -o t236 t236.cu
$ ./t236
success
$

Related

cudaMemcpyFromSymbol and cudaMemcpyToSymbol always return cudaErrorInvalidSymbol (13) error

I have a problem calling the cudaMemcpyFrom(To)Symbol functions in the CUDA C++ API. Alternative ideas for storing some parameters between blocks are welcome. Below I've provided some example code that doesn't work correctly.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cstdlib>
#include <ctime>
int avgHost(int*, int);
cudaError_t cudaError;
__device__ int getGlobalIdx()
{
int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
return threadId;
}
__device__ int avg;
__device__ int count;
__device__ int sum;
__global__ void avgKernel(const int *arr)
{
auto idx = getGlobalIdx();
count++;
sum += arr[idx];
avg = sum / count;
}
int main(int argc, char ** argv)
{
const int arraySize = 128;
auto arr1 = (int*)__vcrt_malloc_normal(arraySize * sizeof(int));
srand(time(NULL));
for (size_t i = 0; i < arraySize; i++)
{
arr1[i] = rand() % 100;
}
auto hostAvg = avgHost(arr1, arraySize);
fprintf_s(stdout, "AVG: %d", hostAvg);
free(arr1);
return 0;
}
int avgHost(int* arr, int arraySize)
{
int *dArray = nullptr;
cudaSetDevice(0);
cudaError = cudaMemcpyToSymbol((void *)count, (void*)0, sizeof(int), 0, cudaMemcpyHostToDevice);
if (cudaError)
{
fprintf_s(stderr, "%s\t%d\n", cudaGetErrorString(cudaError), __LINE__);
return -1;
}
cudaMalloc((void**)&dArray, arraySize * sizeof(int));
cudaMemcpy(dArray, arr, arraySize * sizeof(int), cudaMemcpyKind::cudaMemcpyHostToDevice);
avgKernel <<<1, arraySize>>> (dArray);
cudaDeviceSynchronize();
int hostResult = -1;
cudaError = cudaMemcpyFromSymbol(&hostResult, (void *)avg, sizeof(int), 0, cudaMemcpyDeviceToHost);
if (cudaError)
{
fprintf_s(stderr, "%s\t%d\n", cudaGetErrorString(cudaError), __LINE__);
}
cudaFree(dArray);
return hostResult;
}
When I launch the code, the system prints:
invalid device symbol 55 AVG: -1
C:\Users\Administrator\source\repos\CudaTests\x64\Debug\cudabase.exe
(process 18152) exited with code 0.
Any ideas?
Remove these lines, because the device variable "count" is already 0 before the kernel launches:
cudaError = cudaMemcpyToSymbol((void *)count, (void*)0, sizeof(int), 0, cudaMemcpyHostToDevice);
if (cudaError)
{
fprintf_s(stderr, "%s\t%d\n", cudaGetErrorString(cudaError), __LINE__);
return -1;
}
EDIT
Change
cudaError = cudaMemcpyFromSymbol(&hostResult, (void *)avg, sizeof(int), 0, cudaMemcpyDeviceToHost);
to
cudaError = cudaMemcpyFromSymbol(&hostResult, avg, sizeof(int), 0, cudaMemcpyDeviceToHost);
(remove the cast from the parameter "avg"). IntelliSense reports an error, but the code compiles and runs successfully.
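For reference, a minimal sketch of correct symbol transfers in this style (my addition, assuming the __device__ variables avg and count declared above; the host-side variable zero is hypothetical):
int zero = 0;
// host -> device: pass the symbol itself, plus the address of a real host object
cudaMemcpyToSymbol(count, &zero, sizeof(int));
// device -> host: destination address first, then the symbol
int hostResult = 0;
cudaMemcpyFromSymbol(&hostResult, avg, sizeof(int));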

Allocate a 1-dimensional array with cudaMallocPitch and then copy to device with cudaMemcpy2D

I have read the post Allocate 2D array with cudaMallocPitch and copying with cudaMemcpy2D, among many others including the NVIDIA docs, and I can't get cudaMallocPitch to work together with cudaMemcpy2D.
I need to copy a very big matrix stored in array format (Matrix[width*height]), along with a simple array, to perform matrix * vector operations. Using cudaMallocPitch is not optional for me: I need it to avoid conflicts and get better performance.
So I started by just trying to copy the matrix (a vector in my case) to the device and check whether it was correctly copied, but my code does not print anything. If I use cudaMalloc and cudaMemcpy everything works fine. But I do not know what to do with cudaMallocPitch and cudaMemcpy2D.
What can I do to fix this?
#include <stdio.h>
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
while (idx < N)
{
double e = *(double *)(((char *) A + idx * mpitch) + N);
printf("(%f)", e);
}
}
int main()
{
int N = 1500;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaFree(d_A);
delete [] A;
return 0;
}
Error checking can make a big difference in debugging. You should always use it before asking here.
It wasn't clear whether you wanted a row or column vector, i.e. a matrix of [1xN] or [Nx1].
I've added an explanation at talonmies' suggestion, but first the working slabs of code.
Here's [Nx1]
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx>=N) return;
double e = *(double *)(((char *) A + idx * mpitch));
printf("(%f)", e);
}
int main()
{
int N = 15;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy2D(d_A, pitch, A, sizeof(double), sizeof(double), N, cudaMemcpyHostToDevice);
if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;
cudaFree(d_A);
delete [] A;
return 0;
}
[1xN]:
#include <cstdio>
#include <iostream>
#include <cuda.h>
using namespace std;
__global__ void kernel(size_t mpitch, double * A, int N)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if(idx>=N) return;
int row=0;//only one row
double *row_ptr = (double *)((char *)A + row * mpitch); // cast to char* before applying the byte offset
double e = row_ptr[idx];
printf("(%f)", e);
}
int main()
{
int N = 15;
double * A = new double[N], * d_A;
size_t pitch;
for (int i = 0; i < N; ++i)
{
A[i] = i;
}
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
err = cudaMemcpy2D(d_A, pitch, A, sizeof(double)*N, sizeof(double)*N, 1, cudaMemcpyHostToDevice);
if(err!=cudaSuccess) cout<<"err1:"<<cudaGetErrorString(err)<<endl;
unsigned int blocksize = 1024;
unsigned int nblocks = (N + blocksize - 1) / blocksize;
kernel <<<nblocks, blocksize>>>(pitch, d_A, N);
cudaDeviceSynchronize();
err = cudaGetLastError();
if(err!=cudaSuccess) cout<<"err2:"<<cudaGetErrorString(err)<<endl;
cudaFree(d_A);
delete [] A;
return 0;
}
Explanation
First, error handling:
Considering how easy error handling is in CUDA, there isn't a good excuse not to put it in.
cudaError_t err = cudaMallocPitch(&d_A, &pitch, sizeof(double)*N, 1);
if(err!=cudaSuccess) cout<<"err0:"<<cudaGetErrorString(err)<<endl;
Second, you didn't specify if you wanted a column vector or a row vector. Since a row vector is simply a 1-D array in linear memory and you don't need pitched memory to do that, I will assume for this explanation that you meant a column vector.
The recurring problem you were having was "misaligned address" in the kernel. This indicates that the problem is book-keeping, so let's walk through the three major steps of handling an aligned 2D array (even though our arrays will be either a column or row vector).
Allocating:
Your allocation was written out as
cudaMallocPitch(&d_A, &pitch, sizeof(double) * N, 1);
This is correct for the row vector, as the API is cudaMallocPitch(void** devPtr, size_t* pitch, size_t row_width_in_bytes, size_t count_of_rows). However, if we want a column vector, the correct call is
cudaMallocPitch(&d_A, &pitch, sizeof(double), N);
Accessing:
For accessing, you were mixing up accessing a row and accessing an element within the row.
double e = *(double *)(((char *) A + idx * mpitch) + N);
Once again stick to the documentation. The API documentation for cudaMallocPitch includes
T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
for us this translates into
int column=0;
double element = *((double *)((char *)A + idx * mpitch) + column);
I've used column = 0 for completeness, since we do not have more than one column.
Copying:
cudaMemcpy2D(d_A, pitch, A, N * sizeof(double), sizeof(double) * N, 1, cudaMemcpyHostToDevice);
For this case this is correct. The API for cudaMemcpy2D is
cudaMemcpy2D(void* dst, size_t dst_pitch_bytes, const void* src, size_t src_pitch_bytes, size_t width_in_bytes, size_t height_in_rows, cudaMemcpyKind kind);
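For completeness (my addition, not part of the original answer): copying the pitched column vector back to a tightly packed host array just swaps the roles of the two pitches, matching the [Nx1] code above:
err = cudaMemcpy2D(A, sizeof(double), d_A, pitch, sizeof(double), N, cudaMemcpyDeviceToHost);
if(err!=cudaSuccess) cout<<"err3:"<<cudaGetErrorString(err)<<endl;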

Multiply vectorized 2D square matrix and compressed tridiagonal matrix in CUDA

I have two matrices
#define MATRIX_SIZE 20
#define BLOCK_SIZE 2
#define TILE_SIZE 2
double** A;
double** B;
Matrix A is dense, Matrix B is tridiagonal. I have created a vectorized representation of A
/* sz = A.rowlen = B.rowlen = A.collen = B.collen */
double* A1d = matrix_to_vector(sz, A);
I have also created a compressed representation of B with the following code:
double* l_array = new double[sz - 1];
double* m_array = new double[sz];
double* r_array = new double[sz - 1];
int current_l_idx = 0;
int current_m_idx = 0;
int current_r_idx = 0;
for (int i = 0; i < sz; i++) {
for (int j = 0; j < sz; j++) {
if ((i == j+1) || (i-1 == j)) {
l_array[current_l_idx] = B[i][j];
current_l_idx++;
}
else if ((i == j-1) || (i+1 == j)) {
r_array[current_r_idx] = B[i][j];
current_r_idx++;
}
else if (i == j) {
m_array[current_m_idx] = B[i][j];
current_m_idx++;
}
}
}
I then create an empty 2D vectorized matrix E as well as all my objects for CUDA
double* E1d = matrix_to_vector(sz, E);
double* d_A;
double* d_B_l;
double* d_B_m;
double* d_B_r;
double* d_E;
size_t sizeA = sz * sz * sizeof(double);
size_t sizeB_lr = (sz - 1) * sizeof(double);
size_t sizeB_m = sz * sizeof(double);
cudaMalloc(&d_A, sizeA);
cudaMalloc(&d_B_l, sizeB_lr);
cudaMalloc(&d_B_m, sizeB_m);
cudaMalloc(&d_B_r, sizeB_lr);
cudaMalloc(&d_E, sizeA);
cudaMemcpy(d_A, A1d, sizeA, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_l, l_array, sizeB_lr, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_m, m_array, sizeB_m, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_r, r_array, sizeB_lr, cudaMemcpyHostToDevice);
cudaMemcpy(d_E, E1d, sizeA, cudaMemcpyHostToDevice);
dim3 threads(BLOCK_SIZE, BLOCK_SIZE);
dim3 grid(MATRIX_SIZE / threads.x, MATRIX_SIZE / threads.y);
cudakernel<<<grid, threads>>>(sz, d_A, d_B_l, d_B_m, d_B_r, d_E);
I can perform this multiplication serially, but I unfortunately have no idea how to implement it on the CUDA device.
Assumptions
A and B are always square
sz will always be evenly divisible by BLOCK_SIZE and TILE_SIZE
BLOCK_SIZE will always equal TILE_SIZE
I suspect, based on your setup code, that you are looking for a tiled shared-memory approach to this kind of matrix multiplication. I don't really want to do your homework for you, so I'll demonstrate an example that doesn't use shared memory.
If you understand how matrix multiplication works, and you also understand how to create an ordinary shared memory GPU matrix multiply kernel, converting the following code to use shared memory should be relatively straightforward:
#include <stdio.h>
#define DSIZE 256
#define BSIZE 32
#define TOL 0.0001
typedef double mytype;
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// C = A x B
// A,B,C are all dense
template <typename T>
__global__ void mm(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < sz) && (idy < sz)){
T temp = 0;
for (int i = 0; i < sz; i++)
temp += A[idy*sz+i]*B[i*sz+idx];
C[idy*sz+idx] = temp;}
}
// C = A x B
// A,C are dense, B is tridiagonal
template <typename T>
__global__ void mmt(const T * __restrict__ A, const T * __restrict__ B_l, const T * __restrict__ B_m, const T * __restrict__ B_r, T * __restrict__ C, const int sz){
int idx = threadIdx.x+blockDim.x*blockIdx.x;
int idy = threadIdx.y+blockDim.y*blockIdx.y;
if ((idx < sz) && (idy < sz)){
T temp = 0;
if (idx > 0) temp += A[idy*sz+(idx-1)]*B_r[idx-1];
temp += A[idy*sz+(idx) ]*B_m[idx];
if (idx < (sz-1)) temp += A[idy*sz+(idx+1)]*B_l[idx];
C[idy*sz+idx] = temp;}
}
int main(){
mytype *d_A, *h_A, *d_B, *h_B, *d_C, *h_Cd, *h_Cs, *d_B_l, *h_B_l, *d_B_m, *h_B_m, *d_B_r, *h_B_r;
size_t msz = DSIZE*DSIZE;
size_t mszb = msz*sizeof(mytype);
// host side allocations
h_A = (mytype *)malloc(mszb);
h_B = (mytype *)malloc(mszb);
h_Cd =(mytype *)malloc(mszb);
h_Cs =(mytype *)malloc(mszb);
h_B_l = (mytype *)malloc((DSIZE-1)*sizeof(mytype));
h_B_r = (mytype *)malloc((DSIZE-1)*sizeof(mytype));
h_B_m = (mytype *)malloc( DSIZE*sizeof(mytype));
if (!h_A || !h_B || !h_Cd || !h_Cs || !h_B_l || !h_B_r || !h_B_m) {printf("malloc fail\n"); return -1;}
// device side allocations
cudaMalloc(&d_A, mszb);
cudaMalloc(&d_B, mszb);
cudaMalloc(&d_C, mszb);
cudaMalloc(&d_B_l, (DSIZE-1)*sizeof(mytype));
cudaMalloc(&d_B_r, (DSIZE-1)*sizeof(mytype));
cudaMalloc(&d_B_m, DSIZE*sizeof(mytype));
cudaCheckErrors("cudaMalloc fail");
// prepare A, B matrices
/*
|1 1 1 ...|
A = |2 2 2 ...|
|3 3 3 ...|
|4 4 4 ...|
|... |
|2 1 0 ...| B_l = left/lower subdiagonal (i.e. all 3's)
B = |3 2 1 ...| B_m = middle/main diagonal (i.e. all 2's)
|0 3 2 ...| B_r = right/upper superdiagonal (i.e. all 1's)
|0 0 3 ...|
|... |
*/
for (int i = 0; i < DSIZE; i++){
if (i < DSIZE-1){
h_B_r[i] = 1;
h_B_l[i] = 3;}
h_B_m[i] = 2;
for (int j = 0; j < DSIZE; j++){
h_A[i*DSIZE+j] = i+1;
if (j==i+1) h_B[i*DSIZE+j] = 1;
else if (j==i) h_B[i*DSIZE+j] = 2;
else if (j==i-1) h_B[i*DSIZE+j] = 3;
else h_B[i*DSIZE+j] = 0;}}
// copy data to device
cudaMemcpy(d_A, h_A, mszb, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, mszb, cudaMemcpyHostToDevice);
cudaMemcpy(d_B_l, h_B_l, (DSIZE-1)*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_r, h_B_r, (DSIZE-1)*sizeof(mytype), cudaMemcpyHostToDevice);
cudaMemcpy(d_B_m, h_B_m, DSIZE*sizeof(mytype), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
// perform dense-dense multiply
dim3 block(BSIZE,BSIZE);
dim3 grid((DSIZE+block.x-1)/block.x, (DSIZE+block.y-1)/block.y);
cudaMemset(d_C, 0, mszb);
mm<<<grid, block>>>(d_A, d_B, d_C, DSIZE);
cudaMemcpy(h_Cd, d_C, mszb, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 2/kernel fail");
// perform dense-sparse multiply
cudaMemset(d_C, 0, mszb);
mmt<<<grid, block>>>(d_A, d_B_l, d_B_m, d_B_r, d_C, DSIZE);
cudaMemcpy(h_Cs, d_C, mszb, cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy 3/kernel fail");
// compare results
for (int i = 0; i < DSIZE; i++)
for (int j = 0; j < DSIZE; j++)
if (abs(h_Cs[i*DSIZE+j] - h_Cd[i*DSIZE+j]) > TOL) {printf("results mismatch at (%d, %d) dense: %f sparse: %f\n", i, j, h_Cd[i*DSIZE+j], h_Cs[i*DSIZE+j]); return -1;}
printf("Success!\n");
return 0;
}
Notes:
All of the global memory accesses in the mmt kernel (i.e. for A, the B vectors, and C) should properly coalesce across threads. Therefore, a conversion to use shared memory should also easily yield non-bank-conflicted access to shared memory; a sketch of such a conversion follows after these notes.
While studying this code may be useful for learning, I recommend any serious sparse-dense matrix multiplication be done with routines from CUSPARSE such as csrmm. It will almost certainly be much more efficient (faster) than the above code, and likely faster than any shared memory conversion of the above code as well.
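To make the suggested conversion concrete, here is a rough sketch of a tiled shared-memory version of the dense mm kernel above. This is my illustration, not the original answer's code; it assumes the block dimensions are TILE x TILE and keeps the same row-major indexing:
#define TILE 32
template <typename T>
__global__ void mm_tiled(const T * __restrict__ A, const T * __restrict__ B, T * __restrict__ C, const int sz){
  __shared__ T As[TILE][TILE]; // one tile of A, staged cooperatively by the block
  __shared__ T Bs[TILE][TILE]; // one tile of B
  int idx = threadIdx.x+blockDim.x*blockIdx.x; // column of C
  int idy = threadIdx.y+blockDim.y*blockIdx.y; // row of C
  T temp = 0;
  for (int t = 0; t < (sz+TILE-1)/TILE; t++){
    // each thread loads one element of each tile (zero outside the matrix)
    As[threadIdx.y][threadIdx.x] = ((idy < sz) && (t*TILE+threadIdx.x < sz)) ? A[idy*sz + t*TILE+threadIdx.x] : (T)0;
    Bs[threadIdx.y][threadIdx.x] = ((t*TILE+threadIdx.y < sz) && (idx < sz)) ? B[(t*TILE+threadIdx.y)*sz + idx] : (T)0;
    __syncthreads();
    for (int i = 0; i < TILE; i++)
      temp += As[threadIdx.y][i]*Bs[i][threadIdx.x]; // partial dot product out of shared memory
    __syncthreads();
  }
  if ((idx < sz) && (idy < sz)) C[idy*sz+idx] = temp;
}
Launched as mm_tiled<<<grid, dim3(TILE,TILE)>>>(d_A, d_B, d_C, DSIZE), it should produce the same results as mm.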

Bad data coming from cudaMemcpy2D

If this sort of question has been asked I apologize, link me to the thread please!
Anyhow I am new to CUDA (I'm coming from OpenCL) and wanted to try generating an image with it. The relevant CUDA code is:
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
unsigned block_size = blockDim.x;
uint2 location = {blockIdx.x*block_size, blockIdx.y*block_size};
ulong2 pixel_location = {threadIdx.x, threadIdx.y};
ulong2 real_location = {location.x + pixel_location.x, location.y + pixel_location.y};
if (real_location.x >= width || real_location.y >= height)
return;
uint8_t *row = (uint8_t *)((char *)pixels + real_location.y * pitch);
row[real_location.x * 4+0] = 0;
row[real_location.x * 4+1] = 255;
row[real_location.x * 4+2] = 0;
row[real_location.x * 4+3] = 255;
}
cudaError_t err = cudaSuccess;
#define CUDA_ERR(e) \
if ((err = e) != cudaSuccess) { \
fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err)); \
exit(-1); \
}
int main(void) {
ulong2 dims = {1000, 1000};
unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);
dim3 remainders(dims.x % threads_per_block.x, dims.y % threads_per_block.y);
dim3 blocks(dims.x / threads_per_block.x + (remainders.x == 0 ? 0 : 1), dims.y / threads_per_block.y + (remainders.y == 0 ? 0 : 1));
size_t pitch;
uint8_t *pixels, *h_pixels = NULL;
CUDA_ERR(cudaMallocPitch(&pixels, &pitch, dims.x * 4 * sizeof(uint8_t), dims.y));
mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
h_pixels = (uint8_t *)malloc(dims.x * 4 * sizeof(uint8_t) * dims.y);
memset(h_pixels, 0, dims.x * 4 * sizeof(uint8_t) * dims.y);
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
save_png("out.png", h_pixels, dims.x, dims.y);
CUDA_ERR(cudaFree(pixels));
free(h_pixels);
CUDA_ERR(cudaDeviceReset());
puts("Success");
return 0;
}
The save_png function is a utility function I created for taking a block of data and saving it to a png:
void save_png(const char *filename, uint8_t *buffer, unsigned long width, unsigned long height) {
png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!png_ptr) {
std::cerr << "Failed to create png write struct" << std::endl;
return;
}
png_infop info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr) {
std::cerr << "Failed to create info_ptr" << std::endl;
png_destroy_write_struct(&png_ptr, NULL);
return;
}
FILE *fp = fopen(filename, "wb");
if (!fp) {
std::cerr << "Failed to open " << filename << " for writing" << std::endl;
png_destroy_write_struct(&png_ptr, &info_ptr);
return;
}
if (setjmp(png_jmpbuf(png_ptr))) {
png_destroy_write_struct(&png_ptr, &info_ptr);
std::cerr << "Error from libpng!" << std::endl;
return;
}
png_init_io(png_ptr, fp);
png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGBA, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_write_info(png_ptr, info_ptr);
png_byte *row_pnts[height];
size_t i;
for (i = 0; i < height; i++) {
row_pnts[i] = buffer + width * 4 * i;
}
png_write_image(png_ptr, row_pnts);
png_write_end(png_ptr, info_ptr);
png_destroy_write_struct(&png_ptr, &info_ptr);
fclose(fp);
}
Anyway, the image that's generated is a weird whitish strip that's speckled with random colored pixels.
Is there something glaring I did wrong? I tried to follow the introduction documentation on the CUDA site. Otherwise can anyone help me out to fix this? Here I'm simply trying to fill the pixels buffer with green pixels.
I am using a MBP retina with an NVIDIA GeForce GT 650M discrete graphics card. I can run and paste the output to print_devices from the cuda sample code if need be.
EDIT: Note no errors or warnings during compilation with the following makefile:
all:
nvcc -c mandlebrot.cu -o mandlebrot.cu.o
nvcc mandlebrot.cu.o -o mandlebrot -lpng
and no errors at runtime.
It's better if you provide complete code that someone can copy, paste, compile, and run without adding or changing anything. Stripping off the include headers isn't helpful, in my opinion, and making your test code dependent on a png library that others may not have is also not productive, if you want help.
Your error checking on kernel launches is broken. You may want to review proper CUDA error checking. If you had proper error checking, or ran your code with cuda-memcheck, you would discover an error 9 on the kernel launch, which is an invalid configuration. If you print out your blocks and threads_per_block variables, you'll see something like this:
blocks: 2, 2
threads: 500, 500
You are in fact setting threads per block to 500,500 here:
unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);
That is illegal, as you are requesting 500x500 threads per block (i.e. 250,000 threads), which exceeds the maximum limit of 1024 threads per block.
So your kernel is not running at all and you're getting garbage.
You can fix this error pretty simply by changing your block_size definition:
unsigned long block_size = 16;
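If you'd rather not hard-code a safe value, a small sketch (my addition, assuming device 0) queries the actual limit at runtime:
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);   // properties of device 0
printf("max threads per block: %d\n", prop.maxThreadsPerBlock); // 1024 on this GPU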
After that there is still an issue, as you've misinterpreted the parameters for cudaMemcpy2D:
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
The documentation states for the 5th parameter:
width - Width of matrix transfer (columns in bytes)
but you've passed the width in elements (groups of 4 bytes) rather than bytes.
This will fix that:
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x*4, dims.y, cudaMemcpyDeviceToHost));
With the above changes, I was able to get good results with a test version of your code:
#include <stdio.h>
#include <stdint.h>
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
unsigned block_size = blockDim.x;
uint2 location = {blockIdx.x*block_size, blockIdx.y*block_size};
ulong2 pixel_location = {threadIdx.x, threadIdx.y};
ulong2 real_location = {location.x + pixel_location.x, location.y + pixel_location.y};
if (real_location.x >= width || real_location.y >= height)
return;
uint8_t *row = (uint8_t *)((char *)pixels + real_location.y * pitch);
row[real_location.x * 4+0] = 0;
row[real_location.x * 4+1] = 255;
row[real_location.x * 4+2] = 0;
row[real_location.x * 4+3] = 255;
}
cudaError_t err = cudaSuccess;
#define CUDA_ERR(e) \
if ((err = e) != cudaSuccess) { \
fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err)); \
exit(-1); \
}
int main(void) {
ulong2 dims = {1000, 1000};
dim3 threads_per_block(16, 16);
dim3 remainders(dims.x % threads_per_block.x, dims.y % threads_per_block.y);
dim3 blocks(dims.x / threads_per_block.x + (remainders.x == 0 ? 0 : 1), dims.y / threads_per_block.y + (remainders.y == 0 ? 0 : 1));
size_t pitch;
uint8_t *pixels, *h_pixels = NULL;
CUDA_ERR(cudaMallocPitch(&pixels, &pitch, dims.x * 4 * sizeof(uint8_t), dims.y));
printf("blocks: %u, %u\n", blocks.x, blocks.y);
printf("threads: %u, %u\n", threads_per_block.x, threads_per_block.y);
mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
h_pixels = (uint8_t *)malloc(dims.x * 4 * sizeof(uint8_t) * dims.y);
memset(h_pixels, 0, dims.x * 4 * sizeof(uint8_t) * dims.y);
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x*4, dims.y, cudaMemcpyDeviceToHost));
// save_png("out.png", h_pixels, dims.x, dims.y);
for (int row = 0; row < dims.y; row++)
for (int col = 0; col < dims.x; col++){
if (h_pixels[(row*dims.x*4) + col*4 ] != 0) {printf("mismatch 0 at %u,%u: was: %u should be: %u\n", row,col, h_pixels[(row*dims.x*4)+col*4 ], 0); return 1;}
if (h_pixels[(row*dims.x*4) + col*4 +1] != 255) {printf("mismatch 1 at %u,%u: was: %u should be: %u\n", row,col, h_pixels[(row*dims.x*4)+col*4 +1], 255); return 1;}
if (h_pixels[(row*dims.x*4) + col*4 +2] != 0) {printf("mismatch 2: was: %u should be: %u\n", h_pixels[(row*dims.x*4)+col*4 +2], 0); return 1;}
if (h_pixels[(row*dims.x*4) + col*4 +3] != 255) {printf("mismatch 3: was: %u should be: %u\n", h_pixels[(row*dims.x*4)+col*4 +3], 255); return 1;}
}
CUDA_ERR(cudaFree(pixels));
free(h_pixels);
CUDA_ERR(cudaDeviceReset());
puts("Success");
return 0;
}
Note the above code is a complete code you can copy, paste, compile and run.

CUDA/CUBLAS Matrix-Vector Multiplication

I previously posted a question regarding matrix-vector multiplication in CUDA and about writing my own kernel. After doing this, I decided to implement my problem using CUBLAS, as suggested by some users (thanks @Robert Crovella) on SO, in the hope of achieving higher performance (my project is performance driven).
Just to clarify: I want to multiply an NxN matrix with a 1xN vector.
I've been looking at the code pasted below for a couple of days now and I can't figure out why the multiplication is giving me an incorrect result. I fear that I am causing problems by using std::vector arrays (this is part of a much larger system that uses these data types). I don't mean to use this thread as a debugging tool, but I think this will also be helpful to other users trying to achieve this, as I have not come across a particularly comprehensive source on the internet for my particular problem (and for the cublas v2 API). Thanks in advance!
#include <cuda.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <cublas_v2.h>
#include <time.h>
//#include "timenow.cu"
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// random data filler
void fillvector(float *data, int N){
for(int i=0; i<N; i++){
data[i] = float(rand() % 10);
}
}
//printer
void printer(bool printOut, float *data, int N){
if(printOut == true){
for(int i=0; i<N; i++){
printf("%2.1f ", data[i]);
}
printf("\n");
}
}
/////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////
int main(){
bool printOut = true;
int N;
std::cout << "Enter N: " ;
std::cin >> N;
std::vector<float> x0;
x0.resize(N);
std::vector<float> p;
p.resize(N*N);
// matrix A
std::vector<float> A[N];
for(int i=0;i<N;i++){
A[i].resize(N);
fillvector(A[i].data(), N);
printer(printOut, A[i].data(), N);
}
printf("\n");
fillvector(x0.data(), N);
printer(printOut, x0.data(), N);
printf("\nStarting CUDA computation...");
///double startTime = timenow();
// device pointers
float *d_A, *d_p, *d_b, *d_x0, *d_v, *d_temp;
cudaMalloc((void**)&d_A, N*N*sizeof(float));
cudaMalloc((void**)&d_temp, N*sizeof(float));
cudaMalloc((void**)&d_x0, N*sizeof(float));
cudaCheckErrors("cuda malloc fail");
// might need to flatten A...
cublasSetVector(N, sizeof(float), &x0, 1, d_x0, 1);
//daMemcpy(d_x0, &x0, N*sizeof(float), cudaMemcpyHostToDevice);
cublasSetMatrix(N, N, sizeof(float), &A, N, d_A, N);
cudaCheckErrors("cuda memcpy of A or x0 fail");
float *temp;
temp = (float *)malloc(N*sizeof(temp));
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
float alpha = 1.0f;
float beta = 0.0f;
cublasCheckErrors(cublasSgemv(handle, CUBLAS_OP_N, N, N, &alpha, d_A, N, d_x0, 1, &beta, d_temp, 1));
cublasGetVector(N, sizeof(float), &temp, 1, d_temp, 1);
//cudaMemcpy(temp, d_temp, N*sizeof(float), cudaMemcpyDeviceToHost);
cudaCheckErrors("returning to host failed");
printf("\n");
printer(printOut, temp, N);
/*alpha = -1.0;
cublasSaxpy(handle, N, &alpha, d_temp, 1, d_v, 1);
cublasGetVector(N, sizeof(float) * N, d_v, 1, &v, 1);
printf("\n");
for(int i=0; i<N; i++){
printf("%2.1f ",v[i]);
}*/
printf("\nFinished CUDA computations...");
//double endTime = timenow();
//double timeDiff = endTime - startTime;
//printf("\nRuntime: %2.3f seconds \n", timeDiff);
cudaFree(d_temp);
cudaFree(d_A);
cudaFree(d_p);
cudaFree(d_x0);
return 0;
}
We don't reference the first element of a vector this way:
cublasSetVector(N, sizeof(float), &x0, 1, d_x0, 1);
Instead you should do this:
cublasSetVector(N, sizeof(float), &(x0[0]), 1, d_x0, 1);
And likewise for your SetMatrix call referencing A:
cublasSetMatrix(N, N, sizeof(float), &(A[0]), N, d_A, N);
Your GetVector call has 2 errors:
cublasGetVector(N, sizeof(float), &temp, 1, d_temp, 1);
You have your temp and d_temp parameters reversed (you are copying from device to host) and you should not take the address of temp: it is already a pointer. So do this:
cublasGetVector(N, sizeof(float), d_temp, 1, temp, 1);
You're not doing proper error checking on all cublas calls, such as your get/set matrix/vector calls. Use the same method you are using on other cublas calls for these also.
You are creating A as an array of vectors. This won't work with cublasSetMatrix. Instead we need to create A as a flat vector, of sufficient size (N*N) to store the entire matrix.
Finally, cublas expects the matrices it uses to be stored in column-major order. If you pass C-style arrays in row-major order, you should use the transpose for that matrix in cublasSgemv:
cublasCheckErrors(cublasSgemv(handle, CUBLAS_OP_T, N, N, &alpha, d_A, N, d_x0, 1, &beta, d_temp, 1));
The following code has these various problems fixed:
$ cat t235.cu
#include <cuda.h>
#include <vector>
#include <iostream>
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cmath>
#include <cublas_v2.h>
#include <time.h>
//#include "timenow.cu"
// error check macros
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// for CUBLAS V2 API
#define cublasCheckErrors(fn) \
do { \
cublasStatus_t __err = fn; \
if (__err != CUBLAS_STATUS_SUCCESS) { \
fprintf(stderr, "Fatal cublas error: %d (at %s:%d)\n", \
(int)(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// random data filler
void fillvector(float *data, int N){
for(int i=0; i<N; i++){
data[i] = float(rand() % 10);
}
}
//printer
void printer(bool printOut, float *data, int N){
if(printOut == true){
for(int i=0; i<N; i++){
printf("%2.1f ", data[i]);
}
printf("\n");
}
}
/////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////
int main(){
bool printOut = true;
int N;
std::cout << "Enter N: " ;
std::cin >> N;
std::vector<float> x0;
x0.resize(N);
std::vector<float> p;
p.resize(N*N);
// matrix A
std::vector<float> A(N*N);
fillvector(A.data(), N*N);
for (int i=0; i< N; i++){
printer(printOut, &(A[(i*N)]), N);
printf("\n");}
fillvector(x0.data(), N);
printer(printOut, x0.data(), N);
printf("\nStarting CUDA computation...");
///double startTime = timenow();
// device pointers
float *d_A, *d_x0, *d_temp;
cudaMalloc((void**)&d_A, N*N*sizeof(float));
cudaMalloc((void**)&d_temp, N*sizeof(float));
cudaMalloc((void**)&d_x0, N*sizeof(float));
cudaCheckErrors("cuda malloc fail");
// might need to flatten A...
cublasCheckErrors(cublasSetVector(N, sizeof(float), &(x0[0]), 1, d_x0, 1));
//daMemcpy(d_x0, &x0, N*sizeof(float), cudaMemcpyHostToDevice);
cublasCheckErrors(cublasSetMatrix(N, N, sizeof(float), &(A[0]), N, d_A, N));
//cudaCheckErrors("cuda memcpy of A or x0 fail");
float *temp;
temp = (float *)malloc(N*sizeof(temp));
cublasHandle_t handle;
cublasCheckErrors(cublasCreate(&handle));
float alpha = 1.0f;
float beta = 0.0f;
cublasCheckErrors(cublasSgemv(handle, CUBLAS_OP_T, N, N, &alpha, d_A, N, d_x0, 1, &beta, d_temp, 1));
cublasCheckErrors(cublasGetVector(N, sizeof(float), d_temp, 1, temp, 1));
//cudaMemcpy(temp, d_temp, N*sizeof(float), cudaMemcpyDeviceToHost);
//cudaCheckErrors("returning to host failed");
printf("\n");
printer(printOut, temp, N);
/*alpha = -1.0;
cublasSaxpy(handle, N, &alpha, d_temp, 1, d_v, 1);
cublasGetVector(N, sizeof(float) * N, d_v, 1, &v, 1);
printf("\n");
for(int i=0; i<N; i++){
printf("%2.1f ",v[i]);
}*/
printf("\nFinished CUDA computations...\n");
//double endTime = timenow();
//double timeDiff = endTime - startTime;
//printf("\nRuntime: %2.3f seconds \n", timeDiff);
cudaFree(d_temp);
cudaFree(d_A);
//cudaFree(d_p);
cudaFree(d_x0);
return 0;
}
$ nvcc -arch=sm_20 -O3 -o t235 t235.cu -lcublas
$ ./t235
Enter N: 5
3.0 6.0 7.0 5.0 3.0
5.0 6.0 2.0 9.0 1.0
2.0 7.0 0.0 9.0 3.0
6.0 0.0 6.0 2.0 6.0
1.0 8.0 7.0 9.0 2.0
0.0 2.0 3.0 7.0 5.0
Starting CUDA computation...
83.0 86.0 92.0 62.0 110.0
Finished CUDA computations...
$