CUDA - device values are 0 after kernel execution - C++

For some reason, when I execute my program the device variables end up with zero values. Just before I launch the CUDA kernel the device variables hold the correct values, but the output image is just black (at the original image size). All the memory allocations and the copies to and from the host seem to be correct.
Thanks for any help!
// Includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#ifdef _WIN32
# define WINDOWS_LEAN_AND_MEAN
# define NOMINMAX
# include <windows.h>
#endif
#define Image_Size 512
#define Kernel_Size 3
// Includes CUDA
#include <cuda_runtime.h>
// Utilities and timing functions
#include "./inc/helper_functions.h" // includes cuda.h and cuda_runtime_api.h
// CUDA helper functions
#include "./inc/helper_cuda.h" // helper functions for CUDA error check
const char *imageFilename = "lena_bw.pgm";
const char *sampleName = "simpleTexture";
#define C_PI 3.141592653589793238462643383279502884197169399375
void __global__ SwirlCu(int width, int height, int stride, float *pRawBitmapOrig, float *pBitmapCopy, double factor)
{
// This function effectively swirls an image
// This CUDA kernel is basically the exact same code as the CPU-only, except it has a slightly different setup
// Each thread on the GPU will process exactly one pixel
// Before doing anything, we need to determine the current pixel we are calculating in this thread
// Original code used i as y, and j as x. We will do the same so we can just re-use CPU code in the CUDA kernel
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
// Test to see if we're testing a valid pixel
if (i >= height || j >= width) return; // Don't bother doing the calculation. We're not in a valid pixel location
double cX = (double)width/2.0f;
double cY = (double)height/2.0f;
double relY = cY-i;
double relX = j-cX;
// relX and relY are points in our UV space
// Calculate the angle our points are relative to UV origin. Everything is in radians.
double originalAngle;
if (relX != 0)
{
originalAngle = atan(abs(relY)/abs(relX));
if ( relX > 0 && relY < 0) originalAngle = 2.0f*C_PI - originalAngle;
else if (relX <= 0 && relY >=0) originalAngle = C_PI-originalAngle;
else if (relX <=0 && relY <0) originalAngle += C_PI;
}
else
{
// Take care of rare special case
if (relY >= 0) originalAngle = 0.5f * C_PI;
else originalAngle = 1.5f * C_PI;
}
// Calculate the distance from the center of the UV using pythagorean distance
double radius = sqrt(relX*relX + relY*relY);
// Use any equation we want to determine how much to rotate image by
//double newAngle = originalAngle + factor*radius; // a progressive twist
double newAngle = originalAngle + 1/(factor*radius+(4.0f/C_PI));
// Transform source UV coordinates back into bitmap coordinates
int srcX = (int)(floor(radius * cos(newAngle)+0.5f));
int srcY = (int)(floor(radius * sin(newAngle)+0.5f));
srcX += cX;
srcY += cY;
srcY = height - srcY;
// Clamp the source to legal image pixel
if (srcX < 0) srcX = 0;
else if (srcX >= width) srcX = width-1;
if (srcY < 0) srcY = 0;
else if (srcY >= height) srcY = height-1;
// Set the pixel color
// Since each thread writes to exactly 1 unique pixel, we don't have to do anything special here
pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
}
////////////////////////////////////////////////////////////////////////////////
// Declaration, forward
void runTest(int argc, char **argv);
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
printf("%s starting...\n", sampleName);
// Process command-line arguments
if (argc > 1)
{
if (checkCmdLineFlag(argc, (const char **) argv, "input"))
{
getCmdLineArgumentString(argc,(const char **) argv,"input",(char **) &imageFilename);
}
else if (checkCmdLineFlag(argc, (const char **) argv, "reference"))
{
printf("-reference flag should be used with -input flag");
exit(EXIT_FAILURE);
}
}
runTest(argc, argv);
cudaDeviceReset();
printf("%s completed",
sampleName);
//exit(testResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
////////////////////////////////////////////////////////////////////////////////
void runTest(int argc, char **argv)
{
int devID = findCudaDevice(argc, (const char **) argv);
unsigned int kernel_bytes = Kernel_Size * Kernel_Size * sizeof(float);
// load image from disk
float *hData = NULL;
float *host_array_kernel = 0;
float *device_array_Image = 0;
float *device_array_kernel = 0;
float *device_array_Result = 0;
unsigned int width, height;
char *imagePath = sdkFindFilePath(imageFilename, argv[0]);
if (imagePath == NULL)
{
printf("Unable to source image file: %s\n", imageFilename);
exit(EXIT_FAILURE);
}
sdkLoadPGM(imagePath, &hData, &width, &height);
unsigned int size = width * height * sizeof(float);
printf("Loaded '%s', %d x %d pixels\n", imageFilename, width, height);
// Allocation of device arrays using CudaMalloc
cudaMalloc((void**)&device_array_Image, size);
cudaMalloc((void**)&device_array_kernel, kernel_bytes);
cudaMalloc((void**)&device_array_Result, size);
host_array_kernel = (float*)malloc(kernel_bytes); // kernel
// Allocate mem for the result on host side
float *hOutputDataSharp = (float *) malloc(size);
GenerateKernel (host_array_kernel);
// copy arrays and kernel from host to device
checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(device_array_kernel, host_array_kernel, kernel_bytes, cudaMemcpyHostToDevice));
dim3 dimBlock(16, 16, 1);
dim3 dimGrid(width / dimBlock.x, height / dimBlock.y, 1);
//Do the Convolution
printf("DImage : '%.8f'\n",device_array_Image);
printf("DKernel : '%.8f'\n",device_array_kernel);
//serialConvolution(hData, host_array_kernel ,hOutputDataSharp);
SwirlCu<<<512, 512>>>(width, height, width*4, device_array_Image,device_array_Result, 0.005f);
printf("DResult : '%.8f'\n",device_array_Result);
checkCudaErrors(cudaDeviceSynchronize());
cudaMemcpy(hOutputDataSharp,device_array_Result, size, cudaMemcpyDeviceToHost);
printf("HResult : '%.8f'\n",hOutputDataSharp);
// Write result to file
char outputSharp[1024];
strcpy(outputSharp, imagePath);
strcpy(outputSharp, "data/serial_sharptest.pgm");
sdkSavePGM(outputSharp, hOutputDataSharp, width, height);
cudaFree(device_array_Result);
cudaFree(device_array_Image);
cudaFree(device_array_kernel);
free(hData);
free(imagePath);
//free(host_array_Image);
free(host_array_kernel);
free(hOutputDataSharp);
//free(hOutputImage);
//free(hOutputKernel);
}

Your code is writing into the source image:
pRawBitmapOrig[i*stride/4 + j] = pBitmapCopy[srcY*stride/4 + srcX];
which writes to device_array_Image, which is your source image, not the destination you are expecting the results in (you copy device_array_Result back to the host, but the kernel never writes to it).
Moreover, I am very curious about the output of printf("DResult : '%.8f'\n",device_array_Result);, as device_array_Result is a pointer into GPU address space allocated with cudaMalloc, so printing it with %f does not show any device data. On which device are you running?
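If it helps, here is a minimal sketch of what the fix looks like on the host side (untested; it also assumes you use the 16x16 dimBlock/dimGrid you already compute in runTest rather than the one-dimensional <<<512, 512>>> launch, since the kernel indexes threads in both x and y). The buffer you want the result in has to be passed as pRawBitmapOrig and the device copy of the input image as pBitmapCopy, so the buffer you later cudaMemcpy back is the one the kernel actually wrote:
// copy the input image to the device as before
checkCudaErrors(cudaMemcpy(device_array_Image, hData, size, cudaMemcpyHostToDevice));
// 2-D launch configuration covering the whole image
dim3 dimBlock(16, 16, 1);
dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
             (height + dimBlock.y - 1) / dimBlock.y, 1);
// write into device_array_Result, read from device_array_Image
SwirlCu<<<dimGrid, dimBlock>>>(width, height, width * 4,
                               device_array_Result,  // pRawBitmapOrig: output
                               device_array_Image,   // pBitmapCopy: input
                               0.005f);
checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaDeviceSynchronize());
// now the buffer we copy back is the one the kernel wrote
checkCudaErrors(cudaMemcpy(hOutputDataSharp, device_array_Result, size, cudaMemcpyDeviceToHost));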

Related

How to keep track of a global index within an MPI_Scatter(...) function?

I am working on an MPI program in which process 0 reads a .raw file, places it in a pointer array, and distributes it to the other processes so they can each do work on their part. I am currently distributing sections of the array using MPI_Scatter, which seems to be working.
The problem:
The calculation to be done on each sub-array requires knowing its global index; currently I only know how to iterate over its local index. Below is the code:
#include <iostream>
#include <cstdint>
#include <mpi.h>
#define MCW MPI_COMM_WORLD
const int WIDTH = 1300;
const int HEIGHT = 600;
//work function that requires the *global* x,y coordinate of the file
float calc_vorticity(int x, int y, int width, int height, float* vector_field) {
float d_x = 0.01;
float d_y = 0.01;
uint32_t index = y * width + x;
int start_x = (x == 0) ? 0 : x - 1;
int end_x = (x == width - 1) ? x : x + 1;
int start_y = (y == 0) ? 0 : y - 1;
int end_y = (y == height - 1) ? y : y + 1;
uint32_t duidx = (start_y * width + end_x) * 2;
uint32_t dvidx = (end_y * width + start_x) * 2;
std::pair<double, double> fdu(vector_field[duidx], vector_field[duidx + 1]);
std::pair<double, double> fdv(vector_field[dvidx], vector_field[dvidx + 1]);
std::pair<double, double> vec0(vector_field[index * 2], vector_field[index * 2 + 1]);
float duy = (fdu.second - vec0.second) / (d_x * (end_x - start_x));
float dvx = (fdv.first - vec0.first) / (d_y * (end_y - start_y));
return duy - dvx;
}
int main(int argc, char** argv) {
int rank, size;
MPI_Init(&argc, &argv); //initialize MPI
MPI_Comm_rank(MCW, &rank); //Assign each processor a rank
MPI_Comm_size(MCW, &size); //Assign the size
//allocate the space
float* vector_field = new float[WIDTH * HEIGHT * 2];
//have rank 0 read in image
if(!rank) {
FILE* file = fopen("../data/cyl2d_1300x600_float32[2].raw", "r");
fread(vector_field, sizeof(float), WIDTH * HEIGHT * 2, file);
fclose(file);
}
//create buffer of data & catch all processes up to this point
float* buffer = new float[(WIDTH * HEIGHT * 2) / size];
MPI_Barrier(MCW);
//broadcast the array to other ranks
MPI_Scatter(
vector_field, //array being sent out that resides on root process
(WIDTH * HEIGHT * 2) / size, //number of items to send
MPI_FLOAT, //type of items sent
buffer, //buffer of data that holds number of items in sent array
(WIDTH * HEIGHT * 2) / size, //number of items to be received
MPI_FLOAT, //type of items received
0, //originating process
MCW //communicator
);
//allocate space for results
//Note that we only need half the size of buffer as vorticity is represented with only 1 float
float* vorticities = new float[(WIDTH * HEIGHT) / size];
MPI_Barrier(MCW);
//calculate value for each cell
// !!! Problem here !!!
/* Call calc_vorticity(...) sending the current **global** x/y coordinate */
/*
Serial implementation call looks like this:
for (int i=0; i<WIDTH; i++){
for (int j=0; j<HEIGHT; j++){
vorticities[j*WIDTH+i] = vorticity(i, j, WIDTH, HEIGHT, vector_field);
}
}
*/
//assure all processes are caught up
MPI_Barrier(MCW);
//have process 0 write to output file
if(!rank) {
FILE* output = fopen("../data/vorticities_dist.raw", "w");
fwrite(vorticities, sizeof(float), WIDTH * HEIGHT, output);
fclose(output);
}
//delete used memory
delete[] vector_field;
delete[] vorticities;
//Finalize & exit
MPI_Finalize();
return 1;
}
I've looked at a lot of MPI_Gather docs, and I don't think Scatter/Gather alone is the answer, since you cannot also send the global indexes along with that pointer array. I was also tossing around the idea of creating my own object with 2 elements, num & global_index; that way I could scatter an array of objects that carry their global index with them, though I don't know how to scatter custom data types.
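For what it's worth, a minimal sketch of one way to recover the global index (an assumption-laden illustration, not code from the question): MPI_Scatter hands rank r the r-th contiguous chunk of the send buffer, so with HEIGHT divisible by size each rank owns rows [rank * HEIGHT/size, (rank+1) * HEIGHT/size), and the global row is just rank * rows_per_rank + local_row. Because calc_vorticity also reads neighbouring rows, the simplest correct variant broadcasts the whole field and lets each rank compute only its own rows:
// sketch only: broadcast the full field, compute this rank's rows using global y
MPI_Bcast(vector_field, WIDTH * HEIGHT * 2, MPI_FLOAT, 0, MCW);
int rows_per_rank = HEIGHT / size;             // assumes HEIGHT % size == 0
int first_row = rank * rows_per_rank;          // this rank's first *global* row
float* my_vort = new float[rows_per_rank * WIDTH];
for (int local_y = 0; local_y < rows_per_rank; local_y++) {
    int y = first_row + local_y;               // global y coordinate
    for (int x = 0; x < WIDTH; x++)
        my_vort[local_y * WIDTH + x] = calc_vorticity(x, y, WIDTH, HEIGHT, vector_field);
}
// reassemble the per-rank rows in rank (i.e. global) order on rank 0;
// note vorticities must hold WIDTH * HEIGHT floats on the root for this
MPI_Gather(my_vort, rows_per_rank * WIDTH, MPI_FLOAT,
           vorticities, rows_per_rank * WIDTH, MPI_FLOAT, 0, MCW);
delete[] my_vort;
If you stay with MPI_Scatter instead, the same rank * chunk + local_index arithmetic gives the global index; the extra wrinkle is that the halo rows at each chunk boundary have to be exchanged between neighbouring ranks.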

list of white pixels indices in image using CUDA

Given a binary image, I want to return the list of indices of the white pixels in it using the GPU (CUDA). How do I determine the index to write into the points vector?
Here is the CUDA kernel:
//copy only active pixel locations
__global__ void get_white_pixels_kernel(unsigned char* bin_image,
float * points,
int width,
int height,
int grayWidthStep)
{
int row_index = threadIdx.y+ blockIdx.y*blockDim.y;
int col_index = threadIdx.x+blockIdx.x*blockDim.x;
if ((col_index < width) && (row_index < height))
{
//Location of gray pixel in output
const int gray_tid = row_index * grayWidthStep + col_index;
if(input[gray_tid]==255)
points[--here is the index]= Point2f(row_index,col_index);
}
}
Following is a naive method to achieve the desired functionality:
1. Generate a mask of pixel indices, with dummy values for pixels whose value is zero.
2. Count the number of non-zero pixels.
3. Create an output vector with length equal to the non-zero count.
4. Copy the non-zero pixel indices from the generated mask to the output vector (a process known as stream compaction).
Following is sample code for the above-mentioned process.
Code
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <thrust/count.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <opencv2/opencv.hpp>
static void _check_err(cudaError_t err, const char* file, int line)
{
if(err)
{
const char* err_str = cudaGetErrorString(err);
printf("CUDA Error: %s\nFile: %s\nLine: %d\n", err_str, file, line);
exit(EXIT_FAILURE);
}
}
#define CHECK_ERR(err) _check_err((err), __FILE__, __LINE__)
__global__ void kernel_find_indices(const unsigned char* input, int width, int height, int step, int2* indices)
{
const int x = blockIdx.x * blockDim.x + threadIdx.x;
const int y = blockIdx.y * blockDim.y + threadIdx.y;
if(x < width && y < height)
{
const int tidPixel = y * step + x;
const int tidIndex = y * width + x;
unsigned char value = input[tidPixel];
int2 index_to_write;
if(value)
{
//Write actual index to pixels with non-zero value
index_to_write.x = x;
index_to_write.y = y;
}
else
{
//Write dummy index to pixels with zero value
index_to_write.x = -1;
index_to_write.y = -1;
}
indices[tidIndex] = index_to_write;
}
}
//Operator to check whether an index is of a non-zero pixel
struct isNonZeroIndex
{
__host__ __device__ bool operator()(const int2 &idx)
{
return (idx.x != -1) && (idx.y != -1);
}
};
std::vector<cv::Point> getIndicesOfNonZeroPixels(cv::Mat input)
{
std::vector<int2> output_int2;
std::vector<cv::Point> output;
int pixelCount = input.cols * input.rows;
size_t imageBytes= input.step * input.rows;
unsigned char* image_d;
thrust::device_vector<int2> index_buffer_d(pixelCount);
//Allocate device memory for input image
CHECK_ERR(cudaMalloc(&image_d, imageBytes));
//Copy input image to device
CHECK_ERR(cudaMemcpy(image_d, input.ptr(), imageBytes, cudaMemcpyHostToDevice));
dim3 block(16,16);
dim3 grid;
grid.x = (input.cols + block.x - 1) / block.x;
grid.y = (input.rows + block.y - 1) / block.y;
//Generate an index mask with dummy values for indices with zero pixel value
kernel_find_indices<<<grid, block>>>(image_d, input.cols, input.rows, input.step, thrust::raw_pointer_cast(index_buffer_d.data()));
CHECK_ERR(cudaDeviceSynchronize());
int nonZeroCount = thrust::count_if(index_buffer_d.begin(), index_buffer_d.end(), isNonZeroIndex());
//Keep only those indices whose pixel value is non-zero (stream compaction)
thrust::device_vector<int2> compacted(nonZeroCount);
thrust::copy_if(index_buffer_d.begin(), index_buffer_d.end(), compacted.begin(), isNonZeroIndex());
//Copy non-zero pixel indices to host
output_int2.resize(nonZeroCount);
thrust::copy(compacted.begin(), compacted.end(), output_int2.begin());
CHECK_ERR(cudaFree(image_d));
//Convert vector<int2> to vector<cv::Point>
output.resize(nonZeroCount);
for(size_t i=0; i<nonZeroCount; i++)
output[i] = cv::Point(output_int2[i].x, output_int2[i].y);
return output;
}
void run_test()
{
//Generate a sample test image
cv::Mat test = cv::Mat::zeros(100,100, CV_8UC1);
cv::rectangle(test, cv::Rect(5,5,20,20), cv::Scalar::all(255), CV_FILLED);
//Get pixel indices of non-zero pixels
std::vector<cv::Point> indices = getIndicesOfNonZeroPixels(test);
//Display those indices
for(size_t i=0; i<indices.size(); i++)
{
printf("%d, %d\n", indices[i].x, indices[i].y);
}
//Show image
cv::imshow("Sample", test);
cv::waitKey();
}
int main(int argc, char** argv)
{
run_test();
return 0;
}
Compilation Command
nvcc -o nz nz.cu -arch=sm_61 -L/usr/local/lib -lopencv_core
-lopencv_highgui -lopencv_imgproc
Please keep in mind that this code is for images of type 8UC1 (8-bit, single-channel) only. You can easily extend it to other data types as required.
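If you do extend it, one possible shape (a sketch, not something I have tested) is to template the kernel on the pixel type and index each row through a byte pointer, since cv::Mat's step is in bytes:
// Generalized index kernel for any single-channel pixel type T (e.g. float for CV_32FC1).
// Any non-zero value is treated as "white"; zero pixels get the (-1,-1) dummy index.
template <typename T>
__global__ void kernel_find_indices_t(const unsigned char* input, int width, int height,
                                      int stepBytes, int2* indices)
{
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;
    const T* row = reinterpret_cast<const T*>(input + y * stepBytes);
    int2 index_to_write = make_int2(-1, -1);
    if (row[x] != T(0)) { index_to_write.x = x; index_to_write.y = y; }
    indices[y * width + x] = index_to_write;
}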

calling c function from MATLAB?

I want to call a C function from MATLAB, so I tried writing a wrapper function using MEX. While compiling I am getting
error C2109: subscript requires array or pointer type
and error C2440: 'function' : cannot convert from 'double *' to 'double'
Can anyone help me find where I made the mistake?
#include "mex.h"
#include "matrix.h"
#include "CVIPtoolkit.h"
#include "CVIPtools.h"
#include "CVIPmatrix.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
void midd(double outdata, int type, int height, int width){
Image *outputImage;
byte **output;
int r,c;
mexPrintf("type %d\n", type);
mexPrintf("height %d\n", height);
mexPrintf("width %d\n", width);
outputImage=new_Image (PGM, GRAY_SCALE, 0, height, width, CVIP_BYTE, REAL );
outputImage = h_image(type, height,width);
output = getData_Image(outputImage, 0);
for(r=0; r < height; r++) {
for(c=0; c < width; c++)
{
mexPrintf("type %d\n", type);
mexPrintf("height %d\n", height);
mexPrintf("width %d\n", width);
outdata[r+height*c+height*width] =output[r][c]; /* passing data back to MATLAB variable from CVIPtools variable */
}
}
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
double *outdata;
int type, height, width;
// double *indata = (double *)mxGetData(prhs[0]);
type = mxGetScalar(prhs[0]);
height = mxGetScalar(prhs[1]);
width = mxGetScalar(prhs[2]);
mexPrintf("type %d\n", type);
mexPrintf("height %d\n", height);
mexPrintf("width %d\n", width);
plhs[0] = mxCreateDoubleMatrix(height,width,mxREAL);
outdata = mxGetData(plhs[0]);
midd(outdata, type, height, width);
}
The C function I am trying to call is as follows:
#include "CVIPtoolkit.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
Image *
h_image(int type, unsigned int height, unsigned int width){
/* type = 1, Constant
* type = 2, Fixed mask
* type = 3, Gaussian
*/
unsigned int r, c, hf_w = width/2, hf_h = height/2;
Image *outimage;
float **outdata, sum = 0.0, sigma, tmp1, tmp2, tmp;
if (height < 3 || width < 3) {
fprintf(stderr, "Masksize too small, at least 3x3\n");
return (Image *)NULL;
}
outimage = new_Image(PGM, GRAY_SCALE, 1, height, width, CVIP_FLOAT, REAL);
outdata = (float **)getData_Image(outimage, 0);
switch (type) {
case 1:
for (r = 0; r < height; r++)
for (c = 0; c < width; c++) {
outdata[r][c] = 1.0;
sum += outdata[r][c];
}
break;
case 2:
for (r = 0; r < height; r++)
for (c = 0; c < width; c++) {
outdata[r][c] = 1.0;
sum += outdata[r][c];
}
outdata[height/2][width/2] = height * width;
sum = sum - 1.0 + outdata[height/2][width/2];
break;
case 3:
c = (width + height) /4;
r = (width + height) /2;
sigma = sqrt(c*c / (2 * log(2) + (r - 3) * log(3)));
sigma = 1.0 / 2.0 /sigma/sigma;
tmp = width * height;
for (r = 0; r < height; r++)
for (c = 0; c < width; c++) {
tmp1 = (r-hf_h)*(r-hf_h); tmp2 = (c-hf_w)*(c-hf_w);
outdata[r][c] = tmp*exp(- (tmp1 + tmp2) * sigma);
sum += outdata[r][c];
}
break;
default:
fprintf(stderr, "Incorrect mask type number: %d\n", type);
return (Image *)NULL;
}
return outimage;
}
In your gateway (mexFunction), outdata is a pointer to a double, yet your function midd takes an actual double by value. That's why you're getting the 'cannot convert' error for that argument.
Simply change your function declaration so that the first input accepts a pointer to a double:
void midd(double *outdata, int type, int height, int width)
// ^^^^^^^^
Minor Note
I question your copying of your image data back to a MEX array here:
outdata[r+height*c+height*width] =output[r][c];
You don't need height*width as the offset. r + height*c is enough to access a single channel 2D matrix in column-major order. You only need to offset by height*width if you have a multi-channel image. That offset allows you to access image data in other channels... and since you only have single channel data (it looks like so...), this offset isn't required.
Therefore, you simply need to do:
outdata[r + height*c] = output[r][c];
If you don't do this, I suspect you will eventually get segmentation faults because you'll eventually access parts of memory you aren't allowed to access.
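Putting the two fixes together, midd would look something like this (just a sketch; it still relies on the CVIPtools calls from the question and assumes outdata points at the height-by-width real matrix created with mxCreateDoubleMatrix):
void midd(double *outdata, int type, int height, int width)
{
    int r, c;
    Image *outputImage = h_image(type, height, width);
    byte **output = getData_Image(outputImage, 0);
    for (c = 0; c < width; c++)          /* column c starts at offset c*height (column-major) */
        for (r = 0; r < height; r++)
            outdata[r + height*c] = (double) output[r][c];
}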
Also, once you've fully tested your code, get rid of the mexPrintf statements. They're going to unnecessarily flood the MATLAB Command Window with print messages since they sit inside a nested for loop. I suspect you did this for debugging, and that's perfectly fine, but I would suggest you attach the MEX function to an actual debugger and debug your code properly instead of relying on print statements.
See my post on how to get that set up here: Preventing a MEX file from crashing in MATLAB

Numerical error in cuda/cublas simple kernel using particular input

I am working with CUDA and cuBLAS, and I was trying to implement simple operations like element-wise matrix multiplication/division. I am using only float for my experiments. I know the most obvious way to do it is to write a kernel like this one:
__global__ void mul_elementwise(const unsigned int n, float* source, float* dest, const float value)
{
const unsigned int offset = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int stride = blockDim.x * gridDim.x;
for (unsigned int i = offset; i < n; i += stride)
{
dest[i] = source[i] * value;
}
}
This kernel can work both for multiplication and division (just use 1/x as value). But the same thing can be achieved with the cuBLAS library too: suppose we have an m x n matrix A stored in column-major order and a scalar x; then, setting alpha = x or alpha = 1/x and d_ones as a vector of m*n ones, we can invoke cublasSaxpy and obtain the same result:
cublasSaxpy(cublas_handle, m * n, &alpha, d_ones, 1, A_dev, 1);
Both methods work just fine, but I am facing a few problems with one particular matrix, for which both methods do not work. I isolated this big matrix and built an MCVE, available here (you can compile it with nvcc mcve.cu -lcublas). As you can see, the results in both cases are totally wrong: the host result is completely different, and I am trying to figure out what's going on. I do not see any error in the code, but maybe I should try to use double instead of float and see what happens.
Any opinions about this situation? Thanks in advance!
EDIT #1: I tried using doubles, but nothing changes if I use cublasDaxpy; meanwhile it works perfectly with the custom kernel. I think the values are too small, so single floating-point precision is not enough.
Interesting MCVE. Wouldn't it have been possible to shrink your vector down to just a few elements? Isn't it possible to show the calculation discrepancy based on just 1 vector element?
Anyway I see several problems.
Your kernel implements the following function: y=alpha*x. But SAXPY implements y=alpha*x+y. Now, if y started out as (all) zero, then these two would be the same. But that's not what you have:
            CUBLAS       Your Kernel
        ------------------------------
alpha:      alpha        alpha
x:          1            ahost     (ahost is your huge data array)
y:          ahost        -
So your kernel is computing y=alpha * ahost, but your CUBLAS call is computing y = alpha*1 + ahost. I wouldn't expect the same result from these, in general.
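(For completeness, a tiny sketch of the equivalence, with placeholder names d_source/d_zero/handle: to reproduce dest = alpha * source with SAXPY, the y argument has to start out as all zeros, which is exactly what the fixed code below does.)
// y = alpha*x + y only reduces to y = alpha*x if y starts out zeroed
cudaMemset(d_zero, 0, len * sizeof(float));
cublasSaxpy(handle, len, &alpha, d_source, 1, d_zero, 1);  // d_zero now holds alpha * d_source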
Your analysis of error seems flawed in a few ways. First, you are computing the absolute error in a float variable (a number which will always be positive, since it's the absolute value), but then you're comparing it against a negative number:
float diff = abs(host[i]-dev[i]);
...
if (diff > (-1e12))
Won't that if test always be true? Perhaps you meant 1e-12, although that would still be flawed: a fixed error threshold for a floating-point comparison should be scaled to the size of the numbers being compared, and float quantities only carry about 6-7 accurate decimal digits. (And summing these errors is also troublesome.)
Here is a complete code that has the above issues fixed, and produces zero sum error for all the comparisons (host<->kernel and host<->cublas):
static float array[] = {0x00000000,
0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0x00000000,0xB58DA1CF,0xB50D2FEC,0x34A48536,0xB4A1D5BC,0x358E1345,0x35943AAC,0xB5983F40,0xB43628BB,0xB4A95348,0xB4DB751C,0xB50C8D1A,0xB3EFCBB5,0x3552B8CD,0x3538A167,0x358FDE0D,0xB4D54CE9,0xB5D29BB7,0xB4A234EE,0x346EF2F4,0x35B5D9F2,0xB40F1487,0x3554BC20,0x33FD9466,0xB536D37D,0xB3C2E594,0xB59DA581,0x3584FC87,0x34438F09,0x35D293CB,0xB4FBB002,0xB59F41E9};
#include <iostream>
#include <stdio.h>
#include <cublas_v2.h>
#include <assert.h>
#define TOL 0.0001
typedef unsigned int u32;
#define GET_STRIDE() u32(blockDim.x * gridDim.x)
#define GET_OFFSET() u32(blockIdx.x * blockDim.x + threadIdx.x)
inline
cudaError_t checkCuda(cudaError_t result)
{
#if defined(DEBUG) || defined(_DEBUG)
if (result != cudaSuccess) {
fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
assert(result == cudaSuccess);
}
#endif
return result;
}
__global__ void div_elementwise(const u32 n, float* source, float* dest, const float value)
{
for (u32 i = GET_OFFSET(); i < n; i += GET_STRIDE())
{
dest[i] = source[i] * value;
}
}
float check_eq(float* dev, float* host, u32 len)
{
float sum = 0.0f;
for (u32 i = 0; i < len; ++i)
{
if (dev[i]!=host[i])
{
//printf("diff %d %f %f\n", i, dev[i], host[i]);
//break;
float diff = abs((host[i]-dev[i])/host[i]);
sum += diff;
if (diff > (TOL))
printf("diff %d %f\n", i, diff);
}
}
printf("%f\n", sum);
return sum;
}
void div_host(float* a, float v, u32 len)
{
for (u32 i = 0; i < len; ++i)
{
a[i]=a[i]*v;
}
}
int main()
{
u32 len = sizeof(array)/sizeof(float);
printf("array len = %d\n", len);
for (int i =0; i < len; i++) if (isnan(array[i])) {printf("nan value at %d\n",i); return -1;}
float* adev, *adevcublas, *d_zero;
float* ahost = (float*) malloc(len * sizeof(float));
checkCuda(cudaMalloc(&adev, len * sizeof(float)));
checkCuda(cudaMalloc(&adevcublas, len * sizeof(float)));
checkCuda(cudaMalloc(&d_zero, len * sizeof(float)));
memcpy(ahost, &array[0], len * sizeof(float));
checkCuda(cudaMemcpy(adev, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemcpy(adevcublas, ahost, len * sizeof(float), cudaMemcpyHostToDevice));
checkCuda(cudaMemset(d_zero, 0, len*sizeof(float)));
float alpha = 1/2494.f;
printf("%f\n", alpha);
div_host(ahost, alpha, len);
u32 tb = 256;
div_elementwise<<<((len + tb - 1) / tb),tb>>>(len, adev, adev, alpha);
float* r = (float*) malloc(len * sizeof(float));
checkCuda(cudaMemcpy(r, adev, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r,ahost,len);
cublasHandle_t ch;
cublasCreate(&ch);
float* r0 = (float*) malloc(len * sizeof(float));
cublasStatus_t stat = cublasSaxpy(ch, len, &alpha, adevcublas, 1, d_zero, 1);
if (stat != CUBLAS_STATUS_SUCCESS) {std::cout << "CUBLAS error: " << (int)stat << std::endl; return 1;}
checkCuda(cudaMemcpy(r0, d_zero, len * sizeof(float), cudaMemcpyDeviceToHost));
check_eq(r0,ahost,len);
free(r);
free(r0);
free(ahost);
cudaFree(adev);
return 0;
}

Getting unknown error when launching large kernel sizes

I am running into a problem launching a simple kernel once my array size becomes larger than 591 by 591. At a size of 591x591 the array is returned without any error, but as soon as I launch the kernel with grid dimensions of 38x38 blocks of 16x16 threads each, the kernel fails to launch and returns an "unknown error".
The following code is the kernel I am calling and the call to the kernel in my code:
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_device_runtime_api.h>
using namespace std;
#define BLOCKSIZE 16
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__,__LINE__);}
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort = true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if(abort) exit(code);
}
}
__global__ void IdentityMatrixKernel(float* identity, int size)
{
int index_x = blockIdx.x * blockDim.x + threadIdx.x;
int index_y = blockIdx.y * blockDim.y + threadIdx.y;
// map the two 2D indices to a single linear, 1D index
int grid_width = gridDim.x * blockDim.x;
int index = index_y * grid_width + index_x;
// map the two 2D block indices to a single linear, 1D block index
//int result = blockIdx.y * gridDim.x + blockIdx.x;
if (index % (size+1))
{
identity[index] = 0;
}
else
{
identity[index] = 1;
}
}
void foo(float *aArray, int size)
{
float* d_I;
int size2 = size*size*sizeof(float);
gpuErrchk(cudaMalloc(&d_I,size2));
dim3 block_size;
block_size.x = BLOCKSIZE;
block_size.y = BLOCKSIZE;
dim3 grid_size;
grid_size.x = size1/ block_size.x + 1;
grid_size.y = size1/ block_size.y + 1;
IdentityMatrixKernel<<<grid_size,block_size>>>(d_I,size);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaMemcpy(aArray,d_I,size2,cudaMemcpyDeviceToHost));
cudaFree(d_I);
}
int main()
{
int size = 591;
float *aArray = (float*)malloc(size*size*sizeof(float));
foo(aArray,size);
return 0;
}
For size = 591 no error shows up and it outputs an identity matrix of size 591x591, but for any larger size it spits out an "unknown error" to the console.
One problem seems to be that you are launching a grid of threads that is larger than your actual matrix:
grid_size.x = size1/ block_size.x + 1;
grid_size.y = size1/ block_size.y + 1;
But you are not checking for any out-of-bounds accesses in your kernel. You need to add a thread check such as:
if ((index_x >= size)||(index_y >= size)) return;
near the beginning of your kernel. But that's not enough. Another problem is that your index calculation is not correct:
int index = index_y * grid_width + index_x;
On the surface of it, it appears to be correct, but since your thread array is larger than your data array (potentially), this can give incorrect indexing. Since you're passing size to the kernel anyway, change it to something like this:
int index = index_y * size + index_x;
And you should be able to eliminate the out-of-bounds accesses.
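Putting both changes together, the kernel would look roughly like this (a sketch of the two fixes above, not a drop-in tested replacement for the rest of the code):
__global__ void IdentityMatrixKernel(float* identity, int size)
{
    int index_x = blockIdx.x * blockDim.x + threadIdx.x;
    int index_y = blockIdx.y * blockDim.y + threadIdx.y;
    // discard the extra threads that fall outside the size x size matrix
    if ((index_x >= size) || (index_y >= size)) return;
    // linear index based on the matrix width, not the (larger) grid width
    int index = index_y * size + index_x;
    identity[index] = (index % (size + 1)) ? 0.0f : 1.0f;
}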
I'd like to extend Robert Crovella's answer.
If you define block_size.{x, y} with a big number (in your case 16), then you won't be able to work with arrays of a smaller size, e.g. 4x4. What you could do is define a small block size:
/* create thread blocks */
dim3 block_size;
block_size.x = 4;
block_size.y = 4;
/* create n x n block grids */
dim3 grid_size;
grid_size.x = size1/block_size.x;
grid_size.y = size1/block_size.y;
/* in case of partial sizes make grid_size 1 x 1 */
if (size1 % block_size.x)
grid_size.x = 1, grid_size.y = 1;