I am a beginner at CUDA programming, writing a program composed of a single file main.cu which is shown below.
#include <iostream>
#include <opencv2/opencv.hpp>
// Prints "<file>:<line>: error: <str>" to stderr, with ANSI colour escapes.
#define DEBUG(str) std::cerr << "\033[1;37m" << __FILE__ << ":" << __LINE__ << ": \033[1;31merror:\033[0m " << str << std::endl;
// Reports the CUDA error string when a runtime call does not return cudaSuccess.
// NOTE(review): the macro argument is expanded twice, so on failure the CUDA
// call itself is re-executed inside cudaGetErrorString(...); the unbraced `if`
// is also a dangling-else hazard. A do { ... } while (0) macro that stores the
// result in a local variable avoids both problems.
#define CUDADEBUG(cudaError) \
if (cudaError != cudaSuccess) \
DEBUG(cudaGetErrorString(cudaError));
// Prints the message and terminates the process with exit status 1.
#define ERROR(str) \
{ \
DEBUG(str); \
exit(1); \
}
// Kernel: convert an interleaved BGR(A) image to greyscale in place.
// One thread handles one pixel: `i` is the pixel index, `icn` the byte offset
// of that pixel's first channel.
//
// BUG (this is the illegal-memory-access the question asks about): `pimage`,
// `cn` and `total` are taken BY REFERENCE. A reference kernel argument is an
// address in HOST memory, which device code cannot dereference. The fix,
// shown in the answer below, is to pass all three by value.
__global__ void makeGrey(
unsigned char *&pimage,
const int &cn,
const size_t &total)
{
unsigned i = blockDim.x * blockIdx.x + threadIdx.x;
unsigned icn = i * cn;
// Per-thread debug print of the pixel index (slow; debugging only).
printf("%u\n", i);
if (i < total)
{
// BT.601 luma weights; OpenCV stores channels in B,G,R(,A) order.
float result = pimage[icn + 0] * .114 +
pimage[icn + 1] * .587 +
pimage[icn + 2] * .299;
pimage[icn + 0] = result; //B
pimage[icn + 1] = result; //G
pimage[icn + 2] = result; //R
// pimage[icn + 3] *= result; //A
}
}
// Host driver: read the image at argv[1], grey it on the GPU, write argv[2].
int main(int argc, char **argv)
{
if (argc != 3)
ERROR("usage: executable in out");
cv::Mat image;
unsigned char *dimage; // device-side copy of the interleaved pixel bytes
image = cv::imread(argv[1], cv::IMREAD_UNCHANGED);
if (!image.data)
ERROR("Image null");
if (image.empty())
ERROR("Image empty");
// Continuity guarantees a single flat memcpy covers every pixel.
if (!image.isContinuous())
ERROR("image is not continuous");
const size_t N = image.total(); // number of pixels
const int cn = image.channels(); // channels per pixel (e.g. 3 = BGR, 4 = BGRA)
const size_t numOfElems = cn * N; // total pixel bytes
const int blockSize = 512;
// Ceiling division: one thread per pixel.
const int gridSize = (N - 1) / blockSize + 1;
CUDADEBUG(cudaMalloc(&dimage, numOfElems * sizeof(unsigned char)));
CUDADEBUG(cudaMemcpy(dimage, image.data, numOfElems * sizeof(unsigned char), cudaMemcpyHostToDevice));
// NOTE(review): this launch faults because the kernel takes its arguments by
// reference (see the answer below); pass them by value instead.
makeGrey<<<gridSize, blockSize>>>(dimage, cn, N);
cudaError_t errSync = cudaGetLastError(); // launch/configuration errors
cudaError_t errAsync = cudaDeviceSynchronize(); // asynchronous execution errors
if (errSync != cudaSuccess)
std::cerr << "Sync kernel error: " << cudaGetErrorString(errSync) << std::endl;
if (errAsync != cudaSuccess)
std::cerr << "Async kernel error: " << cudaGetErrorString(errAsync) << std::endl;
// The sticky error from the failed kernel also surfaces in these two calls,
// which is why the question sees the same message at "line 73"/"line 74".
CUDADEBUG(cudaMemcpy(image.data, dimage, numOfElems * sizeof(unsigned char), cudaMemcpyDeviceToHost)); //line 73
CUDADEBUG(cudaFree(dimage)); //line 74
cv::imwrite(argv[2], image);
return 0;
}
When I execute the program, I get
Async kernel error: an illegal memory access was encountered
/path-to-main.cu:73: error: an illegal memory access was encountered
/path-to-main.cu:74: error: an illegal memory access was encountered
I checked CV_VERSION macro which is 4.5.3-dev, and Cuda Toolkit 11.4 is installed (nvcc version 11.4). Also afaik, the kernel does not execute at all (I used Nsight gdb debugger and printf). I could not understand why I am accessing an illegal memory area. I appreciate any help. Thank you in advance.
As mentioned in a comment, your GPU function takes arguments by references.
__global__ void makeGrey(
unsigned char *&pimage,
const int &cn,
const size_t &total)
This is bad, passing a reference to a function means more or less that you're passing an address where you can find the value, not the value itself.
In your situation those values are in memory used by Host, NOT Device/GPU memory, when GPU tries to access those values it will most likely crash.
The types you are trying to pass, unsigned char*, int and size_t are very cheap to copy, there's no need to pass them by reference in the 1st place.
__global__ void makeGrey(
unsigned char *pimage,
const int cn,
const size_t total)
There are tools provided by nvidia to debug CUDA applications, but I'm not really familiar with them, you can also use printf inside GPU functions, but you will have to organize output from potentially thousand of threads.
In general, whenever you call GPU functions, be very cautious about what you're passing as parameters, as they need to be passed from Host memory to Device memory. Usually you want to pass everything by value, any pointers need to point to Device memory, and watch out from references.
Related
I have a program where I generate arrays with random elements using cuda. Since I upgraded from cuda 9.1 to cuda 9.2, the time it takes to do that has gone up from a fraction of a second (about 0.1s) to almost two minutes (without changing any of the code). The problem seems to be the curand_init() function, as the rest is running at about the same speed. Was there a change I missed in the library, is this a bug, or is it a problem with my code?
This is an example
#include <iostream>
#include <curand.h>
#include <curand_kernel.h>
// Wraps a CUDA runtime call and forwards its status plus the call site to
// gpuAssert for reporting.
#define cudaErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
// Prints "cudaAssert: <message> <file>: <line>" on any failure and, unless
// `abort` is false, terminates the process with the error code.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
std::cerr << "cudaAssert: " << cudaGetErrorString(code) << " " << file << ": " << line << std::endl;
if (abort) exit(code);
}
}
// Kernel: initialise one curandState per element, all from the same seed but
// each with a distinct subsequence index so the per-thread streams differ.
// (This curand_init call is the one the question reports as ~100-1000x slower
// after an apt upgrade from CUDA 9.1 to 9.2; resolved below by reinstalling
// the toolkit.)
__global__
void setup_curand_state (curandState *state, int seed, int dim)
{
int index = threadIdx.x+blockDim.x*blockIdx.x;
if (index < dim) // guard: the grid may overshoot dim
curand_init(seed, index, 0, &state[index]);
}
// Kernel: draw one normally-distributed float per element from that element's
// pre-initialised curand state.
__global__
void set_random (float* to, curandState* curand_state, int dim)
{
int index = threadIdx.x+ blockIdx.x* blockDim.x;
if (index < dim)
to [index] = curand_normal (&curand_state[index]);
}
// Host driver: allocate managed output + device RNG state, seed the states,
// then fill the array with normal samples.
int main () {
int dim = 100000;
float *data;
cudaErrchk (cudaMallocManaged ((void**) &data, dim * sizeof (float)));
curandState* curand_state;
cudaErrchk (cudaMalloc (&curand_state, (dim * sizeof (curandState))));
// 1024 threads per block, ceiling-divided grid.
setup_curand_state <<<(dim + 1023) / 1024, 1024>>> (curand_state, time(NULL), dim);
cudaErrchk (cudaDeviceSynchronize());
set_random <<<(dim + 1023) / 1024, 1024>>> (data, curand_state, dim);
// NOTE(review): set_random is neither synchronised nor error-checked here,
// and curand_state is never freed; cudaFree(data) releases only the samples.
cudaFree (data);
return 0;
}
Answered by mrBonobo in the comment above:
Apparently, updating cuda through apt silently broke the install. Code
compiled for 9.1 would still work, but around 100/1000 times slower.
Reinstalling nvidia-cuda-toolkit solved the error
In the following code there's the cudaMemcpy not working, it returns an error, and the program exits. What can be the problem? It doesn't seem to me I'm doing something illegal, and the size of the vectors seem fine to me.
It might be possible the algorithm does something wrong at some point but the idea is correct I guess. The code is to sum n numbers by doing some partial sums in parallel, and then re-iterate.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
// Device helper: read vec[i], treating any index at or past `size` as 0, so
// the pairwise sum can behave as if the vector were zero-padded to even length.
__device__ int aug_vec(int *vec, const int& i, const int& size) {
return (i >= size) ? 0 : vec[i];
}
// Kernel: one reduction step - thread i stores vec[2i] + vec[2i+1] into vec[i].
// BUG (explained in the answer below): `size` is passed BY REFERENCE, i.e. as
// a host address that device code cannot dereference; pass it by value.
__global__ void sumVectorElements(int *vec,const int& size) {
const int i = (blockDim.x*blockIdx.x + threadIdx.x);
vec[i] = aug_vec(vec, 2*i, size) + aug_vec(vec, 2 * i + 1, size);
}
// Host wrapper: repeatedly halves the vector on the GPU until element 0 holds
// the total, then copies the (partially overwritten) vector back and returns
// element 0.
__host__ int parallel_sum(int *vec,const int& size) {
cudaError_t err;
int *d_vec, *cp_vec;
// ceil(size/2) threads cover the first (largest) step; the same count is
// reused for every later step, where surplus threads just write zeros past
// the active prefix thanks to aug_vec's zero padding.
int n_threads = (size >> 1) + (size & 1);
cp_vec = new int[size];
err = cudaMalloc((void**)&d_vec, size * sizeof(int));
if (err != cudaSuccess) {
std::cout << "error in cudaMalloc!" << std::endl;
exit(1);
}
err = cudaMemcpy(d_vec, vec, size*sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess) {
std::cout << "error in cudaMemcpy!" << std::endl;
exit(1);
}
int curr_size = size;
while (curr_size > 1) {
std::cout << "size = " << curr_size << std::endl;
// BUG (see the answer below): curr_size binds to a by-reference kernel
// parameter, so the kernel dereferences a host address and faults.
sumVectorElements<<<1,n_threads>>>(d_vec, curr_size);
curr_size = (curr_size >> 1) + (curr_size & 1);
}
// The asynchronous kernel failure is first reported by this blocking copy.
err = cudaMemcpy(cp_vec, d_vec, size*sizeof(int), cudaMemcpyDeviceToHost); //THIS LINE IS THE PROBLEM!
if (err != cudaSuccess) {
std::cout << "error in cudaMemcpy" << std::endl;
exit(1);
}
err = cudaFree(d_vec);
if (err != cudaSuccess) {
std::cout << "error in cudaFree" << std::endl;
exit(1);
}
int rval = cp_vec[0];
delete[] cp_vec;
return rval;
}
// Builds the vector {1..12} and prints its GPU-computed sum (expected 78).
int main(int argc, char **argv) {
const int n_blocks = 1; // declared but never used in the launch below
const int n_threads_per_block = 12;
int vec[12] = { 0 };
for (auto i = 0; i < n_threads_per_block; ++i) vec[i] = i + 1;
int sum = parallel_sum(vec, n_threads_per_block);
std::cout << "Sum = " << sum << std::endl;
system("pause"); // Windows-only: keep the console window open
return 0;
}
The cudaMemcpy operation after the kernel is actually asynchronously reporting an error that is due to the kernel execution. Your error reporting is primitive. If you have an error code, you may get more useful information by printing out the result of passing that error code to cudaGetErrorString().
The error is occurring in the kernel due to use of the reference argument:
__global__ void sumVectorElements(int *vec,const int& size) {
^^^^^^^^^^^^^^^
Any argument you pass to a kernel and expect to be usable in kernel code must refer to data that is passed by value, or else data that is accessible/referenceable from device code. For example, passing a host pointer to device code is generally not legal in CUDA, because an attempt to dereference a host pointer in device code will fail.
The exceptions to the above would be data/pointers/references that are accessible in device code. Unified memory and pinned/mapped data are two examples, neither of which are being used here.
As a result, the reference parameter involves a reference (an address, basically) for an item (size) in host memory. When the kernel code attempts to use this item, it must first de-reference it. The dereferencing of a host item in device code is illegal in CUDA (unless using UM or pinned memory).
The solution in this case is simple: convert to an ordinary pass-by-value situation:
__global__ void sumVectorElements(int *vec,const int size) ...
^
remove ampersand
New to CUDA programming and extremely confused as to why I am getting the segfault in the following code:
#include <cuda.h>
#include <stdio.h>
#include <stdint.h>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
using namespace std;
// A candidate password: fixed 56-byte buffer plus the actual string length.
typedef struct password_t{
char word[56];
size_t length;
} password;
// Library entry pairing a password with its 16-byte MD5 digest.
typedef struct libEntry_t{
uint8_t digest[16];
password pwd;
} libEntry;
// Generates a library of passwords and their corresponding MD5 hashes
//
// Params:
// numPwds - the number of passwords for which to generate hashes
// pwds - the list of passwords to hash
// library - the array in which to store the unhashed/hashed password library
// Kernel: one thread per password. Hashes pwds[index] with cuda_md5
// (presumably a __device__ helper defined elsewhere in the project - not
// shown in this file) and stores the digest/password pair in library[index].
__global__ void generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
{
// __device__ void cuda_md5(const password *pwd, uint8_t *digest) {
int index = (blockIdx.x * blockDim.x) + threadIdx.x;
uint8_t hashed[16]; // per-thread scratch digest
if (index < numPwds) {
cuda_md5(&pwds[index], hashed);
for (int j = 0; j < 16; j++) {
library[index].digest[j] = hashed[j];
}
library[index].pwd = pwds[index];
}
}
// Reads up to `count` passwords from passwords.txt, copies them to the GPU,
// and (in the commented-out section) would hash them on the device.
// Note: the `classified` argument is unused in the code shown here.
int crack_password (uint8_t* classified)
{
int count = 10;
unsigned int mem_size = sizeof(password) * count;
password *h_pwds = (password*) malloc(mem_size);
ifstream inFile("passwords.txt");
if (!inFile) {
cerr << "File passwords.txt not found." << endl;
return -1;
}
string line;
int i;
// BUG(review): `i` is used UNINITIALIZED as the h_pwds index below, so these
// writes can land anywhere - a plausible cause of the reported segfault.
// There is also no bound check against `count`, and memcpy does not
// NUL-terminate h_pwds[i].word before it is streamed to cout.
while (getline(inFile, line)) {
if (line.empty()) continue;
memcpy(h_pwds[i].word,line.c_str(),line.size());
h_pwds[i].length = line.size();
cout << "Password: " << h_pwds[i].word << "\n";
cout << "Length: " << h_pwds[i].length << "\n";
i++;
}
inFile.close();
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
int h_numPwds = i;
cout << "INT NUMPWDS: " << h_numPwds << "\n";
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
// NOTE(review): the commented launch below reads d_numPwds[0] on the HOST,
// which dereferences a device pointer; the later fix (end of this question)
// simply passes the host int `i` by value instead.
/*unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<<grid, threads>>>(d_numPwds[0], d_pwds, d_library);
cudaMemcpy( h_library, d_library, mem_size, cudaMemcpyDeviceToHost);*/
return 0;
}
// Entry point: argv[1] is the password to crack; crack_password currently
// only loads and uploads the library (its kernel launch is commented out).
int main(int argc, char *argv[])
{
if (argc != 2) {
fprintf(stderr, "usage: ./prog password\n");
return 1;
}
crack_password((uint8_t*) argv[1]);
cout << "Hack Password: " << argv[1] << "\n";
return 0;
}
I have gone through it line by line and I believe it happens on the following lines:
int* d_numPwds;
cudaMalloc( (void**) &d_numPwds, sizeof(int));
cudaMemcpy( d_numPwds, &h_numPwds, sizeof(int), cudaMemcpyHostToDevice);
When I comment cudaMemcpy above, I at least get the cout output on my terminal. Note that I have not gotten to the kernel execution part yet, I am just focusing on the memory allocation before I can actually execute and debug the kernel. Any help will be appreciated!
How I have been checking for return status:
// Question's error-check macro: runs the call via CUDA_SAFE_CALL_NO_SYNC
// (defined elsewhere, not shown), then synchronises and reports any
// outstanding error with file/line context.
// NOTE(review): cudaThreadSynchronize() is the deprecated spelling of
// cudaDeviceSynchronize().
#define CUDA_SAFE_CALL(call) do { \
CUDA_SAFE_CALL_NO_SYNC(call); \
cudaError err = cudaThreadSynchronize(); \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)
EDIT: The error still occurs after I took care of the int memcpy and malloc, apparently I didn't have to alloc or cpy it. Could've just passed it over. So, the error is due to the following lines, and I am not sure which one or why?
password* d_pwds;
cudaMalloc( (void**) &d_pwds, mem_size);
cudaMemcpy( d_pwds, h_pwds, mem_size, cudaMemcpyHostToDevice);
libEntry *h_library = (libEntry*) malloc(sizeof(libEntry) * count);
libEntry* d_library;
cudaMalloc( (void**) &d_library, sizeof(libEntry) * count);
EDIT2: I cleaned up everything and still can't figure it out. By having CUDA_SAFE_CALL on the following line CUDA_SAFE_CALL( cudaMalloc((void**) &d_pwds, pwds_size)); I get segmentation fault even when every other memory allocation command is commented out.
For someone wondering what went wrong, I was able to fix it. I am not exactly sure what exactly was wrong but I had improper memory allocations at some places and in other cases I didn't even needed to use cudaMalloc or cudaMemcpy. Also, using What is the canonical way to check for errors using the CUDA runtime API? for checking errors instead of my own implementation worked. What I have now:
/***** KERNEL CONFIGURATION & MEMORY MANAGEMENT ******/
/***** GENERATE HASHED PASSWORD LIBRARY FOR COMPARE **/
unsigned int threads_per_block = 1024;
dim3 grid(1024, 1, 1);
dim3 threads(threads_per_block, 1, 1);
password* d_pwds;
ERROR_CHECK( cudaMalloc((void**) &d_pwds, pwds_size));
ERROR_CHECK( cudaMemcpy( d_pwds, h_pwds, pwds_size, cudaMemcpyHostToDevice));
libEntry* d_library;
ERROR_CHECK( cudaMalloc( (void**) &d_library, sizeof(libEntry) * count));
// generateLibraryKernel(int numPwds, password* pwds, libEntry* library)
generateLibraryKernel<<<grid, threads>>>(i, d_pwds, d_library);
ERROR_CHECK( cudaPeekAtLastError() );
ERROR_CHECK( cudaDeviceSynchronize() );
Where ERROR_CHECK is defined from the link above.
// Canonical CUDA error-check wrapper (from the linked canonical answer):
// forwards the status plus the call site to gpuAssert, which prints the
// error string and exits unless `abort` is false.
#define ERROR_CHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
I still don't fully understand memory management in CUDA (device and host allocations) but my code works now! Thank you all.
So I am trying to write a program that turns RGB images to greyscale.
I got the idea from the Udacity problem set. The problem is that when I write out the kernel in the Udacity web environment, it says my code works, however, when I try to do it locally on my computer, I get no errors, but my image instead of coming out greyscale, comes out completely grey. It looks like one grey box the dimensions of the image I loaded. Can you help me find the error in my code, I've compared it with the Udacity version and I can't seem to find it.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <string>
#include <cuda.h>
#include <stdio.h>
#include <opencv\cv.h>
#include <opencv\highgui.h>
#include <iostream>
// Compile-time switch: defining CUDA_ERROR_CHECK enables both checkers below.
#define CUDA_ERROR_CHECK
// Wraps a CUDA runtime call; reports a failing status with file/line context.
#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
// Checks the sticky last-error state (e.g. after a kernel launch).
#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ )
// Implementation behind CudaSafeCall: print and exit on any non-success status.
inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err)
{
fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
// Implementation behind CudaCheckError: first inspects the last recorded
// error (catches bad launch configurations), then synchronises so that
// asynchronous kernel-execution errors are surfaced too.
inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
// Blocking: waits for all queued device work; costly, so debugging only.
err = cudaDeviceSynchronize();
if (cudaSuccess != err)
{
fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
file, line, cudaGetErrorString(err));
exit(-1);
}
#endif
return;
}
// Kernel: one thread per pixel; converts an RGBA pixel to a single luma byte.
__global__ void rgb_2_grey(uchar* const greyImage, const uchar4* const rgbImage, int rows, int columns)
{
int rgb_x = blockIdx.x * blockDim.x + threadIdx.x; //x coordinate of pixel
int rgb_y = blockIdx.y * blockDim.y + threadIdx.y; //y coordinate of pixel
// BUG (the asker's own fix, noted at the end of this question): the guard
// must use || not &&. With && a thread only exits when BOTH coordinates
// overshoot, so edge-block threads with one out-of-range coordinate fall
// through and read/write out of bounds.
if ((rgb_x >= columns) && (rgb_y >= rows)) {
return;
}
int rgb_ab = rgb_y*columns + rgb_x; //absolute pixel position
uchar4 rgb_Img = rgbImage[rgb_ab];
// BT.601 weights; .x/.y/.z hold R/G/B after the host's BGR->RGBA conversion.
greyImage[rgb_ab] = uchar((float(rgb_Img.x))*0.299f + (float(rgb_Img.y))*0.587f + (float(rgb_Img.z))*0.114f);
}
using namespace cv;
using namespace std;
void Proc_Img(uchar4** h_RGBImage, uchar** h_greyImage, uchar4 **d_RGBImage, uchar** d_greyImage);
void RGB_2_Greyscale(uchar* const d_greyImage, uchar4* const d_RGBImage, size_t num_Rows, size_t num_Cols);
void Save_Img();
Mat img_RGB;
Mat img_Grey;
uchar4 *d_rgbImg;
uchar *d_greyImg;
// Entry point: load/convert the hard-coded image, grey it on the GPU, save.
// Device pointers travel through the globals d_rgbImg/d_greyImg; the EDIT
// below describes the shadowing bug caused when identically-named locals
// existed here.
int main()
{
uchar4* h_rgbImg;
//uchar4* d_rgbImge=0;
uchar* h_greyImg;
//uchar* d_greyImge=0;
Proc_Img(&h_rgbImg, &h_greyImg, &d_rgbImg, &d_greyImg);
RGB_2_Greyscale(d_greyImg, d_rgbImg, img_RGB.rows, img_RGB.cols);
Save_Img();
return 0;
}
// Loads the source image, converts it to RGBA, allocates host/device buffers
// for the colour and greyscale images, and uploads the colour pixels.
// On return: *h_RGBImage/*h_greyImage point into the global Mats, and
// *d_RGBImage/*d_greyImage (mirrored into the globals d_rgbImg/d_greyImg)
// hold the device allocations.
void Proc_Img(uchar4** h_RGBImage, uchar** h_greyImage, uchar4 **d_RGBImage, uchar** d_greyImage){
// cudaFree(0) forces lazy CUDA context creation up front.
cudaFree(0);
CudaCheckError();
//loads image into a matrix object along with the colors in BGR format (must convert to rgb).
Mat img = imread("C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581.JPG", CV_LOAD_IMAGE_COLOR);
if (img.empty()){
cerr << "couldnt open file dumbas..." << "C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581.JPG" << endl;
exit(1);
}
//converts color type from BGR to RGBA (4 channels, matching uchar4)
cvtColor(img, img_RGB, CV_BGR2RGBA);
//allocate memory for new greyscale image.
//img.rows returns the range of pixels in y, img.cols returns range of pixels in x
//CV_8UC1 means 8 bit unsigned(non-negative) single channel of color, aka greyscale.
//all three of the parameters allow the create function in the Mat class to determine how much memory to allocate
img_Grey.create(img.rows, img.cols, CV_8UC1);
//creates rgb and greyscale image arrays
*h_RGBImage = (uchar4*)img_RGB.ptr<uchar>(0); //.ptr is a method in the mat class that returns a pointer to the first element of the matrix.
*h_greyImage = (uchar*)img_Grey.ptr<uchar>(0); //this is just like a regular array/pointer mem address to first element of the array. This is templated
//in this case the compiler runs the function for returning pointer of type unsigned char. for rgb image it is
//cast to uchar4 struct to hold r,g, and b values.
const size_t num_pix = (img_RGB.rows) * (img_RGB.cols); //amount of pixels
//allocate memory on gpu
cudaMalloc(d_RGBImage, sizeof(uchar4) * num_pix); //bytes of 1 uchar4 times # of pixels gives number of bytes necessary for array
CudaCheckError();
cudaMalloc(d_greyImage, sizeof(uchar) * num_pix);//bytes of uchar times # pixels gives number of bytes necessary for array
CudaCheckError();
cudaMemset(*d_greyImage, 0, sizeof(uchar) * num_pix);
CudaCheckError();
//copy array into allocated space
cudaMemcpy(*d_RGBImage, *h_RGBImage, sizeof(uchar4)*num_pix, cudaMemcpyHostToDevice);
CudaCheckError();
// Mirror device pointers into the globals used by RGB_2_Greyscale/Save_Img.
d_rgbImg = *d_RGBImage;
d_greyImg = *d_greyImage;
}
// Launches rgb_2_grey with 16x16 blocks; the grid is padded by one block per
// dimension to cover image sizes that are not multiples of 16 (which is why
// the kernel needs a correct bounds guard).
void RGB_2_Greyscale(uchar* const d_greyImage, uchar4* const d_RGBImage, size_t num_Rows, size_t num_Cols){
const int BS = 16;
const dim3 blockSize(BS, BS);
const dim3 gridSize((num_Cols / BS) + 1, (num_Rows / BS) + 1);
rgb_2_grey <<<gridSize, blockSize>>>(d_greyImage, d_RGBImage, num_Rows, num_Cols);
cudaDeviceSynchronize(); CudaCheckError();
}
// Copies the device greyscale buffer back into img_Grey, writes it to disk,
// and frees both device buffers.
void Save_Img(){
const size_t num_pix = (img_RGB.rows) * (img_RGB.cols);
cudaMemcpy(img_Grey.ptr<uchar>(0), d_greyImg, sizeof(uchar)*num_pix, cudaMemcpyDeviceToHost);
CudaCheckError();
imwrite("C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581GR.JPG", img_Grey);
cudaFree(d_rgbImg);
cudaFree(d_greyImg);
}
EDIT: I realized that the local var in my main has the same name as the global var; I have edited the code here, and now I get the error from Visual Studio that the
variable d_rgbImg is being used without being initialized
when I have already initialized it above. If I set them equal to zero I get a CUDA error saying
an illegal memory access was encountered
I tried running cuda-memcheck, but then I get the error that i could not run the file...
I have found the error thanks to one of the comments by Robert Crovella, he has been very helpful with this! it is in my kernel the if statement should read if ((rgb_x >= columns) || (rgb_y >= rows)) {
I was working on the same problem in JCUDA. See if you can use any part of this solution:
//Read Height and Width of image in Height & Width variables
int Width = image.getWidth();
int Height = image.getHeight();
int N = Height * Width;
int[] grayScale = new int[N];
//Allocate separate arrays to store Alpha, Red, Green and
//Blue values for every pixel
int[] redHost = new int[N];
int[] greenHost = new int[N];
int[] blueHost = new int[N];
int[] alphaHost = new int[N];
for(int i=0; i<Height; i++)
{
for(int j=0; j<Width; j++)
{
int pixel = image.getRGB(j, i);
//Read the ARGB data
alphaHost[i*Width+j] = (pixel >> 24) & 0xff;
redHost[i*Width+j] = (pixel >> 16) & 0xff;
greenHost[i*Width+j] = (pixel >> 8) & 0xff;
blueHost[i*Width+j] = (pixel) & 0xff;
}
}
/* Following are the CUDA Kernel parameters*/
Pointer kernelParameters = Pointer.to(
Pointer.to(new int[]{N}), //Total size of each array W * H
Pointer.to(redDev), // Pointer to redArray on device
Pointer.to(greenDev), // Pointer to greenArray on device
Pointer.to(blueDev), // Pointer to blueArray on device
Pointer.to(Output)); //Pointer to output array
/*Following is my RGBToGrayScale.cu..i.e. CUDA Kernel */
// CUDA kernel (from the JCUDA answer): one thread per pixel. Combines the
// separate R/G/B planes into a luma value with the BT.601-style weights
// 0.2989/0.587/0.114; the floating-point sum is truncated on assignment to
// the int Output element.
__global__ void RGBtoGrayScale(int N, int *red, int *green, int *blue, int *Output)
{
int id = blockIdx.x * blockDim.x + threadIdx.x;
if(id<N)
{
Output[id] = (red[id]*0.2989) + (green[id]*0.587) + (blue[id]*0.114);
}
}
/* Get the output data back to Host memory */
cuMemcpyDtoH(Pointer.to(grayScale), Output, N * Sizeof.INT);
/* Write the image with the new RBG values*/
BufferedImage im = new BufferedImage(Width,Height,BufferedImage.TYPE_BYTE_GRAY);
WritableRaster raster = im.getRaster();
for(int i=0;i<Height;i++)
{
for(int j=0;j<Width;j++)
{
raster.setSample(j, i, 0, grayScale[i*Width+j]);
}
}
try
{
ImageIO.write(im,"JPEG",new File("glpattern.jpeg"));
} catch (IOException e)
{
e.printStackTrace();
}
I am trying to use CUDA to create a bitmap image of the mandlebrot set. I have looked at a few tutorials and already got some help here for the process of integrating the unmanaged CUDA dll with the managed C# gui. The problem I am having now is that my CUDA dll is not forming the bitmap correctly - and when I use an error checking macro on cudaDeviceSynchronize() after the kernel launch, I get cudaUnknownError.
Here is the relevant code:
// Error-check wrapper used throughout this question: prints
// "GPUassert: <message> <file> <line>" and exits on any failure.
// NOTE(review): `file` could be const char* since it only receives __FILE__.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
// Minimal device-side complex number used by the mandelbrot kernels.
struct complex
{
float r, i; // real and imaginary parts
__device__ complex(float _r, float _i) : r(_r), i(_i) {}
// Squared modulus |z|^2 = r^2 + i^2 (cheaper than a true magnitude).
__device__ float magnitudeSquared(){ return (r*r + i*i) ; }
// Complex multiply-assign: (a+bi)(c+di) = (ac-bd) + (ad+bc)i.
// FIX(review): the original overwrote r first and then used the NEW r
// when computing i, corrupting the imaginary part; both components must
// be computed from the pre-update values.
__device__ complex& operator*=(const complex& rhs)
{
float oldR = r;
r = (oldR * rhs.r - i * rhs.i);
i = (oldR * rhs.i + i * rhs.r);
return *this;
}
// Complex add-assign: component-wise sum.
__device__ complex& operator+=(const complex& rhs)
{
r = (r + rhs.r);
i = (i + rhs.i);
return *this;
}
};
// Device helper: iterates z <- z*z + c (with c = the initial z) and returns
// 1 if the orbit escapes within MAX_ITERATIONS, 0 if it is treated as inside
// the set.
// NOTE(review): the escape test compares |z|^2 against 2; the conventional
// Mandelbrot bailout is |z| > 2, i.e. |z|^2 >= 4 - confirm which is intended.
__device__ int mandlebrotDiverge(complex *z)
{
complex c(*z);
int i = 0;
for(i = 0; i < MAX_ITERATIONS; i++)
{
*z *= *z;
*z += c;
if(z->magnitudeSquared() >= 2)
{
return 1;
}
}
return 0;
}
// Kernel: one thread per pixel; maps (x, y) to a point scaled to +/-1.5
// around the image centre and writes three identical colour components
// (white if the orbit diverges, black otherwise).
__global__ void kernel(int *ptr, int width, int height)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float scale = 1.5f;
complex z(scale*(float)(width/2 - x)/(width/2), scale*(float)(height/2 - y)/(height/2));
// NOTE(review): the bound hard-codes 1920*1080 instead of width*height, and
// ptr is int* while the host only allocated 3*width*height BYTES - exactly
// the out-of-bounds writes cuda-memcheck reported (see the answer at the
// end: cudaMalloc/cudaMemcpy sizes must be in bytes, i.e. *sizeof(int)).
if(offset < (1920*1080))
{
int mValue = mandlebrotDiverge(&z);
ptr[offset*3 + (uint8_t)0] = (uint8_t)(mValue*255);
ptr[offset*3 + (uint8_t)1] = (uint8_t)(mValue*255);
ptr[offset*3 + (uint8_t)2] = (uint8_t)(mValue*255);
}
}
// Exported DLL entry point: renders a 1920x1080 mandelbrot image into the
// caller-supplied `bitmap` buffer (3 components per pixel).
// BUG (addressed by the answer below): dev_bmp is an int*, but cudaMalloc
// and cudaMemcpy are given 3*width*height with no *sizeof(int), so only a
// quarter of the required bytes are allocated and copied.
extern "C" __declspec(dllexport) void __cdecl generateBitmap(void *bitmap)
{
int width = 1920;
int height = 1080;
int *dev_bmp;
dim3 blocks(width/16, height/16);
dim3 threads(16, 16);
gpuErrchk(cudaMalloc((void**)&dev_bmp, (3*width*height)));
kernel<<<blocks, threads>>>(dev_bmp, width, height);
gpuErrchk(cudaPeekAtLastError());   // launch/config errors
gpuErrchk(cudaDeviceSynchronize()); // asynchronous execution errors
gpuErrchk(cudaMemcpy(bitmap, dev_bmp, (width*height*3), cudaMemcpyDeviceToHost));
cudaFree(dev_bmp);
}
When I step through the code, everything appears to be working correctly until I get to gpuErrchk(cudaDeviceSynchronize()); - when I step into that, the error code simply says 'cudaUnknownError'. I have no clue what I am doing wrong at this point. Any help or advice on how to improve this solution would be appreciated.
EDIT:
Okay, looked at CUDA_memcheck, and I get this error (hundreds of times):
========= CUDA-MEMCHECK
========= Invalid __global__ write of size 4
========= at 0x00000a10 in C:/.../kernel.cu:77:kernel(int*, int, int)
========= by thread (15,11,0) in block (1,17,0)
========= Address 0x05a37f74 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
So I changed from *int to *unsigned char because I am trying to allocate arrays of individual bytes, not ints. Lots of errors cleared up, but now I get this:
========= CUDA-MEMCHECK
========= Program hit error 6 on CUDA API call to cudaDeviceSynchronize
========= Saved host backtrace up to driver entry point at error
========= Host Frame:C:\Windows\system32\nvcuda.dll (cuD3D11CtxCreate + 0x102459) [0x11e4b9]
========= Host Frame:C:\...\cudart32_55.dll (cudaDeviceSynchronize + 0xdd) [0x1149d]
========= Host Frame:C:\...\FractalMaxUnmanaged.dll (generateBitmap + 0xf0) [0x97c0]
=========
========= ERROR SUMMARY: 1 error
Okay, I'm making progress, but now when I step through the c# application, the byte buffer has 255 as the value for every byte, which doesn't make sense. Here is the c# code:
// P/Invoke bridge to the native CUDA DLL plus a helper that renders the
// fractal into a managed Bitmap.
// NOTE(review): the DllImport path literal starts with '#' where a verbatim
// string '@' is expected - likely a paste artifact; confirm against the
// original project.
public unsafe class NativeMethods
{
[DllImport(#"C:\Users\Bill\Documents\Visual Studio 2012\Projects\FractalMaxUnmanaged\Debug\FractalMaxUnmanaged.dll", CallingConvention=CallingConvention.Cdecl)]
public static extern void generateBitmap(void *bitmap);
// Allocates a 24bpp buffer (3 bytes per pixel), lets the native code fill
// it in place, then wraps a copy of it in a Bitmap.
// NOTE(review): the native kernel writes 4-byte ints per component while
// this buffer is byte[] - see the accepted answer about byte/int sizing;
// that mismatch explains the all-255 bytes observed above.
// NOTE(review): this Bitmap constructor may not copy the scan0 buffer;
// freeing unmanagedPtr before the Bitmap is disposed looks unsafe - confirm.
public static Bitmap create()
{
byte[] buf = new byte[1920 * 1080 * 3];
fixed (void* pBuffer = buf)
{
generateBitmap(pBuffer);
}
IntPtr unmanagedPtr = Marshal.AllocHGlobal(buf.Length);
Marshal.Copy(buf, 0, unmanagedPtr, buf.Length);
Bitmap img = new Bitmap(1920, 1080, 1920*3, PixelFormat.Format24bppRgb, unmanagedPtr);
Marshal.FreeHGlobal(unmanagedPtr);
return img;
}
}
Your problem here is that your memory allocations and copies are wrong, you are forgetting that cudaMalloc/cudaMemcpy expect the size in bytes. Since int uses 4 bytes, you are actually allocating less memory than required by your kernel. Use this instead (or use unsigned char which only needs 1 byte):
cudaMalloc((void**)&dev_bmp, (3*width*height)*sizeof(int));
cudaMemcpy(bitmap, dev_bmp, (3*width*height)*sizeof(int), cudaMemcpyDeviceToHost);
Also make sure that bitmap was allocated correctly. As #Eugene said, using cuda-memcheck is a good way to find the source of that kind of errors.