Cuda Memcpy from Device to Host crashes - c++

I m trying to find a minimum of RGB around the patch size of 15 x 15
In source.cpp file at
SAFE_CALL(cudaMemcpy(Dark_h, Dark_d, size2, cudaMemcpyDeviceToHost));
program get crashed
Here is my code snippet
darkprior.h
#ifndef DARKPRIOR_H_INCLUDED
#define DARKPRIOR_H_INCLUDED
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include "opencv2/opencv.hpp"
#define SAFE_CALL(call) \
do \
{ \
cudaError_t err = (call); \
if(cudaSuccess != err) \
{ \
fprintf(stderr,"CUDA Error:\nFile = %s\nLine = %d\nReason = %s\n", __FILE__, __LINE__, cudaGetErrorString(err)); \
cudaDeviceReset(); \
exit(EXIT_FAILURE); \
} \
} \
while (0)
void dark_channel(float *image_d, float *rgbmin_d, int height, int width);
#endif
Source.cpp
#include "DarkPrior.h"
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
int main()
{
//load the image
Mat src = imread("foggy_river.jpg");
//check whether image loaded is empty or not.
if (src.empty())
{
cerr << "no image"; return -1;
}
//Mat rgbMin(src.size(), CV_MAKETYPE(src.depth(), 1));
// int step = src.step;
float *image_h = NULL;
float *image_d = NULL;
float *Dark_d = NULL;
float *Dark_h = NULL;
//Mat rgbmin(src.size(), CV_MAKETYPE(src.depth(), 1));
size_t size1 = src.step * src.rows * sizeof(float);
size_t size2 = src.cols * src.rows * sizeof(float);
image_h = (float *)malloc(size1);
Dark_h = (float *)malloc(size1);
SAFE_CALL(cudaMalloc((void**)&image_d, size1));
SAFE_CALL(cudaMalloc((void**)&Dark_d, size2));
//convert image from CV::MAT to float*.
Mat dst;
src.convertTo(dst, CV_32F);
image_h = dst.ptr<float>();
SAFE_CALL(cudaMemcpy(image_d, image_h, size1, cudaMemcpyHostToDevice));
cout << "Calculating Minimum of RGB ..." << endl;
dark_channel(image_d, Dark_d, src.rows, src.cols);
SAFE_CALL(cudaMemcpy(Dark_h, Dark_d, size2, cudaMemcpyDeviceToHost));
Mat Dark_out(src.rows, src.cols, CV_32FC1, Dark_h);
imwrite("MinRGB.jpg", Dark_out);
cudaFree(image_d);
cudaFree(Dark_d);
//free(image_h);
//free(rgbmin_h);
return 0;
}
minRGB.cu
#include "DarkPrior.h"
//#define min(x,y) ((x<y)?x:y)
__device__ float safe_get(float *rgbMin, int width, int height, int x, int y)
{
// Clamp indices to image boundaries
x = min( max(0, x), width - 1);
y = min( max(0, y), height - 1);
// Translate 2D index into 1D index
const int idx = y * width + x ;
return rgbMin[idx];
}
__device__ float estimate_minimum_patch(float *rgbMin, int width, int height, int radius, int x, int y, float Minval)
{
for(int i = -radius; i <= radius; i++)
{
for(int j = -radius; j <= radius; j++)
{
float val = safe_get(rgbMin, width, height, x+i, y+j);
Minval = min (val, Minval);
}
}
}
__global__ void kernel_darkChannel (float *rgbMin, float *darkCh, int height, int width)
{
int radius = 7;
int x = blockIdx.x; // Current column
int y = blockIdx.y; // Current row
int tid = y * width + x;
float Minval = 255.0;
estimate_minimum_patch(rgbMin, width, height, radius, x, y, Minval);
darkCh[tid] = Minval;
}
__global__ void kernel_findMinRGB (float3 *image, float *tmp_min, int height, int width)
{
int x = blockIdx.x; // Current column
int y = blockIdx.y; // Current row
int i = y * width + x;
if(x > height && y > width)
{
return;
}
tmp_min[i] = min(image[i].x, min(image[i].y, image[i].z));
}
void dark_channel(float *image_d, float *Dark_d, int height, int width)
{
dim3 grid(width, height);
float *tmp_min;
cudaMalloc((void **)(&tmp_min), sizeof(float)*height*width);
kernel_findMinRGB <<<grid, 1>>> ((float3 *)image_d, tmp_min, height, width);
printf("RGB min is found\n");
kernel_darkChannel <<<grid, 1>>> (tmp_min, Dark_d, height, width);
printf("patch of minimum is also found\n");
return;
}
My code getting crashed with an error of unknown error # line 45 of source.cpp
I'm totally out of thoughts what is the reason, maybe you'll be able to help.

Pointer Dark_h points to host memory segment of size1 bytes. Pointer Dark_d points to device memory segment of size2 bytes. If size1 < size2 the call:
cudaMemcpy(Dark_h, Dark_d, size2, cudaMemcpyDeviceToHost)
will be troublesome as you'll write illegal memory (memory that's not part of array segment to which Dark_h points to, and perhaps you'll get SEGFAULT). I haven't tried it but I bet this is the reason behind the crash.

Related

Error: an illegal memory access was encountered

I had asked doubt error: calling a __host__ function from a __global__ function is not allowed and i got the ans . accordingly i have modified my code bt i am unable to access d_point[i]. how can i access that.
__global__ void densefun(int *d_counters,float2 *d_points,int d_x_max,int d_y_max,int width,int height, int min_distance,int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i <= size)
{
float2 point = (d_points)[i];
int x = floorf(point.x);
int y = floorf(point.y);
printf(" ( %d %d )",x,y);
if(x < d_x_max && y < d_y_max)
{
x /= min_distance;
y /= min_distance;
(d_counters)[y*width+x]++;
__syncthreads();
}
}
}
void DenseSample(const Mat& grey, std::vector<Point2f>& points, const double quality, const int min_distance)
{
int width = grey.cols/min_distance;
int height = grey.rows/min_distance;
Mat eig;
cornerMinEigenVal(grey, eig, 3, 3);
double maxVal = 0;
minMaxLoc(eig, 0, &maxVal);
const double threshold = maxVal*quality;
std::vector<int> counters(width*height);
int x_max = min_distance*width;
int y_max = min_distance*height;
printf("in descriptor size:%ld ",points.size());
int *d_counters;
float2 *d_points;
cudaMalloc(&d_counters,counters.size()*width*height*sizeof(int));
printf("in cuda point size:%d ",points.size());
cudaMalloc(&d_points,points.size()*sizeof(float2));
cout<<"points.size() : "<<points.size()<<endl;
cudaMemcpy(d_points, &points, points.size()*sizeof(float2), cudaMemcpyHostToDevice);
int blk=cvFloor(points.size()/1024)+1;
cout<<"blk : "<<blk<<endl;
if(points.size()>0)
{
densefun<<<blk,1024>>>(d_counters,d_points,x_max,y_max,width,height,min_distance, points.size());
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
printf("Error: %s\n", cudaGetErrorString(err));
cudaMemcpy(&counters, d_counters, counters.size()* width*height*sizeof(int), cudaMemcpyDeviceToHost);
}
cudaFree(d_counters);
cudaFree(d_points);
points.clear();
int index = 0;
int offset = min_distance/2;
for(int i = 0; i < height; i++)
for(int j = 0; j < width; j++, index++)
{
if(counters[index] <= 0)
{
int x = j*min_distance+offset;
int y = i*min_distance+offset;
if(eig.at<float>(y, x) > threshold)
points.push_back(Point2f(float(x), float(y)));
}
}
}
output is:
in descriptor size:1605 in cuda point size:1605 points.size() : 1605
blk : 2
Error: an illegal memory access was encountered
in descriptor size:918 in cuda point size:918 points.size() : 918
blk : 1
Error: an illegal memory access was encountered
You create a thread gird with block length 1024 and grid length equal to
int blk=cvFloor(points.size()/1024)+1;
Which basically means that the number of threads will be multiple of 1024 greater than points.size(). In this case using:
int i = blockDim.x * blockIdx.x + threadIdx.x;
float2 point = (d_points)[i];
cannot be successful, because you can be almost certain that you will get out of bounds memory access. Add some conditional to ensure that it won't happen.
__global__ void densefun(int *d_counters,float2 *d_points,int d_x_max,int d_y_max,int width, int height, int min_distance)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < width * height)
{
//rest of the code
}
}
Also, you don't allocate enugh memory for d_points:
float2 *d_points;
cudaMalloc(&d_points,points.size()*sizeof(float));
If you want to allocate array of float2 (or copy to it) you need to use sizeof(float2).

visual studio 2012 parallelism with openmp

I am studying computer architecture in the university.
I have a home work which making convolution faster using parallelism(openMP).
For now I made convolution code (your_convolution) with omp, but It did not be faster at all!
I'm using visual studio 2012.
How can i make it faster??
here's whole convolution's code.
give me some help.
#include <intrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <vector>
#include <assert.h>
#include <omp.h>
using namespace std;
void convolution(float* output, float* input, float* filter, int width, int height, int r)
{
assert(output!=NULL && input!=NULL && filter!=NULL && width>0 && height>0 && r>0);
int w1=width-1;
int h1=height-1;
int fwidth=2*r+1;
int i, j, di, dj, ii, jj;
float sum;
for (i=0;i<height;++i)
{
for (j=0;j<width;++j)
{
sum=0;
for (di=-r;di<=r;++di)
{
ii=i+di;
ii=max(min(ii,h1),0);
for (dj=-r;dj<=r;++dj)
{
jj=j+dj;
jj=max(min(jj,w1),0);
sum+=filter[dj+r+(di+r)*fwidth]*input[jj+ii*width];
}
}
output[j+i*width]=sum;
}
}
}
void your_convolution(float* output, float* input, float* filter, int width, int height, int r)
{
// write your code here //
assert(output != NULL && input != NULL && filter != NULL && width>0 && height>0 && r>0);
int w1 = width - 1;
int h1 = height - 1;
int fwidth = 2 * r + 1;
int i, j, di, dj, ii, jj;
float sum;
omp_set_num_threads(4);
#pragma omp parallel
{
for (i = 0; i<height; ++i)
{
for (j = 0; j<width; ++j)
{
sum = 0;
for (di = -r; di <= r; ++di)
{
ii = i + di;
ii = max(min(ii, h1), 0);
#pragma omp parallel for
for (dj = -r; dj <= r; ++dj)
{
jj = j + dj;
jj = max(min(jj, w1), 0);
sum += filter[dj + r + (di + r)*fwidth] * input[jj + ii*width];
}
}
output[j + i*width] = sum;
}
}
}
}
int main()
{
// load the image
int width=1920; // width of the image
int height=1080; // height of the image
int len=width*height; // pixels in the image
int i, j, ii, jj, i2;
float* data=(float*)malloc(sizeof(float)*len); // buffer to load the image
float* output=(float*)malloc(sizeof(float)*len); // output buffer
FILE* fp=fopen("../image.dat", "rb"); // open the image, assume that the bld directory is a subdirectory to the src directory
fread(data, sizeof(float), width*height, fp); // load the float values, the image is gray.
fclose(fp);
// set the filter
int radius=3; // filter radius
float sigma=(float)(radius/3.0); // standard deviation of the Gaussian filter
float beta=(float)(-0.5/(sigma*sigma)); // coefficient exp(beta*x*x)
int fwidth=2*radius+1; // width of the filter
int flen=fwidth*fwidth; // number of elements in the filter
float* filter=(float*)malloc(sizeof(float)*flen); // filter buffer
float sum_weight=0; // we want to normalize the filter weights
for (i=-radius;i<=radius;++i)
{
ii=(i+radius)*fwidth;
i2=i*i;
for (j=-radius;j<=radius;++j)
{
jj=j+radius+ii;
filter[jj]=exp(beta*(i2+j*j));
sum_weight+=filter[jj];
}
}
sum_weight=(float)(1.0/sum_weight);
for (i=0;i<flen;++i)
filter[i]*=sum_weight; // now the weights are normalized to sum to 1
clock_t start=clock();
convolution(output, data, filter, width, height, radius);
clock_t finish=clock();
double duration = (double)(finish - start) / CLOCKS_PER_SEC;
printf( "convolution naive: %2.3f seconds\n", duration );
float* output2=(float*)malloc(sizeof(float)*len); // output buffer
start=clock();
your_convolution(output2, data, filter, width, height, radius);
finish=clock();
double duration2 = (double)(finish - start) / CLOCKS_PER_SEC;
printf( "your convolution: %2.3f seconds\n", duration2 );
double sum=0;
for (i=0;i<len;++i)
sum+=fabs(output[i]-output2[i]);
printf("difference of the outputs=%lf\n", sum);
printf( "The performance of your convolve is %2.1f times higher than convolution naive.\n", duration/duration2);
free(data);
free(filter);
free(output);
return 0;
}

How to determine a good centre for a zoom on the Mandelbrot set

I just finished writing a cuda program which renders images of the Mandelbrot set. The way I have it set up is that you pass the function which creates the image a scale which is pixels per unit and the x and y coordinates of the centre of the image in the complex plane. I want to create a deep zoom movie from many frames and I need my program to be able to automatically determine a centre where "interesting" stuff will be happening (not zooming in on a region which will just be all one colour). How should I pick the coordinate to zoom into.
Here is my code if anyone is interested.
#include <iostream>
#include <thrust/complex.h>
#include <cuda.h>
#include <cassert>
#include <cstdio>
#include <algorithm>
typedef double real;
inline void cuda_error(cudaError_t code, const char* lbl)
{
if(code != cudaSuccess)
{
std::cerr << lbl << " : " << cudaGetErrorString(code) << std::endl;
exit(1);
}
}
__global__ void mandelbrot_kernel(unsigned char* pix, real cx, real cy, real pix_scale, size_t w, size_t h, int iters)
{
cy = -cy;
real sx = cx - (w * pix_scale) / 2;
real sy = cy - (w * pix_scale) / 2;
size_t x = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
size_t y = (size_t)blockIdx.y * blockDim.y + threadIdx.y;
if(x >= w || y >= h)
return;
thrust::complex<real> c(sx + pix_scale * x, sy + pix_scale * y);
thrust::complex<real> z(0, 0);
int i = 0;
for(; i < iters && thrust::abs(z) < 2; ++i)
z = z * z + c;
real scale = 255.0 / (real)iters;
size_t q = 3 * (w * y + x);
pix[q] = i * scale;
pix[q + 1] = 255 * sinf(z.imag());
pix[q + 2] = 255 * sinf(z.real());
}
void shade_mandelbrot(unsigned char* pix, real* devs, real cx, real cy, real pix_scale, int w, int h, int iters)
{
dim3 blockDim(16, 16);
dim3 gridDim((w + 15) / 16, (h + 15) / 16);
mandelbrot_kernel<<<gridDim, blockDim>>>(pix, cx, cy, pix_scale, w, h, iters);
}
void ppm_write(FILE* f, unsigned char* pix, int w, int h)
{
assert(fprintf(f, "P6 %d %d 255\n", w, h) > 0);
size_t sz = 3 * (size_t)w * (size_t)h;
assert(fwrite(pix, 1, sz, f) == sz);
}
int main()
{
int dim = 2000;
int w = dim;
int h = dim;
int imgs = 200;
int iters = 1024;
real cx = -0.7463, cy = 0.1102;
cuda_error(cudaSetDevice(0), "Set Device");
unsigned char* pix_buffers[2];
real* dev_buffers[2];
cuda_error(cudaHostAlloc(pix_buffers, 3 * sizeof(unsigned char) * w * h, 0), "Host Alloc 1");
cuda_error(cudaHostAlloc(pix_buffers + 1, 3 * sizeof(unsigned char) * w * h, 0), "Host Alloc 2");
real scale = 8.0 / w;
shade_mandelbrot(pix_buffers[0], dev_buffers[0], cx, cy, scale, w, h, iters);
for(int i = 0; i < imgs; i++)
{
cuda_error(cudaDeviceSynchronize(), "Sync");
std::cout << scale << std::endl;
if(i < (imgs - 1))
shade_mandelbrot(pix_buffers[(i + 1) % 2], dev_buffers[(i + 1) % 2], cx, cy, scale *= 0.97, w, h, 255);
char fn[100];
sprintf(fn, "/media/chase/3161D67803D8C5BE/Mandelbroght/image%06d.ppm", i);
puts(fn);
FILE* f = fopen(fn, "w");
assert(f);
ppm_write(f, pix_buffers[i % 2], w, h);
fclose(f);
}
cuda_error(cudaFreeHost(pix_buffers[0]), "Host Free 1");
cuda_error(cudaFreeHost(pix_buffers[1]), "Host Free 2");
return 0;
}
Points with a high iteration count (but not equal to iters) when exiting the inner loop will present interesting behavior because they are close to the set boundary. You could just pick points in random, run it through the algorithm and use the point with the highest count as the center. You may get results faster if you pick a few points in random, take the point with the highest iteration count, generate a few points around it, see if you get an even higher iteration count, and repeat with the best of those points.
Well I came up with an idea which works really well. What I do I calculate the entropy for each 16x16 square in the image. I then just zoom into the the area with the maximum entropy for that image.
__global__ void entropy_kernel(unsigned char* pix, float* entropy, size_t w, size_t h)
{
__shared__ float probs[256];
__shared__ float e;
if(threadIdx.x == 0 && threadIdx.y == 0)
{
e = 0;
for(int i = 0; i < 256; i++)
probs[i] = 0;
}
__syncthreads();
int x = blockIdx.x * ENTROPY_BLOCK_DIM + threadIdx.x;
int y = blockIdx.y * ENTROPY_BLOCK_DIM + threadIdx.y;
int px = pix[3 * (y * w + x)];
float p = 1.0 / (float)(ENTROPY_BLOCK_DIM * ENTROPY_BLOCK_DIM);
atomicAdd(probs + px, p);
__syncthreads();
p = probs[px];
if(p) atomicAdd(&e, p * log10f(p));
__syncthreads();
if(threadIdx.x == 0 && threadIdx.y == 0)
{
entropy[blockIdx.y * gridDim.x + blockIdx.x] = -e;
}
}
Here is how it turned out.
https://www.youtube.com/watch?v=mtxbdoJBA0Q

Code for Julia set always generates a grey image in CUDA C

I've written the following code for generating a Julia set fractal in CUDA C/C++ with some help from online sources. I've been trying for hours now, but I'm unable to figure out as to why this always generates a grey image rather than the one I get when I run the CPU code. I'm new to CUDA C and parallel programming, and I'm currently referring to CUDA by Example by Sanders and Kandrot.
Here is the CPU variant of code, which runs fine with all the necessary imports in VS2013:
/*
References:
[1] http://stackoverflow.com/questions/23711681/generating-custom-color-palette-for-julia-set
[2] http://www.cs.rit.edu/~ncs/color/t_convert.html
*/
#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <string.h>
#include <IL/il.h>
#include <IL/ilu.h>
#include <time.h>
using namespace std;
#define N 1024
#define SQRT_2 1.4142
#define MAX_ITER 512
void HSVtoRGB( float *r, float *g, float *b, float h, float s, float v );
void saveImage(int width, int height, unsigned char * bitmap, complex<float> seed);
void compute_julia(complex<float> c, unsigned char * image);
int main(int argc, char **argv)
{
complex<float> c(0.285f, 0.01f);
if(argc > 2)
{
c.real(atof(argv[1]));
c.imag(atof(argv[2]));
} else
fprintf(stderr, "Usage: %s <real> <imag>\nWhere <real> and <imag> form the complex seed for the Julia set.\n", argv[0]);
ilInit();
unsigned char *image = new unsigned char[N*N*3]; //RGB image
compute_julia(c, image);
saveImage(N, N, image, c);
delete[] image;
}
void compute_julia(complex<float> c, unsigned char * image)
{
complex<float> z_old(0.0f, 0.0f);
complex<float> z_new(0.0f, 0.0f);
for(int y=0; y<N; y++)
for(int x=0; x<N; x++)
{
z_new.real(4.0f * x / (N) - 2.0f);
z_new.imag(4.0f * y / (N) - 2.0f);
int i;
for(i=0; i<MAX_ITER; i++)
{
z_old.real(z_new.real());
z_old.imag(z_new.imag());
z_new = pow(z_new, 2);
z_new += c;
if(norm(z_new) > 4.0f) break;
}
float brightness = (i<MAX_ITER) ? 1.0f : 0.0f;
float hue = (i % MAX_ITER)/float(MAX_ITER - 1);
hue = (120*sqrtf(hue) + 150);
float r, g, b;
HSVtoRGB(&r, &g, &b, hue, 1.0f, brightness);
image[(x + y*N)*3 + 0] = (unsigned char)(b*255);
image[(x + y*N)*3 + 1] = (unsigned char)(g*255);
image[(x + y*N)*3 + 2] = (unsigned char)(r*255);
}
}
void saveImage(int width, int height, unsigned char * bitmap, complex<float> seed)
{
ILuint imageID = ilGenImage();
ilBindImage(imageID);
ilTexImage(width, height, 1, 3, IL_RGB, IL_UNSIGNED_BYTE, bitmap);
//ilEnable(IL_FILE_OVERWRITE);
char imageName[256];
sprintf(imageName, "Julia %.3f + i%.3f.png", seed.real(), seed.imag());
ilSave(IL_PNG, imageName);
fprintf(stderr, "Image saved as: %s\n", imageName);
}
// r,g,b values are from 0 to 1
// h = [0,360], s = [0,1], v = [0,1]
// if s == 0, then h = -1 (undefined)
void HSVtoRGB( float *r, float *g, float *b, float h, float s, float v )
{
int i;
float f, p, q, t;
if( s == 0 ) {
// achromatic (grey)
*r = *g = *b = v;
return;
}
h /= 60; // sector 0 to 5
i = floor( h );
f = h - i; // factorial part of h
p = v * ( 1 - s );
q = v * ( 1 - s * f );
t = v * ( 1 - s * ( 1 - f ) );
switch( i ) {
case 0:
*r = v;
*g = t;
*b = p;
break;
case 1:
*r = q;
*g = v;
*b = p;
break;
case 2:
*r = p;
*g = v;
*b = t;
break;
case 3:
*r = p;
*g = q;
*b = v;
break;
case 4:
*r = t;
*g = p;
*b = v;
break;
default: // case 5:
*r = v;
*g = p;
*b = q;
break;
}
}
And here is the corresponding GPU version (note that it is pretty unrefined at the moment, I'll do that once I'm able to get basic functionality out of it):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <string.h>
#include <IL/il.h>
#include <IL/ilu.h>
#include <time.h>
/*
References:
[1] http://stackoverflow.com/questions/23711681/generating-custom-color-palette-for-julia-set
[2] http://www.cs.rit.edu/~ncs/color/t_convert.html
*/
using namespace std;
#define N 1024
#define SQRT_2 1.4142
#define MAX_ITER 512
struct cuComplex {
float r;
float i;
__host__ __device__ cuComplex(float a, float b) : r(a), i(b) {}
__host__ __device__ float magnitude2(void) {
return r * r + i * i;
}
__host__ __device__ cuComplex operator*(const cuComplex& a) {
return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
}
__host__ __device__ cuComplex operator+(const cuComplex& a) {
return cuComplex(r + a.r, i + a.i);
}
};
void HSVtoRGB(float *r, float *g, float *b, float h, float s, float v);
void saveImage(int width, int height, unsigned char * bitmap, cuComplex seed);
void compute_julia(complex<float> c, unsigned char * image);
__global__ void compute_julia_gpu(unsigned char* image);
__device__ void HSVtoRGB_GPU(float *r, float *g, float *b, float h, float s, float v);
int main(int argc, char **argv)
{
cuComplex c(-0.8f, 0.156f);
/*
if (argc > 2)
{
c.real(atof(argv[1]));
c.imag(atof(argv[2]));
}*/
fprintf(stderr, "Usage: %s <real> <imag>\nWhere <real> and <imag> form the complex seed for the Julia set.\n", argv[0]);
ilInit();
dim3 grid(N, N);
unsigned char *image = new unsigned char[N*N * 3]; //RGB image
size_t size = sizeof(image);
unsigned char *d_image; //RGB image
cudaMalloc((void **)&d_image, size);
compute_julia_gpu<<<grid, 1>>>(d_image);
cudaMemcpy(image, d_image, size, cudaMemcpyDeviceToHost);
saveImage(N, N, image, c);
cudaFree(d_image);
delete[] image;
}
__global__ void compute_julia_gpu(unsigned char* image) {
/*
complex<float> z_old(0.0f, 0.0f);
complex<float> z_new(0.0f, 0.0f);
complex<float> c(-0.8f, 0.156f);
*/
cuComplex z_old(0.0, 0.0);
cuComplex z_new(0.0, 0.0);
cuComplex c(-0.8f, 0.156f);
int x = blockIdx.x;
int y = blockIdx.y;
z_new.r = (4.0f * x / (N)-2.0f);
z_new.i = (4.0f * y / (N)-2.0f);
int i = 0;
for (i = 0; i<MAX_ITER; i++)
{
z_old.r = z_new.r;
z_old.i = z_new.i;
z_new = (z_new * z_new) + c;
if (z_new.magnitude2() > 4.0f) break;
}
float brightness = (i<MAX_ITER) ? 1.0f : 0.0f;
float hue = (i % MAX_ITER) / float(MAX_ITER - 1);
hue = (120 * sqrtf(hue) + 150);
float r, g, b;
HSVtoRGB_GPU(&r, &g, &b, hue, 1.0f, brightness);
image[(x + y*N) * 3 + 0] = (unsigned char)(b * 255);
image[(x + y*N) * 3 + 1] = (unsigned char)(g * 255);
image[(x + y*N) * 3 + 2] = (unsigned char)(r * 255);
}
void saveImage(int width, int height, unsigned char * bitmap, cuComplex seed)
{
ILuint imageID = ilGenImage();
ilBindImage(imageID);
ilTexImage(width, height, 1, 3, IL_RGB, IL_UNSIGNED_BYTE, bitmap);
//ilEnable(IL_FILE_OVERWRITE);
char imageName[256];
sprintf(imageName, "Julia %.3f + i%.3f.png", seed.r, seed.i);
ilSave(IL_PNG, imageName);
fprintf(stderr, "Image saved as: %s\n", imageName);
}
__device__ void HSVtoRGB_GPU(float *r, float *g, float *b, float h, float s, float v)
{
int i;
float f, p, q, t;
if (s == 0) {
// achromatic (grey)
*r = *g = *b = v;
return;
}
h /= 60; // sector 0 to 5
i = floor(h);
f = h - i; // factorial part of h
p = v * (1 - s);
q = v * (1 - s * f);
t = v * (1 - s * (1 - f));
switch (i) {
case 0:
*r = v;
*g = t;
*b = p;
break;
case 1:
*r = q;
*g = v;
*b = p;
break;
case 2:
*r = p;
*g = v;
*b = t;
break;
case 3:
*r = p;
*g = q;
*b = v;
break;
case 4:
*r = t;
*g = p;
*b = v;
break;
default: // case 5:
*r = v;
*g = p;
*b = q;
break;
}
}
Any help is appreciated, thanks.
The problem is your size variable:
#include <iostream>
#include <string.h>
using namespace std;
#define N 1024
int main() {
unsigned char *image = new unsigned char[N*N * 3]; //RGB image
size_t size = sizeof(image);
cout << size;
return 0;
}
in this case the ouput is 4 (on a 32 bit architecture) because sizeof returns the size of the variable's type. In this case it is unsigned char * and is 4 byte long.
You can run cuda-memcheck ./yourExecuteable and you will see errors when your code performs out of bounds access to the GPU's global memory. You will see a lot of errors because you are allocating only 4 bytes of global memory for your d_image array.

Passing arrays to Cuda

Hallo,
I am new to CUDA and im trying to copy an array of data into the CUDA kernel. Im not sure what I am doing wrong and could really do with some pointers in the right direction.
My UpdatePixel function works if I dont use the array to set the data. If I set colour.x to 1 my whole screen goes red. If I use m_dataPtr[index] as colour.x only a few pixels towards the bottom of the screen goes red (less then 5 pixels). I have attached the cuda code an the c++ code that I think would be relewant. As the code works fine with colour.x = 1, i suspect its the copy//allocation part of the cuda code that is broken?
CUDA:
#include <cutil_inline.h>
#include <cutil_math.h>
__constant__ float* m_dataPtr;
__device__ unsigned int rgbaFloatToInt_new(float4 rgba)
{
rgba.x = __saturatef(rgba.x); // clamp to [0.0, 1.0]
rgba.y = __saturatef(rgba.y);
rgba.z = __saturatef(rgba.z);
rgba.w = __saturatef(rgba.w);
return (uint(rgba.w*255)<<24) | (uint(rgba.z*255)<<16) | (uint(rgba.y*255)<<8) | uint(rgba.x*255);
}
__global__ void UpdatePixel(unsigned int *outputImage, unsigned int imageW, unsigned int imageH)
{
uint x = blockIdx.x*blockDim.x + threadIdx.x;
uint y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int index = y * imageW + x;
float data = m_dataPtr[index];
float4 colour;
colour.x = data;
colour.y = 0;
colour.z = 0;
colour.w = 1;
outputImage[index] = rgbaFloatToInt_new(colour);
}
extern "C" void UpdateImage(dim3 gridSize, dim3 blockSize,uint *d_output, uint imageW, uint imageH)
{
UpdatePixel<<<gridSize, blockSize>>>( d_output, imageW, imageH);
}
extern "C" void AllocateData(size_t dataSize)
{
cudaFree(m_dataPtr);
cutilSafeCall( cudaMalloc((void**)&m_dataPtr, dataSize) );
}
extern "C" void CopyData(float *dataPtr, size_t dataSize)
{
cutilSafeCall( cudaMemcpy(m_dataPtr, dataPtr, dataSize, cudaMemcpyHostToDevice ) );
}
C++:
float *pixelData = new float[imageWidth * imageHeight];
unsigned int pixelDataSize = (sizeof(float) * imageWidth * imageHeight);
for(unsigned int x = 0; x < imageWidth; x++)
{
for(unsigned int y = 0; y < imageHeight; y++)
{
unsigned int idx = imageWidth * y + x;
pixelData[idx] = 1;
}
}
AllocateData(pixelDataSize);
CopyData(pixelData, pixelDataSize);
If you are using constant memory on the gpu, you will need to use cudaMemcpyToSymbol rather than cudaMemcpy.