Unexpected CPU utilization with OpenCL - c++

I've written a simple OpenCL kernel to calculate the cross-correlation of two images on the GPU. However, when I execute the kernel with enqueueNDRangeKernel, the CPU usage of one core rises to 100%, even though the host code does nothing except wait for the enqueued command to finish. Is this normal behavior of an OpenCL program? What is going on here?
OpenCL kernel (if relevant):
kernel void cross_correlation(global double *f,
                              global double *g,
                              global double *res) {
    // This work item will compute the cross-correlation value for pixel w
    const int2 w = (int2)(get_global_id(0), get_global_id(1));

    // Main loop
    int xy_index = 0;
    int xy_plus_w_index = w.x + w.y * X;
    double integral = 0;
    for ( int y = 0; y + w.y < Y; ++y ) {
        for ( int x = 0; x + w.x < X; ++x, ++xy_index, ++xy_plus_w_index ) {
            // xy_index is equal to x + y * X
            // xy_plus_w_index is equal to (x + w.x) + (y + w.y) * X
            integral += f[xy_index] * g[xy_plus_w_index];
        }
        xy_index += w.x;
        xy_plus_w_index += w.x;
    }

    res[w.x + w.y * X] = integral;
}
The images f, g, res have a size of X times Y pixels, where X and Y are set at compile time. I'm testing the above kernel with X = 2048 and Y = 2048.
Additional info: I am running the kernel on a Nvidia GPU with OpenCL version 1.2. The C++ program is written using the OpenCL C++ Wrapper API and executed on Debian using optirun from the bumblebee package.
As requested, here is a minimal working example:
#include <CL/cl.hpp>
#include <sstream>
#include <fstream>

using namespace std;

int main ( int argc, char **argv ) {
    const int X = 2048;
    const int Y = 2048;

    // Create context
    cl::Context context ( CL_DEVICE_TYPE_GPU );

    // Read kernel from file
    ifstream kernel_file ( "cross_correlation.cl" );
    stringstream buffer;
    buffer << kernel_file.rdbuf ( );
    string kernel_code = buffer.str ( );

    // Build kernel
    cl::Program::Sources sources;
    sources.push_back ( { kernel_code.c_str ( ), kernel_code.length ( ) } );
    cl::Program program ( context, sources );
    program.build ( " -DX=2048 -DY=2048" );

    // Allocate buffer memory
    cl::Buffer fbuf ( context, CL_MEM_READ_WRITE, X * Y * sizeof(double) );
    cl::Buffer gbuf ( context, CL_MEM_READ_WRITE, X * Y * sizeof(double) );
    cl::Buffer resbuf ( context, CL_MEM_WRITE_ONLY, X * Y * sizeof(double) );

    // Create command queue
    cl::CommandQueue queue ( context );

    // Create kernel
    cl::Kernel kernel ( program, "cross_correlation" );
    kernel.setArg ( 0, fbuf );
    kernel.setArg ( 1, gbuf );
    kernel.setArg ( 2, resbuf );

    // Set input arguments
    double *f = new double[X*Y];
    double *g = new double[X*Y];
    for ( int i = 0; i < X * Y; i++ )
        f[i] = g[i] = 0.001 * i;
    queue.enqueueWriteBuffer ( fbuf, CL_TRUE, 0, X * Y * sizeof(double), f );
    queue.enqueueWriteBuffer ( gbuf, CL_TRUE, 0, X * Y * sizeof(double), g );

    // Execute kernel
    queue.enqueueNDRangeKernel ( kernel, cl::NullRange, cl::NDRange ( X, Y ), cl::NullRange, NULL, NULL );
    queue.finish ( );

    return 0;
}

You don't say how you call enqueueNDRangeKernel, which is the critical bit. As I understand it, for NVIDIA the call is blocking (although I don't think the standard requires it to be).
You can get around this by having a separate thread invoke enqueueNDRangeKernel and let that thread block on it while your other threads continue; the blocking thread can signal an event when it completes.
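A minimal sketch of that approach, assuming the cl.hpp wrapper from the question and C++11's <future>; the helper name run_kernel_async is illustrative, not part of any API:

#include <future>

// Sketch: run the (possibly blocking) enqueue + finish on a worker thread so
// the main thread is free to do other work; wait on the future when the
// result is needed.
std::future<void> run_kernel_async(cl::CommandQueue &queue, cl::Kernel &kernel,
                                   int X, int Y) {
    return std::async(std::launch::async, [&queue, &kernel, X, Y] {
        queue.enqueueNDRangeKernel(kernel, cl::NullRange,
                                   cl::NDRange(X, Y), cl::NullRange);
        queue.finish();   // this thread may busy-wait here, not the caller
    });
}

// Usage:
//   auto done = run_kernel_async(queue, kernel, X, Y);
//   ... other host work ...
//   done.wait();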
There's a discussion on it here - and it raises some caveats about having multiple calls to the enqueue occurring in parallel.

Related

Why is OpenCL nested loop only working for some elements

I am trying to implement the following loop in an OpenCL kernel.
for (i = 0; i < N; i++)
    for (j = 0; j < M; j++)
        weights[i*M+j] += gradients[i] * input[j];
This is my kernel. I am currently hardcoding M to be 4 and it is only working for the first 4 elements.
__kernel
void cwk3( __global float *gradients, __global float *inputs, __global float *weights)
{
    // The global id tells us the index of the vector for this thread.
    int gid1 = get_global_id(0);
    int gid2 = get_global_id(1);

    // Perform the addition.
    weights[(gid1 * 4) + gid2] += gradients[gid1] * inputs[gid2];
}
The relevant c++ code is
float
    *gradients = (float*) malloc( N*sizeof(float) ),
    *inputs    = (float*) malloc( M*sizeof(float) ),
    *weights   = (float*) malloc( N*M*sizeof(float) );

initialiseArrays( gradients, inputs, weights, N, M );

cl_mem deviceGradients = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         N*sizeof(float), gradients, &status );
cl_mem deviceInputs    = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         M*sizeof(float), inputs, &status );
cl_mem deviceWeights   = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                         N*M*sizeof(float), weights, &status );

cl_kernel kernel = compileKernelFromFile( "kernel.cl", "cwk3", context, device );

status = clSetKernelArg( kernel, 0, sizeof(deviceGradients), &deviceGradients );
status = clSetKernelArg( kernel, 1, sizeof(deviceInputs), &deviceInputs );
status = clSetKernelArg( kernel, 2, sizeof(deviceWeights), &deviceWeights );

size_t indexSpaceSize[2], workGroupSize[1];
indexSpaceSize[0] = N;
indexSpaceSize[1] = M;
workGroupSize[0] = 4;

status = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, indexSpaceSize, workGroupSize, 0, NULL, NULL );
if( status != CL_SUCCESS )
{
    printf( "Failure enqueuing kernel: Error %d.\n", status );
    return EXIT_FAILURE;
}

status = clEnqueueReadBuffer( queue, deviceWeights, CL_TRUE, 0, N*M*sizeof(float), weights, 0, NULL, NULL );
if( status != CL_SUCCESS )
{
    printf( "Could not copy device data to host: Error %d.\n", status );
    return EXIT_FAILURE;
}
This simply creates the buffers and copies them to the GPU, launches the kernel, and then reads the answer back from the GPU to the CPU. N and M are read in as command-line arguments; I am currently setting them both to 4 for testing.
You seem to be confused about global and local work sizes.
The global work size specifies the total number of kernel invocations (work items) executed.
global_work_size = [M, N] will call the kernel M x N times in total. Each work item can determine its own position with get_global_id. Conceptually, OpenCL implements this as something like:
for (i = 0; i < N; i++)
    for (j = 0; j < M; j++)
        call_kernel(set global_id = [i, j])
Local work groups describe how the launched work items (which are created according to the global size) are grouped together so that they can synchronize and share local memory between themselves. You don't use or need any of those features here, so ignore the local work size.
So to implement your for loop in OpenCL:
for (i = 0; i < N; i++)
    for (j = 0; j < M; j++)
        weights[i*M+j] += gradients[i] * input[j];
You would have this kernel:
__kernel
void cwk3( __global float *gradients, __global float *inputs, __global float *weights)
{
    // Dimension 0 runs over the M inputs, dimension 1 over the N gradients.
    int j = get_global_id(0);
    int i = get_global_id(1);
    int M = get_global_size(0);
    weights[(i * M) + j] += gradients[i] * inputs[j];
}
And call it like this:
size_t global_work[2];
global_work[0] = M;
global_work[1] = N;
// This is a 2D kernel, not 1D.
// Offsets are 0 and the global work size is M x N.
// The local work size is passed as NULL so the runtime picks one.
status = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work, NULL, 0, NULL, NULL );
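As a sanity check (a sketch, not part of the original answer), you can read the result back into a separate buffer and compare it with the CPU loop; this assumes the host arrays gradients, inputs and weights still hold their original values and that math.h is included for fabsf:

float *result = (float*) malloc( N*M*sizeof(float) );
status = clEnqueueReadBuffer( queue, deviceWeights, CL_TRUE, 0,
                              N*M*sizeof(float), result, 0, NULL, NULL );
for( int i = 0; i < N; i++ )
    for( int j = 0; j < M; j++ )
    {
        // Reference value from the original CPU loop.
        float expected = weights[i*M+j] + gradients[i] * inputs[j];
        if( fabsf( result[i*M+j] - expected ) > 1e-5f )
            printf( "Mismatch at (%d,%d): %f vs %f\n", i, j, result[i*M+j], expected );
    }
free( result );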

How to use linear indexes on cv::cuda::PtrStepSzf data

I'm working with OpenCV 3.1 cv::cuda template matching, but the cv::cuda::minMaxLoc() function is too slow for my case. My match results have a minimum size of 128x128 and a maximum size of up to 512x512. On average, minMaxLoc() takes 1.65 ms for 128x128 and up to 25 ms for something like 350x350, which is too long since this is done hundreds of times.
I understand that my match sizes are maybe too small for what you usually run on a GPU, but I want to test along the lines of what Robert Crovella did in thrust::max_element slow in comparison cublasIsamax - More efficient implementation? to see if I can get better performance.
My problem is that in all those reductions the data is read using linear indexes, and cv::cuda::PtrStepSzf does not allow this (at least I did not find how). I tried to reshape my match result, but I cannot do that since the data is not contiguous. Do I need to go toward cudaMallocPitch and cudaMemcpy2D? If that is the case, how do I do that with a cv::cuda::GpuMat read as a cv::cuda::PtrStepSzf object?
__global__ void minLoc(const cv::cuda::PtrStepSzf data,
                       float* minVal,
                       float* minValLoc
                       )
{
    int dsize = data.cols * data.rows;
    __shared__ volatile T vals[nTPB];
    __shared__ volatile int idxs[nTPB];
    __shared__ volatile int last_block;
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    last_block = 0;
    T my_val = FLOAT_MIN;
    int my_idx = -1;
    // sweep from global memory
    while (idx < dsize)
    {
        // data(idx) is an illegal call; the legal one is data(x, y)
        // How do I do it?
        if (data(idx) > my_val)
        {
            my_val = data(idx);
            my_idx = idx;
        }
        idx += blockDim.x * gridDim.x;
    }
    // ... rest of the kernel
}
void callMinLocKernel(cv::InputArray _input,
                      cv::Point minValLoc,
                      float minVal,
                      cv::cuda::Stream _stream)
{
    const cv::cuda::GpuMat input = _input.getGpuMat();
    dim3 cthreads(32, 32);
    dim3 cblocks(
        static_cast<int>(std::ceil(input.size().width /
            static_cast<double>(cthreads.x))),
        static_cast<int>(std::ceil(input.size().height /
            static_cast<double>(cthreads.y))));

    // code that creates and uploads d_min, d_minLoc
    float h_min = 9999;
    int h_minLoc = -1;
    float* d_min = 0;
    int* d_minLoc = 0;
    // gpuErrchk is defined elsewhere
    gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
    gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );
    gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );

    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
    minLoc<<<cblocks, cthreads, 0, stream>>>(input, d_min, d_minLoc);
    gpuErrchk(cudaGetLastError());

    // code to read the answer
    gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );
    minValLoc = cv::Point(h_minLoc / input.cols, h_minLoc % input.cols);
    minVal = h_min;
}
int main()
{
    // read background and template
    cv::Mat input = imread("cat.jpg", 0);
    cv::Mat templ = imread("catNose.jpg", 0);

    // convert to floats
    cv::Mat float_input, float_templ;
    input.convertTo(float_input, CV_32FC1);
    templ.convertTo(float_templ, CV_32FC1);

    // upload background and template to the GPU
    cv::cuda::GpuMat d_src, d_templ, d_match;
    Size size = float_input.size();
    d_src.upload(float_input);
    d_templ.upload(float_templ);

    double min_val, max_val;
    Point min_loc, max_loc;

    Ptr<cv::cuda::TemplateMatching> alg = cuda::createTemplateMatching(d_src.type(), CV_TM_SQDIFF);
    alg->match(d_src, d_templ, d_match);
    cv::cuda::normalize(d_match, d_match);

    // Too slow
    // cv::cuda::minMaxLoc(d_match, &min_val, &max_val, &min_loc, &max_loc);
    callMinLocKernel(d_match, min_loc, min_val, cv::cuda::Stream::Null());
    return 0;
}
I did not find a way to actually use linear indexes with cv::cuda::PtrStepSzf; I am not sure there is one. It looks like when this format is used it can only be accessed with two subscripts. Instead, I used the pointer ptr of the cv::cuda::GpuMat input variable in the kernel wrapper, as follows:
#define nTPB 1024
#define FLOAT_MAX 9999.0f

void callMinLocKernel(cv::InputArray _input,
                      cv::Point minValLoc,
                      float minVal,
                      cv::cuda::Stream _stream)
{
    const cv::cuda::GpuMat input = _input.getGpuMat();
    const float* linSrc = input.ptr<float>();
    size_t step = input.step;

    dim3 cthreads(nTPB);
    dim3 cblocks(
        static_cast<int>(std::ceil(input.size().width * input.size().height /
            static_cast<double>(nTPB))));

    // code that creates and uploads d_min, d_minLoc
    float h_min = 9999;
    int h_minLoc = -1;
    float* d_min = 0;
    int* d_minLoc = 0;
    // gpuErrchk is defined elsewhere
    gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
    gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );
    gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );

    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
    minLoc<<<cblocks, cthreads, 0, stream>>>(linSrc, step, input.size(), d_min, d_minLoc);
    gpuErrchk(cudaGetLastError());

    // code to read the answer
    gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );
    minValLoc = cv::Point(h_minLoc / input.cols, h_minLoc % input.cols);
    minVal = h_min;
}
And inside the Kernel as:
__global__ void minLoc(const float* data,
                       const size_t step,
                       cv::Size dataSz,
                       float* minVal,
                       int* minValLoc
                       )
{
    __shared__ volatile float vals[nTPB];
    __shared__ volatile int idxs[nTPB];
    __shared__ volatile int last_block;
    int idx = threadIdx.x + blockDim.x * blockIdx.x;
    const int dsize = dataSz.height * dataSz.width;
    last_block = 0;
    float my_val = FLOAT_MAX;
    int my_idx = -1;
    // sweep from global memory
    while (idx < dsize)
    {
        int row = idx / dataSz.width;
        int id = ( row * step / sizeof( float ) ) + idx % dataSz.width;
        if ( data[id] < my_val )
        {
            my_val = data[id];
            my_idx = idx;
        }
        idx += blockDim.x * gridDim.x;
    }
    // ... rest of the kernel
}
The step is in bytes, so it needs to be divided by sizeof() of the element type (float here).
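An alternative (a sketch along the lines of the cudaMemcpy2D route mentioned in the question, not something used in this answer) is to pack the pitched GpuMat into a contiguous device buffer first, after which plain linear indexes work:

// Sketch: copy the pitched GpuMat 'input' into a contiguous device buffer so
// a reduction kernel can use data[idx] directly, with idx in [0, rows*cols).
float *d_contig = nullptr;
size_t rowBytes = input.cols * sizeof(float);
gpuErrchk( cudaMalloc((void**)&d_contig, rowBytes * input.rows) );
gpuErrchk( cudaMemcpy2D(d_contig, rowBytes,              // dst, dst pitch
                        input.ptr<float>(), input.step,  // src, src pitch
                        rowBytes, input.rows,            // width in bytes, height
                        cudaMemcpyDeviceToDevice) );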
I hope this helps!

out of stack space error ( stack overflow error)

I'm trying to calculate a matrix multiplication of size N (square matrices), but I'm getting a stack overflow error (I'm new to CUDA):
If I test the code with N < 300 everything is fine, but with N > 300 it does not work and a stack overflow error is displayed, even though there is enough memory on my graphics card (GF 820M).
If N = 300, then 300 * 300 * 4 (size of float) = 360000 bytes is the space needed on the device for one array of floats, and three such arrays are needed for the multiplication, so 360000 * 3 = 1080000 bytes; when I check the cudaMalloc calls, no error is reported.
My main goal is to test with N large enough. How do I solve this? Thank you in advance for any help you might be able to provide.
#include <stdio.h>
#include <device_launch_parameters.h>
#include <cuda.h>
#include <time.h>
#include <cuda_runtime.h>
#include <math.h>

__global__ void MatrixMul( float *Md, float *Nd, float *Pd, const int WIDTH )
{
    // calculate thread id
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    for ( int k = 0; k < WIDTH; k++ )
    {
        Pd[row * WIDTH + col] += Md[row * WIDTH + k] * Nd[k * WIDTH + col];
    }
}
int main ()
{
    const int i = 64;
    cudaEvent_t start, stop;
    float time;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    const int WIDTH = 300;
    cudaError_t cudaStatus;

    float array1_h[WIDTH][WIDTH], array2_h[WIDTH][WIDTH], M_result_array_h[WIDTH][WIDTH];
    float *array1_d, *array2_d, *M_result_array_d; // device arrays

    // Allocate GPU buffers (two input, one output)
    cudaStatus = cudaMalloc((void **) &array1_d, WIDTH*WIDTH*sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }
    cudaStatus = cudaMalloc((void **) &array2_d, WIDTH*WIDTH*sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }

    for ( int i = 0; i < WIDTH; i++ ) {
        for ( int j = 0; j < WIDTH; j++ ) {
            array1_h[i][j] = 1;
            array2_h[i][j] = 2;
        }
    }

    // copy host arrays to device arrays; cudaMemcpy ( dest, source, size, direction )
    cudaMemcpy( array1_d, array1_h, WIDTH*WIDTH*sizeof(float), cudaMemcpyHostToDevice );
    cudaMemcpy( array2_d, array2_h, WIDTH*WIDTH*sizeof(float), cudaMemcpyHostToDevice );

    // allocating memory for the resulting device array
    cudaStatus = cudaMalloc((void **) &M_result_array_d, WIDTH*WIDTH*sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
    }

    // calling kernel
    dim3 dimBlock( i, i, 1 );
    dim3 dimGrid( ((WIDTH-1)/i) + 1, ((WIDTH-1)/i) + 1, 1 );

    cudaEventRecord(start, 0);
    MatrixMul <<<dimGrid, dimBlock>>> ( array1_d, array2_d, M_result_array_d, WIDTH );
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("problem size: %d  Time for the kernel: %f \n", WIDTH, time);

    // copy result_array_d back to result_array_h
    cudaMemcpy(M_result_array_h, M_result_array_d, WIDTH*WIDTH*sizeof(float), cudaMemcpyDeviceToHost);

    // print the result array
    for (int i = 0; i < WIDTH; i++) {
        for (int j = 0; j < WIDTH; j++) {
            printf("%f ", M_result_array_h[i][j]);
        }
        printf("\n");
    }

    cudaFree(array1_d);
    cudaFree(array2_d);
    cudaFree(M_result_array_h);

    system("pause");
}
The stack overflow problem is not CUDA related. These allocations:
float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH] ,M_result_array_h[WIDTH][WIDTH];
are created by the compiler on the stack. The stack space is limited. (This is the host code, so the stack here has nothing to do with the GPU.)
One possible approach to fix this is to create dynamic allocations for these variables, which will be made on the heap, which doesn't have the same limits as the stack.
So one possible fix is to replace this:
float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH] ,M_result_array_h[WIDTH][WIDTH];
with this:
typedef float ar_type[WIDTH];
ar_type *array1_h, *array2_h, *M_result_array_h;
array1_h = (ar_type *)malloc(WIDTH*WIDTH*sizeof(float));
array2_h = (ar_type *)malloc(WIDTH*WIDTH*sizeof(float));
M_result_array_h = (ar_type *)malloc(WIDTH*WIDTH*sizeof(float));
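Since .cu files are compiled as C++, std::vector is another way to get heap storage (a sketch, not part of the original suggestion); element (i, j) is then addressed as v[i*WIDTH + j] and v.data() is what gets passed to cudaMemcpy:

#include <vector>

// Heap-backed host arrays; contents are value-initialized to 0 by default.
std::vector<float> array1_h(WIDTH * WIDTH);
std::vector<float> array2_h(WIDTH * WIDTH);
std::vector<float> M_result_array_h(WIDTH * WIDTH);

// e.g. array1_h[i * WIDTH + j] = 1;
// cudaMemcpy(array1_d, array1_h.data(), WIDTH*WIDTH*sizeof(float), cudaMemcpyHostToDevice);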
Also note that this:
const int i=64 ;
...
dim3 dimBlock( i,i, 1 ) ;
is not valid. You are requesting a 64x64 threadblock (4096 threads total) and this is not legal for any CUDA GPU. You can fix this particular issue by changing i to 32.
After fixing that, it seems that your kernel has no thread-check to prevent out-of-bounds threads from executing and generating out-of-bounds accesses. You can fix that by adding this thread-check immediately before the for-loop in your kernel:
if ((row < WIDTH) && (col < WIDTH))
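For clarity, here is the question's kernel with that guard in place (a sketch; the body is otherwise unchanged):

__global__ void MatrixMul( float *Md, float *Nd, float *Pd, const int WIDTH )
{
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y;
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x;
    // Skip threads that fall outside the matrix.
    if ((row < WIDTH) && (col < WIDTH)) {
        for (int k = 0; k < WIDTH; k++)
            Pd[row * WIDTH + col] += Md[row * WIDTH + k] * Nd[k * WIDTH + col];
    }
}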
Finally, this line has a typo:
cudaFree(M_result_array_h);
I think you meant:
cudaFree(M_result_array_d);
You can discover these other errors (2-4) if you add proper CUDA error checking to your code and/or run your code with cuda-memcheck.
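For reference, the gpuErrchk macro used in other snippets on this page is commonly defined along these lines (a sketch of the usual pattern, not code from this question), and cuda-memcheck is simply run against the built binary:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Wrap every CUDA runtime call; print and abort on the first error.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Usage: gpuErrchk( cudaMalloc((void **)&array1_d, WIDTH*WIDTH*sizeof(float)) );
// Shell: cuda-memcheck ./your_program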
Use rtContextGetStackSize/rtContextSetStackSize to find out how large your stack is and set it larger if needed.
Keep in mind that the memory on your graphics card is shared with other graphics processes and you can't use all of it.
Furthermore, you can partition your matrix and compute a partitioned matrix multiplication block by block, instead of multiplying the entire matrices at once.

CUDA, "illegal memory access was encountered" in Memcpy

I have this cuda file:
#include "cuda.h"
#include "../../HandleError.h"
#include "Sphere.hpp"
#include <stdlib.h>
#include <CImg.h>
#define WIDTH 1280
#define HEIGHT 720
#define rnd(x) (x*rand()/RAND_MAX)
#define SPHERES_COUNT 5
using namespace cimg_library;
__global__
void kernel(unsigned char* bitmap, Sphere* s)
{
    // Map threadIdx/blockIdx to pixel position
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;
    float ox = x - blockDim.x * gridDim.x / 2;
    float oy = y - blockDim.y * gridDim.y / 2;

    float r = 0.2, g = 0.2, b = 0.5;
    float maxz = -INF;
    for (int i = 0; i < SPHERES_COUNT; i++) {
        float n, t = s[i].hit(ox, oy, &n);
        if (t > maxz) {
            float fscale = n;
            r = s[i].r * fscale;
            g = s[i].g * fscale;
            b = s[i].b * fscale;
            maxz = t;
        }
    }

    bitmap[offset*3]     = (int)(r * 255);
    bitmap[offset*3 + 1] = (int)(g * 255);
    bitmap[offset*3 + 2] = (int)(b * 255);
}
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
    // Capture start time
    cudaEvent_t start, stop;
    HANDLE_ERROR(cudaEventCreate(&start));
    HANDLE_ERROR(cudaEventCreate(&stop));
    HANDLE_ERROR(cudaEventRecord(start, 0));

    // Create host bitmap
    CImg<unsigned char> image(WIDTH, HEIGHT, 1, 3);
    image.permute_axes("cxyz");

    // Allocate device bitmap data
    unsigned char* dev_bitmap;
    HANDLE_ERROR(cudaMalloc((void**)&dev_bitmap, image.size()*sizeof(unsigned char)));

    // Generate spheres and copy them to the GPU one by one
    Sphere* temp_s = (Sphere*)malloc(SPHERES_COUNT*sizeof(Sphere));
    for (int i = 0; i < SPHERES_COUNT; i++) {
        temp_s[i].r = rnd(1.0f);
        temp_s[i].g = rnd(1.0f);
        temp_s[i].b = rnd(1.0f);
        temp_s[i].x = rnd(1000.0f) - 500;
        temp_s[i].y = rnd(1000.0f) - 500;
        temp_s[i].z = rnd(1000.0f) - 500;
        temp_s[i].radius = rnd(100.0f) + 20;
    }
    HANDLE_ERROR(cudaMemcpyToSymbol(s, temp_s, sizeof(Sphere)*SPHERES_COUNT));
    free(temp_s);

    // Generate a bitmap from sphere data
    dim3 grids(WIDTH/16, HEIGHT/16);
    dim3 threads(16, 16);
    kernel<<<grids, threads>>>(dev_bitmap, s);

    // Copy the bitmap back from the GPU for display
    HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
                            image.size()*sizeof(unsigned char),
                            cudaMemcpyDeviceToHost));

    cudaFree(dev_bitmap);
    image.permute_axes("yzcx");
    image.save("render.bmp");
}
It compiles fine, but when executed I get this error:
an illegal memory access was encountered in main.cu at line 82
that is, here:
//Copy the bitmap back from the GPU for display
HANDLE_ERROR(cudaMemcpy(image.data(), dev_bitmap,
image.size()*sizeof(unsigned char),
cudaMemcpyDeviceToHost));
I cannot understand why...
I know that if I remove this:
bitmap[offset*3] = (int)(r * 255);
bitmap[offset*3 + 1] = (int)(g * 255);
bitmap[offset*3 + 2] = (int)(b * 255);
The error is not reported, so I thought it might be an out-of-bounds access reported later, but I have an identical version of this program that makes no use of constant memory, and it works fine with the very same version of the kernel function...
There are two things at issue here. The first is this:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
......
kernel<<<grids, threads>>>(dev_bitmap, s);
......
In host code, s is a host memory variable which provides a handle for the CUDA runtime to hook up with the device constant memory symbol. It doesn't contain a valid device pointer and can't be passed to kernel calls. The result is an invalid memory access error.
You could do this:
__constant__ Sphere s[SPHERES_COUNT];
int main ()
{
......
Sphere *d_s;
cudaGetSymbolAddress((void **)&d_s, s);
kernel<<<grids, threads>>>(dev_bitmap, d_s);
......
which would cause a symbol lookup to get the device address of s, and it would be valid to pass that to the kernel. However, the GPU relies on the compiler emitting specific instructions to access memory through the constant cache. The device compiler will only emit these instructions when it can detect that a __constant__ variable is being accessed within a kernel, which is not possible when using a pointer. You can see more about how the compiler will generate code for constant variable access in this Stack Overflow question and answer.
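Alternatively (a sketch based on the question's kernel, not code from the original answer), you can drop the pointer parameter entirely and reference the file-scope __constant__ array inside the kernel, which lets the compiler emit constant-cache loads:

__constant__ Sphere s[SPHERES_COUNT];

__global__ void kernel(unsigned char* bitmap)   // no Sphere* parameter
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;
    float ox = x - blockDim.x * gridDim.x / 2;
    float oy = y - blockDim.y * gridDim.y / 2;

    float r = 0.2f, g = 0.2f, b = 0.5f;
    float maxz = -INF;
    for (int i = 0; i < SPHERES_COUNT; i++) {
        // 's' here refers to the __constant__ symbol, so the compiler can
        // use the constant cache for these reads.
        float n, t = s[i].hit(ox, oy, &n);
        if (t > maxz) {
            r = s[i].r * n;
            g = s[i].g * n;
            b = s[i].b * n;
            maxz = t;
        }
    }
    bitmap[offset*3]     = (int)(r * 255);
    bitmap[offset*3 + 1] = (int)(g * 255);
    bitmap[offset*3 + 2] = (int)(b * 255);
}

// The launch then becomes: kernel<<<grids, threads>>>(dev_bitmap);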

CUDA -- simple code but some of my warps don't run

EDIT: As I was re-reading my own question, I figured it out.
The root of the problem is most likely that I didn't allocate enough memory. I will try to think it through, do it correctly, and then answer my own question. Silly me. :-[ It doesn't explain the warps not showing up in stdout though...
Original question
I created a templated kernel in CUDA in which I iterate over sections of grayscale image data in global memory (shared memory optimizations are due when I get this working) to achieve morphological operations with disc-shaped structure elements. Each thread corresponds to a pixel of the image. When the data type is char, everything works as expected, all my threads do what they should. When I change it to unsigned short, it starts acting up and only computes the upper half of my image. When I put in some printfs (my device has 2.0 CC), I found out that some of the warps that should run aren't even computed.
Here's the relevant code.
From my main.cpp I call gcuda::ErodeGpuGray8(img, radius); and gcuda::ErodeGpuGray16(img, radius); which are the following functions:
// gcuda.h
…
i3d::Image3d<i3d::GRAY8> ErodeGpuGray8(i3d::Image3d<i3d::GRAY8> img, const unsigned int radius);
i3d::Image3d<i3d::GRAY16> ErodeGpuGray16(i3d::Image3d<i3d::GRAY16> img, const unsigned int radius);
…
// gcuda.cu
…
// call this from outside
Image3d<GRAY8> ErodeGpuGray8(Image3d<GRAY8> img, const unsigned int radius) {
return ErodeGpu<GRAY8>(img, radius);
}
// call this from outside
Image3d<GRAY16> ErodeGpuGray16(Image3d<GRAY16> img, const unsigned int radius) {
return ErodeGpu<GRAY16>(img, radius);
}
…
The library I'm using defines GRAY8 as char and GRAY16 as unsigned short.
Here's how I call the kernel (blockSize is a const int set to 128 in the relevant namespace):
// gcuda.cu
template<typename T> Image3d<T> ErodeGpu(Image3d<T> img, const unsigned int radius) {
    unsigned int width = img.GetWidth();
    unsigned int height = img.GetHeight();
    unsigned int w = nextHighestPower2(width);
    unsigned int h = nextHighestPower2(height);
    const size_t n = width * height;
    const size_t N = w * h;

    Image3d<T>* rslt = new Image3d<T>(img);
    T *vx = rslt->GetFirstVoxelAddr();

    // kernel parameters
    dim3 dimBlock( blockSize );
    dim3 dimGrid( ceil( N / (float)blockSize) );

    // source voxel array on device (orig)
    T *vx_d;
    // result voxel array on device (for result of erosion)
    T *vxr1_d;

    // allocate memory on device
    gpuErrchk( cudaMalloc( (void**)&vx_d, n ) );
    gpuErrchk( cudaMemcpy( vx_d, vx, n, cudaMemcpyHostToDevice ) );
    gpuErrchk( cudaMalloc( (void**)&vxr1_d, n ) );
    gpuErrchk( cudaMemcpy( vxr1_d, vx_d, n, cudaMemcpyDeviceToDevice ) );

    ErodeGpu<T><<<dimGrid, dimBlock>>>(vx_d, vxr1_d, n, width, radius);

    gpuErrchk( cudaMemcpy( vx, vxr1_d, n, cudaMemcpyDeviceToHost ) );

    // free device memory
    gpuErrchk( cudaFree( vx_d ) );
    gpuErrchk( cudaFree( vxr1_d ) );

    // for debug purposes
    rslt->SaveImage("../erodegpu.png");

    return rslt;
}
The dimensions of my testing image are 82x82, so n = 82*82 = 6724 and N = 128*128 = 16384.
This is my kernel:
// gcuda.cu
// CUDA kernel -- used for image erosion with a circular structure element of radius "erosionR"
template<typename T> __global__ void ErodeGpu(const T *in, T *out, const unsigned int n, const int width, const int erosionR)
{
    ErodeOrDilateCore<T>(ERODE, in, out, n, width, erosionR);
}

// The core of erosion or dilation. Operation is determined by the first parameter
template<typename T> __device__ void ErodeOrDilateCore(operation_t operation, const T *in, T *out, const unsigned int n, const int width, const int radius) {
    // get thread number, this method is overkill for my purposes but generally should be bulletproof, right?
    int blockId = blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
    int threadId = blockId * (blockDim.x * blockDim.y * blockDim.z) + (threadIdx.z * (blockDim.x * blockDim.y)) + (threadIdx.y * blockDim.x) + threadIdx.x;
    int tx = threadId;

    if (tx >= n) {
        printf("[%d > %d]", tx, n);
        return;
    } else {
        printf("{%d}", tx);
    }

    … (erosion implementation, stdout is the same when this is commented out so it's probably not the root of the problem)
}
To my understanding, this code should write a randomly ordered set of [X > N] and {X} strings to stdout, where X is the thread ID. There should be n curly-bracketed numbers (i.e. output from threads with index < n) and N - n of the square-bracketed ones, but when I run it and count the curly-bracketed numbers with a regex, I find that I only get 256 of them. Furthermore, they seem to occur in 32-member groups, which tells me that some warps are run and some are not.
I am really baffled by this. It doesn't help that when I don't comment out the erosion implementation part, the GRAY8 erosion works and the GRAY16 erosion doesn't, even though the stdout output is exactly the same in both cases (could be input-dependent, I only tried this with 2 images).
What am I missing? What could be the cause of this? Is there some memory-management mistake on my part or is it fine that some warps don't run and the erosion stuff is possibly just a bug in the image library that only occurs with the GRAY16 type?
So this was just a stupid malloc mistake.
Instead of
const size_t n = width * height;
const size_t N = w * h;
I used
const int n = width * height;
const int N = w * h;
and instead of the erroneous
gpuErrchk( cudaMalloc( (void**)&vx_d, n ) );
gpuErrchk( cudaMemcpy( vx_d, vx, n, cudaMemcpyHostToDevice ) );
gpuErrchk( cudaMalloc( (void**)&vxr1_d, n ) );
gpuErrchk( cudaMemcpy( vxr1_d, vx_d, n, cudaMemcpyDeviceToDevice ) );
…
gpuErrchk( cudaMemcpy( vx, vxr1_d, n, cudaMemcpyDeviceToHost ) );
I used
gpuErrchk( cudaMalloc( (void**)&vx_d, n * sizeof(T) ) );
gpuErrchk( cudaMemcpy( vx_d, vx, n * sizeof(T), cudaMemcpyHostToDevice ) );
gpuErrchk( cudaMalloc( (void**)&vxr1_d, n * sizeof(T) ) );
gpuErrchk( cudaMemcpy( vxr1_d, vx_d, n * sizeof(T), cudaMemcpyDeviceToDevice ) );
…
gpuErrchk( cudaMemcpy( vx, vxr1_d, n * sizeof(T), cudaMemcpyDeviceToHost ) );
and the erosion is working correctly now, which was the main problem I was trying to solve. I'm still not getting the stdout output I'm expecting though, so if someone could shed some light on that, please do so.