I am trying to implement the following loop in an OpenCL kernel.
for(i=0;i<N;i++) for(j=0;j<M;j++) weights[i*M+j] += gradients[i] * input[j];
This is my kernel. I am currently hardcoding M to be 4 and it is only working for the first 4 elements.
__kernel
void cwk3( __global float *gradients, __global float *inputs, __global float *weights)
{
// The global id tells us the index of the vector for this thread.
int gid1 = get_global_id(0);
int gid2 = get_global_id(1);
// Perform the addition.
weights[(gid1 * 4) + gid2] += gradients[gid1] * inputs[gid2];
}
The relevant C++ code is:
float
*gradients = (float*) malloc( N *sizeof(float) ),
*inputs = (float*) malloc( M*sizeof(float) ),
*weights = (float*) malloc( N*M*sizeof(float) );
initialiseArrays( gradients, inputs, weights, N, M );
cl_mem deviceGradients = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, N*sizeof(float), gradients
, &status );
cl_mem deviceInputs = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, M*sizeof(float), inputs
, &status );
cl_mem deviceWeights = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N*M*sizeof(float), weights
, &status );
cl_kernel kernel = compileKernelFromFile( "kernel.cl", "cwk3", context, device );
status = clSetKernelArg( kernel, 0, sizeof(deviceGradients), &deviceGradients );
status = clSetKernelArg( kernel, 1, sizeof(deviceInputs), &deviceInputs );
status = clSetKernelArg( kernel, 2, sizeof(deviceWeights), &deviceWeights );
size_t indexSpaceSize[2], workGroupSize[1];
indexSpaceSize[0] = N;
indexSpaceSize[1] = M;
workGroupSize [0] = 4;
status = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, indexSpaceSize, workGroupSize, 0, NULL, NULL );
if( status != CL_SUCCESS )
{
printf( "Failure enqueuing kernel: Error %d.\n", status );
return EXIT_FAILURE;
}
status = clEnqueueReadBuffer( queue, deviceWeights, CL_TRUE, 0, N*M*sizeof(float), weights, 0, NULL, NULL );
if( status != CL_SUCCESS )
{
printf( "Could not copy device data to host: Error %d.\n", status );
return EXIT_FAILURE;
}
This simply creates the buffers and copies them to the GPU, launches the kernel, and then reads the answer back from the GPU to the CPU. N and M are read in as command-line arguments. I am currently setting them both to 4 for testing.
You seem to be confused about global and local work groups.
Global work size specifies the total number of work items (kernel invocations) executed.
global_work_size=[M,N] will run the kernel MxN times in total. Each work item can determine its own position via get_global_id. Conceptually, OpenCL does something like this:
for(i=0;i<N;i++)        // dimension 1
    for(j=0;j<M;j++)    // dimension 0
        call_kernel(with get_global_id(0)==j, get_global_id(1)==i)
Local work groups describe how the launched work items (created according to the global sizes) are grouped together, made aware of each other, and allowed to share local memory. You don't use or need any of those features here, so ignore the local work size.
So to implement your for loop in OpenCL:
for(i=0;i<N;i++)
for(j=0;j<M;j++)
weights[i*M+j] += gradients[i] * input[j];
You would have this kernel:
__kernel
void cwk3( __global float *gradients, __global float *inputs, __global float *weights)
{
int j = get_global_id(0);    // 0 .. M-1 (dimension 0)
int i = get_global_id(1);    // 0 .. N-1 (dimension 1)
int M = get_global_size(0);  // size of dimension 0, i.e. M
weights[(i * M) + j] += gradients[i] * inputs[j];
}
And call it like this:
size_t global_work[2];
global_work[0]=M;
global_work[1]=N;
// This is a 2D kernel, not 1D
// Offsets are NULL (start at 0)
// Global work size is M x N
// Local work size is NULL: let the runtime choose
status = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work, NULL, 0, NULL, NULL );
Related
I want to transform an RGB image to a grayscale image.
My problem is that when I copy the data back, the kernel returns zeros.
OpenCL code:
__kernel void grayscale(__global uchar * input, __global uchar * output)
{
int gid = get_global_id(0);
output[gid] = 0.0722 * input[gid][0] + 0.7152 * input[gid][1] + 0.2126 * input[gid][2];
}
Host code:
void RunKernel(char fileName[], char methodName[], Mat inputImg, Mat outputImg,
char outputLoc[], int mem_size){
/*
Initialisation of the device and read the kernel source.
*/
//Creating cl_mem objects for input and output. mem_size is the image width*height
imgInMem = clCreateBuffer(img_context, CL_MEM_READ_ONLY,
mem_size * sizeof(uchar), NULL, &err);
imgOutMem = clCreateBuffer(img_context, CL_MEM_WRITE_ONLY,
mem_size * sizeof(uchar), NULL, &err);
//copy the data into cl_mem input
err = clEnqueueWriteBuffer(img_cmd_queue, imgInMem, CL_TRUE, 0, mem_size *sizeof(uchar),
&inputImg.data, 0, NULL, NULL);
//Create the program and load the kernel source to it
img_program = clCreateProgramWithSource(img_context, 1, (const char **) &kernel_src_str,
(const size_t *) &kernel_size, &err);
err = clBuildProgram(img_program, 1, &dev_id, NULL, NULL, NULL);
img_kernel = clCreateKernel(img_program, methodName, &err);
//Setting the kernel args
err = clSetKernelArg(img_kernel, 0, sizeof(cl_mem), (void *) &imgInMem);
err = clSetKernelArg(img_kernel, 1, sizeof(cl_mem), (void *) &imgOutMem);
//define the global size and local size
size_t global_work_size = mem_size;
size_t local_work_size = 256;
//Enqueue a command to execute a kernel on a device ("1" indicates 1-dim work)
err = clEnqueueNDRangeKernel(img_cmd_queue, img_kernel, 1, NULL, &global_work_size,
&local_work_size, 0, NULL, NULL);
err = clFinish(img_cmd_queue);
//Read back the result from device
err = clEnqueueReadBuffer(img_cmd_queue, imgOutMem, CL_TRUE, 0,
mem_size *sizeof(uchar), outputImg.data, 0, NULL, NULL);
/*
Release the necessary objects.
*/
}
After the clEnqueueReadBuffer, if I write the values to the console, they are all zeros. My outputImg is declared like this in the main:
Mat outImg(height,width,CV_8UC1,Scalar(0));
and call the method with this:
RunKernel("kernels/grayscale.cl","grayscale", inImg, outImg,"resources/grayscale_car_gpu.jpg", MEM_SIZE);
The problem is likely the 2D array syntax you're using:
0.0722 * input[gid][0] + 0.7152 * input[gid][1] + 0.2126 * input[gid][2]
What addresses do you think that is accessing exactly?
Instead, assuming you're trying to access sequential bytes as RGB (in BGR order, judging by the coefficient values), try:
0.0722 * input[3*gid+0] + 0.7152 * input[3*gid+1] + 0.2126 * input[3*gid+2]
You should add an "f" suffix to the float constants (otherwise they are doubles, which are not supported on all devices).
You should also add rounding from float back to uchar. So, together, something like:
convert_uchar_sat_rte(0.0722f * input[3*gid+0] +
0.7152f * input[3*gid+1] +
0.2126f * input[3*gid+2])
Finally, you're passing the same-size buffer for the input and output images, but seemingly treating the input as RGB, which needs 3x more bytes than the single byte per pixel of the monochrome output. So you'll need to fix that in the host code.
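Putting the kernel-side pieces together, the fixed kernel might look something like this (a sketch, assuming the input buffer really holds packed 3-byte BGR pixels and that the host allocates it three times larger than the output):
__kernel void grayscale(__global const uchar *input, __global uchar *output)
{
    int gid = get_global_id(0);
    // input is packed BGR: 3 bytes per output pixel
    float lum = 0.0722f * input[3 * gid + 0]   // B
              + 0.7152f * input[3 * gid + 1]   // G
              + 0.2126f * input[3 * gid + 2];  // R
    output[gid] = convert_uchar_sat_rte(lum);
}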
Any time you're getting incorrect output from a kernel, simplify it to see whether it is an input problem, a calculation problem, an output problem, or a host code issue. Keep narrowing it down until you've found your problem.
My computer has a GeForce 1080Ti. With 11GB of VRAM, I don't expect a memory issue, so I'm at a loss to explain why the following breaks my code.
I execute the kernel on the host with this code.
cl_mem buffer = clCreateBuffer(context.GetContext(), CL_MEM_READ_WRITE, n * n * sizeof(int), NULL, &error);
error = clSetKernelArg(context.GetKernel(myKernel), 1, n * n, m1);
error = clSetKernelArg(context.GetKernel(myKernel), 0, sizeof(cl_mem), &buffer);
error = clEnqueueNDRangeKernel(context.GetCommandQueue(0), context.GetKernel(myKernel), 1, NULL, 10, 10, 0, NULL, NULL);
clFinish(context.GetCommandQueue(0));
error = clEnqueueReadBuffer(context.GetCommandQueue(0), buffer, true, 0, n * n * sizeof(int), results, 0, NULL, NULL);
results is a pointer to an n-by-n int array. m1 is a pointer to an n-by-n-bit array. The variable n is divisible by 8, so we can interpret the array as a char array.
The first ten values of the array are set to 1025 by the kernel (the value isn't important):
__kernel void PopCountProduct (__global int *results)
{
results[get_global_id(0)] = 1025;
}
When I print out the result on the host, the first 10 indices are 1025. All is well and good.
Suddenly it stops working when I introduce an additional argument:
__kernel void PopCountProduct (__global int *results, __global char *m)
{
results[get_global_id(0)] = 1025;
}
Why is this happening? Am I missing something crucial about OpenCL?
You can't pass a host pointer to clSetKernelArg in OpenCL 1.2. Something similar is only possible in OpenCL 2.0+ via clSetKernelArgSVMPointer with an SVM pointer, if supported. But most probably what you need is to create a buffer object on the GPU and copy the host memory into it.
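A minimal sketch of that last approach, reusing the asker's wrapper calls (context.GetContext(), context.GetKernel(myKernel)) and assuming m1 points to n*n bits, i.e. n*n/8 bytes, of host data:
size_t mBytes = (size_t)n * n / 8;   // n*n bits packed into chars
cl_int err;
// Create a device buffer and copy the host data into it at creation time.
cl_mem mBuf = clCreateBuffer(context.GetContext(),
                             CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                             mBytes, m1, &err);
// Pass the cl_mem handle, not the raw host pointer, as the kernel argument.
err = clSetKernelArg(context.GetKernel(myKernel), 1, sizeof(cl_mem), &mBuf);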
I'm new to OpenCL.
I want to pass a vector to an OpenCL kernel using the C++ bindings.
Currently I have this in my host program:
std::vector<cl_float4> toKernel;
std::vector<cl_float3> output;
int n = 50;
//init some value in toKernel vector
//make kernel, program, command queue, etc...
cl::Buffer bufferX = cl::Buffer(
context,
CL_MEM_READ_ONLY,
vectorSize * sizeof(cl_float4)
);
cl::Buffer bufferNumber = cl::Buffer(
context,
CL_MEM_READ_ONLY,
sizeof(int)
);
cl::Buffer bufferOutput = cl::Buffer(
context,
CL_MEM_WRITE_ONLY,
vectorSize * sizeof(cl_float3)
);
queue.enqueueWriteBuffer(
bufferX,
CL_TRUE,
0,
vectorSize * sizeof(cl_float4),
toKernel.data()
);
queue.enqueueWriteBuffer(
bufferNumber,
CL_TRUE,
0,
sizeof(int),
&n
);
kernel.setArg(0, bufferX);
kernel.setArg(1, bufferNumber);
kernel.setArg(2, bufferOutput);
cl::NDRange global(vectorSize);
cl::NDRange local(1);
queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
global,
local
);
queue.enqueueReadBuffer(
bufferOutput,
CL_TRUE,
0,
vectorSize * sizeof(cl_float3),
&output
);
And in my kernel program I access each vector value like this:
__kernel void calculate(__global const float4 *x, __global const int *n, __global float3 *out){
int i = get_global_id(0);
for (int j=0; j<n; j++) {
//do some calculation using x[i] and x[j]....
}
out[i] = calc_result;
}
Is this the correct way?
I'm not sure how to output debug messages in OpenCL.
I get error -5 from clEnqueueReadBuffer when I try to read the output from the kernel.
How do you read a std::vector back from OpenCL?
-5 means CL_OUT_OF_RESOURCES. You can't pass &output (the address of the std::vector object itself) to enqueueReadBuffer. You need to pass output.data().
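A sketch of the corrected readback, assuming the kernel writes vectorSize elements; the vector also has to own enough storage before the read:
output.resize(vectorSize);   // allocate host storage for the results
queue.enqueueReadBuffer(
    bufferOutput,
    CL_TRUE,
    0,
    vectorSize * sizeof(cl_float3),
    output.data()            // raw pointer to the vector's elements, not &output
);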
I've written a simple OpenCL kernel to calculate the cross-correlation of two images on the GPU. However, when I execute the kernel with enqueueNDRangeKernel the CPU usage of one core rises to 100%, but the host code does nothing except waiting for the enqueued command to finish. Is this normal behavior of an OpenCL program? What is going on there?
OpenCL kernel (if relevant):
kernel void cross_correlation(global double *f,
global double *g,
global double *res) {
// This work item will compute the cross-correlation value for pixel w
const int2 w = (int2)(get_global_id(0), get_global_id(1));
// Main loop
int xy_index = 0;
int xy_plus_w_index = w.x + w.y * X;
double integral = 0;
for ( int y = 0; y + w.y < Y; ++y ) {
for ( int x = 0; x + w.x < X; ++x, ++xy_index, ++xy_plus_w_index ) {
// xy_index is equal to x + y * X
// xy_plus_w_index is equal to (x + w.x) + (y + w.y) * X
integral += f[xy_index] * g[xy_plus_w_index];
}
xy_index += w.x;
xy_plus_w_index += w.x;
}
res[w.x + w.y * X] = integral;
}
The images f, g, res have a size of X times Y pixels, where X and Y are set at compile time. I'm testing the above kernel with X = 2048 and Y = 2048.
Additional info: I am running the kernel on a Nvidia GPU with OpenCL version 1.2. The C++ program is written using the OpenCL C++ Wrapper API and executed on Debian using optirun from the bumblebee package.
As requested, here is a minimal working example:
#include <CL/cl.hpp>
#include <sstream>
#include <fstream>
using namespace std;
int main ( int argc, char **argv ) {
const int X = 2048;
const int Y = 2048;
// Create context
cl::Context context ( CL_DEVICE_TYPE_GPU );
// Read kernel from file
ifstream kernel_file ( "cross_correlation.cl" );
stringstream buffer;
buffer << kernel_file.rdbuf ( );
string kernel_code = buffer.str ( );
// Build kernel
cl::Program::Sources sources;
sources.push_back ( { kernel_code.c_str ( ), kernel_code.length ( ) } );
cl::Program program ( context, sources );
program.build ( " -DX=2048 -DY=2048" );
// Allocate buffer memory
cl::Buffer fbuf ( context, CL_MEM_READ_WRITE, X * Y * sizeof(double) );
cl::Buffer gbuf ( context, CL_MEM_READ_WRITE, X * Y * sizeof(double) );
cl::Buffer resbuf ( context, CL_MEM_WRITE_ONLY, X * Y * sizeof(double) );
// Create command queue
cl::CommandQueue queue ( context );
// Create kernel
cl::Kernel kernel ( program, "cross_correlation" );
kernel.setArg ( 0, fbuf );
kernel.setArg ( 1, gbuf );
kernel.setArg ( 2, resbuf );
// Set input arguments
double *f = new double[X*Y];
double *g = new double[X*Y];
for ( int i = 0; i < X * Y; i++ )
f[i] = g[i] = 0.001 * i;
queue.enqueueWriteBuffer ( fbuf, CL_TRUE, 0, X * Y * sizeof(double), f );
queue.enqueueWriteBuffer ( gbuf, CL_TRUE, 0, X * Y * sizeof(double), g );
// Execute kernel
queue.enqueueNDRangeKernel ( kernel, cl::NullRange, cl::NDRange ( X, Y ), cl::NullRange, NULL, NULL );
queue.finish ( );
return 0;
}
You don't say how you call enqueueNDRangeKernel - which is the critical bit. As I understand it, for NVidia, the call is blocking (although I don't think it's part of the standard that it should be so).
You can get around this by having a separate thread invoke enqueueNDRangeKernel and letting that thread block on it whilst your other threads continue; the blocking thread can signal an event when it completes.
There's a discussion on it here - and it raises some caveats about having multiple calls to the enqueue occurring in parallel.
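A rough sketch of that idea with the C++ wrapper, using a std::future as the completion signal (illustrative only; queue, kernel, X and Y are the ones from the example above):
#include <future>

std::future<void> done = std::async(std::launch::async, [&]() {
    // The enqueue (and the finish) may busy-wait on some drivers,
    // but only this worker thread is tied up by it.
    queue.enqueueNDRangeKernel(kernel, cl::NullRange,
                               cl::NDRange(X, Y), cl::NullRange);
    queue.finish();
});

// ... other host work continues here ...

done.wait();   // rendezvous with kernel completion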
I'm trying to calculate a matrix multiplication of size N (square matrices), but I'm getting a stack overflow error (I'm new to CUDA).
If I test the code with N < 300 everything is fine, but with N > 300 it does not work and a stack overflow error is displayed, even though there is enough memory on my graphics card (a GF 820M).
If N = 300, then 300 * 300 * 4 (size of float) = 360,000 bytes is the space needed on the device for one array of type float, and three such arrays are needed for the multiplication, so 360,000 * 3 = 1,080,000 bytes. If I check the return value of cudaMalloc, no error is reported.
My main goal is to test with N large enough. How do I solve this? Thank you in advance for any help you might be able to provide.
#include <stdio.h>
#include<device_launch_parameters.h>
#include<cuda.h>
#include<time.h>
#include<cuda_runtime.h>
#include <math.h>
__global__ void MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{ // calculate thread id
unsigned int row = blockIdx.y*blockDim.y+threadIdx.y;
unsigned int col = blockIdx.x*blockDim.x+threadIdx.x;
for (int k = 0 ; k<WIDTH ; k++ )
{ Pd[row*WIDTH + col]+= Md[row * WIDTH + k ] * Nd[ k * WIDTH + col] ; }}
int main ()
{ const int i=64 ;
cudaEvent_t start, stop;
float time;
cudaEventCreate(&start);
cudaEventCreate(&stop);
const int WIDTH =300;
cudaError_t cudaStatus;
float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH] ,M_result_array_h[WIDTH][WIDTH];
float *array1_d , *array2_d ,*M_result_array_d ; // device array
// Allocate GPU buffers for 2 vectors (two input, one output)
cudaStatus = cudaMalloc((void **) &array1_d , WIDTH*WIDTH*sizeof (float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!"); }
cudaStatus = cudaMalloc((void **) &array2_d , WIDTH*WIDTH*sizeof (float));
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!"); }
for ( int i = 0 ; i<WIDTH ; i++ ) {
for (int j = 0 ; j<WIDTH ; j++ )
{ array1_h[i][j] = 1 ; array2_h[i][j] = 2 ; }}
//copy host array to device array; cudaMemcpy ( dest , source , WIDTH , direction )
cudaMemcpy ( array1_d , array1_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;
cudaMemcpy ( array2_d , array2_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;
//allocating memory for resultant device array
cudaStatus = cudaMalloc((void **) &M_result_array_d , WIDTH*WIDTH*sizeof (float) ) ;
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "cudaMalloc failed!"); }
//calling kernel
dim3 dimBlock( i,i, 1 ) ;
dim3 dimGrid ( ((WIDTH-1)/i) +1 , ((WIDTH-1)/i)+1 ,1 ) ;
cudaEventRecord(start, 0);
MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
printf ("taille du probleme:%d Time for the kernel: %f \n",WIDTH,time);
//copy back result_array_d to result_array_h
cudaMemcpy(M_result_array_h , M_result_array_d , WIDTH*WIDTH*sizeof(float) , cudaMemcpyDeviceToHost) ;
//printf the result array
for (int i = 0 ; i<WIDTH ; i++ )
{ for (int j = 0 ; j < WIDTH ; j++ )
{ printf ("%f ",M_result_array_h[i][j] ) ; }
printf ("\n") ; }
cudaFree(array1_d);
cudaFree(array2_d);
cudaFree(M_result_array_h);
system("pause") ; }
The stack overflow problem is not CUDA related. These allocations:
float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH] ,M_result_array_h[WIDTH][WIDTH];
are created by the compiler on the stack. The stack space is limited. (This is the host code, so the stack here has nothing to do with the GPU.)
One possible approach to fix this is to create dynamic allocations for these variables, which will be made on the heap, which doesn't have the same limits as the stack.
So one possible fix is to replace this:
float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH] ,M_result_array_h[WIDTH][WIDTH];
with this:
typedef float ar_type[WIDTH];
ar_type *array1_h, *array2_h, *M_result_array_h;
array1_h = (ar_type *)malloc(WIDTH*WIDTH*sizeof(float));
array2_h = (ar_type *)malloc(WIDTH*WIDTH*sizeof(float));
M_result_array_h = (ar_type *)malloc(WIDTH*WIDTH*sizeof(float));
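If the surrounding code is compiled as C++ (which .cu files are), an equivalent heap-based fix is std::vector for the host arrays. This is just an alternative sketch; you would then index as array1_h[i * WIDTH + j] and pass array1_h.data() to cudaMemcpy:
#include <vector>

std::vector<float> array1_h(WIDTH * WIDTH, 1.0f);          // heap storage, freed automatically
std::vector<float> array2_h(WIDTH * WIDTH, 2.0f);
std::vector<float> M_result_array_h(WIDTH * WIDTH, 0.0f);

// e.g. cudaMemcpy(array1_d, array1_h.data(), WIDTH*WIDTH*sizeof(float), cudaMemcpyHostToDevice);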
Also note that this:
const int i=64 ;
...
dim3 dimBlock( i,i, 1 ) ;
is not valid. You are requesting a 64x64 threadblock (4096 threads total) and this is not legal for any CUDA GPU. You can fix this particular issue by changing i to 32.
After fixing that, it seems that your kernel has no thread-check to prevent out-of-bounds threads from executing and generating out-of-bounds accesses. You can fix that by adding this thread-check immediately before the for-loop in your kernel:
if ((row < WIDTH) && (col < WIDTH))
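Folded into the posted kernel, that guard would look roughly like this (a sketch; it also accumulates into a local variable instead of doing += into Pd, since the posted code never initializes the output buffer):
__global__ void MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{
    unsigned int row = blockIdx.y*blockDim.y + threadIdx.y;
    unsigned int col = blockIdx.x*blockDim.x + threadIdx.x;
    if ((row < WIDTH) && (col < WIDTH)) {
        float sum = 0.0f;   // local accumulator
        for (int k = 0; k < WIDTH; k++)
            sum += Md[row * WIDTH + k] * Nd[k * WIDTH + col];
        Pd[row * WIDTH + col] = sum;
    }
}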
Finally, this line has a typo:
cudaFree(M_result_array_h);
I think you meant:
cudaFree(M_result_array_d);
You can discover these other errors (items 2-4 above) if you add proper CUDA error checking to your code, and/or run your code with cuda-memcheck.
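For the error checking, one common pattern is a small macro that wraps every runtime call, plus a check after the kernel launch (a sketch; the macro name is just illustrative):
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// usage:
// CUDA_CHECK(cudaMalloc((void **)&array1_d, WIDTH * WIDTH * sizeof(float)));
// MatrixMul<<<dimGrid, dimBlock>>>(array1_d, array2_d, M_result_array_d, WIDTH);
// CUDA_CHECK(cudaGetLastError());        // catches launch-configuration errors (e.g. the 64x64 block)
// CUDA_CHECK(cudaDeviceSynchronize());   // catches errors raised while the kernel runs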
Use rtContextGetStackSize/rtContextSetStackSize to find out how large your stack is and set it larger if needed.
Keep in mind that the memory on your graphics card is shared with other graphical processes and you can't use all of it.
Furthermore, you can partition your matrices and compute the multiplication block by block instead of operating on the entire matrices at once.
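One common block-by-block formulation on the GPU is a shared-memory tiled kernel; the sketch below is only an illustration of the idea (it still computes the full product, but each thread block works through one TILE x TILE tile of the inputs at a time):
#define TILE 16

__global__ void MatrixMulTiled(const float *A, const float *B, float *C, int width)
{
    __shared__ float As[TILE][TILE];
    __shared__ float Bs[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float acc = 0.0f;

    for (int t = 0; t < (width + TILE - 1) / TILE; ++t) {
        int aCol = t * TILE + threadIdx.x;
        int bRow = t * TILE + threadIdx.y;
        // Stage one tile of each input in shared memory (zero-pad at the edges).
        As[threadIdx.y][threadIdx.x] = (row < width && aCol < width) ? A[row * width + aCol] : 0.0f;
        Bs[threadIdx.y][threadIdx.x] = (bRow < width && col < width) ? B[bRow * width + col] : 0.0f;
        __syncthreads();
        for (int k = 0; k < TILE; ++k)
            acc += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        __syncthreads();
    }
    if (row < width && col < width)
        C[row * width + col] = acc;
}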