Passing std::vector to openCL kernel C++ bindings - c++

I'm new to OpenCL.
I want to pass vector to OpenCL kernel using the C++ bindings.
Currently I have in host program:
// Host side of the question: upload toKernel and n, run the kernel once per
// element, then read the results back into output.
std::vector<cl_float4> toKernel;
// NOTE(review): output is declared empty and nothing in this snippet resizes
// it before the read-back at the end.
std::vector<cl_float3> output;
int n = 50;
// (elided) fill toKernel with the input values
// (elided) create the context, program, kernel and command queue used below
// NOTE(review): vectorSize is not declared in this snippet; presumably it is
// toKernel.size() -- confirm.
// Device buffer for the input vectors.
cl::Buffer bufferX = cl::Buffer(
context,
CL_MEM_READ_ONLY,
vectorSize * sizeof(cl_float4)
);
// Device buffer holding the single int n.
cl::Buffer bufferNumber = cl::Buffer(
context,
CL_MEM_READ_ONLY,
sizeof(int)
);
// Device buffer the kernel writes its results into.
cl::Buffer bufferOutput = cl::Buffer(
context,
CL_MEM_WRITE_ONLY,
vectorSize * sizeof(cl_float3)
);
// Copy the input vectors to the device, starting at offset 0.
queue.enqueueWriteBuffer(
bufferX,
CL_TRUE,
0,
vectorSize * sizeof(cl_float4),
toKernel.data()
);
// Copy n to the device.
queue.enqueueWriteBuffer(
bufferNumber,
CL_TRUE,
0,
sizeof(int),
&n
);
// Argument order matches the kernel signature: x, n, out.
kernel.setArg(0, bufferX);
kernel.setArg(1, bufferNumber);
kernel.setArg(2, bufferOutput);
// One work-item per input element, with a local size of 1.
cl::NDRange global(vectorSize);
cl::NDRange local(1);
queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
global,
local
);
// Read the results back to the host.
// NOTE(review): &output is the address of the std::vector object, not of its
// element storage (compare toKernel.data() in the upload above).
queue.enqueueReadBuffer(
bufferOutput,
CL_TRUE,
0,
vectorSize * sizeof(cl_float3),
&output
);
And in my kernel program I access each vector value like this:
// Kernel from the question: work-item i loops over j, combining x[i] with
// x[j], and stores one float3 result in out[i].
// NOTE(review): n is declared as a __global const int pointer, so the loop
// condition j<n compares j with the pointer itself rather than the value it
// points to -- presumably *n (or a by-value int argument) was intended.
// NOTE(review): calc_result is not declared anywhere in the visible snippet.
__kernel void calculate(__global const float4 *x, __global const int *n, __global float3 *out){
// Index of this work-item in dimension 0; selects the element it owns.
int i = get_global_id(0);
for (int j=0; j<n; j++) {
//do some calculation using x[i] and x[j]....
}
out[i] = calc_result;
}
Is this the correct way?
I'm not sure how to output debug message in OpenCL.
I get clEnqueueReadBuffer(-5) when I try to read the output from the kernel.
How do you read back std::vector from OpenCL?

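-5 is CL_OUT_OF_RESOURCES ("out of resources"). You can't pass &output to enqueueReadBuffer: that is the address of the std::vector object itself, not of its element storage. Pass output.data() instead, and make sure the vector has been resized to at least vectorSize elements first (e.g. output.resize(vectorSize)), because as declared it is empty and has no storage to read into.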

Related

How to copy a big array to memory and use it in OpenCL kernel?

I have an array of uint8_t. The size of the array is about 2.000.000. I need to do some calculations on these values, but after I call the kernel and copy the modified values back, it returns only zeros.
I'm creating the array, the "row" and "columns" are int.
uint8_t arrayIn[rows * columns];
uint8_t arrayOut[rows * columns];
I'm creating the cl_mem objects and copy the array data into.
arrayInMem = clCreateBuffer(context, CL_MEM_READ_ONLY, rows * columns * sizeof(uint8_t), NULL, &err);
arrayOutMem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, rows * columns * sizeof(uint8_t), NULL, &err);
err = clEnqueueWriteBuffer(img_cmd_queue, arrayInMem, CL_TRUE, 0, rows * columns * sizeof(uint8_t), arrayIn, 0, NULL, NULL);
Setting the kernel arg like this.
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&arrayInMem);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&arrayOutMem);
Reading back to the host the modified array.
err = clEnqueueReadBuffer(img_cmd_queue, arrayOutMem, CL_TRUE, 0, MEM_SIZE * sizeof(uint8_t), arrayOut, 0, NULL, NULL);
The kernel signature look like this:
// Kernel signature from the question: one input and one output byte buffer.
// The body is entirely commented out in the post; the commented lines sketch
// a per-work-item operation that doubles arrayInKernel[gid] into
// arrayOutKernel[gid].
__kernel void calculate(__global uchar * arrayInKernel, __global uchar * arrayOutKernel){
//do some calculation like this eg.
//int gid = get_global_id(0);
//arrayOutKernel[gid] = 2 * arrayInKernel[gid];
}
Could somebody help, what am I missing out?
Your code is fine, assuming MEM_SIZE = rows * columns. The argument order in clEnqueueReadBuffer also is correct.
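I could imagine that you forgot to call clFinish(img_cmd_queue); after clEnqueueWriteBuffer, clEnqueueNDRangeKernel and clEnqueueReadBuffer and before you check the results in arrayOut. All these commands end up in a queue, and without clFinish the queue may be executed only after you have checked the results. (This applies to non-blocking calls; a read enqueued with blocking_read set to CL_TRUE, as in the snippet above, does not return until the data has been copied into arrayOut.)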

Image grayscale in OpenCL

I want to transform an RGB-Image to a grayscale image.
My problem is that when I copy back the data, the kernel returns only zeros.
OpenCL code:
// Kernel from the question: meant to write one weighted-sum grey value per
// work-item into output[gid].
// NOTE(review): input is a __global uchar pointer, so input[gid] is a single
// uchar; the second subscript ([0], [1], [2]) is applied to that value, not
// to a per-pixel array.
// NOTE(review): the coefficients have no 'f' suffix, so they are
// double-precision literals.
__kernel void grayscale(__global uchar * input, __global uchar * output)
{
// Index of this work-item in dimension 0.
int gid = get_global_id(0);
output[gid] = 0.0722 * input[gid][0] + 0.7152 * input[gid][1] + 0.2126 * input[gid][2];
}
Host code:
// Host routine from the question: creates the device buffers, enqueues an
// upload from the input image, builds the program, launches the kernel named
// methodName over a 1-D range of mem_size work-items, and reads the result
// into outputImg.data.
// fileName and outputLoc are not referenced in the visible body; the elided
// initialisation and cleanup sections may use them.
// NOTE(review): err is overwritten by every call and never checked in the
// visible code.
void RunKernel(char fileName[], char methodName[], Mat inputImg, Mat outputImg,
char outputLoc[], int mem_size){
/*
Initialisation of the device and read the kernel source.
*/
//Create cl_mem objects for input and output. mem_size is the image width*height.
//Both buffers are mem_size bytes long.
imgInMem = clCreateBuffer(img_context, CL_MEM_READ_ONLY,
mem_size * sizeof(uchar), NULL, &err);
imgOutMem = clCreateBuffer(img_context, CL_MEM_WRITE_ONLY,
mem_size * sizeof(uchar), NULL, &err);
//Copy the data into the input cl_mem.
//NOTE(review): this passes &inputImg.data, whereas the read-back below passes
//outputImg.data directly -- presumably the address-of is unintended; confirm.
err = clEnqueueWriteBuffer(img_cmd_queue, imgInMem, CL_TRUE, 0, mem_size *sizeof(uchar),
&inputImg.data, 0, NULL, NULL);
//Create the program and load the kernel source into it
img_program = clCreateProgramWithSource(img_context, 1, (const char **) &kernel_src_str,
(const size_t *) &kernel_size, &err);
err = clBuildProgram(img_program, 1, &dev_id, NULL, NULL, NULL);
img_kernel = clCreateKernel(img_program, methodName, &err);
//Set the kernel args: 0 = input buffer, 1 = output buffer
err = clSetKernelArg(img_kernel, 0, sizeof(cl_mem), (void *) &imgInMem);
err = clSetKernelArg(img_kernel, 1, sizeof(cl_mem), (void *) &imgOutMem);
//Define the global size and local size
//NOTE(review): a fixed local size of 256 presumably requires mem_size to be a
//multiple of 256 -- confirm against the clEnqueueNDRangeKernel documentation.
size_t global_work_size = mem_size;
size_t local_work_size = 256;
//Enqueue a command to execute a kernel on a device ("1" indicates 1-dim work)
err = clEnqueueNDRangeKernel(img_cmd_queue, img_kernel, 1, NULL, &global_work_size,
&local_work_size, 0, NULL, NULL);
//Wait for everything queued so far to finish
err = clFinish(img_cmd_queue);
//Read back the result from the device
err = clEnqueueReadBuffer(img_cmd_queue, imgOutMem, CL_TRUE, 0,
mem_size *sizeof(uchar), outputImg.data, 0, NULL, NULL);
/*
Release the necessary objects.
*/
}
After the clEnqueueReadBuffer if I write the values to the console it is all zeros. My outputImg is declared like this in the main:
Mat outImg(height,width,CV_8UC1,Scalar(0));
and call the method with this:
RunKernel("kernels/grayscale.cl","grayscale", inImg, outImg,"resources/grayscale_car_gpu.jpg", MEM_SIZE);
The problem is likely the 2D array syntax you're using:
0.0722 * input[gid][0] + 0.7152 * input[gid][1] + 0.2126 * input[gid][2]
What addresses do you think that is accessing exactly?
Instead, assuming you're trying to access sequential bytes as RGB (in BGR order, judging by the coefficient value), try:
0.0722 * input[3*gid+0] + 0.7152 * input[3*gid+1] + 0.2126 * input[3*gid+2]
You should add an "f" to the float constants (otherwise they are doubles, which are not supported on all devices).
You should add rounding from float back to uchar. So, together, something like:
convert_uchar_sat_rte(0.0722f * input[3*gid+0] +
0.7152f * input[3*gid+1] +
0.2126f * input[3*gid+2])
Finally, you're passing the same size buffer for the input and output images, but seemingly treating the input buffer as RGB, which is 3x larger than a single byte of monochrome. So you'll need to fix that in the host code.
Any time you're getting incorrect output from a kernel, simplify it to see whether it is an input problem, a calculation problem, an output problem, or a host code issue. Keep narrowing it down until you've found your problem.

Why is OpenCL nested loop only working for some elements

I am trying to implement the following loop in an OpenCL kernel.
for(i=0;i<N;i++) for(j=0;j<M;j++) weights[i*M+j] += gradients[i] * input[j];
This is my kernel. I am currently hardcoding M to be 4 and it is only working for the first 4 elements.
// Kernel from the question: each work-item adds gradients[gid1] * inputs[gid2]
// to one element of weights.
// NOTE(review): the row stride is hard-coded to 4, so the index only matches
// the i*M+j layout of the reference loop when M is 4.
__kernel
void cwk3( __global float *gradients, __global float *inputs, __global float *weights)
{
// Global ids in dimensions 0 and 1; gid1 indexes gradients, gid2 indexes inputs.
int gid1 = get_global_id(0);
int gid2 = get_global_id(1);
// Accumulate the product into the weight at (gid1, gid2).
weights[(gid1 * 4) + gid2] += gradients[gid1] * inputs[gid2];
}
The relevant c++ code is
// Host arrays: N gradients, M inputs and an N*M weights matrix.
float
*gradients = (float*) malloc( N *sizeof(float) ),
*inputs = (float*) malloc( M*sizeof(float) ),
*weights = (float*) malloc( N*M*sizeof(float) );
initialiseArrays( gradients, inputs, weights, N, M );
// Device buffers; CL_MEM_COPY_HOST_PTR is passed together with each host array.
cl_mem deviceGradients = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, N*sizeof(float), gradients
, &status );
cl_mem deviceInputs = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, M*sizeof(float), inputs
, &status );
cl_mem deviceWeights = clCreateBuffer( context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, N*M*sizeof(float), weights
, &status );
cl_kernel kernel = compileKernelFromFile( "kernel.cl", "cwk3", context, device );
// Argument order matches the kernel signature: gradients, inputs, weights.
status = clSetKernelArg( kernel, 0, sizeof(deviceGradients), &deviceGradients );
status = clSetKernelArg( kernel, 1, sizeof(deviceInputs), &deviceInputs );
status = clSetKernelArg( kernel, 2, sizeof(deviceWeights), &deviceWeights );
// NOTE(review): indexSpaceSize has two entries but workGroupSize has one, and
// the launch below passes work_dim = 1. Presumably only indexSpaceSize[0] is
// consumed, so get_global_id(1) in the kernel would always be 0 -- confirm
// against the clEnqueueNDRangeKernel documentation.
size_t indexSpaceSize[2], workGroupSize[1];
indexSpaceSize[0] = N;
indexSpaceSize[1] = M;
workGroupSize [0] = 4;
status = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, indexSpaceSize, workGroupSize, 0, NULL, NULL );
if( status != CL_SUCCESS )
{
printf( "Failure enqueuing kernel: Error %d.\n", status );
return EXIT_FAILURE;
}
// Copy the updated weights back to the host array.
status = clEnqueueReadBuffer( queue, deviceWeights, CL_TRUE, 0, N*M*sizeof(float), weights, 0, NULL, NULL );
if( status != CL_SUCCESS )
{
printf( "Could not copy device data to host: Error %d.\n", status );
return EXIT_FAILURE;
}
This simply creates the buffers and copies them to the GPU, launches the kernel and then reads the answer back from the GPU to the CPU. N and M are read in as command line arguments. I am currently setting them both to 4 for testing
You seem to be confused about global and local work groups.
Global work size specifies total number of calls (work items) executed.
global_work_size=[M,N] will call the kernel MxN times in total. One work item can determine its position by get_global_id. OpenCL could implement this as something like this :
for(i=0;i<N;i++)
for(j=0;j<M;j++)
call_kernel(set global_id=[j,i])
Local work groups describe how to group together the individual launched work items (which are created according to the global sizes), make them aware of each other, and let them share memory among themselves. You neither use nor need any of those features, so ignore them.
So to implement your for loop in OpenCL:
for(i=0;i<N;i++)
for(j=0;j<M;j++)
weights[i*M+j] += gradients[i] * input[j];
You would have this kernel:
// Adds gradients[i] * inputs[j] to weights[i*M + j], one element per work-item.
//
// Expects a 2-D launch with global size {M, N}:
//   dimension 0 spans the M inputs    (column index j),
//   dimension 1 spans the N gradients (row index i).
// M is taken from the launch geometry, so nothing is hard-coded.
__kernel
void cwk3( __global float *gradients, __global float *inputs, __global float *weights)
{
    int col = get_global_id(0);   // j in [0, M)
    int row = get_global_id(1);   // i in [0, N)
    int M = get_global_size(0);   // row stride of weights

    // Dimension 0 has extent M, so it must be the column index; using it as
    // the row index (and as the index into gradients) is only correct when
    // M == N.
    weights[(row * M) + col] += gradients[row] * inputs[col];
}
And call it like this:
// Global NDRange for the 2-D launch: dimension 0 spans M, dimension 1 spans N.
size_t global_work[2];
global_work[0]=M;
global_work[1]=N;
// work_dim = 2: this is a 2D kernel, not 1D.
// global_work_offset = NULL: offsets are 0.
// The global work size is M*N work-items in total.
// local_work_size = NULL: ignore the local size and let the runtime choose it.
// The C API has no default arguments, so the trailing wait-list count,
// wait list and completion event must be passed explicitly (0, NULL, NULL).
status = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_work, NULL, 0, NULL, NULL );

OpenCL, manage device buffer pointer from host?

I'm trying to implement a code previously written in CUDA using OpenCL to run on Altera FPGA. I'm having problem reading back data that are supposed to be in the buffer. I use the same structure as CUDA version, only thing different is cudaMalloc can allocate memory for all types of pointer while for clCreateBuffer I have to use cl_mem. My code looks like this:
cl_mem d_buffer=clCreateBuffer(...);
//CUDA version:
//float* d_buffer;
//cudaMalloc((void **)&d_buffer, MemSz);
clEnqueueWriteBuffer(queue, d_buffer, ..., h_data, );
//cudaMemcpy(d_buffer, h_Data, MemSz, cudaMemcpyHostToDevice);
#define d_buffer(index1, index2, index3) &d_buffer + index1/index2*index3
//#define d_buffer(index1, index2, index3) d_buffer + index1/index2*index3
cl_mem* d_data=d_buffer(1,2,3);
clEnqueueReadBuffer(queue, *d_data,...)// Error reading d_data
I tried clEnqueueMapBuffer or CL_MEM_ALLOC_HOST_PTR for the clCreateBuffer, it doesn't work either.
cl_mem is an opaque object. You should not perform pointer arithmetic on it; attempting to do so will result in very nasty bugs.
I'm not familiar with how CUDA handles buffer allocation, but the implication of your commented out code is that CUDA buffers are always Host-Visible. This is very strictly not the case in OpenCL. OpenCL allows you to "Map" a buffer to host-visible memory, but it won't be implicitly visible to the host. If you intend to read an arbitrary index of the buffer, you need to either map it first or copy it to host data.
float * h_data = new float[1000];
cl_mem d_buffer=clCreateBuffer(...);
clEnqueueWriteBuffer(queue, d_buffer, true, 0, 1000 * sizeof(float), h_data, 0, nullptr, nullptr);
//======OR======
//float * d_data = static_cast<float*>(clEnqueueMapBuffer(queue, d_buffer, true, CL_MAP_WRITE, 0, 1000 * sizeof(float), 0, nullptr, nullptr, nullptr));
//std::copy(h_data, h_data + 1000, d_data);
//clEnqueueUnmapMemObject(queue, d_buffer, d_data, 0, nullptr, nullptr);
//clEnqueueBarrier(queue);
//Do work with buffer, probably in OpenCL Kernel...
float result;
size_t index = 1 / 2 * 3; //This is what you wrote in the original post
clEnqueueReadBuffer(queue, d_buffer, true, index * sizeof(float), 1 * sizeof(float), &result, 0, nullptr, nullptr);
//======OR======
//float * result_ptr = static_cast<float*>(clEnqueueMapBuffer(queue, d_buffer, true, CL_MAP_READ, index * sizeof(float), 1 * sizeof(float), 0, nullptr, nullptr, nullptr));
//result = *result_ptr;
//clEnqueueUnmapMemObject(queue, d_buffer, result_ptr, 0, nullptr, nullptr);
//clEnqueueBarrier(queue);
std::cout << "Result was " << result << std::endl;

OpenCL instantiating local memory array: invalid pointer error in kernel

I'm trying to create 2 local arrays for a kernel to use. My goal is to copy a global input buffer into the first array (arr1), and instantiate the second array (arr2) so its elements can be accessed and set later.
My kernel looks like this:
// Kernel from the question: copies the global input into the local array arr1
// and then zeroes one element of the second local array arr2.
__kernel void do_things (__global uchar* in, __global uchar* out,
uint numIterations, __local uchar* arr1, __local uchar* arr2)
{
// Total number of work-items across both dimensions of the launch.
size_t work_size = get_global_size(0) * get_global_size(1);
// NOTE(review): event is passed to async_work_group_copy without being
// initialised, and the call's return value is discarded; the built-in
// presumably returns the event that should be waited on -- confirm against
// the OpenCL C specification.
event_t event;
async_work_group_copy(arr1, in, work_size, event);
wait_group_events(1, &event);
// Flattened index of this work-item: id0 * size1 + id1.
int cIndex = (get_global_id(0) * get_global_size(1)) + get_global_id(1);
// NOTE(review): arr2 is indexed with a global index, so the local allocation
// supplied by the host must cover at least work_size bytes -- confirm.
arr2[cIndex] = 0;
//Do other stuff later
}
In the C++ code I'm calling this from, I set the kernel arguments like this:
//Create input and output buffers, each myInputVector.size() bytes long.
//The input buffer is created with CL_MEM_COPY_HOST_PTR and the vector's data.
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, myInputVector.size(), (void*)
myInputVector.data(), NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
myInputVector.size(), NULL, NULL);
//Set kernel arguments: 0 = in, 1 = out, 2 = numIterations, 3 = arr1, 4 = arr2.
//NOTE(review): each call below ends in "))" -- one closing parenthesis too
//many as posted.
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&inputBuffer));
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer));
clSetKernelArg(kernel, 2, sizeof(cl_uint), &iterations));
//Arguments 3 and 4 are the __local arrays: a size with a NULL value.
//NOTE(review): the size passed is sizeof(inputBuffer), i.e. the size of the
//cl_mem handle variable, not the number of bytes the kernel indexes.
clSetKernelArg(kernel, 3, sizeof(inputBuffer), NULL));
clSetKernelArg(kernel, 4, sizeof(inputBuffer), NULL));
Where myInputVector is a vector full of uchars.
Then, I enqueue it with a 2D work size, rows * cols big. myInputVector has a size of rows * cols.
//Execute the kernel
size_t global_work_size[2] = { rows, cols }; //2d work size
status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL,
global_work_size, NULL, 0, NULL, NULL);
The problem is, I'm getting crashes when I run the kernel. Specifically, this line in the kernel:
arr2[cIndex] = 0;
is responsible for the crash (omitting it makes it so it doesn't crash anymore). The error reads:
*** glibc detected *** ./MyProgram: free(): invalid pointer: 0x0000000001a28fb0 ***
All I want is to be able to access arr2 alongside arr1. arr2 should be the same size as arr1. If that's the case, Why am I getting this bizarre error? Why is this an invalid pointer?
The issue is that you are allocating only sizeof(cl_mem) for your local buffers. And a cl_mem is simply a typedef of some sort of pointer type (therefore 4 to 8 bytes depending on your system).
What then happens in your kernel is that you access memory beyond the end of the local buffer you allocated, and the GPU raises a memory fault.
clSetKernelArg(kernel, 3, myInputVector.size(), NULL);
clSetKernelArg(kernel, 4, myInputVector.size(), NULL);
Should fix your problem. Also note that the size you are providing is the size in bytes so you would need to multiply by the sizeof of the vector element type (which is not clear from code).