I want to transform an RGB image to a grayscale image.
My problem is that when I copy the data back from the device, I get only zeros.
OpenCL code:
__kernel void grayscale(__global uchar * input, __global uchar * output)
{
int gid = get_global_id(0);
output[gid] = 0.0722 * input[gid][0] + 0.7152 * input[gid][1] + 0.2126 * input[gid][2];
}
Host code:
void RunKernel(char fileName[], char methodName[], Mat inputImg, Mat outputImg,
char outputLoc[], int mem_size){
/*
Initialisation of the device and read the kernel source.
*/
//Creating cl_mem objects for input and output. mem_size is the image width*height
imgInMem = clCreateBuffer(img_context, CL_MEM_READ_ONLY,
mem_size * sizeof(uchar), NULL, &err);
imgOutMem = clCreateBuffer(img_context, CL_MEM_WRITE_ONLY,
mem_size * sizeof(uchar), NULL, &err);
//copy the data into cl_mem input
err = clEnqueueWriteBuffer(img_cmd_queue, imgInMem, CL_TRUE, 0, mem_size *sizeof(uchar),
&inputImg.data, 0, NULL, NULL);
//Create the program and load the kernel source to it
img_program = clCreateProgramWithSource(img_context, 1, (const char **) &kernel_src_str,
(const size_t *) &kernel_size, &err);
err = clBuildProgram(img_program, 1, &dev_id, NULL, NULL, NULL);
img_kernel = clCreateKernel(img_program, methodName, &err);
//Setting the kernel args
err = clSetKernelArg(img_kernel, 0, sizeof(cl_mem), (void *) &imgInMem);
err = clSetKernelArg(img_kernel, 1, sizeof(cl_mem), (void *) &imgOutMem);
//define the global size and local size
size_t global_work_size = mem_size;
size_t local_work_size = 256;
//Enqueue a command to execute a kernel on a device ("1" indicates 1-dim work)
err = clEnqueueNDRangeKernel(img_cmd_queue, img_kernel, 1, NULL, &global_work_size,
&local_work_size, 0, NULL, NULL);
err = clFinish(img_cmd_queue);
//Read back the result from device
err = clEnqueueReadBuffer(img_cmd_queue, imgOutMem, CL_TRUE, 0,
mem_size *sizeof(uchar), outputImg.data, 0, NULL, NULL);
/*
Release the necessary objects.
*/
}
After clEnqueueReadBuffer, if I write the values to the console, they are all zeros. My outputImg is declared like this in the main:
Mat outImg(height,width,CV_8UC1,Scalar(0));
and call the method with this:
RunKernel("kernels/grayscale.cl","grayscale", inImg, outImg,"resources/grayscale_car_gpu.jpg", MEM_SIZE);
The problem is likely the 2D array syntax you're using:
0.0722 * input[gid][0] + 0.7152 * input[gid][1] + 0.2126 * input[gid][2]
What addresses do you think that is accessing exactly?
Instead, assuming you're trying to access sequential bytes as RGB (in BGR order, judging by the coefficient values), try:
0.0722 * input[3*gid+0] + 0.7152 * input[3*gid+1] + 0.2126 * input[3*gid+2]
You should add an "f" to the float constants (otherwise they are doubles, which are not supported on all devices).
You should also add rounding from float back to uchar. So, together, something like:
convert_uchar_sat_rte(0.0722f * input[3*gid+0] +
0.7152f * input[3*gid+1] +
0.2126f * input[3*gid+2])
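Putting those pieces together, a corrected kernel could look roughly like this (a sketch, assuming tightly packed 3-byte BGR pixels on input and one work-item per output pixel):
__kernel void grayscale(__global const uchar * input, __global uchar * output)
{
    int gid = get_global_id(0);
    // read the three interleaved channels of pixel gid (BGR order assumed)
    float gray = 0.0722f * input[3*gid+0] +
                 0.7152f * input[3*gid+1] +
                 0.2126f * input[3*gid+2];
    // round and saturate back to an 8-bit value
    output[gid] = convert_uchar_sat_rte(gray);
}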
Finally, you're passing the same size buffer for the input and output images, but seemingly treating the input as RGB, which is 3x larger per pixel than the single-byte monochrome output. So you'll need to fix that in the host code.
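For example, if mem_size is width*height, the input buffer and the write into it need to cover three bytes per pixel, while the output stays one byte per pixel (a sketch reusing the names from your host code; note it also passes inputImg.data rather than &inputImg.data, since clEnqueueWriteBuffer expects a pointer to the pixel bytes, not the address of the Mat's data member):
imgInMem = clCreateBuffer(img_context, CL_MEM_READ_ONLY,
    3 * mem_size * sizeof(uchar), NULL, &err);
imgOutMem = clCreateBuffer(img_context, CL_MEM_WRITE_ONLY,
    mem_size * sizeof(uchar), NULL, &err);
err = clEnqueueWriteBuffer(img_cmd_queue, imgInMem, CL_TRUE, 0,
    3 * mem_size * sizeof(uchar), inputImg.data, 0, NULL, NULL);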
Any time you're getting incorrect output from a kernel, simplify it to see whether it is an input problem, a calculation problem, an output problem, or a host-code problem. Keep narrowing it down until you've found the issue.
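For example, a throwaway variant like this one, with the same signature so the host code doesn't change, immediately tells you whether the read-back path works at all (if you still see zeros, the problem is on the host side rather than in the colour math):
__kernel void grayscale(__global uchar * input, __global uchar * output)
{
    // debug version: ignore the input and write a constant pattern
    output[get_global_id(0)] = 42;
}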
Related
I have an array of uint8_t with about 2,000,000 elements. I need to do some calculations on these values, but after I call the kernel and copy the modified values back, it returns only zeros.
I'm creating the arrays like this; rows and columns are int:
uint8_t arrayIn[rows * columns];
uint8_t arrayOut[rows * columns];
I'm creating the cl_mem objects and copying the array data into the input buffer.
arrayInMem = clCreateBuffer(context, CL_MEM_READ_ONLY, rows * columns * sizeof(uint8_t), NULL, &err);
arrayOutMem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, rows * columns * sizeof(uint8_t), NULL, &err);
err = clEnqueueWriteBuffer(img_cmd_queue, arrayInMem, CL_TRUE, 0, rows * columns * sizeof(uint8_t), arrayIn, 0, NULL, NULL);
I'm setting the kernel args like this.
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&arrayInMem);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&arrayOutMem);
Reading the modified array back to the host.
err = clEnqueueReadBuffer(img_cmd_queue, arrayOutMem, CL_TRUE, 0, MEM_SIZE * sizeof(uint8_t), arrayOut, 0, NULL, NULL);
The kernel signature looks like this:
__kernel void calculate(__global uchar * arrayInKernel, __global uchar * arrayOutKernel){
//do some calculation like this eg.
//int gid = get_global_id(0);
//arrayOutKernel[gid] = 2 * arrayInKernel[gid];
}
Could somebody help? What am I missing?
Your code is fine, assuming MEM_SIZE = rows * columns. The argument order in clEnqueueReadBuffer is also correct.
I could imagine that you forgot to call clFinish(img_cmd_queue); after clEnqueueWriteBuffer, clEnqueueNDRangeKernel and clEnqueueReadBuffer, and before you check the results in arrayOut. All these commands end up in a queue, and without clFinish the queue may still be executing while you check the results.
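For illustration, a sketch of where that synchronization point would go, reusing the names from your snippet (global_work_size and local_work_size stand in for whatever you actually pass to clEnqueueNDRangeKernel):
err = clEnqueueWriteBuffer(img_cmd_queue, arrayInMem, CL_TRUE, 0,
    rows * columns * sizeof(uint8_t), arrayIn, 0, NULL, NULL);
err = clEnqueueNDRangeKernel(img_cmd_queue, kernel, 1, NULL,
    &global_work_size, &local_work_size, 0, NULL, NULL);
err = clEnqueueReadBuffer(img_cmd_queue, arrayOutMem, CL_TRUE, 0,
    rows * columns * sizeof(uint8_t), arrayOut, 0, NULL, NULL);
clFinish(img_cmd_queue); // make sure everything enqueued above has completed
// only now inspect arrayOut on the host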
I've been following these OpenCL examples. OpenCL isn't giving me any errors, even when checking error codes with cl_int err or from the kernel. But when I output the results of landmap_flags[i], it shows I'm only getting garbage values back from the GPU. I could get the example to work, but when I included my own data it started to break down. I'm also unsure whether the landmap_flags array is too large for the kernel to handle (96 * 96 * 96 elements of uchar).
Kernel Code:
// CL noise lib
.
.
.
kernel void terrain_gen(global uchar* landmap_flags, global float3* pos, int LOD, int chunkSize) {
const uint n = get_global_id(0);
const uint x = n%(chunkSize+(2 * LOD));
const uint y = (n/(chunkSize+(2 * LOD)))%(chunkSize+(2 * LOD));
const uint z = n/((chunkSize+(2 * LOD))*(chunkSize+(2 * LOD)));
enum BLOCK { STONE, DIRT, SNOW, GRASS, SAND, GRAVEL, GAETAN, BEDROCK, AIR };
const float frequency = 500;
const float noise_1 = (_slang_library_noise2(x+(chunkSize * pos[n].x),z+(chunkSize * pos[n].z))) / frequency;
landmap_flags[n] = (noise_1*noise_1*40.0f+6.0f>(y+(chunkSize * pos[n].y))) ? DIRT : AIR;
}
The kernel is building fine and isn't returning any errors but I figured I could have an error with how I handle the data.
And my code for setting up buffers:
// set up devices, platform, etc.
.
.
.
cl::Buffer buffer_landmap(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * 96 * 96 * 96);
cl::Buffer buffer_pos(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(cl_float3));
cl::Buffer buffer_LOD(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(cl_int));
cl::Buffer buffer_chunkSize(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(cl_int));
queue.enqueueWriteBuffer(buffer_landmap, CL_TRUE, 0, sizeof(cl_uchar) * 96 * 96 * 96, landmap_flags);
queue.enqueueWriteBuffer(buffer_pos, CL_TRUE, 0, sizeof(cl_float3), pos);
queue.enqueueWriteBuffer(buffer_LOD, CL_TRUE, 0, sizeof(cl_int), LOD);
queue.enqueueWriteBuffer(buffer_chunkSize, CL_TRUE, 0, sizeof(cl_int), chunkSize);
cl::Kernel get_noise(program, "terrain_gen");
get_noise.setArg(0, buffer_landmap);
get_noise.setArg(1, buffer_pos);
get_noise.setArg(2, buffer_LOD);
get_noise.setArg(3, buffer_chunkSize);
queue.enqueueNDRangeKernel(get_noise, cl::NullRange, cl::NDRange(1024));
queue.enqueueReadBuffer(buffer_landmap, CL_TRUE, 0, sizeof(cl_uchar) * 96 * 96 * 96, landmap_flags);
queue.finish();
The way I intend this code to work is to pass three values (pos, LOD and chunkSize) that are effectively scalars, and only return landmap_flags to the CPU. Could it be that I'm using incorrect arguments for enqueueNDRangeKernel? A possibility is that my work-group size is too large, or that I have too many work groups.
EDIT: I edited my code; the scalars are no longer passed as buffers, the only thing being written and read is landmap_flags, and the kernel has been changed accordingly to treat pos as a scalar value.
kernel void terrain_gen(global uchar* landmap_flags, float3 pos, int LOD, int chunkSize) {
const uint n = get_global_id(0);
const uint x = n%(chunkSize+(2 * LOD));
const uint y = (n/(chunkSize+(2 * LOD)))%(chunkSize+(2 * LOD));
const uint z = n/((chunkSize+(2 * LOD))*(chunkSize+(2 * LOD)));
enum BLOCK { STONE, DIRT, SNOW, GRASS, SAND, GRAVEL, GAETAN, BEDROCK, AIR };
const float frequency = 500;
const float noise_1 = (_slang_library_noise2(x+(chunkSize * pos.x),z+(chunkSize * pos.z))) / frequency;
landmap_flags[n] = (noise_1*noise_1*40.0f+6.0f>(y+(chunkSize * pos.y))) ? DIRT : AIR;
}
cl::Buffer buffer_landmap(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * 96 * 96 * 96);
cl::CommandQueue queue(context, default_device);
queue.enqueueWriteBuffer(buffer_landmap, CL_TRUE, 0, sizeof(cl_uchar) * 96 * 96 * 96, landmap_flags);
cl::Kernel get_noise(program, "terrain_gen");
get_noise.setArg(0, buffer_landmap);
get_noise.setArg(1, pos);
get_noise.setArg(2, LOD);
get_noise.setArg(3, chunkSize);
queue.enqueueNDRangeKernel(get_noise, cl::NullRange, cl::NDRange(96 * 96 * 96));
queue.enqueueReadBuffer(buffer_landmap, CL_TRUE, 0, sizeof(cl_uchar) * 96 * 96 * 96, landmap_flags);
queue.finish();
@doqtor's observations in the comments are spot on; those are very serious issues.
Additionally, I've noticed the following:
1. Your pos buffer is created using CL_MEM_HOST_NO_ACCESS, but then you call enqueueWriteBuffer() on it. (Though according to the text of your question, you actually want this to be a scalar, not a buffer? And then your kernel code treats it as a long vector, as pointed out in the comments…)
2. You're using CL_MEM_COPY_HOST_PTR to create buffers without passing the host pointer.
3. You seem to be submitting a work size of 1024 items, but your result buffer is sized for 96 * 96 * 96 = 884736 items, and that's how much data you're reading from the buffer too. (This buffer size is fine; you should not be getting anywhere near VRAM limits with it.)
Furthermore, you say that
OpenCL isn't giving me any errors even when checking error codes with cl_int err, or from the kernel.
Given the misuse of flags when creating buffers, this seems… unlikely? Three of your four buffer creations should be failing with CL_INVALID_HOST_PTR due to issue 2 above. I suggest you take another look at your error handling code. (you've not posted it, so I can't comment on specifics)
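For reference, a sketch of what issue 2 could look like when fixed with the C++ wrapper (host_pos here is a hypothetical host-side value; CL_MEM_COPY_HOST_PTR requires the host pointer to be supplied at creation time, and a buffer the kernel only reads would normally be created read-only rather than write-only):
cl_float3 host_pos = {{ 0.0f, 0.0f, 0.0f, 0.0f }};
cl::Buffer buffer_pos(context,
    CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR,
    sizeof(cl_float3), &host_pos); // host pointer supplied here, so no separate enqueueWriteBuffer is needed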
I have an OpenCL buffer that is created via:
return cl::Buffer(_context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size);
I write data to that buffer and want to use it later in a kernel.
I get a strange behavior though, because my kernel won't work with that buffer. Only when I randomly call
BufferContainer blah(oclEnvironment, cv::Size(width, height), 3);
which calls the above function to create a same-sized buffer again, does the kernel work. I don't call blah.Write(...) at all; it seems to work with the data I wrote to the first buffer. But if I comment out that single line with the "blah" buffer, it won't work again.
Edit: Both buffers are created with the exact same dimensions.
Edit 2: Does it have something to do with the command queue and the order of objects in it?
Basically I run a kernel to reduce the image and find the maximum HSV V value. After that kernel finishes and gives me the maximum, I run the next kernel with one parameter set to that maximum. So the call chain is like:
float maxV = _maxValueReduce->GetValueMaximum(oclEnvironment, fiBuffer, width, height, true);
//start setting the parameters of the next kernel
...
_kernel.setArg(8, maxV);
oclEnvironment._commandQueue.enqueueNDRangeKernel(_kernel, cl::NullRange, global, local);
And GetValueMaximum(...) itself launches a reduction kernel to find that maximum.
Edit 3:
float OclMaxValueReduce::GetValueMaximum(OclEnvironment& oclEnvironment,
BufferContainer& source, int width, int height, const bool sync)
{
//Create the result buffer
//Intel HD 530 can have a max. workgroup size of 256.
int dim1 = 16;
int dim2 = 16;
cl::NDRange local(dim1, dim2,1);
cl::NDRange global(source._size.width, source._size.height, 1);
//Calculate the number of workgroups
int numberOfWorkgroups = ceil((width * height) / (float)(dim1 * dim2));
//Each workgroup reduces the data to a single element. These elements are then reduced on the host in the final reduction step.
//First create the buffer for the workgroups result
BufferContainer result(oclEnvironment, cv::Size(numberOfWorkgroups, 1), sizeof(float));
//set the kernel arguments
_kernel.setArg(0, source.GetOclBuffer());
_kernel.setArg(1, result.GetOclBuffer());
_kernel.setArg(2, width);
_kernel.setArg(3, height);
oclEnvironment._commandQueue.enqueueNDRangeKernel(_kernel, cl::NullRange, global, local);
if (sync)
oclEnvironment._commandQueue.finish();
//Retrieve the reduced result array. The final reduction step is done here on the host.
float* dest = new float[numberOfWorkgroups];
ReadBuffer(oclEnvironment, result.GetOclBuffer(), dest, numberOfWorkgroups);
std::vector<float> resultArray(dest, dest + numberOfWorkgroups);
delete[] dest;
//find and return the max in array.
std::vector<float>::iterator it;
it = std::max_element(resultArray.begin(), resultArray.end());
return resultArray[std::distance(resultArray.begin(), it)];
}
and this is the ReadBuffer function it calls:
/* Read a float array from ocl buffer*/
void OclMaxValueReduce::ReadBuffer(OclEnvironment oclEnvironment, cl::Buffer
&resultBuffer, float* dest, const size_t size) {
int errcode;
float* resultData = (float*)oclEnvironment._commandQueue.enqueueMapBuffer(resultBuffer, true, CL_MAP_READ, 0, size * sizeof(float), 0, 0, &errcode);
if (errcode)
throw std::exception(std::string("OclEnvironment::ReadBuffer: OCL could not map Buffer!").data(), errcode);
//std::copy(resultData, (resultData + size), dest);
memcpy(dest, resultData, size * sizeof(float));
cl::Event testEvent;
oclEnvironment._commandQueue.enqueueUnmapMemObject(resultBuffer, resultData, NULL, &testEvent); // Unmap Buffer
testEvent.wait();
}
My computer has a GeForce 1080Ti. With 11GB of VRAM, I don't expect a memory issue, so I'm at a loss to explain why the following breaks my code.
I execute the kernel on the host with this code.
cl_mem buffer = clCreateBuffer(context.GetContext(), CL_MEM_READ_WRITE, n * n * sizeof(int), NULL, &error);
error = clSetKernelArg(context.GetKernel(myKernel), 1, n * n, m1);
error = clSetKernelArg(context.GetKernel(myKernel), 0, sizeof(cl_mem), &buffer);
error = clEnqueueNDRangeKernel(context.GetCommandQueue(0), context.GetKernel(myKernel), 1, NULL, 10, 10, 0, NULL, NULL);
clFinish(context.GetCommandQueue(0));
error = clEnqueueReadBuffer(context.GetCommandQueue(0), buffer, true, 0, n * n * sizeof(int), results, 0, NULL, NULL);
results is a pointer to an n-by-n int array. m1 is a pointer to an n-by-n-bit array. The variable n is divisible by 8, so we can interpret the array as a char array.
The first ten values of the array are set to 1025 by the kernel (the value isn't important):
__kernel void PopCountProduct (__global int *results)
{
results[get_global_id(0)] = 1025;
}
When I print out the result on the host, the first 10 indices are 1025. All is well and good.
Suddenly it stops working when I introduce an additional argument:
__kernel void PopCountProduct (__global int *results, __global char *m)
{
results[get_global_id(0)] = 1025;
}
Why is this happening? Am I missing something crucial about OpenCL?
You can't pass a host pointer to clSetKernelArg in OpenCL 1.2. Something similar can only be done in OpenCL 2.0+ via clSetKernelArgSVMPointer with an SVM pointer, if supported. But most probably what you need is to create a buffer object on the GPU and copy the host memory into it.
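A minimal sketch of that last option, reusing the names from your snippet (the buffer size is an assumption: since m1 is an n-by-n bit array, it presumably holds n*n/8 bytes, so adjust to your actual layout):
size_t m1_bytes = n * n / 8; // assumption: n*n bits packed into bytes
cl_mem m_buffer = clCreateBuffer(context.GetContext(), CL_MEM_READ_ONLY,
    m1_bytes, NULL, &error);
error = clEnqueueWriteBuffer(context.GetCommandQueue(0), m_buffer, CL_TRUE, 0,
    m1_bytes, m1, 0, NULL, NULL);
// pass the cl_mem handle, not the host pointer
error = clSetKernelArg(context.GetKernel(myKernel), 1, sizeof(cl_mem), &m_buffer);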
I'm trying to port code previously written in CUDA to OpenCL so that it runs on an Altera FPGA. I'm having problems reading back data that are supposed to be in the buffer. I use the same structure as the CUDA version; the only difference is that cudaMalloc can allocate memory for any pointer type, while for clCreateBuffer I have to use cl_mem. My code looks like this:
cl_mem d_buffer=clCreateBuffer(...);
//CUDA version:
//float* d_buffer;
//cudaMalloc((void **)&d_buffer, MemSz);
clEnqueueWriteBuffer(queue, d_buffer, ..., h_data, );
//cudaMemcpy(d_buffer, h_Data, MemSz, cudaMemcpyHostToDevice);
#define d_buffer(index1, index2, index3) &d_buffer + index1/index2*index3
//#define d_buffer(index1, index2, index3) d_buffer + index1/index2*index3
cl_mem* d_data=d_buffer(1,2,3);
clEnqueueReadBuffer(queue, *d_data,...)// Error reading d_data
I tried clEnqueueMapBuffer and CL_MEM_ALLOC_HOST_PTR for clCreateBuffer; neither works either.
cl_mem is an opaque object. You should not perform pointer arithmetic on it; attempting to do so will result in very nasty bugs.
I'm not familiar with how CUDA handles buffer allocation, but the implication of your commented out code is that CUDA buffers are always Host-Visible. This is very strictly not the case in OpenCL. OpenCL allows you to "Map" a buffer to host-visible memory, but it won't be implicitly visible to the host. If you intend to read an arbitrary index of the buffer, you need to either map it first or copy it to host data.
float * h_data = new float[1000];
cl_mem d_buffer=clCreateBuffer(...);
clEnqueueWriteBuffer(queue, d_buffer, true, 0, 1000 * sizeof(float), h_data, 0, nullptr, nullptr);
//======OR======
//float * d_data = static_cast<float*>(clEnqueueMapBuffer(queue, d_buffer, true, CL_MAP_WRITE, 0, 1000 * sizeof(float), 0, nullptr, nullptr, nullptr));
//std::copy(h_data, h_data + 1000, d_data);
//clEnqueueUnmapMemObject(queue, d_buffer, d_data, 0, nullptr, nullptr);
//clEnqueueBarrier(queue);
//Do work with buffer, probably in OpenCL Kernel...
float result;
size_t index = 1 / 2 * 3; //This is what you wrote in the original post
clEnqueueReadBuffer(queue, d_buffer, true, index * sizeof(float), 1 * sizeof(float), &result, 0, nullptr, nullptr);
//======OR======
//float * result_ptr = static_cast<float*>(clEnqueueMapBuffer(queue, d_buffer, true, CL_MAP_READ, index * sizeof(float), 1 * sizeof(float), 0, nullptr, nullptr, nullptr));
//result = *result_ptr;
//clEnqueueUnmapMemObject(queue, d_buffer, result_ptr, 0, nullptr, nullptr);
//clEnqueueBarrier(queue);
std::cout << "Result was " << result << std::endl;