Opencl kernel runs on Intel platform but not on Nvidia - c++

I am implementing a ray tracer using openCL. I have installed NVidia's CUDA sdk and everything seems to be set up fine, both my platforms are detected (Intel's and Nvidia's), and each one sees its devices (intel has HD Graphics 4000 and Nvidia has my GPU: GeForce GT 630M).
My problem is that I am able to run my application using the Intel platform but not using Nvidia's platform. I don't believe the problem is in my code, but here is my device code:
#include "constants.h" //only a couple of #define
typedef struct Sphere {
float x, y, z;
float radius;
float r, g, b;
}Sphere;
float hit(Sphere s, float ox, float oy, float *n) {
float radius = s.radius;
float dx = ox - s.x;
float dy = oy - s.y;
if (dx*dx + dy*dy < radius*radius) {
float dz = sqrt(radius*radius - dx*dx - dy*dy);
*n = dz / sqrt(radius * radius);
return dz + s.z;
}
return -INF;
}
__kernel void rayTracer(__global Sphere* spheres, write_only image2d_t res) {
// Get the index of the current element to be processed
int x = get_global_id(0);
int y = get_global_id(1);
int ox = x - WIDTH / 2;
int oy = y - HEIGHT / 2;
float r = 0, g = 0, b = 0;
float maxz = (float) -INF;
for (int i = 0; i<NUM_SPHERES; i++)
{
float n;
float t = hit(spheres[i], ox, oy, &n);
if (t > maxz)
{
float fscale = 1;
r = spheres[i].r * fscale;
g = spheres[i].g * fscale;
b = spheres[i].b * fscale;
}
}
write_imagei(res, (int2)(x, y), (int4)(r, g, b, 0));
}
My host application is also straightforward. I simply initialize openCL structures, setup the data and then read it back.
Again, when using the Intel platform my application runs fine and I can see the raytraced image. When using Nvidia's, although the API error codes are always 0, no result is displayed.
Does anybody have any Ideas what might be the problem?
Thanks in advance
---EDIT---
Here are some pieces of host code
Setting up OpenCL structures:
//Setup OpenCL
cl_platform_id platform = getPlatforms();
cl_device_id device = getDevices(platform, CL_DEVICE_TYPE_GPU);
cl_context_properties ctxProps[] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
0, 0
};
cl_context ctx = clCreateContext(ctxProps, 1, &device, NULL, NULL, &err);
cl_command_queue queue1 = clCreateCommandQueue(ctx, device, NULL, &err);
GetPlatforms and GetDevices are functions that asks the user to chose a platform and device
Creating the program and building it:
cl_program prog = clCreateProgramWithSource(ctx, 1, srcs, &srcSize, &err);
err = clBuildProgram(prog, 1, &device, NULL, NULL, NULL);
if (err < 0)
{
//PRINT BUILD ERROR
size_t log_size;
clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char* log = (char*)calloc(log_size + 1, sizeof(char));
clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG, log_size + 1, log, NULL);
printf("%s/n", log);
free(log);
std::cin >> err;
return 1;
}
cl_kernel krn = clCreateKernel(prog, "rayTracer", &err);
//....CREATE SOME SPHERES...
//Setup device data
cl_image_format fmt;
fmt.image_channel_order = CL_RGBA;
fmt.image_channel_data_type = CL_UNSIGNED_INT8;
cl_mem spheresBuff = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, spheres.size() * sizeof(Sphere), spheres.data(), &err);
cl_mem resBuff = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, WIDTH, HEIGHT, 0, NULL, &err);
//Setup kernel arguments
err = clSetKernelArg(krn, 0, sizeof(cl_mem), (void*)&spheresBuff);
err = clSetKernelArg(krn, 1, sizeof(cl_mem), (void*)&resBuff);
//Run kernel
size_t gSize[] = { WIDTH, HEIGHT };
err = clEnqueueNDRangeKernel(queue1, krn, 2, NULL, gSize, NULL, 0, NULL, NULL);
//Read result
Image img = createRGBAImage(WIDTH, HEIGHT);
size_t origin[] = { 0, 0, 0 };
size_t region[] = { WIDTH , HEIGHT , 1 };
err = clEnqueueReadImage(queue1, resBuff, CL_TRUE, origin, region, 0, 0, img.pixel.data(), 0, NULL, NULL);

Try to use clEnqueueMapBuffer with CL_MAP_READ prior to kernel execution and clEnqueueUnmapMemObject after kernel execution for your spheresBuff.

Related

Error CL_INVALID_VALUE on simple C++ OpenCL image manipulation program

I'm writing a simple OpenCL program in C++ where i need to flip an input image upside-down, i'm using CImg to read and write image files.
the problem is that even though the program compiles and run without any error, the output file is blank.
Here's the cl kernel code:
const sampler_t sampler = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__kernel void img_turn(
read_only image2d_t I,
write_only image2d_t O
)
{
int gid_x = get_global_id(0);
int gid_y = get_global_id(1);
int w = get_image_width(I);
int h = get_image_height(I);
if (gid_x >= w || gid_y >= h)
return;
uint4 p = read_imageui(I, sampler, (int2)(gid_x, gid_y));
write_imageui(O, (int2)(gid_x, h - gid_y), p);
}
and here's bits of the host code, first the input image (Edited):
CImg<unsigned char> img_in(img_file_name);
cl_image_format format = {
CL_RGBA,
CL_UNSIGNED_INT8,
};
cl_image_desc desc = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = (size_t) img_in.width(),
.image_height = (size_t) img_in.height(),
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
.buffer = NULL,
};
cl_mem input_img = clCreateImage(
context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
(const cl_image_format *) &format,
(const cl_image_desc *) &desc,
img_in.data(),
&errNum
);
the definition of the output image (Edited):
CImg<unsigned char> img_out(img_in.width(), img_in.height(), 1, 4);
format = {
CL_RGBA,
CL_UNSIGNED_INT8,
};
desc = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = (size_t) img_out.width(),
.image_height = (size_t) img_out.height(),
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
.buffer = NULL,
};
cl_mem output_img = clCreateImage(
context,
CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
(const cl_image_format *) &format,
(const cl_image_desc *) &desc,
img_out.data(),
NULL
);
and the last part of the code, where i enqueue the images and run the program (Edited):
size_t origins[3] = {0, 0, 0};
size_t region_in[3] = {(size_t) img_in.width(), (size_t) img_in.height(), (size_t) 1};
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), input_img);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), output_img);
size_t global[2] = {(size_t) img_in.width(), (size_t) img_in.height()};
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, NULL, 0, NULL, &kernel_event);
errNum = clEnqueueWriteImage(command_queue, input_img, CL_TRUE, origins, region_in, 0, 0, img_in.data(), 0, NULL, NULL);
size_t region_out[3] = {(size_t) img_out.width(), (size_t) img_out.height(), (size_t) 1};
errNum = clEnqueueReadImage(command_queue, output_img, CL_TRUE, origins, region_out, 0, 0, img_out.data(), 0, NULL, NULL);
clWaitForEvents(1, &kernel_event);
img_out.save("./output_img.png");
after compiling and running the program the 'output_img.png' image file is created but it's blank: 0Bytes and no data whatsoever when opened with a text editor.
Edit:
So after PeterT's suggestion (and after some corrections of some dumb mistakes i made), the program now seems to be doing something (it executes for 3 seconds), but still produces nothing.
Edit 2:
After a bit of debugging, i pinpointed the problem: clEnqueueReadImage returns the error CL_INVALID_VALUE, and the documentation specifies that it returns that error if the region being read specified by origin and region is out of bounds ...
But i don't know why. It's the same size of the input image, but clEnqueueWriteImage doesn't return any error, even if called with the same parameters.
Edit 3:
The problem has been fixed by Egor's response. But now it doesn't output the wanted result:
Input image:
Output image:
First, you create OpenCL image object using CL_RGBA format and pass the pointer to CImg pixel data. But CImg uses "planar" structure to keep the data and the values for color channels are not interleaved (for more information please see How pixel data are stored with CImg?). For example, colored image with alpha channel will be stored in memory as:
R1R2R3...B1B2B3...G1G2G3...A1A2A3...
But CL_RGBA format implies the interleaved channels for the image: R1G1B1A1R2G2B2A2R3G3B3A3.... Therefore, it is necessary to convert the image to CL_RGBA format before copying it to the device memory. For example, using following function:
struct rgba_pixel {
unsigned char r;
unsigned char g;
unsigned char b;
unsigned char a;
};
constexpr unsigned int r_channel_idx = 0;
constexpr unsigned int g_channel_idx = 1;
constexpr unsigned int b_channel_idx = 2;
constexpr unsigned int a_channel_idx = 3;
std::vector<rgba_pixel>
convert_cimg_to_rgba_buffer(const cimg_library::CImg<unsigned char>& img) {
const unsigned int img_height = static_cast<unsigned int>(img.height());
const unsigned int img_width = static_cast<unsigned int>(img.width());
const unsigned int number_of_channels = static_cast<unsigned int>(img.spectrum());
const bool has_r_channel = number_of_channels > r_channel_idx;
const bool has_g_channel = number_of_channels > g_channel_idx;
const bool has_b_channel = number_of_channels > b_channel_idx;
const bool has_a_channel = number_of_channels > a_channel_idx;
std::vector<rgba_pixel> rgba_buf(static_cast<std::size_t>(img_width) * img_height);
for (unsigned int y = 0; y < img_height; ++y) {
for (unsigned int x = 0; x < img_width; ++x) {
const std::size_t pixel_idx = static_cast<std::size_t>(img_width) * y + x;
rgba_buf[pixel_idx].r = has_r_channel ? *img.data(x, y, 0, r_channel_idx) : 0;
rgba_buf[pixel_idx].g = has_g_channel ? *img.data(x, y, 0, g_channel_idx) : 0;
rgba_buf[pixel_idx].b = has_b_channel ? *img.data(x, y, 0, b_channel_idx) : 0;
rgba_buf[pixel_idx].a = has_a_channel ? *img.data(x, y, 0, a_channel_idx) : UCHAR_MAX;
}
}
return rgba_buf;
}
So the code to copy the image to the device will look like:
size_t origins[3] = { 0, 0, 0 };
size_t region[3] = { (size_t)img_in.width(), (size_t)img_in.height(), (size_t)1 };
auto rgba_buf = convert_cimg_to_rgba_buffer(img_in);
ret = clEnqueueWriteImage(command_queue, input_img, CL_TRUE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
Also, it will be necessary to convert the output image before saving it. For example using following function:
void
copy_rgba_buffer_to_cimg(const std::vector<rgba_pixel>& rgba_buf, cimg_library::CImg<unsigned char>& img) {
const unsigned int img_height = static_cast<unsigned int>(img.height());
const unsigned int img_width = static_cast<unsigned int>(img.width());
const unsigned int number_of_channels = static_cast<unsigned int>(img.spectrum());
const bool has_r_channel = number_of_channels > r_channel_idx;
const bool has_g_channel = number_of_channels > g_channel_idx;
const bool has_b_channel = number_of_channels > b_channel_idx;
const bool has_a_channel = number_of_channels > a_channel_idx;
for (unsigned int y = 0; y < img_height; ++y) {
for (unsigned int x = 0; x < img_width; ++x) {
const std::size_t pixel_idx = static_cast<std::size_t>(img_width) * y + x;
if (has_r_channel) *img.data(x, y, 0, r_channel_idx) = rgba_buf[pixel_idx].r;
if (has_g_channel) *img.data(x, y, 0, g_channel_idx) = rgba_buf[pixel_idx].g;
if (has_b_channel) *img.data(x, y, 0, b_channel_idx) = rgba_buf[pixel_idx].b;
if (has_a_channel) *img.data(x, y, 0, a_channel_idx) = rgba_buf[pixel_idx].a;
}
}
}
And the code to copy the image from the device will look like:
ret = clEnqueueReadImage(command_queue, output_img, CL_TRUE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
copy_rgba_buffer_to_cimg(rgba_buf, img_out);
img_out.save("./output_img.png");
Next, you create the command-queue with default properties. It means that the commands enqueued to the command-queue will be executed in order. Also, you use blocking read and write (blocking_read and blocking_write flags are set to CL_TRUE for clEnqueueReadImage and clEnqueueWriteImage function calls). In this case the code can work without using OpenCL events to synchronize the execution of the commands. It is just necessary to enqueue the commands in the correct order and use blocking read command to get the result:
size_t origins[3] = { 0, 0, 0 };
size_t region[3] = { (size_t)img_in.width(), (size_t)img_in.height(), (size_t)1 };
auto rgba_buf = convert_cimg_to_rgba_buffer(img_in);
ret = clEnqueueWriteImage(command_queue, input_img, CL_FALSE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
size_t global[2] = { (size_t)img_in.width(), (size_t)img_in.height() };
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, NULL, 0, NULL, NULL);
ret = clEnqueueReadImage(command_queue, output_img, CL_TRUE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
copy_rgba_buffer_to_cimg(rgba_buf, img_out);
img_out.save("./output_img.png");
Finally, new y position for the pixel should be calculated as get_image_height() - (gid_y + 1) because gid_y is in interval [0, get_image_height()). So the kernel code should look like:
write_imageui(O, (int2)(gid_x, h - gid_y - 1), p);
Minor note, if you directly copy the image to the device using clEnqueueWriteImage you can omit CL_MEM_USE_HOST_PTR flag for clCreateImage call.

OpenCL image2d_t returning wrong values when mapping it back to host

I am writing a raytracer, and was trying to write the result of the generated image to an image2d_t and then write it to disk by mapping it back to the host.
Problem is, I couldn't make the OpenCL kernel write to the image2d_t and read the result back on the host. The image back on host is a mostly black image with some white dots.
Here is the important part of my host code:
cl_image_format rgba_format;
rgba_format.image_channel_order = CL_RGBA;
rgba_format.image_channel_data_type = CL_UNSIGNED_INT8;
outputImage = clCreateImage2D(context, CL_MEM_WRITE_ONLY,
&rgba_format, width, height, 0, NULL, &err);
if(err < 0) fatalError("failed to create OpenCL image");
err = clSetKernelArg(sampleKernel, 0, sizeof(outputImage), &outputImage);
if(err < 0) fatalError("failed to set kernel argument.");
size_t global_offset[2] = {0, 0};
size_t work_size[2] = {height, width};
int err = clEnqueueNDRangeKernel(queue, sampleKernel, 2, global_offset,
work_size, NULL, 0, NULL, NULL);
if(err < 0) fatalError("failed to enqueue kernel execution.");
clFinish(queue);
// Map the entire output image.
size_t row_pitch;
size_t origin[3] = {0, 0, 0};
size_t region[3] = {width, height, 1};
uint8_t *output = (uint8_t *) clEnqueueMapImage(queue, outputImage,
CL_TRUE, CL_MAP_READ, origin, region, &row_pitch, NULL, 0, NULL,
NULL, &err);
if(err < 0) fatalError("failed to map output kernel image.");
for(int i = 0; i < 5; ++i)
printf("%u %u %u %u\n", output[i*4], output[i*4 + 1], output[i*4 + 2],
output[i*4 + 3]);
savePPM(output);
My client code:
__kernel void sample(__write_only image2d_t out)
{
int2 coord = (get_global_id(1), get_global_id(0));
uint4 color = (255, 0, 0, 255);
write_imageui(out, coord, color);
}
The printf() output is:
255 255 255 255
0 0 0 0
0 0 0 0
0 0 0 0
0 0 0 0
And the generated image is:
I know I am probably making a rookie mistake, but I can't figure out what the problem is and I couldn't find anywhere on the internet any guides to make this work.
I am using a MacBook Pro with an Intel Iris Pro Graphics GPU.
Solved the problem. Do not EVER forget to add the type qualifier on OpenCL vectors.
The fixed Kernel is:
__kernel void sample(__write_only image2d_t out)
{
int2 coord = (int2) (get_global_id(1), get_global_id(0));
uint4 color = (uint4) (255, 0, 0, 255);
write_imageui(out, coord, color);
}

Euclidean distance using OpenCL

I am trying to compute the euclidean distance of a set of 5D points (pixels) to a 5D single point (center) and store in another result vector, I want to use vector indexing to store all info in a single vector so for the ith pixel, the 5 dimensions are (5i) , (5i+1) , ...
I am new to OpenCL and I just edited a sample code on the internet for my own intentions. The theory is right but the code doesn't show the right answers !
Here is the kernel:
//d_kernel.cl
__kernel void distance_kernel(__global double *pixelInfo,
__global double *clusterCentres,
__global double *distanceFromClusterCentre)
{
int index = get_global_id(0);
int d, dl, da, db, dx, dy;
dl = pixelInfo[5 * index] - clusterCentres[0];
dl = dl * dl;
da = pixelInfo[5 * index + 1] - clusterCentres[1];
da = da * da;
db = pixelInfo[5 * index + 2] - clusterCentres[2];
db = db * db;
dx = pixelInfo[5 * index + 3] - clusterCentres[3];
dx = dx * dx;
dy = pixelInfo[5 * index + 4] - clusterCentres[4];
dy = dy * dy;
distanceFromClusterCentre[index] = dx + dy + dl + da + db;
}
and here is the HOST CODE:
#include <iostream>
#include <CL/cl.h>
#include <vector>
using namespace std;
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, char **argv)
{
// Create the two input vectors
int i;
const int pixelsNumber = 1024;
const int clustersNumber = 1;
std::vector<double> pixelInfo;
pixelInfo.resize(5 * pixelsNumber);
std::fill(pixelInfo.begin(), pixelInfo.end(), 500);
std::vector<double> clusterCentres;
clusterCentres.resize(5 * clustersNumber);
std::fill(clusterCentres.begin(), clusterCentres.end(), 200);
std::vector<double> distanceFromClusterCentre;
distanceFromClusterCentre.resize(pixelsNumber);
std::fill(distanceFromClusterCentre.begin(), distanceFromClusterCentre.end(), 0);
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("d_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * pixelsNumber * sizeof(int), NULL, &ret);
cl_mem clusterCentres_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * clustersNumber * sizeof(int), NULL, &ret);
cl_mem distanceFromClusterCentre_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
pixelsNumber * sizeof(int), NULL, &ret);
// Copy the vectors to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, pixelInfo_mem, CL_TRUE, 0,
5 * pixelsNumber * sizeof(int), pixelInfo.data(), 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, clusterCentres_mem, CL_TRUE, 0,
5 * clustersNumber * sizeof(int), clusterCentres.data(), 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&pixelInfo_mem);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&clusterCentres_mem);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&distanceFromClusterCentre_mem);
// Execute the OpenCL kernel on the list
size_t global_item_size = pixelsNumber; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer result on the device to the local vector result
ret = clEnqueueReadBuffer(command_queue, distanceFromClusterCentre_mem, CL_TRUE, 0,
pixelsNumber * sizeof(int), distanceFromClusterCentre.data(), 0, NULL, NULL);
// Display the result to the screen
for (i = 0; i < pixelsNumber; i++)
{
cout << "Pixel " << i << ": " << distanceFromClusterCentre[i] << endl;
//system("PAUSE");
}
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(pixelInfo_mem);
ret = clReleaseMemObject(clusterCentres_mem);
ret = clReleaseMemObject(distanceFromClusterCentre_mem);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(pixelInfo.data());
free(clusterCentres.data());
free(distanceFromClusterCentre.data());
system("PAUSE");
return 0;
}
and a part of the RESULT is:
.
.
.
Pixel 501: -1.11874e+306
Pixel 502: -1.16263e+306
Pixel 503: -1.07485e+306
Pixel 504: -1.03079e+306
Pixel 505: -9.42843e+305
Pixel 506: -9.86903e+305
Pixel 507: -8.98954e+305
Pixel 508: -9.86903e+305
Pixel 509: -8.98954e+305
Pixel 510: -9.43014e+305
Press any key to continue . . .
Pixel 511: -8.55065e+305
Pixel 512: 0
Pixel 513: 0
Pixel 514: 0
Pixel 515: 0
Pixel 516: 0
Pixel 517: 0
Pixel 518: 0
Pixel 519: 0
Pixel 520: 0
.
.
.
after index 511 the rest of the vector is zero !
You created your vectors of double's and then you treat them as there were ints (created buffer for ints, writing data to int buffers and reading back results as there were ints). To avoid such mistakes you could write your code this way:
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, pixelInfo.size() * sizeof(pixelInfo[0]), NULL, &ret);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Not all work-items being used opencl

so I'm able to compile and execute my kernel, the problem is that only two work-items are being used. I'm basically trying to fill up a float array[8] with {0,1,2,3,4,5,6,7}. So this is a very simple hello world application. Bellow is my kernel.
// Highly simplified to demonstrate
__kernel void rnd_float32_matrix (
__global float * res
) {
uint idx = get_global_id(0);
res[idx] = idx;
}
I then create and execute the kernel with the following code...
// Some more code
cl::Program program(context, sources, &err);
program.build(devices, NULL, NULL, NULL);
cl::Kernel kernel(program, "rnd_float32_matrix", &err);
kernel.setArg(0, src_d);
cl::CommandQueue queue(context, devices[0], 0, &err);
cl::Event event;
err = queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange(8),
// I've tried cl::NDRange(8) as well
cl::NDRange(1),
NULL,
&event
);
event.wait();
err = queue.enqueueReadBuffer(
// This is:
// cl::Buffer src_d(
// context,
// CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
// mem_size,
// src_h,
// &err);
src_d,
CL_TRUE,
0,
8,
// This is float * src_h = new float[8];
src_h);
for(int i = 0; i < 8; i ++) {
std::cout << src_h[i] << std::endl;
}
I may not show it in the code, but I also do select a gpu device and using context.getInfo(..) it shows I'm using my NVidia GTX 770M card which shows 1024, 1024, 64 work-items available in dimensions 0, 1 and 2. When this array prints I keep getting... 0, 1, 0, 0, 0, 0, 0, 0. I've also tried setting res[idx] = 5, and I get... 5, 5, 0, 0, 0, 0, 0, 0. So it seems that only two give work-items are actually being used. What am I doing wrong?
Your command to read the data back from the device is only reading 8 bytes, which is two floats:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8, // <- This is the number of bytes, not the number of elements!
// This is float * src_h = new float[8];
src_h);
To read 8 floats, you would need to do this:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8 * sizeof(cl_float),
// This is float * src_h = new float[8];
src_h);

Reading wrong data on OpenCL

I faced with the problem that the kernel writes data in wrong place or host reads data incorrectly sometimes. I write the same data (index at which I write the data) to two global arrays with different types. To ensure that the index is corrent are used the global counter which incremented by means of atom_inc. The problem occures when data are read from second array on the host.
For instance:
.....
output array index: 442: (output1 value:442.0000 output2 value:442)
output array index: 443: (output1 value:443.0000 output2 value:443)
output array index: 444: (output1 value:444.0000 output2 value:444)
output array index: 445: (output1 value:445.0000 output2 value:445)
output array index: 446: (output1 value:446.0000 output2 value:1152892928)
output array index: 447: (output1 value:447.0000 output2 value:447)
output array index: 448: (output1 value:448.0000 output2 value:1152909312)
output array index: 449: (output1 value:449.0000 output2 value:1152917504)
output array index: 450: (output1 value:450.0000 output2 value:1152925696)
......
As you can see at indicies 446, 448, 449 and 450+ output2 contains wrong values. What can be the possible reason of this?
Device: ATI Radeon HD5750
Code sample:
#include <stdio.h>
#include <math.h>
#include <OpenCL/OpenCL.h>
// wtf example
const char *programSource =
"__kernel void kernel1(__global uint *counter,\n" \
"__global float *weights,\n" \
"__global uint *weights_pos)\n" \
"{\n"\
"const uint global_size = get_global_size(0);\n" \
"const uint global_id = get_global_id(0);\n" \
"uint local_id = get_local_id(0);\n" \
"if(global_id == 0) {\n" \
"counter[5] = 0; // set index of pos in weights to zero\n" \
"}\n" \
"uint insert_index = atom_inc(&counter[5]);\n" \
"weights[insert_index] = insert_index;\n" \
"weights_pos[insert_index] = insert_index;\n" \
"}";
void art_process_sinogram(const char* tiff_filename,
const float *angles2,
const unsigned int n_angles2,
const unsigned int n_ray2s,
const float distanc2e)
{
/******************************
* OPENCL ENVIRONMENT
*/
cl_int status;
cl_uint numPlatforms = 0;
cl_platform_id *platforms = NULL;
cl_device_id device_id;
//discover platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
//discover devices
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
device_id = devices[1];
//create context
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel kernel_weights = clCreateKernel(program, "kernel1", &status);
//create queue
cl_command_queue command_queue1 = clCreateCommandQueue(context, device_id, 0, &status);
/******************************
* HARDWARE PARAMETERS
*/
cl_uint wavefronts_per_SIMD = 7;
size_t global_work_size;
size_t local_work_size = 64;
cl_uint max_compute_units;
clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);
size_t wg_count = max_compute_units * wavefronts_per_SIMD;
global_work_size = wg_count * local_work_size;
/**************************** DATA PART *************************************/
size_t w_portion_size = 768 * sizeof(cl_float);
size_t w_pos_portion_size = 768 * sizeof(cl_uint);
size_t counters_data_size = 6 * sizeof(cl_uint);
cl_uint counters_data[6];
counters_data[0] = 1;
counters_data[1] = 2; // max number of the cells intersected by the ray
counters_data[2] = 3;
counters_data[3] = 4;
counters_data[4] = 5; // same to the number of rays
counters_data[5] = 0; // counter inside kernel
/*****************
* Main buffers
*/
cl_mem weights1_buffer = clCreateBuffer(context,
CL_MEM_READ_WRITE,
w_portion_size,
NULL,
NULL);
cl_mem weights_pos1_buffer = clCreateBuffer(context,
CL_MEM_READ_WRITE,
w_pos_portion_size,
NULL,
NULL);
/*****************
* Supplement buffers (constant)
*/
cl_mem counters_data_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY,
counters_data_size,
NULL,
&status);
cl_event supplement_buffer_ready[1];
status = clEnqueueWriteBuffer(command_queue1,
counters_data_buffer,
CL_FALSE,
0,
counters_data_size,
counters_data,
0,
NULL,
&supplement_buffer_ready[0]);
status = clSetKernelArg(kernel_weights, 0, sizeof(void *), (void *)&counters_data_buffer);
status = clSetKernelArg(kernel_weights, 1, sizeof(void *), (void *)&weights1_buffer);
status = clSetKernelArg(kernel_weights, 2, sizeof(void *), (void *)&weights_pos1_buffer);
status = clEnqueueNDRangeKernel(command_queue1,
kernel_weights,
1, // work dimensional 1D, 2D, 3D
NULL, // offset
&global_work_size, // total number of WI
&local_work_size, // nomber of WI in WG
1, // num events in wait list
supplement_buffer_ready, // event wait list
NULL); // event
clFinish(command_queue1);
cl_float *output1 = (cl_float *) clEnqueueMapBuffer(command_queue1,
weights1_buffer,//*pmain_weights_buffer,
CL_TRUE,
CL_MAP_READ,
0,
w_portion_size,
0, NULL, NULL, NULL);
cl_uint *output2 = malloc(w_portion_size);
status = clEnqueueReadBuffer(command_queue1, weights_pos1_buffer,
CL_TRUE, 0, w_pos_portion_size, output2,
0, NULL, NULL);
clFinish(command_queue1);
for(int i = 0; i < 790; ++i) {
printf("output array index: %d: (output1 value:%.4f \t output2 value:%d) \n", i, output1[i], output2[i]);
}
}
SOLUTION:
The kernel should be looks like (need checking index):
__kernel void k_1(__global uint *counter,
__global uint *weights,
__global uint2 *weights_pos)
{
const uint global_size = get_global_size(0);
const uint global_id = get_global_id(0);
uint local_id = get_local_id(0);
uint insert_index = atom_inc(&counter[5]);
if(insert_index < 768) {
weights[insert_index]= insert_index;
weights_pos[insert_index].x = insert_index;
weights_pos[insert_index].y = insert_index;
}
}
You are messing up with buffer dimensions.
1) Your buffers contains 768 elements each (see initialization of w_portion_size and w_pos_portion_size)
2) Workgroup size on my machine is 896 (see initialization of wg_count)
3) You print out 790 values.
Apart from this, one conceptual error is here:
if(global_id == 0) {
counter[5] = 0; // set index of pos in weights to zero
}
//atomic increments on counter[5]
You can't assume that the first virtual processor will execute this line before the others. You should completely remove this line, since you initialize counter[5] on the host side. (I believe that this is the cause of your problem, but I can't reproduce that).
After fixing these problems your code seems to run fine (intel implementation).
The kernel should be looks like (need checking index):
__kernel void k_1(__global uint *counter,
__global uint *weights,
__global uint2 *weights_pos)
{
const uint global_size = get_global_size(0);
const uint global_id = get_global_id(0);
uint local_id = get_local_id(0);
uint insert_index = atom_inc(&counter[5]);
if(insert_index < 768) {
weights[insert_index]= insert_index;
weights_pos[insert_index].x = insert_index;
weights_pos[insert_index].y = insert_index;
}
}