OpenCL conditional statement issue - if-statement

I've attempted to parallelize a set of conditional statements, but the output does not match existing implementation after the first loop which contains the kernel executes (mapI is an int array of 135, and on the 60th index of the second loop it fails, totaling 195 calls to mapI). I've checked that all arrays are passing to and from the kernel correctly by comparing them to the existing implementation and am baffled as to why this computation does not return the correct result, as it does for the first loop of the code. All OpenCL overhead functions return CL_SUCCESS.
cl_mem Q1cl, Q3cl;
Q1cl = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(double)*um->Npts*um->Nel, Q1, &err);
Q3cl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Npts*um->Nel, Q3, &err);
nxcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, nx, &err);
nycl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, ny, &err);
mapIcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*(um->Nfq+um->Ninflow), mapI, &err);
mapFcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*(um->Nfq+um->Nfar), mapF, &err);
fluxQ1cl = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, *fluxQ1check, &err);
err = clSetKernelArg(kernel[7], 0, sizeof(cl_mem), (void*)&mapIcl);
err = clSetKernelArg(kernel[7], 1, sizeof(cl_mem), (void*)&nxcl);
err = clSetKernelArg(kernel[7], 2, sizeof(cl_mem), (void*)&nycl);
err = clSetKernelArg(kernel[7], 3, sizeof(cl_mem), (void*)&mapFcl);
err = clSetKernelArg(kernel[7], 4, sizeof(cl_mem), (void*)&Q1cl);
err = clSetKernelArg(kernel[7], 5, sizeof(cl_mem), (void*)&Q3cl);
err = clSetKernelArg(kernel[7], 6, sizeof(cl_mem), (void*)&fluxQ1cl);
globalWorkSize[0] = Ninflow; //Old implementation, now NEL
globalWorkSize[1] = Nfar; //Old implementation, now NFACES*NFQ
err = clEnqueueNDRangeKernel(queue[0], kernel[7], 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue[0], fluxQ1cl, CL_TRUE, 0, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, *fluxQ1check, 0, NULL, NULL);
Kernel Code:
__kernel void umBC(__global int* mapI,
__global double* nx,
__global double* ny,
__global int* mapF,
__global double* Q1,
__global double* Q3,
__global double* fluxQ1)
{
int id, idF;
double e[9][2] = { {0, 0}, {1, 0}, {0, 1}, {-1, 0}, {0, -1}, {1, 1}, {-1, 1}, {-1, -1}, {1, -1}};
double t_1 = 1. / 9.;
double uf = 0.;
double vf = 0.;
int globalx = get_global_id(0);
int globaly = get_global_id(1);
id = mapI[globalx];
fluxQ1[id] = ((e[1][0]*nx[id] + e[1][1]*ny[id]) < 0)*((Q1[id]-Q3[id] -2.*t_1*1.*(e[1][0]*uf+e[1][0]*vf)*3.) * (e[1][0]*nx[id] + e[1][1]*ny[id])) + 0.;
uf = 0.01;
vf = 0.;
idF = mapF[globaly];
fluxQ1[idF] = ((e[1][0]*nx[idF] + e[1][1]*ny[idF]) < 0)*((Q1[idF]-Q3[idF] -2.*t_1*1.*(e[1][0]*uf+e[1][0]*vf)*3.) * (e[1][0]*nx[idF] + e[1][1]*ny[idF])) + 0.;
}
Edit: Below is the working implementation, thank you again doqtor and Lee for your help. To implementthis I needed to change the way mapI and mapF worked to match the sizing of fluxQ.
__kernel void umBC(__global int* mapI,
__global double* nx,
__global double* ny,
__global int* mapF,
__global double* Q1,
__global double* Q3,
__global double* fluxQ1)
{
double e[9][2] = { {0, 0}, {1, 0}, {0, 1}, {-1, 0}, {0, -1}, {1, 1}, {-1, 1}, {-1, -1}, {1, -1}};
double t_1 = 1. / 9.;
double uf = 0.;
double vf = 0.;
int globalx = get_global_id(0); //NEL
int globaly = get_global_id(1); //NFACES*NFQ
if(mapI[globalx*NFACES*NFQ+globaly] != NEL*NFACES*NFQ+1000){
fluxQ1[globalx*NFACES*NFQ+globaly] = 0.0;
if ((e[1][0]*nx[globalx*NFACES*NFQ+globaly] + e[1][1]*ny[globalx*NFACES*NFQ+globaly]) < 0){
fluxQ1[globalx*NFACES*NFQ+globaly] = (Q1[globalx*NFACES*NFQ+globaly]-Q3[globalx*NFACES*NFQ+globaly] -2.*t_1*1.*(e[1][0]*uf+e[1][0]*vf)*3.) * (e[1][0]*nx[globalx*NFACES*NFQ+globaly] + e[1][1]*ny[globalx*NFACES*NFQ+globaly]);
}
}
uf = 0.01;
vf = 0.;
if(mapF[globalx*NFACES*NFQ+globaly] != NEL*NFACES*NFQ+1000){
fluxQ1[globalx*NFACES*NFQ+globaly] = 0.0;
if ((e[1][0]*nx[globalx*NFACES*NFQ+globaly] + e[1][1]*ny[globalx*NFACES*NFQ+globaly]) < 0){
fluxQ1[globalx*NFACES*NFQ+globaly] = (Q1[globalx*NFACES*NFQ+globaly]-Q3[globalx*NFACES*NFQ+globaly] -2.*t_1*1.*(e[1][0]*uf+e[1][0]*vf)*3.) * (e[1][0]*nx[globalx*NFACES*NFQ+globaly] + e[1][1]*ny[globalx*NFACES*NFQ+globaly]);
}
}
}

You get wrong results because your kernel is not implemented correctly. #Lee already said that, I will try to further explain why.
Let's say for simplicity we consider executing kernel using 2x2 global range.
Now, the (globalx, globaly) for each work item will be:
(0, 0)
(0, 1)
(1, 0)
(1, 1)
so:
work item (0, 0) writes to fluxQ1[mapI[0]] and fluxQ1[mapF[0]]
work item (0, 1) writes to fluxQ1[mapI[0]] and fluxQ1[mapF[1]]
work item (1, 0) writes to fluxQ1[mapI[1]] and fluxQ1[mapF[0]]
work item (1, 1) writes to fluxQ1[mapI[1]] and fluxQ1[mapF[1]]
Assuming that mapI[0], mapI[1], mapF[0] and mapF[1] return unique ids then 2 work items write to the same location in parallel which obviously can't give you correct results. This will get worse if mapI and mapF can return same ids.
Some of the results you get correct by luck.
Changing number of global or local work items won't help, using select or not won't help either. You need to ensure work items don't write to the same location at the same time!

You have multiple work-items in each dimension, yet you are writing to a location indexed by only one:
int globalx = get_global_id(0);
int globaly = get_global_id(1);
id = mapI[globalx];
fluxQ1[id] =
So multiple work-items here will share the same globalx. That means they will read the same id from mapI, and will write to the same location in fluxQ1.

Related

Error CL_INVALID_VALUE on simple C++ OpenCL image manipulation program

I'm writing a simple OpenCL program in C++ where i need to flip an input image upside-down, i'm using CImg to read and write image files.
the problem is that even though the program compiles and run without any error, the output file is blank.
Here's the cl kernel code:
const sampler_t sampler = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__kernel void img_turn(
read_only image2d_t I,
write_only image2d_t O
)
{
int gid_x = get_global_id(0);
int gid_y = get_global_id(1);
int w = get_image_width(I);
int h = get_image_height(I);
if (gid_x >= w || gid_y >= h)
return;
uint4 p = read_imageui(I, sampler, (int2)(gid_x, gid_y));
write_imageui(O, (int2)(gid_x, h - gid_y), p);
}
and here's bits of the host code, first the input image (Edited):
CImg<unsigned char> img_in(img_file_name);
cl_image_format format = {
CL_RGBA,
CL_UNSIGNED_INT8,
};
cl_image_desc desc = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = (size_t) img_in.width(),
.image_height = (size_t) img_in.height(),
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
.buffer = NULL,
};
cl_mem input_img = clCreateImage(
context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
(const cl_image_format *) &format,
(const cl_image_desc *) &desc,
img_in.data(),
&errNum
);
the definition of the output image (Edited):
CImg<unsigned char> img_out(img_in.width(), img_in.height(), 1, 4);
format = {
CL_RGBA,
CL_UNSIGNED_INT8,
};
desc = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = (size_t) img_out.width(),
.image_height = (size_t) img_out.height(),
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
.buffer = NULL,
};
cl_mem output_img = clCreateImage(
context,
CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
(const cl_image_format *) &format,
(const cl_image_desc *) &desc,
img_out.data(),
NULL
);
and the last part of the code, where i enqueue the images and run the program (Edited):
size_t origins[3] = {0, 0, 0};
size_t region_in[3] = {(size_t) img_in.width(), (size_t) img_in.height(), (size_t) 1};
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), input_img);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), output_img);
size_t global[2] = {(size_t) img_in.width(), (size_t) img_in.height()};
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, NULL, 0, NULL, &kernel_event);
errNum = clEnqueueWriteImage(command_queue, input_img, CL_TRUE, origins, region_in, 0, 0, img_in.data(), 0, NULL, NULL);
size_t region_out[3] = {(size_t) img_out.width(), (size_t) img_out.height(), (size_t) 1};
errNum = clEnqueueReadImage(command_queue, output_img, CL_TRUE, origins, region_out, 0, 0, img_out.data(), 0, NULL, NULL);
clWaitForEvents(1, &kernel_event);
img_out.save("./output_img.png");
after compiling and running the program the 'output_img.png' image file is created but it's blank: 0Bytes and no data whatsoever when opened with a text editor.
Edit:
So after PeterT's suggestion (and after some corrections of some dumb mistakes i made), the program now seems to be doing something (it executes for 3 seconds), but still produces nothing.
Edit 2:
After a bit of debugging, i pinpointed the problem: clEnqueueReadImage returns the error CL_INVALID_VALUE, and the documentation specifies that it returns that error if the region being read specified by origin and region is out of bounds ...
But i don't know why. It's the same size of the input image, but clEnqueueWriteImage doesn't return any error, even if called with the same parameters.
Edit 3:
The problem has been fixed by Egor's response. But now it doesn't output the wanted result:
Input image:
Output image:
First, you create OpenCL image object using CL_RGBA format and pass the pointer to CImg pixel data. But CImg uses "planar" structure to keep the data and the values for color channels are not interleaved (for more information please see How pixel data are stored with CImg?). For example, colored image with alpha channel will be stored in memory as:
R1R2R3...B1B2B3...G1G2G3...A1A2A3...
But CL_RGBA format implies the interleaved channels for the image: R1G1B1A1R2G2B2A2R3G3B3A3.... Therefore, it is necessary to convert the image to CL_RGBA format before copying it to the device memory. For example, using following function:
struct rgba_pixel {
unsigned char r;
unsigned char g;
unsigned char b;
unsigned char a;
};
constexpr unsigned int r_channel_idx = 0;
constexpr unsigned int g_channel_idx = 1;
constexpr unsigned int b_channel_idx = 2;
constexpr unsigned int a_channel_idx = 3;
std::vector<rgba_pixel>
convert_cimg_to_rgba_buffer(const cimg_library::CImg<unsigned char>& img) {
const unsigned int img_height = static_cast<unsigned int>(img.height());
const unsigned int img_width = static_cast<unsigned int>(img.width());
const unsigned int number_of_channels = static_cast<unsigned int>(img.spectrum());
const bool has_r_channel = number_of_channels > r_channel_idx;
const bool has_g_channel = number_of_channels > g_channel_idx;
const bool has_b_channel = number_of_channels > b_channel_idx;
const bool has_a_channel = number_of_channels > a_channel_idx;
std::vector<rgba_pixel> rgba_buf(static_cast<std::size_t>(img_width) * img_height);
for (unsigned int y = 0; y < img_height; ++y) {
for (unsigned int x = 0; x < img_width; ++x) {
const std::size_t pixel_idx = static_cast<std::size_t>(img_width) * y + x;
rgba_buf[pixel_idx].r = has_r_channel ? *img.data(x, y, 0, r_channel_idx) : 0;
rgba_buf[pixel_idx].g = has_g_channel ? *img.data(x, y, 0, g_channel_idx) : 0;
rgba_buf[pixel_idx].b = has_b_channel ? *img.data(x, y, 0, b_channel_idx) : 0;
rgba_buf[pixel_idx].a = has_a_channel ? *img.data(x, y, 0, a_channel_idx) : UCHAR_MAX;
}
}
return rgba_buf;
}
So the code to copy the image to the device will look like:
size_t origins[3] = { 0, 0, 0 };
size_t region[3] = { (size_t)img_in.width(), (size_t)img_in.height(), (size_t)1 };
auto rgba_buf = convert_cimg_to_rgba_buffer(img_in);
ret = clEnqueueWriteImage(command_queue, input_img, CL_TRUE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
Also, it will be necessary to convert the output image before saving it. For example using following function:
void
copy_rgba_buffer_to_cimg(const std::vector<rgba_pixel>& rgba_buf, cimg_library::CImg<unsigned char>& img) {
const unsigned int img_height = static_cast<unsigned int>(img.height());
const unsigned int img_width = static_cast<unsigned int>(img.width());
const unsigned int number_of_channels = static_cast<unsigned int>(img.spectrum());
const bool has_r_channel = number_of_channels > r_channel_idx;
const bool has_g_channel = number_of_channels > g_channel_idx;
const bool has_b_channel = number_of_channels > b_channel_idx;
const bool has_a_channel = number_of_channels > a_channel_idx;
for (unsigned int y = 0; y < img_height; ++y) {
for (unsigned int x = 0; x < img_width; ++x) {
const std::size_t pixel_idx = static_cast<std::size_t>(img_width) * y + x;
if (has_r_channel) *img.data(x, y, 0, r_channel_idx) = rgba_buf[pixel_idx].r;
if (has_g_channel) *img.data(x, y, 0, g_channel_idx) = rgba_buf[pixel_idx].g;
if (has_b_channel) *img.data(x, y, 0, b_channel_idx) = rgba_buf[pixel_idx].b;
if (has_a_channel) *img.data(x, y, 0, a_channel_idx) = rgba_buf[pixel_idx].a;
}
}
}
And the code to copy the image from the device will look like:
ret = clEnqueueReadImage(command_queue, output_img, CL_TRUE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
copy_rgba_buffer_to_cimg(rgba_buf, img_out);
img_out.save("./output_img.png");
Next, you create the command-queue with default properties. It means that the commands enqueued to the command-queue will be executed in order. Also, you use blocking read and write (blocking_read and blocking_write flags are set to CL_TRUE for clEnqueueReadImage and clEnqueueWriteImage function calls). In this case the code can work without using OpenCL events to synchronize the execution of the commands. It is just necessary to enqueue the commands in the correct order and use blocking read command to get the result:
size_t origins[3] = { 0, 0, 0 };
size_t region[3] = { (size_t)img_in.width(), (size_t)img_in.height(), (size_t)1 };
auto rgba_buf = convert_cimg_to_rgba_buffer(img_in);
ret = clEnqueueWriteImage(command_queue, input_img, CL_FALSE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
size_t global[2] = { (size_t)img_in.width(), (size_t)img_in.height() };
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, NULL, 0, NULL, NULL);
ret = clEnqueueReadImage(command_queue, output_img, CL_TRUE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
copy_rgba_buffer_to_cimg(rgba_buf, img_out);
img_out.save("./output_img.png");
Finally, new y position for the pixel should be calculated as get_image_height() - (gid_y + 1) because gid_y is in interval [0, get_image_height()). So the kernel code should look like:
write_imageui(O, (int2)(gid_x, h - gid_y - 1), p);
Minor note, if you directly copy the image to the device using clEnqueueWriteImage you can omit CL_MEM_USE_HOST_PTR flag for clCreateImage call.

OpenCL.clSetKernelArg returns -51

I tried to make parallel bfs in openCL but I didn't have enough experience with c++.
So this is probably memory error, but I really don't know how to fix it.
I also can't find what does error value -51 means.
As a result I got "Unhandled exception at 0x00007FFCFB06A549 (amdocl64.dll) in my project.exe: 0xC0000005: Access violation reading location 0xFFFFFFFFFFFFFFFF" in next line.
main
Graph G(AdjacencyList, Directed);
int startVertex;
vector<int> distance;
vector<bool> visited;
distance = vector<int>(G.numVertices);
visited = vector<bool>(G.numVertices);
bool done = false;
const bool true_value = true;
int level = 0;
// Allocation on device
const int size = G.numVertices * sizeof(int);
const int adjacencySize = G.adjacencyList.size() * sizeof(int);
//OpenCL
cl_int status;
cl_int ret;
cl_platform_id platform_id;
clGetPlatformIDs(1, &platform_id, NULL);
cl_device_id device_id;
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &status);
cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, NULL, &status);
cl_mem d_adjacencyList = clCreateBuffer(context, CL_MEM_READ_WRITE, adjacencySize, NULL, &status);
cl_mem d_edgesOffset = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
cl_mem d_edgesSize = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
cl_mem d_distance = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
cl_mem d_done = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(bool), NULL, &status);
status = clEnqueueWriteBuffer(command_queue, d_adjacencyList, CL_TRUE, 0, adjacencySize, &G.adjacencyList[0], 0, NULL, NULL);
status = clEnqueueWriteBuffer(command_queue, d_edgesOffset, CL_TRUE, 0, size, &G.edgesOffset[0], 0, NULL, NULL);
status = clEnqueueWriteBuffer(command_queue, d_edgesSize, CL_TRUE, 0, size, &G.edgesSize[0], 0, NULL, NULL);
distance = vector<int>(G.numVertices, INT_MAX);
distance[start] = 0;
status = clEnqueueWriteBuffer(command_queue, d_distance, CL_TRUE, 0, size, distance.data(), 0, NULL, NULL);
char* source_str = NULL;
size_t source_size;
FILE* fp;
fp = fopen("bfs.cl", "r");
if (!fp)
{
cout << "Failed to load Kernel\n";
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &status);
status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "bfs", &status);
status = clSetKernelArg(kernel, 0, sizeof(int), (void*)&G.numVertices);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&d_adjacencyList);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&d_edgesOffset);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&d_edgesOffset);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&d_edgesSize);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&d_distance); //here retirns -51
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&level);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&d_done);
kernel
__kernel void bfs(int n, __global int *adjacencyList,__global int *edgesOffset,__global int *edgesSize,__global int *distance, int level,__global bool *done) {
int tid = get_global_id(0);
if (tid < n) {
if (distance[tid] == level) {
for (int i = edgesOffset[tid]; i < edgesOffset[tid] + edgesSize[tid]; ++i) {
int v = adjacencyList[i];
if (distance[v] == INT_MAX) {
*done = false;
distance[v] = level + 1;
}
}
}
}
}
Hi #Parrison welcome to StackOverflow!
All the OpenCL error codes are defined in cl.h. In the latest (version 3) cl.h you will find the error codes defined between lines 194 and 270, where on line 241 you will find:
#define CL_INVALID_ARG_SIZE -51
So the OpenCL ICD reckons that you have passed the wrong variable size for distance.
However, I can see many other errors before this one. For example, you need to set the size of the OpenCL buffers based on the sizes of OpenCL variable not native variables, e.g.:
cl_int instead of int
cl_float instead of float
and especially cl_bool instead of bool.
There is no guarantee that an OpenCL cl_int is the same size a host int and an OpenCL cl_bool is defined as an unsigned int which is highly unlikely to be the same size as a bool!
Ensure that all the parameters to your OpenCL kernel are defined correctly and that
you are creating the correct buffers and variables for them in the main program.

Euclidean distance using OpenCL

I am trying to compute the euclidean distance of a set of 5D points (pixels) to a 5D single point (center) and store in another result vector, I want to use vector indexing to store all info in a single vector so for the ith pixel, the 5 dimensions are (5i) , (5i+1) , ...
I am new to OpenCL and I just edited a sample code on the internet for my own intentions. The theory is right but the code doesn't show the right answers !
Here is the kernel:
//d_kernel.cl
__kernel void distance_kernel(__global double *pixelInfo,
__global double *clusterCentres,
__global double *distanceFromClusterCentre)
{
int index = get_global_id(0);
int d, dl, da, db, dx, dy;
dl = pixelInfo[5 * index] - clusterCentres[0];
dl = dl * dl;
da = pixelInfo[5 * index + 1] - clusterCentres[1];
da = da * da;
db = pixelInfo[5 * index + 2] - clusterCentres[2];
db = db * db;
dx = pixelInfo[5 * index + 3] - clusterCentres[3];
dx = dx * dx;
dy = pixelInfo[5 * index + 4] - clusterCentres[4];
dy = dy * dy;
distanceFromClusterCentre[index] = dx + dy + dl + da + db;
}
and here is the HOST CODE:
#include <iostream>
#include <CL/cl.h>
#include <vector>
using namespace std;
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, char **argv)
{
// Create the two input vectors
int i;
const int pixelsNumber = 1024;
const int clustersNumber = 1;
std::vector<double> pixelInfo;
pixelInfo.resize(5 * pixelsNumber);
std::fill(pixelInfo.begin(), pixelInfo.end(), 500);
std::vector<double> clusterCentres;
clusterCentres.resize(5 * clustersNumber);
std::fill(clusterCentres.begin(), clusterCentres.end(), 200);
std::vector<double> distanceFromClusterCentre;
distanceFromClusterCentre.resize(pixelsNumber);
std::fill(distanceFromClusterCentre.begin(), distanceFromClusterCentre.end(), 0);
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("d_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * pixelsNumber * sizeof(int), NULL, &ret);
cl_mem clusterCentres_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * clustersNumber * sizeof(int), NULL, &ret);
cl_mem distanceFromClusterCentre_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
pixelsNumber * sizeof(int), NULL, &ret);
// Copy the vectors to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, pixelInfo_mem, CL_TRUE, 0,
5 * pixelsNumber * sizeof(int), pixelInfo.data(), 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, clusterCentres_mem, CL_TRUE, 0,
5 * clustersNumber * sizeof(int), clusterCentres.data(), 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&pixelInfo_mem);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&clusterCentres_mem);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&distanceFromClusterCentre_mem);
// Execute the OpenCL kernel on the list
size_t global_item_size = pixelsNumber; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer result on the device to the local vector result
ret = clEnqueueReadBuffer(command_queue, distanceFromClusterCentre_mem, CL_TRUE, 0,
pixelsNumber * sizeof(int), distanceFromClusterCentre.data(), 0, NULL, NULL);
// Display the result to the screen
for (i = 0; i < pixelsNumber; i++)
{
cout << "Pixel " << i << ": " << distanceFromClusterCentre[i] << endl;
//system("PAUSE");
}
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(pixelInfo_mem);
ret = clReleaseMemObject(clusterCentres_mem);
ret = clReleaseMemObject(distanceFromClusterCentre_mem);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(pixelInfo.data());
free(clusterCentres.data());
free(distanceFromClusterCentre.data());
system("PAUSE");
return 0;
}
and a part of the RESULT is:
.
.
.
Pixel 501: -1.11874e+306
Pixel 502: -1.16263e+306
Pixel 503: -1.07485e+306
Pixel 504: -1.03079e+306
Pixel 505: -9.42843e+305
Pixel 506: -9.86903e+305
Pixel 507: -8.98954e+305
Pixel 508: -9.86903e+305
Pixel 509: -8.98954e+305
Pixel 510: -9.43014e+305
Press any key to continue . . .
Pixel 511: -8.55065e+305
Pixel 512: 0
Pixel 513: 0
Pixel 514: 0
Pixel 515: 0
Pixel 516: 0
Pixel 517: 0
Pixel 518: 0
Pixel 519: 0
Pixel 520: 0
.
.
.
after index 511 the rest of the vector is zero !
You created your vectors of double's and then you treat them as there were ints (created buffer for ints, writing data to int buffers and reading back results as there were ints). To avoid such mistakes you could write your code this way:
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, pixelInfo.size() * sizeof(pixelInfo[0]), NULL, &ret);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Not all work-items being used opencl

so I'm able to compile and execute my kernel, the problem is that only two work-items are being used. I'm basically trying to fill up a float array[8] with {0,1,2,3,4,5,6,7}. So this is a very simple hello world application. Bellow is my kernel.
// Highly simplified to demonstrate
__kernel void rnd_float32_matrix (
__global float * res
) {
uint idx = get_global_id(0);
res[idx] = idx;
}
I then create and execute the kernel with the following code...
// Some more code
cl::Program program(context, sources, &err);
program.build(devices, NULL, NULL, NULL);
cl::Kernel kernel(program, "rnd_float32_matrix", &err);
kernel.setArg(0, src_d);
cl::CommandQueue queue(context, devices[0], 0, &err);
cl::Event event;
err = queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange(8),
// I've tried cl::NDRange(8) as well
cl::NDRange(1),
NULL,
&event
);
event.wait();
err = queue.enqueueReadBuffer(
// This is:
// cl::Buffer src_d(
// context,
// CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
// mem_size,
// src_h,
// &err);
src_d,
CL_TRUE,
0,
8,
// This is float * src_h = new float[8];
src_h);
for(int i = 0; i < 8; i ++) {
std::cout << src_h[i] << std::endl;
}
I may not show it in the code, but I also do select a gpu device and using context.getInfo(..) it shows I'm using my NVidia GTX 770M card which shows 1024, 1024, 64 work-items available in dimensions 0, 1 and 2. When this array prints I keep getting... 0, 1, 0, 0, 0, 0, 0, 0. I've also tried setting res[idx] = 5, and I get... 5, 5, 0, 0, 0, 0, 0, 0. So it seems that only two give work-items are actually being used. What am I doing wrong?
Your command to read the data back from the device is only reading 8 bytes, which is two floats:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8, // <- This is the number of bytes, not the number of elements!
// This is float * src_h = new float[8];
src_h);
To read 8 floats, you would need to do this:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8 * sizeof(cl_float),
// This is float * src_h = new float[8];
src_h);

Opencl kernel runs on Intel platform but not on Nvidia

I am implementing a ray tracer using openCL. I have installed NVidia's CUDA sdk and everything seems to be set up fine, both my platforms are detected (Intel's and Nvidia's), and each one sees its devices (intel has HD Graphics 4000 and Nvidia has my GPU: GeForce GT 630M).
My problem is that I am able to run my application using the Intel platform but not using Nvidia's platform. I don't believe the problem is in my code, but here is my device code:
#include "constants.h" //only a couple of #define
typedef struct Sphere {
float x, y, z;
float radius;
float r, g, b;
}Sphere;
float hit(Sphere s, float ox, float oy, float *n) {
float radius = s.radius;
float dx = ox - s.x;
float dy = oy - s.y;
if (dx*dx + dy*dy < radius*radius) {
float dz = sqrt(radius*radius - dx*dx - dy*dy);
*n = dz / sqrt(radius * radius);
return dz + s.z;
}
return -INF;
}
__kernel void rayTracer(__global Sphere* spheres, write_only image2d_t res) {
// Get the index of the current element to be processed
int x = get_global_id(0);
int y = get_global_id(1);
int ox = x - WIDTH / 2;
int oy = y - HEIGHT / 2;
float r = 0, g = 0, b = 0;
float maxz = (float) -INF;
for (int i = 0; i<NUM_SPHERES; i++)
{
float n;
float t = hit(spheres[i], ox, oy, &n);
if (t > maxz)
{
float fscale = 1;
r = spheres[i].r * fscale;
g = spheres[i].g * fscale;
b = spheres[i].b * fscale;
}
}
write_imagei(res, (int2)(x, y), (int4)(r, g, b, 0));
}
My host application is also straightforward. I simply initialize openCL structures, setup the data and then read it back.
Again, when using the Intel platform my application runs fine and I can see the raytraced image. When using Nvidia's, although the API error codes are always 0, no result is displayed.
Does anybody have any Ideas what might be the problem?
Thanks in advance
---EDIT---
Here are some pieces of host code
Setting up OpenCL structures:
//Setup OpenCL
cl_platform_id platform = getPlatforms();
cl_device_id device = getDevices(platform, CL_DEVICE_TYPE_GPU);
cl_context_properties ctxProps[] =
{
CL_CONTEXT_PLATFORM, (cl_context_properties)platform,
0, 0
};
cl_context ctx = clCreateContext(ctxProps, 1, &device, NULL, NULL, &err);
cl_command_queue queue1 = clCreateCommandQueue(ctx, device, NULL, &err);
GetPlatforms and GetDevices are functions that asks the user to chose a platform and device
Creating the program and building it:
cl_program prog = clCreateProgramWithSource(ctx, 1, srcs, &srcSize, &err);
err = clBuildProgram(prog, 1, &device, NULL, NULL, NULL);
if (err < 0)
{
//PRINT BUILD ERROR
size_t log_size;
clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
char* log = (char*)calloc(log_size + 1, sizeof(char));
clGetProgramBuildInfo(prog, device, CL_PROGRAM_BUILD_LOG, log_size + 1, log, NULL);
printf("%s/n", log);
free(log);
std::cin >> err;
return 1;
}
cl_kernel krn = clCreateKernel(prog, "rayTracer", &err);
//....CREATE SOME SPHERES...
//Setup device data
cl_image_format fmt;
fmt.image_channel_order = CL_RGBA;
fmt.image_channel_data_type = CL_UNSIGNED_INT8;
cl_mem spheresBuff = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, spheres.size() * sizeof(Sphere), spheres.data(), &err);
cl_mem resBuff = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, WIDTH, HEIGHT, 0, NULL, &err);
//Setup kernel arguments
err = clSetKernelArg(krn, 0, sizeof(cl_mem), (void*)&spheresBuff);
err = clSetKernelArg(krn, 1, sizeof(cl_mem), (void*)&resBuff);
//Run kernel
size_t gSize[] = { WIDTH, HEIGHT };
err = clEnqueueNDRangeKernel(queue1, krn, 2, NULL, gSize, NULL, 0, NULL, NULL);
//Read result
Image img = createRGBAImage(WIDTH, HEIGHT);
size_t origin[] = { 0, 0, 0 };
size_t region[] = { WIDTH , HEIGHT , 1 };
err = clEnqueueReadImage(queue1, resBuff, CL_TRUE, origin, region, 0, 0, img.pixel.data(), 0, NULL, NULL);
Try to use clEnqueueMapBuffer with CL_MAP_READ prior to kernel execution and clEnqueueUnmapMemObject after kernel execution for your spheresBuff.