I faced with the problem that the kernel writes data in wrong place or host reads data incorrectly sometimes. I write the same data (index at which I write the data) to two global arrays with different types. To ensure that the index is corrent are used the global counter which incremented by means of atom_inc. The problem occures when data are read from second array on the host.
For instance:
.....
output array index: 442: (output1 value:442.0000 output2 value:442)
output array index: 443: (output1 value:443.0000 output2 value:443)
output array index: 444: (output1 value:444.0000 output2 value:444)
output array index: 445: (output1 value:445.0000 output2 value:445)
output array index: 446: (output1 value:446.0000 output2 value:1152892928)
output array index: 447: (output1 value:447.0000 output2 value:447)
output array index: 448: (output1 value:448.0000 output2 value:1152909312)
output array index: 449: (output1 value:449.0000 output2 value:1152917504)
output array index: 450: (output1 value:450.0000 output2 value:1152925696)
......
As you can see at indicies 446, 448, 449 and 450+ output2 contains wrong values. What can be the possible reason of this?
Device: ATI Radeon HD5750
Code sample:
#include <stdio.h>
#include <math.h>
#include <OpenCL/OpenCL.h>
// wtf example
const char *programSource =
"__kernel void kernel1(__global uint *counter,\n" \
"__global float *weights,\n" \
"__global uint *weights_pos)\n" \
"{\n"\
"const uint global_size = get_global_size(0);\n" \
"const uint global_id = get_global_id(0);\n" \
"uint local_id = get_local_id(0);\n" \
"if(global_id == 0) {\n" \
"counter[5] = 0; // set index of pos in weights to zero\n" \
"}\n" \
"uint insert_index = atom_inc(&counter[5]);\n" \
"weights[insert_index] = insert_index;\n" \
"weights_pos[insert_index] = insert_index;\n" \
"}";
void art_process_sinogram(const char* tiff_filename,
const float *angles2,
const unsigned int n_angles2,
const unsigned int n_ray2s,
const float distanc2e)
{
/******************************
* OPENCL ENVIRONMENT
*/
cl_int status;
cl_uint numPlatforms = 0;
cl_platform_id *platforms = NULL;
cl_device_id device_id;
//discover platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
//discover devices
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
device_id = devices[1];
//create context
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel kernel_weights = clCreateKernel(program, "kernel1", &status);
//create queue
cl_command_queue command_queue1 = clCreateCommandQueue(context, device_id, 0, &status);
/******************************
* HARDWARE PARAMETERS
*/
cl_uint wavefronts_per_SIMD = 7;
size_t global_work_size;
size_t local_work_size = 64;
cl_uint max_compute_units;
clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);
size_t wg_count = max_compute_units * wavefronts_per_SIMD;
global_work_size = wg_count * local_work_size;
/**************************** DATA PART *************************************/
size_t w_portion_size = 768 * sizeof(cl_float);
size_t w_pos_portion_size = 768 * sizeof(cl_uint);
size_t counters_data_size = 6 * sizeof(cl_uint);
cl_uint counters_data[6];
counters_data[0] = 1;
counters_data[1] = 2; // max number of the cells intersected by the ray
counters_data[2] = 3;
counters_data[3] = 4;
counters_data[4] = 5; // same to the number of rays
counters_data[5] = 0; // counter inside kernel
/*****************
* Main buffers
*/
cl_mem weights1_buffer = clCreateBuffer(context,
CL_MEM_READ_WRITE,
w_portion_size,
NULL,
NULL);
cl_mem weights_pos1_buffer = clCreateBuffer(context,
CL_MEM_READ_WRITE,
w_pos_portion_size,
NULL,
NULL);
/*****************
* Supplement buffers (constant)
*/
cl_mem counters_data_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY,
counters_data_size,
NULL,
&status);
cl_event supplement_buffer_ready[1];
status = clEnqueueWriteBuffer(command_queue1,
counters_data_buffer,
CL_FALSE,
0,
counters_data_size,
counters_data,
0,
NULL,
&supplement_buffer_ready[0]);
status = clSetKernelArg(kernel_weights, 0, sizeof(void *), (void *)&counters_data_buffer);
status = clSetKernelArg(kernel_weights, 1, sizeof(void *), (void *)&weights1_buffer);
status = clSetKernelArg(kernel_weights, 2, sizeof(void *), (void *)&weights_pos1_buffer);
status = clEnqueueNDRangeKernel(command_queue1,
kernel_weights,
1, // work dimensional 1D, 2D, 3D
NULL, // offset
&global_work_size, // total number of WI
&local_work_size, // nomber of WI in WG
1, // num events in wait list
supplement_buffer_ready, // event wait list
NULL); // event
clFinish(command_queue1);
cl_float *output1 = (cl_float *) clEnqueueMapBuffer(command_queue1,
weights1_buffer,//*pmain_weights_buffer,
CL_TRUE,
CL_MAP_READ,
0,
w_portion_size,
0, NULL, NULL, NULL);
cl_uint *output2 = malloc(w_portion_size);
status = clEnqueueReadBuffer(command_queue1, weights_pos1_buffer,
CL_TRUE, 0, w_pos_portion_size, output2,
0, NULL, NULL);
clFinish(command_queue1);
for(int i = 0; i < 790; ++i) {
printf("output array index: %d: (output1 value:%.4f \t output2 value:%d) \n", i, output1[i], output2[i]);
}
}
SOLUTION:
The kernel should be looks like (need checking index):
__kernel void k_1(__global uint *counter,
__global uint *weights,
__global uint2 *weights_pos)
{
const uint global_size = get_global_size(0);
const uint global_id = get_global_id(0);
uint local_id = get_local_id(0);
uint insert_index = atom_inc(&counter[5]);
if(insert_index < 768) {
weights[insert_index]= insert_index;
weights_pos[insert_index].x = insert_index;
weights_pos[insert_index].y = insert_index;
}
}
You are messing up with buffer dimensions.
1) Your buffers contains 768 elements each (see initialization of w_portion_size and w_pos_portion_size)
2) Workgroup size on my machine is 896 (see initialization of wg_count)
3) You print out 790 values.
Apart from this, one conceptual error is here:
if(global_id == 0) {
counter[5] = 0; // set index of pos in weights to zero
}
//atomic increments on counter[5]
You can't assume that the first virtual processor will execute this line before the others. You should completely remove this line, since you initialize counter[5] on the host side. (I believe that this is the cause of your problem, but I can't reproduce that).
After fixing these problems your code seems to run fine (intel implementation).
The kernel should be looks like (need checking index):
__kernel void k_1(__global uint *counter,
__global uint *weights,
__global uint2 *weights_pos)
{
const uint global_size = get_global_size(0);
const uint global_id = get_global_id(0);
uint local_id = get_local_id(0);
uint insert_index = atom_inc(&counter[5]);
if(insert_index < 768) {
weights[insert_index]= insert_index;
weights_pos[insert_index].x = insert_index;
weights_pos[insert_index].y = insert_index;
}
}
Related
I'm writing a simple OpenCL program in C++ where i need to flip an input image upside-down, i'm using CImg to read and write image files.
the problem is that even though the program compiles and run without any error, the output file is blank.
Here's the cl kernel code:
const sampler_t sampler = CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
__kernel void img_turn(
read_only image2d_t I,
write_only image2d_t O
)
{
int gid_x = get_global_id(0);
int gid_y = get_global_id(1);
int w = get_image_width(I);
int h = get_image_height(I);
if (gid_x >= w || gid_y >= h)
return;
uint4 p = read_imageui(I, sampler, (int2)(gid_x, gid_y));
write_imageui(O, (int2)(gid_x, h - gid_y), p);
}
and here's bits of the host code, first the input image (Edited):
CImg<unsigned char> img_in(img_file_name);
cl_image_format format = {
CL_RGBA,
CL_UNSIGNED_INT8,
};
cl_image_desc desc = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = (size_t) img_in.width(),
.image_height = (size_t) img_in.height(),
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
.buffer = NULL,
};
cl_mem input_img = clCreateImage(
context,
CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
(const cl_image_format *) &format,
(const cl_image_desc *) &desc,
img_in.data(),
&errNum
);
the definition of the output image (Edited):
CImg<unsigned char> img_out(img_in.width(), img_in.height(), 1, 4);
format = {
CL_RGBA,
CL_UNSIGNED_INT8,
};
desc = {
.image_type = CL_MEM_OBJECT_IMAGE2D,
.image_width = (size_t) img_out.width(),
.image_height = (size_t) img_out.height(),
.image_row_pitch = 0,
.image_slice_pitch = 0,
.num_mip_levels = 0,
.num_samples = 0,
.buffer = NULL,
};
cl_mem output_img = clCreateImage(
context,
CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
(const cl_image_format *) &format,
(const cl_image_desc *) &desc,
img_out.data(),
NULL
);
and the last part of the code, where i enqueue the images and run the program (Edited):
size_t origins[3] = {0, 0, 0};
size_t region_in[3] = {(size_t) img_in.width(), (size_t) img_in.height(), (size_t) 1};
errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), input_img);
errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), output_img);
size_t global[2] = {(size_t) img_in.width(), (size_t) img_in.height()};
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, NULL, 0, NULL, &kernel_event);
errNum = clEnqueueWriteImage(command_queue, input_img, CL_TRUE, origins, region_in, 0, 0, img_in.data(), 0, NULL, NULL);
size_t region_out[3] = {(size_t) img_out.width(), (size_t) img_out.height(), (size_t) 1};
errNum = clEnqueueReadImage(command_queue, output_img, CL_TRUE, origins, region_out, 0, 0, img_out.data(), 0, NULL, NULL);
clWaitForEvents(1, &kernel_event);
img_out.save("./output_img.png");
after compiling and running the program the 'output_img.png' image file is created but it's blank: 0Bytes and no data whatsoever when opened with a text editor.
Edit:
So after PeterT's suggestion (and after some corrections of some dumb mistakes i made), the program now seems to be doing something (it executes for 3 seconds), but still produces nothing.
Edit 2:
After a bit of debugging, i pinpointed the problem: clEnqueueReadImage returns the error CL_INVALID_VALUE, and the documentation specifies that it returns that error if the region being read specified by origin and region is out of bounds ...
But i don't know why. It's the same size of the input image, but clEnqueueWriteImage doesn't return any error, even if called with the same parameters.
Edit 3:
The problem has been fixed by Egor's response. But now it doesn't output the wanted result:
Input image:
Output image:
First, you create OpenCL image object using CL_RGBA format and pass the pointer to CImg pixel data. But CImg uses "planar" structure to keep the data and the values for color channels are not interleaved (for more information please see How pixel data are stored with CImg?). For example, colored image with alpha channel will be stored in memory as:
R1R2R3...B1B2B3...G1G2G3...A1A2A3...
But CL_RGBA format implies the interleaved channels for the image: R1G1B1A1R2G2B2A2R3G3B3A3.... Therefore, it is necessary to convert the image to CL_RGBA format before copying it to the device memory. For example, using following function:
struct rgba_pixel {
unsigned char r;
unsigned char g;
unsigned char b;
unsigned char a;
};
constexpr unsigned int r_channel_idx = 0;
constexpr unsigned int g_channel_idx = 1;
constexpr unsigned int b_channel_idx = 2;
constexpr unsigned int a_channel_idx = 3;
std::vector<rgba_pixel>
convert_cimg_to_rgba_buffer(const cimg_library::CImg<unsigned char>& img) {
const unsigned int img_height = static_cast<unsigned int>(img.height());
const unsigned int img_width = static_cast<unsigned int>(img.width());
const unsigned int number_of_channels = static_cast<unsigned int>(img.spectrum());
const bool has_r_channel = number_of_channels > r_channel_idx;
const bool has_g_channel = number_of_channels > g_channel_idx;
const bool has_b_channel = number_of_channels > b_channel_idx;
const bool has_a_channel = number_of_channels > a_channel_idx;
std::vector<rgba_pixel> rgba_buf(static_cast<std::size_t>(img_width) * img_height);
for (unsigned int y = 0; y < img_height; ++y) {
for (unsigned int x = 0; x < img_width; ++x) {
const std::size_t pixel_idx = static_cast<std::size_t>(img_width) * y + x;
rgba_buf[pixel_idx].r = has_r_channel ? *img.data(x, y, 0, r_channel_idx) : 0;
rgba_buf[pixel_idx].g = has_g_channel ? *img.data(x, y, 0, g_channel_idx) : 0;
rgba_buf[pixel_idx].b = has_b_channel ? *img.data(x, y, 0, b_channel_idx) : 0;
rgba_buf[pixel_idx].a = has_a_channel ? *img.data(x, y, 0, a_channel_idx) : UCHAR_MAX;
}
}
return rgba_buf;
}
So the code to copy the image to the device will look like:
size_t origins[3] = { 0, 0, 0 };
size_t region[3] = { (size_t)img_in.width(), (size_t)img_in.height(), (size_t)1 };
auto rgba_buf = convert_cimg_to_rgba_buffer(img_in);
ret = clEnqueueWriteImage(command_queue, input_img, CL_TRUE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
Also, it will be necessary to convert the output image before saving it. For example using following function:
void
copy_rgba_buffer_to_cimg(const std::vector<rgba_pixel>& rgba_buf, cimg_library::CImg<unsigned char>& img) {
const unsigned int img_height = static_cast<unsigned int>(img.height());
const unsigned int img_width = static_cast<unsigned int>(img.width());
const unsigned int number_of_channels = static_cast<unsigned int>(img.spectrum());
const bool has_r_channel = number_of_channels > r_channel_idx;
const bool has_g_channel = number_of_channels > g_channel_idx;
const bool has_b_channel = number_of_channels > b_channel_idx;
const bool has_a_channel = number_of_channels > a_channel_idx;
for (unsigned int y = 0; y < img_height; ++y) {
for (unsigned int x = 0; x < img_width; ++x) {
const std::size_t pixel_idx = static_cast<std::size_t>(img_width) * y + x;
if (has_r_channel) *img.data(x, y, 0, r_channel_idx) = rgba_buf[pixel_idx].r;
if (has_g_channel) *img.data(x, y, 0, g_channel_idx) = rgba_buf[pixel_idx].g;
if (has_b_channel) *img.data(x, y, 0, b_channel_idx) = rgba_buf[pixel_idx].b;
if (has_a_channel) *img.data(x, y, 0, a_channel_idx) = rgba_buf[pixel_idx].a;
}
}
}
And the code to copy the image from the device will look like:
ret = clEnqueueReadImage(command_queue, output_img, CL_TRUE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
copy_rgba_buffer_to_cimg(rgba_buf, img_out);
img_out.save("./output_img.png");
Next, you create the command-queue with default properties. It means that the commands enqueued to the command-queue will be executed in order. Also, you use blocking read and write (blocking_read and blocking_write flags are set to CL_TRUE for clEnqueueReadImage and clEnqueueWriteImage function calls). In this case the code can work without using OpenCL events to synchronize the execution of the commands. It is just necessary to enqueue the commands in the correct order and use blocking read command to get the result:
size_t origins[3] = { 0, 0, 0 };
size_t region[3] = { (size_t)img_in.width(), (size_t)img_in.height(), (size_t)1 };
auto rgba_buf = convert_cimg_to_rgba_buffer(img_in);
ret = clEnqueueWriteImage(command_queue, input_img, CL_FALSE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
size_t global[2] = { (size_t)img_in.width(), (size_t)img_in.height() };
clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global, NULL, 0, NULL, NULL);
ret = clEnqueueReadImage(command_queue, output_img, CL_TRUE, origins, region, 0, 0, rgba_buf.data(), 0, NULL, NULL);
copy_rgba_buffer_to_cimg(rgba_buf, img_out);
img_out.save("./output_img.png");
Finally, new y position for the pixel should be calculated as get_image_height() - (gid_y + 1) because gid_y is in interval [0, get_image_height()). So the kernel code should look like:
write_imageui(O, (int2)(gid_x, h - gid_y - 1), p);
Minor note, if you directly copy the image to the device using clEnqueueWriteImage you can omit CL_MEM_USE_HOST_PTR flag for clCreateImage call.
https://i.stack.imgur.com/TA9v6.png
I have been trying to get to compile a kernel that assigns certain indices to an std::vector using OpenCL through clEnqueueReadBuffer function but it does not seem to work correctly since the first result is the only assigned in the std::vector
the source code for the host in c++ is the following:
cl_mem originalPixelsBuffer = clCreateBuffer(p1.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(Color) * imageObj->SourceLength(), source, &p1.status);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to Create buffer 0");
cl_mem targetBuffer = clCreateBuffer(p1.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(Color) * imageObj->OutputLength(), target, &p1.status);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to Create buffer 1");
//write buffers
p1.status = clEnqueueWriteBuffer(p1.commandQueue, originalPixelsBuffer, CL_FALSE, 0, sizeof(Color) * imageObj->SourceLength(), source, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 0");
p1.status = clEnqueueWriteBuffer(p1.commandQueue, targetBuffer, CL_TRUE, 0, sizeof(Color) * imageObj->OutputLength(), target, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 1");
size_t globalWorkSize[2] = { imageObj->originalWidth * 4, imageObj->originalHeight * 4 };
size_t localWorkSize[2]{ 64,64 };
SetLocalWorkSize(IsDivisibleBy64(localWorkSize[0]), localWorkSize);
//execute kernel
p1.status = clEnqueueNDRangeKernel(p1.commandQueue, Kernel, 1, NULL, globalWorkSize, IsDisibibleByLocalWorkSize(globalWorkSize, localWorkSize) ? localWorkSize : NULL, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to clEnqueueDRangeKernel");
//read buffer
p1.status = clEnqueueReadBuffer(p1.commandQueue, targetBuffer, CL_TRUE, 0, sizeof(Color) * imageObj->OutputLength(), target, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 1");
the kernel code:
__kernel void interp(__global struct Color* source,__global struct Color* target,uint64 width,uint64 height,uint64 ratio,uint64 limit, uint64 originalHeight)
{
__private fp32 wIndex = (int64)get_global_id(0);
__private fp32 hIndex = (int64)get_global_id(1);
if(((int64)wIndex)%ratio==MATCH && ((int64)hIndex)%ratio ==MATCH)
{
__private int64 Index = (wIndex/ratio) * (originalHeight/ratio) + (hIndex/ratio);
if(Index < limit)
{
__private int64 tIndex = wIndex * height + hIndex;
target[tIndex].R = source[Index].R;
target[tIndex].G = source[Index].G;
target[tIndex].B = source[Index].B;
target[tIndex].A = source[Index].A;
}
}
}```
I am a beginner at OpenCL. I tried to run a very simple kernel code, adding 1 to each value of vector. Everything runs fine, returns no error code (I checked return value after each step). The source Code :
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobj , resobj = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
size_t work_units_per_kernels;
int input[10] = {1,2,3,4,5,6,7,8,9,10};
int output[10];
int length = 10 ;
FILE *fp;
char fileName[] = "/home/tuan/OpenCLPlayaround/hello.cl";
char *source_str;
size_t source_size;
/* Load the source code containing the kernel*/
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(0x100000);
source_size = fread(source_str,1,0x100000, fp);
fclose(fp);
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
std::cout<<ret<<" code"<<std::endl;
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
std::cout<<ret<<" code"<<std::endl;
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
std::cout<<ret<<" code"<<std::endl;
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
//Check Concept of memory
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,length * sizeof(int), input, &ret);
resobj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length * sizeof(int), output, &ret);
std::cout<<ret<<" code"<<std::endl;
program = clCreateProgramWithSource(context,1,(const char**)&source_str, (const size_t*)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "hello", &ret);
ret = clSetKernelArg(kernel,0, sizeof(memobj),(void *)&memobj);
ret = clSetKernelArg(kernel,1, sizeof(resobj),(void *)&resobj);
ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
ret = clEnqueueReadBuffer(command_queue, resobj, CL_TRUE, 0, length* sizeof(int),output, 0, NULL, NULL);
for (int i = 0 ; i <10 ; i++) {
std::cout<<output[i]<<" "<<std::endl;
}
return 0;
The result is somewhat bizarre, while it should be {2,3,4,5,6,7,8,9,10,11} :
2
-16777216
65535
1
-1242789408
32767
4201449
0
2
0
And my kernel :
__kernel void hello(__global int* a, __global int* b)
{
int sam = 0;
int gid = get_global_id(0);
b[gid] = sam + a[gid] +1 ;
}
Can somebody explain why ? Its bursting my head for hours !
clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with work_dim = 1, global_work_offset = NULL, global_work_size[0] set to 1, and local_work_size[0] set to 1.
so use clEnqueueNDRangeKernel.
I am trying to compute the euclidean distance of a set of 5D points (pixels) to a 5D single point (center) and store in another result vector, I want to use vector indexing to store all info in a single vector so for the ith pixel, the 5 dimensions are (5i) , (5i+1) , ...
I am new to OpenCL and I just edited a sample code on the internet for my own intentions. The theory is right but the code doesn't show the right answers !
Here is the kernel:
//d_kernel.cl
__kernel void distance_kernel(__global double *pixelInfo,
__global double *clusterCentres,
__global double *distanceFromClusterCentre)
{
int index = get_global_id(0);
int d, dl, da, db, dx, dy;
dl = pixelInfo[5 * index] - clusterCentres[0];
dl = dl * dl;
da = pixelInfo[5 * index + 1] - clusterCentres[1];
da = da * da;
db = pixelInfo[5 * index + 2] - clusterCentres[2];
db = db * db;
dx = pixelInfo[5 * index + 3] - clusterCentres[3];
dx = dx * dx;
dy = pixelInfo[5 * index + 4] - clusterCentres[4];
dy = dy * dy;
distanceFromClusterCentre[index] = dx + dy + dl + da + db;
}
and here is the HOST CODE:
#include <iostream>
#include <CL/cl.h>
#include <vector>
using namespace std;
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, char **argv)
{
// Create the two input vectors
int i;
const int pixelsNumber = 1024;
const int clustersNumber = 1;
std::vector<double> pixelInfo;
pixelInfo.resize(5 * pixelsNumber);
std::fill(pixelInfo.begin(), pixelInfo.end(), 500);
std::vector<double> clusterCentres;
clusterCentres.resize(5 * clustersNumber);
std::fill(clusterCentres.begin(), clusterCentres.end(), 200);
std::vector<double> distanceFromClusterCentre;
distanceFromClusterCentre.resize(pixelsNumber);
std::fill(distanceFromClusterCentre.begin(), distanceFromClusterCentre.end(), 0);
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("d_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * pixelsNumber * sizeof(int), NULL, &ret);
cl_mem clusterCentres_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * clustersNumber * sizeof(int), NULL, &ret);
cl_mem distanceFromClusterCentre_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
pixelsNumber * sizeof(int), NULL, &ret);
// Copy the vectors to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, pixelInfo_mem, CL_TRUE, 0,
5 * pixelsNumber * sizeof(int), pixelInfo.data(), 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, clusterCentres_mem, CL_TRUE, 0,
5 * clustersNumber * sizeof(int), clusterCentres.data(), 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&pixelInfo_mem);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&clusterCentres_mem);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&distanceFromClusterCentre_mem);
// Execute the OpenCL kernel on the list
size_t global_item_size = pixelsNumber; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer result on the device to the local vector result
ret = clEnqueueReadBuffer(command_queue, distanceFromClusterCentre_mem, CL_TRUE, 0,
pixelsNumber * sizeof(int), distanceFromClusterCentre.data(), 0, NULL, NULL);
// Display the result to the screen
for (i = 0; i < pixelsNumber; i++)
{
cout << "Pixel " << i << ": " << distanceFromClusterCentre[i] << endl;
//system("PAUSE");
}
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(pixelInfo_mem);
ret = clReleaseMemObject(clusterCentres_mem);
ret = clReleaseMemObject(distanceFromClusterCentre_mem);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(pixelInfo.data());
free(clusterCentres.data());
free(distanceFromClusterCentre.data());
system("PAUSE");
return 0;
}
and a part of the RESULT is:
.
.
.
Pixel 501: -1.11874e+306
Pixel 502: -1.16263e+306
Pixel 503: -1.07485e+306
Pixel 504: -1.03079e+306
Pixel 505: -9.42843e+305
Pixel 506: -9.86903e+305
Pixel 507: -8.98954e+305
Pixel 508: -9.86903e+305
Pixel 509: -8.98954e+305
Pixel 510: -9.43014e+305
Press any key to continue . . .
Pixel 511: -8.55065e+305
Pixel 512: 0
Pixel 513: 0
Pixel 514: 0
Pixel 515: 0
Pixel 516: 0
Pixel 517: 0
Pixel 518: 0
Pixel 519: 0
Pixel 520: 0
.
.
.
after index 511 the rest of the vector is zero !
You created your vectors of double's and then you treat them as there were ints (created buffer for ints, writing data to int buffers and reading back results as there were ints). To avoid such mistakes you could write your code this way:
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, pixelInfo.size() * sizeof(pixelInfo[0]), NULL, &ret);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Hello when I used Mac OS + OpenCL Framework this code worked properly, but when OS was changed to openSUSE 11.4 + (OpenCL implementation from AMD) the code become throw such error. It seems typedef float clfft_complex[2]; makes this error. What you can say about that?
Error:
Err: "/tmp/OCLRS2tPp.cl", line 4: error: kernel pointer arguments must point to
addrSpace global, local, or constant
__kernel void linear_interp(__global clfft_complex *input,
^
1 error detected in the compilation of "/tmp/OCLRS2tPp.cl".
Internal error: clc compiler invocation failed.
Kernel code:
typedef float clfft_complex[2];
__kernel void linear_interp(__global clfft_complex *input,
__global clfft_complex *output)
{
int global_id = get_global_id(0);
input[global_id][0] = 1.5f;
input[global_id][1] = 5.5f;
}
Host code:
//////////////////////////////////
/* Preparing OpenCL Environment */
//////////////////////////////////
cl_uint cl_platformsN = 0;
cl_platform_id *cl_platformIDs = NULL;
clGetPlatformIDs (0, NULL, &cl_platformsN);
cl_platformIDs = (cl_platform_id*)malloc( cl_platformsN * sizeof(cl_platform_id));
clGetPlatformIDs(cl_platformsN, cl_platformIDs, NULL);
cl_int status = CL_SUCCESS;
cl_device_id device; // Compute device
cl_context context; // Compute context
CL_CHECK_ERROR(clGetDeviceIDs(cl_platformIDs[0], DEVICE_TYPE, 1, &device, NULL));
context = clCreateContext(NULL, 1, &device, NULL, NULL, &status);
////////////
/* Device */
////////////
cl_uint wavefronts_per_SIMD = 7;
cl_int device_max_cu;
size_t wg_count;
size_t global_work_size;
#if DEVICE_TYPE == CL_DEVICE_TYPE_GPU
size_t local_work_size = 64;
#else
size_t local_work_size = 1;
#endif
// Get info about the compute units on the device
CL_CHECK_ERROR(clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &device_max_cu, NULL));
wg_count = device_max_cu * wavefronts_per_SIMD;
global_work_size = wg_count * local_work_size;
/////////////////////
/* Input Data Part */
/////////////////////
/* Input a slice properties */
int bits_per_sample;
int samples_per_pixel;
int theta_size;
int slice_size;
/* Read the slice */
clfft_complex *data_tiff = tiff_read_complex(tiff_input,
&bits_per_sample,
&samples_per_pixel,
&slice_size,
&theta_size);
////////////////////////
/* OpenCL - DFI Part */
////////////////////////
/* Sync events */
const int events_num = 5;
cl_event event_list[events_num];
/* Command Queue */
cl_command_queue command_queue = clCreateCommandQueue(context, device, 0, &status);
/* Program */
const char* programSource = load_program_source(KERNELS_FILE_PATH);
if(programSource == NULL) {
fprintf(stderr, "Programm '%s' can not be created. File was not found.", KERNELS_FILE_PATH);
return;
}
cl_program program = clCreateProgramWithSource(context, 1,
(const char**)&programSource, NULL,
&status);
status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
size_t paramValueSize = 1024 * 1024, param_value_size_ret;
char *paramValue;
paramValue = (char*)calloc(paramValueSize, sizeof(char));
status = clGetProgramBuildInfo( program,
device,
CL_PROGRAM_BUILD_LOG,
paramValueSize,
paramValue,
¶m_value_size_ret);
printf("Err: %s", paramValue);
char buf[0x10000];
clGetProgramBuildInfo(program,
device,
CL_PROGRAM_BUILD_LOG,
0x10000,
buf,
NULL);
if(status != CL_SUCCESS) {
fprintf(stderr, "Programm '%s' can not be build. (%s)", KERNELS_FILE_PATH, opencl_map_error(status));
return;
}
/* Kernels */
cl_kernel kernel_linear_interp = clCreateKernel(program, "linear_interp", &status);
First, I don't know why this code worked, but assuming that your input is a kernel pointer argument (cl_mem) with a specific memory space in global, then i think you cannot just force it to have another dimensional array of size 2, giving __global *input[2] as the argument, because you've already set the type of argument before calling the kernel. (btw where is your clSetKernelArg()?)
Second, why are you doing this to your input?
input[global_id][0] = 1.5f;
input[global_id][1] = 5.5f;
Because input memory space often should only be read-only.. or perhaps that kernel is just a piece of your kernel?
Anyway, i'm not sure what you're doing with that kernel, so:
If it means you just want a constant float[2] variable which applies
to all inputs, then you can just declare
__constant float var[2] = {1.5f, 5.5f};
If what you meant by input is actually your output, and you want
to write two floating points in a single workitem, then you can
change the type to float2, or by doing:
vstore2((float2)(1.5f,5.5f), 0, input[global_id]);
but don't forget to divide the local workitems by 2..