OpenCL: array of arrays of variable lengths - c++

I am trying to process an array of arrays of variable lengths with OpenCL 1.2 in C++. In each instance (work-item?) I want to process one sub-array.
Below I've tried to treat the array of arrays as a 1D array, but it does not work - random parts of the data are not processed.
// Host-side setup for the test case: take the first platform and its first GPU device.
vector<cl::Platform> platforms; cl::Platform::get(&platforms); _ASSERT(platforms.size() > 0); auto platform = platforms.front(); //get the platform
std::vector<cl::Device> devices; platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); _ASSERT(devices.size() > 0); auto device = devices.front(); // get the device
// Read the whole kernel file into a string; the file name is empty in this listing.
std::ifstream myFile(""); string src(istreambuf_iterator<char>(myFile), (istreambuf_iterator<char>())); cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1)); //create program from cl file
cl::Context context(device);
cl::Program program(context, sources);
// NOTE(review): the right-hand side of "auto err =" is missing in this listing
// (presumably the program build call) - confirm against the original post.
auto err =; if (err!=0) printf("%s\n",program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str() );
cl::CommandQueue queue(context, device);
// lens[i] is the length of sub-array i; idx[i] receives the offset of sub-array i
// inside the flattened array dat; totSize is the sum of all lengths.
int lens[10] = { 5,7,9,6,21,12,4,18,15,10 }, *idx=new int[10], totSize=0, c=0;
for (int i = 0; i < 10; i++) totSize += lens[i];
double *dat = new double[totSize], **myDat = new double *[10]; // array of arrays of different lengths
// Fill the flattened array with 0,1,2,... and record where each sub-array starts.
// NOTE(review): the closing brace of this outer loop is not present in this listing.
for (int i = 0; i < 10; i++) {
idx[i] = c;
myDat[i] = dat + c;
for (int j = 0; j < lens[i]; j++) myDat[i][j] = c++;
cl::Buffer inBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(double)*totSize, dat, &err);
// NOTE(review): here iBuf is filled from lens and lBuf from idx, yet below lBuf is bound
// to kernel argument 1 (the kernel's `len` parameter) and iBuf to argument 2 (`idx`),
// so the kernel receives offsets as lengths and lengths as offsets.
cl::Buffer iBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, lens, &err);
cl::Buffer lBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, idx, &err);
cl::Buffer outBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(double)*totSize, nullptr, &err);
cl::Kernel kernel(program, "processSubArr");
err = kernel.setArg(0, inBuf);
err = kernel.setArg(1, lBuf);
err = kernel.setArg(2, iBuf);
err = kernel.setArg(3, outBuf);
// One work-item per sub-array (global size 10, local size left to the runtime).
err=queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(10));
// CL_FALSE requests a non-blocking read; nothing in this listing waits for it to finish.
err=queue.enqueueReadBuffer(outBuf, CL_FALSE, 0, sizeof(double)*totSize, dat);
// Kernel: each work-item g = get_global_id(0) handles one sub-array of the flattened input.
// It copies len[g] elements starting at offset idx[g] from data to outData, adding 1000 to each.
// NOTE(review): the kernel's closing brace is not present in this listing.
__kernel void processSubArr(__global double* data, __global int* len, __global int* idx, __global double* outData) {
for (int i=0;i<len[get_global_id(0)];i++)
outData[idx[get_global_id(0)]+i] = data[idx[get_global_id(0)]+i]+1000;
This is test code only. In my real problem I have to pass 8 arrays of arrays (all with the same dimensions). The first dimension is 10^5 to 10^6 long, the second 1 to 100 long. The kernel code is ~100 lines of code, calculating turbulent eddy viscosity and diffusivity on each sub-array with the k-epsilon method.
Is this the way or am I on the completely wrong path? I'm new to OpenCL - any help will be much appreciated.

Updated code that works. I have no clue about the performance of this method.
vector<cl::Platform> platforms; cl::Platform::get(&platforms); _ASSERT(platforms.size() > 0); auto platform = platforms.front(); //get the platform
std::vector<cl::Device> devices; platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); _ASSERT(devices.size() > 0); auto device = devices.front(); // get the device
std::ifstream myFile(""); string src(istreambuf_iterator<char>(myFile), (istreambuf_iterator<char>())); cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1)); //create program from cl file
cl::Context context(device);
cl::Program program(context, sources);
auto err =; if (err!=0) printf("%s\n",program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str() );
cl::CommandQueue queue(context, device);
int lens[10] = { 5,7,9,6,21,12,4,18,15,10 }, *idx=new int[10], totSize=0, c=0;
for (int i = 0; i < 10; i++) totSize += lens[i];
double *dat = new double[totSize], **myDat = new double *[10]; // array of arrays of different lengths
for (int i = 0; i < 10; i++) {
idx[i] = c;
myDat[i] = dat + c;
for (int j = 0; j < lens[i]; j++) myDat[i][j] = c++;
cl::Buffer inBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(double)*totSize, dat, &err);
cl::Buffer lBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, lens, &err);
cl::Buffer iBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, idx, &err);
cl::Buffer outBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(double)*totSize, nullptr, &err);
cl::Kernel kernel(program, "processSubArr");
err = kernel.setArg(0, inBuf);
err = kernel.setArg(1, lBuf);
err = kernel.setArg(2, iBuf);
err = kernel.setArg(3, outBuf);
err=queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(10));
err=queue.enqueueReadBuffer(outBuf, CL_FALSE, 0, sizeof(double)*totSize, dat); //queue.enqueueMapBuffer + memcpy faster?
for (int i = 0; i < 10; i++) {
int j = 0;
for (j = 0; j < lens[i]-1; j++)
cout << myDat[i][j] << ",";
cout << myDat[i][j] << endl;
delete[] dat;


OpenCL CL_INVALID_VALUE Error on clEnqueueWriteBuffer

I am trying to make an algorithm run on OpenCL. I am using this repository (Source.cpp) as a template. I now want to convert the whole program to use type long instead of float. But I always get a CL_INVALID_VALUE (-30) error at
the second clEnqueueWriteBuffer. I have wasted hours without finding the error, so maybe I have overlooked something obvious (I have not done too much with OpenCL yet)?
My code (not working)
#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <CL/cl.h>
//#define DATA_SIZE 1024
#define DATA_SIZE 1024
using namespace std;
//$ /f/Tools/OCL_SDK_Light/lib/x86_64/opencl.lib blelloch_scan.cpp
// OpenCL C source of the `add` kernel (long version), embedded as adjacent string literals.
// Each work-item copies two elements of input into temp, runs two loops over temp that are
// separated by barrier(CLK_GLOBAL_MEM_FENCE) calls, and copies two elements of temp to output.
// Presumably a Blelloch-style prefix scan, going by the file name in the comment above - confirm.
// NOTE(review): the printf line uses single quotes ('%d'); the build log quoted later in this
// document reports this as a multi-character constant passed where a string is expected.
// NOTE(review): the listing ends without the kernel's closing brace or the terminating semicolon.
const char* ProgramSource =
"__kernel void add(__global long *input, __global long *output, __global long *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"printf('%d',thid); \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid+1] = input[2*thid+1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid + 1)-1; \n"\
"int bi = offset*(2*thid + 2)-1; \n"\
"temp[bi] += temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; \n"\
"long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid+1] = temp[2*thid+1]; \n"\
// Entry point of the failing `long` version: sets up OpenCL, runs the `add` kernel on
// DATA_SIZE elements, prints a few results, then computes a sequential prefix sum on the
// host for comparison.
// NOTE(review): brace-only lines (function body, loop and if bodies) are not present in this listing.
int main(void)
cl_context context;
cl_context_properties properties[3];
cl_command_queue command_queue;
cl_kernel kernel;
cl_program program;
cl_int err;
cl_uint num_platforms = 0;
cl_platform_id* platforms;
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
size_t global, loc;
std::cout << "Setup \n";
long arr[DATA_SIZE];
long inputDataA[DATA_SIZE];
long results[2 * DATA_SIZE];
long i;
// Fills entries 0 .. DATA_SIZE-3 with 1 .. DATA_SIZE-2; the last two entries of both
// arrays are never written in this loop.
for (i = 1; i < DATA_SIZE - 1;i++)
inputDataA[i-1] = (long)i;
arr[i-1] = (long)i;
clock_t ends;
/* --------------------- Get platform ---------------------*/
// First call queries the number of platforms, second call fetches their ids.
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/
/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
// NOTE(review): the allocation is sized with num_platforms, but num_devices entries are requested below.
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
// NOTE(review): the result of this call is not stored in clResult, so the assert that
// follows re-checks the earlier query.
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
printf("could not find device id");
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
// properties is filled in here but clCreateContext below is given NULL for its properties argument.
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
printf("Error building program\n");
return 1;
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and ouput
cl_int result;
// The error-code output parameter is NULL for all three buffers, so creation failures go unnoticed.
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// The host pointer argument of the next two writes is 0 (null); this is the call the
// question reports as failing with CL_INVALID_VALUE.
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE , 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // ERROR HERE
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
int temp = DATA_SIZE;
clock_t start = clock();
// set the argument list for the kernel command
// Kernel signature is (input, output, temp, size): inputB is used as the kernel's temp buffer.
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
// NOTE(review): DATA_SIZE work-items are launched and the kernel indexes 2*thid and 2*thid+1,
// while each buffer holds DATA_SIZE elements - check the index range against the buffer size.
global = DATA_SIZE; // num of processors
loc = 256;
printf("\n>> start parallel ---------- \n");
// enqueue the kernel command for execution
clResult = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// copy the results from out of the output buffer
clResult = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, results, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
ends = clock();
// print the results
int k = 1;
for (k = 1;k < 8; k++)
printf("%d - ", k);
// NOTE(review): results[k] has type long but is printed with %d.
printf("%d \n", results[k]);
double time_taken = ((double)(ends - start)) / CLK_TCK;
printf("\n>>finished parallel in %lf seconds\n", time_taken);
// cleanup - release OpenCL resources
/* -------sequential ------- */
printf("\n>> start sequential ---------- \n");
// Host reference: inclusive running sum of arr.
long prefixSum[DATA_SIZE] = { 0 };
const clock_t startSequential = clock();
prefixSum[0] = arr[0];
long idx = 1;
for (idx = 1; idx < DATA_SIZE; idx++) {
prefixSum[idx] = prefixSum[idx - 1] + arr[idx];
const clock_t endSequential = clock();
double seqTime = ((double)(endSequential - startSequential)) / CLK_TCK;
printf("\n>> finished sequential in %lf\n", seqTime);
for (int j = 0;j < 8; j++)
printf("%d - ", j);
printf("%d \n", prefixSum[j]);
return 0;
The repository code (working):
#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include <CL/cl.h>
#define DATA_SIZE 1024
using namespace std;
ofstream outfile;
// OpenCL C source of the `add` kernel (float version), embedded as adjacent string literals.
// Each work-item copies two elements of input into temp, runs two loops over temp that are
// separated by barrier(CLK_GLOBAL_MEM_FENCE) calls, and copies two elements of temp to output.
// NOTE(review): the listing ends without the kernel's closing brace or the terminating semicolon.
const char* ProgramSource =
"__kernel void add(__global float *input, __global float *output, __global float *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid+1] = input[2*thid+1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid + 1)-1; \n"\
"int bi = offset*(2*thid + 2)-1; \n"\
"temp[bi] += temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; \n"\
"float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid+1] = temp[2*thid+1]; \n"\
// Entry point of the repository (float) version: sets up OpenCL, runs the `add` kernel on
// DATA_SIZE floats, prints the first results and writes them plus the elapsed time to outfile.
// NOTE(review): brace-only lines (function body, loop and if bodies) are not present in this listing.
int main(void)
cl_uint num_platforms = 0;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_platform_id* platforms;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms = 0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices = 0;
// NOTE(review): the trailing `"shubham.txt");` looks like the remainder of a truncated
// statement (presumably opening outfile) - confirm against the repository.
cl_mem inputA, inputB, output;"shubham.txt");
size_t global, loc;
float inputDataA[DATA_SIZE];
float results[2 * DATA_SIZE] = { 0 };
int i;
// Input is 0, 1, 2, ..., DATA_SIZE-1.
for (i = 0; i < DATA_SIZE;i++)
inputDataA[i] = (float)i;
clock_t start, ends;
/* --------------------- Get platform ---------------------*/
// First call queries the number of platforms, second call fetches their ids.
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/
/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
// NOTE(review): the allocation is sized with num_platforms, but num_devices entries are requested below.
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
// NOTE(review): the result of this call is not stored in clResult, so the assert that
// follows re-checks the earlier query.
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
printf("could not find device id");
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
// properties is filled in here but clCreateContext below is given NULL for its properties argument.
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
printf("Error building program\n");
return 1;
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and ouput
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
// The return values of these three writes are not checked; the second and third pass 0
// (null) as the host pointer.
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
int temp = DATA_SIZE;
start = clock();
// set the argument list for the kernel command
// Kernel signature is (input, output, temp, size): inputB is used as the kernel's temp buffer.
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
global = DATA_SIZE;
loc = 256;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
// copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);
// print the results
printf("output: ");
for (i = 0;i < 5; i++)
printf("%f \n", results[i]);
outfile << results[i] << " ";
ends = clock();
double time_taken = ((double)(ends - start)) / CLK_TCK;
outfile << endl << "Time taken is : " << time_taken << endl;
return 0;
Thanks in Advance
I found your mistake. For me, it first wouldn't compile the OpenCL C code, so I debugged with
// Fetch up to 1024 bytes of the build log for devices[0] into info; the log is what
// reveals OpenCL C compile errors and warnings.
char info[1024];
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 1024*sizeof(char), (void*)info, NULL); // print build log
to get the build log:
<kernel>:4:8: warning: multi-character character constant
<kernel>:4:8: warning: incompatible integer to pointer conversion passing 'int' to parameter of type '__constant char *'
cl_kernel.h:4694:32: note: passing argument to parameter here
printf(constant char * restrict, ...) __asm("");
Seems ' instead of \" was the issue. Change this line:
"printf(\"%d\",thid); \n"
Then the OpenCL C code compiles and I can reproduce the CL_INVALID_VALUE error.
Here is the issue: You call clEnqueueWriteBuffer for inputB (and output) with 0, i.e. a null pointer, as the host pointer to copy from. You need to add C++ arrays that serve as the source of the data:
// Host arrays backing each device buffer; every write below now gets a non-null host pointer.
// NOTE(review): inputDataB and outputData are declared without initializers in this snippet.
long inputDataA[DATA_SIZE];
long inputDataB[DATA_SIZE];
long outputData[DATA_SIZE];
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataB, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // WORKS NOW
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, outputData, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
Then it works, and I get this output:
>> start parallel ----------
25625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035101234567891011121314151617181920212223242526272829303138438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441564656667686970717273747576777879808182838485868788899091929394953523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823839697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612741641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863932333435363738394041424344454647484950515253545556575859606162634484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784798328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628635445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745751921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222234804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105118648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948955125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425431281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581598008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308315765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066072242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542
559609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909916406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706711601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901917687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987997367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667679929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210236726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027039289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589597047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347358968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269271 - 0
2 - 1
3 - 2
4 - 4
5 - 6
6 - 9
7 - 12
>>finished parallel in 0.023000 seconds
>> start sequential ----------
>> finished sequential in 0.000000
0 - 1
1 - 3
2 - 6
3 - 10
4 - 15
5 - 21
6 - 28
7 - 36
Also note that in OpenCL C, long is a 64-bit integer, but in C++, long is only guaranteed to be an "at least" 32-bit integer, for whatever stupid reason. In C++ you should use long long int, as this is always at least a 64-bit integer. You can use for example typedef int64_t slong;, where int64_t itself is typically a typedef of long long int.
Another issue is that the program is not deterministic. When executed multiple times, I get a different result each time. There must be some race condition present. I suppose you wrongly assume that barrier(CLK_GLOBAL_MEM_FENCE); provides global synchronization of all threads, but this is not true. The only way to get global synchronization is to split the kernel into multiple kernels at the desired synchronization points and execute them one after the other.
Finally, to make OpenCL development in C++ much easier, and to prevent wasting hours on such simple errors, I have created a lightweight OpenCL-Wrapper to eliminate all of the OpenCL code overhead. With this, your code is 4x shorter and much easier to understand:
#include "opencl.hpp"
#define DATA_SIZE 1024
// Wrapper-based rewrite of the question's program using the answer author's "opencl.hpp":
// fills inputA/arr, constructs the `add` kernel over DATA_SIZE, prints output[1..7], then
// computes and prints a sequential prefix sum on the host.
// NOTE(review): Clock, Device, Memory, Kernel and slong come from the wrapper header and are
// not shown here; closing braces and any kernel-launch/read-back lines are not present in
// this listing.
int main() {
Clock clock;
Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
Memory<slong> arr(device, DATA_SIZE, 1u, true, false);
Memory<slong> inputA(device, DATA_SIZE);
Memory<slong> inputB(device, DATA_SIZE);
Memory<slong> output(device, DATA_SIZE);
// Same fill pattern as the question: entries 0 .. DATA_SIZE-3 receive 1 .. DATA_SIZE-2.
for(int i=1; i<DATA_SIZE-1; i++) {
inputA[i-1] = (slong)i;
arr[i-1] = (slong)i;
// NOTE(review): only three buffers are passed here although the kernel declares a fourth
// `int size` parameter - confirm how the wrapper supplies it.
Kernel kernel(device, DATA_SIZE, "add", inputA, output, inputB);
double time_taken = clock.stop();
// print the results
for(int k=1; k<8; k++) {
printf("%d - ", k);
printf("%d \n", output[k]);
printf("\n>>finished parallel in %lf seconds\n", time_taken);
printf("\n>> start sequential ---------- \n");
// Host reference: inclusive running sum of arr.
long prefixSum[DATA_SIZE] = { 0 };
prefixSum[0] = arr[0];
for(long idx=1; idx<DATA_SIZE; idx++) {
prefixSum[idx] = prefixSum[idx-1]+arr[idx];
double seqTime = clock.stop();
printf("\n>> finished sequential in %lf\n", seqTime);
for(int j=0; j<8; j++) {
printf("%d - ", j);
printf("%d \n", prefixSum[j]);
return 0;
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
// Returns the OpenCL C source of the `add` kernel as a string via the wrapper's R( ... ) macro.
// The kernel body matches the question's long version except that, in this listing, only the
// second loop contains a barrier(CLK_GLOBAL_MEM_FENCE) call.
// NOTE(review): closing braces of the kernel, its loops and if-blocks are not present in this listing.
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
kernel void add(__global long* input, __global long* output, __global long* temp, int size) {
int thid = get_global_id(0);
int offset = 1;
temp[2*thid] = input[2*thid];
temp[2*thid+1] = input[2*thid+1];
for(int d= size>>1; d>0; d >>= 1) {
if(thid < d) {
int ai = offset*(2*thid + 1)-1;
int bi = offset*(2*thid + 2)-1;
temp[bi] += temp[ai];
offset = offset*2;
temp[size-1] = 0;
for(int d = 1; d<size; d *= 2) {
offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE);
if(thid < d) {
int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1;
long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t;
output[2*thid] = temp[2*thid];
output[2*thid+1] = temp[2*thid+1];
);} // ############################################################### end of OpenCL C code #####################################################################

OpenCL method get_global_id() works incorrectly on GPU

I want to parallelize a temperature distribution computation using OpenCL. I am stuck on a problem with my GPU - the work-item id is the same for every other kernel instance. Instead of results from, for example, 0 to 1024, I got this result. What did I do incorrectly?
enter image description here
include <iostream>
#include <string>
#include <fstream>
#include <omp.h>
#include <CL/cl.hpp>
// Runs the "distributeKernel" OpenCL kernel on a k x m x n cuboid and returns a newly
// initialised cuboid (from initCuboid) that the result buffer is read into.
// Parameters: cuboid - input 3D array; k, m, n - its dimensions.
// All OpenCL status codes are OR-ed into `error`, which is never inspected in this listing.
// NOTE(review): brace-only lines and the release/cleanup calls are not present in this listing.
float*** distributeOpenCL(float*** cuboid, int k, int m, int n)
// OpenCL init
int size = k * m * n;
float*** hResult = initCuboid(k, m, n);
cl_platform_id platform;
cl_device_id device;
cl_int error = 0;
// The kernel file name is empty in this listing.
std::ifstream file("");
std::string fileText = std::string(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>());
// NOTE(review): the right-hand side is missing in this listing (presumably fileText's C string) - confirm.
const char* srcText =;
size_t srcLength = fileText.size();
cl_context context;
cl_program program;
cl_kernel kernel;
cl_command_queue queue;
cl_mem dCuboid, dRes;
// Work-group is k x m; each global dimension is size (= k*m*n) rounded up to a multiple
// of the matching local dimension.
size_t localSize[2] = { k,m };
size_t globalSize[2] = { ceil(size / (float)localSize[0]) * localSize[0], ceil(size / (float)localSize[1]) * localSize[1] };
// Get GPU
error |= clGetPlatformIDs(1, &platform, NULL);
error |= clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
// Compile and build
context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
program = clCreateProgramWithSource(context, 1, &srcText, &srcLength, &error);
error |= clBuildProgram(program, 1, &device, NULL, NULL, NULL);
// What funtion from file we have to run
kernel = clCreateKernel(program, "distributeKernel", &error);
// Add to Queue
queue = clCreateCommandQueueWithProperties(context, device, NULL, &error);
// Create buffer
// NOTE(review): both buffers are created CL_MEM_WRITE_ONLY, including dCuboid, which is the kernel's input.
dCuboid = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, NULL);
dRes = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, NULL);
// Write data to buffer
// NOTE(review): cuboid is a float***; this copies sizeof(float)*size bytes starting at that
// pointer, which presumably is the top-level pointer table rather than contiguous float
// data - verify against initCuboid.
error |= clEnqueueWriteBuffer(queue, dCuboid, CL_TRUE, 0, sizeof(float) * size, cuboid, 0, NULL, NULL);
// Kernel args
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &dCuboid);
error |= clSetKernelArg(kernel, 1, sizeof(int), &k);
error |= clSetKernelArg(kernel, 2, sizeof(int), &m);
error |= clSetKernelArg(kernel, 3, sizeof(int), &n);
error |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &dRes);
// Start task
error |= clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL);
// Wait execution
// Read Result
// NOTE(review): hResult is also a float***; the same concern as for the write above applies.
error |= clEnqueueReadBuffer(queue, dRes, CL_TRUE, 0, sizeof(float) * size, hResult, 0, NULL, NULL);
//printCuboid(resP, k, m, n, resPFile);
// Deallocation
return hResult;
// Driver: builds and fills a cuboid, runs the sequential distribute() and the OpenCL
// distributeOpenCL() on it, times both with omp_get_wtime(), and prints the timings.
// Optional command-line arguments: k m n temp1 temp2 (defaults 5 5 5 10 15).
// NOTE(review): brace-only lines are not present in this listing.
int main(int argc, char* argv[])
std::ofstream filledFile("filled.txt");
std::ofstream resLFile("resL.txt");
std::ofstream resPFile("resP.txt");
double durationL, durationP, time1, time2;
int k = 5, m = 5, n = 5, temp1 = 10, temp2 = 15;
float*** cuboid, *** resL, *** resP;
// NOTE(review): only argc > 1 is checked, but argv[1] through argv[5] are read.
if (argc > 1) {
k = atoi(argv[1]), m = atoi(argv[2]), n = atoi(argv[3]),
temp1 = atoi(argv[4]), temp2 = atoi(argv[5]);
// Linear
cuboid = initCuboid(k, m, n);
fillCuboid(cuboid, k, m, n, temp1, temp2);
printCuboidToFile(cuboid, k, m, n, filledFile);
time1 = omp_get_wtime();
resL = distribute(cuboid, k, m, n);
time2 = omp_get_wtime();
durationL = time2 - time1;
printCuboidToFile(resL, k, m, n, resLFile);
// Parallel
// The timed span covers all of distributeOpenCL, including OpenCL setup and program build.
time1 = omp_get_wtime();
resP = distributeOpenCL(cuboid, k, m, n);
time2 = omp_get_wtime();
durationP = time2 - time1;
//printCuboidToFile(resP, k, m, n, resPFile);
std::cout << "Linear time: " << durationL << std::endl;
std::cout << "Parallel time: " << durationP << std::endl;
std::cout << "Parallel faster than linear on: " << durationL - durationP << std::endl;
// Delete 3d arrays, closing files
deleteCuboid(cuboid, k, m, n);
deleteCuboid(resL, k, m, n);
deleteCuboid(resP, k, m, n);
return 0;
// Kernel: for the (gz, gy) position given by the two global ids, repeatedly replaces each
// result[gz][gy][x] with a rounded average over its neighbourhood until a dissipation
// condition is met.
// NOTE(review): this listing is incomplete - brace-only lines are missing and no statement
// that increments `count` or `dissipatedCount` is visible, so the exact averaging and
// termination logic cannot be confirmed from here.
__kernel void distributeKernel(__global float*** cuboid, int k, int m, int n, __global float*** result)
int gz = get_global_id(0);
int gy = get_global_id(1);
// NOTE(review): printf is given &gy / &gz (addresses), not gy / gz, so the printed numbers
// are not the work-item ids.
printf("gy - %d \n", &gy);
printf("gz - %d \n", &gz);
bool isDissipated = false;
int size = k * m * n;
// Ends if temperatures in cube becomes balanced
while (!isDissipated) {
int dissipatedCount = 0;
for (int x = 0; x < n; x++) {
// Calc average temperature
float sum = 0;
int count = 0;
float average;
// Visit the 3x3x3 neighbourhood around (gz, gy, x), skipping positions outside the cuboid.
for (int zSum = gz - 1; zSum <= gz + 1; zSum++) {
for (int ySum = gy - 1; ySum <= gy + 1; ySum++) {
for (int xSum = x - 1; xSum <= x + 1; xSum++) {
if (zSum >= 0 && ySum >= 0 && xSum >= 0
&& zSum < k && ySum < m && xSum < n) {
// NOTE(review): the index uses gz and gy rather than zSum and ySum - confirm intent.
sum += result[gz][gy][xSum];
// Average rounded to two decimal places.
average = round(sum / count * 100) / 100;
if (average == result[gz][gy][x]) {
else {
result[gz][gy][x] = average;
if (dissipatedCount == size) {
isDissipated = true;
To get the issue with the supposedly wrong get_global_id() fixed, start with a simple, minimal "Hello World"-style vector addition program and then advance to your temperature distribution application step by step.
With your code I see several issues:
You can only have 1D pointers (with a single *) in OpenCL.
__kernel void distributeKernel(__global float* cuboid, __global float* result)
Introduce a linear index to access more than 1 dimension: For 2D for example int n = x+y*get_global_size(0);
From what I see, k, m, n are lattice dimensions. Eliminate them from the kernel entirely. Get size via get_global_size(...).
The kernel looks rather complex with a lot of loops and branching. This could kill any performance benefit you hope to get from GPU parallelization. Get rid of loops and branching as far as possible. Also, there should not be any loop over one of the lattice dimensions since the lattice position is what you parallelize.
I would also advise using only 1D parallelization in OpenCL and doing the linear indexing yourself. This gives you more flexibility regarding workgroup size.

Got the initial value from GPU programming in OpenCL

I developed a small program to add two small vectors on the GPU using the OpenCL library. The main code is as follows:
#include <iostream>
#include <CL/cl.hpp>
#include <cassert>
#include <fstream>
#include <time.h>
#include <cmath>
// Fills data[0 .. size-1] with pseudo-random floats in [0, 1] drawn from rand().
// data must point to at least `size` floats; a zero or negative size writes nothing.
void randomInit(float *data, int size)
{
    // Signed index: comparing an unsigned index against a negative `size` would
    // convert size to a huge unsigned value and overrun the array.
    for (int i = 0; i < size; ++i)
        data[i] = rand() / static_cast<float>(RAND_MAX);
}
// Vector-add example: picks the first platform and device, creates three buffers of szVec
// floats, runs the "vector_add" kernel with one work-item per element, reads C back and
// prints A[i] + B[i] = C[i].
// NOTE(review): brace-only lines are not present in this listing; A, B and C are never freed.
int main()
//get all platforms (drivers)
// NOTE(review): no call that fills `platforms` is visible in this listing (presumably
// cl::Platform::get was dropped) - confirm against the original post.
std::vector<cl::Platform> platforms;
assert(platforms.size() > 0);
cl::Platform myPlatform = platforms[0];
std::cout << "Using platform: "<<myPlatform.getInfo<CL_PLATFORM_NAME>()<<"\n";
//get default device of the default platform
std::vector<cl::Device> devices;
myPlatform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
assert(devices.size() > 0);
cl::Device myDevice = devices[0];
std::cout<< "Using device: "<<myDevice.getInfo<CL_DEVICE_NAME>()<<"\n";
// Read the kernel file into a string; the file name is empty in this listing.
std::ifstream vectorAddFile("" );
std::string src(std::istreambuf_iterator<char>(vectorAddFile), (std::istreambuf_iterator<char>()));
cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1));
cl::Context context(myDevice);
cl::Program program(context, sources);
int szVec = 10;
// NOTE(review): no code that fills A and B is visible in this listing; only C is zeroed.
float* A = new float[szVec];
float* B = new float[szVec];
float* C = new float[szVec];
std::fill_n(C, szVec, 0);
// create buffers on the device
// CL_MEM_COPY_HOST_PTR copies the host arrays into the buffers at creation time, which is
// why the explicit writes below are commented out.
cl::Buffer buffer_A = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), A);
cl::Buffer buffer_B = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), B);
cl::Buffer buffer_C = cl::Buffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), C);
//create queue to which we will push commands for the device.
cl::CommandQueue queue(context, myDevice);
//write arrays A and B to the device
//queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * szVec, A);
//queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * szVec, B);
// NOTE(review): this line is truncated in the listing (presumably the program build call
// with the option string "cl.std.CL1.2"). The answers below report that this option
// string is what broke the build.
auto err ="cl.std.CL1.2");
// run the kernel
// err is overwritten by kernel creation and not checked; the return values of the
// setArg and enqueue calls below are ignored as well.
cl::Kernel kernel(program,"vector_add", &err);
kernel.setArg(0, buffer_A);
kernel.setArg(1, buffer_B);
kernel.setArg(2, buffer_C);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(szVec), cl::NullRange);
//read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(float) * szVec, C);
std::cout<<" result: \n";
for(int i = 0; i < szVec; i++)
std::cout << A[i] << " + " << B[i] << " = " << C[i] << std::endl;
std::cout << std::endl;
return 0;
and the kernel code is as follows:
__kernel void vector_add(__global float *A, __global float *B, __global float *C)
// Get the index of the current element
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
and the result I got is:
Using platform: NVIDIA CUDA
Using device: Tesla K20m
0.840188 + 0.477397 = 0
0.394383 + 0.628871 = 0
0.783099 + 0.364784 = 0
0.79844 + 0.513401 = 0
0.911647 + 0.95223 = 0
0.197551 + 0.916195 = 0
0.335223 + 0.635712 = 0
0.76823 + 0.717297 = 0
0.277775 + 0.141603 = 0
0.55397 + 0.606969 = 0
The problem, as you can see, is that the result is always whatever I initialized vector C with, and I do not understand why. I also initialized vector C with some other values, and again the result was just the initial values.
It's probably just a syntax error.
auto err ="cl.std.CL1.2");
should be
auto err ="-cl-std=CL1.2");
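The documentation on clBuildProgram has more information about the supported options. An unrecognized option string makes the build fail, so the kernel is never created or executed, and buffer C simply keeps the values it was initialized with.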
The problem stems from building the program with this command
auto err ="cl.std.CL1.2");
and by replacing the command above with
auto err =;
the problem was solved.
But still I do not know why this happened. Any idea?

Shared memory in OpenCL

I intend to perform vector manipulations and was trying out a small dummy program that does vector addition and multiplication. However, the code does not run because of my limited knowledge of shared memory. All the sources on the internet show 2D matrix operations, which I cannot translate to my vector problem. Please try to explain where I am going wrong, bearing in mind that I am a novice in OpenCL. The code is given below:
Host Code:
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
cl::Context context;
cl::CommandQueue queue;
cl::Program program;
cl::Kernel kernel;
deviceUsed = 0;
cl_context_properties properties[] =
{ CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(),0 };
context = cl::Context(CL_DEVICE_TYPE_ALL, properties);
devices = context.getInfo<CL_CONTEXT_DEVICES>();
queue = cl::CommandQueue(context, devices[deviceUsed]);
cl::Program::Sources source( 1, std::make_pair(kernel_source.c_str(), kernel_source.size()));
program = cl::Program(context, source);;
std::vector < float > a;
std::vector < float > b;
std::vector < float > sum;
std::vector < float > prod;
int globalSize = 128;
int localSize = 16;
for (int i = 0; i < globalSize ; i++)
a[i] = 1.0f * i;
b[i] = 5.0f * i;
cl::Buffer buffer_A;
cl::Buffer buffer_B;
cl::Buffer buffer_sum;
cl::Buffer buffer_prod;
buffer_A = cl::Buffer (context, CL_MEM_READ_WRITE, sizeof(float) * globalSize);
buffer_B = cl::Buffer (context, CL_MEM_READ_WRITE, sizeof(float) * globalSize);
queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * globalSize , &a[0]);
queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * globalSize , &b[0]);
buffer_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * globalSize);
buffer_prod = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * globalSize);
kernel.setArg(0, buffer_A);
kernel.setArg(1, buffer_B);
kernel.setArg(2, buffer_sum);
kernel.setArg(3, buffer_prod);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(globalSize/localSize), cl::NDRange(N), NULL);
queue.enqueueReadBuffer(buffer_sum, CL_TRUE, 0, sizeof(float) * globalSize, &sum[0]);
queue.enqueueReadBuffer(buffer_prod, CL_TRUE, 0, sizeof(float) * globalSize, &prod[0]);
#define STRINGI(ker) #ker
std::string kernel_source = STRINGI(
__kernel void KernelAddMul(__global float* a, __global float* b, __global float* sum, __global float* prod)
unsigned int j = get_local_id(0);
int N = get_local_size(0);
unsigned int i = N * get_global_id(0) + j;
float locSum[N];
float locProd[N];
__local float Asub[N];
__local float Bsub[N];
for(int k = 0; k < N; k++){
Asub[k] = a[i];
Bsub[k] = b[i];
locSum[k] = Asub[k] + Bsub[k];
locProd[k] = Asub[k] * Bsub[k];
sum[i] = locSum[k];
prod[i] = locProd[k];
I suspect that your code does not run because your kernel does not compile.
The following lines are invalid:
int N = get_local_size(0);
float locSum[N];
float locProd[N];
__local float Asub[N];
__local float Bsub[N];
N must be a compile-time constant; you cannot dynamically size the arrays using get_local_size(0).
I strongly recommend that you use a standalone compiler to compile your kernels:
CodeXL is very good, as is the Intel SDK for OpenCL.
Anything is better than trying to debug your kernel in an application!

OpenCL kernel doesn't run correctly

I have the following kernel in my project:
__kernel void zero(__global float* vh)
const float2 id = (float2)(get_global_id(0),get_global_id(1));
const float2 sz = (float2)(1,get_global_size(0));
vh[(int)dot(id,sz)] = 1;
And here's the way I launch it
std::vector<cl::Platform> platforms;
std::vector<cl::Device> devices;
cl::Platform::get(&platforms);, &devices);
std::string GPUname;, &GPUname);
std::cout << "Program runs on GPU: " << GPUname << std::endl;
cl::Context context(devices);
cl::CommandQueue queue(context,;
std::ifstream srcfile("");
std::string src(std::istreambuf_iterator<char>(srcfile), std::istreambuf_iterator<char>(0));
cl::Program program(context, cl::Program::Sources(1, std::make_pair(src.c_str(), src.size())));, "-Werror");
cl::Kernel kzero = cl::Kernel(program, "zero");
cl::Buffer buffer(context, CL_MEM_READ_ONLY, N * N * sizeof(float));
cl::NDRange gndr(N, N), lndr(8, 8);
kzero.setArg(0, buffer);
queue.enqueueNDRangeKernel(kzero, cl::NullRange, gndr, lndr);
float data[N * N];
queue.enqueueReadBuffer(buffer, CL_TRUE, 0, N * N * sizeof(float), data);
std::ofstream ofs;"solution.txt", std::ofstream::out);
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
ofs << data[i + N * j] << (j == N - 1 ? '\n' : ' ');
std::cout << "File written";
return 0;
But every time I run my program, the file contains only zeros, although it should contain ones. Where could the error be? I'm stuck.