clock_gettime() function outputting incorrect time - C++

I am trying to get the runtime of the following code using the clock_gettime function. However, when I run the code I get a time of 0.0000 every time. I have also printed the start and stop times individually, and they come out exactly the same.
struct timespec start, stop;
double accum;

if (clock_gettime(CLOCK_REALTIME, &start) == -1) {
    perror("clock gettime");
    exit(EXIT_FAILURE);
}

int src = 1, final_ret = 0;
for (int t = 0; t < rows - 1; t += pyramid_height)
{
    int temp = src;
    src = final_ret;
    final_ret = temp;

    // Calculate this for the kernel argument...
    int arg0 = MIN(pyramid_height, rows - t - 1);
    int theHalo = HALO;

    // Set the kernel arguments.
    clSetKernelArg(cl.kernel(kn), 0, sizeof(cl_int), (void*) &arg0);
    clSetKernelArg(cl.kernel(kn), 1, sizeof(cl_mem), (void*) &d_gpuWall);
    clSetKernelArg(cl.kernel(kn), 2, sizeof(cl_mem), (void*) &d_gpuResult[src]);
    clSetKernelArg(cl.kernel(kn), 3, sizeof(cl_mem), (void*) &d_gpuResult[final_ret]);
    clSetKernelArg(cl.kernel(kn), 4, sizeof(cl_int), (void*) &cols);
    clSetKernelArg(cl.kernel(kn), 5, sizeof(cl_int), (void*) &rows);
    clSetKernelArg(cl.kernel(kn), 6, sizeof(cl_int), (void*) &t);
    clSetKernelArg(cl.kernel(kn), 7, sizeof(cl_int), (void*) &borderCols);
    clSetKernelArg(cl.kernel(kn), 8, sizeof(cl_int), (void*) &theHalo);
    clSetKernelArg(cl.kernel(kn), 9, sizeof(cl_int) * (cl.localSize()), 0);
    clSetKernelArg(cl.kernel(kn), 10, sizeof(cl_int) * (cl.localSize()), 0);
    clSetKernelArg(cl.kernel(kn), 11, sizeof(cl_mem), (void*) &d_outputBuffer);
    cl.launch(kn);
}

if (clock_gettime(CLOCK_REALTIME, &stop) == -1) {
    perror("clock gettime");
    exit(EXIT_FAILURE);
}

printf("%lf\n", stop.tv_sec);
printf("%lf\n", start.tv_sec);
accum = (stop.tv_sec - start.tv_sec)
      + (stop.tv_nsec - start.tv_nsec) / BILLION;
printf("%lf\n", accum);
Any advice on what I'm doing wrong is much appreciated.

timespec::tv_nsec is an integer type, so if BILLION is also an integer type, then:
( stop.tv_nsec - start.tv_nsec ) / BILLION
is evaluated with integer division and truncates to zero whenever the elapsed nanoseconds amount to less than a full second. If the tv_sec values are the same as well, you get a zero difference.
Try:
double( stop.tv_nsec - start.tv_nsec ) / BILLION
That will perform the division in double.
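For reference, here is a minimal self-contained sketch of the corrected timing code; the (double) cast is equivalent to the functional-style cast above. Two further points worth noting: %lf is not a valid printf conversion for the integer tv_sec fields, and since OpenCL kernel launches are asynchronous, a clFinish() on the queue is needed before the second clock_gettime() call if the goal is to time the kernels themselves rather than just the enqueue calls.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define BILLION 1000000000L

int main(void)
{
    struct timespec start, stop;

    if (clock_gettime(CLOCK_REALTIME, &start) == -1) {
        perror("clock gettime");
        exit(EXIT_FAILURE);
    }

    /* ... work to be timed; for OpenCL, clFinish(queue) goes here ... */

    if (clock_gettime(CLOCK_REALTIME, &stop) == -1) {
        perror("clock gettime");
        exit(EXIT_FAILURE);
    }

    /* tv_sec is an integer type (time_t), so print it as one. */
    printf("start: %ld  stop: %ld\n", (long)start.tv_sec, (long)stop.tv_sec);

    /* The cast forces floating-point division so a sub-second
       difference is no longer truncated to zero. */
    double accum = (stop.tv_sec - start.tv_sec)
                 + (double)(stop.tv_nsec - start.tv_nsec) / BILLION;
    printf("%lf\n", accum);
    return 0;
}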

Related

OpenCL alignment issue

I want to fill an array of glm::vec3 with an OpenCL kernel.
All I want to do is fill the array with [1.0, 2.0, 3.0].
So upon success I should get the triplet repeated 256 times.
[1.0, 2.0, 3.0][1.0, 2.0, 3.0][1.0, 2.0, 3.0] ... [1.0, 2.0, 3.0]
However, the result looks like this:
[1.0, 2.0, 2.0][2.0, 2.0, 2.0] ... [2.0, 2.0, 2.0]
Why?
Here is the code for the kernel:
__kernel void fill_array(__global float *output_values)
{
    int i = get_global_id(0);
    float3 pos = (float3)(1.0, 2.0, 3.0);
    vstore3(pos, 0, &(output_values[i]));
}
And here is the code to run it:
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include "glm/glm.hpp"

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#define MAX_SOURCE_SIZE (0x100000)

int main(void)
{
    std::vector<glm::vec3> values;
    values.resize(256);

    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("E:/Dev/fill_array_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 1,
                         &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector
    cl_mem output_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, values.size() * sizeof(glm::vec3), NULL, &ret);

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
        (const char **)&source_str, (const size_t *)&source_size, &ret);

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
    if (ret != CL_SUCCESS)
    {
        cl_build_status build_status;
        ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL);

        size_t ret_val_size;
        ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);

        char *build_log = (char*)malloc(sizeof(char) * (ret_val_size + 1));
        ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
        build_log[ret_val_size] = '\0';
        printf("%s\n", build_log);
        free(build_log);
        return -1;
    }

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "fill_array", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&output_mem);

    // Execute the OpenCL kernel on the list
    size_t global_item_size = values.size(); // Process the entire lists
    size_t local_item_size = 64; // Process in groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                                 &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer on the device back into the host vector
    ret = clEnqueueReadBuffer(command_queue, output_mem, CL_TRUE, 0, values.size() * sizeof(glm::vec3), values.data(), 0, NULL, NULL);

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(output_mem);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);
    return 0;
}
I was misusing the vstore function. I should have used the 2nd parameter to specify the index in the array: vstore3(data, offset, p) writes 3 floats starting at p + 3 * offset. By passing 0 and &(output_values[i]), every work-item i was writing floats i, i+1 and i+2, so neighbouring work-items overwrote each other's results, which is why the output degenerated into [1.0, 2.0, 2.0][2.0, 2.0, 2.0]...
https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/vstoren.html
__kernel void fill_array(__global float *output_values)
{
    int i = get_global_id(0);
    float3 pos = (float3)(1.0, 2.0, 3.0);
    vstore3(pos, i, output_values);
}
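One subtlety worth noting (my addition, not part of the original answer): this works because glm::vec3 is three tightly packed floats (12 bytes), which is exactly what vstore3 writes per element, whereas OpenCL's float3 occupies 16 bytes when stored directly in a buffer; that is why the kernel writes through a plain float* with vstore3 instead. A compile-time guard on the host makes the packing assumption explicit:

// Sketch: the values.size() * sizeof(glm::vec3) buffer math above assumes
// glm::vec3 is exactly three packed floats.
static_assert(sizeof(glm::vec3) == 3 * sizeof(float),
              "glm::vec3 must be tightly packed for this buffer layout");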

OpenCL: clSetKernelArg returns CL_INVALID_ARG_SIZE

I'm a newbie at OpenCL. I'm trying to pass 5 arguments into a kernel: an input buffer, an output buffer, an integer, and 2 local arrays the size of the input buffer.
//Create input/output cl_mem objects
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                    inputVector.size(), (void *)inputVector.data(), NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                     inputVector.size(), NULL, NULL);

//get iterator from command line
cl_uint iterations = (cl_uint) atoi(argv[3]);
//std::cout << "iterations: " << iterations << std::endl
//cout confirms I'm getting this correctly

//Set kernel arguments
bool argOK = (CL_SUCCESS == clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&inputBuffer));
argOK = argOK && (CL_SUCCESS == clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer));
argOK = argOK && (CL_SUCCESS == clSetKernelArg(kernel, 2, sizeof(cl_uint), &iterations));
argOK = argOK && (CL_SUCCESS == clSetKernelArg(kernel, 3, sizeof(inputBuffer), NULL));
argOK = argOK && (CL_SUCCESS == clSetKernelArg(kernel, 4, sizeof(inputBuffer), NULL));

//Check for failure
if (!argOK) {
    std::cerr << "Error: clSetKernelArg failed\n";
    return SDK_FAILURE;
}
When I run the program, it prints:
Error: clSetKernelArg failed
I did some digging/debugging, and eventually found that this line:
argOK = argOK && (CL_SUCCESS == clSetKernelArg(kernel, 2, sizeof(cl_uint), &iterations));
was the culprit. Changing it to:
argOK = argOK && (CL_INVALID_ARG_SIZE == clSetKernelArg(kernel, 2, sizeof(cl_uint), &iterations));
lets the program continue successfully. Therefore, the clSetKernelArg(kernel, 2, ...) call is returning CL_INVALID_ARG_SIZE. It's weird, though, since it looks like I'm passing in the correct size for the iterations variable.
Here's my kernel for reference:
__kernel void do_the_thing (__global uchar* in, __global uchar* out,
                            __global uint * numIterations, __local uchar* arr1, __local uchar* arr2)
{
    //Do stuff that I haven't written yet.
}
TL;DR: Why is the setKernelArg call returning CL_INVALID_ARG_SIZE?
The third argument of your kernel (index = 2) is a buffer (__global uint *), not a cl_uint scalar. For a __global pointer parameter, clSetKernelArg expects sizeof(cl_mem) and a pointer to a buffer handle, so passing sizeof(cl_uint) yields CL_INVALID_ARG_SIZE. Change your kernel to take the count by value:
__kernel void do_the_thing (__global uchar* in, __global uchar* out,
                            uint numIterations, __local uchar* arr1, __local uchar* arr2)
{
    //Do stuff that I haven't written yet.
}
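Alternatively, if the kernel had to keep the __global uint * signature, the host would need to pass a one-element buffer instead of the scalar; a sketch under that assumption (error handling omitted, names as in the question):

// Create a one-element buffer initialized from the host-side scalar.
cl_mem iterBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                   sizeof(cl_uint), &iterations, NULL);
// A __global pointer argument is set with the cl_mem handle, not the value.
argOK = argOK && (CL_SUCCESS == clSetKernelArg(kernel, 2, sizeof(cl_mem), &iterBuffer));

For a single scalar, though, passing it by value as shown above is the simpler choice. Note also that arguments 3 and 4 are currently set with sizeof(inputBuffer), which is the size of a cl_mem handle, not the size of the input data; for __local arguments the size parameter is the number of bytes of local memory to allocate, so inputVector.size() is presumably what was intended there.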

Euclidean distance using OpenCL

I am trying to compute the Euclidean distance from a set of 5D points (pixels) to a single 5D point (the center) and store the results in another vector. I use flat vector indexing to keep all the info in a single vector, so for the i-th pixel the 5 dimensions sit at indices (5i), (5i+1), ...
I am new to OpenCL and I just adapted a sample code from the internet for my own purposes. The theory is right, but the code doesn't produce the right answers!
Here is the kernel:
//d_kernel.cl
__kernel void distance_kernel(__global double *pixelInfo,
                              __global double *clusterCentres,
                              __global double *distanceFromClusterCentre)
{
    int index = get_global_id(0);
    int d, dl, da, db, dx, dy;
    dl = pixelInfo[5 * index] - clusterCentres[0];
    dl = dl * dl;
    da = pixelInfo[5 * index + 1] - clusterCentres[1];
    da = da * da;
    db = pixelInfo[5 * index + 2] - clusterCentres[2];
    db = db * db;
    dx = pixelInfo[5 * index + 3] - clusterCentres[3];
    dx = dx * dx;
    dy = pixelInfo[5 * index + 4] - clusterCentres[4];
    dy = dy * dy;
    distanceFromClusterCentre[index] = dx + dy + dl + da + db;
}
And here is the host code:
#include <iostream>
#include <CL/cl.h>
#include <vector>
using namespace std;

#define MAX_SOURCE_SIZE (0x100000)

int main(int argc, char **argv)
{
    // Create the two input vectors
    int i;
    const int pixelsNumber = 1024;
    const int clustersNumber = 1;

    std::vector<double> pixelInfo;
    pixelInfo.resize(5 * pixelsNumber);
    std::fill(pixelInfo.begin(), pixelInfo.end(), 500);

    std::vector<double> clusterCentres;
    clusterCentres.resize(5 * clustersNumber);
    std::fill(clusterCentres.begin(), clusterCentres.end(), 200);

    std::vector<double> distanceFromClusterCentre;
    distanceFromClusterCentre.resize(pixelsNumber);
    std::fill(distanceFromClusterCentre.begin(), distanceFromClusterCentre.end(), 0);

    // Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("d_kernel.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose(fp);

    // Get platform and device information
    cl_platform_id platform_id = NULL;
    cl_device_id device_id = NULL;
    cl_uint ret_num_devices;
    cl_uint ret_num_platforms;
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
                         &device_id, &ret_num_devices);

    // Create an OpenCL context
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);

    // Create a command queue
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);

    // Create memory buffers on the device for each vector
    cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                          5 * pixelsNumber * sizeof(int), NULL, &ret);
    cl_mem clusterCentres_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                               5 * clustersNumber * sizeof(int), NULL, &ret);
    cl_mem distanceFromClusterCentre_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                                          pixelsNumber * sizeof(int), NULL, &ret);

    // Copy the vectors to their respective memory buffers
    ret = clEnqueueWriteBuffer(command_queue, pixelInfo_mem, CL_TRUE, 0,
                               5 * pixelsNumber * sizeof(int), pixelInfo.data(), 0, NULL, NULL);
    ret = clEnqueueWriteBuffer(command_queue, clusterCentres_mem, CL_TRUE, 0,
                               5 * clustersNumber * sizeof(int), clusterCentres.data(), 0, NULL, NULL);

    // Create a program from the kernel source
    cl_program program = clCreateProgramWithSource(context, 1,
        (const char **)&source_str, (const size_t *)&source_size, &ret);

    // Build the program
    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);

    // Create the OpenCL kernel
    cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);

    // Set the arguments of the kernel
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&pixelInfo_mem);
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&clusterCentres_mem);
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&distanceFromClusterCentre_mem);

    // Execute the OpenCL kernel on the list
    size_t global_item_size = pixelsNumber; // Process the entire lists
    size_t local_item_size = 64; // Divide work items into groups of 64
    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
                                 &global_item_size, &local_item_size, 0, NULL, NULL);

    // Read the memory buffer result on the device to the local vector result
    ret = clEnqueueReadBuffer(command_queue, distanceFromClusterCentre_mem, CL_TRUE, 0,
                              pixelsNumber * sizeof(int), distanceFromClusterCentre.data(), 0, NULL, NULL);

    // Display the result to the screen
    for (i = 0; i < pixelsNumber; i++)
    {
        cout << "Pixel " << i << ": " << distanceFromClusterCentre[i] << endl;
        //system("PAUSE");
    }

    // Clean up
    ret = clFlush(command_queue);
    ret = clFinish(command_queue);
    ret = clReleaseKernel(kernel);
    ret = clReleaseProgram(program);
    ret = clReleaseMemObject(pixelInfo_mem);
    ret = clReleaseMemObject(clusterCentres_mem);
    ret = clReleaseMemObject(distanceFromClusterCentre_mem);
    ret = clReleaseCommandQueue(command_queue);
    ret = clReleaseContext(context);

    free(pixelInfo.data());
    free(clusterCentres.data());
    free(distanceFromClusterCentre.data());

    system("PAUSE");
    return 0;
}
and a part of the RESULT is:
.
.
.
Pixel 501: -1.11874e+306
Pixel 502: -1.16263e+306
Pixel 503: -1.07485e+306
Pixel 504: -1.03079e+306
Pixel 505: -9.42843e+305
Pixel 506: -9.86903e+305
Pixel 507: -8.98954e+305
Pixel 508: -9.86903e+305
Pixel 509: -8.98954e+305
Pixel 510: -9.43014e+305
Pixel 511: -8.55065e+305
Press any key to continue . . .
Pixel 512: 0
Pixel 513: 0
Pixel 514: 0
Pixel 515: 0
Pixel 516: 0
Pixel 517: 0
Pixel 518: 0
Pixel 519: 0
Pixel 520: 0
.
.
.
After index 511, the rest of the vector is zero!
You created your vectors of doubles but then treated them as if they were ints: you created buffers sized for ints, wrote data as ints, and read the results back as ints. To avoid such mistakes you could write your code this way:
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, pixelInfo.size() * sizeof(pixelInfo[0]), NULL, &ret);
                                                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
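Applied consistently (a sketch of the affected host calls, with names as in the question), every size expression stays tied to the element type the vector actually holds:

cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                      pixelInfo.size() * sizeof(pixelInfo[0]), NULL, &ret);
cl_mem clusterCentres_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                           clusterCentres.size() * sizeof(clusterCentres[0]), NULL, &ret);
cl_mem distanceFromClusterCentre_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                                      distanceFromClusterCentre.size() * sizeof(distanceFromClusterCentre[0]), NULL, &ret);

// The same byte counts must be used for the transfers.
ret = clEnqueueWriteBuffer(command_queue, pixelInfo_mem, CL_TRUE, 0,
                           pixelInfo.size() * sizeof(pixelInfo[0]), pixelInfo.data(), 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, distanceFromClusterCentre_mem, CL_TRUE, 0,
                          distanceFromClusterCentre.size() * sizeof(distanceFromClusterCentre[0]),
                          distanceFromClusterCentre.data(), 0, NULL, NULL);

This also explains the zeros after index 511: the result buffer was created with pixelsNumber * sizeof(int) = 4096 bytes, which holds only 512 doubles. Note as well that the kernel's temporaries (dl, da, db, dx, dy) are declared int, which truncates the double arithmetic; they should presumably be double too.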

Not all work-items being used opencl

So I'm able to compile and execute my kernel; the problem is that only two work-items are being used. I'm basically trying to fill a float array[8] with {0,1,2,3,4,5,6,7}, so this is a very simple hello world application. Below is my kernel.
// Highly simplified to demonstrate
__kernel void rnd_float32_matrix (
    __global float * res
) {
    uint idx = get_global_id(0);
    res[idx] = idx;
}
I then create and execute the kernel with the following code...
// Some more code
cl::Program program(context, sources, &err);
program.build(devices, NULL, NULL, NULL);
cl::Kernel kernel(program, "rnd_float32_matrix", &err);
kernel.setArg(0, src_d);
cl::CommandQueue queue(context, devices[0], 0, &err);
cl::Event event;
err = queue.enqueueNDRangeKernel(
    kernel,
    cl::NullRange,
    cl::NDRange(8),
    // I've tried cl::NDRange(8) as well
    cl::NDRange(1),
    NULL,
    &event
);
event.wait();
err = queue.enqueueReadBuffer(
    // This is:
    // cl::Buffer src_d(
    //     context,
    //     CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    //     mem_size,
    //     src_h,
    //     &err);
    src_d,
    CL_TRUE,
    0,
    8,
    // This is float * src_h = new float[8];
    src_h);
for (int i = 0; i < 8; i++) {
    std::cout << src_h[i] << std::endl;
}
I may not show it in the code, but I also select a GPU device, and using context.getInfo(..) it shows I'm using my NVIDIA GTX 770M card, which reports 1024, 1024, and 64 work-items available in dimensions 0, 1 and 2. When this array prints I keep getting... 0, 1, 0, 0, 0, 0, 0, 0. I've also tried setting res[idx] = 5, and I get... 5, 5, 0, 0, 0, 0, 0, 0. So it seems that only two work-items are actually being used. What am I doing wrong?
Your command to read the data back from the device is only reading 8 bytes, which is two floats:
err = queue.enqueueReadBuffer(
    src_d,
    CL_TRUE,
    0,
    8, // <- This is the number of bytes, not the number of elements!
    // This is float * src_h = new float[8];
    src_h);
To read 8 floats, you would need to do this:
err = queue.enqueueReadBuffer(
    src_d,
    CL_TRUE,
    0,
    8 * sizeof(cl_float),
    // This is float * src_h = new float[8];
    src_h);
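As a defensive pattern (my suggestion, not part of the original answer), computing the byte size once and reusing it for both the buffer and the transfer avoids this class of mismatch:

const size_t N = 8;
const size_t mem_size = N * sizeof(cl_float); // bytes, used everywhere below

cl::Buffer src_d(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, src_h, &err);
// ... build program, set args, enqueue kernel as before ...
err = queue.enqueueReadBuffer(src_d, CL_TRUE, 0, mem_size, src_h);

Separately, since the kernel writes to src_d, the buffer should arguably be created with CL_MEM_READ_WRITE (or CL_MEM_WRITE_ONLY) rather than CL_MEM_READ_ONLY, which declares it read-only to kernels.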

clEnqueueWriteBuffer writes wrong data into VRAM

I have a very curious problem with clEnqueueWriteBuffer. In my current project, I would like to copy ~500 images (1 GB) onto the graphics card and average some pixels. The images are stored in one big double* array (size: width*height*nImages). If I copy 300 images into VRAM and read them back using clEnqueueReadBuffer, I get exactly what I had stored in RAM:
RAM: 14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
VRAM: 14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
However, if I load more than 350 images, the content of my cl_mem object is corrupt:
RAM: 14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
VRAM: -6,27743856220419E+66 -6,27743856220419E+66 -6,27743856220419E+66 -6,27743856220419E+66 -6,27743856220419E+66
I would be very happy if you could help me out!
Here is my code:
private: System::Void button7_Click(System::Object^ sender, System::EventArgs^ e) {
    std::string text;
    text = StringConvA(maskedTextBox1->Text);
    textBox1->Text += "You want a bin size of " + atoi(text.c_str()) + ". You have " + nforegroundImages + " images.\r\n";
    binWidth = atoi(text.c_str());
    nbins = (int)ceil((double)nforegroundImages / (double)binWidth);
    textBox1->Text += "That is going to give you " + nbins + " bins\r\n";

    //create context and cmd_queue
    context = clCreateContext(NULL, nDevices, &deviceID[0], NULL, NULL, &err);
    cmd_queue = clCreateCommandQueue(context, deviceID[0], NULL, &err);

    //allocate result memory
    //each result image will have width*height double entries. res_im is an array of pointers to double.
    res_im = (double*)malloc(width*height*sizeof(double)*nbins);

    cl_mem imageData_mem, result_mem, nWavenumber_mem, binSize_mem, imageSizeInPixels_mem, nbins_mem;
    imageData_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, width * height * sizeof(double)*nforegroundImages, NULL, NULL);
    result_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, width * height * sizeof(double)*nbins, NULL, NULL);
    nWavenumber_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
    binSize_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
    imageSizeInPixels_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
    nbins_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
    clFinish(cmd_queue);

    int imageSizeInPixels = width*height;

    //this is where the images are copied into VRAM. If nforegroundImages>300, the data
    //in VRAM is wrong, otherwise it is the same as in the images array
    err = clEnqueueWriteBuffer(cmd_queue, imageData_mem, CL_TRUE, 0, width*height*sizeof(double)*nforegroundImages, (void*)images, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(cmd_queue, nWavenumber_mem, CL_TRUE, 0, sizeof(int), (void*)&nforegroundImages, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(cmd_queue, binSize_mem, CL_TRUE, 0, sizeof(int), (void*)&binWidth, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(cmd_queue, imageSizeInPixels_mem, CL_TRUE, 0, sizeof(int), (void*)&imageSizeInPixels, 0, NULL, NULL);
    err = clEnqueueWriteBuffer(cmd_queue, nbins_mem, CL_TRUE, 0, sizeof(int), (void*)&nbins, 0, NULL, NULL);
    clFinish(cmd_queue);

    //read the content of imageData_mem and store it in test array
    double * test = (double*)malloc(width*height*sizeof(double)*nforegroundImages);
    err = clEnqueueReadBuffer(cmd_queue, imageData_mem, CL_TRUE, 0, width*height*sizeof(double)*nforegroundImages,
                              test, 0, NULL, NULL);
    clFinish(cmd_queue);

    //compare original value from the images array to the value retrieved from the VRAM
    textBox1->Text += images[1] + "\t" + images[1] + "\t" + images[10] + "\t" + images[100] + "\t" + images[1000] + "\t\r\n"; //original data
    textBox1->Text += test[1] + "\t" + test[1] + "\t" + test[10] + "\t" + test[100] + "\t" + test[1000] + "\t\r\n"; //retrieved from imageData_mem
    free(test);

    //build the program from the source file and print the program build log
    cl_program program[2];
    cl_kernel kernel[2];
    const char * filename = "addKernel.c";
    char *program_source = load_program_source(filename);
    program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
                                           NULL, &err);
    if (err == CL_OUT_OF_HOST_MEMORY){
        textBox1->Text += "Error: out of Host Memory!\r\n";
    }
    else if (err == CL_INVALID_CONTEXT){
        textBox1->Text += "Error: invalid Context!\r\n";
    }
    else if (err == CL_INVALID_VALUE){
        textBox1->Text += "Error: invalid Value!\r\n";
    }
    err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
    textBox1->Text += "Program build error: " + err + "\r\n";

    cl_build_status status;
    size_t logSize;
    clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
    clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
    char* programLog;
    programLog = (char*)calloc(logSize + 1, sizeof(char));
    clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_LOG, logSize + 1, programLog, NULL);
    this->textBox1->Text += "Program build info: error=" + err + ", status=" + status + ", programLog:\r\n" + *programLog + "\r\n" + "In case of an error please make sure that openCL has been initialized\r\n";

    kernel[0] = clCreateKernel(program[0], "filterSpectrum", &err);

    //(__global double *imageData, __global double *result, __constant int *nWavenumbers, __constant int *binSize, __constant int *imageSizeInPixels, __constant int *nbins)
    // Now setup the arguments to our kernel
    err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &imageData_mem);
    err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &result_mem);
    err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &nWavenumber_mem);
    err |= clSetKernelArg(kernel[0], 3, sizeof(cl_mem), &binSize_mem);
    err |= clSetKernelArg(kernel[0], 4, sizeof(cl_mem), &imageSizeInPixels_mem);
    err |= clSetKernelArg(kernel[0], 5, sizeof(cl_mem), &nbins_mem);

    size_t local_work_size = 32;

    // Run the calculation by enqueuing it and forcing the
    // command queue to complete the task
    size_t global_work_size = width*height;
    err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
    clFinish(cmd_queue);

    // Once finished read back the results from the answer
    // array into the results array
    err = clEnqueueReadBuffer(cmd_queue, result_mem, CL_TRUE, 0, width*height*sizeof(double)*nbins,
                              res_im, 0, NULL, NULL);
    clFinish(cmd_queue);
    textBox1->Text += "result values " + res_im[1] + "\t" + res_im[100] + "\t" + res_im[1000] + "\t" + res_im[10000] + "\t" + res_im[100000] + "\t" + res_im[1000000] + "\r\n";

    hScrollBar2->Maximum = nbins+3;

    clReleaseMemObject(imageSizeInPixels_mem);
    clReleaseMemObject(imageData_mem);
    clReleaseMemObject(result_mem);
    clReleaseMemObject(nWavenumber_mem);
    clReleaseMemObject(binSize_mem);
    clReleaseMemObject(nbins_mem);
    clReleaseCommandQueue(cmd_queue);
    clReleaseContext(context);
}
You are most likely requesting more memory than the driver allows in a single allocation. Also, it looks like you aren't checking most of the error codes that the OpenCL runtime functions return; doing so makes it much easier to diagnose problems with OpenCL programs, and you really should do it for every API call.
You can find out the largest single memory allocation your device supports with the following code snippet:
cl_ulong maxMemAlloc;
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL);
textBox1->Text += "Maximum memory allocation size is " + maxMemAlloc + " bytes\r\n";
It's often the case that the largest single allocation allowed is much smaller than the total GPU memory: the OpenCL specification only requires it to be at least 1/4 of the device's global memory size, or 128 MB, whichever is greater.
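To act on the error-checking advice, a small helper like this sketch (the macro name is my own) makes it cheap to check every call. Note that clCreateBuffer reports its error through its last pointer argument, which the code above passes as NULL; an allocation larger than CL_DEVICE_MAX_MEM_ALLOC_SIZE is expected to fail there with CL_INVALID_BUFFER_SIZE.

#define CL_CHECK(err)                                          \
    do {                                                       \
        cl_int _e = (err);                                     \
        if (_e != CL_SUCCESS) {                                \
            fprintf(stderr, "OpenCL error %d at %s:%d\n",      \
                    _e, __FILE__, __LINE__);                   \
            exit(1);                                           \
        }                                                      \
    } while (0)

// Usage: a failing copy now reports itself immediately.
CL_CHECK(clEnqueueWriteBuffer(cmd_queue, imageData_mem, CL_TRUE, 0,
                              width * height * sizeof(double) * nforegroundImages,
                              (void*)images, 0, NULL, NULL));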