I am trying to learn OpenCL by writing a simple program to add the absolute value of a subtraction of a point's dimensions. When I finished writing the code, the output seemed wrong and so I decided to integrate some printf's in the code and kernel to verify that all the variables are passed correctly to the kernel. By doing this, I learned that the input variables were NOT correctly sent to the kernel, because printing them would return incorrect data (all zeros, to be precise). I have tried changing the data type from uint8 to int, but that did not seem to have any effect. How can I correctly send uint8 variables to the memory buffer in OpenCL? I really cannot seem to identify what I am doing wrong in writing and sending the memory buffers so that they show up incorrectly and would appreciate any opinion, advice or help.
Thank you in advance.
EDIT: Question is now solved. I have updated the code below according to the kind feedback provided in the comment and answer sections. Many thanks!
Code below:
#include <iostream>
#include <chrono>
#include <CL/cl.hpp>
#include <stdio.h>
#include <stdlib.h>
using namespace std;
// Index of the OpenCL platform to use.
#define USE_PLATFORM_NR 0
// Generic buffer size in bytes (100 MiB). Parenthesized so the macro behaves as a single value
// inside larger expressions such as `x / SIZE`.
#define SIZE (100*1024*1024UL)
//SAD DEFINES
// Number of dimensions per point and number of points in the SAD test sets.
#define NUM_DIM_SAD 5
#define NUM_POINTS_SAD 10
//#define NUM_LOOPS_SAD 20
// Seed passed to srand() before the SAD inputs are generated.
#define SAD_SEED 2014
//NUM_LOOPS * NUM_POINTS should be 75M
//SSD DEFINES
#define NUM_DIM_SSD 128
#define NUM_POINTS_SSD 150000
//#define NUM_LOOPS_SSD 1000
#define SSD_SEED 2048
//NUM_LOOPS * NUM_POINTS should be 150M
// Threadblock sizes (e.g. for kernels )
#define TS 5
// =================================================================================================
// Set the kernel as a string
// OpenCL C source of the "SAD" kernel, built from adjacent string literals (no newlines are
// inserted between them, so the device compiler sees a single line).
// Each work-item handles one point, selected by its global id:
//   1. prints the point index and every m1_set/m2_set value belonging to that point,
//   2. accumulates |m1_set - m2_set| over the point's num_dim_sad values into an unsigned char,
//   3. prints the accumulated value and stores it in sad_gpu[Point].
// num_points_sad is accepted but never read in the kernel body, so the kernel performs no bounds
// check against the number of points.
const char* kernelstring =
"__kernel void SAD(const int num_points_sad, const int num_dim_sad,"
" const global unsigned char* m1_set,"
" const global unsigned char* m2_set,"
" global unsigned char* sad_gpu) {"
" const int Point = get_global_id(0);"
" unsigned char acc = 0;"
" printf(\" POINT: %d \\n \", Point); "
" for (int s=0; s<num_dim_sad ; s++) {"
" printf(\"GPU: i = %d | m1_set = %d| m2_set = %d \\n \",Point*num_dim_sad + s,m1_set[Point*num_dim_sad+s],m2_set[Point*num_dim_sad+s]);}"
" for (int k=0; k<num_dim_sad; k++) {"
" acc += abs( m1_set[Point*num_dim_sad + k] - m2_set[Point*num_dim_sad + k] );"
" }"
" printf(\"ACC: %d \\n \",acc);"
" sad_gpu[Point] = acc;"
"}";
// =================================================================================================
// Matrix-multiplication using a custom OpenCL SGEMM kernel.
int main() {
cout << "Computing naive SAD & SSD for result checking" << endl;
//naive implementation on CPU for result checking
uint8_t* m1_set;// [NUM_POINTS][NUM_DIM];
uint8_t* m2_set;// [NUM_POINTS][NUM_DIM];
m1_set = (uint8_t*)malloc(sizeof(uint8_t*) * NUM_POINTS_SAD * NUM_DIM_SAD);
m2_set = (uint8_t*)malloc(sizeof(uint8_t*) * NUM_POINTS_SAD * NUM_DIM_SAD);
uint8_t* sad; // [NUM_POINTS];
uint8_t* sad_gpu;// [NUM_POINTS];
sad = (uint8_t*)malloc(sizeof(uint8_t) * NUM_POINTS_SAD);
sad_gpu = (uint8_t*)malloc(sizeof(uint8_t) * NUM_POINTS_SAD);
srand(SAD_SEED);
for (int i = 0; i < NUM_POINTS_SAD * NUM_DIM_SAD; i++)
{
sad[i/NUM_DIM_SAD] = 0;
m1_set[i] = rand() / (uint8_t)RAND_MAX;
m2_set[i] = rand() / (uint8_t)RAND_MAX;
cout << "CPU: i = " << i << "| m1_set = " << (unsigned int)m1_set[i] << "| m2_set = " << (unsigned int)m2_set[i] << endl;
}
for (int i = 0; i < NUM_POINTS_SAD * NUM_DIM_SAD; i++)
sad[i/NUM_DIM_SAD] += abs(m1_set[i] - m2_set[i]);
cl_int err;
// Configure the OpenCL environment
printf(">>> Initializing OpenCL...\n");
cl_platform_id platform = USE_PLATFORM_NR;
err = clGetPlatformIDs(1, &platform, NULL);
if (err != CL_SUCCESS) { cout << err << "clGetPlatformId"; return -1;}
cl_device_id device = 0;
err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (err != CL_SUCCESS) { cout << err << "clGetDeviceIDs"; return -1; }
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err != CL_SUCCESS) { cout << err << "clCreateContext"; return -1; }
cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
if (err != CL_SUCCESS) { cout << err << "clCreateCommandQueue"; return -1; }
char deviceName[1024];
err = clGetDeviceInfo(device, CL_DEVICE_NAME, 1024, deviceName, NULL);
if (err != CL_SUCCESS) { cout << err << "clGetDeviceInfo"; return -1; }
cl_event event = NULL;
// Compile the kernel
cl_program program = clCreateProgramWithSource(context, 1, &kernelstring_sad, NULL, &err);
if (err != CL_SUCCESS) { cout << err << "clCreateProgramWithSource"; return -1; }
err = clBuildProgram(program, 0, NULL, "", NULL, NULL);
if (err != CL_SUCCESS) { cout << err << "clBuildProgram"; return -1; }
// Check for compilation errors
size_t logSize;
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
if (err != CL_SUCCESS) { cout << err << "clGetProgramBuildInfo"; return -1; }
char* messages = (char*)malloc((1 + logSize) * sizeof(char));
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, messages, NULL);
if (err != CL_SUCCESS) { cout << err << "clGetProgramBuildInfo2"; return -1; }
messages[logSize] = '\0';
if (logSize > 10) { printf(">>> Compiler message: %s\n", messages); }
free(messages);
// Prepare OpenCL memory objects
cl_mem buf_m1 = clCreateBuffer(context, CL_MEM_READ_ONLY, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), NULL, &err);
if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_m1"; return -1; }
cl_mem buf_m2 = clCreateBuffer(context, CL_MEM_READ_ONLY, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), NULL, &err);
if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_m2"; return -1; }
cl_mem buf_sad = clCreateBuffer(context, CL_MEM_READ_WRITE, NUM_POINTS_SAD * sizeof(uint8_t), NULL, NULL);
if (err != CL_SUCCESS) { cout << err << "clCreateBuffer_sad"; return -1; }
// Copy matrices to the GPU
err = clEnqueueWriteBuffer(queue, buf_m1, CL_TRUE, 0, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), m1_set, 0, NULL, NULL);
if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_m1"; return -1; }
err = clEnqueueWriteBuffer(queue, buf_m2, CL_TRUE, 0, NUM_DIM_SAD * NUM_POINTS_SAD * sizeof(uint8_t), m2_set, 0, NULL, NULL);
if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_m2"; return -1; }
err = clEnqueueWriteBuffer(queue, buf_sad, CL_TRUE, 0, NUM_POINTS_SAD * sizeof(uint8_t), sad_gpu, 0, NULL, NULL);
if (err != CL_SUCCESS) { cout << err << "clEnqueueWriteBuffer_sad"; return -1; }
// Configure the kernel and set its arguments
int num_points_sad = NUM_POINTS_SAD;
int num_dim_sad = NUM_DIM_SAD;
cl_kernel kernel = clCreateKernel(program, "SAD", &err);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel"; return -1; }
err = clSetKernelArg(kernel, 0, sizeof(int), (void*)&num_points_sad);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg0"; return -1; }
err = clSetKernelArg(kernel, 1, sizeof(int), (void*)&num_dim_sad);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg1"; return -1; }
err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buf_m1);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg2"; return -1; }
err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&buf_m2);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel_arg3"; return -1; }
err = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&buf_sad);
if (err != CL_SUCCESS) { cout << err << "clCreateKernel4"; return -1; }
// Start the timed loop
printf(">>> Starting SAD GPU run...\n");
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
// const size_t local[1] = { TS };
const size_t global[1] = { NUM_POINTS_SAD };
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global, NULL, 0, NULL, &event); //local
if (err != CL_SUCCESS) { cout << err << "clEnqueueNDRangeKernel"; return -1; }
// Wait for calculations to be finished
clWaitForEvents(1, &event);
// End the timed loop
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
// Copy the output matrix C back to the CPU memory
clEnqueueReadBuffer(queue, buf_sad, CL_TRUE, 0, NUM_POINTS_SAD * sizeof(uint8_t), sad_gpu, 0, NULL, NULL);
auto us = std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count();
std::cout << "Time difference = " << us << " us " << std::endl;
// Free the OpenCL memory objects
clReleaseMemObject(buf_m1);
clReleaseMemObject(buf_m2);
clReleaseMemObject(buf_sad);
// Clean-up OpenCL
clReleaseCommandQueue(queue);
clReleaseContext(context);
clReleaseProgram(program);
clReleaseKernel(kernel);
for (int i = 0; i < NUM_POINTS_SAD; i++)
{
cout << "i: " << i;
cout << " | CPU: " << (unsigned int)sad[i];
cout << " | GPU: " << (unsigned int)sad_gpu[i];
cout << endl;
}
// Free the host memory objects
free(m1_set);
free(m2_set);
free(sad);
free(sad_gpu);
// Exit
return 0;
}
There is an error in the call that creates the context: one of the arguments is passed in the wrong position.
Instead:
cl_context context = clCreateContext(NULL, 1, &device, NULL, &err, NULL);
Should be:
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
^^^^^^^^^^
Also, the way the errors are reported is still not very helpful. It should be something like this:
cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
if (err != CL_SUCCESS)
{
cout << err << "clCreateContext";
return -1;
}
This way we stop the code execution when the error occurred and we know for which function it happened.
======= UPDATE ========================================================
The wrong type is being used in the kernel: in OpenCL, uint8 is a vector type, i.e. a vector of 8 values of type uint (unsigned int), not a single 8-bit unsigned integer.
To fix the problem use uchar/unsigned char type in the OpenCL kernel which is an equivalent of uint8_t/unsigned char from c++.
See OpenCL data types and Scalar data types.
Related
I am experimenting with clCloneKernel to see how a kernel can be used by multiple host threads. The OpenCL specification declares that setting kernel arguments (amongst other things) is not thread-safe. So using clCloneKernel should provide a workaround if the same kernel needs to be called by multiple host threads.
The problem is that as soon as clCloneKernel is called on an initialized cl_kernel object (regardless of whether it is before or after setting the arguments), the program segfaults.
#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
using namespace std;
// Capacity of the device-ID array filled by clGetDeviceIDs in main().
const int MAXNUMDEV = 10;
// OpenCL C source of the "hello" kernel: every work-item prints its global id and its
// work-group id. The backslash-newline pairs are line continuations inside the literal,
// so the whole kernel ends up on a single line.
string kernSource = " \
kernel void hello() \
{ \
int ID = get_global_id(0); \
int grID = get_group_id(0); \
printf(\"Work item %i from group %i says hello!\\n\", ID, grID); \
}";
//============================================
// Releases each OpenCL handle that is non-null: kernel first, then program, command queue and
// finally the context. Null (0) handles are skipped, so this can be called at any point of the
// set-up sequence with whatever has been created so far.
void cleanUp (cl_context c, cl_command_queue q, cl_program p, cl_kernel k)
{
    if (k)
    {
        clReleaseKernel (k);
    }
    if (p)
    {
        clReleaseProgram (p);
    }
    if (q)
    {
        clReleaseCommandQueue (q);
    }
    if (c)
    {
        clReleaseContext (c);
    }
}
//============================================
int main ()
{
cl_int errNum;
cl_uint numPlatforms;
cl_platform_id firstPlatformId;
cl_device_id devID[MAXNUMDEV];
cl_uint numDev;
cl_context cont = 0; // initialize for cleanup check
cl_command_queue q = 0;
cl_program pr = 0;
cl_kernel kernel = 0;
// Get a reference to an object representing a platform
errNum = clGetPlatformIDs (1, &firstPlatformId, &numPlatforms);
if (errNum != CL_SUCCESS || numPlatforms <= 0)
{
cerr << "Failed to find any OpenCL platforms." << endl;
return 1;
}
// Get the device IDs matching the CL_DEVICE_TYPE parameter, up to the MAXNUMDEV limit
errNum = clGetDeviceIDs (firstPlatformId, CL_DEVICE_TYPE_ALL, MAXNUMDEV, devID, &numDev);
if (errNum != CL_SUCCESS || numDev <= 0)
{
cerr << "Failed to find any OpenCL devices." << endl;
return 2;
}
char devName[100];
size_t nameLen;
for (int i = 0; i < numDev; i++)
{
errNum = clGetDeviceInfo (devID[i], CL_DEVICE_NAME, 100, (void *) devName, &nameLen);
if (errNum == CL_SUCCESS)
cout << "Device " << i << " is " << devName << endl;
}
cl_context_properties prop[] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties) firstPlatformId,
0 // termination
};
cont = clCreateContext (prop, numDev, devID, NULL, // no callback function
NULL, // no data for callback
&errNum);
if (errNum != CL_SUCCESS)
{
cerr << "Failed to create a context." << endl;
cleanUp (cont, q, pr, kernel);
return 1;
}
cl_queue_properties qprop[] = {
CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
0
};
q = clCreateCommandQueueWithProperties (cont, devID[0], qprop, &errNum);
if (errNum != CL_SUCCESS)
{
cerr << "Failed to create a command queue" << endl;
cleanUp (cont, q, pr, kernel);
return 1;
}
const char *src = kernSource.c_str ();
size_t len = kernSource.size ();
pr = clCreateProgramWithSource (cont, 1, (const char **) (&src), &len, &errNum);
if (errNum != CL_SUCCESS)
{
cerr << "Failed to create program." << endl;
cleanUp (cont, q, pr, kernel);
return 1;
}
errNum = clBuildProgram (pr, 1, devID, NULL, NULL, NULL);
if (errNum != CL_SUCCESS)
{
cerr << "Failed to build program" << endl;
cleanUp (cont, q, pr, kernel);
return 1;
}
kernel = clCreateKernel (pr, "hello", &errNum);
if (errNum != CL_SUCCESS || kernel == NULL)
{
cerr << "Failed to create kernel" << endl;
cleanUp (cont, q, pr, kernel);
return 1;
}
cl_kernel copyKern = clCloneKernel(kernel, &errNum); // <<<<<<<<<<<<<<<
// work item index space and group size setup
size_t idxSpace[] = { 12 };
size_t localWorkSize[] = { 3 };
cl_event completeEv;
errNum = clEnqueueNDRangeKernel (q, kernel, 1, NULL, idxSpace, localWorkSize, 0, NULL, &completeEv);
// wait for enqueued command to finish
clWaitForEvents (1, &completeEv);
cleanUp (cont, q, pr, kernel);
return 0;
}
clCloneKernel() was introduced with OpenCL 2.1. Does your OpenCL platform implement this version of the standard? I suspect it might not, hence the crash.
My Problem is, that I can't modify values in my kernel.
This is the code that does not work:
1st: the kernel:
// Each work-item stores its own global id (dimension 0) in the x component of the position of
// the particle with the same index. There is no bounds check, so the launch must not use more
// work-items than the buffer has particles.
__kernel void GetCellIndex(__global Particle* particles) {
int globalID = get_global_id(0);
particles[globalID].position.x = globalID;
};
2nd: the struct used in the kernel:
// Particle record as seen by the kernel: a single float3 position.
typedef struct _Particle
{
float3 position;
}Particle;
3rd: the pushing from CPU to GPU:
(Particle*) particles = (Particle*)malloc(sizeof(Particle)*200);
for (int i = 0; i < 200; i++)
{
particles[i].position.x = 5f;
}
cl_mem cl_Particles = clCreateBuffer(context, CL_MEM_READ_WRITE |
CL_MEM_COPY_HOST_PTR, sizeof(Particle)*maxParticle, &particles[0], NULL);
//init of kernel etc.
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&cl_Particles);
if (err != 0) {
std::cout << "Error: setKernelArg 0 does not work!" << std::endl;
system("Pause");
}
4th: run the kernel:
// Launch one work-item per particle. The global size has to be passed as a pointer to a real
// size_t object, and it must not exceed the 200 particles in the buffer because the kernel
// indexes the buffer with the global id without any bounds check.
size_t globalItems = 200;
size_t localItem = 1;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalItems, &localItem, 0, NULL, NULL);
if (err != 0) {
    std::cout << "Error: EnqueueNDRange does not work!" << std::endl;
}
// Submit the queued commands to the device ...
err = clFlush(queue);
if (err != 0) {
    std::cout << "Error: Flush does not work: " << err << std::endl;
}
// ... and block until they have all completed.
err = clFinish(queue);
if (err != 0) {
    std::cout << "Error: Finish does not work: " << err << std::endl;
}
5th: the used struct on the GPU:
// Particle record declared with cl_float3, the OpenCL host-header vector type, so it mirrors the
// float3-based struct used inside the kernel.
typedef struct _Particle
{
cl_float3 position;
}Particle;
6th: finally the reading of the buffer:
clEnqueueReadBuffer(queue, cl_Particles, CL_TRUE, 0, 200 * sizeof(Particle), particles, 0, NULL, NULL);
after this steps my kernel does not effect the values returned in clEnqueueReadBuffer...
Does anyone know why? what is the problem here
Solved the problem:
Change the malloc line to something like
particles = new Particles[200];
also write the data in a seperate step to the buffer (with clEnqueueWriteBuffer(...))
and the rest should work just like this code above
UPDATE: clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(double), C, 0, NULL, NULL); is returning -5, CL_OUT_OF_RESOURCES. This function/call should never return this!
I've started using OpenCL and have come across a problem. If I allow a for loop (in the kernel) to run 10000 times I get all of C to be 0 if I allow the loop to run for 8000 the results are all correct.
I have added waits around the kernel to ensure it completes, thinking I was pulling the data out before completion, and have tried both clWaitForEvents and clFinish. No errors are signalled by any of the calls. When I used ints, the for loop would work at a size of 4000000. Floats and doubles have the same problem; however, floats work at 10000 but not at 20000. When I used floats, I removed #pragma OPENCL EXTENSION cl_khr_fp64 : enable to check that it wasn't the problem.
Is this some weird memory thing, or am I using OpenCL wrong? I realise that in most kernels I won't be implementing for loops like this, but this seems like an issue. I have also removed __private to see if that was the problem; no change. So is there a limit on the size of for loops in OpenCL kernels? Is it hardware specific? Or is this a bug?
The kernel is a simple kernel, which adds 2 arrays (A+B) together and outputs another (C). In order to get a feel for performance I put a for loop around each calculation to slow it up/increase the number of operations per run through.
The code for the kernel is as follows:
// Enables the double-precision extension used by the kernel arguments below.
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
// Element-wise vector addition: work-item i writes C[i] = A[i] + B[i].
// There is no bounds check, so the global size must not exceed the number of elements in the
// three buffers.
__kernel void vector_add(__global double *A, __global double *B, __global double *C)
{
// Get the index of the current element
int i = get_global_id(0);
// Do the operation
// The same store is repeated 10000 times; the loop only adds work per work-item and does not
// change the result.
for (__private unsigned int j = 0; j < 10000; j++)
{
C[i] = A[i] + B[i];
}
}
The code I'm running is as follows: (I ensure that the variables are consistent between both pieces of code when I switch between float and double)
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 4000000;
double *A = (double*)malloc(sizeof(double)*LIST_SIZE);
double *B = (double*)malloc(sizeof(double)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = static_cast<double>(i);
B[i] = static_cast<double>(LIST_SIZE - i);
}
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
// clGetPlatformIDs(1, &platform_id, NULL);
//clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, ret_num_devices);
cl_int ret = clGetPlatformIDs(1, &platform_id, NULL);
if (ret != CL_SUCCESS) {
printf("Error: Failed to get platforms! (%d) \n", ret);
return EXIT_FAILURE;
}
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
if (ret != CL_SUCCESS) {
printf("Error: Failed to query platforms to get devices! (%d) \n", ret);
return EXIT_FAILURE;
}
/*
cl_int ret = clGetPlatformIDs(1, &platform_id, NULL);
if (ret != CL_SUCCESS) {
printf("Error: Failed to get platforms! (%d) \n", ret);
return EXIT_FAILURE;
}
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1,
&device_id, &ret_num_devices);
if (ret != CL_SUCCESS) {
printf("Error: Failed to query platforms to get devices! (%d) \n", ret);
return EXIT_FAILURE;
}
*/
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(double), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(double), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(double), NULL, &ret);
if (ret != CL_SUCCESS) {
printf("Error: Buffer Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(double), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(double), B, 0, NULL, NULL);
std::cout << "Begin Compile" << "\n";
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
if (ret != CL_SUCCESS) {
printf("Error: Program Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret != CL_SUCCESS) {
printf("Error: ProgramBuild Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
if (ret != CL_SUCCESS) {
printf("Error: Kernel Build Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
std::cout << "End Compile" << "\n";
std::cout << "Begin Data Move" << "\n";
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
std::cout << "End Data Move" << "\n";
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
std::cout << "Begin Execute" << "\n";
cl_event event;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, &event);
clFinish(command_queue);
//clWaitForEvents(1, &event);
std::cout << "End Execute" << "\n";
if (ret != CL_SUCCESS) {
printf("Error: Execute Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
// Read the memory buffer C on the device to the local variable C
std::cout << "Begin Data Move" << "\n";
double *C = (double*)malloc(sizeof(double)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(double), C, 0, NULL, NULL);
if (ret != CL_SUCCESS) {
printf("Error: Read Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
clFinish(command_queue);
std::cout << "End Data Move" << "\n";
std::cout << "Done" << "\n";
std::cin.get();
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%f + %f = %f \n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
std::cout << "Number of Devices: " << ret_num_devices << "\n";
std::cin.get();
return 0;
}
I've had a look on the internet and can't find people with similar problems, this is a concern as it could lead to code that works well till scaled up...
I'm running Ubuntu 14.04, and have a laptop graphics card for a RC520 which I run with bumblebee/optirun. If this bug isn't reproducible on other machines up to a loop size of 4000000 then I will log a bug with bumblebee/optirun.
Cheers
I found the issue, GPUs attached to displays/active VGAs/etc have a Watch Dog Timer that times out after ~5s. This is the case for cards that aren't teslas, which have this functionality to be turned off. Running on a secondary card is a work around. This sucks and needs to be fixed ASAP. It's definitely an NVidia issue, not sure about about AMD, either way, this is terrible.
Workarounds are registry changes in Windows and, in Linux/Ubuntu, altering the X conf and placing:
option "Interactive" "0"
In the gap with the graphics card, however X conf is now not generated in later versions and may have to be manually created. If anyone has a copy and paste console code fix to this that would be great and a better answer.
I modified slightly this program for my understanding, but now it is not running.
This is my code:
#include <iostream>
#include "CL/cl.h"
#include <math.h>
using namespace std;
#define MYSIZE 1000
// Selects the element type (myreal) and the matching kernel source string (SOURCE).
// NOTE(review): these checks and pragmas are evaluated by the host C++ preprocessor, not by the
// OpenCL C compiler that builds SOURCE. Whether the included host headers define cl_khr_fp64 or
// cl_amd_fp64 is not visible in this file — confirm which branch is actually taken.
#if defined(cl_khr_fp64) //Khronos extension available
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#define DOUBLE_SUPPORT_AVAILABLE
#elif defined(cl_amd_fp64) //AMD extension available
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#define DOUBLE_SUPPORT_AVAILABLE
#endif
#ifdef DOUBLE_SUPPORT_AVAILABLE
//double
// Double-precision variant: addArray writes C[i] = A[i] + B[i] for every work-item with i < size.
typedef double myreal;
const char *SOURCE = "\n" \
"__kernel void addArray(__global double *A, __global double *B, __global double *C, const unsigned int size) \n" \
"{ \n" \
"int i = get_global_id(0); \n" \
"if(i < size) \n" \
" C[i] = A[i] + B[i]; \n" \
"} \n" \
"\n";
#else
//float
// Single-precision variant of the same kernel; work-items with i >= size do nothing.
typedef float myreal;
const char *SOURCE = "\n" \
"__kernel void addArray(__global float *A, __global float *B, __global float *C, const unsigned int size) \n" \
"{ \n" \
"int i = get_global_id(0); \n" \
"if(i < size) \n" \
" C[i] = A[i] + B[i]; \n" \
"} \n" \
"\n";
#endif
int main(int argc, char *argv[])
{
int devType = CL_DEVICE_TYPE_GPU;
unsigned int count = MYSIZE;
cl_int err;//err returned from API
size_t global;//global size
size_t local;//local size
cl_platform_id platform;
cl_device_id device;
cl_context context;
cl_command_queue commands;
cl_program program;
cl_kernel kernel;
//connect to a compute device
err = clGetPlatformIDs(1, &platform, NULL);
if(err != CL_SUCCESS)
{
cerr << "ERROR: Could not find a platform" << endl;
return -1;
}
//get a device of the appropriate type
err = clGetDeviceIDs(platform, devType, 1, &device, NULL);
if(err != CL_SUCCESS)
{
cerr << "ERROR: Could not find a device" << endl;
return -1;
}
//create a context
context = clCreateContext(0, 1, &device, NULL, NULL, &err);
if(!context || (err != CL_SUCCESS))
{
cerr << "ERROR: Could not create a context" << endl;
return -1;
}
//create a command queue
commands = clCreateCommandQueue(context, device, 0, &err);
if(!commands || (err != CL_SUCCESS))
{
cerr << "ERROR: Could not create a command queue" << endl;
return -1;
}
//create the compute program from source
program = clCreateProgramWithSource(context, 1, (const char **) &SOURCE, NULL, &err);
if(!program || (err != CL_SUCCESS))
{
cerr << "ERROR: Could not create a program from source" << endl;
return -1;
}
//build the program executable
err = clBuildProgram(program, NULL, NULL, NULL, NULL, NULL);
if(err != CL_SUCCESS)
{
size_t len;
char buffer[2048];
clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len);
cerr << "ERROR: Could not build the program executable" << endl;
cerr << buffer << endl;
return -1;
}
//create the kernel
kernel = clCreateKernel(program, "addArray", &err);
if(!kernel || (err != CL_SUCCESS))
{
cerr << "Could not create the kernel" << endl;
return -1;
}
myreal *A = new myreal[MYSIZE];
myreal *B = new myreal[MYSIZE];
myreal *C = new myreal[MYSIZE];
for(int i = 0; i < MYSIZE; i++)
{
A[i] = sqrt(i);
B[i] = -sqrt(i);
}
unsigned int correct = 0;//correct answers
cl_mem A_cl;
cl_mem B_cl;
cl_mem C_cl;
//create device memory buffer
A_cl = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(myreal) * count, NULL, NULL);
B_cl = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(myreal) * count, NULL, NULL);
C_cl = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(myreal) * count, NULL, NULL);
if(!A_cl || !B_cl || !C_cl)
{
cerr << "Could not create device memory buffer" << endl;
return -1;
}
//transfer data to device
err = clEnqueueWriteBuffer(commands, A_cl, CL_TRUE, 0, sizeof(myreal) * count, A, 0, NULL, NULL);
if(err != CL_SUCCESS)
{
cerr << "Could not transfer data to device" << endl;
return -1;
}
err = clEnqueueWriteBuffer(commands, B_cl, CL_TRUE, 0, sizeof(myreal) * count, B, 0, NULL, NULL);
if(err != CL_SUCCESS)
{
cerr << "Could not transfer data to device" << endl;
return -1;
}
//set the arguments to the compute kernel
err = 0;
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &A_cl);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &B_cl);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &C_cl);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &count);
if(err != CL_SUCCESS)
{
cerr << "Could not set args for kernel" << endl;
return -1;
}
//get max work group size
err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL);
if(err != CL_SUCCESS)
{
cerr << "Could not get the kernel work group size" << endl;
return -1;
}
//execute the kernel using max work group size
global = count;
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL);
if(err != CL_SUCCESS)
{
cerr << "Could not enqueue the kernel for execution: " << err << endl;
return -1;
}
//wait for all commands to finish
clFinish(commands);
//read back the results to C
err = clEnqueueReadBuffer(commands, C_cl, CL_TRUE, 0, sizeof(myreal) * count, C, 0, NULL, NULL);
if(err != CL_SUCCESS)
{
cerr << "Could not read data from C" << endl;
return -1;
}
//validate the results
correct = 0;
myreal check = 0;
for(int i = 0; i < MYSIZE; i++)
{
check = A[i] + B[i] - C[i];
if((check < 1e-14) && (check > -1e-14))
correct++;
}
cout << "Computed " << correct << " correct results with " << ((correct / MYSIZE) * 100) << "% success rate!" << endl;
delete[] A;
delete[] B;
delete[] C;
clReleaseMemObject(A_cl);
clReleaseMemObject(B_cl);
clReleaseMemObject(C_cl);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(commands);
clReleaseContext(context);
return 0;
return 0;
}
The output I am getting is:
Could not enqueue the kernel for execution: -54
What does this error -54 mean?
I am using MinGW 4.4 32 bit compiler on Windows7 64 bit.
My GPU is ATI Radeon 7670m which has OpenCL 1.1 drivers.
I am using APP SDK 2.9 for 64 bit.
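In the Khronos reference cl.h, -54 means CL_INVALID_WORK_GROUP_SIZE, which is quite self-explanatory: the local work-group size passed to clEnqueueNDRangeKernel is not valid for the given global size (in OpenCL 1.x the global size must be evenly divisible by the local size).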
Tip: if you don't have a bound on workgroup size, then you can pass NULL instead of local and let the enqueue function figure it out for you.
Actually, the problem is that my program sometimes fails with a SIGSEGV fault, but not always. That is, it sometimes runs well and sometimes crashes. So I wonder whether it is because my C program uses a lot of memory. And does the resource limit change from run to run?
Hope for your reply, thanks.
The code is so long and I'll be glad to hear from you about what section that you need.
But I have a piece of the debug information and I dont know would it be helpful for you guys:
[New Thread 0x7ffff7e63700 (LWP 31256)]
[New Thread 0x7ffff393f700 (LWP 31257)]
[New Thread 0x7ffff312c700 (LWP 31258)]
[New Thread 0x7ffff2919700 (LWP 31260)]
[New Thread 0x7ffff2106700 (LWP 31261)]
Detaching after fork from child process 31265.
Detaching after fork from child process 31266.
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff708944a in _int_malloc () from /lib64/libc.so.6
As you can see, after the several threads are built, the malloc faces problems. Will it be the trouble of memory capacity?
And here is some of my codes:
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <cmath>
#include <ctime>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "compcol_double.h"
#include "comprow_double.h"
#include "coord_double.h"
#include "iohb_double.h"
#include "dehaze_set_opencl.h"
#include "default_set_opencl.h"
#include "load_image_opencl.h"
using namespace std;
//relative path is where program is executing
const char *kernel_path = "dehaze.cl";
const char *kernel_name = "dehaze";
const int ARRAY_SIZE = 100;
/**
 * Host driver for the dehaze pipeline.
 *
 * Sets up an OpenCL context on the default device, loads the hard-coded input
 * image, and runs three kernels in sequence:
 *   1. "get_t_mat" : image -> t buffer (one float per pixel)
 *   2. "get_win_b" : t buffer -> win_b buffer (one float per pixel)
 *   3. "get_vals"  : image + pixel indices -> (vals, row_inds, col_inds)
 *                    triplets, neb_size * neb_size entries per pixel
 * The triplets are then handed to a Coord_Mat_double sparse matrix.
 *
 * All large host arrays live on the heap (std::vector). The original used
 * variable-length arrays on the stack, which is non-standard C++ and overflows
 * the stack for any realistically sized image.
 *
 * @param argc unused
 * @param argv unused (the input image path is hard-coded)
 * @return 0 on success, 1 on any failure (a message is written to std::cerr).
 *         OpenCL objects are released on the success path only; on failure the
 *         process exits and cleanup is left to the runtime.
 */
int main(int argc, char **argv){
    (void)argc;
    (void)argv;

    // Single source of truth for the input path, so that the error message
    // below reports the file that was actually opened.
    const char *input_image_path = "./pic/Flowers.JPG";

    //OpenCL program
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_platform_id platform_id = NULL;
    cl_uint ret_num_devices = 0;
    cl_uint ret_num_platforms = 0;
    cl_int ret = CL_SUCCESS;
    cl_int errNum = CL_SUCCESS;

    //Image
    cl_mem imageObjects[2] = {0,0};
    cl_sampler sampler = NULL;

    //Get Platform and Device Info
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    if (ret != CL_SUCCESS || ret_num_platforms == 0)
    {
        std::cerr << "Failed to find an OpenCL platform." << std::endl;
        return 1;
    }
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
    if (ret != CL_SUCCESS || ret_num_devices == 0)
    {
        std::cerr << "Failed to find an OpenCL device." << std::endl;
        return 1;
    }

    //Create OpenCL Context
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if (ret != CL_SUCCESS || context == NULL)
    {
        std::cerr << "Failed to create OpenCL context." << std::endl;
        return 1;
    }

    //Create Command Queue
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    if (ret != CL_SUCCESS || command_queue == NULL)
    {
        std::cerr << "Failed to create command queue." << std::endl;
        return 1;
    }

    //Create Program
    program = CreateProgram(context, device_id, kernel_path);
    if (program == NULL) {
        return 1;
    }

    // Make sure the device supports images, otherwise exit
    cl_bool imageSupport = CL_FALSE;
    clGetDeviceInfo(device_id, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),
                    &imageSupport, NULL);
    if (imageSupport != CL_TRUE)
    {
        std::cerr << "OpenCL device does not support images." << std::endl;
        return 1;
    }

    // Load input image from file and load it into an OpenCL image object
    int width = 0;
    int height = 0;
    imageObjects[0] = LoadImage(context, const_cast<char *>(input_image_path), width, height);
    if (imageObjects[0] == 0)
    {
        // Report the path that was really used; argv[1] is null when the
        // program is started without arguments.
        std::cerr << "Error loading: " << input_image_path << std::endl;
        return 1;
    }
    if (width <= 0 || height <= 0)
    {
        std::cerr << "Error loading: " << input_image_path << std::endl;
        return 1;
    }

    // Number of pixels, computed in size_t so the product cannot overflow int.
    const size_t pixel_count = static_cast<size_t>(width) * static_cast<size_t>(height);

    // Create ouput image object
    cl_image_format clImageFormat;
    clImageFormat.image_channel_order = CL_RGBA;
    clImageFormat.image_channel_data_type = CL_UNORM_INT8;
    imageObjects[1] = clCreateImage2D(context,
                                      CL_MEM_WRITE_ONLY,
                                      &clImageFormat,
                                      width,
                                      height,
                                      0,
                                      NULL,
                                      &errNum);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating CL output image object." << std::endl;
        return 1;
    }

    // Create sampler for sampling image object
    sampler = clCreateSampler(context,
                              CL_FALSE, // Non-normalized coordinates
                              CL_ADDRESS_CLAMP_TO_EDGE,
                              CL_FILTER_NEAREST,
                              &errNum);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating CL sampler object." << std::endl;
        return 1;
    }

    // ------------------------------------------------------------------
    //Kernel1: calculate the t's value
    //t is the mainly matrix in this algorithm
    // ------------------------------------------------------------------
    kernel = clCreateKernel(program, "get_t_mat", NULL);
    if (kernel == NULL) {
        std::cerr << "Failed to create kernel" << std::endl;
        return 1;
    }

    std::vector<float> t_mat(pixel_count, 0.0f);
    cl_mem t_buffer = clCreateBuffer(context,
                                     CL_MEM_READ_WRITE,
                                     sizeof(float) * pixel_count,
                                     NULL, NULL);
    if (t_buffer == NULL) {
        std::cerr << "Error creating buffer" << std::endl;
        return 1;
    }

    // Set the kernel arguments
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imageObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &t_buffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width);
    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        return 1;
    }

    size_t localWorkSize[2] = { 16, 16 };
    size_t globalWorkSize[2] = { RoundUp(localWorkSize[0], width),
                                 RoundUp(localWorkSize[1], height) };

    // Queue the kernel up for execution
    errNum = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error queuing kernel for execution." << std::endl;
        return 1;
    }

    // Blocking read: returns only after the kernel above has finished.
    errNum = clEnqueueReadBuffer(command_queue,
                                 t_buffer,
                                 CL_TRUE, 0,
                                 pixel_count * sizeof(float),
                                 &t_mat[0],
                                 0, NULL, NULL);
    if (errNum != CL_SUCCESS) {
        std::cerr << "Error write back buffer" << std::endl;
        return 1;
    }
    // The handle is about to be reused for the next stage; drop this kernel.
    clReleaseKernel(kernel);
    kernel = NULL;

    // ------------------------------------------------------------------
    //Kernel2: calculate the win_b
    // ------------------------------------------------------------------
    kernel = clCreateKernel(program, "get_win_b", NULL);
    if (kernel == NULL) {
        std::cerr << "Failed to create kernel" << std::endl;
        return 1;
    }

    std::vector<float> win_b(pixel_count, 0.0f);
    cl_mem win_b_buffer = clCreateBuffer(context,
                                         CL_MEM_READ_WRITE,
                                         sizeof(float) * pixel_count,
                                         NULL, NULL);
    if (win_b_buffer == NULL) {
        std::cerr << "Error creating buffer" << std::endl;
        return 1;
    }

    // Set the kernel arguments
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &t_buffer);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &win_b_buffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_int), &width);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &height);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        return 1;
    }

    // Queue the kernel up for execution
    errNum = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error queuing kernel for execution." << std::endl;
        return 1;
    }

    errNum = clEnqueueReadBuffer(command_queue,
                                 win_b_buffer,
                                 CL_TRUE, 0,
                                 pixel_count * sizeof(float),
                                 &win_b[0],
                                 0, NULL, NULL);
    if (errNum != CL_SUCCESS) {
        std::cerr << "Error write back buffer" << std::endl;
        return 1;
    }
    clReleaseKernel(kernel);
    kernel = NULL;

    std::cout << 1 << std::endl;

    // ------------------------------------------------------------------
    //Kernel 3: vals
    // ------------------------------------------------------------------
    const int neb_size = 9;
    kernel = clCreateKernel(program, "get_vals", NULL);
    if (kernel == NULL) {
        std::cerr << "Failed to create kernel" << std::endl;
        return 1;
    }

    // Widen before multiplying: width * height * 81 overflows a 32-bit int
    // for images of only a few megapixels.
    const long long tlen = static_cast<long long>(width) * height * neb_size * neb_size;
    const size_t entry_count = static_cast<size_t>(tlen);

    // Value-initialised (zeroed) host arrays for the sparse-matrix triplets.
    std::vector<double> vals(entry_count);
    std::vector<int> row_inds(entry_count);
    std::vector<int> col_inds(entry_count);

    // 1-based linear index of every pixel.
    std::vector<int> indsM(pixel_count);
    for (size_t i = 0; i < pixel_count; ++i)
        indsM[i] = static_cast<int>(i) + 1;

    cl_mem vals_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                        sizeof(float) * entry_count, NULL, NULL);
    cl_mem row_inds_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                            sizeof(int) * entry_count, NULL, NULL);
    cl_mem col_inds_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                            sizeof(int) * entry_count, NULL, NULL);
    cl_mem indsM_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                         sizeof(int) * pixel_count, NULL, NULL);
    if (vals_buffer == NULL || row_inds_buffer == NULL
        || col_inds_buffer == NULL || indsM_buffer == NULL) {
        std::cerr << "Error creating buffer" << std::endl;
        return 1;
    }

    // The size argument is in BYTES, not elements. The write is blocking so
    // that indsM cannot be destroyed (e.g. by an early return) while the
    // transfer is still in flight.
    errNum = clEnqueueWriteBuffer(command_queue, indsM_buffer, CL_TRUE, 0,
                                  sizeof(int) * pixel_count, &indsM[0],
                                  0, NULL, NULL);
    if (errNum != CL_SUCCESS) {
        std::cerr << "Error writing buffer" << std::endl;
        return 1;
    }

    // Set the kernel arguments
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imageObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &indsM_buffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &vals_buffer);
    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &row_inds_buffer);
    errNum |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &col_inds_buffer);
    errNum |= clSetKernelArg(kernel, 6, sizeof(cl_int), &width);
    errNum |= clSetKernelArg(kernel, 7, sizeof(cl_int), &height);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        return 1;
    }

    // Queue the kernel up for execution
    // NOTE(review): the global size is still rounded up to a multiple of 16
    // although the work-group size here is 1x1, so work-items beyond
    // (width, height) are launched. Presumably the kernel bounds-checks its
    // coordinates; if it does not, it writes past the buffers — confirm
    // against dehaze.cl.
    size_t t_localWorkSize[2] = { 1, 1 };
    size_t t_globalWorkSize[2] = { RoundUp(localWorkSize[0], width),
                                   RoundUp(localWorkSize[1], height) };
    errNum = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
                                    t_globalWorkSize, t_localWorkSize,
                                    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error queuing kernel for execution." << std::endl;
        return 1;
    }

    {
        // The device buffer is allocated as float, while Coord_Mat_double
        // needs doubles. Read into a float staging array and widen each
        // element; reading the raw bytes straight into the double array would
        // reinterpret pairs of floats as one double.
        std::vector<float> vals_device(entry_count);
        errNum = clEnqueueReadBuffer(command_queue, vals_buffer, CL_TRUE, 0,
                                     entry_count * sizeof(float), &vals_device[0],
                                     0, NULL, NULL);
        if (errNum != CL_SUCCESS) {
            std::cerr << "Error write back buffer" << std::endl;
            return 1;
        }
        for (size_t i = 0; i < entry_count; ++i)
            vals[i] = static_cast<double>(vals_device[i]);
    }   // staging array is freed here, before the sparse matrix is built

    errNum = clEnqueueReadBuffer(command_queue, row_inds_buffer, CL_TRUE, 0,
                                 entry_count * sizeof(int), &row_inds[0],
                                 0, NULL, NULL);
    errNum |= clEnqueueReadBuffer(command_queue, col_inds_buffer, CL_TRUE, 0,
                                  entry_count * sizeof(int), &col_inds[0],
                                  0, NULL, NULL);
    if (errNum != CL_SUCCESS) {
        std::cerr << "Error write back buffer" << std::endl;
        return 1;
    }

    Coord_Mat_double SparseMat(width, height, tlen, &vals[0], &row_inds[0], &col_inds[0]);
    std::cout << SparseMat.dim(0) << std::endl;
    std::cout << width << std::endl;

    // Read the output buffer back to the Host
    /*
    char *buffer = new char [width * height * 4];
    size_t origin[3] = { 0, 0, 0 };
    size_t region[3] = { width, height, 1};
    errNum = clEnqueueReadImage(command_queue, imageObjects[1], CL_TRUE,
    origin, region, 0, 0, buffer,
    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
    std::cerr << "Error reading result buffer." << std::endl;
    return 1;
    }
    */
    std::cout << "Executed program succesfully." << std::endl;
    // Save the image out to disk
    /*
    if (!SaveImage((char *) "out2.png", buffer, width, height))
    {
    std::cerr << "Error writing output image" << std::endl;
    delete [] buffer;
    return 1;
    }
    delete [] buffer;
    */

    // Release every OpenCL object created above (success path).
    clReleaseMemObject(indsM_buffer);
    clReleaseMemObject(col_inds_buffer);
    clReleaseMemObject(row_inds_buffer);
    clReleaseMemObject(vals_buffer);
    clReleaseMemObject(win_b_buffer);
    clReleaseMemObject(t_buffer);
    clReleaseKernel(kernel);
    clReleaseSampler(sampler);
    clReleaseMemObject(imageObjects[1]);
    clReleaseMemObject(imageObjects[0]);
    clReleaseProgram(program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);

    return 0;
}
THX
you can use gdb.
compile all your source code with the -g flag.
from terminal run:
gdb <your program>
then in the gdb shell:
r <arguments>
Now wait for the SIGSEGV. When it occurs, type: where (or: bt)
It will show you the exact place in your code where the program was when it crashed.