How to print results from kernel in OpenCL? - c++

I am new to OpenCL. I am trying to use the OpenCL C++ kernel language extension (http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/CPP_kernel_language.pdf), and I am trying to print results using the code on page 10 of that document. Please find the code from the documentation below, and correct me if I am wrong anywhere.
// Host-side value holder with a single int member. The host code wraps an
// instance of this class in an OpenCL buffer of sizeof(Test) bytes.
class Test{
public:
    // Stores `value` in the object.
    void setX(int value){ x = value; }
    // Returns the stored value. Declared const so it can also be called
    // through a const reference or pointer.
    int getX() const { return x; }
private:
    int x;
};
// Host-side excerpt: wraps tempClass in an OpenCL buffer, sets x to 10 on the
// host, runs the kernel, then prints tempClass.getX().
// context, command_queue, kernel, tempClass, ret and the two work sizes are
// not declared in this excerpt.
int main() {
// The buffer is created over the host object itself (CL_MEM_USE_HOST_PTR, &tempClass).
cl_mem classObj = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(Test), &tempClass, &ret);
// Blocking map for writing before the host modifies the object.
void* dm_idata = clEnqueueMapBuffer(command_queue, classObj, CL_TRUE, CL_MAP_WRITE, 0 , sizeof(Test), 0, NULL, NULL, &ret);
tempClass.setX(10); // host-side value; this is the value the question reports being printed
clEnqueueUnmapMemObject(command_queue, classObj, dm_idata, 0, NULL, NULL);// intended to hand the object over to the device
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL);
// NOTE(review): this second map also requests CL_MAP_WRITE although the host
// only reads the object afterwards; CL_MAP_READ looks like the intended flag.
// The returned pointer is discarded and the region is never unmapped here.
clEnqueueMapBuffer(command_queue, classObj, CL_TRUE, CL_MAP_WRITE, 0, sizeof(Test), 0, NULL, NULL, &ret);// intended to bring the object back to the host
printf("\n temp value: %d\n", tempClass.getX());
}
Here is the kernel code.
// Device-side counterpart of the host Test class: one int member, written
// through setX. The kernel calls setX on a __global Test*, so the member
// function has to be public, have a return type and have a definition.
class Test {
public:
    // Overwrites the stored value with `value`.
    void setX(int value) { x = value; }
private:
    int x;
};
// Kernel entry point: only the work-item whose global id is 0 calls
// setX(6) on the object; every other work-item returns immediately.
__kernel void foo(__global Test* Inclass){
    if (get_global_id(0) != 0) {
        return;
    }
    Inclass->setX(6);
}
It prints the value from host code. I need to get the result from kernel. Any help is highly appreciated.
The result I got is
temp value = 10

Your second call to clEnqueueMapBuffer should be passing CL_MAP_READ, not CL_MAP_WRITE, since you want to read the data.

Related

C++ OpenCL Build Error: kernelSource undeclared

I'm trying to run an OpenCL sample from the internet. It looks like this:
VecAdd.c
#define PROGRAM_FILE "vecAdd.cl"
#define KERNEL_FUNC "vecAdd"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#ifdef MAC
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
// Host program for the vecAdd sample: fills two vectors with sin^2(i) and
// cos^2(i), adds them on an OpenCL GPU device and prints the mean of the
// result. Returns 0 unconditionally; OpenCL error codes are stored in `err`
// but never inspected.
int main( int argc, char* argv[] )
{
// Length of vectors
unsigned int n = 100000;
// Host input vectors
double *h_a;
double *h_b;
// Host output vector
double *h_c;
// Device input buffers
cl_mem d_a;
cl_mem d_b;
// Device output buffer
cl_mem d_c;
cl_platform_id cpPlatform; // OpenCL platform
cl_device_id device_id; // device ID
cl_context context; // context
cl_command_queue queue; // command queue
cl_program program; // program
cl_kernel kernel; // kernel
// Size, in bytes, of each vector
size_t bytes = n*sizeof(double);
// Allocate memory for each vector on host (results of malloc are not checked)
h_a = (double*)malloc(bytes);
h_b = (double*)malloc(bytes);
h_c = (double*)malloc(bytes);
// Initialize vectors on host
int i;
for( i = 0; i < n; i++ )
{
// sinf/cosf are the single-precision variants; their products are stored into double arrays
h_a[i] = sinf(i)*sinf(i);
h_b[i] = cosf(i)*cosf(i);
}
size_t globalSize, localSize;
cl_int err;
// Number of work items in each local work group
localSize = 64;
// Number of total work items - localSize must be a divisor, so n is rounded
// up to the next multiple of localSize
globalSize = ceil(n/(float)localSize)*localSize;
// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
// Create a command queue
queue = clCreateCommandQueue(context, device_id, 0, &err);
// Create the compute program from the source buffer
// NOTE(review): kernelSource is not declared anywhere in this listing and
// PROGRAM_FILE is never read, so the code that loads the kernel text into
// kernelSource appears to be missing.
program = clCreateProgramWithSource(context, 1,
(const char **) & kernelSource, NULL, &err);
// Build the program executable (return value is ignored)
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, "vecAdd", &err);
// Create the input and output arrays in device memory for our calculation
// (NULL is passed for the error-code pointer, so failures are not reported)
d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL);
d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL);
// Write our data set into the input array in device memory (blocking writes)
err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,
bytes, h_a, 0, NULL, NULL);
err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,
bytes, h_b, 0, NULL, NULL);
// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b);
err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n);
// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
0, NULL, NULL);
// Wait for the command queue to get serviced before reading back results
clFinish(queue);
// Read the results from the device (blocking read into h_c)
clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0,
bytes, h_c, 0, NULL, NULL );
// Sum up vector c and print result divided by n; since sin^2 + cos^2 = 1
// this should equal 1 within rounding error
double sum = 0;
for(i=0; i<n; i++)
sum += h_c[i];
printf("final result: %f\n", sum/n);
// release OpenCL resources
clReleaseMemObject(d_a);
clReleaseMemObject(d_b);
clReleaseMemObject(d_c);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
//release host memory
free(h_a);
free(h_b);
free(h_c);
return 0;
}
VecAdd.cl
// OpenCL kernel: each work-item produces one element of c = a + b.
__kernel void vecAdd(__global double *a,
                     __global double *b,
                     __global double *c,
                     const unsigned int n)
{
    // Global index of this work-item.
    int gid = get_global_id(0);

    // Work-items whose index is not below n have nothing to do.
    if (gid >= n)
        return;

    c[gid] = a[gid] + b[gid];
}
When I try to run VecAdd.c with CodeBlocks I get an error on this line:
program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err);
The error looks like this:
vecAdd.c|79|error: 'kernelSource' undeclared (first use in this function)
I expected no error since the print_info.cpp sample worked fine and printed:
OpenCL Device Info:
Name: Intel(R) UHD Graphics 620
Vendor: Intel(R) Corporation
Version: OpenCL 3.0 NEO
Max size of work-items: (256,256,256)
Max size of work-groups: 256
Number of compute units: 24
Global memory size (bytes): 6762340352
Local memory size per compute unit (bytes): 2730
The sample code is incomplete. It's missing the part where it reads the VecAdd.cl file to the string kernelSource. You may add:
#include <iostream> // write to console
#include <fstream> // read/write files
#include <iterator> // std::istreambuf_iterator
#include <string> // std::string
// ...
int main( int argc, char* argv[] )
{
    // ...
    // Load the complete text of the kernel file into kernelSource.
    std::string kernelSource = "";
    {
        std::ifstream file("./VecAdd.cl", std::ios::in); // path might be different for you
        if(file.fail()) {
            // Without the kernel text there is nothing to build, so stop here.
            std::cout << "Error: File does not exist!\n";
            return 1;
        }
        kernelSource = std::string((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
        file.close();
    }
    // clCreateProgramWithSource expects an array of C-string pointers. Pass a
    // pointer to the string's character data; casting the address of the
    // std::string object itself to const char** is not valid.
    const char* kernelSourcePtr = kernelSource.c_str();
    // Create the compute program from the source buffer
    program = clCreateProgramWithSource(context, 1, &kernelSourcePtr, NULL, &err);
    // ...
}
For a much easier start with OpenCL, have a look at this OpenCL-Wrapper. This simplifies using the API a lot, without giving up any functionality or performance. By default it comes with a vector addition example. Notice how much shorter and less complicated the code is compared to the regular OpenCL bloat.

OpenCL.clSetKernelArg returns -51

I tried to write a parallel BFS in OpenCL, but I don't have much experience with C++.
So this is probably a memory error, but I really don't know how to fix it.
I also can't find what the error value -51 means.
As a result I got "Unhandled exception at 0x00007FFCFB06A549 (amdocl64.dll) in my project.exe: 0xC0000005: Access violation reading location 0xFFFFFFFFFFFFFFFF" in next line.
main
// Host-side excerpt of the BFS program: builds the graph, creates the OpenCL
// context/queue/buffers, uploads the graph and distance arrays, builds bfs.cl
// and sets the kernel arguments. `status` is overwritten by each call and is
// never checked in this excerpt.
Graph G(AdjacencyList, Directed);
int startVertex;
vector<int> distance;
vector<bool> visited;
distance = vector<int>(G.numVertices);
visited = vector<bool>(G.numVertices);
bool done = false;
const bool true_value = true;
int level = 0;
// Allocation on device: byte sizes are computed from the host `int` type
const int size = G.numVertices * sizeof(int);
const int adjacencySize = G.adjacencyList.size() * sizeof(int);
//OpenCL
cl_int status;
cl_int ret;
cl_platform_id platform_id;
clGetPlatformIDs(1, &platform_id, NULL);
cl_device_id device_id;
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &status);
cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, NULL, &status);
cl_mem d_adjacencyList = clCreateBuffer(context, CL_MEM_READ_WRITE, adjacencySize, NULL, &status);
cl_mem d_edgesOffset = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
cl_mem d_edgesSize = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
cl_mem d_distance = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
// The `done` flag buffer is sized with the host sizeof(bool).
cl_mem d_done = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(bool), NULL, &status);
// Blocking uploads of the graph arrays.
status = clEnqueueWriteBuffer(command_queue, d_adjacencyList, CL_TRUE, 0, adjacencySize, &G.adjacencyList[0], 0, NULL, NULL);
status = clEnqueueWriteBuffer(command_queue, d_edgesOffset, CL_TRUE, 0, size, &G.edgesOffset[0], 0, NULL, NULL);
status = clEnqueueWriteBuffer(command_queue, d_edgesSize, CL_TRUE, 0, size, &G.edgesSize[0], 0, NULL, NULL);
// Every vertex starts at INT_MAX ("not reached") except the start vertex.
distance = vector<int>(G.numVertices, INT_MAX);
// NOTE(review): `start` is not declared in this excerpt; only `startVertex`
// is, and it is never assigned here.
distance[start] = 0;
status = clEnqueueWriteBuffer(command_queue, d_distance, CL_TRUE, 0, size, distance.data(), 0, NULL, NULL);
// Load the kernel source text from bfs.cl.
char* source_str = NULL;
size_t source_size;
FILE* fp;
fp = fopen("bfs.cl", "r");
if (!fp)
{
cout << "Failed to load Kernel\n";
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
// The number of bytes read is passed on explicitly as the source length
// (fp is not closed in this excerpt).
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &status);
status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "bfs", &status);
// NOTE(review): indices 2 and 3 both receive d_edgesOffset, and eight
// arguments (0-7) are set although the bfs kernel listed with this code
// declares seven parameters. With that signature index 5 is `int level`, yet
// sizeof(cl_mem) is passed for it, which would be consistent with the
// reported -51 (CL_INVALID_ARG_SIZE) - TODO confirm.
status = clSetKernelArg(kernel, 0, sizeof(int), (void*)&G.numVertices);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&d_adjacencyList);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&d_edgesOffset);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&d_edgesOffset);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&d_edgesSize);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&d_distance); // here returns -51
// `level` is a host int but is passed with sizeof(cl_mem).
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&level);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&d_done);
kernel
// One BFS step: a work-item whose vertex sits at distance `level` assigns
// level + 1 to each of its neighbours whose distance is still INT_MAX, and
// clears *done whenever it does so.
__kernel void bfs(int n, __global int *adjacencyList,__global int *edgesOffset,__global int *edgesSize,__global int *distance, int level,__global bool *done) {
    int tid = get_global_id(0);

    // Work-items beyond the vertex count do nothing.
    if (tid >= n) {
        return;
    }
    // Only vertices at the current level expand their edges.
    if (distance[tid] != level) {
        return;
    }

    for (int e = edgesOffset[tid]; e < edgesOffset[tid] + edgesSize[tid]; ++e) {
        int neighbour = adjacencyList[e];
        if (distance[neighbour] != INT_MAX) {
            continue;
        }
        *done = false;
        distance[neighbour] = level + 1;
    }
}
Hi #Parrison welcome to StackOverflow!
All the OpenCL error codes are defined in cl.h. In the latest (version 3) cl.h you will find the error codes defined between lines 194 and 270, where on line 241 you will find:
#define CL_INVALID_ARG_SIZE -51
So the OpenCL ICD reckons that you have passed the wrong variable size for distance.
However, I can see many other errors before this one. For example, you need to set the size of the OpenCL buffers based on the sizes of OpenCL variable not native variables, e.g.:
cl_int instead of int
cl_float instead of float
and especially cl_bool instead of bool.
There is no guarantee that an OpenCL cl_int is the same size as a host int, and an OpenCL cl_bool is defined as an unsigned int, which is highly unlikely to be the same size as a bool!
Ensure that all the parameters to your OpenCL kernel are defined correctly and that
you are creating the correct buffers and variables for them in the main program.

OpenCL directories declared invalid after using them once

I have a strange problem with my OpenCL project. When I add the directories to the compiler (the compiler is Dev-C++), it works at first. The problem is that if I close Dev-C++ and open it up again, all the directories are declared invalid. If I delete them and add them again, they seem to work. Why is this happening? Here is my code, if it helps:
// Copyright (c) 2010 Advanced Micro Devices, Inc. All rights reserved.
//
// A minimalist OpenCL program.
#include <CL/cl.h>
#include <CL/cl.hpp>
#include <stdio.h>
#include <CL/cl_ext.h>
#define NWITEMS 512
// OpenCL C source of the `memset` kernel, kept as one string literal that is
// handed to clCreateProgramWithSource. Each work-item stores its own global
// id into dst at that index.
const char *source =
"__kernel void memset( __global uint *dst ) \n"
"{ \n"
" dst[get_global_id(0)] = get_global_id(0); \n"
"} \n";
int main(int argc, char ** argv, global_work_size)
{
// 1. Get a platform.
cl_platform_id platform;
clGetPlatformIDs( 1, &platform, NULL );
// 2. Find a gpu device.
cl_device_id device;
clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU,
1,
&device,
NULL);
// 3. Create a context and command queue on that device.
cl_context context = clCreateContext( NULL,
1,
&device,
NULL, NULL, NULL);
cl_command_queue queue = clCreateCommandQueue( context,
device,
0, NULL );
// 4. Perform runtime source compilation, and obtain kernel entry point.
cl_program program = clCreateProgramWithSource( context,
1,
&source,
NULL, NULL );
clBuildProgram( program, 1, &device, NULL, NULL, NULL );
cl_kernel kernel = clCreateKernel( program, "memset", NULL );
// 5. Create a data buffer.
cl_mem buffer = clCreateBuffer( context,
CL_MEM_WRITE_ONLY, ...);
// 6. Launch the kernel. Let OpenCL pick the local work size.
size_t KERNEL_WORK_GROUP_SIZE = NWITEMS;
clSetKernelArg(kernel, 0, sizeof(buffer), (void*) &buffer);
clEnqueueNDRangeKernel( queue,
kernel,
1,
NULL,
&global_work_size,
NULL, 0, NULL, NULL);
clFinish( queue );
// 7. Look at the results via synchronous buffer map.
cl_uint *ptr;
ptr = (cl_uint *) clEnqueueMapBuffer( queue,
buffer,
CL_TRUE,
CL_MAP_READ,
0,
NWITEMS * sizeof(cl_uint),
0, NULL, NULL, NULL );
int i;
for(i=0; i < NWITEMS; i++)
printf("%d %d\n", i, ptr[i]);
return(0);
}

Not all work-items being used opencl

So I'm able to compile and execute my kernel; the problem is that only two work-items are being used. I'm basically trying to fill up a float array[8] with {0,1,2,3,4,5,6,7}. So this is a very simple hello world application. Below is my kernel.
// Highly simplified to demonstrate: each work-item stores its own global
// index into the output array at that index.
__kernel void rnd_float32_matrix(__global float * res)
{
    uint gid = get_global_id(0);
    res[gid] = gid;
}
I then create and execute the kernel with the following code...
// Some more code
// Host-side excerpt: builds the program, launches rnd_float32_matrix, reads
// the buffer back into src_h and prints eight values. context, sources,
// devices, src_d, src_h and err are declared outside this excerpt.
cl::Program program(context, sources, &err);
program.build(devices, NULL, NULL, NULL);
cl::Kernel kernel(program, "rnd_float32_matrix", &err);
kernel.setArg(0, src_d);
cl::CommandQueue queue(context, devices[0], 0, &err);
cl::Event event;
err = queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange(8),
// I've tried cl::NDRange(8) as well
cl::NDRange(1),
NULL,
&event
);
// Wait for the kernel launch to complete before reading back.
event.wait();
err = queue.enqueueReadBuffer(
// This is:
// cl::Buffer src_d(
// context,
// CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
// mem_size,
// src_h,
// &err);
// NOTE(review): the buffer is described as CL_MEM_READ_ONLY although the
// kernel writes to it - TODO confirm the intended flag.
src_d,
CL_TRUE,
0,
// NOTE(review): literal 8 is passed as the size of the read while the loop
// below prints eight floats; verify which unit this argument is measured in.
8,
// This is float * src_h = new float[8];
src_h);
for(int i = 0; i < 8; i ++) {
std::cout << src_h[i] << std::endl;
}
I may not show it in the code, but I also select a GPU device, and using context.getInfo(..) it shows I'm using my NVidia GTX 770M card, which reports 1024, 1024, 64 work-items available in dimensions 0, 1 and 2. When this array prints I keep getting... 0, 1, 0, 0, 0, 0, 0, 0. I've also tried setting res[idx] = 5, and I get... 5, 5, 0, 0, 0, 0, 0, 0. So it seems that only two work-items are actually being used. What am I doing wrong?
Your command to read the data back from the device is only reading 8 bytes, which is two floats:
// Read-back call exactly as posted in the question, kept unchanged to point
// at the problematic size argument.
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8, // <- This is the number of bytes, not the number of elements!
// This is float * src_h = new float[8];
src_h);
To read 8 floats, you would need to do this:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8 * sizeof(cl_float),
// This is float * src_h = new float[8];
src_h);

CL_MEM_ALLOC_HOST_PTR slower than CL_MEM_USE_HOST_PTR

So I've been playing around with OpenCL for a bit now and testing the speeds of memory transfer between host and device.
I was using Intel OpenCL SDK and running on the Intel i5 Processor with integrated graphics.
I then discovered clEnqueueMapBuffer instead of clEnqueueWriteBuffer which turned out to be faster by almost 10 times when using pinned memory like so:
// Variant 1: buffers created over existing host arrays (CL_MEM_USE_HOST_PTR)
// and then mapped with blocking calls. context, c_q, a, b, ret and error are
// declared in the part elided by "...".
int amt = 16*1024*1024;
...
// a and b back the read-only inputs, ret backs the write-only output;
// NULL is passed for the error-code pointer.
k_a = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*amt, a, NULL);
k_b = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*amt, b, NULL);
k_c = clCreateBuffer(context,CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*amt, ret, NULL);
// Blocking maps (CL_TRUE) over the full size of each buffer.
int* map_a = (int*) clEnqueueMapBuffer(c_q, k_a, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_b = (int*) clEnqueueMapBuffer(c_q, k_b, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_c = (int*) clEnqueueMapBuffer(c_q, k_c, CL_TRUE, CL_MAP_WRITE, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
clFinish(c_q);
Where a b and ret are 128 bit aligned int arrays.
The time came out to about 22.026186 ms, compared to 198.604528 ms using clEnqueueWriteBuffer
However, when I changed my code to
// Variant 2: buffers allocated by the runtime (CL_MEM_ALLOC_HOST_PTR with a
// NULL host pointer) and then mapped with blocking calls.
k_a = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, NULL);
k_b = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, NULL);
k_c = clCreateBuffer(context,CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, NULL);
// NOTE(review): map_a and map_b are mapped with CL_MAP_READ, yet the
// initialisation loop shown later writes through them; CL_MAP_WRITE looks
// like the flag needed for that - TODO confirm.
int* map_a = (int*)clEnqueueMapBuffer(c_q, k_a, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_b = (int*)clEnqueueMapBuffer(c_q, k_b, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_c = (int*)clEnqueueMapBuffer(c_q, k_c, CL_TRUE, CL_MAP_WRITE, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
/** initiate map_a and map_b **/
the time increases to 91.350065 ms
What could be the problem? Or is it a problem at all?
EDIT:
This is how I initialize the arrays in the second code:
for (int i = 0; i < amt; i++)
{
map_a[i] = i;
map_b[i] = i;
}
And now that I check, map_a and map_b do contain the right elements at the end of the program, but map_c contains all 0's. I did this:
// Enqueue an unmap for each of the three mapped regions; no events are
// requested and the return values are not checked.
clEnqueueUnmapMemObject(c_q, k_a, map_a, 0, NULL, NULL);
clEnqueueUnmapMemObject(c_q, k_b, map_b, 0, NULL, NULL);
clEnqueueUnmapMemObject(c_q, k_c, map_c, 0, NULL, NULL);
and my kernel is just
// Element-wise sum: each work-item writes c = a + b at its own global index.
__kernel void test(__global int* a, __global int* b, __global int* c)
{
    int gid = get_global_id(0);
    c[gid] = a[gid] + b[gid];
}
My understanding is that CL_MEM_ALLOC_HOST_PTR allocates but doesn't copy. Does the 2nd block of code actually get any data onto the device?
Also, clCreateBuffer when used with CL_MEM_USE_HOST_PTR and CL_MEM_COPY_HOST_PTR shouldn't require clEnqueueWrite, as the buffer is created with the memory pointed to by void *host_ptr.
Using "pinned" memory in OpenCL should be a process like:
int amt = 16*1024*1024;
int Array[] = new int[amt];
int Error = 0;
//Note, since we are using NULL for the data pointer, we HAVE to use CL_MEM_ALLOC_HOST_PTR
//This allocates memory on the devices
cl_mem B1 = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, &Error);
//Map the Device memory to host memory, aka pinning it
int *host_ptr = clEnqueueMapBuffer(queue, B1, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(int)*amt, 0, NULL, NULL, &Error);
//Copy from host memory to pinned host memory which copies to the card automatically`
memcpy(host_ptr, Array, sizeof(int)*amt);
//Call your kernel and everything else and memcpy back the pinned back to host when
//you are done
Edit: One final thing you can do to speed up the program is to not make the memory read/write blocking by using CL_FALSE instead of CL_TRUE. Just make sure to call clFinish() before data gets copied back to the host so that the command queue is emptied and all commands are processed.
Source: OpenCL In Action
With the right combination of flags, you should be able to achieve "zero copy" (i.e. very fast) map/unmap on Intel Integrated Graphics since there is no need for a "CPU to GPU" copy since they both use the same memory (that's what the "Integrated" means). Read the Intel OpenCL Optimization Guide section on memory.