Too many events in OpenCl host program on intel fpga - c++

When I run my OpenCL program on x86, nothing unusual happens. But when I run the same OpenCL code on an Intel FPGA host, the runtime prints this warning: ([Runtime Warning]: Too many 'event' objects in the host. This causes deterioration in runtime performance.)
I searched for answers on Google. The Intel manual says that I need to release the cl_event objects in my program, but my OpenCL program does not create any cl_event objects, so I cannot call the clReleaseEvent() API to release any. What should I do? Thanks for your help.
This is my code snippet:
The OpenCl init and release function:
/**
 * Sets up the OpenCL objects used by the host program.
 *
 * Looks up the "Intel(R) FPGA SDK for OpenCL(TM)" platform, takes its first
 * device, then creates the context, a command queue with profiling enabled,
 * the program built from the "No1k" board binary, and the kernels "kr", "kk"
 * and "thirdKernel". The results are stored in platform, device, context,
 * queue, program, kr, kk and kernel, which are declared outside this function.
 *
 * @return true when every object was created; false when setCwdToExeDir()
 *         fails, the platform is not found, or no device is reported.
 *         OpenCL creation/build status codes are handed to checkError().
 */
bool init()
{
    cl_int status;

    if(!setCwdToExeDir()) {
        return false;
    }

    // Get the OpenCL platform.
    platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
    if(platform == NULL) {
        printf("ERROR: Unable to find Intel(R) FPGA OpenCL platform.\n");
        return false;
    }

    // User-visible output - Platform information
    {
        // Zero-filled so that a failed query prints an empty string instead of
        // whatever happened to be on the stack.
        char char_buffer[STRING_BUFFER_LEN] = {0};
        printf("Querying platform for info:\n");
        printf("★★★★★★★★★★★★★★★★★★★★★★★\n");
        clGetPlatformInfo(platform, CL_PLATFORM_NAME, STRING_BUFFER_LEN, char_buffer, NULL);
        printf("%-40s ★%s★\n", "CL_PLATFORM_NAME", char_buffer);
        clGetPlatformInfo(platform, CL_PLATFORM_VERSION, STRING_BUFFER_LEN, char_buffer, NULL);
        printf("%-40s ★%s★\n\n", "CL_PLATFORM_VERSION ", char_buffer);
    }

    // Query the available OpenCL devices.
    scoped_array<cl_device_id> devices;
    cl_uint num_devices = 0;
    devices.reset(getDevices(platform, CL_DEVICE_TYPE_ALL, &num_devices));
    // Reading devices[0] below is only valid when at least one device exists.
    if(num_devices == 0) {
        printf("ERROR: No OpenCL devices found on the platform.\n");
        return false;
    }

    // We'll just use the first device.
    device = devices[0];

    // Create the context; oclContextCallback receives runtime notifications.
    context = clCreateContext(NULL, 1, &device, &oclContextCallback, NULL, &status);
    checkError(status, "Failed to create context");

    // Create the command queue.
    queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &status);
    checkError(status, "Failed to create command queue");

    // Create the program.
    std::string binary_file = getBoardBinaryFile("No1k", device);
    printf("Using AOCX: %s\n", binary_file.c_str());
    program = createProgramFromBinary(context, binary_file.c_str(), &device, 1);

    // Build the program that was just created.
    status = clBuildProgram(program, 0, NULL, "", NULL, NULL);
    checkError(status, "Failed to build program");

    // Create the kernels - each name passed in here must match a kernel name
    // in the original CL file that was compiled into the AOCX file.
    const char *kernel_kr = "kr";
    kr = clCreateKernel(program, kernel_kr, &status);
    checkError(status, "Failed to create kernel kr");

    const char *kernel_kk = "kk";
    kk = clCreateKernel(program, kernel_kk, &status);
    checkError(status, "Failed to create kernel kk");

    const char *kernel_name = "thirdKernel";
    kernel = clCreateKernel(program, kernel_name, &status);
    checkError(status, "Failed to create kernel");

    printf("★ OpenCL init successfully!\n");
    return true;
}
/**
 * Releases the OpenCL objects held in kr, kk, kernel, program, queue and
 * context, skipping any handle that is null, then prints a confirmation line.
 */
void cleanup()
{
    // Kernels go first, followed by the program, the queue and the context.
    if(kr != NULL) clReleaseKernel(kr);
    if(kk != NULL) clReleaseKernel(kk);
    if(kernel != NULL) clReleaseKernel(kernel);

    if(program != NULL) clReleaseProgram(program);
    if(queue != NULL) clReleaseCommandQueue(queue);
    if(context != NULL) clReleaseContext(context);

    printf("★ OpenCL clean successfully!\n");
}
The main function:
// Host-side driver: copies mat into temp, sets result to the identity matrix,
// uploads both, enqueues kernels kr and kk once per value of i, enqueues the
// final kernel, then reads mem_res back into result.
// NOTE(review): the malloc result is not checked before it is written to.
float *temp = (float *) malloc(sizeof(float) * dim * dim);
for (unsigned int m = 0; m < dim; m++) {
for (unsigned int n = 0; n < dim; n++) {
temp[m * dim + n] = mat[m * dim + n];
if (m == n) {
result[m * dim + n] = 1.0f;
} else {
result[m * dim + n] = 0.0f;
}
}
}
cl_int err = 0;
// NOTE(review): all three buffers are created CL_MEM_READ_ONLY, yet mem_res is
// read back as the result below. If the kernels write to any of these buffers
// the flag should be CL_MEM_READ_WRITE - confirm against the kernel source.
cl_mem mem_temp = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, sizeof(float) * dim *dim,
temp,
&err);
checkError(err, "Failed to CreateBuffer mem_temp");
cl_mem mem_ratio = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * dim,
NULL,
&err);
checkError(err, "Failed to CreateBuffer mem_ratio");
cl_mem mem_res = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, sizeof(float) * dim *dim,
result,
&err);
checkError(err, "Failed to CreateBuffer mem_res");
size_t work_item;
size_t work_group;
// Both branches use dim * dim as the global size; only the local size differs.
// NOTE(review): for dim > 300 the global size dim * dim is not necessarily a
// multiple of the local size 300, which OpenCL 1.x rejects - confirm.
if (dim <= 300) {
work_group = dim * dim;
work_item = dim;
} else {
work_item = 300;
work_group = dim * dim;
}
size_t global_work_size[] = {work_group};
size_t local_work_size[] = {work_item};
// NOTE(review): this loop enqueues 2 * dim kernel launches without waiting for
// any of them. No cl_event is requested here (the last argument is NULL), so
// the warning presumably refers to events the runtime tracks internally for
// queued commands; draining the queue periodically (clFinish/clFlush every few
// iterations) is a likely remedy - verify against the Intel FPGA SDK guide.
for (unsigned int i = 0; i < dim; i++) {
// Arguments 0, 1 and 3 of kr do not change between iterations; only i does.
err = clSetKernelArg(kr, 0, sizeof(cl_mem), &mem_temp);
checkError(err, "Failed to SetKernelArg kr mem_temp");
err = clSetKernelArg(kr, 1, sizeof(cl_mem), &mem_ratio);
checkError(err, "Failed to SetKernelArg kr mem_ratio");
err = clSetKernelArg(kr, 2, sizeof(cl_uint), (void *) &i);
checkError(err, "Failed to SetKernelArg kr i");
err = clSetKernelArg(kr, 3, sizeof(cl_uint), (void *) &dim);
checkError(err, "Failed to SetKernelArg kr dim");
err = clEnqueueNDRangeKernel(queue, kr, 1, NULL, &global_work_size[0], &local_work_size[0], 0, NULL, NULL);
checkError(err, "Failed to EnqueueNDRangeKernel kr");
// Arguments 0, 1, 2 and 4 of kk are likewise the same on every iteration.
err = clSetKernelArg(kk, 0, sizeof(cl_mem), &mem_res);
checkError(err, "Failed to SetKernelArg kk mem_res");
err = clSetKernelArg(kk, 1, sizeof(cl_mem), &mem_temp);
checkError(err, "Failed to SetKernelArg kk mem_temp");
err = clSetKernelArg(kk, 2, sizeof(cl_mem), &mem_ratio);
checkError(err, "Failed to SetKernelArg kk mem_ratio");
err = clSetKernelArg(kk, 3, sizeof(cl_uint), (void *) &i);
checkError(err, "Failed to SetKernelArg kk i");
err = clSetKernelArg(kk, 4, sizeof(cl_uint), (void *) &dim);
checkError(err, "Failed to SetKernelArg kk dim");
err = clEnqueueNDRangeKernel(queue, kk, 1, NULL, &global_work_size[0], &local_work_size[0], 0, NULL, NULL);
checkError(err, "Failed to EnqueueNDRangeKernel kk");
}
// Final kernel, run once after the loop.
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_res);
checkError(err, "Failed to SetKernelArg mem_res");
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_temp);
checkError(err, "Failed to SetKernelArg mem_temp");
err = clSetKernelArg(kernel, 2, sizeof(cl_uint), (void *) &dim);
checkError(err, "Failed to SetKernelArg dim");
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size[0], &local_work_size[0], 0, NULL, NULL);
checkError(err, "Failed to EnqueueNDRangeKernel");
// Wait for every queued command, then do a blocking read (CL_TRUE) of the result.
err = clFinish(queue);
checkError(err, "Failed to Finish");
err = clEnqueueReadBuffer(queue, mem_res, CL_TRUE, 0, sizeof(float) * dim * dim, result, 0, NULL, NULL);
checkError(err, "Failed to ReadBuffer mem_res");
// Release the host scratch copy and the three device buffers.
free(temp);
err = clReleaseMemObject(mem_res);
checkError(err, "Failed to ReleaseMemObject mem_res");
err = clReleaseMemObject(mem_temp);
checkError(err, "Failed to ReleaseMemObject mem_temp");
err = clReleaseMemObject(mem_ratio);
checkError(err, "Failed to ReleaseMemObject mem_ratio");
As you can see, I didn't create any cl_event in the code, but the context callback function still reports that there are too many events in the host.
This is my compilation command that I used in Makefile:
# Host build for an Intel(R) FPGA SDK for OpenCL(TM) ARM target.
# Usage: make [VERBOSE=1] [DEBUG=1]

# Recipe prefix: empty when VERBOSE=1 (commands are echoed), '@' otherwise
# (commands run silently). A bare '#' here would start a comment and leave
# ECHO empty in both branches.
ifeq ($(VERBOSE),1)
ECHO :=
else
ECHO := @
endif

# Where is the Intel(R) FPGA SDK for OpenCL(TM) software?
ifeq ($(wildcard $(INTELFPGAOCLSDKROOT)),)
$(error Set INTELFPGAOCLSDKROOT to the root directory of the Intel(R) FPGA SDK for OpenCL(TM) software installation)
endif
ifeq ($(wildcard $(INTELFPGAOCLSDKROOT)/host/include/CL/opencl.h),)
$(error Set INTELFPGAOCLSDKROOT to the root directory of the Intel(R) FPGA SDK for OpenCL(TM) software installation.)
endif

# OpenCL compile and link flags.
AOCL_COMPILE_CONFIG := $(shell aocl compile-config --arm)
#AOCL_LINK_CONFIG := $(shell aocl link-config --arm)
AOCL_LINK_CONFIG := $(wildcard $(INTELFPGAOCLSDKROOT)/host/arm32/lib/*.so) $(wildcard $(AOCL_BOARD_PACKAGE_ROOT)/arm32/lib/*.so)

# Compilation flags
ifeq ($(DEBUG),1)
CXXFLAGS += -g
else
CXXFLAGS += -O3
endif

# Compiler. ARM cross-compiler.
CXX := arm-linux-gnueabihf-g++

# Target
TARGET := host
TARGET_DIR := bin
TARGET_NAME := time_FPGA_trasform_and_kernel_exe

# Directories
INC_DIRS := ../common/inc
LIB_DIRS :=

# Files
INCS := $(wildcard )
SRCS := $(wildcard host/src/*.cpp ../common/src/AOCLUtils/*.cpp)
LIBS := rt pthread

# Make it all!
all : $(TARGET_DIR)/$(TARGET_NAME)

# Old target name, kept as an alias so "make bin/host" still works.
$(TARGET_DIR)/$(TARGET) : $(TARGET_DIR)/$(TARGET_NAME)

# Host executable target. The rule is named after the file the link step
# actually writes, so make can tell when it is up to date. $(TARGET_DIR) is an
# order-only prerequisite: it must exist, but its timestamp does not force a
# rebuild.
$(TARGET_DIR)/$(TARGET_NAME) : Makefile $(SRCS) $(INCS) | $(TARGET_DIR)
	$(CXX) $(CPPFLAGS) $(CXXFLAGS) -Wall -fPIC $(foreach D,$(INC_DIRS),-I$D) \
	$(AOCL_COMPILE_CONFIG) $(SRCS) $(AOCL_LINK_CONFIG) \
	$(foreach D,$(LIB_DIRS),-L$D) \
	$(foreach L,$(LIBS),-l$L) \
	-o $(TARGET_DIR)/$(TARGET_NAME)

$(TARGET_DIR) :
	$(ECHO)mkdir -p $(TARGET_DIR)

# Standard make targets
clean :
	$(ECHO)rm -f $(TARGET_DIR)/$(TARGET_NAME)

.PHONY : all clean $(TARGET_DIR)/$(TARGET)

Related

OpenCL.clSetKernelArg returns -51

I tried to write a parallel BFS in OpenCL, but I don't have much experience with C++.
So this is probably a memory error, but I really don't know how to fix it.
I also can't find out what the error value -51 means.
As a result, I get "Unhandled exception at 0x00007FFCFB06A549 (amdocl64.dll) in my project.exe: 0xC0000005: Access violation reading location 0xFFFFFFFFFFFFFFFF" on the next line.
main
// Host setup for the bfs kernel: builds the graph, creates the device buffers,
// uploads the graph arrays and the initial distances, builds bfs.cl and binds
// the kernel arguments.
Graph G(AdjacencyList, Directed);
int startVertex;
vector<int> distance;
vector<bool> visited;
distance = vector<int>(G.numVertices);
visited = vector<bool>(G.numVertices);
bool done = false;
const bool true_value = true;
int level = 0;
// Allocation on device
// NOTE(review): both sizes assume the graph arrays hold int elements - confirm
// against the Graph definition.
const int size = G.numVertices * sizeof(int);
const int adjacencySize = G.adjacencyList.size() * sizeof(int);
//OpenCL
// NOTE(review): status/ret are assigned by almost every call below but never
// inspected, so an earlier failure would go unnoticed.
cl_int status;
cl_int ret;
cl_platform_id platform_id;
clGetPlatformIDs(1, &platform_id, NULL);
cl_device_id device_id;
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &status);
cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, NULL, &status);
cl_mem d_adjacencyList = clCreateBuffer(context, CL_MEM_READ_WRITE, adjacencySize, NULL, &status);
cl_mem d_edgesOffset = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
cl_mem d_edgesSize = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
cl_mem d_distance = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
// NOTE(review): sized with the host bool; nothing is written to this buffer in
// the visible code.
cl_mem d_done = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(bool), NULL, &status);
// Blocking uploads (CL_TRUE) of the graph arrays.
status = clEnqueueWriteBuffer(command_queue, d_adjacencyList, CL_TRUE, 0, adjacencySize, &G.adjacencyList[0], 0, NULL, NULL);
status = clEnqueueWriteBuffer(command_queue, d_edgesOffset, CL_TRUE, 0, size, &G.edgesOffset[0], 0, NULL, NULL);
status = clEnqueueWriteBuffer(command_queue, d_edgesSize, CL_TRUE, 0, size, &G.edgesSize[0], 0, NULL, NULL);
// Every vertex starts at INT_MAX (unvisited) except the start vertex.
// NOTE(review): `start` is not declared in this fragment; only startVertex is.
distance = vector<int>(G.numVertices, INT_MAX);
distance[start] = 0;
status = clEnqueueWriteBuffer(command_queue, d_distance, CL_TRUE, 0, size, distance.data(), 0, NULL, NULL);
// Load the kernel source text from bfs.cl.
// NOTE(review): fp is never closed and source_str is never freed in the
// visible code.
char* source_str = NULL;
size_t source_size;
FILE* fp;
fp = fopen("bfs.cl", "r");
if (!fp)
{
cout << "Failed to load Kernel\n";
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &status);
status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "bfs", &status);
// NOTE(review): the bfs kernel declares seven parameters (indices 0..6), but
// eight are set here. d_edgesOffset is bound at both index 2 and index 3, so
// every later argument is shifted by one: index 5 is `int level` in the kernel
// yet receives a cl_mem-sized value, which matches the reported -51
// (CL_INVALID_ARG_SIZE). `level` is also passed with sizeof(cl_mem) rather
// than sizeof(int), and index 7 does not exist in the kernel.
status = clSetKernelArg(kernel, 0, sizeof(int), (void*)&G.numVertices);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&d_adjacencyList);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&d_edgesOffset);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&d_edgesOffset);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&d_edgesSize);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&d_distance); //here returns -51
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&level);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&d_done);
kernel
// One BFS expansion step; work item `tid` handles vertex `tid`.
// A vertex whose distance equals `level` walks its outgoing edges
// (edgesOffset[tid] .. edgesOffset[tid] + edgesSize[tid] - 1 in adjacencyList)
// and assigns level + 1 to every neighbour still at INT_MAX, clearing *done
// each time it does so.
__kernel void bfs(int n, __global int *adjacencyList,__global int *edgesOffset,__global int *edgesSize,__global int *distance, int level,__global bool *done) {
    const int tid = get_global_id(0);

    // Work items past the last vertex have nothing to do.
    if (tid >= n) {
        return;
    }
    // Only vertices on the current level are expanded.
    if (distance[tid] != level) {
        return;
    }

    for (int edge = edgesOffset[tid]; edge < edgesOffset[tid] + edgesSize[tid]; ++edge) {
        const int neighbour = adjacencyList[edge];
        if (distance[neighbour] == INT_MAX) {
            *done = false;
            distance[neighbour] = level + 1;
        }
    }
}
Hi @Parrison, welcome to StackOverflow!
All the OpenCL error codes are defined in cl.h. In the latest (version 3) cl.h you will find the error codes defined between lines 194 and 270, and on line 241 you will find:
#define CL_INVALID_ARG_SIZE -51
So the OpenCL ICD reckons that you have passed the wrong variable size for distance.
However, I can see many other errors before this one. For example, you need to set the size of the OpenCL buffers based on the sizes of OpenCL variable not native variables, e.g.:
cl_int instead of int
cl_float instead of float
and especially cl_bool instead of bool.
There is no guarantee that an OpenCL cl_int is the same size as a host int, and an OpenCL cl_bool is defined as an unsigned int, which is highly unlikely to be the same size as a bool!
Ensure that all the parameters to your OpenCL kernel are defined correctly and that
you are creating the correct buffers and variables for them in the main program.

clGetProgramBuildInfo return empty string?

I'm trying to debug my code, but clGetProgramBuildInfo with the CL_PROGRAM_BUILD_LOG parameter returns an empty string. This is the code:
// Enumerates platforms and devices, creates a context on the first device,
// builds vadd.cl and prints the build log.
// NOTE(review): err is assigned by most calls below but never inspected.
cl_int err;
cl_uint num_platforms;
err = clGetPlatformIDs(0, NULL, &num_platforms);
// NOTE(review): runtime-sized arrays are a compiler extension in C++.
cl_platform_id platform[num_platforms];
err = clGetPlatformIDs(num_platforms, platform, NULL);
cl_uint num_devices;
err = clGetDeviceIDs(platform[0], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
cl_device_id device[num_devices];
err = clGetDeviceIDs(platform[0], CL_DEVICE_TYPE_ALL, num_devices, device, NULL);
// The context is created with a single device: the first entry of `device`.
cl_context context = clCreateContext(0, 1, device, NULL, NULL, &err);
string cl_str = util::loadProgram("vadd.cl");
const char * c = cl_str.c_str();
// lengths is NULL, so the source must be null-terminated; errcode_ret is NULL,
// so a creation failure is not reported.
cl_program program = clCreateProgramWithSource ( context,
1,
(const char **) &c,
NULL,
NULL);
// device_list is NULL: the program is built for the devices in the context.
err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_char string[10240] = {0};
// NOTE(review): param_value is never used.
char* param_value;
// NOTE(review): the log is requested for device[1], but the context (and so
// the build) only covers device[0]; device[1] exists only when
// num_devices >= 2. The return value is dropped, so a failure here would leave
// the zero-filled buffer untouched - a likely cause of the empty output;
// querying device[0] and checking the return code would confirm.
clGetProgramBuildInfo ( program,
device[1],
CL_PROGRAM_BUILD_LOG,
10240,
string,
NULL);
cout << string << endl;
Could you tell me what I am doing wrong? Thanks.

OpenCL alignment issue

I want to fill an array of glm::vec3 with an OpenCL kernel.
All I want to do is fill the array with [1.0, 2.0, 3.0].
So upon success I should get the triplet repeated 256 times.
[1.0, 2.0, 3.0][1.0, 2.0, 3.0][1.0, 2.0, 3.0] ... [1.0, 2.0, 3.0]
However the result looks like this
[1.0, 2.0, 2.0][2.0, 2.0, 2.0] ... [2.0, 2.0, 2.0]
Why?
Here is the code for the kernel
// Intended to store the triplet (1.0, 2.0, 3.0) once per work item.
__kernel void fill_array(__global float *output_values)
{
int i = get_global_id(0);
float3 pos = (float3)(1.0, 2.0, 3.0);
// NOTE(review): vstore3(data, offset, p) writes to p + offset * 3. With
// offset 0 and p = &output_values[i], work item i writes floats i, i + 1 and
// i + 2, so neighbouring work items overwrite each other's values. Passing i
// as the offset with the base pointer gives non-overlapping triplets.
vstore3(pos, 0, &(output_values[i]));
}
And here is the code to run it
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include "glm/glm.hpp"
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void)
{
std::vector<glm::vec3> values;
values.resize(256);
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("E:/Dev/fill_array_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem output_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, values.size() * sizeof(glm::vec3), NULL, &ret);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if(ret != CL_SUCCESS)
{
cl_build_status build_status;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL);
size_t ret_val_size;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
char *build_log = (char*)malloc(sizeof(char)*(ret_val_size + 1));
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
build_log[ret_val_size] = '\0';
printf("%s\n", build_log);
free(build_log);
return -1;
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "fill_array", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&output_mem);
// Execute the OpenCL kernel on the list
size_t global_item_size = values.size(); // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
ret = clEnqueueReadBuffer(command_queue, output_mem, CL_TRUE, 0, values.size() * sizeof(glm::vec3), values.data(), 0, NULL, NULL);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(output_mem);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
return 0;
}
I was misusing the vstore function.
I should have used the 2nd parameter to specify the index in the array.
https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/vstoren.html
// Stores the triplet (1.0, 2.0, 3.0) once per work item. The work-item index
// is passed as the vstore3 offset, so each work item writes its own group of
// three floats in output_values.
__kernel void fill_array(__global float *output_values)
{
    const float3 triplet = (float3)(1.0, 2.0, 3.0);
    const int slot = get_global_id(0);
    vstore3(triplet, slot, output_values);
}

OpenCL - Kernel method returns unexpected results

I am a beginner at OpenCL. I tried to run a very simple kernel that adds 1 to each value of a vector. Everything runs fine and no error code is returned (I checked the return value after each step). The source code:
// Host program for the "hello" kernel: loads hello.cl, creates an input and an
// output buffer of ten ints, runs the kernel and prints the output array.
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
// NOTE(review): only resobj is initialized here; memobj is assigned later.
cl_mem memobj , resobj = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
size_t work_units_per_kernels;
int input[10] = {1,2,3,4,5,6,7,8,9,10};
// NOTE(review): output is uninitialized, and CL_MEM_COPY_HOST_PTR below copies
// that indeterminate content into resobj.
int output[10];
int length = 10 ;
FILE *fp;
char fileName[] = "/home/tuan/OpenCLPlayaround/hello.cl";
char *source_str;
size_t source_size;
/* Load the source code containing the kernel*/
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
// NOTE(review): source_str is never freed in the visible code.
source_str = (char*)malloc(0x100000);
source_size = fread(source_str,1,0x100000, fp);
fclose(fp);
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
std::cout<<ret<<" code"<<std::endl;
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
std::cout<<ret<<" code"<<std::endl;
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
std::cout<<ret<<" code"<<std::endl;
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
//Check Concept of memory
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,length * sizeof(int), input, &ret);
resobj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length * sizeof(int), output, &ret);
std::cout<<ret<<" code"<<std::endl;
program = clCreateProgramWithSource(context,1,(const char**)&source_str, (const size_t*)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "hello", &ret);
ret = clSetKernelArg(kernel,0, sizeof(memobj),(void *)&memobj);
ret = clSetKernelArg(kernel,1, sizeof(resobj),(void *)&resobj);
// NOTE(review): clEnqueueTask launches a single work item, so the kernel only
// computes b[0]; the other nine outputs keep the indeterminate values copied
// from `output`. clEnqueueNDRangeKernel with a global size of `length` would
// run one work item per element.
ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
// Blocking read (CL_TRUE) of the result buffer.
ret = clEnqueueReadBuffer(command_queue, resobj, CL_TRUE, 0, length* sizeof(int),output, 0, NULL, NULL);
for (int i = 0 ; i <10 ; i++) {
std::cout<<output[i]<<" "<<std::endl;
}
// NOTE(review): no OpenCL object is released before returning.
return 0;
The result is somewhat bizarre; it should be {2,3,4,5,6,7,8,9,10,11}:
2
-16777216
65535
1
-1242789408
32767
4201449
0
2
0
And my kernel :
// Each work item writes a[gid] + 1 into b[gid], where gid is its global id.
__kernel void hello(__global int* a, __global int* b)
{
    const int gid = get_global_id(0);
    const int base = 0;
    b[gid] = base + a[gid] + 1;
}
Can somebody explain why? It has been driving me crazy for hours!
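clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with work_dim = 1, global_work_offset = NULL, global_work_size[0] set to 1, and local_work_size[0] set to 1.
So only a single work-item runs and only b[0] is computed. Use clEnqueueNDRangeKernel with a global work size equal to the number of elements (10 here) instead.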

clEnqueueWriteBuffer writes wrong data into VRAM

I have a very curious problem with clEnqueueWriteBuffer. In my current project, I would like to copy ~500 images (1GB) onto the graphics card and average some pixels. The images are stored in one big double* Array (size: width*height*nImages). If I copy 300 images into the VRAM and read it out using clEnqueueReadBuffer, I get exactly what I had stored in RAM:
RAM: 14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
VRAM: 14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
However, if I load more than 350 images, the content of my cl_mem object is corrupt:
RAM:14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
VRAM:-6,27743856220419E+66 -6,27743856220419E+66 -6,27743856220419E+66 -6,27743856220419E+66 -6,27743856220419E+66
I would be very happy if you could help me out!
Here is my code:
// Click handler: reads the bin width from maskedTextBox1, uploads the image
// stack to the device, reads it back for comparison, builds addKernel.c, runs
// the "filterSpectrum" kernel and reads the binned result into res_im.
// Progress and diagnostics are appended to textBox1.
private: System::Void button7_Click(System::Object^ sender, System::EventArgs^ e) {
std::string text;
text = StringConvA(maskedTextBox1->Text);
textBox1->Text += "You want a bin size of " + atoi(text.c_str()) + ". You have "+ nforegroundImages+" images.\r\n";
binWidth = atoi(text.c_str());
// Number of bins, rounded up so a partial last bin is counted.
// NOTE(review): a binWidth of 0 (empty or non-numeric input) divides by zero here.
nbins = (int)ceil((double)nforegroundImages / (double)binWidth);
textBox1->Text += "That is going to give you "+nbins+" bins\r\n";
//create context and cmd_queue
context = clCreateContext(NULL, nDevices, &deviceID[0], NULL, NULL, &err);
cmd_queue = clCreateCommandQueue(context, deviceID[0], NULL, &err);
//allocate result memory
//each result image will have width*height double entries. res_im is an array of pointer to double.
// NOTE(review): res_im is not freed in this handler - confirm who owns it.
res_im = (double*)malloc(width*height*sizeof(double)*nbins);
cl_mem imageData_mem, result_mem, nWavenumber_mem, binSize_mem, imageSizeInPixels_mem, nbins_mem;
// NOTE(review): every clCreateBuffer below passes NULL for errcode_ret, so a
// failed allocation (for example a request larger than the device's
// CL_DEVICE_MAX_MEM_ALLOC_SIZE) cannot be detected here.
// NOTE(review): depending on the types of width/height and the width of
// size_t, these size products may overflow for large image counts - confirm.
imageData_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, width * height * sizeof(double)*nforegroundImages, NULL, NULL);
result_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, width * height * sizeof(double)*nbins, NULL, NULL);
nWavenumber_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
binSize_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
imageSizeInPixels_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
nbins_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
clFinish(cmd_queue);
int imageSizeInPixels = width*height;
// Blocking uploads (CL_TRUE).
// NOTE(review): err is overwritten by each call without being inspected.
err = clEnqueueWriteBuffer(cmd_queue, imageData_mem, CL_TRUE, 0, width*height*sizeof(double)*nforegroundImages, (void*)images, 0, NULL, NULL); //this is where the images are copied into VRAM. If nforegroundImages>300, the data in VRAM is wrong, otherwise it is the same as in the images array
err = clEnqueueWriteBuffer(cmd_queue, nWavenumber_mem, CL_TRUE, 0, sizeof(int), (void*)&nforegroundImages, 0, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, binSize_mem, CL_TRUE, 0, sizeof(int), (void*)&binWidth, 0, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, imageSizeInPixels_mem, CL_TRUE, 0, sizeof(int), (void*)&imageSizeInPixels, 0, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, nbins_mem, CL_TRUE, 0, sizeof(int), (void*)&nbins, 0, NULL, NULL);
clFinish(cmd_queue);
//read the content of imageData_mem and store it in test array
// NOTE(review): the malloc result is not checked; if the read fails, test
// stays uninitialized and the values printed below are garbage.
double * test = (double*)malloc(width*height*sizeof(double)*nforegroundImages);
err = clEnqueueReadBuffer(cmd_queue, imageData_mem, CL_TRUE, 0, width*height*sizeof(double)*nforegroundImages,
test, 0, NULL, NULL);
clFinish(cmd_queue);
//compare original value from the images array to the value retrieved from the VRAM
// NOTE(review): index 1 is printed twice on both lines.
textBox1->Text += images[1] + "\t" + images[1] + "\t" + images[10] + "\t" + images[100] + "\t" + images[1000] + "\t\r\n"; //original data
textBox1->Text += test[1] + "\t" + test[1] + "\t" + test[10] + "\t" + test[100] + "\t" + test[1000] + "\t\r\n"; //retrieved from imageData_mem
free(test);
//build the program from the source file and print the program build log
cl_program program[2];
cl_kernel kernel[2];
const char * filename = "addKernel.c";
// NOTE(review): program_source is not released in this handler - confirm how
// load_program_source allocates it.
char *program_source = load_program_source(filename);
program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
NULL, &err);
if (err == CL_OUT_OF_HOST_MEMORY){
textBox1->Text += "Error: out of Host Memory!\r\n";
}
else if (err == CL_INVALID_CONTEXT){
textBox1->Text += "Error: invalid Context!\r\n";
}
else if (err == CL_INVALID_VALUE){
textBox1->Text += "Error: invalid Value!\r\n";
}
err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
textBox1->Text += "Program build error: " + err + "\r\n";
cl_build_status status;
size_t logSize;
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
// First query returns the log length; the log is then fetched into a
// zero-filled buffer one byte larger, so it stays null-terminated.
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
char* programLog;
programLog = (char*)calloc(logSize + 1, sizeof(char));
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_LOG, logSize + 1, programLog, NULL);
// NOTE(review): *programLog is only the first character of the log, not the
// whole text, and programLog is never freed.
this->textBox1->Text += "Program build info: error=" + err + ", status=" + status + ", programLog:\r\n" + *programLog + "\r\n" + "In case of an error please make sure that openCL has been initialized\r\n";
kernel[0] = clCreateKernel(program[0], "filterSpectrum", &err);
//(__global double *imageData, __global double *result, __constant int *nWavenumbers, __constant int *binSize, __constant int *imageSizeInPixels,__constant int * nbins)
// Now setup the arguments to our kernel
err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &imageData_mem);
err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &result_mem);
err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &nWavenumber_mem);
err |= clSetKernelArg(kernel[0], 3, sizeof(cl_mem), &binSize_mem);
err |= clSetKernelArg(kernel[0], 4, sizeof(cl_mem), &imageSizeInPixels_mem);
err |= clSetKernelArg(kernel[0], 5, sizeof(cl_mem), &nbins_mem);
size_t local_work_size = 32;
// Run the calculation by enqueuing it and forcing the
// command queue to complete the task
// NOTE(review): width*height must be a multiple of 32 for this launch to be
// accepted by OpenCL 1.x - confirm.
size_t global_work_size = width*height;
err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL,&global_work_size, &local_work_size, 0, NULL, NULL);
clFinish(cmd_queue);
// Once finished read back the results from the answer
// array into the results array
err = clEnqueueReadBuffer(cmd_queue, result_mem, CL_TRUE, 0, width*height*sizeof(double)*nbins,
res_im, 0, NULL, NULL);
clFinish(cmd_queue);
// NOTE(review): the fixed indices below assume res_im holds more than
// 1000000 doubles - confirm.
textBox1->Text += "result values " + res_im[1] + "\t" + res_im[100] + "\t" + res_im[1000] + "\t" + res_im[10000] + "\t" + res_im[100000] + "\t" + res_im[1000000] + "\r\n";
hScrollBar2->Maximum = nbins+3;
// Release the buffers, the queue and the context.
// NOTE(review): kernel[0] and program[0] are not released.
clReleaseMemObject(imageSizeInPixels_mem);
clReleaseMemObject(imageData_mem);
clReleaseMemObject(result_mem);
clReleaseMemObject(nWavenumber_mem);
clReleaseMemObject(binSize_mem);
clReleaseMemObject(nbins_mem);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
}
You are most likely requesting more memory than the driver will allow in a single allocation. It looks like you aren't checking most of the error codes that the OpenCL runtime functions return; doing this makes it much easier to diagnose problems with OpenCL programs. You really should do this for every API call.
You can find out what the largest single memory allocation your device supports is with the following code snippet:
cl_ulong maxMemAlloc;
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL);
textBox1->Text += "Maximum memory allocation size is " + maxMemAlloc + " bytes\r\n";
It's often the case that the largest single memory allocation is much smaller than the total size of the GPU memory. The OpenCL 1.x specification only requires it to be at least 1/4 of the device's global memory size (CL_DEVICE_GLOBAL_MEM_SIZE), and never less than 128 MB.