Can dedicated memory be released on the GPU using OpenCL? - c++

Is there a way to free up dedicated memory when a function that runs on GPU using OpenCL ends? I have noticed that if you repeatedly call a program that uses OpenCL in GPU and it ends, a certain amount of dedicated memory is not released, which would cause an error if the function is called too many times.
UPDATE 22/12/2019:
I enclose a fragment of the code that is within the iteration. The configuration of the cl_program and cl_context is done outside the iteration:
void launch_kernel(float * blockC, float * blockR, float * blockEst, float * blockEstv, float * blockL, cl_program program, cl_context context, std::vector<cl_device_id> deviceIds, int n){
cl_kernel kernel_function = clCreateKernel(program, "function_example", &error);
cl_event event;
cl_command_queue queue = clCreateCommandQueue(context, (deviceIds)[0], 0, &error);
std::size_t size[1] = { (size_t)n };
cl_mem blockCBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * (n * s), (void *)&blockC[0], &error);
cl_mem blockRBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * (n * s), (void *)&blockR[0], &error);
cl_mem blockEstBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * n, (void *)&blockEst[0], &error);
cl_mem blockEstvBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * n, (void *)&blockEstv[0], &error);
cl_mem blockLBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * n, (void *)&blockL[0], &error);
clSetKernelArg(kernel_solution, 0, sizeof(cl_mem), &blockCBuffer);
clSetKernelArg(kernel_solution, 1, sizeof(cl_mem), &blockRBuffer);
clSetKernelArg(kernel_solution, 2, sizeof(cl_mem), &blockEstBuffer);
clSetKernelArg(kernel_solution, 3, sizeof(cl_mem), &blockEstvBuffer);
clSetKernelArg(kernel_solution, 4, sizeof(cl_mem), &blockLBuffer);
clSetKernelArg(kernel_solution, 5, sizeof(int), &s);
openclSingleton->checkError(clEnqueueNDRangeKernel(queue, kernel_function, 1, nullptr, size, nullptr, 0, nullptr, nullptr));
clEnqueueMapBuffer(queue, blockEstBuffer, CL_TRUE, CL_MAP_READ, 0, n * sizeof(float) , 0, nullptr, nullptr, &error);
clEnqueueMapBuffer(queue, blockEstvBuffer, CL_TRUE, CL_MAP_READ, 0, n * sizeof(float) , 0, nullptr, nullptr, &error);
clEnqueueMapBuffer(queue, blockLBuffer, CL_TRUE, CL_MAP_READ, 0, n * sizeof(float) , 0, nullptr, nullptr, &error);
clReleaseMemObject(blockCBuffer);
clReleaseMemObject(blockRBuffer);
clReleaseMemObject(blockEstBuffer);
clReleaseMemObject(blockEstvBuffer);
clReleaseMemObject(blockLBuffer);
clFlush(queue);
clFinish(queue);
clWaitForEvents(1, &event);
clReleaseCommandQueue(queue);
clReleaseKernel(kernel_function);
}
UPDATE 23/12/2019
Updated the code with the iterative process that calls the function in OpenCl. The problem arises when at the end of the launch_kernel function it leaves some dedicated memory used, which causes that if the variable m is too large, the memory becomes full and the program crashes due to lack of resources.
std::vector<cl_device_id> deviceIds;
cl_program program;
cl_context context;
... //configuration program and context
int n;
float *blockEst, *blockEstv, *blockL, *blockC, *blockR;
for(int i = 0; i < m; m++){
blockC = (float*)std::malloc(sizeof(float)*n*s);
blockR = (float*)std::malloc(sizeof(float)*n*s);
blockEst = (float*)std::malloc(sizeof(float)*n);
blockEstv = (float*)std::malloc(sizeof(float)*n);
blockL = (float*)std::malloc(sizeof(float)*n);
.... //Set values blockC and blockR
launch_kernel(blockC, blockR, blockEst, blockEstv, blockL, program, context, deviceIds, n);
... //Save value blockEst, blockEstv and blockL
std::free(blockC);
std::free(blockR);
std::free(blockEst);
std::free(blockEstv);
std::free(blockL);
}
clReleaseProgram(program);
clReleaseContext(context);
clReleaseDevice(deviceIds[0]);

Related

OpenCL.clSetKernelArg returns -51

I tried to make parallel bfs in openCL but I didn't have enough experience with c++.
So this is probably memory error, but I really don't know how to fix it.
I also can't find what does error value -51 means.
As a result I got "Unhandled exception at 0x00007FFCFB06A549 (amdocl64.dll) in my project.exe: 0xC0000005: Access violation reading location 0xFFFFFFFFFFFFFFFF" in next line.
main
Graph G(AdjacencyList, Directed);
int startVertex;
vector<int> distance;
vector<bool> visited;
distance = vector<int>(G.numVertices);
visited = vector<bool>(G.numVertices);
bool done = false;
const bool true_value = true;
int level = 0;
// Allocation on device
const int size = G.numVertices * sizeof(int);
const int adjacencySize = G.adjacencyList.size() * sizeof(int);
//OpenCL
cl_int status;
cl_int ret;
cl_platform_id platform_id;
clGetPlatformIDs(1, &platform_id, NULL);
cl_device_id device_id;
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &status);
cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, device_id, NULL, &status);
cl_mem d_adjacencyList = clCreateBuffer(context, CL_MEM_READ_WRITE, adjacencySize, NULL, &status);
cl_mem d_edgesOffset = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
cl_mem d_edgesSize = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
cl_mem d_distance = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status);
cl_mem d_done = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(bool), NULL, &status);
status = clEnqueueWriteBuffer(command_queue, d_adjacencyList, CL_TRUE, 0, adjacencySize, &G.adjacencyList[0], 0, NULL, NULL);
status = clEnqueueWriteBuffer(command_queue, d_edgesOffset, CL_TRUE, 0, size, &G.edgesOffset[0], 0, NULL, NULL);
status = clEnqueueWriteBuffer(command_queue, d_edgesSize, CL_TRUE, 0, size, &G.edgesSize[0], 0, NULL, NULL);
distance = vector<int>(G.numVertices, INT_MAX);
distance[start] = 0;
status = clEnqueueWriteBuffer(command_queue, d_distance, CL_TRUE, 0, size, distance.data(), 0, NULL, NULL);
char* source_str = NULL;
size_t source_size;
FILE* fp;
fp = fopen("bfs.cl", "r");
if (!fp)
{
cout << "Failed to load Kernel\n";
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &status);
status = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, "bfs", &status);
status = clSetKernelArg(kernel, 0, sizeof(int), (void*)&G.numVertices);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&d_adjacencyList);
status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&d_edgesOffset);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void*)&d_edgesOffset);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), (void*)&d_edgesSize);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), (void*)&d_distance); //here retirns -51
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&level);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&d_done);
kernel
__kernel void bfs(int n, __global int *adjacencyList,__global int *edgesOffset,__global int *edgesSize,__global int *distance, int level,__global bool *done) {
int tid = get_global_id(0);
if (tid < n) {
if (distance[tid] == level) {
for (int i = edgesOffset[tid]; i < edgesOffset[tid] + edgesSize[tid]; ++i) {
int v = adjacencyList[i];
if (distance[v] == INT_MAX) {
*done = false;
distance[v] = level + 1;
}
}
}
}
}
Hi #Parrison welcome to StackOverflow!
All the OpenCL error codes are defined in cl.h. In the latest (version 3) cl.h you will find the error codes defined between lines 194 and 270, where on line 241 you will find:
#define CL_INVALID_ARG_SIZE -51
So the OpenCL ICD reckons that you have passed the wrong variable size for distance.
However, I can see many other errors before this one. For example, you need to set the size of the OpenCL buffers based on the sizes of OpenCL variable not native variables, e.g.:
cl_int instead of int
cl_float instead of float
and especially cl_bool instead of bool.
There is no guarantee that an OpenCL cl_int is the same size a host int and an OpenCL cl_bool is defined as an unsigned int which is highly unlikely to be the same size as a bool!
Ensure that all the parameters to your OpenCL kernel are defined correctly and that
you are creating the correct buffers and variables for them in the main program.

Model class fails to initialize in DirectX 10

My project uses DirectX 10 and some of its boilerplate to render a scene, however, it crashes with an error message "Could not initialize the model object." As far as I understand, making it up to this point means that, at the very least, the model has been successfully created, so the error must be in one of the files below, which is fortunate as the most difficult tasks are handled by the FallBodyClass.cpp that hosts OpenCL API interactions. If needed, I can try attaching parts of it in a later edit.
During debug, my IDE shows that all components of m_Model (m_vertexBuffer, m_indexBuffer etc) are shown as with _vfptr . I do not know what to make of it, but it does seem to confirm that modelclass.cpp is the point of failure.
graphicsclass.cpp
GraphicsClass::GraphicsClass()
{
m_Direct3D = 0;
m_Model = 0;
m_ColorShader = 0;
m_bodies = BODIES;
}
GraphicsClass::GraphicsClass(const GraphicsClass& other)
{}
GraphicsClass::~GraphicsClass()
{}
bool GraphicsClass::Initialize(int screenWidth, int screenHeight, HWND hwnd)
{
bool result;
// Create the Direct3D object.
m_Direct3D = new D3DClass;
if (!m_Direct3D)
{
return false;
}
// Initialize the Direct3D object.
result = m_Direct3D->Initialize(screenWidth, screenHeight, VSYNC_ENABLED, hwnd, FULL_SCREEN, SCREEN_DEPTH, SCREEN_NEAR);
if (!result)
{
MessageBox(hwnd, L"Could not initialize Direct3D", L"Error", MB_OK);
return false;
}
// Create the model object.
m_Model = new ModelClass(m_bodies);
if (!m_Model)
{
return false;
}
// Initialize the model object.
result = m_Model->Initialize(m_Direct3D->GetDevice());
if (!result)
{
MessageBox(hwnd, L"Could not initialize the model object.", L"Error", MB_OK);
return false;
}
modelclass.cpp
ModelClass::ModelClass(int bodies)
{
m_vertexBuffer = 0;
m_indexBuffer = 0;
m_positions = 0;
m_velocities = 0;
m_bodySystem = 0;
m_bodies = bodies;
}
ModelClass::ModelClass(const ModelClass& other)
{}
ModelClass::~ModelClass()
{}
bool ModelClass::Initialize(ID3D10Device* device)
{
bool result;
TwoLines twoLinesConstants = CalculateLinesConstants(M_PI_4);
m_positions = new float[COORD_DIM * m_bodies];
m_velocities = new float[VEL_DIM * m_bodies];
m_bodySystem = new class FallBodyClass(m_bodies, &m_positions, &m_velocities, twoLinesConstants, result);
if (!result) {
return false;
}
// Initialize the vertex and index buffer that hold the geometry for the triangle.
result = InitializeBuffers(device, twoLinesConstants);
if(!result)
{
return false;
}
return true;
}
FallBodyclass.cpp
FallBodyClass::FallBodyClass(int bodies, float ** positionsCPU, float ** velocitiesCPU, TwoLines twoLines, bool & success)
:bodies(bodies)
{
cl_int ret;
// getting the first available platform
cl_platform_id clPlatformID[2];
cl_platform_id GPUplatform;
cl_uint num_platforms;
//char str[1024];
ret = clGetPlatformIDs(2, clPlatformID, &num_platforms);
GPUplatform = clPlatformID[0]; //choose GPU platform
//error |= clGetPlatformInfo(GPUplatform, CL_PLATFORM_NAME, 0, NULL, NULL);
//clGetPlatformInfo(GPUplatform, CL_PLATFORM_VENDOR, sizeof(str), str, NULL);
// getting the first GPU device
ret |= clGetDeviceIDs(GPUplatform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
//clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(str), str, NULL);
// creating the context
context = clCreateContext(0, 1, &device, NULL, NULL, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
cl_queue_properties props[] = {
CL_QUEUE_PROFILING_ENABLE
};
// creating the command queue
queue = clCreateCommandQueueWithProperties(context, device, props, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
// setting the local variables
// (at the same time one of them supposed to be 0 and another to be 1)
read = 0;
write = 1;
// reading the kernel
FILE * f = NULL;
char fileName[18] = "kernel.cl";
f = fopen(fileName, "rb");
if(f == NULL)
{
success = false;
return;
}
// getting the length of the source code for the kernel
fseek(f, 0, SEEK_END);
size_t codeLength = ftell(f);
rewind(f);
char * code = (char *)malloc(codeLength + 1);
if (fread(code, codeLength, 1, f) != 1)
{
fclose(f);
free(code);
success = false;
return;
}
// closing the file and 0-terminating the source code
fclose(f);
code[codeLength] = '\0';
// creating the program
program = clCreateProgramWithSource(context, 1, (const char **)&code, &codeLength, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
// clearing the memory
free(code);
// building the program
ret |= clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// creating the kernel
kernel = clCreateKernel(program, "impactManager", &ret);
// setting the local size of the group the largest possible in order to load all computational units
int numGroups;
ret |= clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numGroups), &numGroups, NULL);
localSize = bodies / numGroups;
// allocating pinned buffers for velocities and positions, and stuck
positionsCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, COORD_DIM * bodies * sizeof(float) , NULL, NULL);
velocitiesCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, VEL_DIM * bodies * sizeof(float) , NULL, NULL);
linesCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 8 * sizeof(float), NULL, NULL);
// get pointers to arrays to operate with the buffers (array map buffers here (to program) as float-arrays)
*positionsCPU = (float *)clEnqueueMapBuffer(queue, positionsCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, COORD_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
*velocitiesCPU = (float *)clEnqueueMapBuffer(queue, velocitiesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, VEL_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
float * linesCPU = (float *)clEnqueueMapBuffer(queue, linesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, 8 * sizeof(float), 0, NULL, NULL, NULL);
// initialization of the bodies' positions and velocities, and stuck
initBodies(*positionsCPU, *velocitiesCPU);
initLines(twoLines, linesCPU);
// unmapping the pointers to arrays (invalidates array pointers)
clEnqueueUnmapMemObject(queue, positionsCPUBuffer, *positionsCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, velocitiesCPUBuffer, *velocitiesCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, linesCPUBuffer, linesCPU, 0, NULL, NULL);
// allocate two arrays on GPU for positions and velocities
for (int i = 0; i < 2; ++i) {
positionsGPU[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, COORD_DIM * bodies * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, positionsGPU[i], CL_TRUE, 0, COORD_DIM * bodies * sizeof(float), *positionsCPU, 0, NULL, NULL);
velocitiesGPU[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, VEL_DIM * bodies * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, velocitiesGPU[i], CL_TRUE, 0, VEL_DIM * bodies * sizeof(float), *velocitiesCPU, 0, NULL, NULL);
}
linesGPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 8 * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, linesGPU, CL_TRUE, 0, 8 * sizeof(float), linesCPU, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
}
void FallBodyClass::initLines(IN TwoLines l, OUT float *linesCPU)
{
linesCPU[0] = l.a1;
linesCPU[1] = l.b1;
linesCPU[2] = l.R1.x;
linesCPU[3] = l.R1.y;
linesCPU[4] = l.a2;
linesCPU[5] = l.b2;
linesCPU[6] = l.R2.x;
linesCPU[7] = l.R2.y;
}
// initialization of the bodies' positions and velocities
void FallBodyClass::initBodies(float * positionsCPU, float * velocitiesCPU)
{
float scale = 0.20f;
// initialization of the memory
memset(positionsCPU, 0, COORD_DIM * bodies * sizeof(float));
memset(velocitiesCPU, 0, VEL_DIM * bodies * sizeof(float));
// for the randomization
srand((unsigned int)time(NULL));
for (int i = 0; i < bodies; i++)
{
positionsCPU[COORD_DIM * i] = 1.8*((rand() / (float)RAND_MAX) - 0.5); //x axis
positionsCPU[COORD_DIM * i + 1] = 0.9; //y axis
positionsCPU[COORD_DIM * i + 2] = 0.0f; //z axis
positionsCPU[COORD_DIM * i + 3] = 0.0f; // stuck variable
// velocities are zeros
velocitiesCPU[VEL_DIM* i] = 0.0;
velocitiesCPU[VEL_DIM* i + 1] = -2 * (rand() / (float)RAND_MAX);
velocitiesCPU[VEL_DIM* i + 2] = 0.0;
}
}
// updating the bodies' positions and velocities. Stuck is updated inside too
void FallBodyClass::update(float dt, float * positionsCPU, float * velocitiesCPU, bool & success)
{
cl_int error = CL_SUCCESS;
size_t global_work_size;
size_t local_work_size;
success = true;
if (localSize > bodies)
localSize = bodies;
local_work_size = localSize;
global_work_size = bodies;
// passing the arguments
// we write the new positions and velocities and read the previous ones
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&positionsGPU[write]);
error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&velocitiesGPU[write]);
error |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&positionsGPU[read]);
error |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&velocitiesGPU[read]);
error |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&dt);
error |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&linesGPU);
// just swap read and write in order not to copy the arrays
int temp;
temp = write;
write = read;
read = temp;
// executing the kernel
error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
// synchronization
clFinish(queue);
// asynchronously reading the updated values
error |= clEnqueueReadBuffer(queue, positionsGPU[read], CL_FALSE, 0, COORD_DIM * bodies * sizeof(float), positionsCPU, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
success = false;
}
error |= clEnqueueReadBuffer(queue, velocitiesGPU[read], CL_FALSE, 0, VEL_DIM * bodies * sizeof(float), velocitiesCPU, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
success = false;
}
///////////
bool toReboot = positionsCPU[3]; //fourth index of the [0] first element
//bool toReboot = false;
////////////
if (toReboot) {
positionsCPU = (float *)clEnqueueMapBuffer(queue, positionsCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, COORD_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
velocitiesCPU = (float *)clEnqueueMapBuffer(queue, velocitiesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, VEL_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
initBodies(positionsCPU, velocitiesCPU);
// unmapping the pointers
clEnqueueUnmapMemObject(queue, positionsCPUBuffer, positionsCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, velocitiesCPUBuffer, velocitiesCPU, 0, NULL, NULL);
//update values on GPU side
error |= clEnqueueWriteBuffer(queue, positionsGPU[read], CL_TRUE, 0, COORD_DIM * bodies * sizeof(float), positionsCPU, 0, NULL, NULL);
error |= clEnqueueWriteBuffer(queue, velocitiesGPU[read], CL_TRUE, 0, VEL_DIM * bodies * sizeof(float), velocitiesCPU, 0, NULL, NULL);
}
return;
}
FallBodyClass::~FallBodyClass(void)
{
// synchronization (if something has to be done)
clFinish(queue);
// releasing all objects
clReleaseMemObject(linesGPU);
clReleaseMemObject(linesCPUBuffer);
clReleaseMemObject(velocitiesGPU[0]);
clReleaseMemObject(velocitiesGPU[1]);
clReleaseMemObject(positionsGPU[0]);
clReleaseMemObject(positionsGPU[1]);
clReleaseMemObject(positionsCPUBuffer);
clReleaseMemObject(velocitiesCPUBuffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
}

OpenCL clEnqueueWriteBuffer pass pointer error

Is it necessary that array pointer passed to clEnqueueWriteBuffer should malloc in the same scope?
Here is my code:
class int_matrix{
public:
int_matrix(size_t size_row, size_t size_col) :
_size_row(size_row), _size_col(size_col) {
element = (int*)malloc(size_row * size_col * sizeof(int));
}
friend int_matrix cl_prod_l(int_matrix& lhs, int_matrix& rhs);
private:
int* element;
};
int_matrix cl_prod_l(int_matrix& lhs, int_matrix& rhs) {
...
int_matrix return_val(lhs._size_row, rhs._size_col, 0); // Initialize elements in retrun_val
cl_mem lhs_buffer, rhs_buffer, return_buffer;
/* config buffer */
lhs_buffer = clCreateBuffer(int_matrix::_clconfig._context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, M*K * sizeof(int), NULL, &err);
rhs_buffer = clCreateBuffer(int_matrix::_clconfig._context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, N*K * sizeof(int), NULL, &err);
return_buffer = clCreateBuffer(int_matrix::_clconfig._context,
CL_MEM_READ_WRITE, M*N * sizeof(int), NULL, &err);
cl_kernel Kernel= clCreateKernel(int_matrix::_clconfig._program, ker, &err);
/* enqueue buffer */
clEnqueueWriteBuffer(int_matrix::_clconfig._cmdque, lhs_buffer, CL_TRUE, 0, M*K * sizeof(int), lhs.element, 0, NULL, NULL);
clEnqueueWriteBuffer(int_matrix::_clconfig._cmdque, rhs_buffer, CL_TRUE, 0, N*K * sizeof(int), rhs.element, 0, NULL, NULL);
clEnqueueWriteBuffer(int_matrix::_clconfig._cmdque, return_buffer, CL_TRUE, 0, M*N * sizeof(int), return_val.element, 0, NULL, NULL);
...
}
In this example, I find lhs.element, rhs.element and return_val.element cannot be passed in kernel. But when I change to some array malloc in this this function(copy the same value), the kernel can return the right result.
So is there some limitations on the array pointer passed to clEnqueueWriteBuffer?
Emmm...
I find the answer my seld, cl_mem object and int* element should be put in the same scope.

Not all work-items being used opencl

so I'm able to compile and execute my kernel, the problem is that only two work-items are being used. I'm basically trying to fill up a float array[8] with {0,1,2,3,4,5,6,7}. So this is a very simple hello world application. Bellow is my kernel.
// Highly simplified to demonstrate
__kernel void rnd_float32_matrix (
__global float * res
) {
uint idx = get_global_id(0);
res[idx] = idx;
}
I then create and execute the kernel with the following code...
// Some more code
cl::Program program(context, sources, &err);
program.build(devices, NULL, NULL, NULL);
cl::Kernel kernel(program, "rnd_float32_matrix", &err);
kernel.setArg(0, src_d);
cl::CommandQueue queue(context, devices[0], 0, &err);
cl::Event event;
err = queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange(8),
// I've tried cl::NDRange(8) as well
cl::NDRange(1),
NULL,
&event
);
event.wait();
err = queue.enqueueReadBuffer(
// This is:
// cl::Buffer src_d(
// context,
// CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
// mem_size,
// src_h,
// &err);
src_d,
CL_TRUE,
0,
8,
// This is float * src_h = new float[8];
src_h);
for(int i = 0; i < 8; i ++) {
std::cout << src_h[i] << std::endl;
}
I may not show it in the code, but I also do select a gpu device and using context.getInfo(..) it shows I'm using my NVidia GTX 770M card which shows 1024, 1024, 64 work-items available in dimensions 0, 1 and 2. When this array prints I keep getting... 0, 1, 0, 0, 0, 0, 0, 0. I've also tried setting res[idx] = 5, and I get... 5, 5, 0, 0, 0, 0, 0, 0. So it seems that only two give work-items are actually being used. What am I doing wrong?
Your command to read the data back from the device is only reading 8 bytes, which is two floats:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8, // <- This is the number of bytes, not the number of elements!
// This is float * src_h = new float[8];
src_h);
To read 8 floats, you would need to do this:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8 * sizeof(cl_float),
// This is float * src_h = new float[8];
src_h);

CL_MEM_ALLOC_HOST_PTR slower than CL_MEM_USE_HOST_PTR

So I've been playing around with OpenCL for a bit now and testing the speeds of memory transfer between host and device.
I was using Intel OpenCL SDK and running on the Intel i5 Processor with integrated graphics.
I then discovered clEnqueueMapBuffer instead of clEnqueueWriteBuffer which turned out to be faster by almost 10 times when using pinned memory like so:
int amt = 16*1024*1024;
...
k_a = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*amt, a, NULL);
k_b = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*amt, b, NULL);
k_c = clCreateBuffer(context,CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*amt, ret, NULL);
int* map_a = (int*) clEnqueueMapBuffer(c_q, k_a, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_b = (int*) clEnqueueMapBuffer(c_q, k_b, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_c = (int*) clEnqueueMapBuffer(c_q, k_c, CL_TRUE, CL_MAP_WRITE, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
clFinish(c_q);
Where a b and ret are 128 bit aligned int arrays.
The time came out to about 22.026186 ms, compared to 198.604528 ms using clEnqueueWriteBuffer
However, when I changed my code to
k_a = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, NULL);
k_b = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, NULL);
k_c = clCreateBuffer(context,CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, NULL);
int* map_a = (int*)clEnqueueMapBuffer(c_q, k_a, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_b = (int*)clEnqueueMapBuffer(c_q, k_b, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_c = (int*)clEnqueueMapBuffer(c_q, k_c, CL_TRUE, CL_MAP_WRITE, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
/** initiate map_a and map_b **/
the time increases to 91.350065 ms
What could be the problem? Or is it a problem at all?
EDIT:
This is how I initialize the arrays in the second code:
for (int i = 0; i < amt; i++)
{
map_a[i] = i;
map_b[i] = i;
}
And now that I check, map_a and map_b do contain the right elements at the end of the program, but map_c contains all 0's. I did this:
clEnqueueUnmapMemObject(c_q, k_a, map_a, 0, NULL, NULL);
clEnqueueUnmapMemObject(c_q, k_b, map_b, 0, NULL, NULL);
clEnqueueUnmapMemObject(c_q, k_c, map_c, 0, NULL, NULL);
and my kernel is just
__kernel void test(__global int* a, __global int* b, __global int* c)
{
int i = get_global_id(0);
c[i] = a[i] + b[i];
}
My understanding is that CL_MEM_ALLOC_HOST_PTR allocates but doesn't copy. Does the 2nd block of code actually get any data onto the device?
Also, clCreateBuffer when used with CL_MEM_USE_HOST_PTR and CL_MEM_COPY_HOST_PTR shouldn't require clEnqueueWrite, as the buffer is created with the memory pointed to by void *host_ptr.
Using "pinned" memory in OpenCL should be a process like:
int amt = 16*1024*1024;
int Array[] = new int[amt];
int Error = 0;
//Note, since we are using NULL for the data pointer, we HAVE to use CL_MEM_ALLOC_HOST_PTR
//This allocates memory on the devices
cl_mem B1 = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, &Error);
//Map the Device memory to host memory, aka pinning it
int *host_ptr = clEnqueueMapBuffer(queue, B1, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(int)*amt, 0, NULL, NULL, &Error);
//Copy from host memory to pinned host memory which copies to the card automatically`
memcpy(host_ptr, Array, sizeof(int)*amt);
//Call your kernel and everything else and memcpy back the pinned back to host when
//you are done
Edit: One final thing you can do to speed up the program is to not make the memory read/write blocking by using CL_FALSE instead of CL_TRUE. Just make sure to call clFinish() before data gets copied back to the host so that the command queue is emptied and all commands are processed.
Source: OpenCL In Action
With the right combination of flags, you should be able to achieve "zero copy" (i.e. very fast) map/unmap on Intel Integrated Graphics since there is no need for a "CPU to GPU" copy since they both use the same memory (that's what the "Integrated" means). Read the Intel OpenCL Optimization Guide section on memory.