OpenCL clEnqueueWriteBuffer pass pointer error - c++

Is it necessary that array pointer passed to clEnqueueWriteBuffer should malloc in the same scope?
Here is my code:
class int_matrix{
public:
int_matrix(size_t size_row, size_t size_col) :
_size_row(size_row), _size_col(size_col) {
element = (int*)malloc(size_row * size_col * sizeof(int));
}
friend int_matrix cl_prod_l(int_matrix& lhs, int_matrix& rhs);
private:
int* element;
};
int_matrix cl_prod_l(int_matrix& lhs, int_matrix& rhs) {
...
int_matrix return_val(lhs._size_row, rhs._size_col, 0); // Initialize elements in retrun_val
cl_mem lhs_buffer, rhs_buffer, return_buffer;
/* config buffer */
lhs_buffer = clCreateBuffer(int_matrix::_clconfig._context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, M*K * sizeof(int), NULL, &err);
rhs_buffer = clCreateBuffer(int_matrix::_clconfig._context,
CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, N*K * sizeof(int), NULL, &err);
return_buffer = clCreateBuffer(int_matrix::_clconfig._context,
CL_MEM_READ_WRITE, M*N * sizeof(int), NULL, &err);
cl_kernel Kernel= clCreateKernel(int_matrix::_clconfig._program, ker, &err);
/* enqueue buffer */
clEnqueueWriteBuffer(int_matrix::_clconfig._cmdque, lhs_buffer, CL_TRUE, 0, M*K * sizeof(int), lhs.element, 0, NULL, NULL);
clEnqueueWriteBuffer(int_matrix::_clconfig._cmdque, rhs_buffer, CL_TRUE, 0, N*K * sizeof(int), rhs.element, 0, NULL, NULL);
clEnqueueWriteBuffer(int_matrix::_clconfig._cmdque, return_buffer, CL_TRUE, 0, M*N * sizeof(int), return_val.element, 0, NULL, NULL);
...
}
In this example, I find lhs.element, rhs.element and return_val.element cannot be passed in kernel. But when I change to some array malloc in this this function(copy the same value), the kernel can return the right result.
So is there some limitations on the array pointer passed to clEnqueueWriteBuffer?

Emmm...
I find the answer my seld, cl_mem object and int* element should be put in the same scope.

Related

clEnqueueReadBuffer does not assign all values to cl_mem buffer which point to std::vector's data

https://i.stack.imgur.com/TA9v6.png
I have been trying to get to compile a kernel that assigns certain indices to an std::vector using OpenCL through clEnqueueReadBuffer function but it does not seem to work correctly since the first result is the only assigned in the std::vector
the source code for the host in c++ is the following:
cl_mem originalPixelsBuffer = clCreateBuffer(p1.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(Color) * imageObj->SourceLength(), source, &p1.status);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to Create buffer 0");
cl_mem targetBuffer = clCreateBuffer(p1.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(Color) * imageObj->OutputLength(), target, &p1.status);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to Create buffer 1");
//write buffers
p1.status = clEnqueueWriteBuffer(p1.commandQueue, originalPixelsBuffer, CL_FALSE, 0, sizeof(Color) * imageObj->SourceLength(), source, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 0");
p1.status = clEnqueueWriteBuffer(p1.commandQueue, targetBuffer, CL_TRUE, 0, sizeof(Color) * imageObj->OutputLength(), target, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 1");
size_t globalWorkSize[2] = { imageObj->originalWidth * 4, imageObj->originalHeight * 4 };
size_t localWorkSize[2]{ 64,64 };
SetLocalWorkSize(IsDivisibleBy64(localWorkSize[0]), localWorkSize);
//execute kernel
p1.status = clEnqueueNDRangeKernel(p1.commandQueue, Kernel, 1, NULL, globalWorkSize, IsDisibibleByLocalWorkSize(globalWorkSize, localWorkSize) ? localWorkSize : NULL, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to clEnqueueDRangeKernel");
//read buffer
p1.status = clEnqueueReadBuffer(p1.commandQueue, targetBuffer, CL_TRUE, 0, sizeof(Color) * imageObj->OutputLength(), target, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 1");
the kernel code:
__kernel void interp(__global struct Color* source,__global struct Color* target,uint64 width,uint64 height,uint64 ratio,uint64 limit, uint64 originalHeight)
{
__private fp32 wIndex = (int64)get_global_id(0);
__private fp32 hIndex = (int64)get_global_id(1);
if(((int64)wIndex)%ratio==MATCH && ((int64)hIndex)%ratio ==MATCH)
{
__private int64 Index = (wIndex/ratio) * (originalHeight/ratio) + (hIndex/ratio);
if(Index < limit)
{
__private int64 tIndex = wIndex * height + hIndex;
target[tIndex].R = source[Index].R;
target[tIndex].G = source[Index].G;
target[tIndex].B = source[Index].B;
target[tIndex].A = source[Index].A;
}
}
}```

Can dedicated memory be released on the GPU using OpenCL?

Is there a way to free up dedicated memory when a function that runs on GPU using OpenCL ends? I have noticed that if you repeatedly call a program that uses OpenCL in GPU and it ends, a certain amount of dedicated memory is not released, which would cause an error if the function is called too many times.
UPDATE 22/12/2019:
I enclose a fragment of the code that is within the iteration. The configuration of the cl_program and cl_context is done outside the iteration:
void launch_kernel(float * blockC, float * blockR, float * blockEst, float * blockEstv, float * blockL, cl_program program, cl_context context, std::vector<cl_device_id> deviceIds, int n){
cl_kernel kernel_function = clCreateKernel(program, "function_example", &error);
cl_event event;
cl_command_queue queue = clCreateCommandQueue(context, (deviceIds)[0], 0, &error);
std::size_t size[1] = { (size_t)n };
cl_mem blockCBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * (n * s), (void *)&blockC[0], &error);
cl_mem blockRBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * (n * s), (void *)&blockR[0], &error);
cl_mem blockEstBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * n, (void *)&blockEst[0], &error);
cl_mem blockEstvBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * n, (void *)&blockEstv[0], &error);
cl_mem blockLBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * n, (void *)&blockL[0], &error);
clSetKernelArg(kernel_solution, 0, sizeof(cl_mem), &blockCBuffer);
clSetKernelArg(kernel_solution, 1, sizeof(cl_mem), &blockRBuffer);
clSetKernelArg(kernel_solution, 2, sizeof(cl_mem), &blockEstBuffer);
clSetKernelArg(kernel_solution, 3, sizeof(cl_mem), &blockEstvBuffer);
clSetKernelArg(kernel_solution, 4, sizeof(cl_mem), &blockLBuffer);
clSetKernelArg(kernel_solution, 5, sizeof(int), &s);
openclSingleton->checkError(clEnqueueNDRangeKernel(queue, kernel_function, 1, nullptr, size, nullptr, 0, nullptr, nullptr));
clEnqueueMapBuffer(queue, blockEstBuffer, CL_TRUE, CL_MAP_READ, 0, n * sizeof(float) , 0, nullptr, nullptr, &error);
clEnqueueMapBuffer(queue, blockEstvBuffer, CL_TRUE, CL_MAP_READ, 0, n * sizeof(float) , 0, nullptr, nullptr, &error);
clEnqueueMapBuffer(queue, blockLBuffer, CL_TRUE, CL_MAP_READ, 0, n * sizeof(float) , 0, nullptr, nullptr, &error);
clReleaseMemObject(blockCBuffer);
clReleaseMemObject(blockRBuffer);
clReleaseMemObject(blockEstBuffer);
clReleaseMemObject(blockEstvBuffer);
clReleaseMemObject(blockLBuffer);
clFlush(queue);
clFinish(queue);
clWaitForEvents(1, &event);
clReleaseCommandQueue(queue);
clReleaseKernel(kernel_function);
}
UPDATE 23/12/2019
Updated the code with the iterative process that calls the function in OpenCl. The problem arises when at the end of the launch_kernel function it leaves some dedicated memory used, which causes that if the variable m is too large, the memory becomes full and the program crashes due to lack of resources.
std::vector<cl_device_id> deviceIds;
cl_program program;
cl_context context;
... //configuration program and context
int n;
float *blockEst, *blockEstv, *blockL, *blockC, *blockR;
for(int i = 0; i < m; m++){
blockC = (float*)std::malloc(sizeof(float)*n*s);
blockR = (float*)std::malloc(sizeof(float)*n*s);
blockEst = (float*)std::malloc(sizeof(float)*n);
blockEstv = (float*)std::malloc(sizeof(float)*n);
blockL = (float*)std::malloc(sizeof(float)*n);
.... //Set values blockC and blockR
launch_kernel(blockC, blockR, blockEst, blockEstv, blockL, program, context, deviceIds, n);
... //Save value blockEst, blockEstv and blockL
std::free(blockC);
std::free(blockR);
std::free(blockEst);
std::free(blockEstv);
std::free(blockL);
}
clReleaseProgram(program);
clReleaseContext(context);
clReleaseDevice(deviceIds[0]);

Model class fails to initialize in DirectX 10

My project uses DirectX 10 and some of its boilerplate to render a scene, however, it crashes with an error message "Could not initialize the model object." As far as I understand, making it up to this point means that, at the very least, the model has been successfully created, so the error must be in one of the files below, which is fortunate as the most difficult tasks are handled by the FallBodyClass.cpp that hosts OpenCL API interactions. If needed, I can try attaching parts of it in a later edit.
During debug, my IDE shows that all components of m_Model (m_vertexBuffer, m_indexBuffer etc) are shown as with _vfptr . I do not know what to make of it, but it does seem to confirm that modelclass.cpp is the point of failure.
graphicsclass.cpp
GraphicsClass::GraphicsClass()
{
m_Direct3D = 0;
m_Model = 0;
m_ColorShader = 0;
m_bodies = BODIES;
}
GraphicsClass::GraphicsClass(const GraphicsClass& other)
{}
GraphicsClass::~GraphicsClass()
{}
bool GraphicsClass::Initialize(int screenWidth, int screenHeight, HWND hwnd)
{
bool result;
// Create the Direct3D object.
m_Direct3D = new D3DClass;
if (!m_Direct3D)
{
return false;
}
// Initialize the Direct3D object.
result = m_Direct3D->Initialize(screenWidth, screenHeight, VSYNC_ENABLED, hwnd, FULL_SCREEN, SCREEN_DEPTH, SCREEN_NEAR);
if (!result)
{
MessageBox(hwnd, L"Could not initialize Direct3D", L"Error", MB_OK);
return false;
}
// Create the model object.
m_Model = new ModelClass(m_bodies);
if (!m_Model)
{
return false;
}
// Initialize the model object.
result = m_Model->Initialize(m_Direct3D->GetDevice());
if (!result)
{
MessageBox(hwnd, L"Could not initialize the model object.", L"Error", MB_OK);
return false;
}
modelclass.cpp
ModelClass::ModelClass(int bodies)
{
m_vertexBuffer = 0;
m_indexBuffer = 0;
m_positions = 0;
m_velocities = 0;
m_bodySystem = 0;
m_bodies = bodies;
}
ModelClass::ModelClass(const ModelClass& other)
{}
ModelClass::~ModelClass()
{}
bool ModelClass::Initialize(ID3D10Device* device)
{
bool result;
TwoLines twoLinesConstants = CalculateLinesConstants(M_PI_4);
m_positions = new float[COORD_DIM * m_bodies];
m_velocities = new float[VEL_DIM * m_bodies];
m_bodySystem = new class FallBodyClass(m_bodies, &m_positions, &m_velocities, twoLinesConstants, result);
if (!result) {
return false;
}
// Initialize the vertex and index buffer that hold the geometry for the triangle.
result = InitializeBuffers(device, twoLinesConstants);
if(!result)
{
return false;
}
return true;
}
FallBodyclass.cpp
FallBodyClass::FallBodyClass(int bodies, float ** positionsCPU, float ** velocitiesCPU, TwoLines twoLines, bool & success)
:bodies(bodies)
{
cl_int ret;
// getting the first available platform
cl_platform_id clPlatformID[2];
cl_platform_id GPUplatform;
cl_uint num_platforms;
//char str[1024];
ret = clGetPlatformIDs(2, clPlatformID, &num_platforms);
GPUplatform = clPlatformID[0]; //choose GPU platform
//error |= clGetPlatformInfo(GPUplatform, CL_PLATFORM_NAME, 0, NULL, NULL);
//clGetPlatformInfo(GPUplatform, CL_PLATFORM_VENDOR, sizeof(str), str, NULL);
// getting the first GPU device
ret |= clGetDeviceIDs(GPUplatform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
//clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(str), str, NULL);
// creating the context
context = clCreateContext(0, 1, &device, NULL, NULL, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
cl_queue_properties props[] = {
CL_QUEUE_PROFILING_ENABLE
};
// creating the command queue
queue = clCreateCommandQueueWithProperties(context, device, props, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
// setting the local variables
// (at the same time one of them supposed to be 0 and another to be 1)
read = 0;
write = 1;
// reading the kernel
FILE * f = NULL;
char fileName[18] = "kernel.cl";
f = fopen(fileName, "rb");
if(f == NULL)
{
success = false;
return;
}
// getting the length of the source code for the kernel
fseek(f, 0, SEEK_END);
size_t codeLength = ftell(f);
rewind(f);
char * code = (char *)malloc(codeLength + 1);
if (fread(code, codeLength, 1, f) != 1)
{
fclose(f);
free(code);
success = false;
return;
}
// closing the file and 0-terminating the source code
fclose(f);
code[codeLength] = '\0';
// creating the program
program = clCreateProgramWithSource(context, 1, (const char **)&code, &codeLength, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
// clearing the memory
free(code);
// building the program
ret |= clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// creating the kernel
kernel = clCreateKernel(program, "impactManager", &ret);
// setting the local size of the group the largest possible in order to load all computational units
int numGroups;
ret |= clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numGroups), &numGroups, NULL);
localSize = bodies / numGroups;
// allocating pinned buffers for velocities and positions, and stuck
positionsCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, COORD_DIM * bodies * sizeof(float) , NULL, NULL);
velocitiesCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, VEL_DIM * bodies * sizeof(float) , NULL, NULL);
linesCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 8 * sizeof(float), NULL, NULL);
// get pointers to arrays to operate with the buffers (array map buffers here (to program) as float-arrays)
*positionsCPU = (float *)clEnqueueMapBuffer(queue, positionsCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, COORD_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
*velocitiesCPU = (float *)clEnqueueMapBuffer(queue, velocitiesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, VEL_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
float * linesCPU = (float *)clEnqueueMapBuffer(queue, linesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, 8 * sizeof(float), 0, NULL, NULL, NULL);
// initialization of the bodies' positions and velocities, and stuck
initBodies(*positionsCPU, *velocitiesCPU);
initLines(twoLines, linesCPU);
// unmapping the pointers to arrays (invalidates array pointers)
clEnqueueUnmapMemObject(queue, positionsCPUBuffer, *positionsCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, velocitiesCPUBuffer, *velocitiesCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, linesCPUBuffer, linesCPU, 0, NULL, NULL);
// allocate two arrays on GPU for positions and velocities
for (int i = 0; i < 2; ++i) {
positionsGPU[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, COORD_DIM * bodies * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, positionsGPU[i], CL_TRUE, 0, COORD_DIM * bodies * sizeof(float), *positionsCPU, 0, NULL, NULL);
velocitiesGPU[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, VEL_DIM * bodies * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, velocitiesGPU[i], CL_TRUE, 0, VEL_DIM * bodies * sizeof(float), *velocitiesCPU, 0, NULL, NULL);
}
linesGPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 8 * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, linesGPU, CL_TRUE, 0, 8 * sizeof(float), linesCPU, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
}
void FallBodyClass::initLines(IN TwoLines l, OUT float *linesCPU)
{
linesCPU[0] = l.a1;
linesCPU[1] = l.b1;
linesCPU[2] = l.R1.x;
linesCPU[3] = l.R1.y;
linesCPU[4] = l.a2;
linesCPU[5] = l.b2;
linesCPU[6] = l.R2.x;
linesCPU[7] = l.R2.y;
}
// initialization of the bodies' positions and velocities
void FallBodyClass::initBodies(float * positionsCPU, float * velocitiesCPU)
{
float scale = 0.20f;
// initialization of the memory
memset(positionsCPU, 0, COORD_DIM * bodies * sizeof(float));
memset(velocitiesCPU, 0, VEL_DIM * bodies * sizeof(float));
// for the randomization
srand((unsigned int)time(NULL));
for (int i = 0; i < bodies; i++)
{
positionsCPU[COORD_DIM * i] = 1.8*((rand() / (float)RAND_MAX) - 0.5); //x axis
positionsCPU[COORD_DIM * i + 1] = 0.9; //y axis
positionsCPU[COORD_DIM * i + 2] = 0.0f; //z axis
positionsCPU[COORD_DIM * i + 3] = 0.0f; // stuck variable
// velocities are zeros
velocitiesCPU[VEL_DIM* i] = 0.0;
velocitiesCPU[VEL_DIM* i + 1] = -2 * (rand() / (float)RAND_MAX);
velocitiesCPU[VEL_DIM* i + 2] = 0.0;
}
}
// updating the bodies' positions and velocities. Stuck is updated inside too
void FallBodyClass::update(float dt, float * positionsCPU, float * velocitiesCPU, bool & success)
{
cl_int error = CL_SUCCESS;
size_t global_work_size;
size_t local_work_size;
success = true;
if (localSize > bodies)
localSize = bodies;
local_work_size = localSize;
global_work_size = bodies;
// passing the arguments
// we write the new positions and velocities and read the previous ones
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&positionsGPU[write]);
error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&velocitiesGPU[write]);
error |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&positionsGPU[read]);
error |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&velocitiesGPU[read]);
error |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&dt);
error |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&linesGPU);
// just swap read and write in order not to copy the arrays
int temp;
temp = write;
write = read;
read = temp;
// executing the kernel
error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
// synchronization
clFinish(queue);
// asynchronously reading the updated values
error |= clEnqueueReadBuffer(queue, positionsGPU[read], CL_FALSE, 0, COORD_DIM * bodies * sizeof(float), positionsCPU, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
success = false;
}
error |= clEnqueueReadBuffer(queue, velocitiesGPU[read], CL_FALSE, 0, VEL_DIM * bodies * sizeof(float), velocitiesCPU, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
success = false;
}
///////////
bool toReboot = positionsCPU[3]; //fourth index of the [0] first element
//bool toReboot = false;
////////////
if (toReboot) {
positionsCPU = (float *)clEnqueueMapBuffer(queue, positionsCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, COORD_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
velocitiesCPU = (float *)clEnqueueMapBuffer(queue, velocitiesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, VEL_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
initBodies(positionsCPU, velocitiesCPU);
// unmapping the pointers
clEnqueueUnmapMemObject(queue, positionsCPUBuffer, positionsCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, velocitiesCPUBuffer, velocitiesCPU, 0, NULL, NULL);
//update values on GPU side
error |= clEnqueueWriteBuffer(queue, positionsGPU[read], CL_TRUE, 0, COORD_DIM * bodies * sizeof(float), positionsCPU, 0, NULL, NULL);
error |= clEnqueueWriteBuffer(queue, velocitiesGPU[read], CL_TRUE, 0, VEL_DIM * bodies * sizeof(float), velocitiesCPU, 0, NULL, NULL);
}
return;
}
FallBodyClass::~FallBodyClass(void)
{
// synchronization (if something has to be done)
clFinish(queue);
// releasing all objects
clReleaseMemObject(linesGPU);
clReleaseMemObject(linesCPUBuffer);
clReleaseMemObject(velocitiesGPU[0]);
clReleaseMemObject(velocitiesGPU[1]);
clReleaseMemObject(positionsGPU[0]);
clReleaseMemObject(positionsGPU[1]);
clReleaseMemObject(positionsCPUBuffer);
clReleaseMemObject(velocitiesCPUBuffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
}

Not all work-items being used opencl

so I'm able to compile and execute my kernel, the problem is that only two work-items are being used. I'm basically trying to fill up a float array[8] with {0,1,2,3,4,5,6,7}. So this is a very simple hello world application. Bellow is my kernel.
// Highly simplified to demonstrate
__kernel void rnd_float32_matrix (
__global float * res
) {
uint idx = get_global_id(0);
res[idx] = idx;
}
I then create and execute the kernel with the following code...
// Some more code
cl::Program program(context, sources, &err);
program.build(devices, NULL, NULL, NULL);
cl::Kernel kernel(program, "rnd_float32_matrix", &err);
kernel.setArg(0, src_d);
cl::CommandQueue queue(context, devices[0], 0, &err);
cl::Event event;
err = queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange(8),
// I've tried cl::NDRange(8) as well
cl::NDRange(1),
NULL,
&event
);
event.wait();
err = queue.enqueueReadBuffer(
// This is:
// cl::Buffer src_d(
// context,
// CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
// mem_size,
// src_h,
// &err);
src_d,
CL_TRUE,
0,
8,
// This is float * src_h = new float[8];
src_h);
for(int i = 0; i < 8; i ++) {
std::cout << src_h[i] << std::endl;
}
I may not show it in the code, but I also do select a gpu device and using context.getInfo(..) it shows I'm using my NVidia GTX 770M card which shows 1024, 1024, 64 work-items available in dimensions 0, 1 and 2. When this array prints I keep getting... 0, 1, 0, 0, 0, 0, 0, 0. I've also tried setting res[idx] = 5, and I get... 5, 5, 0, 0, 0, 0, 0, 0. So it seems that only two give work-items are actually being used. What am I doing wrong?
Your command to read the data back from the device is only reading 8 bytes, which is two floats:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8, // <- This is the number of bytes, not the number of elements!
// This is float * src_h = new float[8];
src_h);
To read 8 floats, you would need to do this:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8 * sizeof(cl_float),
// This is float * src_h = new float[8];
src_h);

CL_MEM_ALLOC_HOST_PTR slower than CL_MEM_USE_HOST_PTR

So I've been playing around with OpenCL for a bit now and testing the speeds of memory transfer between host and device.
I was using Intel OpenCL SDK and running on the Intel i5 Processor with integrated graphics.
I then discovered clEnqueueMapBuffer instead of clEnqueueWriteBuffer which turned out to be faster by almost 10 times when using pinned memory like so:
int amt = 16*1024*1024;
...
k_a = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*amt, a, NULL);
k_b = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*amt, b, NULL);
k_c = clCreateBuffer(context,CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(int)*amt, ret, NULL);
int* map_a = (int*) clEnqueueMapBuffer(c_q, k_a, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_b = (int*) clEnqueueMapBuffer(c_q, k_b, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_c = (int*) clEnqueueMapBuffer(c_q, k_c, CL_TRUE, CL_MAP_WRITE, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
clFinish(c_q);
Where a b and ret are 128 bit aligned int arrays.
The time came out to about 22.026186 ms, compared to 198.604528 ms using clEnqueueWriteBuffer
However, when I changed my code to
k_a = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, NULL);
k_b = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, NULL);
k_c = clCreateBuffer(context,CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, NULL);
int* map_a = (int*)clEnqueueMapBuffer(c_q, k_a, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_b = (int*)clEnqueueMapBuffer(c_q, k_b, CL_TRUE, CL_MAP_READ, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
int* map_c = (int*)clEnqueueMapBuffer(c_q, k_c, CL_TRUE, CL_MAP_WRITE, 0, sizeof(int)*amt, 0, NULL, NULL, &error);
/** initiate map_a and map_b **/
the time increases to 91.350065 ms
What could be the problem? Or is it a problem at all?
EDIT:
This is how I initialize the arrays in the second code:
for (int i = 0; i < amt; i++)
{
map_a[i] = i;
map_b[i] = i;
}
And now that I check, map_a and map_b do contain the right elements at the end of the program, but map_c contains all 0's. I did this:
clEnqueueUnmapMemObject(c_q, k_a, map_a, 0, NULL, NULL);
clEnqueueUnmapMemObject(c_q, k_b, map_b, 0, NULL, NULL);
clEnqueueUnmapMemObject(c_q, k_c, map_c, 0, NULL, NULL);
and my kernel is just
__kernel void test(__global int* a, __global int* b, __global int* c)
{
int i = get_global_id(0);
c[i] = a[i] + b[i];
}
My understanding is that CL_MEM_ALLOC_HOST_PTR allocates but doesn't copy. Does the 2nd block of code actually get any data onto the device?
Also, clCreateBuffer when used with CL_MEM_USE_HOST_PTR and CL_MEM_COPY_HOST_PTR shouldn't require clEnqueueWrite, as the buffer is created with the memory pointed to by void *host_ptr.
Using "pinned" memory in OpenCL should be a process like:
int amt = 16*1024*1024;
int Array[] = new int[amt];
int Error = 0;
//Note, since we are using NULL for the data pointer, we HAVE to use CL_MEM_ALLOC_HOST_PTR
//This allocates memory on the devices
cl_mem B1 = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(int)*amt, NULL, &Error);
//Map the Device memory to host memory, aka pinning it
int *host_ptr = clEnqueueMapBuffer(queue, B1, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, sizeof(int)*amt, 0, NULL, NULL, &Error);
//Copy from host memory to pinned host memory which copies to the card automatically`
memcpy(host_ptr, Array, sizeof(int)*amt);
//Call your kernel and everything else and memcpy back the pinned back to host when
//you are done
Edit: One final thing you can do to speed up the program is to not make the memory read/write blocking by using CL_FALSE instead of CL_TRUE. Just make sure to call clFinish() before data gets copied back to the host so that the command queue is emptied and all commands are processed.
Source: OpenCL In Action
With the right combination of flags, you should be able to achieve "zero copy" (i.e. very fast) map/unmap on Intel Integrated Graphics since there is no need for a "CPU to GPU" copy since they both use the same memory (that's what the "Integrated" means). Read the Intel OpenCL Optimization Guide section on memory.