Euclidean distance using OpenCL - c++

I am trying to compute the euclidean distance of a set of 5D points (pixels) to a 5D single point (center) and store in another result vector, I want to use vector indexing to store all info in a single vector so for the ith pixel, the 5 dimensions are (5i) , (5i+1) , ...
I am new to OpenCL and I just edited a sample code on the internet for my own intentions. The theory is right but the code doesn't show the right answers !
Here is the kernel:
//d_kernel.cl
__kernel void distance_kernel(__global double *pixelInfo,
__global double *clusterCentres,
__global double *distanceFromClusterCentre)
{
int index = get_global_id(0);
int d, dl, da, db, dx, dy;
dl = pixelInfo[5 * index] - clusterCentres[0];
dl = dl * dl;
da = pixelInfo[5 * index + 1] - clusterCentres[1];
da = da * da;
db = pixelInfo[5 * index + 2] - clusterCentres[2];
db = db * db;
dx = pixelInfo[5 * index + 3] - clusterCentres[3];
dx = dx * dx;
dy = pixelInfo[5 * index + 4] - clusterCentres[4];
dy = dy * dy;
distanceFromClusterCentre[index] = dx + dy + dl + da + db;
}
and here is the HOST CODE:
#include <iostream>
#include <CL/cl.h>
#include <vector>
using namespace std;
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, char **argv)
{
// Create the two input vectors
int i;
const int pixelsNumber = 1024;
const int clustersNumber = 1;
std::vector<double> pixelInfo;
pixelInfo.resize(5 * pixelsNumber);
std::fill(pixelInfo.begin(), pixelInfo.end(), 500);
std::vector<double> clusterCentres;
clusterCentres.resize(5 * clustersNumber);
std::fill(clusterCentres.begin(), clusterCentres.end(), 200);
std::vector<double> distanceFromClusterCentre;
distanceFromClusterCentre.resize(pixelsNumber);
std::fill(distanceFromClusterCentre.begin(), distanceFromClusterCentre.end(), 0);
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("d_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * pixelsNumber * sizeof(int), NULL, &ret);
cl_mem clusterCentres_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * clustersNumber * sizeof(int), NULL, &ret);
cl_mem distanceFromClusterCentre_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
pixelsNumber * sizeof(int), NULL, &ret);
// Copy the vectors to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, pixelInfo_mem, CL_TRUE, 0,
5 * pixelsNumber * sizeof(int), pixelInfo.data(), 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, clusterCentres_mem, CL_TRUE, 0,
5 * clustersNumber * sizeof(int), clusterCentres.data(), 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&pixelInfo_mem);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&clusterCentres_mem);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&distanceFromClusterCentre_mem);
// Execute the OpenCL kernel on the list
size_t global_item_size = pixelsNumber; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer result on the device to the local vector result
ret = clEnqueueReadBuffer(command_queue, distanceFromClusterCentre_mem, CL_TRUE, 0,
pixelsNumber * sizeof(int), distanceFromClusterCentre.data(), 0, NULL, NULL);
// Display the result to the screen
for (i = 0; i < pixelsNumber; i++)
{
cout << "Pixel " << i << ": " << distanceFromClusterCentre[i] << endl;
//system("PAUSE");
}
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(pixelInfo_mem);
ret = clReleaseMemObject(clusterCentres_mem);
ret = clReleaseMemObject(distanceFromClusterCentre_mem);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(pixelInfo.data());
free(clusterCentres.data());
free(distanceFromClusterCentre.data());
system("PAUSE");
return 0;
}
and a part of the RESULT is:
.
.
.
Pixel 501: -1.11874e+306
Pixel 502: -1.16263e+306
Pixel 503: -1.07485e+306
Pixel 504: -1.03079e+306
Pixel 505: -9.42843e+305
Pixel 506: -9.86903e+305
Pixel 507: -8.98954e+305
Pixel 508: -9.86903e+305
Pixel 509: -8.98954e+305
Pixel 510: -9.43014e+305
Press any key to continue . . .
Pixel 511: -8.55065e+305
Pixel 512: 0
Pixel 513: 0
Pixel 514: 0
Pixel 515: 0
Pixel 516: 0
Pixel 517: 0
Pixel 518: 0
Pixel 519: 0
Pixel 520: 0
.
.
.
after index 511 the rest of the vector is zero !

You created your vectors of double's and then you treat them as there were ints (created buffer for ints, writing data to int buffers and reading back results as there were ints). To avoid such mistakes you could write your code this way:
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, pixelInfo.size() * sizeof(pixelInfo[0]), NULL, &ret);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Related

OpenCL alignment issue

I want to fill an array of glm::vec3 with an OpenCL kernel.
All I want to do is fill the array with [1.0, 2.0, 3.0].
So upon success I should get the triplet repeated 256 times.
[1.0, 2.0, 3.0][1.0, 2.0, 3.0][1.0, 2.0, 3.0] ... [1.0, 2.0, 3.0]
However the result looks like this
[1.0, 2.0, 2.0][2.0, 2.0, 2.0] ... [2.0, 2.0, 2.0]
Why?
Here is the code for the kernel
__kernel void fill_array(__global float *output_values)
{
int i = get_global_id(0);
float3 pos = (float3)(1.0, 2.0, 3.0);
vstore3(pos, 0, &(output_values[i]));
}
And here is the code to run it
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include "glm/glm.hpp"
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void)
{
std::vector<glm::vec3> values;
values.resize(256);
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("E:/Dev/fill_array_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem output_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, values.size() * sizeof(glm::vec3), NULL, &ret);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if(ret != CL_SUCCESS)
{
cl_build_status build_status;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL);
size_t ret_val_size;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
char *build_log = (char*)malloc(sizeof(char)*(ret_val_size + 1));
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
build_log[ret_val_size] = '\0';
printf("%s\n", build_log);
free(build_log);
return -1;
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "fill_array", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&output_mem);
// Execute the OpenCL kernel on the list
size_t global_item_size = values.size(); // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
ret = clEnqueueReadBuffer(command_queue, output_mem, CL_TRUE, 0, values.size() * sizeof(glm::vec3), values.data(), 0, NULL, NULL);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(output_mem);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
return 0;
}
I was misusing the vstore function.
I should have used the 2nd parameter to specify the index in the array.
https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/vstoren.html
__kernel void fill_array(__global float *output_values)
{
int i = get_global_id(0);
float3 pos = (float3)(1.0, 2.0, 3.0);
vstore3(pos, i, output_values);
}

Model class fails to initialize in DirectX 10

My project uses DirectX 10 and some of its boilerplate to render a scene, however, it crashes with an error message "Could not initialize the model object." As far as I understand, making it up to this point means that, at the very least, the model has been successfully created, so the error must be in one of the files below, which is fortunate as the most difficult tasks are handled by the FallBodyClass.cpp that hosts OpenCL API interactions. If needed, I can try attaching parts of it in a later edit.
During debug, my IDE shows that all components of m_Model (m_vertexBuffer, m_indexBuffer etc) are shown as with _vfptr . I do not know what to make of it, but it does seem to confirm that modelclass.cpp is the point of failure.
graphicsclass.cpp
GraphicsClass::GraphicsClass()
{
m_Direct3D = 0;
m_Model = 0;
m_ColorShader = 0;
m_bodies = BODIES;
}
GraphicsClass::GraphicsClass(const GraphicsClass& other)
{}
GraphicsClass::~GraphicsClass()
{}
bool GraphicsClass::Initialize(int screenWidth, int screenHeight, HWND hwnd)
{
bool result;
// Create the Direct3D object.
m_Direct3D = new D3DClass;
if (!m_Direct3D)
{
return false;
}
// Initialize the Direct3D object.
result = m_Direct3D->Initialize(screenWidth, screenHeight, VSYNC_ENABLED, hwnd, FULL_SCREEN, SCREEN_DEPTH, SCREEN_NEAR);
if (!result)
{
MessageBox(hwnd, L"Could not initialize Direct3D", L"Error", MB_OK);
return false;
}
// Create the model object.
m_Model = new ModelClass(m_bodies);
if (!m_Model)
{
return false;
}
// Initialize the model object.
result = m_Model->Initialize(m_Direct3D->GetDevice());
if (!result)
{
MessageBox(hwnd, L"Could not initialize the model object.", L"Error", MB_OK);
return false;
}
modelclass.cpp
ModelClass::ModelClass(int bodies)
{
m_vertexBuffer = 0;
m_indexBuffer = 0;
m_positions = 0;
m_velocities = 0;
m_bodySystem = 0;
m_bodies = bodies;
}
ModelClass::ModelClass(const ModelClass& other)
{}
ModelClass::~ModelClass()
{}
bool ModelClass::Initialize(ID3D10Device* device)
{
bool result;
TwoLines twoLinesConstants = CalculateLinesConstants(M_PI_4);
m_positions = new float[COORD_DIM * m_bodies];
m_velocities = new float[VEL_DIM * m_bodies];
m_bodySystem = new class FallBodyClass(m_bodies, &m_positions, &m_velocities, twoLinesConstants, result);
if (!result) {
return false;
}
// Initialize the vertex and index buffer that hold the geometry for the triangle.
result = InitializeBuffers(device, twoLinesConstants);
if(!result)
{
return false;
}
return true;
}
FallBodyclass.cpp
FallBodyClass::FallBodyClass(int bodies, float ** positionsCPU, float ** velocitiesCPU, TwoLines twoLines, bool & success)
:bodies(bodies)
{
cl_int ret;
// getting the first available platform
cl_platform_id clPlatformID[2];
cl_platform_id GPUplatform;
cl_uint num_platforms;
//char str[1024];
ret = clGetPlatformIDs(2, clPlatformID, &num_platforms);
GPUplatform = clPlatformID[0]; //choose GPU platform
//error |= clGetPlatformInfo(GPUplatform, CL_PLATFORM_NAME, 0, NULL, NULL);
//clGetPlatformInfo(GPUplatform, CL_PLATFORM_VENDOR, sizeof(str), str, NULL);
// getting the first GPU device
ret |= clGetDeviceIDs(GPUplatform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
//clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(str), str, NULL);
// creating the context
context = clCreateContext(0, 1, &device, NULL, NULL, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
cl_queue_properties props[] = {
CL_QUEUE_PROFILING_ENABLE
};
// creating the command queue
queue = clCreateCommandQueueWithProperties(context, device, props, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
// setting the local variables
// (at the same time one of them supposed to be 0 and another to be 1)
read = 0;
write = 1;
// reading the kernel
FILE * f = NULL;
char fileName[18] = "kernel.cl";
f = fopen(fileName, "rb");
if(f == NULL)
{
success = false;
return;
}
// getting the length of the source code for the kernel
fseek(f, 0, SEEK_END);
size_t codeLength = ftell(f);
rewind(f);
char * code = (char *)malloc(codeLength + 1);
if (fread(code, codeLength, 1, f) != 1)
{
fclose(f);
free(code);
success = false;
return;
}
// closing the file and 0-terminating the source code
fclose(f);
code[codeLength] = '\0';
// creating the program
program = clCreateProgramWithSource(context, 1, (const char **)&code, &codeLength, &ret);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
// clearing the memory
free(code);
// building the program
ret |= clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
// creating the kernel
kernel = clCreateKernel(program, "impactManager", &ret);
// setting the local size of the group the largest possible in order to load all computational units
int numGroups;
ret |= clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numGroups), &numGroups, NULL);
localSize = bodies / numGroups;
// allocating pinned buffers for velocities and positions, and stuck
positionsCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, COORD_DIM * bodies * sizeof(float) , NULL, NULL);
velocitiesCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, VEL_DIM * bodies * sizeof(float) , NULL, NULL);
linesCPUBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 8 * sizeof(float), NULL, NULL);
// get pointers to arrays to operate with the buffers (array map buffers here (to program) as float-arrays)
*positionsCPU = (float *)clEnqueueMapBuffer(queue, positionsCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, COORD_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
*velocitiesCPU = (float *)clEnqueueMapBuffer(queue, velocitiesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, VEL_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
float * linesCPU = (float *)clEnqueueMapBuffer(queue, linesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, 8 * sizeof(float), 0, NULL, NULL, NULL);
// initialization of the bodies' positions and velocities, and stuck
initBodies(*positionsCPU, *velocitiesCPU);
initLines(twoLines, linesCPU);
// unmapping the pointers to arrays (invalidates array pointers)
clEnqueueUnmapMemObject(queue, positionsCPUBuffer, *positionsCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, velocitiesCPUBuffer, *velocitiesCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, linesCPUBuffer, linesCPU, 0, NULL, NULL);
// allocate two arrays on GPU for positions and velocities
for (int i = 0; i < 2; ++i) {
positionsGPU[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, COORD_DIM * bodies * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, positionsGPU[i], CL_TRUE, 0, COORD_DIM * bodies * sizeof(float), *positionsCPU, 0, NULL, NULL);
velocitiesGPU[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, VEL_DIM * bodies * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, velocitiesGPU[i], CL_TRUE, 0, VEL_DIM * bodies * sizeof(float), *velocitiesCPU, 0, NULL, NULL);
}
linesGPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 8 * sizeof(float), NULL, NULL);
ret |= clEnqueueWriteBuffer(queue, linesGPU, CL_TRUE, 0, 8 * sizeof(float), linesCPU, 0, NULL, NULL);
if (ret != CL_SUCCESS)
{
success = false;
return;
}
}
void FallBodyClass::initLines(IN TwoLines l, OUT float *linesCPU)
{
linesCPU[0] = l.a1;
linesCPU[1] = l.b1;
linesCPU[2] = l.R1.x;
linesCPU[3] = l.R1.y;
linesCPU[4] = l.a2;
linesCPU[5] = l.b2;
linesCPU[6] = l.R2.x;
linesCPU[7] = l.R2.y;
}
// initialization of the bodies' positions and velocities
void FallBodyClass::initBodies(float * positionsCPU, float * velocitiesCPU)
{
float scale = 0.20f;
// initialization of the memory
memset(positionsCPU, 0, COORD_DIM * bodies * sizeof(float));
memset(velocitiesCPU, 0, VEL_DIM * bodies * sizeof(float));
// for the randomization
srand((unsigned int)time(NULL));
for (int i = 0; i < bodies; i++)
{
positionsCPU[COORD_DIM * i] = 1.8*((rand() / (float)RAND_MAX) - 0.5); //x axis
positionsCPU[COORD_DIM * i + 1] = 0.9; //y axis
positionsCPU[COORD_DIM * i + 2] = 0.0f; //z axis
positionsCPU[COORD_DIM * i + 3] = 0.0f; // stuck variable
// velocities are zeros
velocitiesCPU[VEL_DIM* i] = 0.0;
velocitiesCPU[VEL_DIM* i + 1] = -2 * (rand() / (float)RAND_MAX);
velocitiesCPU[VEL_DIM* i + 2] = 0.0;
}
}
// updating the bodies' positions and velocities. Stuck is updated inside too
void FallBodyClass::update(float dt, float * positionsCPU, float * velocitiesCPU, bool & success)
{
cl_int error = CL_SUCCESS;
size_t global_work_size;
size_t local_work_size;
success = true;
if (localSize > bodies)
localSize = bodies;
local_work_size = localSize;
global_work_size = bodies;
// passing the arguments
// we write the new positions and velocities and read the previous ones
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&positionsGPU[write]);
error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&velocitiesGPU[write]);
error |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&positionsGPU[read]);
error |= clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&velocitiesGPU[read]);
error |= clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&dt);
error |= clSetKernelArg(kernel, 5, sizeof(cl_mem), (void *)&linesGPU);
// just swap read and write in order not to copy the arrays
int temp;
temp = write;
write = read;
read = temp;
// executing the kernel
error |= clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
// synchronization
clFinish(queue);
// asynchronously reading the updated values
error |= clEnqueueReadBuffer(queue, positionsGPU[read], CL_FALSE, 0, COORD_DIM * bodies * sizeof(float), positionsCPU, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
success = false;
}
error |= clEnqueueReadBuffer(queue, velocitiesGPU[read], CL_FALSE, 0, VEL_DIM * bodies * sizeof(float), velocitiesCPU, 0, NULL, NULL);
if (error != CL_SUCCESS)
{
success = false;
}
///////////
bool toReboot = positionsCPU[3]; //fourth index of the [0] first element
//bool toReboot = false;
////////////
if (toReboot) {
positionsCPU = (float *)clEnqueueMapBuffer(queue, positionsCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, COORD_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
velocitiesCPU = (float *)clEnqueueMapBuffer(queue, velocitiesCPUBuffer, CL_TRUE, CL_MAP_WRITE, 0, VEL_DIM * bodies * sizeof(float), 0, NULL, NULL, NULL);
initBodies(positionsCPU, velocitiesCPU);
// unmapping the pointers
clEnqueueUnmapMemObject(queue, positionsCPUBuffer, positionsCPU, 0, NULL, NULL);
clEnqueueUnmapMemObject(queue, velocitiesCPUBuffer, velocitiesCPU, 0, NULL, NULL);
//update values on GPU side
error |= clEnqueueWriteBuffer(queue, positionsGPU[read], CL_TRUE, 0, COORD_DIM * bodies * sizeof(float), positionsCPU, 0, NULL, NULL);
error |= clEnqueueWriteBuffer(queue, velocitiesGPU[read], CL_TRUE, 0, VEL_DIM * bodies * sizeof(float), velocitiesCPU, 0, NULL, NULL);
}
return;
}
FallBodyClass::~FallBodyClass(void)
{
// synchronization (if something has to be done)
clFinish(queue);
// releasing all objects
clReleaseMemObject(linesGPU);
clReleaseMemObject(linesCPUBuffer);
clReleaseMemObject(velocitiesGPU[0]);
clReleaseMemObject(velocitiesGPU[1]);
clReleaseMemObject(positionsGPU[0]);
clReleaseMemObject(positionsGPU[1]);
clReleaseMemObject(positionsCPUBuffer);
clReleaseMemObject(velocitiesCPUBuffer);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(queue);
clReleaseContext(context);
}

OpenCL - Kernel method returns unexpected results

I am a beginner at OpenCL. I tried to run a very simple kernel code, adding 1 to each value of vector. Everything runs fine, returns no error code (I checked return value after each step). The source Code :
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_mem memobj , resobj = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_platform_id platform_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret;
size_t work_units_per_kernels;
int input[10] = {1,2,3,4,5,6,7,8,9,10};
int output[10];
int length = 10 ;
FILE *fp;
char fileName[] = "/home/tuan/OpenCLPlayaround/hello.cl";
char *source_str;
size_t source_size;
/* Load the source code containing the kernel*/
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(0x100000);
source_size = fread(source_str,1,0x100000, fp);
fclose(fp);
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
std::cout<<ret<<" code"<<std::endl;
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
std::cout<<ret<<" code"<<std::endl;
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
std::cout<<ret<<" code"<<std::endl;
command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
//Check Concept of memory
memobj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,length * sizeof(int), input, &ret);
resobj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, length * sizeof(int), output, &ret);
std::cout<<ret<<" code"<<std::endl;
program = clCreateProgramWithSource(context,1,(const char**)&source_str, (const size_t*)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "hello", &ret);
ret = clSetKernelArg(kernel,0, sizeof(memobj),(void *)&memobj);
ret = clSetKernelArg(kernel,1, sizeof(resobj),(void *)&resobj);
ret = clEnqueueTask(command_queue, kernel, 0, NULL,NULL);
ret = clEnqueueReadBuffer(command_queue, resobj, CL_TRUE, 0, length* sizeof(int),output, 0, NULL, NULL);
for (int i = 0 ; i <10 ; i++) {
std::cout<<output[i]<<" "<<std::endl;
}
return 0;
The result is somewhat bizarre, while it should be {2,3,4,5,6,7,8,9,10,11} :
2
-16777216
65535
1
-1242789408
32767
4201449
0
2
0
And my kernel :
__kernel void hello(__global int* a, __global int* b)
{
int sam = 0;
int gid = get_global_id(0);
b[gid] = sam + a[gid] +1 ;
}
Can somebody explain why ? Its bursting my head for hours !
clEnqueueTask is equivalent to calling clEnqueueNDRangeKernel with work_dim = 1, global_work_offset = NULL, global_work_size[0] set to 1, and local_work_size[0] set to 1.
so use clEnqueueNDRangeKernel.

clEnqueueWriteBuffer writes wrong data into VRAM

I have a very curious problem with clEnqueueWriteBuffer. In my current project, I would like to copy ~500 images (1GB) onto the graphics card and average some pixels. The images are stored in one big double* Array (size: width*height*nImages). If I copy 300 images into the VRAM and read it out using clEnqueueReadBuffer, I get exactly what I had stored in RAM:
RAM: 14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
VRAM: 14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
However, if I load more than 350 images, the content of my cl_mem object is corrupt:
RAM:14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
VRAM:-6,27743856220419E+66 -6,27743856220419E+66 -6,27743856220419E+66 -6,27743856220419E+66 -6,27743856220419E+66
I would be very happy if you could help me out!
Here is my code:
private: System::Void button7_Click(System::Object^ sender, System::EventArgs^ e) {
std::string text;
text = StringConvA(maskedTextBox1->Text);
textBox1->Text += "You want a bin size of " + atoi(text.c_str()) + ". You have "+ nforegroundImages+" images.\r\n";
binWidth = atoi(text.c_str());
nbins = (int)ceil((double)nforegroundImages / (double)binWidth);
textBox1->Text += "That is going to give you "+nbins+" bins\r\n";
//create context and cmd_queue
context = clCreateContext(NULL, nDevices, &deviceID[0], NULL, NULL, &err);
cmd_queue = clCreateCommandQueue(context, deviceID[0], NULL, &err);
//allocate result memory
//each result image will have width*height double entries. res_im is an array of pointer to double.
res_im = (double*)malloc(width*height*sizeof(double)*nbins);
cl_mem imageData_mem, result_mem, nWavenumber_mem, binSize_mem, imageSizeInPixels_mem, nbins_mem;
imageData_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, width * height * sizeof(double)*nforegroundImages, NULL, NULL);
result_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, width * height * sizeof(double)*nbins, NULL, NULL);
nWavenumber_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
binSize_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
imageSizeInPixels_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
nbins_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
clFinish(cmd_queue);
int imageSizeInPixels = width*height;
err = clEnqueueWriteBuffer(cmd_queue, imageData_mem, CL_TRUE, 0, width*height*sizeof(double)*nforegroundImages, (void*)images, 0, NULL, NULL); //this is where the images are copied into VRAM. If nforegroundImages>300, the data in VRAM is wrong, otherwise it is the same as in the images array
err = clEnqueueWriteBuffer(cmd_queue, nWavenumber_mem, CL_TRUE, 0, sizeof(int), (void*)&nforegroundImages, 0, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, binSize_mem, CL_TRUE, 0, sizeof(int), (void*)&binWidth, 0, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, imageSizeInPixels_mem, CL_TRUE, 0, sizeof(int), (void*)&imageSizeInPixels, 0, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, nbins_mem, CL_TRUE, 0, sizeof(int), (void*)&nbins, 0, NULL, NULL);
clFinish(cmd_queue);
//read the content of imageData_mem and store it in test array
double * test = (double*)malloc(width*height*sizeof(double)*nforegroundImages);
err = clEnqueueReadBuffer(cmd_queue, imageData_mem, CL_TRUE, 0, width*height*sizeof(double)*nforegroundImages,
test, 0, NULL, NULL);
clFinish(cmd_queue);
//compare original value from the images array to the value retrieved from the VRAM
textBox1->Text += images[1] + "\t" + images[1] + "\t" + images[10] + "\t" + images[100] + "\t" + images[1000] + "\t\r\n"; //original data
textBox1->Text += test[1] + "\t" + test[1] + "\t" + test[10] + "\t" + test[100] + "\t" + test[1000] + "\t\r\n"; //retrieved from imageData_mem
free(test);
//build the program from the source file and print the program build log
cl_program program[2];
cl_kernel kernel[2];
const char * filename = "addKernel.c";
char *program_source = load_program_source(filename);
program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
NULL, &err);
if (err == CL_OUT_OF_HOST_MEMORY){
textBox1->Text += "Error: out of Host Memory!\r\n";
}
else if (err == CL_INVALID_CONTEXT){
textBox1->Text += "Error: invalid Context!\r\n";
}
else if (err == CL_INVALID_VALUE){
textBox1->Text += "Error: invalid Value!\r\n";
}
err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
textBox1->Text += "Program build error: " + err + "\r\n";
cl_build_status status;
size_t logSize;
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
char* programLog;
programLog = (char*)calloc(logSize + 1, sizeof(char));
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_LOG, logSize + 1, programLog, NULL);
this->textBox1->Text += "Program build info: error=" + err + ", status=" + status + ", programLog:\r\n" + *programLog + "\r\n" + "In case of an error please make sure that openCL has been initialized\r\n";
kernel[0] = clCreateKernel(program[0], "filterSpectrum", &err);
//(__global double *imageData, __global double *result, __constant int *nWavenumbers, __constant int *binSize, __constant int *imageSizeInPixels,__constant int * nbins)
// Now setup the arguments to our kernel
err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &imageData_mem);
err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &result_mem);
err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &nWavenumber_mem);
err |= clSetKernelArg(kernel[0], 3, sizeof(cl_mem), &binSize_mem);
err |= clSetKernelArg(kernel[0], 4, sizeof(cl_mem), &imageSizeInPixels_mem);
err |= clSetKernelArg(kernel[0], 5, sizeof(cl_mem), &nbins_mem);
size_t local_work_size = 32;
// Run the calculation by enqueuing it and forcing the
// command queue to complete the task
size_t global_work_size = width*height;
err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL,&global_work_size, &local_work_size, 0, NULL, NULL);
clFinish(cmd_queue);
// Once finished read back the results from the answer
// array into the results array
err = clEnqueueReadBuffer(cmd_queue, result_mem, CL_TRUE, 0, width*height*sizeof(double)*nbins,
res_im, 0, NULL, NULL);
clFinish(cmd_queue);
textBox1->Text += "result values " + res_im[1] + "\t" + res_im[100] + "\t" + res_im[1000] + "\t" + res_im[10000] + "\t" + res_im[100000] + "\t" + res_im[1000000] + "\r\n";
hScrollBar2->Maximum = nbins+3;
clReleaseMemObject(imageSizeInPixels_mem);
clReleaseMemObject(imageData_mem);
clReleaseMemObject(result_mem);
clReleaseMemObject(nWavenumber_mem);
clReleaseMemObject(binSize_mem);
clReleaseMemObject(nbins_mem);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
}
You are most likely requesting more memory than the driver will allow in a single allocation. It looks like you aren't checking most of the error codes that the OpenCL runtime functions return; doing this makes it much easier to diagnose problems with OpenCL programs. You really should do this for every API call.
You can find out what the largest single memory allocation your device supports is with the following code snippet:
cl_ulong maxMemAlloc;
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL);
textBox1->Text += "Maximum memory allocation size is " + maxMemAlloc + " bytes\r\n";
It's often the case that the largest memory allocation is much less than the total size of the GPU memory. The OpenCL specification only requires that it is at least 1/4 of the maximum size, or at least 128 MB.

Reading wrong data on OpenCL

I faced with the problem that the kernel writes data in wrong place or host reads data incorrectly sometimes. I write the same data (index at which I write the data) to two global arrays with different types. To ensure that the index is corrent are used the global counter which incremented by means of atom_inc. The problem occures when data are read from second array on the host.
For instance:
.....
output array index: 442: (output1 value:442.0000 output2 value:442)
output array index: 443: (output1 value:443.0000 output2 value:443)
output array index: 444: (output1 value:444.0000 output2 value:444)
output array index: 445: (output1 value:445.0000 output2 value:445)
output array index: 446: (output1 value:446.0000 output2 value:1152892928)
output array index: 447: (output1 value:447.0000 output2 value:447)
output array index: 448: (output1 value:448.0000 output2 value:1152909312)
output array index: 449: (output1 value:449.0000 output2 value:1152917504)
output array index: 450: (output1 value:450.0000 output2 value:1152925696)
......
As you can see at indicies 446, 448, 449 and 450+ output2 contains wrong values. What can be the possible reason of this?
Device: ATI Radeon HD5750
Code sample:
#include <stdio.h>
#include <math.h>
#include <OpenCL/OpenCL.h>
// wtf example
const char *programSource =
"__kernel void kernel1(__global uint *counter,\n" \
"__global float *weights,\n" \
"__global uint *weights_pos)\n" \
"{\n"\
"const uint global_size = get_global_size(0);\n" \
"const uint global_id = get_global_id(0);\n" \
"uint local_id = get_local_id(0);\n" \
"if(global_id == 0) {\n" \
"counter[5] = 0; // set index of pos in weights to zero\n" \
"}\n" \
"uint insert_index = atom_inc(&counter[5]);\n" \
"weights[insert_index] = insert_index;\n" \
"weights_pos[insert_index] = insert_index;\n" \
"}";
void art_process_sinogram(const char* tiff_filename,
const float *angles2,
const unsigned int n_angles2,
const unsigned int n_ray2s,
const float distanc2e)
{
/******************************
* OPENCL ENVIRONMENT
*/
cl_int status;
cl_uint numPlatforms = 0;
cl_platform_id *platforms = NULL;
cl_device_id device_id;
//discover platforms
status = clGetPlatformIDs(0, NULL, &numPlatforms);
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
//discover devices
cl_uint numDevices = 0;
cl_device_id *devices = NULL;
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices);
devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id));
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL);
device_id = devices[1];
//create context
cl_context context = NULL;
context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status);
cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status);
clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
cl_kernel kernel_weights = clCreateKernel(program, "kernel1", &status);
//create queue
cl_command_queue command_queue1 = clCreateCommandQueue(context, device_id, 0, &status);
/******************************
* HARDWARE PARAMETERS
*/
cl_uint wavefronts_per_SIMD = 7;
size_t global_work_size;
size_t local_work_size = 64;
cl_uint max_compute_units;
clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL);
size_t wg_count = max_compute_units * wavefronts_per_SIMD;
global_work_size = wg_count * local_work_size;
/**************************** DATA PART *************************************/
size_t w_portion_size = 768 * sizeof(cl_float);
size_t w_pos_portion_size = 768 * sizeof(cl_uint);
size_t counters_data_size = 6 * sizeof(cl_uint);
cl_uint counters_data[6];
counters_data[0] = 1;
counters_data[1] = 2; // max number of the cells intersected by the ray
counters_data[2] = 3;
counters_data[3] = 4;
counters_data[4] = 5; // same to the number of rays
counters_data[5] = 0; // counter inside kernel
/*****************
* Main buffers
*/
cl_mem weights1_buffer = clCreateBuffer(context,
CL_MEM_READ_WRITE,
w_portion_size,
NULL,
NULL);
cl_mem weights_pos1_buffer = clCreateBuffer(context,
CL_MEM_READ_WRITE,
w_pos_portion_size,
NULL,
NULL);
/*****************
* Supplement buffers (constant)
*/
cl_mem counters_data_buffer = clCreateBuffer(context,
CL_MEM_READ_ONLY,
counters_data_size,
NULL,
&status);
cl_event supplement_buffer_ready[1];
status = clEnqueueWriteBuffer(command_queue1,
counters_data_buffer,
CL_FALSE,
0,
counters_data_size,
counters_data,
0,
NULL,
&supplement_buffer_ready[0]);
status = clSetKernelArg(kernel_weights, 0, sizeof(void *), (void *)&counters_data_buffer);
status = clSetKernelArg(kernel_weights, 1, sizeof(void *), (void *)&weights1_buffer);
status = clSetKernelArg(kernel_weights, 2, sizeof(void *), (void *)&weights_pos1_buffer);
status = clEnqueueNDRangeKernel(command_queue1,
kernel_weights,
1, // work dimensional 1D, 2D, 3D
NULL, // offset
&global_work_size, // total number of WI
&local_work_size, // nomber of WI in WG
1, // num events in wait list
supplement_buffer_ready, // event wait list
NULL); // event
clFinish(command_queue1);
cl_float *output1 = (cl_float *) clEnqueueMapBuffer(command_queue1,
weights1_buffer,//*pmain_weights_buffer,
CL_TRUE,
CL_MAP_READ,
0,
w_portion_size,
0, NULL, NULL, NULL);
cl_uint *output2 = malloc(w_portion_size);
status = clEnqueueReadBuffer(command_queue1, weights_pos1_buffer,
CL_TRUE, 0, w_pos_portion_size, output2,
0, NULL, NULL);
clFinish(command_queue1);
for(int i = 0; i < 790; ++i) {
printf("output array index: %d: (output1 value:%.4f \t output2 value:%d) \n", i, output1[i], output2[i]);
}
}
SOLUTION:
The kernel should be looks like (need checking index):
__kernel void k_1(__global uint *counter,
__global uint *weights,
__global uint2 *weights_pos)
{
const uint global_size = get_global_size(0);
const uint global_id = get_global_id(0);
uint local_id = get_local_id(0);
uint insert_index = atom_inc(&counter[5]);
if(insert_index < 768) {
weights[insert_index]= insert_index;
weights_pos[insert_index].x = insert_index;
weights_pos[insert_index].y = insert_index;
}
}
You are messing up with buffer dimensions.
1) Your buffers contains 768 elements each (see initialization of w_portion_size and w_pos_portion_size)
2) Workgroup size on my machine is 896 (see initialization of wg_count)
3) You print out 790 values.
Apart from this, one conceptual error is here:
if(global_id == 0) {
counter[5] = 0; // set index of pos in weights to zero
}
//atomic increments on counter[5]
You can't assume that the first virtual processor will execute this line before the others. You should completely remove this line, since you initialize counter[5] on the host side. (I believe that this is the cause of your problem, but I can't reproduce that).
After fixing these problems your code seems to run fine (intel implementation).
The kernel should be looks like (need checking index):
__kernel void k_1(__global uint *counter,
__global uint *weights,
__global uint2 *weights_pos)
{
const uint global_size = get_global_size(0);
const uint global_id = get_global_id(0);
uint local_id = get_local_id(0);
uint insert_index = atom_inc(&counter[5]);
if(insert_index < 768) {
weights[insert_index]= insert_index;
weights_pos[insert_index].x = insert_index;
weights_pos[insert_index].y = insert_index;
}
}