I just installed clBLAS on my Mac (Monterey 12.4) using brew:
brew install clblas
But I can't run the simple example given by the library:
#include <sys/types.h>
#include <stdio.h>
/* Include the clBLAS header. It includes the appropriate OpenCL headers */
#include <clBLAS.h>
/* This example uses predefined matrices and their characteristics
 * for the sake of simplicity.
 */
#define M 4
#define N 3
#define K 5
static const cl_float alpha = 10;
static const cl_float A[M*K] = {
11, 12, 13, 14, 15,
21, 22, 23, 24, 25,
31, 32, 33, 34, 35,
41, 42, 43, 44, 45,
};
static const size_t lda = K; /* i.e. lda = K */
static const cl_float B[K*N] = {
11, 12, 13,
21, 22, 23,
31, 32, 33,
41, 42, 43,
51, 52, 53,
};
static const size_t ldb = N; /* i.e. ldb = N */
static const cl_float beta = 20;
static cl_float C[M*N] = {
11, 12, 13,
21, 22, 23,
31, 32, 33,
41, 42, 43,
};
static const size_t ldc = N; /* i.e. ldc = N */
static cl_float result[M*N];
int main( void )
{
cl_int err;
cl_platform_id platform = 0;
cl_device_id device = 0;
cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
cl_context ctx = 0;
cl_command_queue queue = 0;
cl_mem bufA, bufB, bufC;
cl_event event = NULL;
int ret = 0;
/* Setup OpenCL environment. */
err = clGetPlatformIDs( 1, &platform, NULL );
err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
props[1] = (cl_context_properties)platform;
ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
queue = clCreateCommandQueue( ctx, device, 0, &err );
/* Setup clBLAS */
err = clblasSetup( );
/* Prepare OpenCL memory objects and place matrices inside them. */
bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
NULL, &err );
bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
NULL, &err );
bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
NULL, &err );
err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
M * K * sizeof( *A ), A, 0, NULL, NULL );
err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
K * N * sizeof( *B ), B, 0, NULL, NULL );
err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
M * N * sizeof( *C ), C, 0, NULL, NULL );
/* Call clBLAS SGEMM function: C = alpha*A*B + beta*C (all offsets are 0, so the whole matrices are used). */
err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
M, N, K,
alpha, bufA, 0, lda,
bufB, 0, ldb, beta,
bufC, 0, ldc,
1, &queue, 0, NULL, &event );
/* Wait for calculations to be finished. */
err = clWaitForEvents( 1, &event );
/* Fetch results of calculations from GPU memory. */
err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
M * N * sizeof(*result),
result, 0, NULL, NULL );
/* Release OpenCL memory objects. */
clReleaseMemObject( bufC );
clReleaseMemObject( bufB );
clReleaseMemObject( bufA );
/* Finalize work with clBLAS */
clblasTeardown( );
/* Release OpenCL working objects. */
clReleaseCommandQueue( queue );
clReleaseContext( ctx );
return ret;
}
I get the error:
Undefined symbols for architecture x86_64:
"_clblasSetup", referenced from:
_main in main.o
"_clblasSgemm", referenced from:
_main in main.o
"_clblasTeardown", referenced from:
_main in main.o
ld: symbol(s) not found for architecture x86_64
clang: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [Program] Error 1
I know this is a linker problem but I don't know how to solve it. I'm including OpenCL as I do for other projects:
LDFLAGS=-framework OpenCL
I tried variations along the lines of:
LDFLAGS=-framework OpenCL -framework clblas
But nothing works. Sorry if the question is simple.
EDIT: I found through this question that cblas.h lives in
-framework Accelerate
but there is still no way to find clBLAS.h that way.
I found the solution, which was quite simple.
We only need to add the library in the makefile:
/usr/local/Cellar/clblas/2.12/lib/libclBLAS.dylib
and add the path to the header:
/usr/local/Cellar/clblas/2.12/include/
Paths may vary depending on your installation.
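In makefile terms that boils down to something like this (a sketch; the cellar path and the 2.12 version come from my Homebrew installation and will differ on other setups):
CLBLAS   = /usr/local/Cellar/clblas/2.12
CPPFLAGS = -I$(CLBLAS)/include
LDFLAGS  = -framework OpenCL -L$(CLBLAS)/lib -lclBLAS
Passing the full path to libclBLAS.dylib on the link line works just as well as the -L/-lclBLAS pair.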
I want to fill an array of glm::vec3 with an OpenCL kernel.
All I want to do is fill the array with [1.0, 2.0, 3.0].
So upon success I should get the triplet repeated 256 times.
[1.0, 2.0, 3.0][1.0, 2.0, 3.0][1.0, 2.0, 3.0] ... [1.0, 2.0, 3.0]
However, the result looks like this:
[1.0, 2.0, 2.0][2.0, 2.0, 2.0] ... [2.0, 2.0, 2.0]
Why?
Here is the code for the kernel
__kernel void fill_array(__global float *output_values)
{
int i = get_global_id(0);
float3 pos = (float3)(1.0, 2.0, 3.0);
vstore3(pos, 0, &(output_values[i]));
}
And here is the code to run it
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include "glm/glm.hpp"
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void)
{
std::vector<glm::vec3> values;
values.resize(256);
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("E:/Dev/fill_array_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem output_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, values.size() * sizeof(glm::vec3), NULL, &ret);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if(ret != CL_SUCCESS)
{
cl_build_status build_status;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &build_status, NULL);
size_t ret_val_size;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
char *build_log = (char*)malloc(sizeof(char)*(ret_val_size + 1));
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
build_log[ret_val_size] = '\0';
printf("%s\n", build_log);
free(build_log);
return -1;
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "fill_array", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&output_mem);
// Execute the OpenCL kernel on the list
size_t global_item_size = values.size(); // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the output buffer on the device back into the host vector
ret = clEnqueueReadBuffer(command_queue, output_mem, CL_TRUE, 0, values.size() * sizeof(glm::vec3), values.data(), 0, NULL, NULL);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(output_mem);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
return 0;
}
I was misusing the vstore3 function. I should have used the 2nd parameter to specify the index in the array: vstore3(data, offset, p) writes the three components to p[3*offset], p[3*offset+1] and p[3*offset+2], so with offset 0 every work item was overwriting the first three floats.
https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/vstoren.html
__kernel void fill_array(__global float *output_values)
{
    int i = get_global_id(0);
    float3 pos = (float3)(1.0, 2.0, 3.0);
    // Element offset i: writes floats 3*i, 3*i+1 and 3*i+2.
    vstore3(pos, i, output_values);
}
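With that fix each work item writes its own triplet at float offset 3*i, and the buffer read back into values holds [1.0, 2.0, 3.0] repeated 256 times as expected. (This layout relies on glm::vec3 being three tightly packed floats, which is the case in glm's default configuration.)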
I am new to OpenCL and especially to clBLAS. I would like to implement a simple program that takes as input a matrix A, multiplies it with another matrix B obtaining a matrix C, then multiplies C by another matrix D, and returns the matrix resulting from these operations.
I would like to do it using the clBLAS library while keeping performance as high as possible. How can I do it?
So far I have seen that the common way of doing a single matrix multiplication is the following:
/* Setup clBLAS */
err = clblasSetup( );
/* Prepare OpenCL memory objects and place matrices inside them. */
bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
NULL, &err );
bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
NULL, &err );
bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
NULL, &err );
err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
M * K * sizeof( *A ), A, 0, NULL, NULL );
err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
K * N * sizeof( *B ), B, 0, NULL, NULL );
err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
M * N * sizeof( *C ), C, 0, NULL, NULL );
/* Call clBLAS SGEMM function: C = alpha*A*B + beta*C (all offsets are 0, so the whole matrices are used). */
err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
M, N, K,
alpha, bufA, 0, lda,
bufB, 0, ldb, beta,
bufC, 0, ldc,
1, &queue, 0, NULL, &event );
/* Wait for calculations to be finished. */
err = clWaitForEvents( 1, &event );
/* Fetch results of calculations from GPU memory. */
err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
M * N * sizeof(*result),
result, 0, NULL, NULL );
But this way we keep moving the matrices between host memory and GPU memory, which is a waste. Is there a way of keeping the result in GPU memory?
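Since cl_mem buffers stay on the device until released, the usual answer is to skip the intermediate read-back and feed bufC straight into a second clblasSgemm call. A minimal sketch of the idea, assuming C is MxN and a second input D of size NxP (bufD, bufE, P, D, E and event2 are hypothetical names added for illustration):
/* Upload D once; E is allocated on the device only. */
bufD = clCreateBuffer( ctx, CL_MEM_READ_ONLY, N * P * sizeof(cl_float), NULL, &err );
bufE = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * P * sizeof(cl_float), NULL, &err );
err = clEnqueueWriteBuffer( queue, bufD, CL_TRUE, 0, N * P * sizeof(cl_float), D, 0, NULL, NULL );
/* E = C*D, reading bufC straight from the first gemm; nothing returns to the host in between. */
err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
                   M, P, N,
                   1.0f, bufC, 0, N,  /* result of the first gemm, still on the device */
                   bufD, 0, P, 0.0f,
                   bufE, 0, P,
                   1, &queue, 1, &event, &event2 );
err = clWaitForEvents( 1, &event2 );
/* Only the final result is ever read back. */
err = clEnqueueReadBuffer( queue, bufE, CL_TRUE, 0, M * P * sizeof(cl_float), E, 0, NULL, NULL );
Because both gemms go to the same in-order queue the explicit event dependency is belt and braces; the key point is that bufC never leaves GPU memory.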
I've attempted to parallelize a set of conditional statements, but the output no longer matches the existing implementation after the first loop containing the kernel executes (mapI is an int array of 135 elements, and on the 60th index of the second loop it fails, for a total of 195 accesses to mapI). I've checked that all arrays pass to and from the kernel correctly by comparing them with the existing implementation, and I am baffled as to why this computation does not return the correct result when it does for the first loop of the code. All OpenCL overhead functions return CL_SUCCESS.
cl_mem Q1cl, Q3cl;
Q1cl = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(double)*um->Npts*um->Nel, Q1, &err);
Q3cl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Npts*um->Nel, Q3, &err);
nxcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, nx, &err);
nycl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, ny, &err);
mapIcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*(um->Nfq+um->Ninflow), mapI, &err);
mapFcl = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*(um->Nfq+um->Nfar), mapF, &err);
fluxQ1cl = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, *fluxQ1check, &err);
err = clSetKernelArg(kernel[7], 0, sizeof(cl_mem), (void*)&mapIcl);
err = clSetKernelArg(kernel[7], 1, sizeof(cl_mem), (void*)&nxcl);
err = clSetKernelArg(kernel[7], 2, sizeof(cl_mem), (void*)&nycl);
err = clSetKernelArg(kernel[7], 3, sizeof(cl_mem), (void*)&mapFcl);
err = clSetKernelArg(kernel[7], 4, sizeof(cl_mem), (void*)&Q1cl);
err = clSetKernelArg(kernel[7], 5, sizeof(cl_mem), (void*)&Q3cl);
err = clSetKernelArg(kernel[7], 6, sizeof(cl_mem), (void*)&fluxQ1cl);
globalWorkSize[0] = Ninflow; //Old implementation, now NEL
globalWorkSize[1] = Nfar; //Old implementation, now NFACES*NFQ
err = clEnqueueNDRangeKernel(queue[0], kernel[7], 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);
err = clEnqueueReadBuffer(queue[0], fluxQ1cl, CL_TRUE, 0, sizeof(double)*um->Nel*um->Nfaces*um->Nfq, *fluxQ1check, 0, NULL, NULL);
Kernel Code:
__kernel void umBC(__global int* mapI,
__global double* nx,
__global double* ny,
__global int* mapF,
__global double* Q1,
__global double* Q3,
__global double* fluxQ1)
{
int id, idF;
double e[9][2] = { {0, 0}, {1, 0}, {0, 1}, {-1, 0}, {0, -1}, {1, 1}, {-1, 1}, {-1, -1}, {1, -1}};
double t_1 = 1. / 9.;
double uf = 0.;
double vf = 0.;
int globalx = get_global_id(0);
int globaly = get_global_id(1);
id = mapI[globalx];
fluxQ1[id] = ((e[1][0]*nx[id] + e[1][1]*ny[id]) < 0)*((Q1[id]-Q3[id] -2.*t_1*1.*(e[1][0]*uf+e[1][0]*vf)*3.) * (e[1][0]*nx[id] + e[1][1]*ny[id])) + 0.;
uf = 0.01;
vf = 0.;
idF = mapF[globaly];
fluxQ1[idF] = ((e[1][0]*nx[idF] + e[1][1]*ny[idF]) < 0)*((Q1[idF]-Q3[idF] -2.*t_1*1.*(e[1][0]*uf+e[1][0]*vf)*3.) * (e[1][0]*nx[idF] + e[1][1]*ny[idF])) + 0.;
}
Edit: Below is the working implementation; thank you again doqtor and Lee for your help. To implement this I needed to change the way mapI and mapF work to match the sizing of fluxQ.
__kernel void umBC(__global int* mapI,
                   __global double* nx,
                   __global double* ny,
                   __global int* mapF,
                   __global double* Q1,
                   __global double* Q3,
                   __global double* fluxQ1)
{
    double e[9][2] = { {0, 0}, {1, 0}, {0, 1}, {-1, 0}, {0, -1}, {1, 1}, {-1, 1}, {-1, -1}, {1, -1} };
    double t_1 = 1. / 9.;
    double uf = 0.;
    double vf = 0.;
    int globalx = get_global_id(0); // NEL
    int globaly = get_global_id(1); // NFACES*NFQ
    int idx = globalx * NFACES * NFQ + globaly; // one unique fluxQ1 entry per work item
    if (mapI[idx] != NEL*NFACES*NFQ + 1000) {
        fluxQ1[idx] = 0.0;
        if ((e[1][0]*nx[idx] + e[1][1]*ny[idx]) < 0) {
            fluxQ1[idx] = (Q1[idx] - Q3[idx] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.)
                        * (e[1][0]*nx[idx] + e[1][1]*ny[idx]);
        }
    }
    uf = 0.01;
    vf = 0.;
    if (mapF[idx] != NEL*NFACES*NFQ + 1000) {
        fluxQ1[idx] = 0.0;
        if ((e[1][0]*nx[idx] + e[1][1]*ny[idx]) < 0) {
            fluxQ1[idx] = (Q1[idx] - Q3[idx] - 2.*t_1*1.*(e[1][0]*uf + e[1][0]*vf)*3.)
                        * (e[1][0]*nx[idx] + e[1][1]*ny[idx]);
        }
    }
}
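Note the working kernel uses NEL, NFACES and NFQ as compile-time macros, so they must be defined when the program is built. One way to do that is via clBuildProgram options (a sketch; the question doesn't show its build call, so program and device_id are assumed names), with the launch switched to the new 2D range:
char options[128];
snprintf(options, sizeof(options), "-DNEL=%d -DNFACES=%d -DNFQ=%d",
         um->Nel, um->Nfaces, um->Nfq);
err = clBuildProgram(program, 1, &device_id, options, NULL, NULL);

globalWorkSize[0] = um->Nel;              // NEL
globalWorkSize[1] = um->Nfaces * um->Nfq; // NFACES*NFQ
err = clEnqueueNDRangeKernel(queue[0], kernel[7], 2, NULL, globalWorkSize, NULL, 0, NULL, NULL);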
You get wrong results because your kernel is not implemented correctly. @Lee already said that; I will try to further explain why.
Let's say, for simplicity, that we execute the kernel over a 2x2 global range.
Now the (globalx, globaly) pair for each work item will be:
(0, 0)
(0, 1)
(1, 0)
(1, 1)
so:
work item (0, 0) writes to fluxQ1[mapI[0]] and fluxQ1[mapF[0]]
work item (0, 1) writes to fluxQ1[mapI[0]] and fluxQ1[mapF[1]]
work item (1, 0) writes to fluxQ1[mapI[1]] and fluxQ1[mapF[0]]
work item (1, 1) writes to fluxQ1[mapI[1]] and fluxQ1[mapF[1]]
Assuming that mapI[0], mapI[1], mapF[0] and mapF[1] hold unique ids, two work items still write to the same location in parallel; for example, work items (0, 0) and (0, 1) both write fluxQ1[mapI[0]]. That obviously can't give correct results, and it gets worse if mapI and mapF can hold the same ids.
Some of the results come out correct purely by luck.
Changing the number of global or local work items won't help, and using select or not won't help either. You need to ensure work items don't write to the same location at the same time!
You have multiple work-items in each dimension, yet you are writing to a location indexed by only one:
int globalx = get_global_id(0);
int globaly = get_global_id(1);
id = mapI[globalx];
fluxQ1[id] =
So multiple work-items here will share the same globalx. That means they will read the same id from mapI, and will write to the same location in fluxQ1.
I am trying to compute the Euclidean distance from a set of 5D points (pixels) to a single 5D point (center) and store the results in another vector. I want to use flat vector indexing to keep all the info in a single vector, so for the ith pixel the five dimensions are at indices 5i, 5i+1, ..., 5i+4.
I am new to OpenCL and I just edited a sample code from the internet for my own purposes. The theory is right but the code doesn't produce the right answers!
Here is the kernel:
//d_kernel.cl
__kernel void distance_kernel(__global double *pixelInfo,
__global double *clusterCentres,
__global double *distanceFromClusterCentre)
{
int index = get_global_id(0);
int d, dl, da, db, dx, dy;
dl = pixelInfo[5 * index] - clusterCentres[0];
dl = dl * dl;
da = pixelInfo[5 * index + 1] - clusterCentres[1];
da = da * da;
db = pixelInfo[5 * index + 2] - clusterCentres[2];
db = db * db;
dx = pixelInfo[5 * index + 3] - clusterCentres[3];
dx = dx * dx;
dy = pixelInfo[5 * index + 4] - clusterCentres[4];
dy = dy * dy;
distanceFromClusterCentre[index] = dx + dy + dl + da + db;
}
and here is the HOST CODE:
#include <iostream>
#include <CL/cl.h>
#include <vector>
using namespace std;
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, char **argv)
{
// Create the two input vectors
int i;
const int pixelsNumber = 1024;
const int clustersNumber = 1;
std::vector<double> pixelInfo;
pixelInfo.resize(5 * pixelsNumber);
std::fill(pixelInfo.begin(), pixelInfo.end(), 500);
std::vector<double> clusterCentres;
clusterCentres.resize(5 * clustersNumber);
std::fill(clusterCentres.begin(), clusterCentres.end(), 200);
std::vector<double> distanceFromClusterCentre;
distanceFromClusterCentre.resize(pixelsNumber);
std::fill(distanceFromClusterCentre.begin(), distanceFromClusterCentre.end(), 0);
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("d_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * pixelsNumber * sizeof(int), NULL, &ret);
cl_mem clusterCentres_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * clustersNumber * sizeof(int), NULL, &ret);
cl_mem distanceFromClusterCentre_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
pixelsNumber * sizeof(int), NULL, &ret);
// Copy the vectors to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, pixelInfo_mem, CL_TRUE, 0,
5 * pixelsNumber * sizeof(int), pixelInfo.data(), 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, clusterCentres_mem, CL_TRUE, 0,
5 * clustersNumber * sizeof(int), clusterCentres.data(), 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "distance_kernel", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&pixelInfo_mem);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&clusterCentres_mem);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&distanceFromClusterCentre_mem);
// Execute the OpenCL kernel on the list
size_t global_item_size = pixelsNumber; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer result on the device to the local vector result
ret = clEnqueueReadBuffer(command_queue, distanceFromClusterCentre_mem, CL_TRUE, 0,
pixelsNumber * sizeof(int), distanceFromClusterCentre.data(), 0, NULL, NULL);
// Display the result to the screen
for (i = 0; i < pixelsNumber; i++)
{
cout << "Pixel " << i << ": " << distanceFromClusterCentre[i] << endl;
//system("PAUSE");
}
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(pixelInfo_mem);
ret = clReleaseMemObject(clusterCentres_mem);
ret = clReleaseMemObject(distanceFromClusterCentre_mem);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(source_str); // the kernel source was malloc'd above; the std::vectors free their own storage
system("PAUSE");
return 0;
}
and a part of the RESULT is:
.
.
.
Pixel 501: -1.11874e+306
Pixel 502: -1.16263e+306
Pixel 503: -1.07485e+306
Pixel 504: -1.03079e+306
Pixel 505: -9.42843e+305
Pixel 506: -9.86903e+305
Pixel 507: -8.98954e+305
Pixel 508: -9.86903e+305
Pixel 509: -8.98954e+305
Pixel 510: -9.43014e+305
Press any key to continue . . .
Pixel 511: -8.55065e+305
Pixel 512: 0
Pixel 513: 0
Pixel 514: 0
Pixel 515: 0
Pixel 516: 0
Pixel 517: 0
Pixel 518: 0
Pixel 519: 0
Pixel 520: 0
.
.
.
after index 511 the rest of the vector is zero!
You created your vectors of doubles and then you treat them as if they were ints (you created the buffers for ints, wrote the data into int-sized buffers and read the results back as if they were ints). With sizeof(int) being 4 and sizeof(double) being 8 on this platform, every transfer moves only half of each buffer, which is exactly why nothing past index 511 of the 1024-element result is ever written. To avoid such mistakes you could write your code this way:
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, pixelInfo.size() * sizeof(pixelInfo[0]), NULL, &ret);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
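Applying the same element-size idiom to the other buffers and transfers gives, for example (a sketch using the question's own variable names):
cl_mem clusterCentres_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
        clusterCentres.size() * sizeof(clusterCentres[0]), NULL, &ret);
cl_mem distanceFromClusterCentre_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
        distanceFromClusterCentre.size() * sizeof(distanceFromClusterCentre[0]), NULL, &ret);

// The write/read calls then reuse the identical size expressions:
ret = clEnqueueWriteBuffer(command_queue, pixelInfo_mem, CL_TRUE, 0,
        pixelInfo.size() * sizeof(pixelInfo[0]), pixelInfo.data(), 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, distanceFromClusterCentre_mem, CL_TRUE, 0,
        distanceFromClusterCentre.size() * sizeof(distanceFromClusterCentre[0]),
        distanceFromClusterCentre.data(), 0, NULL, NULL);
This way the element type is stated in exactly one place (the vector declaration) and the buffer sizes can never drift out of sync with it.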
So I'm able to compile and execute my kernel; the problem is that only two work-items seem to be used. I'm basically trying to fill up a float array[8] with {0,1,2,3,4,5,6,7}, so this is a very simple hello-world application. Below is my kernel.
// Highly simplified to demonstrate
__kernel void rnd_float32_matrix (
__global float * res
) {
uint idx = get_global_id(0);
res[idx] = idx;
}
I then create and execute the kernel with the following code...
// Some more code
cl::Program program(context, sources, &err);
program.build(devices, NULL, NULL, NULL);
cl::Kernel kernel(program, "rnd_float32_matrix", &err);
kernel.setArg(0, src_d);
cl::CommandQueue queue(context, devices[0], 0, &err);
cl::Event event;
err = queue.enqueueNDRangeKernel(
kernel,
cl::NullRange,
cl::NDRange(8),
// I've tried cl::NDRange(8) as well
cl::NDRange(1),
NULL,
&event
);
event.wait();
err = queue.enqueueReadBuffer(
// This is:
// cl::Buffer src_d(
// context,
// CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
// mem_size,
// src_h,
// &err);
src_d,
CL_TRUE,
0,
8,
// This is float * src_h = new float[8];
src_h);
for(int i = 0; i < 8; i ++) {
std::cout << src_h[i] << std::endl;
}
I may not show it in the code, but I also select a GPU device, and using context.getInfo(..) it shows I'm using my NVIDIA GTX 770M card, which reports 1024, 1024 and 64 work-items available in dimensions 0, 1 and 2. When this array prints I keep getting... 0, 1, 0, 0, 0, 0, 0, 0. I've also tried setting res[idx] = 5, and I get... 5, 5, 0, 0, 0, 0, 0, 0. So it seems that only two work-items are actually being used. What am I doing wrong?
Your command to read the data back from the device is only reading 8 bytes, which is two floats:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8, // <- This is the number of bytes, not the number of elements!
// This is float * src_h = new float[8];
src_h);
To read 8 floats, you would need to do this:
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8 * sizeof(cl_float),
// This is float * src_h = new float[8];
src_h);
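With 8 * sizeof(cl_float) = 32 bytes requested, all eight elements come back and the program prints 0 through 7. The kernel was running across all eight work-items the whole time; only the read-back was truncated, which is why exactly the first two floats (8 bytes' worth) ever appeared to change.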