I am new to OpenCL and especially to clBLAS. I would like to implement a simple program that takes a matrix A as input, multiplies it by another matrix B to obtain a matrix C, then multiplies C by another matrix D, and returns the resulting matrix.
I would like to do it using the clBLAS library while keeping performance as high as possible. How can I do it?
So far I have seen that the common way of doing a single matrix multiplication is the following:
/* Single-precision GEMM through clBLAS: upload A, B and C, run clblasSgemm,
 * then read C back into `result`. All variables used here (ctx, queue, A, B,
 * C, M, N, K, alpha, beta, lda, ldb, ldc, event, result, err) are declared
 * outside this snippet.
 * NOTE(review): `err` is overwritten by every call and never inspected. */
/* Setup clBLAS */
err = clblasSetup( );
/* Prepare OpenCL memory objects and place matrices inside them. */
bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
NULL, &err );
bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
NULL, &err );
bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
NULL, &err );
/* Blocking writes (CL_TRUE): each call returns once the host array has been copied. */
err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
M * K * sizeof( *A ), A, 0, NULL, NULL );
err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
K * N * sizeof( *B ), B, 0, NULL, NULL );
err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
M * N * sizeof( *C ), C, 0, NULL, NULL );
/* Call the clBLAS gemm routine on the row-major, non-transposed operands.
 * The three buffer offsets are passed as 0 here. */
err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans,
M, N, K,
alpha, bufA, 0, lda,
bufB, 0, ldb, beta,
bufC, 0, ldc,
1, &queue, 0, NULL, &event );
/* Wait for calculations to be finished. */
err = clWaitForEvents( 1, &event );
/* Fetch results of calculations from GPU memory.
 * NOTE(review): bufC is a device buffer; presumably it could be passed
 * directly as an input operand to a second clblasSgemm call (C * D) instead
 * of being read back first - confirm against the clBLAS documentation. */
err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
M * N * sizeof(*result),
result, 0, NULL, NULL );
But this way we keep moving the matrices between host memory and GPU memory, which is a waste. Is there a way to keep the intermediate result in GPU memory?
Related
https://i.stack.imgur.com/TA9v6.png
I have been trying to run a kernel that assigns values at certain indices of an std::vector, reading the result back with the clEnqueueReadBuffer function, but it does not seem to work correctly: only the first result is assigned in the std::vector.
The C++ source code for the host is the following:
cl_mem originalPixelsBuffer = clCreateBuffer(p1.context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(Color) * imageObj->SourceLength(), source, &p1.status);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to Create buffer 0");
cl_mem targetBuffer = clCreateBuffer(p1.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(Color) * imageObj->OutputLength(), target, &p1.status);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to Create buffer 1");
//write buffers
p1.status = clEnqueueWriteBuffer(p1.commandQueue, originalPixelsBuffer, CL_FALSE, 0, sizeof(Color) * imageObj->SourceLength(), source, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 0");
p1.status = clEnqueueWriteBuffer(p1.commandQueue, targetBuffer, CL_TRUE, 0, sizeof(Color) * imageObj->OutputLength(), target, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 1");
size_t globalWorkSize[2] = { imageObj->originalWidth * 4, imageObj->originalHeight * 4 };
size_t localWorkSize[2]{ 64,64 };
SetLocalWorkSize(IsDivisibleBy64(localWorkSize[0]), localWorkSize);
//execute kernel
p1.status = clEnqueueNDRangeKernel(p1.commandQueue, Kernel, 1, NULL, globalWorkSize, IsDisibibleByLocalWorkSize(globalWorkSize, localWorkSize) ? localWorkSize : NULL, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to clEnqueueDRangeKernel");
//read buffer
p1.status = clEnqueueReadBuffer(p1.commandQueue, targetBuffer, CL_TRUE, 0, sizeof(Color) * imageObj->OutputLength(), target, 0, NULL, NULL);
CheckErrorCode(p1.status, p1.program, p1.devices[0], "Failed to write buffer 1");
the kernel code:
// Copies selected pixels from `source` into `target`.
// A work-item does work only when both of its global indices satisfy
// `index % ratio == MATCH`; it then copies the four channels of source[Index]
// into target[tIndex]. The `width` parameter is not used in the body.
// The kernel reads get_global_id(1), so it needs a 2-D NDRange for hIndex to vary.
// NOTE(review): uint64, int64, fp32 and MATCH are defined outside this snippet;
// fp32 is presumably a float typedef, which would make the divisions below
// floating-point rather than integer divisions - confirm.
__kernel void interp(__global struct Color* source,__global struct Color* target,uint64 width,uint64 height,uint64 ratio,uint64 limit, uint64 originalHeight)
{
__private fp32 wIndex = (int64)get_global_id(0);
__private fp32 hIndex = (int64)get_global_id(1);
if(((int64)wIndex)%ratio==MATCH && ((int64)hIndex)%ratio ==MATCH)
{
// Index of the source pixel for this work-item.
__private int64 Index = (wIndex/ratio) * (originalHeight/ratio) + (hIndex/ratio);
// Guard against reading past `limit` source elements.
if(Index < limit)
{
// Destination index in the target image.
__private int64 tIndex = wIndex * height + hIndex;
target[tIndex].R = source[Index].R;
target[tIndex].G = source[Index].G;
target[tIndex].B = source[Index].B;
target[tIndex].A = source[Index].A;
}
}
}```
Is there a way to free dedicated GPU memory when a function that runs on the GPU through OpenCL finishes? I have noticed that when such a function is called repeatedly, a certain amount of dedicated memory is not released after each call, which eventually causes an error if the function is called too many times.
UPDATE 22/12/2019:
I enclose a fragment of the code that is within the iteration. The configuration of the cl_program and cl_context is done outside the iteration:
void launch_kernel(float * blockC, float * blockR, float * blockEst, float * blockEstv, float * blockL, cl_program program, cl_context context, std::vector<cl_device_id> deviceIds, int n){
cl_kernel kernel_function = clCreateKernel(program, "function_example", &error);
cl_event event;
cl_command_queue queue = clCreateCommandQueue(context, (deviceIds)[0], 0, &error);
std::size_t size[1] = { (size_t)n };
cl_mem blockCBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * (n * s), (void *)&blockC[0], &error);
cl_mem blockRBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * (n * s), (void *)&blockR[0], &error);
cl_mem blockEstBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * n, (void *)&blockEst[0], &error);
cl_mem blockEstvBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * n, (void *)&blockEstv[0], &error);
cl_mem blockLBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(float) * n, (void *)&blockL[0], &error);
clSetKernelArg(kernel_solution, 0, sizeof(cl_mem), &blockCBuffer);
clSetKernelArg(kernel_solution, 1, sizeof(cl_mem), &blockRBuffer);
clSetKernelArg(kernel_solution, 2, sizeof(cl_mem), &blockEstBuffer);
clSetKernelArg(kernel_solution, 3, sizeof(cl_mem), &blockEstvBuffer);
clSetKernelArg(kernel_solution, 4, sizeof(cl_mem), &blockLBuffer);
clSetKernelArg(kernel_solution, 5, sizeof(int), &s);
openclSingleton->checkError(clEnqueueNDRangeKernel(queue, kernel_function, 1, nullptr, size, nullptr, 0, nullptr, nullptr));
clEnqueueMapBuffer(queue, blockEstBuffer, CL_TRUE, CL_MAP_READ, 0, n * sizeof(float) , 0, nullptr, nullptr, &error);
clEnqueueMapBuffer(queue, blockEstvBuffer, CL_TRUE, CL_MAP_READ, 0, n * sizeof(float) , 0, nullptr, nullptr, &error);
clEnqueueMapBuffer(queue, blockLBuffer, CL_TRUE, CL_MAP_READ, 0, n * sizeof(float) , 0, nullptr, nullptr, &error);
clReleaseMemObject(blockCBuffer);
clReleaseMemObject(blockRBuffer);
clReleaseMemObject(blockEstBuffer);
clReleaseMemObject(blockEstvBuffer);
clReleaseMemObject(blockLBuffer);
clFlush(queue);
clFinish(queue);
clWaitForEvents(1, &event);
clReleaseCommandQueue(queue);
clReleaseKernel(kernel_function);
}
UPDATE 23/12/2019
Updated the code with the iterative process that calls the OpenCL function. The problem is that launch_kernel leaves some dedicated memory in use when it returns, so if the variable m is too large, the memory fills up and the program crashes due to lack of resources.
// Driver: allocates the host blocks, calls launch_kernel m times and releases
// the OpenCL program, context and device at the end. `m` and `s` are declared
// outside this snippet.
std::vector<cl_device_id> deviceIds;
cl_program program;
cl_context context;
... //configuration program and context
// NOTE(review): n has no initializer here; presumably it is assigned in the
// elided configuration code - confirm.
int n;
float *blockEst, *blockEstv, *blockL, *blockC, *blockR;
// Advance i, not m: incrementing the bound kept `i < m` true on every pass,
// so the loop never terminated normally.
for(int i = 0; i < m; i++){
    blockC = (float*)std::malloc(sizeof(float)*n*s);
    blockR = (float*)std::malloc(sizeof(float)*n*s);
    blockEst = (float*)std::malloc(sizeof(float)*n);
    blockEstv = (float*)std::malloc(sizeof(float)*n);
    blockL = (float*)std::malloc(sizeof(float)*n);
    .... //Set values blockC and blockR
    launch_kernel(blockC, blockR, blockEst, blockEstv, blockL, program, context, deviceIds, n);
    ... //Save value blockEst, blockEstv and blockL
    std::free(blockC);
    std::free(blockR);
    std::free(blockEst);
    std::free(blockEstv);
    std::free(blockL);
}
clReleaseProgram(program);
clReleaseContext(context);
clReleaseDevice(deviceIds[0]);
Is it necessary that the array pointer passed to clEnqueueWriteBuffer be allocated with malloc in the same scope?
Here is my code:
// Integer matrix whose elements live in a malloc'ed array of
// size_row * size_col ints.
// NOTE(review): the snippet is abridged - _size_row, _size_col, _clconfig and
// the three-argument constructor used by cl_prod_l are not declared in the
// lines shown here.
// NOTE(review): no destructor or copy operations are visible; if the full
// class frees `element` in a destructor, returning an int_matrix by value
// would need a deep copy or move - confirm against the full class.
class int_matrix{
public:
// Allocates storage for size_row * size_col ints; the elements are left uninitialized.
int_matrix(size_t size_row, size_t size_col) :
_size_row(size_row), _size_col(size_col) {
element = (int*)malloc(size_row * size_col * sizeof(int));
}
// cl_prod_l is a friend so that it can read `element` directly.
friend int_matrix cl_prod_l(int_matrix& lhs, int_matrix& rhs);
private:
// Element storage allocated in the constructor.
int* element;
};
// Multiplies lhs by rhs on the OpenCL device: creates one buffer per operand
// plus one for the result, then uploads the three element arrays with
// blocking writes. M, N, K, err, ker and the rest of the body are in the
// elided parts of the snippet.
int_matrix cl_prod_l(int_matrix& lhs, int_matrix& rhs) {
    ...
    int_matrix return_val(lhs._size_row, rhs._size_col, 0); // Initialize elements in return_val
    cl_mem lhs_buffer, rhs_buffer, return_buffer;
    /* config buffer */
    // No CL_MEM_COPY_HOST_PTR here: that flag requires a non-NULL host_ptr,
    // and with NULL clCreateBuffer fails with CL_INVALID_HOST_PTR, leaving an
    // invalid buffer for the writes below. The data is uploaded explicitly.
    lhs_buffer = clCreateBuffer(int_matrix::_clconfig._context,
        CL_MEM_READ_ONLY, M*K * sizeof(int), NULL, &err);
    rhs_buffer = clCreateBuffer(int_matrix::_clconfig._context,
        CL_MEM_READ_ONLY, N*K * sizeof(int), NULL, &err);
    return_buffer = clCreateBuffer(int_matrix::_clconfig._context,
        CL_MEM_READ_WRITE, M*N * sizeof(int), NULL, &err);
    cl_kernel Kernel= clCreateKernel(int_matrix::_clconfig._program, ker, &err);
    /* enqueue buffer */
    // Blocking writes: each host array has been fully copied when the call returns.
    clEnqueueWriteBuffer(int_matrix::_clconfig._cmdque, lhs_buffer, CL_TRUE, 0, M*K * sizeof(int), lhs.element, 0, NULL, NULL);
    clEnqueueWriteBuffer(int_matrix::_clconfig._cmdque, rhs_buffer, CL_TRUE, 0, N*K * sizeof(int), rhs.element, 0, NULL, NULL);
    clEnqueueWriteBuffer(int_matrix::_clconfig._cmdque, return_buffer, CL_TRUE, 0, M*N * sizeof(int), return_val.element, 0, NULL, NULL);
    ...
}
In this example, I find that lhs.element, rhs.element and return_val.element cannot be passed to the kernel. But when I switch to arrays allocated with malloc inside this function (copying the same values), the kernel returns the right result.
So are there some limitations on the array pointer passed to clEnqueueWriteBuffer?
Emmm...
I found the answer myself: the cl_mem object and int* element should be put in the same scope.
I am trying to compute the Euclidean distance from a set of 5D points (pixels) to a single 5D point (center) and store it in another result vector. I want to use vector indexing to store all the information in a single vector, so for the ith pixel the 5 dimensions are at indices (5i), (5i+1), ..., (5i+4).
I am new to OpenCL and I just edited a sample code from the internet for my own purposes. The theory is right but the code doesn't show the right answers!
Here is the kernel:
//d_kernel.cl
// Double precision is an optional extension on older OpenCL versions; enable
// it where the device advertises it.
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif

// Computes, for each pixel, the sum of squared differences between its five
// components and the five components of the first cluster centre (that is,
// the squared Euclidean distance; no square root is taken).
//   pixelInfo                 - 5 consecutive doubles per pixel
//   clusterCentres            - the 5 components of the centre
//   distanceFromClusterCentre - one output value per pixel
__kernel void distance_kernel(__global double *pixelInfo,
                              __global double *clusterCentres,
                              __global double *distanceFromClusterCentre)
{
    const size_t index = get_global_id(0);

    // Accumulate in double: integer temporaries would truncate every
    // difference before it is squared.
    double sum = 0.0;
    for (int k = 0; k < 5; ++k) {
        const double diff = pixelInfo[5 * index + k] - clusterCentres[k];
        sum += diff * diff;
    }
    distanceFromClusterCentre[index] = sum;
}
and here is the HOST CODE:
#include <CL/cl.h>
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>
using namespace std;
#define MAX_SOURCE_SIZE (0x100000)
int main(int argc, char **argv)
{
// Create the two input vectors
int i;
const int pixelsNumber = 1024;
const int clustersNumber = 1;
std::vector<double> pixelInfo;
pixelInfo.resize(5 * pixelsNumber);
std::fill(pixelInfo.begin(), pixelInfo.end(), 500);
std::vector<double> clusterCentres;
clusterCentres.resize(5 * clustersNumber);
std::fill(clusterCentres.begin(), clusterCentres.end(), 200);
std::vector<double> distanceFromClusterCentre;
distanceFromClusterCentre.resize(pixelsNumber);
std::fill(distanceFromClusterCentre.begin(), distanceFromClusterCentre.end(), 0);
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("d_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * pixelsNumber * sizeof(int), NULL, &ret);
cl_mem clusterCentres_mem = clCreateBuffer(context, CL_MEM_READ_ONLY,
5 * clustersNumber * sizeof(int), NULL, &ret);
cl_mem distanceFromClusterCentre_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
pixelsNumber * sizeof(int), NULL, &ret);
// Copy the vectors to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, pixelInfo_mem, CL_TRUE, 0,
5 * pixelsNumber * sizeof(int), pixelInfo.data(), 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, clusterCentres_mem, CL_TRUE, 0,
5 * clustersNumber * sizeof(int), clusterCentres.data(), 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&pixelInfo_mem);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&clusterCentres_mem);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&distanceFromClusterCentre_mem);
// Execute the OpenCL kernel on the list
size_t global_item_size = pixelsNumber; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer result on the device to the local vector result
ret = clEnqueueReadBuffer(command_queue, distanceFromClusterCentre_mem, CL_TRUE, 0,
pixelsNumber * sizeof(int), distanceFromClusterCentre.data(), 0, NULL, NULL);
// Display the result to the screen
for (i = 0; i < pixelsNumber; i++)
{
cout << "Pixel " << i << ": " << distanceFromClusterCentre[i] << endl;
//system("PAUSE");
}
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(pixelInfo_mem);
ret = clReleaseMemObject(clusterCentres_mem);
ret = clReleaseMemObject(distanceFromClusterCentre_mem);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(pixelInfo.data());
free(clusterCentres.data());
free(distanceFromClusterCentre.data());
system("PAUSE");
return 0;
}
and a part of the RESULT is:
.
.
.
Pixel 501: -1.11874e+306
Pixel 502: -1.16263e+306
Pixel 503: -1.07485e+306
Pixel 504: -1.03079e+306
Pixel 505: -9.42843e+305
Pixel 506: -9.86903e+305
Pixel 507: -8.98954e+305
Pixel 508: -9.86903e+305
Pixel 509: -8.98954e+305
Pixel 510: -9.43014e+305
Press any key to continue . . .
Pixel 511: -8.55065e+305
Pixel 512: 0
Pixel 513: 0
Pixel 514: 0
Pixel 515: 0
Pixel 516: 0
Pixel 517: 0
Pixel 518: 0
Pixel 519: 0
Pixel 520: 0
.
.
.
After index 511, the rest of the vector is zero!
You created your vectors of doubles and then you treat them as if they were ints (creating the buffers for ints, writing data to int-sized buffers and reading the results back as if they were ints). To avoid such mistakes you could write your code this way:
// Buffer size is computed from the vector itself: element count times the size of one element.
cl_mem pixelInfo_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, pixelInfo.size() * sizeof(pixelInfo[0]), NULL, &ret);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
So I'm able to compile and execute my kernel; the problem is that only two work-items are being used. I'm basically trying to fill a float array[8] with {0,1,2,3,4,5,6,7}, so this is a very simple hello-world application. Below is my kernel.
// Simplified demonstration kernel: every work-item stores its own global
// index (dimension 0) into the matching slot of `res`.
__kernel void rnd_float32_matrix(__global float *res)
{
    const uint gid = get_global_id(0);
    res[gid] = gid;
}
I then create and execute the kernel with the following code...
// Some more code
// Builds the program, runs rnd_float32_matrix over 8 work-items and prints
// the 8 floats read back from the device. context, sources, devices, src_d,
// src_h and err are declared in the part of the program not shown here.
cl::Program program(context, sources, &err);
program.build(devices, NULL, NULL, NULL);
cl::Kernel kernel(program, "rnd_float32_matrix", &err);
kernel.setArg(0, src_d);
cl::CommandQueue queue(context, devices[0], 0, &err);
cl::Event event;
err = queue.enqueueNDRangeKernel(
    kernel,
    cl::NullRange,
    cl::NDRange(8),
    // I've tried cl::NDRange(8) as well
    cl::NDRange(1),
    NULL,
    &event
);
event.wait();
err = queue.enqueueReadBuffer(
    // This is:
    // cl::Buffer src_d(
    // context,
    // CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    // mem_size,
    // src_h,
    // &err);
    // NOTE(review): the kernel writes to this buffer, so CL_MEM_READ_ONLY
    // looks like the wrong access flag - confirm.
    src_d,
    CL_TRUE,
    0,
    // The size is a byte count, not an element count: a bare 8 transfers
    // only two floats.
    8 * sizeof(cl_float),
    // This is float * src_h = new float[8];
    src_h);
for(int i = 0; i < 8; i ++) {
    std::cout << src_h[i] << std::endl;
}
I may not show it in the code, but I also select a GPU device, and context.getInfo(..) shows that I'm using my NVidia GTX 770M card, which reports 1024, 1024, 64 work-items available in dimensions 0, 1 and 2. When this array prints I keep getting... 0, 1, 0, 0, 0, 0, 0, 0. I've also tried setting res[idx] = 5, and I get... 5, 5, 0, 0, 0, 0, 0, 0. So it seems that only two of the work-items are actually being used. What am I doing wrong?
Your command to read the data back from the device is only reading 8 bytes, which is two floats:
// The read call as posted in the question, reproduced to point at the size argument.
err = queue.enqueueReadBuffer(
src_d,
CL_TRUE,
0,
8, // <- This is the number of bytes, not the number of elements!
// This is float * src_h = new float[8];
src_h);
To read 8 floats, you would need to do this:
// Blocking read of all 8 floats; the size argument is expressed in bytes.
// src_h is: float * src_h = new float[8];
err = queue.enqueueReadBuffer(src_d, CL_TRUE, 0, 8 * sizeof(cl_float), src_h);