ArrayFire: function with an OpenCL kernel called from main function - c++

the function is the following (extracted from http://arrayfire.org/docs/interop_opencl.htm)
unique main function
// Single-function version of the interop example: creates two ArrayFire
// arrays, copies A into B with a hand-written OpenCL kernel enqueued on
// ArrayFire's own command queue, then hands both buffers back to ArrayFire.
int main() {
size_t length = 10;
// 1. Create ArrayFire array objects:
af::array A = af::randu(length, f32);
af::array B = af::constant(0, length, f32);
// ... additional ArrayFire operations here
// 2. Obtain the device, context, and queue used by ArrayFire
static cl_context af_context = afcl::getContext();
static cl_device_id af_device_id = afcl::getDeviceId();
static cl_command_queue af_queue = afcl::getQueue();
// 3. Obtain cl_mem references to af::array objects
cl_mem * d_A = A.device<cl_mem>();
cl_mem * d_B = B.device<cl_mem>();
// 4. Load, build, and use your kernels.
// For the sake of readability, we have omitted error checking.
int status = CL_SUCCESS;
// A simple copy kernel, uses C++11 syntax for multi-line strings.
const char * kernel_name = "copy_kernel";
const char * source = R"(
void __kernel
copy_kernel(__global float * gA, __global float * gB)
{
int id = get_global_id(0);
gB[id] = gA[id];
}
)";
// Create the program, build the executable, and extract the entry point
// for the kernel.
// NOTE(review): program and kernel are never released in this example
// (no clReleaseProgram / clReleaseKernel call is visible).
cl_program program = clCreateProgramWithSource(af_context, 1, &source, NULL, &status);
status = clBuildProgram(program, 1, &af_device_id, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, kernel_name, &status);
// Set arguments and launch your kernels
clSetKernelArg(kernel, 0, sizeof(cl_mem), d_A);
clSetKernelArg(kernel, 1, sizeof(cl_mem), d_B);
// One-dimensional launch with `length` work-items; the work-group size is
// left to the OpenCL runtime (local size argument is NULL).
clEnqueueNDRangeKernel(af_queue, kernel, 1, NULL, &length, NULL, 0, NULL, NULL);
// 5. Return control of af::array memory to ArrayFire
A.unlock();
B.unlock();
// ... resume ArrayFire operations
// Because the device pointers, d_A and d_B, were returned to ArrayFire's
// control by the unlock function, there is no need to free them using
// clReleaseMemObject()
return 0;
}
That works well, since the final values of B coincide with those of A, i.e. the output of
af_print(B); matches A. But when I split the code into separate functions as follows:
separately main function
arraycopy function
// Copies the first `length` elements of A into B with a hand-written OpenCL
// kernel enqueued on ArrayFire's command queue.
//
// NOTE(review): A and B are taken by value. As the answer in this thread
// explains, calling device() on an af::array that shares its buffer with
// another array makes a private copy first, so the kernel below writes into
// that copy and the caller's B is left unchanged. The suggested fix is to
// declare the parameters as `const af::array &A, af::array &B`.
void arraycopy(af::array A, af::array B,size_t length) {
// 2. Obtain the device, context, and queue used by ArrayFire
static cl_context af_context = afcl::getContext();
static cl_device_id af_device_id = afcl::getDeviceId();
static cl_command_queue af_queue = afcl::getQueue();
// 3. Obtain cl_mem references to af::array objects
cl_mem * d_A = A.device<cl_mem>();
cl_mem * d_B = B.device<cl_mem>();
// 4. Load, build, and use your kernels.
// For the sake of readability, we have omitted error checking.
int status = CL_SUCCESS;
// A simple copy kernel, uses C++11 syntax for multi-line strings.
const char * kernel_name = "copy_kernel";
const char * source = R"(
void __kernel
copy_kernel(__global float * gA, __global float * gB)
{
int id = get_global_id(0);
gB[id] = gA[id];
}
)";
// Create the program, build the executable, and extract the entry point
// for the kernel.
// NOTE(review): the program is rebuilt on every call and neither the program
// nor the kernel is released (no clReleaseProgram / clReleaseKernel call).
cl_program program = clCreateProgramWithSource(af_context, 1, &source, NULL, &status);
status = clBuildProgram(program, 1, &af_device_id, NULL, NULL, NULL);
cl_kernel kernel = clCreateKernel(program, kernel_name, &status);
// Set arguments and launch your kernels
clSetKernelArg(kernel, 0, sizeof(cl_mem), d_A);
clSetKernelArg(kernel, 1, sizeof(cl_mem), d_B);
// One-dimensional launch with `length` work-items; the work-group size is
// left to the OpenCL runtime (local size argument is NULL).
clEnqueueNDRangeKernel(af_queue, kernel, 1, NULL, &length, NULL, 0, NULL, NULL);
// 5. Return control of af::array memory to ArrayFire
A.unlock();
B.unlock();
// ... resume ArrayFire operations
// Because the device pointers, d_A and d_B, were returned to ArrayFire's
// control by the unlock function, there is no need to free them using
// clReleaseMemObject()
}
main function
// Driver for the split version: builds a random array A and a zero array B of
// ten floats, asks arraycopy to copy A into B, then prints B.
int main()
{
size_t length = 10;
af::array A = af::randu(length, f32);
af::array B = af::constant(0, length, f32);
arraycopy(A, B, length);
af_print(B);// observed by the asker: B does not match A
}
The final values of B have not changed. Why is this happening, and what should I do to make it work? Thanks in advance.

You pass af::array into arraycopy by value, not by reference, hence A and B in main remain unchanged regardless of what you do inside arraycopy. You can pass B by reference: af::array &B in the parameter list. I'd also recommend passing A by const reference as a matter of habit, to avoid unnecessary copies (const af::array &A).

The reason behind the behavior you are seeing is reference counting. But it is not a bug for sure and falls inline with C++ language behavior.
af::array objects when created using assignment or equivalent operations perform only copy of meta data and keep a shared pointer.
In the version of your code where it is a function, B is passed by value; thus, internally, B in the arraycopy function is a copy of the metadata of B from the main function and shares the pointer to the data of array B in main. At this point, if the user makes a device() call to fetch the pointer, we assume it is for writing to the memory behind that pointer. Therefore, when device() is called on an array object whose shared pointer has a reference count > 1, we make a copy of the original array (B from main) and return the pointer to that new memory. Therefore, if you call af_print(B) inside arraycopy, you will see the correct values. This is essentially copy-on-write: since B is passed by value, you do not see in main the modifications made to B inside the arraycopy function.
In the very first line I said, it falls in line with C++ behavior because, if the object B needs to be modified from a function it has to be passed by reference. Passing it by value only makes the value change inside the function - which is exactly how ArrayFire is handling af::array objects.
Hope that clears the confusion.
Pradeep.
ArrayFire Dev Team.

Related

How to copy a big array to memory and use it in OpenCL kernel?

I have an array of uint8_t. The size of the array is about 2.000.000. I need to do some calculations on these values, but after I call the kernel and copy the modified values back, it returns only zeros.
I'm creating the array, the "row" and "columns" are int.
uint8_t arrayIn[rows * columns];
uint8_t arrayOut[rows * columns];
I'm creating the cl_mem objects and copy the array data into.
arrayInMem = clCreateBuffer(context, CL_MEM_READ_ONLY, rows * columns * sizeof(uint8_t), NULL, &err);
arrayOutMem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, rows * columns * sizeof(uint8_t), NULL, &err);
err = clEnqueueWriteBuffer(img_cmd_queue, arrayInMem, CL_TRUE, 0, rows * columns * sizeof(uint8_t), arrayIn, 0, NULL, NULL);
Setting the kernel arg like this.
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&arrayInMem);
err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&arrayOutMem);
Reading back to the host the modified array.
err = clEnqueueReadBuffer(img_cmd_queue, arrayOutMem, CL_TRUE, 0, MEM_SIZE * sizeof(uint8_t), arrayOut, 0, NULL, NULL);
The kernel signature look like this:
// OpenCL kernel taking an input and an output uchar buffer in global memory.
// The body is elided in the question; the commented lines sketch the intended
// per-element computation.
__kernel void calculate(__global uchar * arrayInKernel, __global uchar * arrayOutKernel){
// Do some calculation like this, e.g.:
//int gid = get_global_id(0);
//arrayOutKernel[gid] = 2 * arrayInKernel[gid];
}
Could somebody help, what am I missing out?
Your code is fine, assuming MEM_SIZE = rows * columns. The argument order in clEnqueueReadBuffer also is correct.
I could imagine that you forgot to call clFinish(img_cmd_queue); after clEnqueueWriteBuffer, clEnqueueNDRangeKernel and clEnqueueReadBuffer and before you check the results in arrayOut. All these commands end up in a queue and without clFinish the queue may be executed after you checked results.

Correct way to write and call custom C functions of ArrayFire in Julia

I'm working in Julia and I need to call some custom C functions that use the ArrayFire library. When I use code like:
// Copies A into B with an OpenCL kernel, taking both arrays by reference.
// Steps 2 and 4 (context/queue lookup and kernel build) are elided in this
// excerpt, so `af_queue` and `kernel` are assumed to be defined by them.
void copy(const af::array &A, af::array &B,size_t length) {
// 2.Obtain the device, context, and queue used by ArrayFire
// 3.Obtain cl_mem references to af::array objects
cl_mem * d_A = A.device<cl_mem>();
cl_mem * d_B = B.device<cl_mem>();
// 4. Load, build, and use your kernels.
// Set arguments and launch your kernels
// `kernel` is the kernel object built in step 4
clSetKernelArg(kernel, 0, sizeof(cl_mem), d_A);
clSetKernelArg(kernel, 1, sizeof(cl_mem), d_B);
// One-dimensional launch with `length` work-items.
clEnqueueNDRangeKernel(af_queue, kernel, 1, NULL, &length, NULL, 0, NULL, NULL);
// 5. Return control of af::array memory to ArrayFire
A.unlock();
B.unlock();
}
I used as reference the example provided in:Interoperability with OpenCL
I call this function in Julia as follows:
ccall((:copy,"path/to/dll"),Cvoid(Ref{af_array},Ref{af_array}),Af.arr,Bf.arr)
Af and Bf are ArrayFire arrays, the call works as expected, the problem is when I use directly B=A only to test i.e.
// Test variant: replaces the kernel launch with a plain af::array assignment.
// `length` is unused here.
void copy(const af::array &A, af::array &B,size_t length) {
B=A;// only to test
}
the call stops working in Julia. This made me doubt whether I'm writing and calling these functions correctly.
Some of the ArrayFire functions wrapped in Julia that I have seen call functions that take af_array arguments, which differ from af::array arguments. So I wanted to change the arguments, and did this:
// Variant taking raw af_array handles and wrapping them in af::array objects
// so that device<>() can be used. `length` is unused in the lines shown.
// NOTE(review): presumably the af::array(af_array) constructor adopts the
// handle without retaining it, in which case A and B would release the
// caller's arrays when they go out of scope — confirm against the ArrayFire
// documentation (af_retain_array may be needed first).
void copy(const af_array &dA, af_array &dB,size_t length) {
// wrap the handles to be able to use A.device and B.device
array A=array(dA);
array B=array(dB);
//steps 2 to 5 in the original code
}
It doesn't work in C or in Julia. The question is: if I want to use af_array as arguments, how do I get the device pointer? Or what is the correct way to write these functions to avoid problems when I call them from Julia?
thanks in advance.
UPD
I changed B=A; inside the function:
// Test variant: copies A into B through af::copy over the index range
// [0, len - 1] with step 1, where len is the size of A's first dimension.
// `length` is unused here.
void copy(const af::array &A, af::array &B,size_t length) {
size_t len = A.dims(0);
seq idx(0, len - 1, 1);
af::copy(B, A, idx);
}
And works! However, I still doubt that this is the correct way, since this code is very simple. I will work with a more complex code that may stop working in a similar way.
This is not a definitive answer, but I think it significantly improves functionality. The af_get_device_ptr function is a way to get the device pointer from an af_array object, and the correct way to write functions that can be called from Julia seems to be to give them af_array arguments (see: "calling custom C ArrayFire functions in Julia", issue #229), since the functions integrated in ArrayFire.jl do it this way. Here is a simple and complete example of how to write and call the function from Julia:
in C
//function for adding ArrayFire arrays
// Adds two ArrayFire arrays element-wise with a custom OpenCL kernel.
//
// out    : receives a newly created af_array holding dA + dB. The caller owns
//          it and is responsible for releasing it.
// dA, dB : input arrays. Assumes both are float (f32) arrays with at least as
//          many elements as dA's first dimension — TODO confirm against
//          callers; only the first dimension of dA is consulted.
//
// Error checking is omitted, as in the interop example this is based on.
void AFire::sumaaf(af_array* out , af_array dA, af_array dB) {
    // Result storage: copying dA yields an array of the right size and type.
    af_array dC;
    af_copy_array(&dC, dA);

    // 2. Obtain the device, context, and queue used by ArrayFire
    static cl_context af_context = afcl::getContext();
    static cl_device_id af_device_id = afcl::getDeviceId();
    static cl_command_queue af_queue = afcl::getQueue();

    dim_t _order[4];
    af_get_dims(&_order[0], &_order[1], &_order[2], &_order[3], dA);
    size_t order = _order[0];

    int status = CL_SUCCESS;

    // 3. Obtain cl_mem references to af_array objects.
    // af_get_device_ptr writes the array's device handle through the pointer
    // it is given, so a plain local cl_mem is all that is needed to receive
    // it. No clCreateBuffer allocation is required: the arrays already own
    // their device memory, and a scratch buffer would only be overwritten
    // and leaked.
    cl_mem d_A = NULL;
    cl_mem d_B = NULL;
    cl_mem d_C = NULL;
    af_get_device_ptr((void**)&d_A, dA);
    af_get_device_ptr((void**)&d_B, dB);
    af_get_device_ptr((void**)&d_C, dC);

    // 4. Load, build, and use your kernels.
    // For the sake of readability, we have omitted error checking.
    // A simple sum kernel, uses C++11 syntax for multi-line strings.
    const char * kernel_name = "sum_kernel";
    const char * source = R"(
void __kernel
sum_kernel(__global float * gC, __global float * gA, __global float * gB)
{
int id = get_global_id(0);
gC[id] = gA[id]+gB[id];
}
)";

    // Create the program, build the executable, and extract the entry point
    // for the kernel.
    cl_program program = clCreateProgramWithSource(af_context, 1, &source, NULL, &status);
    status = clBuildProgram(program, 1, &af_device_id, NULL, NULL, NULL);
    cl_kernel sumkernel = clCreateKernel(program, kernel_name, &status);

    // Set arguments and launch your kernels
    clSetKernelArg(sumkernel, 0, sizeof(cl_mem), &d_C);
    clSetKernelArg(sumkernel, 1, sizeof(cl_mem), &d_A);
    clSetKernelArg(sumkernel, 2, sizeof(cl_mem), &d_B);
    clEnqueueNDRangeKernel(af_queue, sumkernel, 1, NULL, &order, NULL, 0, NULL, NULL);

    // The program and kernel are rebuilt on every call, so drop our
    // references here; OpenCL keeps the kernel alive until the enqueued
    // command no longer needs it.
    clReleaseKernel(sumkernel);
    clReleaseProgram(program);

    // 5. Return control of af_array memory to ArrayFire
    af_unlock_array(dA);
    af_unlock_array(dB);
    af_unlock_array(dC);

    // Hand the result array to the caller. Ownership of dC is transferred,
    // which avoids a second deep copy and avoids leaking dC.
    *out = dC;

    // ... resume ArrayFire operations
    // Because the device handles d_A, d_B and d_C were returned to ArrayFire's
    // control by the unlock function, there is no need to free them using
    // clReleaseMemObject()
}
in Julia the call would be:
# Element-wise sum of two 1-D Float32 ArrayFire arrays, computed by the custom
# C function `sumaaf` from the shared library at "path/to/dll".
# Returns a new AFArray wrapping the af_array handle written by the C function.
function sumaaf(A::AFArray{Float32,1},B::AFArray{Float32,1})
    # Receives the output handle from the C side.
    out = ArrayFire.RefValue{af_array}(0);
    # Pass the handles of the function's own arguments (not outer variables).
    ccall((:sumaaf,"path/to/dll"),
          Cvoid,(Ptr{af_array},af_array,af_array),out,A.arr,B.arr);
    AFArray{Float32,1}(out[])
end

OpenCL Kernel won't work until another buffer creation call

I have a OpenCL buffer that is created via:
return cl::Buffer(_context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, size);
I write data to that buffer and want to use it later in a kernel.
I get strange behavior, though, because my kernel won't work with that buffer. Only when I randomly call
BufferContainer blah(oclEnvironment, cv::Size(width, height), 3);
calls the above function to create a same sized buffer again, the kernel works. I don't call blah.Write(...) at all. It seems to work with the data I wrote to the first buffer. But if I comment out that single line with the "blah" buffer it wont work again.
e: both buffers are created with the exact same dimensions.
e2: does it have something to do with the command queue and the order of objects there?
Basically I try to run a kernel to reduce the image and find the max hsv v value. Then after that kernel finishes and gives me the max I run the next kernel with one parameter set to that found maximum. So the call chain is like:
float maxV = _maxValueReduce->GetValueMaximum(oclEnvironment, fiBuffer, width, height, true);
//starting to paramter the next kernel
...
_kernel.setArg(8, maxV);
oclEnvironment._commandQueue.enqueueNDRangeKernel(_kernel, cl::NullRange, global, local);
And the GetValueMaximum(...) starts itself a reducing kernel to find that maximum.
e3:
// Runs the reduction kernel over `source` and returns the largest of the
// per-work-group results.
//
// oclEnvironment : supplies the command queue used for the launch and readback.
// source         : input buffer; its _size determines the global work size.
// width, height  : passed to the kernel and used to size the result buffer.
// sync           : when true, waits for the queue to finish after the launch.
//
// Assumes width * height > 0; with zero work-groups the result vector is empty
// and dereferencing the max_element iterator is undefined.
float OclMaxValueReduce::GetValueMaximum(OclEnvironment& oclEnvironment,
BufferContainer& source, int width, int height, const bool sync)
{
    // Work-group shape: 16 x 16 = 256 work-items, the maximum work-group size
    // of the Intel HD 530.
    const int dim1 = 16;
    const int dim2 = 16;
    cl::NDRange local(dim1, dim2, 1);
    cl::NDRange global(source._size.width, source._size.height, 1);

    // Each work-group reduces its data to a single element; those elements
    // are reduced on the host in the final step below.
    // NOTE(review): this count is derived from width * height, while the
    // launch uses source._size; the two agree only if they describe the same
    // extent and each dimension is a multiple of 16 — confirm.
    int numberOfWorkgroups = ceil((width * height) / (float)(dim1 * dim2));

    // Buffer receiving one float per work-group.
    BufferContainer result(oclEnvironment, cv::Size(numberOfWorkgroups, 1), sizeof(float));

    // Set the kernel arguments.
    _kernel.setArg(0, source.GetOclBuffer());
    _kernel.setArg(1, result.GetOclBuffer());
    _kernel.setArg(2, width);
    _kernel.setArg(3, height);
    oclEnvironment._commandQueue.enqueueNDRangeKernel(_kernel, cl::NullRange, global, local);
    if (sync)
        oclEnvironment._commandQueue.finish();

    // Read the partial results straight into a vector: no raw new/delete, so
    // nothing leaks if ReadBuffer throws, and no second copy is needed.
    std::vector<float> resultArray(numberOfWorkgroups);
    ReadBuffer(oclEnvironment, result.GetOclBuffer(), resultArray.data(), resultArray.size());

    // Final reduction step on the host.
    return *std::max_element(resultArray.begin(), resultArray.end());
}
and this calls the read buffer:
/* Read a float array from an OpenCL buffer.
 *
 * oclEnvironment : supplies the command queue used to map and unmap the buffer.
 *                  NOTE(review): taken by value, so the environment object is
 *                  copied on every call; the signature is kept because the
 *                  class declaration is not visible here.
 * resultBuffer   : buffer to read from.
 * dest           : host destination with room for at least `size` floats.
 * size           : number of floats to copy.
 *
 * Throws std::exception when the buffer cannot be mapped.
 */
void OclMaxValueReduce::ReadBuffer(OclEnvironment oclEnvironment, cl::Buffer
&resultBuffer, float* dest, const size_t size) {
    int errcode = CL_SUCCESS;
    // Blocking map (second argument true): the call returns once the mapped
    // region is readable on the host.
    float* resultData = (float*)oclEnvironment._commandQueue.enqueueMapBuffer(resultBuffer, true, CL_MAP_READ, 0, size * sizeof(float), 0, 0, &errcode);
    if (errcode)
        // The two-argument std::exception constructor (an MSVC extension)
        // stores the pointer without copying the text, so a string literal
        // is passed instead of a temporary std::string whose storage would
        // be gone by the time what() is called.
        throw std::exception("OclEnvironment::ReadBuffer: OCL could not map Buffer!", errcode);
    memcpy(dest, resultData, size * sizeof(float));
    // Unmap the buffer and wait until the unmap has completed.
    cl::Event testEvent;
    oclEnvironment._commandQueue.enqueueUnmapMemObject(resultBuffer, resultData, NULL, &testEvent);
    testEvent.wait();
}

Supplying a pointer renders my OpenCL code incorrect

My computer has a GeForce 1080Ti. With 11GB of VRAM, I don't expect a memory issue, so I'm at a loss to explain why the following breaks my code.
I execute the kernel on the host with this code.
cl_mem buffer = clCreateBuffer(context.GetContext(), CL_MEM_READ_WRITE, n * n * sizeof(int), NULL, &error);
error = clSetKernelArg(context.GetKernel(myKernel), 1, n * n, m1);
error = clSetKernelArg(context.GetKernel(myKernel), 0, sizeof(cl_mem), &buffer);
error = clEnqueueNDRangeKernel(context.GetCommandQueue(0), context.GetKernel(myKernel), 1, NULL, 10, 10, 0, NULL, NULL);
clFinish(context.GetCommandQueue(0));
error = clEnqueueReadBuffer(context.GetCommandQueue(0), buffer, true, 0, n * n * sizeof(int), results, 0, NULL, NULL);
results is a pointer to an n-by-n int array. m1 is a pointer to an n-by-n-bit array. The variable n is divisible by 8, so we can interpret the array as a char array.
The first ten values of the array are set to 1025 by the kernel (the value isn't important):
// Writes the constant 1025 into results at this work-item's global id
// (dimension 0).
__kernel void PopCountProduct (__global int *results)
{
results[get_global_id(0)] = 1025;
}
When I print out the result on the host, the first 10 indices are 1025. All is well and good.
Suddenly it stops working when I introduce an additional argument:
// Same kernel with an additional global char buffer argument `m`, which the
// body does not read; it still writes 1025 into results at this work-item's
// global id (dimension 0).
__kernel void PopCountProduct (__global int *results, __global char *m)
{
results[get_global_id(0)] = 1025;
}
Why is this happening? Am I missing something crucial about OpenCL?
You can't pass a host pointer to clSetKernelArg in OpenCL 1.2. Something similar can only be done in OpenCL 2.0+ with clSetKernelArgSVMPointer and an SVM pointer, if supported. Most probably, creating a buffer object on the GPU and copying the host memory into it is what you need.

OpenCL instantiating local memory array: invalid pointer error in kernel

I'm trying to create 2 local arrays for a kernel to use. My goal is to copy a global input buffer into the first array (arr1), and instantiate the second array (arr2) so its elements can be accessed and set later.
My kernel looks like this:
// Copies the global input buffer into local array arr1 and zeroes this
// work-item's slot in local array arr2.
//
// in, out       : global input and output buffers (out is not written in the
//                 lines shown).
// numIterations : not used in the lines shown.
// arr1, arr2    : local scratch arrays. The host must size each of them (via
//                 clSetKernelArg with a NULL value and a byte count) to hold
//                 at least get_global_size(0) * get_global_size(1) uchars,
//                 since that many elements are copied into arr1 and arr2 is
//                 indexed up to that bound.
// NOTE(review): arr2 is indexed by global id although __local memory is
// per work-group — confirm a single work-group launch is intended.
__kernel void do_things (__global uchar* in, __global uchar* out,
uint numIterations, __local uchar* arr1, __local uchar* arr2)
{
    size_t work_size = get_global_size(0) * get_global_size(1);
    // Pass 0 as the event argument so a fresh event is returned for this
    // copy, instead of handing in an uninitialised event_t.
    event_t event = async_work_group_copy(arr1, in, work_size, 0);
    wait_group_events(1, &event);
    int cIndex = (get_global_id(0) * get_global_size(1)) + get_global_id(1);
    arr2[cIndex] = 0;
    //Do other stuff later
}
In the C++ code I'm calling this from, I set the kernel arguments like this:
//Create input and output buffers
cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY |
CL_MEM_COPY_HOST_PTR, myInputVector.size(), (void*)
myInputVector.data(), NULL);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
myInputVector.size(), NULL, NULL);
//Set kernel arguments.
clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&inputBuffer));
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer));
clSetKernelArg(kernel, 2, sizeof(cl_uint), &iterations));
clSetKernelArg(kernel, 3, sizeof(inputBuffer), NULL));
clSetKernelArg(kernel, 4, sizeof(inputBuffer), NULL));
Where myInputVector is a vector full of uchars.
Then, I enqueue it with a 2D work size, rows * cols big. myInputVector has a size of rows * cols.
//Execute the kernel
size_t global_work_size[2] = { rows, cols }; //2d work size
status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL,
global_work_size, NULL, 0, NULL, NULL);
The problem is, I'm getting crashes when I run the kernel. Specifically, this line in the kernel:
arr2[cIndex] = 0;
is responsible for the crash (omitting it makes it so it doesn't crash anymore). The error reads:
*** glibc detected *** ./MyProgram: free(): invalid pointer: 0x0000000001a28fb0 ***
All I want is to be able to access arr2 alongside arr1. arr2 should be the same size as arr1. If that's the case, Why am I getting this bizarre error? Why is this an invalid pointer?
The issue is that you are allocating only sizeof(cl_mem) bytes for your local buffers, and a cl_mem is simply a typedef of some pointer type (therefore 4 to 8 bytes depending on your system).
What then happens in your kernel is that you access memory beyond the size of the local buffer you allocated, and the GPU raises a memory fault.
clSetKernelArg(kernel, 3, myInputVector.size(), NULL);
clSetKernelArg(kernel, 4, myInputVector.size(), NULL);
Should fix your problem. Also note that the size you are providing is the size in bytes so you would need to multiply by the sizeof of the vector element type (which is not clear from code).