I tried to write a basic abstraction for OpenCL. Here it is:
OpenCLBuffer:
Header:
// Thin wrapper around a single OpenCL memory object (cl_mem) together with the
// context and command queue used to create and access it.
class OpenCLBuffer {
// Returns the raw cl_mem handle, converted to void*.
// NOTE(review): clSetKernelArg expects the ADDRESS of a cl_mem variable, not the
// handle value itself, so callers must not pass this return value straight through.
void* GetNativeID() { return obj; }
// The underlying OpenCL buffer handle.
cl_mem obj;
// Queue on which reads and writes of this buffer are enqueued.
cl_command_queue commandQueue;
// Context the buffer was created in.
cl_context context;
// Status code of the most recent OpenCL call made by this object.
cl_int ret;
// Flags value the buffer was created with (passed to clCreateBuffer).
int type;
// Size in bytes requested at creation.
int maxSize;
// Size in bytes of the most recent SetData call.
int currSize;
};
Impl:
// Creates a device buffer of `size` bytes in context `cont`.
//
// cont  - OpenCL context that owns the buffer.
// queue - command queue later used by SetData/GetData.
// t     - memory flags forwarded to clCreateBuffer (e.g. CL_MEM_READ_ONLY).
// size  - buffer size in bytes.
//
// On failure `obj` is NULL and `ret` holds the OpenCL error code.
OpenCLBuffer::OpenCLBuffer(cl_context cont, cl_command_queue queue, cl_int t, unsigned int size)
    : obj(NULL),
      commandQueue(queue),
      context(cont),
      ret(CL_SUCCESS),
      type(t),
      maxSize(size),
      currSize(0)  // previously left uninitialised, yet GetData(-1) reads it
{
    obj = clCreateBuffer(context, t, size, NULL, &ret);
}
// Drops this wrapper's reference to the underlying cl_mem object; the status of
// the release call is recorded in `ret`.
OpenCLBuffer::~OpenCLBuffer()
{
    this->ret = clReleaseMemObject(this->obj);
}
void OpenCLBuffer::SetData(int size, void* data, int offset)
{
currSize = size;
ret = clEnqueueWriteBuffer(commandQueue, obj, CL_TRUE, offset, size, data, 0, NULL, NULL);
}
void OpenCLBuffer::GetData(void* data, int size)
{
if (size == -1)
size = currSize;
ret = clEnqueueReadBuffer(commandQueue, obj, CL_TRUE, 0, size, data, 0, NULL, NULL);
}
OpenClContext:
Header:
// Owns one OpenCL platform/device/context/command-queue set; the implementation
// section uses it to create buffers, build programs and dispatch kernels.
class OpenCLContext {
// Member function declarations are omitted here; their definitions appear in the implementation section.
// Platform selected by clGetPlatformIDs (identifier spelling kept as used by the implementation).
cl_platform_id plarformId;
// Device selected by clGetDeviceIDs.
cl_device_id deviceId;
// Context created for deviceId.
cl_context context;
// Device count reported by clGetDeviceIDs.
cl_uint numDevices;
// Platform count reported by clGetPlatformIDs.
cl_uint numPlatforms;
// Command queue created on deviceId.
cl_command_queue commandQueue;
// Status code of the most recent OpenCL call.
cl_int ret;
// Fixed-size storage for the context's name; also passed as user_data to the error callback.
char name[1024];
};
ImPl:
// Error callback registered with clCreateContext. Writes the user_data pointer
// value and the runtime-supplied error text to standard output.
static void _stdcall OpenCLErrorFunc(const char* errinfo, const void* private_info, size_t cb, void* user_data)
{
    std::ostream& out = std::cout;
    out << "OpenCL (";
    out << user_data;
    out << ") Error : \n";
    out << errinfo;
    out << "\n";
}
// Selects the first platform and its default device, then creates a context and
// a command queue on that device.
//
// n - human-readable name; stored (truncated to fit) in `name`, which is also
//     handed to the OpenCL error callback as user_data.
//
// On any failure `ret` keeps the failing status and the remaining handles stay
// NULL, so the destructor never touches uninitialised handles.
OpenCLContext::OpenCLContext(std::string n)
{
    // Fill `name` BEFORE it is given to clCreateContext, and always NUL-terminate
    // it: the original copied after context creation and left no terminator.
    const size_t nameLen = std::min<size_t>(sizeof(name) - 1, n.size());
    memcpy_s(name, sizeof(name), n.data(), nameLen);
    name[nameLen] = '\0';

    context = NULL;
    commandQueue = NULL;

    ret = clGetPlatformIDs(1, &plarformId, &numPlatforms);
    if (ret != CL_SUCCESS)
        return;
    ret = clGetDeviceIDs(plarformId, CL_DEVICE_TYPE_DEFAULT, 1, &deviceId, &numDevices);
    if (ret != CL_SUCCESS)
        return;
    context = clCreateContext(NULL, 1, &deviceId, OpenCLErrorFunc, name, &ret);
    if (ret != CL_SUCCESS)
        return;
    commandQueue = clCreateCommandQueue(context, deviceId, 0, &ret);
}
// Frees the cached program sources, then flushes and releases the command queue
// and the context.
OpenCLContext::~OpenCLContext()
{
    // AddProgram allocates each source with new[], so it must be freed with
    // delete[] (the original used scalar delete). Iterating by reference also
    // avoids copying the key string for every entry.
    for (auto& entry : sources) {
        delete[] entry.second;
        entry.second = NULL;
    }
    ret = clFlush(commandQueue);
    ret = clReleaseCommandQueue(commandQueue);
    ret = clReleaseContext(context);
}
// Allocates a new OpenCLBuffer of `size` bytes with flags `type` and immediately
// uploads `size` bytes from `data` into it. The caller receives the raw pointer.
OpenCLBuffer* OpenCLContext::CreateBuffer(void* data, int size, int type)
{
    OpenCLBuffer* const created = new OpenCLBuffer(context, commandQueue, type, size);
    created->SetData(size, data);
    return created;
}
// Allocates a new OpenCLBuffer of `size` bytes with flags `type`; no data is
// uploaded. The caller receives the raw pointer.
OpenCLBuffer* OpenCLContext::CreateBuffer(int size, int type)
{
    return new OpenCLBuffer(context, commandQueue, type, size);
}
void OpenCLContext::AddProgram(std::string name, std::string source)
{
char* sc = new char[source.size()];
memcpy_s(sc, source.size(), source.data(), source.size());
sources[name] = sc;
int sourceSize = source.size();
programs[name] = clCreateProgramWithSource(context, 1, (const char**)&sc, (const size_t*)&sourceSize, &ret);
ret = clBuildProgram(programs[name], 1, &deviceId, NULL, NULL, NULL);
}
void OpenCLContext::MakeKernel(std::string programName, std::string kernelName)
{
kernels[kernelName] = clCreateKernel(programs[programName], kernelName.c_str(), &ret);
}
void OpenCLContext::SetKernelArg(std::string kernelName, int num, int size, void* arg)
{
ret = clSetKernelArg(kernels[kernelName], num, size, arg);
}
void OpenCLContext::ReleaseKernerl(std::string kernelName)
{
ret = clFlush(commandQueue);
ret = clReleaseKernel(kernels[kernelName]);
}
void OpenCLContext::ReleaseProgram(std::string programName)
{
ret = clFlush(commandQueue);
ret = clReleaseProgram(programs[programName]);
}
void OpenCLContext::Dispatch(std::string kernelName, int globalItemSize, int localItemSize)
{
ret = clEnqueueNDRangeKernel(commandQueue, kernels[kernelName], 1, NULL, (const size_t*)&globalItemSize, (const size_t*)&localItemSize, 0, NULL, NULL);
}
Driver Code:
std::string shadersrc = R"(
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
// Get the index of the current element to be processed
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
}
)";
const int LIST_SIZE = 1024;
int* A = (int*)malloc(sizeof(int) * LIST_SIZE);
int* B = (int*)malloc(sizeof(int) * LIST_SIZE);
for (int i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
context = new OpenCLContext("Vector Adder");
a = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_READ_ONLY);
b = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_READ_ONLY);
c = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_WRITE_ONLY);
a->SetData(LIST_SIZE * sizeof(int), A);
b->SetData(LIST_SIZE * sizeof(int), B);
context->AddProgram("VectorAdderSrc", shadersrc);
context->MakeKernel("VectorAdderSrc", "vector_add");
context->SetKernelArg("vector_add", 0, sizeof(cl_mem), a->GetNativeID());
context->SetKernelArg("vector_add", 1, sizeof(cl_mem), b->GetNativeID());
context->SetKernelArg("vector_add", 2, sizeof(cl_mem), c->GetNativeID());
context->Dispatch("vector_add", LIST_SIZE, 64);
int* C = (int*)malloc(sizeof(int) * LIST_SIZE);
memset(C, 0, sizeof(int) * LIST_SIZE);
c->GetData(c, sizeof(int) * LIST_SIZE);
for (int i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
Sometimes I get a read access violation, and sometimes I get this output:
0 + 1024 = 0
1 + 1023 = 0
2 + 1022 = 0
3 + 1021 = 0
...
Then crash.
Could you please help me find the problems?
First, few general tips to make debugging easier:
Always check return values. Every OpenCL API reports the errors via return value or a reference parameter.
At the first occurrence of an error you should stop the rest of the program, as it will most likely not work. Throwing an exception is a good strategy.
Specifically for OpenCL, the error codes are defined in the main header cl.h; you can find a code-to-string mapping routine here: "Convenient way to show OpenCL error codes?"
Regarding your code, the first error came from your function AddProgram. The function clCreateProgramWithSource returns CL_OUT_OF_HOST_MEMORY. Your decision to cast sourceSize from int* to size_t* is problematic since they are not the same size, and the API reads a corrupted 64-bit value.
Here is a better implementation:
void AddProgram(std::string name, std::string source)
{
const char* sc = source.c_str();
size_t sourceSize[1] = {source.size()};
programs[name] = clCreateProgramWithSource(context, 1, &sc, sourceSize, &ret);
ret = clBuildProgram(programs[name], 1, &deviceId, NULL, NULL, NULL);
}
There is no need to keep the source code in memory after it has been compiled, but if you really want to, I suggest storing std::string objects, because they save you the hassle of managing memory yourself.
The next problem is that clEnqueueNDRangeKernel returns CL_INVALID_WORK_GROUP_SIZE. Here you can find the int* to size_t* cast problem again, which passes bad arguments to the function.
Next, your call to SetKernelArg returns CL_INVALID_MEM_OBJECT. This is because the last argument, in the case of OpenCL buffers, is expected to be a pointer to a cl_mem object (in your case, the address of the cl_mem that GetNativeID() returns), not the cl_mem value itself.
Finally, there is a typo in the line c->GetData(c, sizeof(int) * LIST_SIZE); as it should be c->GetData(C, sizeof(int) * LIST_SIZE);
That should make it work. Please pay attention to the tips above, and avoid C casts in favor of C++ casts.
Related
I am testing out cuda graphs. My graph is as follows.
the code for this is as follows
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>
#define NumThreads 20
#define NumBlocks 1
// Writes each thread's global index into buffer[index].
// Threads whose index is not below numElems do nothing, so launching more
// threads than elements no longer writes out of bounds (the original ignored
// numElems entirely).
template <typename PtrType>
__global__ void kernel1(PtrType *buffer, unsigned int numElems) {
  const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < numElems) buffer[tid] = static_cast<PtrType>(tid);
}
// Adds 5 to every element whose global index lies in the first half of the
// buffer (index < numElems/2); other threads leave the buffer untouched.
template <typename PtrType>
__global__ void kernel2(PtrType *buffer, unsigned int numElems) {
  const int globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
  if (globalIdx >= numElems / 2) return;
  buffer[globalIdx] += 5;
}
// Multiplies by 5 every element whose global index lies in the second half of
// the buffer (numElems/2 <= index < numElems).
// The upper bound is new: the original let threads past the end of the buffer
// modify out-of-range memory.
template <typename PtrType>
__global__ void kernel3(PtrType *buffer, unsigned int numElems) {
  const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid >= numElems / 2 && tid < numElems) buffer[tid] *= 5;
}
// Host callback: interprets `data` as an array of PtrType and prints its first
// NumThreads elements as "[v0,v1,...,]" followed by a newline.
template <typename PtrType>
void print(void *data) {
  const PtrType *values = static_cast<PtrType *>(data);
  std::cout << "[";
  unsigned int idx = 0;
  while (idx < NumThreads) {
    std::cout << values[idx] << ",";
    ++idx;
  }
  std::cout << "]\n";
}
// Instantiates `Graph` into `graphExec`, creates a non-blocking stream in
// `graphStream`, launches the graph on it and waits for completion.
//
// Each CUDA call is only attempted if the previous one succeeded. The first
// failure is reported on stderr (the original ignored every status code).
void runCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
                  cudaStream_t &graphStream) {
  cudaError_t status = cudaGraphInstantiate(&graphExec, Graph, nullptr, nullptr, 0);
  if (status == cudaSuccess)
    status = cudaStreamCreateWithFlags(&graphStream, cudaStreamNonBlocking);
  if (status == cudaSuccess)
    status = cudaGraphLaunch(graphExec, graphStream);
  if (status == cudaSuccess)
    status = cudaStreamSynchronize(graphStream);
  if (status != cudaSuccess)
    fprintf(stderr, "runCudaGraph failed: %s\n", cudaGetErrorString(status));
}
// Tears down everything created for the graph run: the executable graph, the
// graph itself and the stream, then resets the device.
// Return codes of the CUDA calls are not checked.
void destroyCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
cudaStream_t &graphStream) {
cudaCtxResetPersistingL2Cache();
// Destroy the instantiated (executable) graph before the graph it was built from.
cudaGraphExecDestroy(graphExec);
cudaGraphDestroy(Graph);
cudaStreamDestroy(graphStream);
// Called last, after the individual handles have been destroyed.
cudaDeviceReset();
}
// Builds the graph
//
//     kernel1 -> { kernel2, kernel3 } -> device-to-host memcpy -> host print
//
// Graph       - receives the newly created graph.
// graphExec   - unused here; kept for interface compatibility.
// graphStream - unused here; kept for interface compatibility.
// buffer      - device buffer of numElems elements the kernels operate on.
// numElems    - number of elements in buffer / hostBuffer.
// hostBuffer  - host array that receives the copy and is printed by the host node.
//               It must stay alive until the graph has finished executing.
template <typename PtrType>
void createCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
                     cudaStream_t &graphStream, PtrType *buffer,
                     unsigned int numElems, PtrType *hostBuffer) {
  cudaGraphCreate(&Graph, 0);

  // All three kernels share the same launch configuration and argument list;
  // only the function pointer differs, so the parameter block is set up once.
  void *inputs[2] = {static_cast<void *>(&buffer), static_cast<void *>(&numElems)};
  cudaKernelNodeParams nodeParams;
  memset(&nodeParams, 0, sizeof(nodeParams));
  nodeParams.gridDim = dim3(NumBlocks, 1, 1);
  nodeParams.blockDim = dim3(NumThreads / NumBlocks, 1, 1);
  nodeParams.sharedMemBytes = 0;
  nodeParams.kernelParams = inputs;
  nodeParams.extra = nullptr;

  // kernel1 has no dependencies.
  cudaGraphNode_t Kernel1;
  nodeParams.func = (void *)kernel1<PtrType>;
  cudaGraphAddKernelNode(&Kernel1, Graph, nullptr, 0, &nodeParams);

  // kernel2 and kernel3 both depend only on kernel1.
  cudaGraphNode_t Kernel2;
  nodeParams.func = (void *)kernel2<PtrType>;
  cudaGraphAddKernelNode(&Kernel2, Graph, &Kernel1, 1, &nodeParams);

  cudaGraphNode_t Kernel3;
  nodeParams.func = (void *)kernel3<PtrType>;
  cudaGraphAddKernelNode(&Kernel3, Graph, &Kernel1, 1, &nodeParams);

  // Copy the result to the host once both kernel2 and kernel3 are done.
  cudaGraphNode_t copyBuffer;
  std::vector<cudaGraphNode_t> dependencies = {Kernel2, Kernel3};
  cudaGraphAddMemcpyNode1D(&copyBuffer, Graph, dependencies.data(),
                           dependencies.size(), hostBuffer, buffer,
                           numElems * sizeof(PtrType), cudaMemcpyDeviceToHost);

  // Print on the host after the copy. Pass the array pointer itself: the
  // original passed &hostBuffer, the address of this function's parameter,
  // which is dead by the time the callback runs and is not the array anyway.
  cudaGraphNode_t Host1;
  cudaHostNodeParams hostNodeParams;
  memset(&hostNodeParams, 0, sizeof(hostNodeParams));
  hostNodeParams.fn = print<PtrType>;
  hostNodeParams.userData = static_cast<void *>(hostBuffer);
  cudaGraphAddHostNode(&Host1, Graph, &copyBuffer, 1, &hostNodeParams);
}
int main() {
cudaGraph_t graph;
cudaGraphExec_t graphExec;
cudaStream_t graphStream;
unsigned int numElems = NumThreads;
unsigned int bufferSizeBytes = numElems * sizeof(unsigned int);
unsigned int hostBuffer[numElems];
memset(hostBuffer, 0, bufferSizeBytes);
unsigned int *deviceBuffer;
cudaMalloc(&deviceBuffer, bufferSizeBytes);
createCudaGraph(graph, graphExec, graphStream, deviceBuffer,numElems, hostBuffer);
runCudaGraph(graph, graphExec, graphStream);
destroyCudaGraph(graph, graphExec, graphStream);
std::cout << "graph example done!" << std::endl;
}
When I run this example I get a result of
[3593293488,22096,3561843129,22096,3561385808,22096,3593293488,22096,3598681264,22096,3561792984,22096,2687342880,0,0,0,3598597376,22096,3598599312,0,]
However I expect:
[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
I can't figure out where I went wrong. I used cuda-gdb and it seems right on the GPU. However, somewhere in the memCpy and sending to host function it goes wrong. Any ideas?
You are not passing the correct pointer to the host callback.
void createCudaGraph(cudaGraph_t &Graph, cudaGraphExec_t &graphExec,
cudaStream_t &graphStream, PtrType *buffer,
unsigned int numElems, PtrType *hostBuffer) {
...
hostNodeParams.userData = (void *)&hostBuffer;
}
This takes the address of a stack variable which is no longer valid when the host function is called.
Since hostBuffer already points to the array you want to print, you can just pass this pointer directly.
hostNodeParams.userData = (void *)hostBuffer;
That aside, I would like to mention that there is a handy function
cudaGraphDebugDotPrint which can output a graph to file that can be converted to png to help with debugging. With your original code, it will show that the pointer used as memcpy destination and the pointer passed to the host function are different.
The platform i am using is AMD radeon 200 series (sapphire 250 (GPU- Oland)
I am creating a class on host having a pointer as data member, and same class on kernel side too.
now if in kernel side, pointer is present in the class, build error( -11) arises. if we remove pointer on kernel side, code builds up.
there, the pointer value is copied to a global pointer and dereferenced value comes to be zero.
My main objective is to access a pointer in a class on device side.
AMD SDK-3.0
opencl c++ version 1.2
any help would be appreciated.
// Host-side object holding a single publicly accessible int pointer.
struct A
{
    int* ptr;
};
int main()
{
const int LIST_SIZE = 1;
int abc=20;
A *obj=new A;
obj->ptr= &abc;
printf("\nx=%d\n",*(obj->ptr));
int *A = new int[LIST_SIZE];
int *B = new int[LIST_SIZE];
cl_int z;
for(int i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
// Get available platforms
vector<Platform> platforms;
Platform::get(&platforms);
// Select the default platform and create a context using this platform and the GPU
cl_context_properties cps[3] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platforms[0])(),
0
};
Context context( CL_DEVICE_TYPE_GPU, cps,NULL,NULL,&z);
// Get a list of devices on this platform
vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create a command queue and use the first device
CommandQueue queue = CommandQueue(context, devices[0],NULL,&z);
//Read source file
std::ifstream sourceFile("kernel.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
// Make program of the source code in the context
Program program = Program(context, source,&z);
// Build program for these specific devices
z=program.build(devices,"-x clc++",NULL,NULL);
if(z!=CL_SUCCESS){
cout<<"build"<<endl;return 1;}
// Make kernel
Kernel kernel(program, "vector_add",&z);
// Create memory buffers
Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int),NULL,&z);
Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int),NULL,&z);
Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(int),NULL,&z);
Buffer bufferD = Buffer(context, CL_MEM_READ_WRITE, sizeof(obj),NULL,&z);
// Copy lists A and B to the memory buffers
z= queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, LIST_SIZE * sizeof(int), A,NULL,NULL);
if(z!=CL_SUCCESS){
cout<<"enqueue buff A"<<endl;return 1;}
z=queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, LIST_SIZE * sizeof(int), B,NULL,NULL);
if(z!=CL_SUCCESS){
cout<<"enqueue buffB"<<endl;return 1;}
z=queue.enqueueWriteBuffer(bufferD, CL_TRUE, 0, sizeof(obj), obj,NULL,NULL);
if(z!=CL_SUCCESS){
cout<<"enqueue buffB"<<endl;return 1;}
// Set arguments to kernel
z= kernel.setArg(0, bufferA);
if(z!=CL_SUCCESS){
cout<<"kerarg A"<<endl;return 1;}
z= kernel.setArg(1, bufferB);
if(z!=CL_SUCCESS){
cout<<"kerarg buff B"<<endl;return 1;}
z= kernel.setArg(2, bufferC);
if(z!=CL_SUCCESS){
cout<<"kerarg C"<<endl;return 1;}
z= kernel.setArg(3, bufferD);
if(z!=CL_SUCCESS){
cout<<"kerarg C"<<endl;return 1;}
// Run the kernel on specific ND range
NDRange global(LIST_SIZE);
NDRange local(1);
queue.enqueueNDRangeKernel(kernel, NullRange, global, local,NULL,NULL);
// Read buffer C into a local list
int *C = new int[LIST_SIZE];
queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, LIST_SIZE * sizeof(int), C,NULL,NULL);
queue.enqueueReadBuffer(bufferD, CL_TRUE, 0, sizeof(obj), obj,NULL,NULL);
for(int i = 0; i < LIST_SIZE; i ++)
std::cout << A[i] << " + " << B[i] << " = " << C[i] << std::endl;
printf("\nx=%d\n",*(obj->ptr));
return 0;
}
and kernel code is
// Kernel-side counterpart of the host's class A.
// NOTE(review): the member here is an int, while the host class holds an int*;
// the two layouts only match if int and the host pointer have the same size — confirm.
class A
{
public:
//int* ptr; //generates error in building
int ptr;
};
// Element-wise addition c[id] = d[id] + b[id], plus a debug print of the
// value stored in obj->ptr.
__kernel void vector_add(__global int *d,__global int *b,__global int *c,__global class A *obj)
{
size_t id=get_global_id(0);
c[id]=d[id]+b[id];
// Reinterprets the int stored in obj->ptr as a global-memory pointer.
// NOTE(review): the host wrote a host address into this field; presumably that
// address is not valid on the device, so dereferencing p is not meaningful — confirm.
__global int *p=(__global int *)obj->ptr;
// NOTE(review): p is a pointer but is printed with %d.
printf("kernel *p= %d p= %d obj->ptr= %d \n",*(p),p,obj->ptr);
}
I am using an array of structs to store different binary data, from different clients. While I am debugging, I can do a few iterations with success (in memcpy). But at some point the debug crashes with "unhandled exception".
// Fixed-capacity append-only byte buffer; one instance per client.
struct Buffer {
    int size_ = 0;
    int capacity_total = 200000;  // usable bytes in data_
    int beg_index = 0;            // (missing ';' in the original made this ill-formed)
    int end_index = 0;            // number of bytes written so far
    char data_[200000];
} buffer_audio[3];

// Appends `nbytes` bytes from `data` to `buffers`.
//
// Returns the new end index (total bytes stored) on success, or 0 when nothing
// was written: `data` is null, `nbytes` is not positive, the buffer's indices
// are inconsistent, or the chunk does not fit in the remaining space.
// Prints "full" when the write fills the buffer exactly.
int writing_bufer(Buffer& buffers, const char *data, int nbytes) {
    // A negative nbytes would pass the capacity test below and then be
    // converted to a huge size_t by memcpy, so reject it up front.
    if (data == nullptr || nbytes <= 0) return 0;

    // Never trust capacity_total beyond the real size of the storage array.
    const int storage = static_cast<int>(sizeof(buffers.data_));
    const int capacity = buffers.capacity_total < storage ? buffers.capacity_total : storage;

    // Guard against a corrupted write position.
    if (buffers.end_index < 0 || buffers.end_index > capacity) return 0;
    if (nbytes > capacity - buffers.end_index) return 0;

    memcpy(buffers.data_ + buffers.end_index, data, nbytes);
    buffers.end_index += nbytes;
    if (buffers.end_index == capacity) printf("full");
    return buffers.end_index;
}
The buffer is never full, or even close to it.
The full routine:
// V8/Node binding: reads the numeric arguments of the JavaScript call, forwards
// the chunk to buffering_mem and returns a fixed status string to JavaScript.
// Part of the body is elided in this excerpt (the "(...)" line), including the
// declarations of `buf` and `bufferObj`.
void buffering(const FunctionCallbackInfo<v8::Value>& args) {
Isolate* isolate = Isolate::GetCurrent();
HandleScope scope(isolate);
// Arguments 1..4 are converted from JavaScript numbers to int.
int size = args[1]->NumberValue();
int final_read = args[2]->NumberValue();
// NOTE(review): client_id is later used to index buffer_audio; no range check is visible here.
int client_id = args[3]->NumberValue();
int nbytes = args[4]->NumberValue();
(...)
// Raw pointer to the bytes of the Node buffer object.
buf = node::Buffer::Data(bufferObj);
buffering_mem(buf,size, final_read, client_id,nbytes);
Local<String> devolve = String::NewFromUtf8(isolate, "buffering_com_sucesso");//C++--->JS
args.GetReturnValue().Set(devolve);
}
// Appends `size_chunk` bytes of `chunk` to the audio buffer of client `client`.
// NOTE(review): `client` indexes buffer_audio directly; an index outside the
// array would corrupt memory — verify against the caller.
// The rest of the body is elided in this excerpt.
void buffering_mem(char* chunk,int size_chunk, int close_file, int client, int total_size){
int check_bytes = writing_bufer(buffer_audio[client], chunk, size_chunk);
//other code}
You are copying the wrong amount in your code:
memcpy(buffers.data_ + buffers.end_index, data, buffers.end_index+nbytes);
That should be just
memcpy(buffers.data_ + buffers.end_index, data, nbytes);
With the huge help of #JSF
// Buffers one chunk for client `client`, or, when close_file == 3, dumps the
// first `total_size` buffered bytes of that client to the file "buffer_inteiro".
//
// chunk / size_chunk - data to append (ignored when close_file == 3).
// close_file         - 3 requests the dump; any other value appends.
// client             - index into buffer_audio; out-of-range values are ignored.
// total_size         - number of bytes to dump, clamped to the buffer's storage size.
void buffering_mem(char* chunk,int size_chunk, int close_file, int client, int total_size){
    // An out-of-range client would index past buffer_audio and corrupt memory.
    const int client_count = static_cast<int>(sizeof(buffer_audio) / sizeof(buffer_audio[0]));
    if (client < 0 || client >= client_count) return;

    //old place of invocation
    if (close_file == 3) {
        fp0 = fopen("buffer_inteiro", "wb");
        if (fp0 == NULL) return;  // could not create the output file

        // Never write more bytes than the buffer actually holds.
        const int storage = static_cast<int>(sizeof(buffer_audio[client].data_));
        if (total_size < 0) total_size = 0;
        if (total_size > storage) total_size = storage;

        fwrite(buffer_audio[client].data_, 1, total_size, fp0);
        // fclose flushes and releases the handle; the original only flushed,
        // leaking one FILE per dump.
        fclose(fp0);
        fp0 = NULL;
        return;
    }
    writing_bufer(buffer_audio[client], chunk, size_chunk);
}
It is simple program that read two float4 vectors from files then calculate sum of opposite numbers.
I couldn't find the problem:
MAIN file:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
const int number_of_points = 16; // number of points in Both A and B files (number of rows)
const int number_of_axis = 4; // number of points axis in Both A and B files (number of Columns)
using namespace std;
// Terminates the program with exit code 1 if `err` is not CL_SUCCESS, after
// printing the failed operation's name and the numeric error code to stderr.
void checkError(cl_int err, const char *operation)
{
  if (err == CL_SUCCESS)
    return;
  fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
  exit(1);
}
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of rows of file A,B (= array)
// working arrays
// working arrays
// int mem_size_TempA = number_of_points * number_of_axis * sizeof(cl_float);
// int mem_size_TempB = number_of_points * number_of_axis * sizeof(cl_float);
float tempAArray[number_of_points][number_of_axis]={{0}}; // array contains file A data
float tempBArray[number_of_points][number_of_axis]={{0}}; // array contains file B data
int mem_size_InputA = number_of_points * number_of_axis ;
int mem_size_InputB = number_of_points * number_of_axis ;
int mem_size_Output = number_of_points * number_of_axis ;
float *inputAArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file A data
float *inputBArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
float *outputArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
tempAArray[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
tempBArray[row][col] = x;
col++;
}
row++;
}
// switch columns of B array
for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
{
float temporary = tempBArray[row_of_arrayB][2];
tempBArray[row_of_arrayB][2] = tempBArray[row_of_arrayB][1];
tempBArray[row_of_arrayB][1] = temporary;
}
// from Array to 3d vectors
// for (int row_of_array = 0; row_of_array<number_of_points; row_of_array++)
// {
// inputAArray[row_of_array] = (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2],0);
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
// }
for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
{
inputAArray[row_of_array*number_of_points+0] = tempAArray[row_of_array][0];
inputAArray[row_of_array*number_of_points+1] = tempAArray[row_of_array][1];
inputAArray[row_of_array*number_of_points+2] = tempAArray[row_of_array][2];
inputAArray[row_of_array*number_of_points+3] = 0.0f;
inputBArray[row_of_array*number_of_points+0] = tempBArray[row_of_array][0];
inputBArray[row_of_array*number_of_points+1] = tempBArray[row_of_array][1];
inputBArray[row_of_array*number_of_points+2] = tempBArray[row_of_array][2];
inputBArray[row_of_array*number_of_points+3] = tempBArray[row_of_array][3];
outputArray[row_of_array*number_of_points+0] = 0.0f;
outputArray[row_of_array*number_of_points+1] = 0.0f;
outputArray[row_of_array*number_of_points+2] = 0.0f;
outputArray[row_of_array*number_of_points+3] = 0.0f;
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
}
// for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
// {
// printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputAArray[row_of_array*number_of_points+0], inputAArray[row_of_array*number_of_points+1],
// inputAArray[row_of_array*number_of_points+2], inputAArray[row_of_array*number_of_points+3]);
// }
// close input files
input_fileA.close();
input_fileB.close();
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
fseek(fp, 0, SEEK_END);
size_t programLength = ftell(fp);
rewind(fp);
source_str = (char*)malloc(programLength+1);
source_size = fread( source_str, 1, programLength, fp);
source_str[programLength] = '\0';
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputA*sizeof(cl_float4) , NULL, &ret);
cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputB*sizeof(cl_float4), NULL, &ret);
cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
mem_size_Output*sizeof(cl_float4), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
mem_size_InputA*sizeof(cl_float4), inputAArray, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
mem_size_InputB*sizeof(cl_float4), inputBArray, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret == CL_BUILD_PROGRAM_FAILURE)
{
// Get size of build log
size_t logSize;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
0, NULL, &logSize);
checkError(ret, "getting build log size");
// Get build log
char log[logSize];
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
logSize, log, NULL);
checkError(ret, "getting build log");
printf("OpenCL program build log:\n%s\n", log);
exit(1);
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 4; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
// float *C = (float*)malloc(sizeof(float)*number_of_points);
clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
mem_size_Output, outputArray, 0, NULL, NULL);
// Display the result to the screen
// float buttomSNM = 0;
// for(i = 0; i < number_of_points; i++)
// {
// for (int t=0; t<4; t++)
// {
// cout << "h" ;
//// printf("%f, \n", outputArray[i*number_of_points+t]);
// }
// }
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(inputa_mem_obj);
ret = clReleaseMemObject(inputb_mem_obj);
ret = clReleaseMemObject(output_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free (inputAArray);
free (inputBArray);
free (outputArray);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
Kernel:
// Adds two arrays of float4 values element by element:
//     outputArray[i] = inputAArray[i] + inputBArray[i]
// Each work-item handles exactly one float4, i.e. all four components of one point.
//
// The original indexed elements i*16+0 .. i*16+3. Because the pointers are
// already float4*, that touched four whole float4 values per work-item at a
// stride of 16 elements, running far past the end of the buffers.
__kernel void calculate_bottom_SNM(__global float4 *inputAArray, __global float4 *inputBArray,
                                   __global float4 *outputArray) {
  // Get the index of the current element
  int i = get_global_id(0);
  outputArray[i] = inputAArray[i] + inputBArray[i];
}
The first input files: A.txt
0 0.000000e+00 9.998994e-01
1 1.000000e-03 9.998981e-01
2 2.000000e-03 9.998967e-01
3 3.000000e-03 9.998953e-01
4 4.000000e-03 9.998939e-01
5 5.000000e-03 9.998925e-01
6 6.000000e-03 9.998911e-01
7 7.000000e-03 9.998896e-01
8 8.000000e-03 9.998881e-01
9 9.000000e-03 9.998865e-01
10 1.000000e-02 9.998850e-01
11 1.100000e-02 9.998834e-01
12 1.200000e-02 9.998817e-01
13 1.300000e-02 9.998800e-01
14 1.400000e-02 9.998783e-01
15 1.500000e-02 9.998766e-01
The second input file B:
0 0.000000e+00 9.998966e-01
1 1.000000e-03 9.998953e-01
2 2.000000e-03 9.998939e-01
3 3.000000e-03 9.998925e-01
4 4.000000e-03 9.998911e-01
5 5.000000e-03 9.998896e-01
6 6.000000e-03 9.998881e-01
7 7.000000e-03 9.998866e-01
8 8.000000e-03 9.998850e-01
9 9.000000e-03 9.998834e-01
10 1.000000e-02 9.998818e-01
11 1.100000e-02 9.998801e-01
12 1.200000e-02 9.998785e-01
13 1.300000e-02 9.998767e-01
14 1.400000e-02 9.998750e-01
15 1.500000e-02 9.998732e-01
Thanks in advance
You are computing your array indices in your kernel in a fairly strange manner:
i*number_of_points+0
i*number_of_points+1
i*number_of_points+2
i*number_of_points+3
Think about what this actually translates to for different values of i (assuming number_of_points=16):
i array indices (i*16 + (0,1,2,3))
--------------------------------------
0 0, 1, 2, 3
1 16, 17, 18, 19
2 32, 33, 34, 35
...
etc
This is surely not what you wanted! Your sample code appears to just be trying to perform a vectorised vector addition. If that's the case, your kernel code just needs to look something like this:
// Component-wise float4 addition: each work-item sums one element of inputA
// and inputB and stores the result at the same index of output.
__kernel void vecadd(__global float4 *inputA,
                     __global float4 *inputB,
                     __global float4 *output)
{
    const int gid = get_global_id(0);
    const float4 lhs = inputA[gid];
    const float4 rhs = inputB[gid];
    output[gid] = lhs + rhs;
}
This works because we are performing the same operation on each component of the vector. If you have a kernel that needs to use these components separately, you would write code like this:
float4 valueA = inputA[i];
float4 valueB = inputB[i];
float4 result;
result.x = valueA.x + valueB.x; // Do something with first component
result.y = valueA.y * valueB.y; // Do something with second component
result.z = valueA.z / valueB.z; // Do something with third component
result.w = valueA.w - valueB.w; // Do something with fourth component
http://www.thesalmons.org/john/random123/releases/1.00/docs/index.html
I am having a hard time following the OpenCL example for Random123, as I am new to OpenCL, and I am not sure how to use the provided information with Visual Studio 2010.
Could anyone write a guide for generating random numbers with the above library using Visual Studio 2010?
UPDATE:
I solved it as follows, and am now wondering how to change the seed so that I get different random numbers on each run.
// Estimates pi on an AMD GPU: builds the Random123-based "counthits" kernel,
// runs it over nthreads work-items, sums the per-thread hit/try counters and
// hands the totals to pi_check, whose result is the process exit code.
int main(int argc, char **argv)
{
    const char *kernelname = "counthits";
    unsigned count = 10000;
    cl_int err;
    cl::Context cl_context;
    cl::Program program;
    cl::Kernel cl_kernel;
    cl::Buffer cl_out;
    cl::CommandQueue cl_queue;
    size_t i, nthreads, hits_sz;
    size_t cores, work_group_size;
    cl_uint2 * hits_host;
    double d = 0.; // timer
    d = timer(&d);
    progname = argv[0];

    // Platform, context and device discovery.
    std::vector< cl::Platform > platformList;
    CHECK(cl::Platform::get(&platformList));
    CHECKERR( cl_context = createCLContext(CL_DEVICE_TYPE_GPU,cl_vendor::VENDOR_AMD, &err) );
    std::vector<cl::Device> devices;
    CHECKERR( devices = cl_context.getInfo<CL_CONTEXT_DEVICES>(&err) );

    // Load and build the kernel; the include path points at the Random123 headers.
    size_t length = 0;
    const char * sourceStr = loadFileToString("pi_opencl_kernel.ocl","",&length);
    cl::Program::Sources sources(1, std::make_pair(sourceStr, length));
    program = cl::Program(cl_context, sources);
    CHECK( program.build(devices,"-I D:\\libs\\Random123\\1.06\\include") );

    // Derive the launch size from the device's capabilities.
    CHECKERR(work_group_size = devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&err) );
    CHECKERR(cores = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&err) );
    cores *= 16*4; //Tahiti.
    if (work_group_size > 64) work_group_size /= 2;
    nthreads = cores * work_group_size*32; //2048*128 = 262144
    if (count == 0)
        count = NTRIES/nthreads; //38
    // `count` is unsigned, so it must be printed with %u; the original used %lu,
    // which is a format/argument mismatch where long is wider than int.
    printf("Count: %u\n",count);

    // One (hits, tries) pair per work-item, backed by host memory.
    hits_sz = nthreads * sizeof(hits_host[0]);//2097152
    CHECKNOTZERO(hits_host = (cl_uint2 *)malloc(hits_sz));
    CHECKERR ( cl_out = cl::Buffer( cl_context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, hits_sz, hits_host, &err));
    CHECKERR ( cl_kernel = cl::Kernel(program,kernelname,&err) );
    CHECK ( cl_kernel.setArg( 0, count) );
    CHECK ( cl_kernel.setArg( 1, cl_out) );
    CHECKERR (cl_queue = cl::CommandQueue(cl_context, devices[0], 0, &err) );

    // Launch, wait for completion, then read the counters back (blocking read).
    cl::Event event;
    CHECK( cl_queue.enqueueNDRangeKernel(cl_kernel,cl::NullRange,cl::NDRange(nthreads), cl::NDRange(work_group_size), NULL, &event) );
    event.wait();
    CHECK( cl_queue.enqueueReadBuffer(cl_out, CL_TRUE, 0,hits_sz, hits_host) );

    // Reduce the per-thread counters on the host.
    unsigned long hits = 0, tries = 0;
    for (i = 0; i < nthreads; i++) {
#ifdef _DEBUG
        printf("%lu %u %u\n", (unsigned long)i, hits_host[i].s[0], hits_host[i].s[1]);
#endif
        hits += hits_host[i].s[0];
        tries += hits_host[i].s[1];
    }
    return pi_check(hits, tries);
}
Kernel:
#include <Random123/threefry.h>
/*
* counthits generates 2*n x,y points and returns hits[tid] with
* the count of number of those points within the unit circle on
* each thread.
*/
// n     - minimum number of points each work-item must test.
// hitsp - output array; element [tid] receives (hits, tries) for work-item tid.
__kernel void counthits(unsigned n, __global uint2 *hitsp) {
unsigned tid = get_global_id(0);
unsigned hits = 0, tries = 0;
// The key's first word is the work-item id, so every work-item uses a different key.
// The remaining key and counter words are fixed constants.
threefry4x32_key_t k = {{tid, 0xdecafbad, 0xfacebead, 0x12345678}};
threefry4x32_ctr_t c = {{0, 0xf00dcafe, 0xdeadbeef, 0xbeeff00d}};
while (tries < n) {
// The union reinterprets the generator's four 32-bit output words as an int4.
union {
threefry4x32_ctr_t c;
int4 i;
} u;
// Advance the counter so each iteration produces a new block of four words.
c.v[0]++;
u.c = threefry4x32(c, k);
// Each iteration yields two points: (x1, y1) and (x2, y2).
long x1 = u.i.x, y1 = u.i.y;
long x2 = u.i.z, y2 = u.i.w;
// A point counts as a hit when x*x + y*y is below 2^62.
if ((x1*x1 + y1*y1) < (1L<<62)) {
hits++;
}
tries++;
if ((x2*x2 + y2*y2) < (1L<<62)) {
hits++;
}
tries++;
}
// tries advances by two per iteration, so it can end one above an odd n.
hitsp[tid].x = hits;
hitsp[tid].y = tries;
}
I haven't tested this, but roughly speaking, something like the following:
Try changing the signature of counthits to:
__kernel void counthits(unsigned n, __global uint2 *hitsp, unsigned seed)
Replace 0xdecafbad with seed
Add
char *seedstr = getenv("COUNTHITS_SEED");
unsigned seed = seedstr ? atoi(seedstr) : 0xdecafbad;
...
CHECK ( cl_kernel.setArg( 2, seed) );
to the main program (this setArg comes after setArg( 1, ...), and you can, of).