OpenCl: sample float4 program - Segmentation fault (core dumped) - c++

It is simple program that read two float4 vectors from files then calculate sum of opposite numbers.
I couldn't find the problem:
MAIN file:
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <array>
#include <fstream>
#include <sstream>
#include <string>
#include <algorithm>
#include <iterator>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#include <time.h>
#endif
const int number_of_points = 16; // number of points in Both A and B files (number of rows)
const int number_of_axis = 4; // number of points axis in Both A and B files (number of Columns)
using namespace std;
void checkError(cl_int err, const char *operation)
{
if (err != CL_SUCCESS)
{
fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
exit(1);
}
}
int main(int argc, char *argv[]) {
clock_t tStart = clock();
// Create the two input vectors
// working variables
int i;
ifstream input_fileA, input_fileB; // input files
string line; // transfer row from file to array
float x; // transfer word from file to array
int row = 0; // number of rows of file A,B (= array)
int col = 0; // number of rows of file A,B (= array)
// working arrays
// working arrays
// int mem_size_TempA = number_of_points * number_of_axis * sizeof(cl_float);
// int mem_size_TempB = number_of_points * number_of_axis * sizeof(cl_float);
float tempAArray[number_of_points][number_of_axis]={{0}}; // array contains file A data
float tempBArray[number_of_points][number_of_axis]={{0}}; // array contains file B data
int mem_size_InputA = number_of_points * number_of_axis ;
int mem_size_InputB = number_of_points * number_of_axis ;
int mem_size_Output = number_of_points * number_of_axis ;
float *inputAArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file A data
float *inputBArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
float *outputArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data
// import input files
input_fileA.open(argv[1]);
input_fileB.open(argv[2]);
// transfer input files data to array
// input file A to arrayA
row = 0;
while (getline(input_fileA, line))
{
istringstream streamA(line);
col = 0;
while(streamA >> x){
tempAArray[row][col] = x;
col++;
}
row++;
}
// input file B to arrayB
row = 0;
while (getline(input_fileB, line))
{
istringstream streamB(line);
col = 0;
while(streamB >> x){
tempBArray[row][col] = x;
col++;
}
row++;
}
// switch columns of B array
for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++ )
{
float temporary = tempBArray[row_of_arrayB][2];
tempBArray[row_of_arrayB][2] = tempBArray[row_of_arrayB][1];
tempBArray[row_of_arrayB][1] = temporary;
}
// from Array to 3d vectors
// for (int row_of_array = 0; row_of_array<number_of_points; row_of_array++)
// {
// inputAArray[row_of_array] = (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2],0);
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
// }
for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
{
inputAArray[row_of_array*number_of_points+0] = tempAArray[row_of_array][0];
inputAArray[row_of_array*number_of_points+1] = tempAArray[row_of_array][1];
inputAArray[row_of_array*number_of_points+2] = tempAArray[row_of_array][2];
inputAArray[row_of_array*number_of_points+3] = 0.0f;
inputBArray[row_of_array*number_of_points+0] = tempBArray[row_of_array][0];
inputBArray[row_of_array*number_of_points+1] = tempBArray[row_of_array][1];
inputBArray[row_of_array*number_of_points+2] = tempBArray[row_of_array][2];
inputBArray[row_of_array*number_of_points+3] = tempBArray[row_of_array][3];
outputArray[row_of_array*number_of_points+0] = 0.0f;
outputArray[row_of_array*number_of_points+1] = 0.0f;
outputArray[row_of_array*number_of_points+2] = 0.0f;
outputArray[row_of_array*number_of_points+3] = 0.0f;
// inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0);
}
// for (int row_of_array=0; row_of_array < number_of_points; row_of_array++)
// {
// printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputAArray[row_of_array*number_of_points+0], inputAArray[row_of_array*number_of_points+1],
// inputAArray[row_of_array*number_of_points+2], inputAArray[row_of_array*number_of_points+3]);
// }
// close input files
input_fileA.close();
input_fileB.close();
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("calculate_bottom_SNM_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
fseek(fp, 0, SEEK_END);
size_t programLength = ftell(fp);
rewind(fp);
source_str = (char*)malloc(programLength+1);
source_size = fread( source_str, 1, programLength, fp);
source_str[programLength] = '\0';
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputA*sizeof(cl_float4) , NULL, &ret);
cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
mem_size_InputB*sizeof(cl_float4), NULL, &ret);
cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
mem_size_Output*sizeof(cl_float4), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0,
mem_size_InputA*sizeof(cl_float4), inputAArray, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0,
mem_size_InputB*sizeof(cl_float4), inputBArray, 0, NULL, NULL);
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret == CL_BUILD_PROGRAM_FAILURE)
{
// Get size of build log
size_t logSize;
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
0, NULL, &logSize);
checkError(ret, "getting build log size");
// Get build log
char log[logSize];
ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
logSize, log, NULL);
checkError(ret, "getting build log");
printf("OpenCL program build log:\n%s\n", log);
exit(1);
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj);
// Execute the OpenCL kernel on the list
size_t global_item_size = number_of_points; // Process the entire lists
size_t local_item_size = 4; // Process in groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
// Read the memory buffer C on the device to the local variable C
// int *C = (int*)malloc(sizeof(int)*number_of_points);
// float *C = (float*)malloc(sizeof(float)*number_of_points);
clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0,
mem_size_Output, outputArray, 0, NULL, NULL);
// Display the result to the screen
// float buttomSNM = 0;
// for(i = 0; i < number_of_points; i++)
// {
// for (int t=0; t<4; t++)
// {
// cout << "h" ;
//// printf("%f, \n", outputArray[i*number_of_points+t]);
// }
// }
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(inputa_mem_obj);
ret = clReleaseMemObject(inputb_mem_obj);
ret = clReleaseMemObject(output_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free (inputAArray);
free (inputBArray);
free (outputArray);
printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
return 0;
}
Kernel:
__kernel void calculate_bottom_SNM(__global float4 *inputAArray, __global float4 *inputBArray,
__global float4 *outputArray) {
// Get the index of the current element
int i = get_global_id(0);
int number_of_points = 16;
outputArray[i*number_of_points+0] = inputAArray[i*number_of_points+0] + inputBArray[i*number_of_points+0];
outputArray[i*number_of_points+1] = inputAArray[i*number_of_points+1] + inputBArray[i*number_of_points+1];
outputArray[i*number_of_points+2] = inputAArray[i*number_of_points+2] + inputBArray[i*number_of_points+2];
outputArray[i*number_of_points+3] = inputAArray[i*number_of_points+3] + inputBArray[i*number_of_points+3];
}
The first input files: A.txt
0 0.000000e+00 9.998994e-01
1 1.000000e-03 9.998981e-01
2 2.000000e-03 9.998967e-01
3 3.000000e-03 9.998953e-01
4 4.000000e-03 9.998939e-01
5 5.000000e-03 9.998925e-01
6 6.000000e-03 9.998911e-01
7 7.000000e-03 9.998896e-01
8 8.000000e-03 9.998881e-01
9 9.000000e-03 9.998865e-01
10 1.000000e-02 9.998850e-01
11 1.100000e-02 9.998834e-01
12 1.200000e-02 9.998817e-01
13 1.300000e-02 9.998800e-01
14 1.400000e-02 9.998783e-01
15 1.500000e-02 9.998766e-01
The second input file B:
0 0.000000e+00 9.998966e-01
1 1.000000e-03 9.998953e-01
2 2.000000e-03 9.998939e-01
3 3.000000e-03 9.998925e-01
4 4.000000e-03 9.998911e-01
5 5.000000e-03 9.998896e-01
6 6.000000e-03 9.998881e-01
7 7.000000e-03 9.998866e-01
8 8.000000e-03 9.998850e-01
9 9.000000e-03 9.998834e-01
10 1.000000e-02 9.998818e-01
11 1.100000e-02 9.998801e-01
12 1.200000e-02 9.998785e-01
13 1.300000e-02 9.998767e-01
14 1.400000e-02 9.998750e-01
15 1.500000e-02 9.998732e-01
Thanks in advance

You are computing your array indices in your kernel in a fairly strange manner:
i*number_of_points+0
i*number_of_points+1
i*number_of_points+2
i*number_of_points+3
Think about what this actually translates to for different values of i (assuming number_of_points=16):
i array indices (i*16 + (0,1,2,3))
--------------------------------------
0 0, 1, 2, 3
1 16, 17, 18, 19
2 32, 33, 34, 35
...
etc
This is surely not what you wanted! Your sample code appears to just be trying to perform a vectorised vector addition. If that's the case, your kernel code just needs to look something like this:
__kernel void vecadd(__global float4 *inputA,
__global float4 *inputB,
__global float4 *output)
{
int i = get_global_id(0);
output[i] = inputA[i] + inputB[i];
}
This works because were are performing the same operation to each element of the vector. If you have a kernel that needs to use these elements separately, you would write code like this:
float4 valueA = inputA[i];
float4 valueB = inputB[i];
float4 result;
result.x = valueA.x + valueB.x; // Do something with first component
result.y = valueA.y * valueB.y; // Do something with second component
result.z = valueA.z / valueB.z; // Do something with third component
result.w = valueA.w - valueB.w; // Do something with fourth component

Related

C++ OpenCL Abstraction Not giving desired result

I tried to do an basic abstraction for OpenCL Here it is:
OpenCLBuffer:
Header:
class OpenCLBuffer {
void* GetNativeID() { return obj; }
cl_mem obj;
cl_command_queue commandQueue;
cl_context context;
cl_int ret;
int type;
int maxSize;
int currSize;
};
Impl:
OpenCLBuffer::OpenCLBuffer(cl_context cont, cl_command_queue queue, cl_int t, unsigned int size)
{
context = cont;
commandQueue = queue;
maxSize = size;
type = t;
obj = clCreateBuffer(context, t, size, NULL, &ret);
}
OpenCLBuffer::~OpenCLBuffer()
{
ret = clReleaseMemObject(obj);
}
void OpenCLBuffer::SetData(int size, void* data, int offset)
{
currSize = size;
ret = clEnqueueWriteBuffer(commandQueue, obj, CL_TRUE, offset, size, data, 0, NULL, NULL);
}
void OpenCLBuffer::GetData(void* data, int size)
{
if (size == -1)
size = currSize;
ret = clEnqueueReadBuffer(commandQueue, obj, CL_TRUE, 0, size, data, 0, NULL, NULL);
}
OpenClContext:
Header:
class OpenCLContext {
// I removed the fuc definations from question as they are already in the Impl part
cl_platform_id plarformId;
cl_device_id deviceId;
cl_context context;
cl_uint numDevices;
cl_uint numPlatforms;
cl_command_queue commandQueue;
cl_int ret;
char name[1024];
};
ImPl:
static void _stdcall OpenCLErrorFunc(const char* errinfo, const void* private_info, size_t cb, void* user_data){
std::cout << "OpenCL (" << user_data << ") Error : \n" << errinfo << "\n";
}
OpenCLContext::OpenCLContext(std::string n)
{
ret = clGetPlatformIDs(1, &plarformId, &numPlatforms);
ret = clGetDeviceIDs(plarformId, CL_DEVICE_TYPE_DEFAULT, 1, &deviceId, &numDevices);
context = clCreateContext(NULL, 1, &deviceId, OpenCLErrorFunc, name, &ret);
commandQueue = clCreateCommandQueue(context, deviceId, 0, &ret);
memcpy_s(name, 1024, n.data(), std::min(1024, (int)n.size()));
}
OpenCLContext::~OpenCLContext()
{
for (std::pair<std::string, char*> data : sources) {
if (data.second)
delete data.second;
}
ret = clFlush(commandQueue);
ret = clReleaseCommandQueue(commandQueue);
ret = clReleaseContext(context);
}
OpenCLBuffer* OpenCLContext::CreateBuffer(void* data, int size, int type)
{
OpenCLBuffer* buffer = new OpenCLBuffer(context, commandQueue, type, size);
buffer->SetData(size, data);
return buffer;
}
OpenCLBuffer* OpenCLContext::CreateBuffer(int size, int type)
{
OpenCLBuffer* buffer = new OpenCLBuffer(context, commandQueue, type, size);
return buffer;
}
void OpenCLContext::AddProgram(std::string name, std::string source)
{
char* sc = new char[source.size()];
memcpy_s(sc, source.size(), source.data(), source.size());
sources[name] = sc;
int sourceSize = source.size();
programs[name] = clCreateProgramWithSource(context, 1, (const char**)&sc, (const size_t*)&sourceSize, &ret);
ret = clBuildProgram(programs[name], 1, &deviceId, NULL, NULL, NULL);
}
void OpenCLContext::MakeKernel(std::string programName, std::string kernelName)
{
kernels[kernelName] = clCreateKernel(programs[programName], kernelName.c_str(), &ret);
}
void OpenCLContext::SetKernelArg(std::string kernelName, int num, int size, void* arg)
{
ret = clSetKernelArg(kernels[kernelName], num, size, arg);
}
void OpenCLContext::ReleaseKernerl(std::string kernelName)
{
ret = clFlush(commandQueue);
ret = clReleaseKernel(kernels[kernelName]);
}
void OpenCLContext::ReleaseProgram(std::string programName)
{
ret = clFlush(commandQueue);
ret = clReleaseProgram(programs[programName]);
}
void OpenCLContext::Dispatch(std::string kernelName, int globalItemSize, int localItemSize)
{
ret = clEnqueueNDRangeKernel(commandQueue, kernels[kernelName], 1, NULL, (const size_t*)&globalItemSize, (const size_t*)&localItemSize, 0, NULL, NULL);
}
Driver Code:
std::string shadersrc = R"(
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
// Get the index of the current element to be processed
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
}
)";
const int LIST_SIZE = 1024;
int* A = (int*)malloc(sizeof(int) * LIST_SIZE);
int* B = (int*)malloc(sizeof(int) * LIST_SIZE);
for (int i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
context = new OpenCLContext("Vector Adder");
a = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_READ_ONLY);
b = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_READ_ONLY);
c = context->CreateBuffer(LIST_SIZE * sizeof(int), CL_MEM_WRITE_ONLY);
a->SetData(LIST_SIZE * sizeof(int), A);
b->SetData(LIST_SIZE * sizeof(int), B);
context->AddProgram("VectorAdderSrc", shadersrc);
context->MakeKernel("VectorAdderSrc", "vector_add");
context->SetKernelArg("vector_add", 0, sizeof(cl_mem), a->GetNativeID());
context->SetKernelArg("vector_add", 1, sizeof(cl_mem), b->GetNativeID());
context->SetKernelArg("vector_add", 2, sizeof(cl_mem), c->GetNativeID());
context->Dispatch("vector_add", LIST_SIZE, 64);
int* C = (int*)malloc(sizeof(int) * LIST_SIZE);
memset(C, 0, sizeof(int) * LIST_SIZE);
c->GetData(c, sizeof(int) * LIST_SIZE);
for (int i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
Sometimes i am getting Read Access Violation and sometimes:
0 + 1024 = 0
1 + 1023 = 0
2 + 1022 = 0
3 + 1021 = 0
...
Then crash.
Could you please help me find the problems?
First, few general tips to make debugging easier:
Always check return values. Every OpenCL API reports the errors via return value or a reference parameter.
At the first occurrence of an error you should stop the rest of the program as it will most likely not work. Throwing exceptions are a good strategy.
Specifically for OpenCL, the error codes are defined in the main header cl.h, you can find an code->string mapping routine here:Convenient way to show OpenCL error codes?
Regarding your code, the first error came from your function AddProgram. The function clCreateProgramWithSource returns CL_OUT_OF_HOST_MEMORY. Your decision to cast sourceSize from int* to size_t* is problematic since they are not the same size, and the API reads a corrupted 64-bit value.
Here is a better implementation:
void AddProgram(std::string name, std::string source)
{
const char* sc = source.c_str();
size_t sourceSize[1] = {source.size()};
programs[name] = clCreateProgramWithSource(context, 1, &sc, sourceSize, &ret);
ret = clBuildProgram(programs[name], 1, &deviceId, NULL, NULL, NULL);
}
There is no need to keep the source code in memory after it was compiled, but if you really want to, I suggest managing std::string objects because they save the hustle of managing memory.
The next problem is that clEnqueueNDRangeKernel returns CL_INVALID_WORK_GROUP_SIZE. Here you can find the int* to size_t* cast problem again, which passes bad arguments to the function.
Finally, your call to SetKernelArg returns CL_INVALID_MEM_OBJECT, and this is because the last argument, in case of OpenCL buffers, expected to be a pointer pointer to a cl_mem object (in your case - address of cl_mem that is returned by GetNativeID()).
Finally, there is a typo in the line c->GetData(c, sizeof(int) * LIST_SIZE); as it should be c->GetData(C, sizeof(int) * LIST_SIZE);
That should make it work. Please pay attention to the tips above, and avoid C casts in favor of C++ casts.

How to pass a pointer in a class to a kernel in opencl c++ and get pointer data

The platform i am using is AMD radeon 200 series (sapphire 250 (GPU- Oland)
I am creating a class on host having a pointer as data member, and same class on kernel side too.
now if in kernel side, pointer is present in the class, build error( -11) arises. if we remove pointer on kernel side, code builds up.
there, the pointer value is copied to a global pointer and dereferenced value comes to be zero.
My main objective is to access a pointer in a class on device side.
AMD SDK-3.0
opencl c++ version 1.2
any help would be appreciated.
class A
{
public:
int* ptr;
};
int main()
{
const int LIST_SIZE = 1;
int abc=20;
A *obj=new A;
obj->ptr= &abc;
printf("\nx=%d\n",*(obj->ptr));
int *A = new int[LIST_SIZE];
int *B = new int[LIST_SIZE];
cl_int z;
for(int i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
// Get available platforms
vector<Platform> platforms;
Platform::get(&platforms);
// Select the default platform and create a context using this platform and the GPU
cl_context_properties cps[3] = {
CL_CONTEXT_PLATFORM,
(cl_context_properties)(platforms[0])(),
0
};
Context context( CL_DEVICE_TYPE_GPU, cps,NULL,NULL,&z);
// Get a list of devices on this platform
vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
// Create a command queue and use the first device
CommandQueue queue = CommandQueue(context, devices[0],NULL,&z);
//Read source file
std::ifstream sourceFile("kernel.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
// Make program of the source code in the context
Program program = Program(context, source,&z);
// Build program for these specific devices
z=program.build(devices,"-x clc++",NULL,NULL);
if(z!=CL_SUCCESS){
cout<<"build"<<endl;return 1;}
// Make kernel
Kernel kernel(program, "vector_add",&z);
// Create memory buffers
Buffer bufferA = Buffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int),NULL,&z);
Buffer bufferB = Buffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int),NULL,&z);
Buffer bufferC = Buffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(int),NULL,&z);
Buffer bufferD = Buffer(context, CL_MEM_READ_WRITE, sizeof(obj),NULL,&z);
// Copy lists A and B to the memory buffers
z= queue.enqueueWriteBuffer(bufferA, CL_TRUE, 0, LIST_SIZE * sizeof(int), A,NULL,NULL);
if(z!=CL_SUCCESS){
cout<<"enqueue buff A"<<endl;return 1;}
z=queue.enqueueWriteBuffer(bufferB, CL_TRUE, 0, LIST_SIZE * sizeof(int), B,NULL,NULL);
if(z!=CL_SUCCESS){
cout<<"enqueue buffB"<<endl;return 1;}
z=queue.enqueueWriteBuffer(bufferD, CL_TRUE, 0, sizeof(obj), obj,NULL,NULL);
if(z!=CL_SUCCESS){
cout<<"enqueue buffB"<<endl;return 1;}
// Set arguments to kernel
z= kernel.setArg(0, bufferA);
if(z!=CL_SUCCESS){
cout<<"kerarg A"<<endl;return 1;}
z= kernel.setArg(1, bufferB);
if(z!=CL_SUCCESS){
cout<<"kerarg buff B"<<endl;return 1;}
z= kernel.setArg(2, bufferC);
if(z!=CL_SUCCESS){
cout<<"kerarg C"<<endl;return 1;}
z= kernel.setArg(3, bufferD);
if(z!=CL_SUCCESS){
cout<<"kerarg C"<<endl;return 1;}
// Run the kernel on specific ND range
NDRange global(LIST_SIZE);
NDRange local(1);
queue.enqueueNDRangeKernel(kernel, NullRange, global, local,NULL,NULL);
// Read buffer C into a local list
int *C = new int[LIST_SIZE];
queue.enqueueReadBuffer(bufferC, CL_TRUE, 0, LIST_SIZE * sizeof(int), C,NULL,NULL);
queue.enqueueReadBuffer(bufferD, CL_TRUE, 0, sizeof(obj), obj,NULL,NULL);
for(int i = 0; i < LIST_SIZE; i ++)
std::cout << A[i] << " + " << B[i] << " = " << C[i] << std::endl;
printf("\nx=%d\n",*(obj->ptr));
return 0;
}
and kernel code is
class A
{
public:
//int* ptr; //generates error in building
int ptr;
};
__kernel void vector_add(__global int *d,__global int *b,__global int *c,__global class A *obj)
{
size_t id=get_global_id(0);
c[id]=d[id]+b[id];
__global int *p=(__global int *)obj->ptr;
printf("kernel *p= %d p= %d obj->ptr= %d \n",*(p),p,obj->ptr);
}

how to fill the "data field" of wavfile

Hi i am trying to record from a board and i have successfully record 4 seconds. Problem is when i try to record for more time, i got an error telling me that there not enough memory. my target is to record a 5 minutes file. Until now i have create a buffer named snIn[256] where are the samples. i send it to a big buffer of [16K * 4sec] and when it is full, i create the wav file.
#include "SAI_InOut.hpp"
#include "F746_GUI.hpp"
#include "Delay.hpp"
#include "WaveformDisplay.hpp"
#include "SDFileSystem.h"
#include "wavfile.h"
using namespace Mikami;
#define RES_STR_SIZE 0x20
#define WAVFILE_SAMPLES_PER_SECOND 16000
#define REC_TIME 4
//Create an SDFileSystem object
SDFileSystem sd("sd");
bool flag = 1;
int count = 0;
char *res_buf;
int rp = 0;
const int NUM_SAMPLES = WAVFILE_SAMPLES_PER_SECOND * REC_TIME;
Array<int16_t> my_buffer(NUM_SAMPLES);
int j = 0;
static const char *target_filename = "/sd/rectest.wav";
const int SEG_SIZE = 256;
int sent_array = 0;
int rec(const char *filename, Array<int16_t> my_buffer)
{
j = 0;
flag = 0;
sent_array = 0;
WavFileResult result;
wavfile_info_t info;
wavfile_data_t data;
WAVFILE_INFO_AUDIO_FORMAT(&info) = 1;
WAVFILE_INFO_NUM_CHANNELS(&info) = 1;
WAVFILE_INFO_SAMPLE_RATE(&info) = WAVFILE_SAMPLES_PER_SECOND;
WAVFILE_INFO_BITS_PER_SAMPLE(&info) = 16;
WAVFILE_INFO_BYTE_RATE(&info) = WAVFILE_INFO_NUM_CHANNELS(&info) * WAVFILE_INFO_SAMPLE_RATE(&info) * (WAVFILE_INFO_BITS_PER_SAMPLE(&info) / 8);
WAVFILE_INFO_BLOCK_ALIGN(&info) = 2;
WAVFILE *wf = wavfile_open(filename, WavFileModeWrite, &result);
if (result != WavFileResultOK) {
wavfile_result_string(result, res_buf, RES_STR_SIZE);
printf("%s", res_buf);
return result;
} else printf ("Open file success \r\n");
rp = 0;
WAVFILE_DATA_NUM_CHANNELS(&data) = 1;
result = wavfile_write_info(wf, &info);
if (result != WavFileResultOK) {
wavfile_result_string(result, res_buf, RES_STR_SIZE);
printf("%s", res_buf);
return result; } else printf ("Write info success \r\n");
while ( rp < NUM_SAMPLES ) {
WAVFILE_DATA_CHANNEL_DATA(&data, 0) = my_buffer[rp];
result = wavfile_write_data(wf, &data);
rp += 1;
}
if (result != WavFileResultOK) {
wavfile_result_string(result, res_buf, RES_STR_SIZE);
printf("%s", res_buf);
return result; } else printf ("Write Data file success \r\n");
result = wavfile_close(wf);
if (result != WavFileResultOK) {
wavfile_result_string(result, res_buf , RES_STR_SIZE);
printf("%s", res_buf);
return result; } else printf ("Close file success \r\n");
//UnMount the filesystem
sd.unmount();
printf("Success rec !\r\n");
return 0;
}
int main()
{
//Mount the filesystem
sd.mount();
const float MAX_DELAY = 0.5f; // 最大遅延,単位:秒
const int FS = I2S_AUDIOFREQ_16K; // 標本化周波数: 16 kHz
const uint32_t MAX_ARRAY_SIZE = (uint32_t)(MAX_DELAY*FS);
SaiIO mySai(SaiIO::BOTH, 256, FS, INPUT_DEVICE_DIGITAL_MICROPHONE_2);
Label myLabel(185, 10, "Delay System", Label::CENTER, Font16);
// ButtonGroup: "ON", "OFF"
const uint16_t BG_LEFT = 370;
const uint16_t BG_WIDTH = 100;
const uint16_t BG_HEIGHT = 45;
ButtonGroup onOff(BG_LEFT, 40, BG_WIDTH/2, BG_HEIGHT,
2, (string[]){"ON", "OFF"}, 0, 0, 2, 1);
const uint16_t SB_LEFT = BG_LEFT - 320;
const uint16_t SB_WIDTH = 270;
const uint16_t SB_Y0 = 240;
char str[20];
sprintf(str, " %3.1f [s]", MAX_DELAY);
SeekBar barDelay(SB_LEFT, SB_Y0, SB_WIDTH,
0, MAX_ARRAY_SIZE, 0, "0", "", str);
NumericLabel<float> labelDelay(SB_LEFT+SB_WIDTH/2, SB_Y0-40, "DELEY: %4.2f", 0, Label::CENTER);
DelaySystem delaySystem(MAX_ARRAY_SIZE);
WaveformDisplay displayIn(*GuiBase::GetLcdPtr(), SB_LEFT+7, 70, 256, 9,LCD_COLOR_WHITE, LCD_COLOR_CYAN,GuiBase::ENUM_BACK);
Label inLabel(SB_LEFT-30, 65, "IN");
WaveformDisplay displayOut(*GuiBase::GetLcdPtr(), SB_LEFT+7, 130, 256, 9,LCD_COLOR_WHITE, LCD_COLOR_CYAN,GuiBase::ENUM_BACK);
Label outLabel(SB_LEFT-30, 125, "OUT");
int runStop = 1;
Array<int16_t> snIn(mySai.GetLength());
Array<int16_t> snOut(mySai.GetLength());
mySai.RecordIn();
mySai.PlayOut();
mySai.PauseOut();
while (true)
{
// On/OFF
int num;
if (onOff.GetTouchedNumber(num))
if (runStop != num)
{
if (num == 0) mySai.ResumeOut();
else mySai.PauseOut();
runStop = num;
}
if (mySai.IsCompleted())
{
for (int n=0; n<mySai.GetLength() ; n++)
{
int16_t xL, xR;
mySai.Input(xL,xR);
int16_t xn = xL + xR;
snIn[n] = xn;
my_buffer[j] = xn;
j++;
if (j == NUM_SAMPLES && flag == 1) {
rec (target_filename , my_buffer); }
int16_t yn = delaySystem.Execute(xn);
mySai.Output(yn, yn);
snOut[n] = yn;
}
mySai.Reset();
displayIn.Execute(snIn);
}
}
}
I thought about a possible solution, to fill directly the "data field" of the wavefile with the snIn[256] buffer (instead of using my_buffer) again and again and at the end close the wavfile. Please let me know what you think about that and other solutions
things to note: 1) while a write operation is being performed, more data is still coming in.
At the very least I would double buffer that data, so can be writing one buffer while the other one fills.
Usually this means using an interrupt to collect the samples (into which ever buffer is currently being filed.)
the foreground program waits for the current buffer to be 'full', then initiates write operation.,
then waits again for a buffer to be 'full'
The interrupt function tracks which buffer is being filled and the current index into that buffer. When a buffer is full, set a 'global' status to let the foreground program know which buffer is ready to be written.
The foreground program writes the buffer, then resets the status for that buffer.

C/CUDA: Only every fourth element in CudaArray can be indexed

This is my first post, so I am thrilled to get some new insights and enlarge my knowledge. Currently I am working on a C-project where a binary raw file with 3d-data is loaded, processed in CUDA and saved in a new binary raw file.
This is based on the simpleTexture3D project from CUDA Samples:
This is my cpp
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
// includes, cuda
#include <vector_types.h>
#include <driver_functions.h>
#include <cuda_runtime.h>
// CUDA utilities and system includes
#include <helper_cuda.h>
#include <helper_functions.h>
#include <vector_types.h>
typedef unsigned int uint;
typedef unsigned char uchar;
const char *sSDKsample = "simpleTexture3D";
const char *volumeFilename = "Bucky.raw";
const cudaExtent volumeSize = make_cudaExtent(32, 32, 32);
const uint width = 64, height = 64, depth=64;
//const char *volumeFilename = "TestOCT.raw";
//const cudaExtent volumeSize = make_cudaExtent(1024, 512, 512);
//
//const uint width = 1024, height = 512, depth=512;
const dim3 blockSize(8, 8, 8);
const dim3 gridSize(width / blockSize.x, height / blockSize.y, depth / blockSize.z);
uint *d_output = NULL;
int *pArgc = NULL;
char **pArgv = NULL;
extern "C" void cleanup();
extern "C" void initCuda(const uchar *h_volume, cudaExtent volumeSize);
extern "C" void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD);
void loadVolumeData(char *exec_path);
// render image using CUDA
void render()
{
// call CUDA kernel
render_kernel(gridSize, blockSize, d_output, width, height, depth);
getLastCudaError("render_kernel failed");
}
void cleanup()
{
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
checkCudaErrors(cudaDeviceReset());
}
// Load raw data from disk
uchar *loadRawFile(const char *filename, size_t size)
{
FILE *fp = fopen(filename, "rb");
if (!fp)
{
fprintf(stderr, "Error opening file '%s'\n", filename);
return 0;
}
uchar *data = (uchar *) malloc(size);
size_t read = fread(data, 1, size, fp);
fclose(fp);
printf("Read '%s', %lu bytes\n", filename, read);
return data;
}
// write raw data to disk
int writeRawFile(const char *filename, uchar *data, size_t size)
{
int returnState=0;
// cut file extension from filename
char *a=strdup(filename); //via strdup you dumb a const char to char, you must free it yourself
int len = strlen(a);
a[len-4] = '\0'; //deletes '.raw'
//printf("%s\n",a);
char b[50];
sprintf(b, "_%dx%dx%d_out.raw", width, height, depth);
//char b[]="_out.raw"; //Add suffix out to filename
char buffer[256]; // <- danger, only storage for 256 characters.
strncpy(buffer, a, sizeof(buffer));
strncat(buffer, b, sizeof(buffer));
free(a);
FILE *fp = fopen(buffer, "wb"); //Open or create file for writing as binary, all existing data is cleared
if (!fp)
{
fprintf(stderr, "Error opening or creating file '%s'\n", buffer);
return 0;
}
size_t write = fwrite(data, 1, size, fp);
fclose(fp);
if (write==size)
{
printf("Wrote %lu bytes to '%s'\n", write, buffer);
return 0;
}
else
{
printf("Error writing data to file '%s'\n", buffer);
return 1;
}
}
// General initialization call for CUDA Device
int chooseCudaDevice(int argc, char **argv)
{
int result = 0;
result = findCudaDevice(argc, (const char **)argv);
return result;
}
void runAutoTest(char *exec_path, char *PathToFile)
{
// set path
char *path;
if (PathToFile == NULL)
{
path = sdkFindFilePath(volumeFilename, exec_path);
}
else
{
path = PathToFile;
}
if (path == NULL)
{
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
exit(EXIT_FAILURE);
}
// Allocate output memory
checkCudaErrors(cudaMalloc((void **)&d_output, width*height*depth*sizeof(uchar)));
// zero out the output array with cudaMemset
cudaMemset(d_output, 0, width*height*depth*sizeof(uchar));
// render the volumeData
render_kernel(gridSize, blockSize, d_output, width, height, depth);
checkCudaErrors(cudaDeviceSynchronize());
getLastCudaError("render_kernel failed");
uchar *h_output = (uchar*)malloc(width*height*depth);
checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));
int wState=writeRawFile(path,h_output,width*height*depth);
checkCudaErrors(cudaFree(d_output));
free(h_output);
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
cudaDeviceReset();
//exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
}
void loadVolumeData(char *exec_path, char *PathToFile)
{
char *path;
// load volume data
if (PathToFile == NULL)
{
path = sdkFindFilePath(volumeFilename, exec_path);
}
else
{
path = PathToFile;
}
if (path == NULL)
{
fprintf(stderr, "Error unable to find 3D Volume file: '%s'\n", volumeFilename);
exit(EXIT_FAILURE);
}
size_t size = volumeSize.width*volumeSize.height*volumeSize.depth;
uchar *h_volume = loadRawFile(path, size);
//int wState=writeRawFile(path,h_volume,size);
initCuda(h_volume, volumeSize);
free(h_volume);
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char **argv)
{
pArgc = &argc;
pArgv = argv;
char *image_file = NULL;
printf("%s Starting...\n\n", sSDKsample);
if (checkCmdLineFlag(argc, (const char **)argv, "file")) //Note cmd line argument is -file "PathToFile/File.raw"
{ // for example -file "C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.0\2_Graphics\simpleTexture3D_FanBeamCorr\data\TestOCT_Kopie.raw"
getCmdLineArgumentString(argc, (const char **)argv, "file", &image_file);
}
if (image_file)
{
chooseCudaDevice(argc, argv);
loadVolumeData(argv[0],image_file);
runAutoTest(argv[0],image_file);
}
else
{
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
chooseCudaDevice(argc, argv);
loadVolumeData(argv[0],NULL);
runAutoTest(argv[0],NULL);
}
printf("I am finished...\n"
"Can I get some ice cream please\n");
exit(EXIT_SUCCESS);
}
And this is my .cu
#ifndef _SIMPLETEXTURE3D_KERNEL_CU_
#define _SIMPLETEXTURE3D_KERNEL_CU_
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <helper_cuda.h>
#include <helper_math.h>
typedef unsigned int uint;
typedef unsigned char uchar;
texture<uchar, 3, cudaReadModeNormalizedFloat> tex; // 3D texture
cudaArray *d_volumeArray = 0;
__global__ void
d_render(uint *d_output, uint imageW, uint imageH, uint imageD)
{
uint x = __umul24(blockIdx.x, blockDim.x) + threadIdx.x;
uint y = __umul24(blockIdx.y, blockDim.y) + threadIdx.y;
uint z = __umul24(blockIdx.z, blockDim.z) + threadIdx.z;
// float u = x / (float) imageW;
// float v = y / (float) imageH;
//float w = z / (float) imageD;
// // read from 3D texture
// float voxel = tex3D(tex, u, v, w);
uint ps=__umul24(imageW,imageH);
if ((x < imageW) && (y < imageH) && (z < imageD))
{
// write output color
uint i = __umul24(z,ps) +__umul24(y, imageW) + x;
d_output[1] = (uchar) 255;//+0*voxel*255;
}
}
extern "C"
void initCuda(const uchar *h_volume, cudaExtent volumeSize)
{
// create 3D array
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uchar>();
checkCudaErrors(cudaMalloc3DArray(&d_volumeArray, &channelDesc, volumeSize));
// copy data to 3D array
cudaMemcpy3DParms copyParams = {0};
copyParams.srcPtr = make_cudaPitchedPtr((void *)h_volume, volumeSize.width*sizeof(uchar), volumeSize.width, volumeSize.height);
copyParams.dstArray = d_volumeArray;
copyParams.extent = volumeSize;
copyParams.kind = cudaMemcpyHostToDevice;
checkCudaErrors(cudaMemcpy3D(&copyParams));
// set texture parameters
tex.normalized = true; // access with normalized texture coordinates
tex.filterMode = cudaFilterModeLinear; // linear interpolation
tex.addressMode[0] = cudaAddressModeBorder; // wrap texture coordinates
tex.addressMode[1] = cudaAddressModeBorder;
tex.addressMode[2] = cudaAddressModeBorder;
// bind array to 3D texture
checkCudaErrors(cudaBindTextureToArray(tex, d_volumeArray, channelDesc));
}
extern "C"
void render_kernel(dim3 gridSize, dim3 blockSize, uint *d_output, uint imageW, uint imageH, uint imageD)
{
d_render<<<gridSize, blockSize>>>(d_output, imageW, imageH, imageD);
}
#endif // #ifndef _SIMPLETEXTURE3D_KERNEL_CU_
As you can see, currently, I set all values to zero except the index = 1, which is set to 255. Yet when I now open the image stack in Fiji, I see that the fourth pixel on the first slide is white. If I use index=i instead, I get white vertical lines across the image stack periodically every four columns. Generally spoken, it seems that only every fourth element is beeing indexed in the CudaArray. So I am wondering if there is somekind of error here resulting from sizeof(uchar)=1 and sizeof(uint)=4. There would obviously be the factor 4 :)
I am eager to here from you experts
Cheers Mika
I figured it out by myself. The kernel works with uint* d_output while the copy to the host is written into a uchar* h_output
uchar *h_output = (uchar*)malloc(width*height*depth);
checkCudaErrors(cudaMemcpy(h_output, d_output, width*height*depth*sizeof(uchar), cudaMemcpyDeviceToHost));
This led to this strange behavior

Random123 generating random numbers for opencl using visual studio

http://www.thesalmons.org/john/random123/releases/1.00/docs/index.html
I have a hard time looking at the example for opencl and random123 as im new to OpenCL and i am not sure how I can use the provided information when im using Visual Studio 2010.
Anyone who can compose a guide for generating random numbers with the above lib and using visual studio 2010.
UPDATE:
I solved it as following and are now wondering how do I change the seed such i get random numbers at each run.
int main(int argc, char **argv)
{
const char *kernelname = "counthits";
unsigned count =10000;
cl_int err;
cl::Context cl_context;
cl::Program program;
cl::Kernel cl_kernel;
cl::Buffer cl_out;
cl::CommandQueue cl_queue;
size_t i, nthreads, hits_sz;
size_t cores, work_group_size;
cl_uint2 * hits_host;
double d = 0.; // timer
d = timer(&d);
progname = argv[0];
std::vector< cl::Platform > platformList;
CHECK(cl::Platform::get(&platformList));
CHECKERR( cl_context = createCLContext(CL_DEVICE_TYPE_GPU,cl_vendor::VENDOR_AMD, &err) );
std::vector<cl::Device> devices;
CHECKERR( devices = cl_context.getInfo<CL_CONTEXT_DEVICES>(&err) );
size_t length = 0;
const char * sourceStr = loadFileToString("pi_opencl_kernel.ocl","",&length);
cl::Program::Sources sources(1, std::make_pair(sourceStr, length));
program = cl::Program(cl_context, sources);
CHECK( program.build(devices,"-I D:\\libs\\Random123\\1.06\\include") );
CHECKERR(work_group_size = devices[0].getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>(&err) );
CHECKERR(cores = devices[0].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(&err) );
cores *= 16*4; //Tahiti.
if (work_group_size > 64) work_group_size /= 2;
nthreads = cores * work_group_size*32; //2048*128 = 262144
if (count == 0)
count = NTRIES/nthreads; //38
printf("Count: %lu\n",count);
hits_sz = nthreads * sizeof(hits_host[0]);//2097152
CHECKNOTZERO(hits_host = (cl_uint2 *)malloc(hits_sz));
CHECKERR ( cl_out = cl::Buffer( cl_context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, hits_sz, hits_host, &err));
CHECKERR ( cl_kernel = cl::Kernel(program,kernelname,&err) );
CHECK ( cl_kernel.setArg( 0, count) );
CHECK ( cl_kernel.setArg( 1, cl_out) );
CHECKERR (cl_queue = cl::CommandQueue(cl_context, devices[0], 0, &err) );
cl::Event event;
CHECK( cl_queue.enqueueNDRangeKernel(cl_kernel,cl::NullRange,cl::NDRange(nthreads), cl::NDRange(work_group_size), NULL, &event) );
event.wait();
CHECK( cl_queue.enqueueReadBuffer(cl_out, CL_TRUE, 0,hits_sz, hits_host) );
unsigned long hits = 0, tries = 0;
for (i = 0; i < nthreads; i++) {
#ifdef _DEBUG
printf("%lu %u %u\n", (unsigned long)i, hits_host[i].s[0], hits_host[i].s[1]);
#endif
hits += hits_host[i].s[0];
tries += hits_host[i].s[1];
}
return pi_check(hits, tries);
}
Kernel:
#include <Random123/threefry.h>
/*
* counthits generates 2*n x,y points and returns hits[tid] with
* the count of number of those points within the unit circle on
* each thread.
*/
__kernel void counthits(unsigned n, __global uint2 *hitsp) {
unsigned tid = get_global_id(0);
unsigned hits = 0, tries = 0;
threefry4x32_key_t k = {{tid, 0xdecafbad, 0xfacebead, 0x12345678}};
threefry4x32_ctr_t c = {{0, 0xf00dcafe, 0xdeadbeef, 0xbeeff00d}};
while (tries < n) {
union {
threefry4x32_ctr_t c;
int4 i;
} u;
c.v[0]++;
u.c = threefry4x32(c, k);
long x1 = u.i.x, y1 = u.i.y;
long x2 = u.i.z, y2 = u.i.w;
if ((x1*x1 + y1*y1) < (1L<<62)) {
hits++;
}
tries++;
if ((x2*x2 + y2*y2) < (1L<<62)) {
hits++;
}
tries++;
}
hitsp[tid].x = hits;
hitsp[tid].y = tries;
}
I haven't tested this, but roughly speaking, something like the following:
Try changing the signature of counthits to:
_kernel void counthits(unsigned n, __global uint2 *hitsp, unsigned seed)
Replace 0xdecafbad with seed
Add
char *seedstr = getenv("COUNTHITS_SEED");
unsigned seed = seedstr ? atoi(seedstr) : 0xdecafbad;
...
CHECK ( cl_kernel.setArg( 2, seed) );
to the main program (this setArg comes after setArg( 1, ...), and you can, of).