I tried to build my kernel code, but it doesn't work.
No error message or error code appears;
only the phrase
Could not open file: C:\Users\?
was written to the console.
By commenting out parts of the code, I found the location of the error.
Below is the part that causes the error.
err = clBuildProgram(program, 1, devices, "-cl-fast-relaxed-math", NULL, NULL);
CHECK_ERROR(err);
I think there is no problem with the kernel code, because when I deliberately introduced a syntax error into it, I received a different message that included an error code:
"Could not open file: C:\Users\?[D:\OpenCLProjects\Exam03\Exam02\Exam01.c:79] OpenCL error -11"
Below is my entire code.
/* Element-wise integer addition: work-item g stores A[g] + B[g] into C[g]. */
__kernel void vec_add(__global int* A, __global int* B, __global int* C) {
    const int gid = get_global_id(0);   /* index of this work-item in dimension 0 */
    const int sum = A[gid] + B[gid];
    C[gid] = sum;
}
#define _CRT_SECURE_NO_WARNINGS
#include <CL/cl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define NUM_OF_INT 16384
/*
 * Abort the program when an OpenCL call did not return CL_SUCCESS.
 *
 * Prints the source file, line and numeric error code, then exits with
 * EXIT_FAILURE. The body is wrapped in do { ... } while (0) so the macro
 * behaves like a single statement: "if (x) CHECK_ERROR(e); else ..." now
 * compiles and binds the else to the outer if. Note that `err` is still
 * evaluated more than once, so pass a plain variable, not a function call.
 */
#define CHECK_ERROR(err) \
    do { \
        if ((err) != CL_SUCCESS) { \
            printf("[%s:%d] OpenCL error %d\n", __FILE__, __LINE__, (int)(err)); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
char* get_source_code(const char* file_name, size_t * len);
int main()
{
cl_int err;
cl_uint num_platforms;
cl_platform_id* platforms;
cl_uint num_devices;
cl_device_id* devices;
cl_context context;
cl_command_queue queue;
cl_program program;
char* kernel_source;
size_t kernel_source_size;
cl_kernel kernel_vec_add;
cl_mem bufA, bufB, bufC;
err = clGetPlatformIDs(0, NULL, &num_platforms);
CHECK_ERROR(err);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
err = clGetPlatformIDs(num_platforms, platforms, NULL);
CHECK_ERROR(err);
//
size_t plat_name_size;
clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, 0, NULL, &plat_name_size);
char* plat_name = (char*)malloc(plat_name_size);
clGetPlatformInfo(platforms[0], CL_PLATFORM_NAME, plat_name_size, plat_name, NULL);
printf("%s\n",plat_name);
//
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
CHECK_ERROR(err);
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devices);
err = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
CHECK_ERROR(err);
//
size_t dev_name_size;
clGetDeviceInfo(devices[0], CL_DEVICE_NAME, 0, NULL, &dev_name_size);
char* dev_name = (char*)malloc(dev_name_size);
clGetDeviceInfo(devices[0], CL_DEVICE_NAME, dev_name_size, dev_name, NULL);
printf("%s\n",dev_name);
//
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &err);
CHECK_ERROR(err);
queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
CHECK_ERROR(err);
kernel_source = get_source_code("kernel.cl", &kernel_source_size);
program = clCreateProgramWithSource(context, 1, &kernel_source, &kernel_source_size, &err);
CHECK_ERROR(err);
err = clBuildProgram(program, 1, devices, "-cl-fast-relaxed-math", NULL, NULL);
CHECK_ERROR(err);
kernel_vec_add = clCreateKernel(program, "vec_add", &err);
CHECK_ERROR(err);
srand(time(NULL));
int* a = (int*)malloc(sizeof(int) * NUM_OF_INT);
int* b = (int*)malloc(sizeof(int) * NUM_OF_INT);
int* c = (int*)malloc(sizeof(int) * NUM_OF_INT);
for (int i = 0; i < NUM_OF_INT; i++) a[i] = rand() % (INT_MAX / 2);
for (int i = 0; i < NUM_OF_INT; i++) b[i] = rand() % (INT_MAX / 2);
bufA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * NUM_OF_INT, NULL, &err);
CHECK_ERROR(err);
bufB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * NUM_OF_INT, NULL, &err);
CHECK_ERROR(err);
bufC = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * NUM_OF_INT, NULL, &err);
CHECK_ERROR(err);
err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, sizeof(int) * NUM_OF_INT, a, 0, NULL, NULL);
CHECK_ERROR(err);
err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, sizeof(int) * NUM_OF_INT, b, 0, NULL, NULL);
CHECK_ERROR(err);
err = clSetKernelArg(kernel_vec_add, 0, sizeof(cl_mem), &bufA);
CHECK_ERROR(err);
err = clSetKernelArg(kernel_vec_add, 1, sizeof(cl_mem), &bufB);
CHECK_ERROR(err);
err = clSetKernelArg(kernel_vec_add, 2, sizeof(cl_mem), &bufC);
CHECK_ERROR(err);
size_t global_size = NUM_OF_INT;
size_t local_size = 1024;
err = clEnqueueNDRangeKernel(
queue, kernel_vec_add, 1, NULL,
&global_size, &local_size,
0, NULL, NULL);
CHECK_ERROR(err);
//clEnqueueCopyBuffer(queue, bufC, bufA, NULL, NULL, sizeof(int) * NUM_OF_INT, NULL, NULL, NULL);
clFinish(queue);
err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(int) * NUM_OF_INT, c, 0, NULL, NULL);
CHECK_ERROR(err);
//for (int i = 0; i < NUM_OF_INT; i++) printf("%d = %d+%d\n", c[i],a[i],b[i]);
printf("\n");
return 0;
}
/*
 * Read a whole source file into a newly allocated, NUL-terminated buffer.
 *
 * file_name  path of the file to read
 * len        receives the number of bytes stored (terminator not counted)
 *
 * Returns a malloc()'d buffer that the caller releases with free().
 * Any failure (cannot open, cannot size, out of memory, read error) prints a
 * diagnostic and terminates the process with EXIT_FAILURE.
 */
char* get_source_code(const char* file_name, size_t* len) {
    /* Binary mode keeps the size reported by ftell() equal to the number of
     * bytes fread() delivers. The previous text-mode version compensated by
     * subtracting one byte per '\n', which cut off the end of every file that
     * does not use CRLF line endings. */
    FILE* file = fopen(file_name, "rb");
    if (file == NULL) {
        printf("[%s:%d] FAiled to open %s\n", __FILE__, __LINE__, file_name);
        exit(EXIT_FAILURE);
    }

    long file_size = -1;
    if (fseek(file, 0, SEEK_END) == 0) {
        file_size = ftell(file);
    }
    if (file_size < 0) {
        printf("[%s:%d] Failed to determine the size of %s\n", __FILE__, __LINE__, file_name);
        fclose(file);
        exit(EXIT_FAILURE);
    }
    rewind(file);

    char* source_code = (char*)malloc((size_t)file_size + 1);
    if (source_code == NULL) {
        printf("[%s:%d] Out of memory while reading %s\n", __FILE__, __LINE__, file_name);
        fclose(file);
        exit(EXIT_FAILURE);
    }

    /* Trust the byte count fread() reports, not the size computed above. */
    size_t length = fread(source_code, 1, (size_t)file_size, file);
    if (ferror(file)) {
        printf("[%s:%d] Failed to read %s\n", __FILE__, __LINE__, file_name);
        free(source_code);
        fclose(file);
        exit(EXIT_FAILURE);
    }
    fclose(file);

    source_code[length] = '\0';
    *len = length;
    return source_code;
}
This is my full source code.
I found the reason!
If you are using Windows,
your Windows account name must consist of ASCII characters only (i.e. English letters).
If your account name contains non-ASCII (Unicode) characters, change your account name or make a new account.
Related
I'm following the next tutorial in order to run my first OpenCL program.
https://medium.com/#pratikone/opencl-on-visual-studio-configuration-tutorial-for-the-confused-3ec1c2b5f0ca
The result of summation, however, is not 1024 in my case, as it should be. The sum of two numbers equals 0 (Release) or -842150451 (Debug) in my case. That is, a part of the output looks like this:
1000+24 = 0
1001+23 = 0
1002+22 = 0
My display adapter is Nvidia Geforce 8400. The installation of CUDA SDK has also finished successfully.
Here are source files:
main.cpp
#include <stdio.h>
#include <stdlib.h>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
printf("started running\n");
// Create the two input vectors
int i;
const int LIST_SIZE = 1024;
int *A = (int*)malloc(sizeof(int)*LIST_SIZE);
int *B = (int*)malloc(sizeof(int)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = i;
B[i] = LIST_SIZE - i;
}
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
printf("kernel loading done\n");
// Get platform and device information
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
cl_int ret = clGetPlatformIDs(0, NULL, &ret_num_platforms);
cl_platform_id *platforms = NULL;
platforms = (cl_platform_id*)malloc(ret_num_platforms*sizeof(cl_platform_id));
ret = clGetPlatformIDs(ret_num_platforms, platforms, NULL);
printf("ret at %d is %d\n", __LINE__, ret);
ret = clGetDeviceIDs( platforms[1], CL_DEVICE_TYPE_ALL, 1,
&device_id, &ret_num_devices);
printf("ret at %d is %d\n", __LINE__, ret);
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
printf("ret at %d is %d\n", __LINE__, ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
printf("ret at %d is %d\n", __LINE__, ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(int), NULL, &ret);
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), A, 0, NULL, NULL);
printf("ret at %d is %d\n", __LINE__, ret);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), B, 0, NULL, NULL);
printf("ret at %d is %d\n", __LINE__, ret);
printf("before building\n");
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
printf("ret at %d is %d\n", __LINE__, ret);
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
printf("ret at %d is %d\n", __LINE__, ret);
printf("after building\n");
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
printf("ret at %d is %d\n", __LINE__, ret);
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
printf("ret at %d is %d\n", __LINE__, ret);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
printf("ret at %d is %d\n", __LINE__, ret);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
printf("ret at %d is %d\n", __LINE__, ret);
//added this to fix garbage output problem
//ret = clSetKernelArg(kernel, 3, sizeof(int), &LIST_SIZE);
printf("before execution\n");
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Divide work items into groups of 64
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, NULL);
printf("after execution\n");
// Read the memory buffer C on the device to the local variable C
int *C = (int*)malloc(sizeof(int)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(int), C, 0, NULL, NULL);
printf("after copying\n");
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%d + %d = %d\n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
return 0;
}
vector_add_kernel.cl
/* Adds two int vectors: the work-item with global id g writes C[g] = A[g] + B[g]. */
__kernel void vector_add(__global const int *A, __global const int *B, __global int *C) {
    // Position of the element this work-item is responsible for
    const int idx = get_global_id(0);

    // Compute the sum, then store it
    const int sum = A[idx] + B[idx];
    C[idx] = sum;
}
*another update in code and questions*
I started learning OpenCL about a week ago, and I tried to port a CUDA program that brute-forces an MD5 hash to recover the original string. I use two files: kernel.cl and main.cpp.
//this is kernel.cl
{...*defining some md5 variables*...}
/*
 * Adds incrementBy to the number stored in ourBrute, least significant digit
 * first, in base charSetLen. Each of the first bruteLength bytes is one digit;
 * a carry that is left after the last digit is discarded.
 */
void IncrementBruteGPU(unsigned char* ourBrute, unsigned int charSetLen, unsigned int bruteLength, unsigned int incrementBy){
    for (int pos = 0; incrementBy > 0 && pos < bruteLength; pos++)
    {
        /* digit plus pending carry */
        const int sum = incrementBy + ourBrute[pos];
        ourBrute[pos] = sum % charSetLen;   /* new digit */
        incrementBy = sum / charSetLen;     /* carry into the next digit */
    }
}
void md5_vfy(unsigned char* data, unsigned int length, unsigned int *a1, unsigned int *b1, unsigned int *c1, unsigned int *d1){
{...*some md5 hashing function*...}}
/*
 * Brute-force kernel: every work-item tests 200 candidate strings of length
 * bruteLength against the MD5 digest (v1, v2, v3, v4).
 *
 * numThreads   stride between two candidates of the same work-item
 * charSetLen   number of characters in cudaCharSet
 * bruteLength  length of the candidates (at most 14, the size of the buffers)
 * v1..v4       the four 32-bit words of the digest that is searched for
 * cudaBrute    14 digit indices describing the first candidate of this launch
 * cudaCharSet  character for each digit index
 * correctPass  14-byte output; receives the matching string if one is found
 *
 * Work-item idx starts at "cudaBrute + idx" and advances by numThreads after
 * every test.
 */
__kernel void crack(unsigned int numThreads, unsigned int charSetLen,
                    unsigned int bruteLength, unsigned int v1,
                    unsigned int v2, unsigned int v3, unsigned int v4,
                    __constant unsigned char *cudaBrute,
                    __constant unsigned char *cudaCharSet,
                    __global unsigned char *correctPass){
    //count index
    unsigned int idx = get_global_id(0);

    // word[] and ourBrute[] hold 14 bytes; a longer candidate would overflow them.
    if (bruteLength > 14)
        return;

    int totalLen = 0;
    int bruteStart = 0;
    unsigned char word[14];
    unsigned char ourBrute[14];
    int i = 0;
    for(i = 0; i < 14; i++)
    {
        ourBrute[i] = cudaBrute[i];
        // Zero the tail so that copying all 14 bytes below never exposes
        // uninitialised private memory and the result is zero-padded.
        word[i] = 0;
    }
    i = 0;
    bruteStart = i;
    i+= bruteLength;
    totalLen = i;

    IncrementBruteGPU(ourBrute, charSetLen, bruteLength, idx);

    int timer = 0;
    for(timer = 0; timer < 200; timer++)
    {
        //substitute into string
        for(i = 0; i < bruteLength; i++)
        {
            word[i+bruteStart] = cudaCharSet[ourBrute[i]];
        }
        unsigned int c1 = 0, c2 = 0, c3 = 0, c4 = 0;
        //find MD5 hash from word
        md5_vfy(word,totalLen, &c1, &c2, &c3, &c4);
        //compare hash with the input one
        if(c1 == v1 && c2 == v2 && c3 == v3 && c4 == v4)
        {
            //place the right string into first index of array
            int j;
            for(j= 0; j < 14; j++)
            {
                correctPass[j] = word[j];
            }
            // Terminate only when there is room: index 14 is past the buffer.
            if(totalLen < 14)
            {
                correctPass[totalLen] = 0;
            }
        }
        IncrementBruteGPU(ourBrute, charSetLen, bruteLength, numThreads);
    }
}
and this is the main:
//just the main, not the entire main.cpp
int main( int argc, char** argv){
int digit=1;
int charSetLen = 0;
char hash[32];
char *strhash[32];
printf("Insert Hash: ");
scanf("%s", strhash);
system("cls");
int numThreads = BLOCKS * THREADS_PER_BLOCK;
unsigned char currentBrute[14];
unsigned char cpuCorrectPass[14];
ZeroFill(currentBrute, 14);
ZeroFill(cpuCorrectPass, 14);
charSetLen = 65;
unsigned char charSet[65];
memcpy(charSet, " abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789#_", charSetLen);
memcpy(hash, strhash, 32);
//break hash into 4 processes of MD5
unsigned int v1, v2, v3, v4;
md5_to_ints(hash,&v1,&v2,&v3,&v4);
//openCL starts here
cl_platform_id cpPlatform; // OpenCL platform
cl_device_id device_id; // device ID
cl_context context; // context
cl_command_queue queue; // command queue
cl_program program; // program
cl_kernel kernel; // kernel
cl_int err;
cl_mem correctPass;
cl_mem cudaCharSet;
cl_mem cudaBrute;
size_t globalSize, localSize;
size_t bytes = 14*sizeof(char);
//5 work-groups
localSize = 10;
globalSize = 50;
// Bind to platform
err = clGetPlatformIDs(1, &cpPlatform, NULL);
if(err < 0) {
perror("Couldn't identify a platform");
exit(1);
}
// Get ID for the device
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if(err == CL_DEVICE_NOT_FOUND) {
err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL);
}
if(err < 0) {
perror("Couldn't access any devices");
exit(1);
}
// Create a context
context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
if(err < 0) {
perror("Couldn't create a context");
exit(1);
}
// Create a command queue
queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);
if(err < 0) {
perror("Couldn't create a command queue");
exit(1);
}
// Build the program executable
program = build_program(context, device_id, PROGRAM_FILE);
// Create the compute kernel in the program we wish to run
kernel = clCreateKernel(program, KERNEL_FUNC, &err);
if(err < 0) {
perror("Couldn't create a kernel");
exit(1);
}
// Create the input and output arrays in device memory for our calculation
cudaBrute = clCreateBuffer(context, CL_MEM_READ_ONLY, 14, NULL, NULL);
cudaCharSet = clCreateBuffer(context, CL_MEM_READ_ONLY, 95, NULL, NULL);
correctPass = clCreateBuffer(context, CL_MEM_READ_WRITE, 14, NULL, NULL);
// Write our data set into the input array in device memory
err = clEnqueueWriteBuffer(queue, correctPass, CL_TRUE, 0,
bytes, cpuCorrectPass, 0, NULL, NULL);
err = clEnqueueWriteBuffer(queue, cudaCharSet, CL_TRUE, 0,
bytes, charSet, 0, NULL, NULL);
// Set the arguments to our compute kernel
err = clSetKernelArg(kernel, 0, sizeof(unsigned int), &numThreads);
err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &charSetLen);
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &digit);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &v1);
err |= clSetKernelArg(kernel, 4, sizeof(unsigned int), &v2);
err |= clSetKernelArg(kernel, 5, sizeof(unsigned int), &v3);
err |= clSetKernelArg(kernel, 6, sizeof(unsigned int), &v4);
err |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &cudaBrute);
err |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &cudaCharSet);
err |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &correctPass);
bool finished = false;
int ct = 0;
while(true){
do{
err = clEnqueueWriteBuffer(queue, cudaBrute, CL_TRUE, 0,
bytes, currentBrute, 0, NULL, NULL);
// Execute the kernel over the entire range of the data set
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,
0, NULL, NULL);
// Wait for the command queue to get serviced before reading back results
clFinish(queue);
// Read the results from the device
clEnqueueReadBuffer(queue, correctPass, CL_TRUE, 0, bytes, cpuCorrectPass, 0, NULL, NULL );
if(cpuCorrectPass[0] != 0)
{
printf("MD5 Cracked---->\t");
int k = 0;
while(cpuCorrectPass[k] != 0)
{
printf("%c", cpuCorrectPass[k]);
k++;
}
printf("\n\n");
return 0;
}
finished = BruteIncrement(currentBrute, charSetLen, digit, numThreads * 200);
if(ct % OUTPUT_INTERVAL == 0)
{
printf("STATUS: ");
int k = 0;
for(k = 0; k < digit; k++)
printf("%c",charSet[currentBrute[k]]);
printf("\n");
}
ct++;
} while(!finished);
digit=digit+1;
}
// release OpenCL resources
clReleaseMemObject(correctPass);
clReleaseMemObject(cudaCharSet);
clReleaseMemObject(cudaBrute);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(queue);
clReleaseContext(context);
return 0;}
The problem with this program is that it never finds the right string. It seems that comparing the brute-force hashes with the input hash isn't working. The CUDA version works perfectly.
Please tell me what keeps this from running correctly. I suspect that either the kernel isn't working at all, or my lack of understanding of read/write memory and buffers in OpenCL (or in general) causes this.
*If you want to see all the files, please ask me; I think it would be too long if I posted them here.
Thanks in advance, and sorry for the bad formatting.
Your kernel is reading and writing from constant arrays defined at program scope in your OpenCL kernel source code (cudaBrute, cudaCharSet, correctPass). These arrays are not initialised, and the host will never be able to get the output from the kernel. To transfer input data from the host to a kernel and to retrieve results from a kernel, you need to use kernel arguments, not program scope variables.
Your kernel definition should look something like this:
__kernel void crack(unsigned int numThreads, unsigned int charSetLen,
unsigned int bruteLength, unsigned int v1,
unsigned int v2, unsigned int v3, unsigned int v4,
__global uchar *cudaBrute,
__global uchar *cudaCharSet,
__global uchar *correctPass)
{
...
(do stuff with the arguments)
...
}
To set the arguments from your host code, you would do something like this:
// Set the arguments to our compute kernel.
// The second parameter of each call is the argument's index: 0..6 are the
// scalar arguments, 7..9 are the cl_mem buffer handles.
err = clSetKernelArg(kernel, 0, sizeof(int), &numThreads);
err |= clSetKernelArg(kernel, 1, sizeof(int), &charSetLen);
err |= clSetKernelArg(kernel, 2, sizeof(int), &digit);
err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &v1);
err |= clSetKernelArg(kernel, 4, sizeof(unsigned int), &v2);
err |= clSetKernelArg(kernel, 5, sizeof(unsigned int), &v3);
err |= clSetKernelArg(kernel, 6, sizeof(unsigned int), &v4);
err |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &cudaBrute);
err |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &cudaCharSet);
err |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &correctPass);
Notice the second argument, which is the argument index in your kernel definition, and how for the last three arguments we are now passing in the buffer we created with clCreateBuffer.
(EDIT: A couple more issues were found after further debugging)
You are updating the value of digit on the host. In order to pass this updated value to the device for each kernel invocation, you need to re-set the kernel argument. You can do this simply by moving this line to just before your clEnqueueNDRangeKernel call:
err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &digit);
When you write data to the cudaCharSet buffer, you need to make sure you are writing the correct amount. Your code currently uses bytes (which is 14), but this should really be charSetLen (which is 65):
err = clEnqueueWriteBuffer(queue, cudaCharSet, CL_TRUE, 0,
charSetLen, charSet, 0, NULL, NULL);
UPDATE: clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(double), C, 0, NULL, NULL); is returning -5, CL_OUT_OF_RESOURCES. This function call should never return this!
I've started using OpenCL and have come across a problem. If I allow a for loop (in the kernel) to run 10000 times, all of C comes back as 0; if I allow the loop to run 8000 times, the results are all correct.
I have added waits around the kernel to ensure it completes, thinking I was pulling the data out before completion, and have tried both clWaitForEvents and clFinish. No errors are signalled by any of the calls. When I used ints, the for loop would work at a size of 4000000. Floats and doubles have the same problem; however, floats work at 10000 but not at 20000. When I used floats I removed #pragma OPENCL EXTENSION cl_khr_fp64 : enable to check that it wasn't the problem.
Is this some weird memory thing, or am I using OpenCL wrong? I realise that in most kernels I won't be implementing for loops like this, but this seems like an issue. I have also removed __private to see if that was the problem; no change. So is there a limit on the size of for loops in OpenCL kernels? Is it hardware specific? Or is this a bug?
The kernel is a simple one that adds two arrays (A+B) together and outputs another (C). In order to get a feel for performance, I put a for loop around each calculation to slow it down and increase the number of operations per run.
The code for the kernel is as follows:
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
/*
 * Adds two double vectors element by element. The same store is repeated
 * 10000 times per work-item, purely to make the kernel run longer.
 */
__kernel void vector_add(__global double *A, __global double *B, __global double *C)
{
    // Element handled by this work-item
    const int gid = get_global_id(0);

    // Repeat the addition to lengthen the run time
    __private unsigned int pass = 0;
    while (pass < 10000)
    {
        C[gid] = A[gid] + B[gid];
        pass++;
    }
}
The code I'm running is as follows: (I ensure that the variables are consistent between both pieces of code when I switch between float and double)
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#define MAX_SOURCE_SIZE (0x100000)
int main(void) {
// Create the two input vectors
int i;
const int LIST_SIZE = 4000000;
double *A = (double*)malloc(sizeof(double)*LIST_SIZE);
double *B = (double*)malloc(sizeof(double)*LIST_SIZE);
for(i = 0; i < LIST_SIZE; i++) {
A[i] = static_cast<double>(i);
B[i] = static_cast<double>(LIST_SIZE - i);
}
// Load the kernel source code into the array source_str
FILE *fp;
char *source_str;
size_t source_size;
fp = fopen("vector_add_kernel.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
fclose( fp );
// Get platform and device information
cl_platform_id platform_id = NULL;
cl_device_id device_id = NULL;
cl_uint ret_num_devices;
cl_uint ret_num_platforms;
// clGetPlatformIDs(1, &platform_id, NULL);
//clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, ret_num_devices);
cl_int ret = clGetPlatformIDs(1, &platform_id, NULL);
if (ret != CL_SUCCESS) {
printf("Error: Failed to get platforms! (%d) \n", ret);
return EXIT_FAILURE;
}
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
if (ret != CL_SUCCESS) {
printf("Error: Failed to query platforms to get devices! (%d) \n", ret);
return EXIT_FAILURE;
}
/*
cl_int ret = clGetPlatformIDs(1, &platform_id, NULL);
if (ret != CL_SUCCESS) {
printf("Error: Failed to get platforms! (%d) \n", ret);
return EXIT_FAILURE;
}
ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1,
&device_id, &ret_num_devices);
if (ret != CL_SUCCESS) {
printf("Error: Failed to query platforms to get devices! (%d) \n", ret);
return EXIT_FAILURE;
}
*/
// Create an OpenCL context
cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret);
// Create a command queue
cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
// Create memory buffers on the device for each vector
cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(double), NULL, &ret);
cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY,
LIST_SIZE * sizeof(double), NULL, &ret);
cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
LIST_SIZE * sizeof(double), NULL, &ret);
if (ret != CL_SUCCESS) {
printf("Error: Buffer Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
// Copy the lists A and B to their respective memory buffers
ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(double), A, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(double), B, 0, NULL, NULL);
std::cout << "Begin Compile" << "\n";
// Create a program from the kernel source
cl_program program = clCreateProgramWithSource(context, 1,
(const char **)&source_str, (const size_t *)&source_size, &ret);
if (ret != CL_SUCCESS) {
printf("Error: Program Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
// Build the program
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (ret != CL_SUCCESS) {
printf("Error: ProgramBuild Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
// Create the OpenCL kernel
cl_kernel kernel = clCreateKernel(program, "vector_add", &ret);
if (ret != CL_SUCCESS) {
printf("Error: Kernel Build Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
std::cout << "End Compile" << "\n";
std::cout << "Begin Data Move" << "\n";
// Set the arguments of the kernel
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj);
ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj);
std::cout << "End Data Move" << "\n";
// Execute the OpenCL kernel on the list
size_t global_item_size = LIST_SIZE; // Process the entire lists
size_t local_item_size = 64; // Process in groups of 64
std::cout << "Begin Execute" << "\n";
cl_event event;
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL,
&global_item_size, &local_item_size, 0, NULL, &event);
clFinish(command_queue);
//clWaitForEvents(1, &event);
std::cout << "End Execute" << "\n";
if (ret != CL_SUCCESS) {
printf("Error: Execute Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
// Read the memory buffer C on the device to the local variable C
std::cout << "Begin Data Move" << "\n";
double *C = (double*)malloc(sizeof(double)*LIST_SIZE);
ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0,
LIST_SIZE * sizeof(double), C, 0, NULL, NULL);
if (ret != CL_SUCCESS) {
printf("Error: Read Fail! (%d) \n", ret);
return EXIT_FAILURE;
}
clFinish(command_queue);
std::cout << "End Data Move" << "\n";
std::cout << "Done" << "\n";
std::cin.get();
// Display the result to the screen
for(i = 0; i < LIST_SIZE; i++)
printf("%f + %f = %f \n", A[i], B[i], C[i]);
// Clean up
ret = clFlush(command_queue);
ret = clFinish(command_queue);
ret = clReleaseKernel(kernel);
ret = clReleaseProgram(program);
ret = clReleaseMemObject(a_mem_obj);
ret = clReleaseMemObject(b_mem_obj);
ret = clReleaseMemObject(c_mem_obj);
ret = clReleaseCommandQueue(command_queue);
ret = clReleaseContext(context);
free(A);
free(B);
free(C);
std::cout << "Number of Devices: " << ret_num_devices << "\n";
std::cin.get();
return 0;
}
I've had a look on the internet and can't find people with similar problems, this is a concern as it could lead to code that works well till scaled up...
I'm running Ubuntu 14.04, and have a laptop graphics card for a RC520 which I run with bumblebee/optirun. If this bug isn't reproducible on other machines up to a loop size of 4000000 then I will log a bug with bumblebee/optirun.
Cheers
I found the issue: GPUs attached to displays (active VGA outputs, etc.) have a watchdog timer that times out after ~5 s. This is the case for cards other than Teslas, which allow this functionality to be turned off. Running on a secondary card is a workaround. This sucks and needs to be fixed ASAP. It's definitely an NVIDIA issue; I'm not sure about AMD. Either way, this is terrible.
Workarounds are registry changes in Windows and, in Linux/Ubuntu, altering the X conf and placing:
option "Interactive" "0"
in the section for the graphics card. However, the X conf is no longer generated in later versions and may have to be created manually. If anyone has a copy-and-paste console fix for this, that would be great and a better answer.
once my opencl kernel file exceeds a certain length, it is not correctly loaded anymore. The program build log (clBuildProgram) returns lots of errors, where it seems like there are cuts in the middle of a line (example int test; -> error unknown identifier 't').
Here is the function with which I load the program source:
/*
 * Load a whole file into a newly allocated, NUL-terminated string.
 *
 * filename  path of the file to read
 *
 * Returns the buffer (release it with free()), or 0 when the file cannot be
 * opened, sized or read, or when memory runs out.
 *
 * The terminator matters: the caller passes NULL as the "lengths" argument of
 * clCreateProgramWithSource, which makes OpenCL treat the source as a
 * NUL-terminated string. The file is opened in binary mode so that the size
 * reported by ftell() equals the number of bytes fread() delivers.
 */
char * load_program_source(const char *filename)
{
    FILE *fh;
    char* source;
    long lSize = -1;
    size_t bytesRead;

    fh = fopen(filename, "rb");
    if (fh == 0)
        return 0;

    //Get Filesize
    if (fseek(fh, 0, SEEK_END) == 0)
        lSize = ftell(fh);
    if (lSize < 0) {
        fclose(fh);
        return 0;
    }
    rewind(fh);

    // One extra byte for the terminator
    source = (char *) malloc((size_t)lSize + 1);
    if (source == 0) {
        fclose(fh);
        return 0;
    }

    bytesRead = fread(source, sizeof(char), (size_t)lSize, fh);
    if (ferror(fh)) {
        free(source);
        fclose(fh);
        return 0;
    }
    fclose(fh);

    // Terminate after the bytes that were actually read
    source[bytesRead] = '\0';
    return source;
}
And here is the code where the program is build:
// Load the kernel source from a file, create and build the OpenCL program, and
// append the outcome (error code, build status, build log) to textBox1.
cl_program program[1];
cl_kernel kernel[13];
const char * filename = "addKernel.c";
char *program_source = load_program_source(filename);
// NOTE(review): load_program_source returns 0 on failure and that is not
// checked here. The lengths argument below is NULL, so OpenCL expects
// program_source to be a NUL-terminated string.
program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
NULL, &err);
// Translate the error codes of program creation into messages.
if (err == CL_OUT_OF_HOST_MEMORY){
textBox1->Text += "Error: out of Host Memory!\r\n";
}
else if (err == CL_INVALID_CONTEXT){
textBox1->Text += "Error: invalid Context!\r\n";
}
else if (err == CL_INVALID_VALUE){
textBox1->Text += "Error: invalid Value!\r\n";
}
// Device list 0/NULL: build for all devices associated with the program.
err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
textBox1->Text += "Program build error: " + err + "\r\n";
cl_build_status status;
size_t logSize;
// Query the build status, then the log: first its size, then its text.
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
char* programLog;
// calloc zero-fills, so the extra byte keeps the log terminated.
programLog = (char*)calloc(logSize + 1, sizeof(char));
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_LOG, logSize + 1, programLog, NULL);
std::string tmp = std::string(programLog);
// NOTE(review): programLog and program_source are not freed in this fragment.
this->textBox1->Text += "Program build info: error=" + err + ", status=" + status + ", programLog:\r\n" + gcnew System::String(tmp.c_str()) + "\r\n" + "In case of an error please make sure that openCL has been initialized\r\n";
I would be happy if you could help me out!
Try the following code. If it doesn't help, attach your kernel source.
File reading:
/*
 * Read a complete file into a zero-initialised, NUL-terminated buffer.
 *
 * filename  path of the file to read
 *
 * Returns the buffer (release it with free()), or NULL when the file cannot
 * be opened or sized, is empty, cannot be read completely, or memory runs out.
 */
static char* Read_Source_File(const char *filename)
{
    long int size = -1;
    char *src = NULL;
    FILE *file = fopen(filename, "rb");

    if (!file)
        return NULL;

    if (fseek(file, 0, SEEK_END) == 0)
        size = ftell(file);

    /* ftell() reports -1 on error; an empty file is rejected as before. */
    if (size <= 0)
    {
        fclose(file);
        return NULL;
    }
    rewind(file);

    /* calloc zero-fills, so the extra byte is the string terminator. */
    src = (char *)calloc((size_t)size + 1, sizeof(char));
    if (src != NULL)
    {
        size_t res = fread(src, sizeof(char), (size_t)size, file);
        if (res != (size_t)size)
        {
            /* Short read: release the buffer and report failure with NULL.
             * (The earlier version returned the pointer it had just freed.) */
            free(src);
            src = NULL;
        }
    }

    fclose(file);
    return src;
}
Program building:
cl_int ret;
program = clCreateProgramWithSource(
context, 1, (const char**)&src_file, NULL, &ret);
if(ret != CL_SUCCESS){
fprintf(stderr, "Error with code %d happened.\n", ret);
}
// Warnings will be treated like errors, this is useful for debug
char build_params[] = {"-Werror"};
ret = clBuildProgram(program, 0, NULL, build_params, NULL, NULL);
if (ret != CL_SUCCESS)
{
size_t len = 0;
char *buffer;
clGetProgramBuildInfo(program,
device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
buffer = calloc(len, sizeof(char));
clGetProgramBuildInfo(program,
device_id, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
fprintf(stderr, "%s\n", buffer);
free(buffer);
}
Actually, the problem is that my program sometimes crashes with a SIGSEGV (segmentation fault), but not always. That is, it sometimes runs fine and sometimes breaks down. So I wonder whether it is because my C program uses a lot of memory. Does the resource limit change from run to run?
Hope for your reply, thanks.
The code is quite long, so I'll be glad to post whichever section you need.
But I have a piece of debug information, and I don't know whether it would be helpful for you:
[New Thread 0x7ffff7e63700 (LWP 31256)]
[New Thread 0x7ffff393f700 (LWP 31257)]
[New Thread 0x7ffff312c700 (LWP 31258)]
[New Thread 0x7ffff2919700 (LWP 31260)]
[New Thread 0x7ffff2106700 (LWP 31261)]
Detaching after fork from child process 31265.
Detaching after fork from child process 31266.
Program received signal SIGSEGV, Segmentation fault.
0x00007ffff708944a in _int_malloc () from /lib64/libc.so.6
As you can see, after several threads are created, malloc runs into problems. Could it be a problem of memory capacity?
And here is some of my code:
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <cmath>
#include <ctime>
#include <vector>
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include "compcol_double.h"
#include "comprow_double.h"
#include "coord_double.h"
#include "iohb_double.h"
#include "dehaze_set_opencl.h"
#include "default_set_opencl.h"
#include "load_image_opencl.h"
using namespace std;
// File-level configuration for the host program.
// Path of the OpenCL source file handed to CreateProgram() in main().
//relative path is where program is executing
const char *kernel_path = "dehaze.cl";
// NOTE(review): not referenced in the visible part of main(), which creates
// its kernels from string literals - confirm whether this is still needed.
const char *kernel_name = "dehaze";
// NOTE(review): no use of this constant is visible in this file section.
const int ARRAY_SIZE = 100;
/**
 * Host driver for the dehaze OpenCL pipeline.
 *
 * Picks the first platform and its default device, builds the program from
 * kernel_path, loads ./pic/Flowers.JPG into an image object and then runs
 * three kernels in order ("get_t_mat", "get_win_b", "get_vals"), reading
 * each result back to the host. The (vals, row_inds, col_inds) triplets of
 * the last kernel are handed to a Coord_Mat_double.
 *
 * All host-side arrays live in std::vector, so large images neither
 * overflow the stack nor leak heap memory. OpenCL objects are released on
 * the success path; on an error path the process simply exits with 1.
 *
 * @param argc unused
 * @param argv unused
 * @return 0 on success, 1 on any OpenCL or image-loading failure.
 */
int main(int argc, char **argv){
    (void)argc;
    (void)argv;

    //OpenCL program
    cl_device_id device_id = NULL;
    cl_context context = NULL;
    cl_command_queue command_queue = NULL;
    cl_program program = NULL;
    cl_kernel kernel = NULL;
    cl_platform_id platform_id = NULL;
    cl_uint ret_num_devices = 0;
    cl_uint ret_num_platforms = 0;
    cl_int ret = CL_SUCCESS;
    cl_int errNum = CL_SUCCESS;
    //Image
    cl_mem imageObjects[2] = {0,0};
    cl_sampler sampler = NULL;
    const char *image_path = "./pic/Flowers.JPG";

    //Get Platform and Device Info
    ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
    if (ret != CL_SUCCESS || ret_num_platforms == 0) {
        std::cerr << "Failed to find an OpenCL platform." << std::endl;
        return 1;
    }
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices);
    if (ret != CL_SUCCESS || ret_num_devices == 0) {
        std::cerr << "Failed to find an OpenCL device." << std::endl;
        return 1;
    }
    //Create OpenCL Context
    context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
    if (ret != CL_SUCCESS || context == NULL) {
        std::cerr << "Failed to create OpenCL context." << std::endl;
        return 1;
    }
    //Create Command Queue
    command_queue = clCreateCommandQueue(context, device_id, 0, &ret);
    if (ret != CL_SUCCESS || command_queue == NULL) {
        std::cerr << "Failed to create command queue." << std::endl;
        return 1;
    }
    //Create Program
    program = CreateProgram(context, device_id, kernel_path);
    if (program == NULL) {
        return 1;
    }

    // Make sure the device supports images, otherwise exit
    cl_bool imageSupport = CL_FALSE;
    clGetDeviceInfo(device_id, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool),
                    &imageSupport, NULL);
    if (imageSupport != CL_TRUE)
    {
        std::cerr << "OpenCL device does not support images." << std::endl;
        return 1;
    }

    // Now the Image Processor Kernel is loaded
    // Load input image from file and load it into
    // an OpenCL image object
    int width = 0, height = 0;
    imageObjects[0] = LoadImage(context, const_cast<char *>(image_path), width, height);
    if (imageObjects[0] == 0)
    {
        // Report the path that was actually opened; argv[1] may not exist.
        std::cerr << "Error loading: " << image_path << std::endl;
        return 1;
    }
    if (width <= 0 || height <= 0)
    {
        std::cerr << "Invalid image dimensions." << std::endl;
        return 1;
    }
    const size_t pixel_count = static_cast<size_t>(width) * static_cast<size_t>(height);

    // Create ouput image object
    cl_image_format clImageFormat;
    clImageFormat.image_channel_order = CL_RGBA;
    clImageFormat.image_channel_data_type = CL_UNORM_INT8;
    imageObjects[1] = clCreateImage2D(context,
                                      CL_MEM_WRITE_ONLY,
                                      &clImageFormat,
                                      width,
                                      height,
                                      0,
                                      NULL,
                                      &errNum);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating CL output image object." << std::endl;
        return 1;
    }

    // Create sampler for sampling image object
    sampler = clCreateSampler(context,
                              CL_FALSE, // Non-normalized coordinates
                              CL_ADDRESS_CLAMP_TO_EDGE,
                              CL_FILTER_NEAREST,
                              &errNum);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error creating CL sampler object." << std::endl;
        return 1;
    }

    //Create OpenCL kernel
    //Kernel1: calculate the t's value
    //t is the mainly matrix in this algorithm
    kernel = clCreateKernel(program, "get_t_mat", NULL);
    if(kernel == NULL){
        std::cerr<<"Failed to create kernel"<<std::endl;
        return 1;
    }
    // Heap storage: a stack array of width*height floats can overflow the stack.
    std::vector<float> t_mat(pixel_count, 0.0f);
    cl_mem t_buffer = clCreateBuffer(context,
                                     CL_MEM_READ_WRITE,
                                     sizeof(float) * pixel_count,
                                     NULL, NULL);
    if(t_buffer == NULL){
        std::cerr << "Error creating buffer" << std::endl;
        return 1;
    }
    // Set the kernel arguments
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imageObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &t_buffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width);
    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        return 1;
    }
    size_t localWorkSize[2] = { 16, 16 };
    size_t globalWorkSize[2] = { RoundUp(localWorkSize[0], width),
                                 RoundUp(localWorkSize[1], height) };
    // Queue the kernel up for execution
    errNum = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error queuing kernel for execution." << std::endl;
        return 1;
    }
    errNum = clEnqueueReadBuffer(command_queue,
                                 t_buffer,
                                 CL_TRUE, 0,
                                 pixel_count * sizeof(float),
                                 &t_mat[0],
                                 0, NULL, NULL);
    if( errNum!=CL_SUCCESS){
        std::cerr << "Error write back buffer" << std::endl;
        return 1;
    }
    // The blocking read above means the kernel has finished; drop it before
    // the handle is reused for the next kernel.
    clReleaseKernel(kernel);

    //Kernel2: calculate the win_b
    kernel = clCreateKernel(program, "get_win_b", NULL);
    if(kernel == NULL){
        std::cerr<<"Failed to create kernel"<<std::endl;
        return 1;
    }
    std::vector<float> win_b(pixel_count, 0.0f);
    cl_mem win_b_buffer = clCreateBuffer(context,
                                         CL_MEM_READ_WRITE,
                                         sizeof(float) * pixel_count,
                                         NULL, NULL);
    if(win_b_buffer == NULL){
        std::cerr << "Error creating buffer" << std::endl;
        return 1;
    }
    // Set the kernel arguments
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &t_buffer);
    //errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &imageObjects[1]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &win_b_buffer);
    //errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_int), &width);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &height);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        return 1;
    }
    // Queue the kernel up for execution
    errNum = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
                                    globalWorkSize, localWorkSize,
                                    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error queuing kernel for execution." << std::endl;
        return 1;
    }
    errNum = clEnqueueReadBuffer(command_queue,
                                 win_b_buffer,
                                 CL_TRUE, 0,
                                 pixel_count * sizeof(float),
                                 &win_b[0],
                                 0, NULL, NULL);
    if( errNum!=CL_SUCCESS){
        std::cerr << "Error write back buffer" << std::endl;
        return 1;
    }
    clReleaseKernel(kernel);
    std::cout << 1 << std::endl;

    //Kernel 3: vals
    const int neb_size = 9;
    kernel = clCreateKernel(program, "get_vals", NULL);
    if(kernel == NULL){
        std::cerr<<"Failed to create kernel"<<std::endl;
        return 1;
    }
    // Widen before multiplying so the product cannot overflow int.
    const long long tlen = static_cast<long long>(width) * height * neb_size * neb_size;
    const size_t entry_count = static_cast<size_t>(tlen);
    std::vector<double> vals(entry_count, 0.0);
    std::vector<int> row_inds(entry_count, 0);
    std::vector<int> col_inds(entry_count, 0);
    std::vector<int> indsM(pixel_count);
    for (size_t i = 0; i < pixel_count; i++)
        indsM[i] = static_cast<int>(i) + 1;
    // int test_size = 0;
    cl_mem vals_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                        sizeof(float) * entry_count, NULL, NULL);
    cl_mem row_inds_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                            sizeof(int) * entry_count, NULL, NULL);
    cl_mem col_inds_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                            sizeof(int) * entry_count, NULL, NULL);
    cl_mem indsM_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                         sizeof(int) * pixel_count, NULL, NULL);
    //cl_mem test_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE,
    // sizeof(float)*test_size, NULL, NULL);
    if(vals_buffer == NULL || row_inds_buffer == NULL
       || col_inds_buffer == NULL || indsM_buffer == NULL ){
        std::cerr << "Error creating buffer" << std::endl;
        return 1;
    }
    // The size argument is in bytes, so the element count must be scaled by
    // sizeof(int). A blocking write keeps indsM valid for the whole transfer.
    errNum = clEnqueueWriteBuffer( command_queue, indsM_buffer, CL_TRUE, 0,
                                   sizeof(int) * pixel_count, &indsM[0], 0, NULL, NULL);
    if(errNum != CL_SUCCESS){
        std::cerr<<"Error writing buffer"<<std::endl;
        return 1;
    }
    // Set the kernel arguments
    // Needs to be repaired
    errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imageObjects[0]);
    errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &indsM_buffer);
    errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler);
    errNum |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &vals_buffer);
    errNum |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &row_inds_buffer);
    errNum |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &col_inds_buffer);
    errNum |= clSetKernelArg(kernel, 6, sizeof(cl_int), &width);
    errNum |= clSetKernelArg(kernel, 7, sizeof(cl_int), &height);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error setting kernel arguments." << std::endl;
        return 1;
    }
    // Queue the kernel up for execution
    size_t t_localWorkSize[2] = { 1, 1 };
    size_t t_globalWorkSize[2] = { RoundUp(localWorkSize[0], width),
                                   RoundUp(localWorkSize[1], height) };
    errNum = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL,
                                    t_globalWorkSize, t_localWorkSize,
                                    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
        std::cerr << "Error queuing kernel for execution." << std::endl;
        return 1;
    }
    {
        // NOTE(review): vals_buffer is sized in floats, so the kernel is
        // assumed to write float values - confirm against dehaze.cl. They are
        // read into a float staging array and widened to double, instead of
        // copying raw float bytes into a double array.
        std::vector<float> vals_staging(entry_count, 0.0f);
        errNum = clEnqueueReadBuffer(command_queue, vals_buffer, CL_TRUE, 0,
                                     entry_count * sizeof(float), &vals_staging[0],
                                     0, NULL, NULL);
        if (errNum == CL_SUCCESS) {
            for (size_t i = 0; i < entry_count; i++)
                vals[i] = vals_staging[i];
        }
    }
    errNum |= clEnqueueReadBuffer(command_queue, row_inds_buffer, CL_TRUE, 0,
                                  entry_count * sizeof(int), &row_inds[0], 0, NULL, NULL);
    errNum |= clEnqueueReadBuffer(command_queue, col_inds_buffer, CL_TRUE, 0,
                                  entry_count * sizeof(int), &col_inds[0], 0, NULL, NULL);
    // cout << 1 << endl;
    if( errNum!=CL_SUCCESS){
        std::cerr << "Error write back buffer" << std::endl;
        return 1;
    }
    Coord_Mat_double SparseMat(width,height,tlen,&vals[0],&row_inds[0],&col_inds[0]);
    std::cout << SparseMat.dim(0) << std::endl;
    std::cout << width << std::endl;
    // Read the output buffer back to the Host
    /*
    char *buffer = new char [width * height * 4];
    size_t origin[3] = { 0, 0, 0 };
    size_t region[3] = { width, height, 1};
    errNum = clEnqueueReadImage(command_queue, imageObjects[1], CL_TRUE,
    origin, region, 0, 0, buffer,
    0, NULL, NULL);
    if (errNum != CL_SUCCESS)
    {
    std::cerr << "Error reading result buffer." << std::endl;
    return 1;
    }
    */
    //std::cout << std::endl;
    std::cout << "Executed program succesfully." << std::endl;
    //memset(buffer, 0xff, width * height * 4);
    // Save the image out to disk
    /*
    if (!SaveImage((char *) "out2.png", buffer, width, height))
    {
    std::cerr << "Error writing output image" << std::endl;
    delete [] buffer;
    return 1;
    }
    delete [] buffer;
    */

    // Release every OpenCL object created above (all reads were blocking,
    // so no command is still in flight).
    clReleaseKernel(kernel);
    clReleaseMemObject(indsM_buffer);
    clReleaseMemObject(col_inds_buffer);
    clReleaseMemObject(row_inds_buffer);
    clReleaseMemObject(vals_buffer);
    clReleaseMemObject(win_b_buffer);
    clReleaseMemObject(t_buffer);
    clReleaseSampler(sampler);
    clReleaseMemObject(imageObjects[1]);
    clReleaseMemObject(imageObjects[0]);
    clReleaseProgram(program);
    clReleaseCommandQueue(command_queue);
    clReleaseContext(context);
    return 0;
}
THX
you can use gdb.
compile all your source code with the -g flag.
from terminal run:
gdb <your program>
then in the gdb shell:
r <arguments>
Now wait for the SIGSEGV; when it occurs, type: where or: bt
it will show you the exact place in your code it was when it crashed.