I am trying to make an Algorithm run on OpenCL. I am using this repository (Source.cpp) as a template. I now want to convert the whole program into type of long algorithm instead of float. But I always get an CL_INVALID_VALUE (-30) exception at
the second clEnqueueWriteBuffer. I have wasted hours without finding the error, so maybe I have overseen something obvious (I have not done too much with opencl yet..) ?
My code (not working)
#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>
//#define DATA_SIZE 1024
#define DATA_SIZE 1024
using namespace std;
//$ /f/Tools/OCL_SDK_Light/lib/x86_64/opencl.lib blelloch_scan.cpp
const char* ProgramSource =
"__kernel void add(__global long *input, __global long *output, __global long *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"printf('%d',thid); \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid+1] = input[2*thid+1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid + 1)-1; \n"\
"int bi = offset*(2*thid + 2)-1; \n"\
"temp[bi] += temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; \n"\
"long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid+1] = temp[2*thid+1]; \n"\
"}\n"\
"\n";
/*
*/
int main(void)
{
cl_context context;
cl_context_properties properties[3];
cl_command_queue command_queue;
cl_kernel kernel;
cl_program program;
cl_int err;
cl_uint num_platforms = 0;
cl_platform_id* platforms;
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
size_t global, loc;
std::cout << "Setup \n";
long arr[DATA_SIZE];
long inputDataA[DATA_SIZE];
long results[2 * DATA_SIZE];
long i;
for (i = 1; i < DATA_SIZE - 1;i++)
{
inputDataA[i-1] = (long)i;
arr[i-1] = (long)i;
}
clock_t ends;
/* --------------------- Get platform ---------------------*/
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/
/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
{
printf("could not find device id");
}
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and ouput
cl_int result;
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE , 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // ERROR HERE
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, 0, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
int temp = DATA_SIZE;
clock_t start = clock();
// set the argument list for the kernel command
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
global = DATA_SIZE; // num of processors
loc = 256;
printf("\n>> start parallel ---------- \n");
// enqueue the kernel command for execution
clResult = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
// copy the results from out of the output buffer
clResult = clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, results, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clFinish(command_queue);
ends = clock();
// print the results
int k = 1;
for (k = 1;k < 8; k++)
{
printf("%d - ", k);
printf("%d \n", results[k]);
}
double time_taken = ((double)(ends - start)) / CLK_TCK;
printf("\n>>finished parallel in %lf seconds\n", time_taken);
// cleanup - release OpenCL resources
printf("\n-------------------------------------\n");
/* -------sequential ------- */
printf("\n>> start sequential ---------- \n");
long prefixSum[DATA_SIZE] = { 0 };
const clock_t startSequential = clock();
prefixSum[0] = arr[0];
long idx = 1;
for (idx = 1; idx < DATA_SIZE; idx++) {
prefixSum[idx] = prefixSum[idx - 1] + arr[idx];
}
const clock_t endSequential = clock();
double seqTime = ((double)(endSequential - startSequential)) / CLK_TCK;
printf("\n>> finished sequential in %lf\n", seqTime);
for (int j = 0;j < 8; j++)
{
printf("%d - ", j);
printf("%d \n", prefixSum[j]);
}
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
The repository code (working):
#include <cassert>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <fstream>
#include<time.h>
#include <CL/cl.h>
#define DATA_SIZE 1024
using namespace std;
ofstream outfile;
const char* ProgramSource =
"__kernel void add(__global float *input, __global float *output, __global float *temp, int size){\n"\
"int thid = get_global_id(0); \n"\
"int offset = 1; \n"\
"temp[2*thid] = input[2*thid]; \n"\
"temp[2*thid+1] = input[2*thid+1]; \n"\
"for(int d= size>>1; d>0; d >>= 1){ \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d){ \n"\
"int ai = offset*(2*thid + 1)-1; \n"\
"int bi = offset*(2*thid + 2)-1; \n"\
"temp[bi] += temp[ai]; } \n"\
"offset = offset*2; \n"\
"} \n"\
"temp[size-1] = 0; \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"for(int d = 1; d<size; d *= 2){ \n"\
"offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"if(thid < d) { \n"\
"int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1; \n"\
"float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; } \n"\
"} \n"\
"barrier(CLK_GLOBAL_MEM_FENCE); \n"\
"output[2*thid] = temp[2*thid]; \n"\
"output[2*thid+1] = temp[2*thid+1]; \n"\
"}\n"\
"\n";
/*
*/
int main(void)
{
cl_uint num_platforms = 0;
cl_context context;
cl_context_properties properties[3];
cl_kernel kernel;
cl_platform_id* platforms;
cl_command_queue command_queue;
cl_program program;
cl_int err;
cl_uint num_of_platforms = 0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices = 0;
cl_mem inputA, inputB, output;
outfile.open("shubham.txt");
size_t global, loc;
float inputDataA[DATA_SIZE];
float results[2 * DATA_SIZE] = { 0 };
int i;
for (i = 0; i < DATA_SIZE;i++)
{
inputDataA[i] = (float)i;
}
clock_t start, ends;
/* --------------------- Get platform ---------------------*/
cl_int clResult = clGetPlatformIDs(0, NULL, &num_platforms);
assert(clResult == CL_SUCCESS);
platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id) * num_platforms);
clResult = clGetPlatformIDs(num_platforms, platforms, NULL);
assert(clResult == CL_SUCCESS);
/* --------------------- ------------ ---------------------*/
/* --------------------- Get devices ---------------------*/
cl_device_id* devices = NULL;
cl_uint num_devices;
clResult = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices);
assert(clResult == CL_SUCCESS);
devices = (cl_device_id*)malloc(sizeof(cl_device_id) * num_platforms);
if (clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, num_devices, devices, NULL) != CL_SUCCESS)
{
printf("could not find device id");
}
assert(clResult == CL_SUCCESS);
/* --------------------- ----------- ---------------------*/
properties[0] = CL_CONTEXT_PLATFORM;
properties[1] = 0;
cl_int contextResult;
context = clCreateContext(NULL, 1, &devices[0], NULL, NULL, &contextResult);
assert(contextResult == CL_SUCCESS);
// create command queue using the context and device
// create command queue using the context and device
command_queue = clCreateCommandQueueWithProperties(context, devices[0], 0, &err);
// create a program from the kernel source code
program = clCreateProgramWithSource(context, 1, (const char**)&ProgramSource, NULL, &err);
// compile the program
if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS)
{
printf("Error building program\n");
return 1;
}
// specify which kernel from the program to execute
kernel = clCreateKernel(program, "add", &err);
// create buffers for the input and ouput
inputA = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
inputB = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, NULL, NULL);
// load data into the input buffer
clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(float) * DATA_SIZE, inputDataA, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, 0, 0, NULL, NULL);
int temp = DATA_SIZE;
start = clock();
// set the argument list for the kernel command
clResult = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 2, sizeof(cl_mem), &inputB);
assert(clResult == CL_SUCCESS);
clResult = clSetKernelArg(kernel, 3, sizeof(int), &temp);
assert(clResult == CL_SUCCESS);
global = DATA_SIZE;
loc = 256;
// enqueue the kernel command for execution
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global, &loc, 0, NULL, NULL);
clFinish(command_queue);
// copy the results from out of the output buffer
clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sizeof(float) * DATA_SIZE, results, 0, NULL, NULL);
//clEnqueueReadBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(float) *16, shubh, 0, NULL, NULL);
// print the results
printf("output: ");
for (i = 0;i < 5; i++)
{
printf("%f \n", results[i]);
outfile << results[i] << " ";
}
ends = clock();
double time_taken = ((double)(ends - start)) / CLK_TCK;
outfile << endl << "Time taken is : " << time_taken << endl;
clReleaseMemObject(inputA);
clReleaseMemObject(inputB);
clReleaseMemObject(output);
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
return 0;
}
Thanks in Advance
I found your mistake. For me, it first wouldn't compile the OpenCL C code, so I debugged with
char info[1024];
clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 1024*sizeof(char), (void*)info, NULL); // print build log
printf(info);
to get the build log:
<kernel>:4:8: warning: multi-character character constant
printf('0',thid);
^
<kernel>:4:8: warning: incompatible integer to pointer conversion passing 'int' to parameter of type '__constant char *'
printf('190',thid);
^~~~
cl_kernel.h:4694:32: note: passing argument to parameter here
printf(constant char * restrict, ...) __asm("llvm.nvvm.internal.printf.cl");
Seems ' instead of \" was the issue. Change this line:
"printf(\"%d\",thid); \n"
Then the OpenCL C code compiles and I can reproduce the CL_INVALID_VALUE error.
Here is the issue: You use clEnqueueWriteBuffer to copy data from inputB to 0. You need to add C++ arrays to copy the data into:
long inputDataA[DATA_SIZE];
long inputDataB[DATA_SIZE];
long outputData[DATA_SIZE];
and
clResult = clEnqueueWriteBuffer(command_queue, inputA, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataA, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
clResult = clEnqueueWriteBuffer(command_queue, inputB, CL_TRUE, 0, sizeof(long) * DATA_SIZE, inputDataB, 0, NULL, NULL);
assert(clResult == CL_SUCCESS); // WORKS NOW
clResult = clEnqueueWriteBuffer(command_queue, output, CL_TRUE, 0, sizeof(long) * DATA_SIZE, outputData, 0, NULL, NULL);
assert(clResult == CL_SUCCESS);
Then it works, and I get this output:
Setup
0
>> start parallel ----------
25625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035101234567891011121314151617181920212223242526272829303138438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441564656667686970717273747576777879808182838485868788899091929394953523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823839697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612741641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863932333435363738394041424344454647484950515253545556575859606162634484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784798328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628635445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745751921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222234804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105118648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948955125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425431281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581598008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308315765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066072242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542559609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909916406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706711601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901917687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987997367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667679929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210236726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027039289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589597047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347358968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269271 - 0
2 - 1
3 - 2
4 - 4
5 - 6
6 - 9
7 - 12
>>finished parallel in 0.023000 seconds
-------------------------------------
>> start sequential ----------
>> finished sequential in 0.000000
0 - 1
1 - 3
2 - 6
3 - 10
4 - 15
5 - 21
6 - 28
7 - 36
Also note, that in OpenCL C, long = 64-bit integer, but in C++, long = "at least" 32-bit integer, for whatever stupid reason. In C++ you shoud use long long int as this is alwqays 64-bit integer. You can use for example typedef int64_t slong;, where int64_t itself is a typedef of long long int.
Another issue is that the program is not deterministic. When executed multiple times, I get a different result each time. There must be some race condition present. I suppose you wrongly assume that barrier(CLK_GLOBAL_MEM_FENCE); provides global synchronization of all threads, but this is not true. The only global synchronization is to split the kernel into multiple kernels at the desired synchronizatipon points and execute them one after the other.
Finally, to make OpenCL development in C++ much easier, and to prevent wasting hours on such simple errors, I have created a lightweight OpenCL-Wrapper to eliminate all of the OpenCL code overhead. With this, your code is 4x shorter and much easier to understand:
#include "opencl.hpp"
#define DATA_SIZE 1024
int main() {
Clock clock;
clock.start();
Device device(select_device_with_most_flops()); // compile OpenCL C code for the fastest available device
Memory<slong> arr(device, DATA_SIZE, 1u, true, false);
Memory<slong> inputA(device, DATA_SIZE);
Memory<slong> inputB(device, DATA_SIZE);
Memory<slong> output(device, DATA_SIZE);
for(int i=1; i<DATA_SIZE-1; i++) {
inputA[i-1] = (slong)i;
arr[i-1] = (slong)i;
}
inputA.write_to_device();
Kernel kernel(device, DATA_SIZE, "add", inputA, output, inputB);
kernel.add_constants(DATA_SIZE);
kernel.run();
output.read_from_device();
double time_taken = clock.stop();
// print the results
for(int k=1; k<8; k++) {
printf("%d - ", k);
printf("%d \n", output[k]);
}
printf("\n>>finished parallel in %lf seconds\n", time_taken);
printf("\n-------------------------------------\n");
printf("\n>> start sequential ---------- \n");
long prefixSum[DATA_SIZE] = { 0 };
clock.start();
prefixSum[0] = arr[0];
for(long idx=1; idx<DATA_SIZE; idx++) {
prefixSum[idx] = prefixSum[idx-1]+arr[idx];
}
double seqTime = clock.stop();
printf("\n>> finished sequential in %lf\n", seqTime);
for(int j=0; j<8; j++) {
printf("%d - ", j);
printf("%d \n", prefixSum[j]);
}
wait();
return 0;
}
#include "kernel.hpp" // note: unbalanced round brackets () are not allowed and string literals can't be arbitrarily long, so periodically interrupt with )+R(
string opencl_c_container() { return R( // ########################## begin of OpenCL C code ####################################################################
kernel void add(__global long* input, __global long* output, __global long* temp, int size) {
int thid = get_global_id(0);
int offset = 1;
printf("%d",thid);
temp[2*thid] = input[2*thid];
temp[2*thid+1] = input[2*thid+1];
for(int d= size>>1; d>0; d >>= 1) {
barrier(CLK_GLOBAL_MEM_FENCE);
if(thid < d) {
int ai = offset*(2*thid + 1)-1;
int bi = offset*(2*thid + 2)-1;
temp[bi] += temp[ai];
}
offset = offset*2;
}
temp[size-1] = 0;
barrier(CLK_GLOBAL_MEM_FENCE);
for(int d = 1; d<size; d *= 2) {
offset >>= 1; barrier(CLK_GLOBAL_MEM_FENCE);
if(thid < d) {
int ai = offset*(2*thid+1)-1; int bi = offset*(2*thid+2)-1;
long t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t;
}
}
barrier(CLK_GLOBAL_MEM_FENCE);
output[2*thid] = temp[2*thid];
output[2*thid+1] = temp[2*thid+1];
}
);} // ############################################################### end of OpenCL C code #####################################################################
Related
I am trying to process an array of arrays of variables lengths with OpenCL 1.2 in C++. In each instance (workitem?) I want to process one sub array.
Below I've tried to treat the array of arrays as a 1D array, but it does not work - random parts of the data are not processes.
Host:
vector<cl::Platform> platforms; cl::Platform::get(&platforms); _ASSERT(platforms.size() > 0); auto platform = platforms.front(); //get the platform
std::vector<cl::Device> devices; platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); _ASSERT(devices.size() > 0); auto device = devices.front(); // get the device
std::ifstream myFile("DynMultiDimArr.cl"); string src(istreambuf_iterator<char>(myFile), (istreambuf_iterator<char>())); cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1)); //create program from cl file
cl::Context context(device);
cl::Program program(context, sources);
auto err = program.build(); if (err!=0) printf("%s\n",program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str() );
cl::CommandQueue queue(context, device);
int lens[10] = { 5,7,9,6,21,12,4,18,15,10 }, *idx=new int[10], totSize=0, c=0;
for (int i = 0; i < 10; i++) totSize += lens[i];
double *dat = new double[totSize], **myDat = new double *[10]; // array of arrays of different lengths
for (int i = 0; i < 10; i++) {
idx[i] = c;
myDat[i] = dat + c;
for (int j = 0; j < lens[i]; j++) myDat[i][j] = c++;
}
cl::Buffer inBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(double)*totSize, dat, &err);
cl::Buffer iBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, lens, &err);
cl::Buffer lBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, idx, &err);
cl::Buffer outBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(double)*totSize, nullptr, &err);
cl::Kernel kernel(program, "processSubArr");
err = kernel.setArg(0, inBuf);
err = kernel.setArg(1, lBuf);
err = kernel.setArg(2, iBuf);
err = kernel.setArg(3, outBuf);
err=queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(10));
err=queue.enqueueReadBuffer(outBuf, CL_FALSE, 0, sizeof(double)*totSize, dat);
cl::finish();
Kernel:
__kernel void processSubArr(__global double* data, __global int* len, __global int* idx, __global double* outData) {
for (int i=0;i<len[get_global_id(0)];i++)
outData[idx[get_global_id(0)]+i] = data[idx[get_global_id(0)]+i]+1000;
}
This is test code only. In my real problem I have to pass 8 arrays of arrays (all same dimensions). First dimension is 105 to 106 long, second 1 to 100 long. The kernel code is ~100 lines of code, calculating turbulent eddy viscosity and diffusivity on each sub array with the k-epsilon method.
Is this the way or am I on the completely wrong path? I'm new to OpenCL - any help will be much appreciated.
Updated code that works. I have no clue about the performance of this method.
vector<cl::Platform> platforms; cl::Platform::get(&platforms); _ASSERT(platforms.size() > 0); auto platform = platforms.front(); //get the platform
std::vector<cl::Device> devices; platform.getDevices(CL_DEVICE_TYPE_GPU, &devices); _ASSERT(devices.size() > 0); auto device = devices.front(); // get the device
std::ifstream myFile("DynMultiDimArr.cl"); string src(istreambuf_iterator<char>(myFile), (istreambuf_iterator<char>())); cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1)); //create program from cl file
cl::Context context(device);
cl::Program program(context, sources);
auto err = program.build(); if (err!=0) printf("%s\n",program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device).c_str() );
cl::CommandQueue queue(context, device);
int lens[10] = { 5,7,9,6,21,12,4,18,15,10 }, *idx=new int[10], totSize=0, c=0;
for (int i = 0; i < 10; i++) totSize += lens[i];
double *dat = new double[totSize], **myDat = new double *[10]; // array of arrays of different lengths
for (int i = 0; i < 10; i++) {
idx[i] = c;
myDat[i] = dat + c;
for (int j = 0; j < lens[i]; j++) myDat[i][j] = c++;
}
cl::Buffer inBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(double)*totSize, dat, &err);
cl::Buffer lBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, lens, &err);
cl::Buffer iBuf(context, CL_MEM_READ_ONLY | CL_MEM_HOST_NO_ACCESS | CL_MEM_COPY_HOST_PTR, sizeof(int)*10, idx, &err);
cl::Buffer outBuf(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, sizeof(double)*totSize, nullptr, &err);
cl::Kernel kernel(program, "processSubArr");
err = kernel.setArg(0, inBuf);
err = kernel.setArg(1, lBuf);
err = kernel.setArg(2, iBuf);
err = kernel.setArg(3, outBuf);
err=queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(10));
err=queue.enqueueReadBuffer(outBuf, CL_FALSE, 0, sizeof(double)*totSize, dat); //queue.enqueueMapBuffer + memcpy faster?
queue.finish();
cl::finish();
for (int i = 0; i < 10; i++) {
int j = 0;
for (j = 0; j < lens[i]-1; j++)
cout << myDat[i][j] << ",";
cout << myDat[i][j] << endl;
}
delete[] dat;
I want to parallelize temperatures distribution, using OpenCL technology. I stocked on problem with my GPU - work item id for every other kernel function are the same. Instead of result, for example, from 0 to 1024, I got this result. What I did incorrectcly?
enter image description here
Source.cpp
include <iostream>
#include <string>
#include <fstream>
#include <omp.h>
#include <CL/cl.hpp>
float*** distributeOpenCL(float*** cuboid, int k, int m, int n)
{
// OpenCL init
int size = k * m * n;
float*** hResult = initCuboid(k, m, n);
cl_platform_id platform;
cl_device_id device;
cl_int error = 0;
std::ifstream file("program.cl");
std::string fileText = std::string(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>());
const char* srcText = fileText.data();
size_t srcLength = fileText.size();
cl_context context;
cl_program program;
cl_kernel kernel;
cl_command_queue queue;
cl_mem dCuboid, dRes;
size_t localSize[2] = { k,m };
size_t globalSize[2] = { ceil(size / (float)localSize[0]) * localSize[0], ceil(size / (float)localSize[1]) * localSize[1] };
// Get GPU
error |= clGetPlatformIDs(1, &platform, NULL);
error |= clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
// Compile and build
context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
program = clCreateProgramWithSource(context, 1, &srcText, &srcLength, &error);
error |= clBuildProgram(program, 1, &device, NULL, NULL, NULL);
// What funtion from file we have to run
kernel = clCreateKernel(program, "distributeKernel", &error);
// Add to Queue
queue = clCreateCommandQueueWithProperties(context, device, NULL, &error);
// Create buffer
dCuboid = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, NULL);
dRes = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, NULL);
// Write data to buffer
error |= clEnqueueWriteBuffer(queue, dCuboid, CL_TRUE, 0, sizeof(float) * size, cuboid, 0, NULL, NULL);
// Kernel args
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &dCuboid);
error |= clSetKernelArg(kernel, 1, sizeof(int), &k);
error |= clSetKernelArg(kernel, 2, sizeof(int), &m);
error |= clSetKernelArg(kernel, 3, sizeof(int), &n);
error |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &dRes);
// Start task
error |= clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL);
// Wait execution
clFinish(queue);
// Read Result
error |= clEnqueueReadBuffer(queue, dRes, CL_TRUE, 0, sizeof(float) * size, hResult, 0, NULL, NULL);
//printCuboid(resP, k, m, n, resPFile);
// Deallocation
clReleaseKernel(kernel);
clReleaseMemObject(dCuboid);
clReleaseMemObject(dRes);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return hResult;
}
int main(int argc, char* argv[])
{
std::ofstream filledFile("filled.txt");
std::ofstream resLFile("resL.txt");
std::ofstream resPFile("resP.txt");
double durationL, durationP, time1, time2;
int k = 5, m = 5, n = 5, temp1 = 10, temp2 = 15;
float*** cuboid, *** resL, *** resP;
if (argc > 1) {
k = atoi(argv[1]), m = atoi(argv[2]), n = atoi(argv[3]),
temp1 = atoi(argv[4]), temp2 = atoi(argv[5]);
}
// Linear
cuboid = initCuboid(k, m, n);
fillCuboid(cuboid, k, m, n, temp1, temp2);
printCuboidToFile(cuboid, k, m, n, filledFile);
time1 = omp_get_wtime();
resL = distribute(cuboid, k, m, n);
time2 = omp_get_wtime();
durationL = time2 - time1;
printCuboidToFile(resL, k, m, n, resLFile);
// Parallel
time1 = omp_get_wtime();
resP = distributeOpenCL(cuboid, k, m, n);
time2 = omp_get_wtime();
durationP = time2 - time1;
//printCuboidToFile(resP, k, m, n, resPFile);
std::cout << "Linear time: " << durationL << std::endl;
std::cout << "Parallel time: " << durationP << std::endl;
std::cout << "Parallel faster than linear on: " << durationL - durationP << std::endl;
// Delete 3d arrays, closing files
deleteCuboid(cuboid, k, m, n);
deleteCuboid(resL, k, m, n);
deleteCuboid(resP, k, m, n);
filledFile.close();
resLFile.close();
resPFile.close();
return 0;
}
program.cl
__kernel void distributeKernel(__global float*** cuboid, int k, int m, int n, __global float*** result)
{
int gz = get_global_id(0);
int gy = get_global_id(1);
printf("gy - %d \n", &gy);
printf("gz - %d \n", &gz);
bool isDissipated = false;
int size = k * m * n;
// Ends if temperatures in cube becomes balanced
while (!isDissipated) {
int dissipatedCount = 0;
for (int x = 0; x < n; x++) {
// Calc average temperature
float sum = 0;
int count = 0;
float average;
for (int zSum = gz - 1; zSum <= gz + 1; zSum++) {
for (int ySum = gy - 1; ySum <= gy + 1; ySum++) {
for (int xSum = x - 1; xSum <= x + 1; xSum++) {
if (zSum >= 0 && ySum >= 0 && xSum >= 0
&& zSum < k && ySum < m && xSum < n) {
count++;
sum += result[gz][gy][xSum];
}
}
}
}
average = round(sum / count * 100) / 100;
if (average == result[gz][gy][x]) {
dissipatedCount++;
}
else {
result[gz][gy][x] = average;
}
}
if (dissipatedCount == size) {
isDissipated = true;
}
}
}
To get the issue with the supposedly wrong get_global_id() fixed, start with a simple, minimal "Hello World"-style vector addition program and than advance forward to your temperature distribution application step-by-step.
With your code I see several issues:
You can only have 1D pointers (with a single *) in OpenCL.
__kernel void distributeKernel(__global float* cuboid, __global float* result)
Introduce a linear index to access more than 1 dimension: For 2D for example int n = x+y*get_global_size(0);
From what I see, k, m, n are lattice dimensions. Eliminate them from the kernel entirely. Get size via get_global_size(...).
The kernel looks rather complex with a lot of loops and branching. This could kill any performance benefit you hope to get from GPU parallelization. Get rid of loops and branching as far as possible. Also, there should not be any loop over one of the lattice dimensions since the lattice position is what you parallelize.
I would also advice to use only 1D parallelization in OpenCL and do the linear indexing yourself. This gives you more flexibility regarding workgroup size.
I am trying to rearrange this algorithm for SHA-256 I found online for use on multiple threads, so that I can compute many hashes really fast. It is basically just to show the difference between CPU and GPU work time. So far it works relatively fine, I just ran into a problem, that I cannot compute more than around 7670 hashes somehow. If I go bigger than that I will get a segfault.
My question is if someone maybe knows a fix to that problem.
Do I need a bigger buffer or do I have to set the globalworksize static? If so, how may I be able to manage that.
I am completely new to OpenCL and this is my first try on it. It does not have to be perfect, my goal is only to show that I can complete repetitive tasks faster on a GPU than on a CPU.
Thanks for your time and help!
Best regards.
My main:
#include <iostream>
#include "sha256.h"
#include <string>
#include <chrono>
#include <random>
#define STRING_LENGTH 8
using std::cout;
using std::cin;
using std::endl;
int TOTAL_HASHES;
std::string generate(int max_length) {
using namespace std;
string possible_characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
random_device rd;
mt19937 engine(rd());
uniform_int_distribution<> dist(0, possible_characters.size() - 1);
string ret = "";
for (int i = 0; i < max_length; i++) {
int random_index = dist(engine); //get index between 0 and possible_characters.size()-1
ret += possible_characters[random_index];
}
return ret;
}
void crypt(char* input)
{
char result[sizeof(cl_uint)*65*TOTAL_HASHES];
sha256_crypt(input, result, TOTAL_HASHES);
}
int main()
{
cout << "Wieviele Hashes?"; cin >> TOTAL_HASHES;
std::string str1;
char* cstr = new char[TOTAL_HASHES * (STRING_LENGTH + 1)];
for (int i = 0; i < TOTAL_HASHES; i++)
{
str1 = generate(STRING_LENGTH);
strcpy((cstr+i * STRING_LENGTH), str1.c_str());
}
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
crypt((char*)cstr);
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
std::cout << "Time elapsed = " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << "[micros]" << std::endl;
std::cout << "Time elapsed = " << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() << "[ms]" << std::endl;
std::cout << "Time elapsed = " << std::chrono::duration_cast<std::chrono::seconds>(end - begin).count() << "[s]" << std::endl;
cout << "----------------------------------------------------------------------" << endl;
cout << endl;
}
My sha256.c
#include "sha256.h"
#include <time.h>
static cl_platform_id platform_id = NULL;
static cl_device_id device_id = NULL;
static cl_uint ret_num_devices;
static cl_uint ret_num_platforms;
static cl_context context;
static cl_int ret;
static char* source_str;
static size_t source_size;
static cl_program program;
static cl_kernel kernel;
static cl_command_queue command_queue;
static cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys, data_info;
static cl_uint* partial_hashes;
static cl_uint* res_hashes;
static char* saved_plain;
static unsigned int datai[3];
static int have_full_hashes;
static size_t kpc = 4;
static size_t global_work_size = 1;
static size_t local_work_size = 1;
static size_t string_len;
void load_source();
void createDevice();
void createkernel();
void create_clobj();
void print_hashes();
void crypt_all();
void MeasureIt();
void sha256_init(size_t user_kpc)
{
kpc = user_kpc;
load_source();
createDevice();
createkernel();
create_clobj();
}
size_t total_hashes;
void print_hashes(char* input) {
int offset = -1 * datai[2]; // -8
for (int j = 0; j < total_hashes; j++)
{
offset += datai[2];
/*printf("%x\n", partial_hashes[0]);
printf("%x\n", partial_hashes[1]);*/
//printf("offset: %i\n", offset);
printf("input: ");
for (int k = 0; k < SHA256_RESULT_SIZE; k++)
{
printf("%c", input[k + offset]);
}
printf("\noutput: ");
for (int i = 0; i < SHA256_RESULT_SIZE; i++)
{
printf("%08X", partial_hashes[i + offset]);
}
printf("\n");
}
}
void sha256_crypt(char* input, char* output, int total)
{
string_len = strlen(input);
total_hashes = total;
global_work_size = total;
datai[0] = SHA256_PLAINTEXT_LENGTH; //64
datai[1] = total_hashes;
datai[2] = string_len/total_hashes;
sha256_init(2048);
memcpy(saved_plain, input, string_len + 1);
crypt_all();
//print_hashes(input);
}
void crypt_all()
{
//printf("%s\n",saved_plain);
ret = clEnqueueWriteBuffer(command_queue, data_info, CL_TRUE, 0, sizeof(unsigned int) * 3, datai, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, buffer_keys, CL_TRUE, 0, sizeof(char) * string_len, saved_plain, 0, NULL, NULL);
// printf("%s\n",buffer_keys);
//zeit messen und Kernel starten
clock_t start = clock();
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
clock_t end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;
float milliseconds = seconds / 1000;
float microseconds = milliseconds / 1000/1000/1000/1000;
printf("Zeit in s: %f\n", seconds);
printf("Zeit in micros: %f\n", microseconds);
ret = clFinish(command_queue);
// read back partial hashes
ret = clEnqueueReadBuffer(command_queue, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE* total_hashes, partial_hashes, 0, NULL, NULL);
have_full_hashes = 0;
}
void load_source()
{
FILE* fp;
fp = fopen("sha256.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
}
void create_clobj() {
pinned_saved_keys = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, (SHA256_PLAINTEXT_LENGTH)*kpc, NULL, &ret);
saved_plain = (char*)clEnqueueMapBuffer(command_queue, pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, (SHA256_PLAINTEXT_LENGTH)*kpc, 0, NULL, NULL, &ret);
memset(saved_plain, 0, SHA256_PLAINTEXT_LENGTH * kpc);
res_hashes = (cl_uint*)malloc(sizeof(cl_uint) * SHA256_RESULT_SIZE);
memset(res_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
pinned_partial_hashes = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * SHA256_RESULT_SIZE* total_hashes, NULL, &ret);
partial_hashes = (cl_uint*)clEnqueueMapBuffer(command_queue, pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE* total_hashes, 0, NULL, NULL, &ret);
memset(partial_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE* total_hashes);
buffer_keys = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(char) * string_len, NULL, &ret);
buffer_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * SHA256_RESULT_SIZE* total_hashes, NULL, &ret);
data_info = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int) * 3, NULL, &ret);
clSetKernelArg(kernel, 0, sizeof(data_info), (void*)&data_info);
clSetKernelArg(kernel, 1, sizeof(buffer_keys), (void*)&buffer_keys);
clSetKernelArg(kernel, 2, sizeof(buffer_out), (void*)&buffer_out);
}
void createDevice()
{
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 1, &device_id, &ret_num_devices);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
}
void createkernel()
{
program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "sha256_crypt_kernel", &ret);
command_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &ret);
}
And finally my sha256.cl
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#define H0 0x6a09e667
#define H1 0xbb67ae85
#define H2 0x3c6ef372
#define H3 0xa54ff53a
#define H4 0x510e527f
#define H5 0x9b05688c
#define H6 0x1f83d9ab
#define H7 0x5be0cd19
uint rotr(uint x, int n) {
if (n < 32) return (x >> n) | (x << (32 - n));
return x;
}
uint ch(uint x, uint y, uint z) {
return (x & y) ^ (~x & z);
}
uint maj(uint x, uint y, uint z) {
return (x & y) ^ (x & z) ^ (y & z);
}
uint sigma0(uint x) {
return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
}
uint sigma1(uint x) {
return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
}
uint gamma0(uint x) {
return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3);
}
uint gamma1(uint x) {
return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10);
}
__kernel void sha256_crypt_kernel(__global uint *data_info,__global char *plain_key, __global uint *digest){
int workUnit = (int)get_global_id(0);
// printf("workUnit: %i\n", workUnit);
// printf("plain_key %s\n", plain_key);
char real_key[8+1];
int offset = 8*workUnit;
for (int i = 0; i < data_info[2]; i++){
real_key[i] = plain_key[i+offset];
}
// printf("real key: %s \n", real_key);
int t, gid, msg_pad;
int stop, mmod;
uint i, ulen, item, total;
uint W[80], temp, A,B,C,D,E,F,G,H,T1,T2;
uint num_keys = data_info[1];
int current_pad;
uint K[64]={
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
msg_pad=0;
ulen = data_info[2];
total = ulen%64>=56?2:1 + ulen/64;
//printf("ulen: %u total:%u\n", ulen, total);
digest[0+offset] = H0;
digest[1+offset] = H1;
digest[2+offset] = H2;
digest[3+offset] = H3;
digest[4+offset] = H4;
digest[5+offset] = H5;
digest[6+offset] = H6;
digest[7+offset] = H7;
for(item=0; item<total; item++)
{
A = digest[0+offset];
B = digest[1+offset];
C = digest[2+offset];
D = digest[3+offset];
E = digest[4+offset];
F = digest[5+offset];
G = digest[6+offset];
H = digest[7+offset];
#pragma unroll
for (t = 0; t < 80; t++){
W[t] = 0x00000000;
}
msg_pad=item*64;
if(ulen > msg_pad)
{
current_pad = (ulen-msg_pad)>64?64:(ulen-msg_pad);
}
else
{
current_pad =-1;
}
// printf("current_pad: %d\n",current_pad);
if(current_pad>0)
{
i=current_pad;
stop = i/4;
// printf("i:%d, stop: %d msg_pad:%d\n",i,stop, msg_pad);
for (t = 0 ; t < stop ; t++){
W[t] = ((uchar) real_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) real_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) real_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= (uchar) real_key[msg_pad + t * 4 + 3];
//printf("W[%u]: %u\n",t,W[t]);
}
mmod = i % 4;
if ( mmod == 3){
W[t] = ((uchar) real_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) real_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) real_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= ((uchar) 0x80) ;
} else if (mmod == 2) {
W[t] = ((uchar) real_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) real_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= 0x8000 ;
} else if (mmod == 1) {
W[t] = ((uchar) real_key[msg_pad + t * 4]) << 24;
W[t] |= 0x800000 ;
} else /*if (mmod == 0)*/ {
W[t] = 0x80000000 ;
}
if (current_pad<56)
{
W[15] = ulen*8 ;
//printf("ulen avlue 2 :w[15] :%u\n", W[15]);
}
}
else if(current_pad <0)
{
if( ulen%64==0)
W[0]=0x80000000;
W[15]=ulen*8;
//printf("ulen avlue 3 :w[15] :%u\n", W[15]);
}
for (t = 0; t < 64; t++) {
if (t >= 16)
W[t] = gamma1(W[t - 2]) + W[t - 7] + gamma0(W[t - 15]) + W[t - 16];
T1 = H + sigma1(E) + ch(E, F, G) + K[t] + W[t];
T2 = sigma0(A) + maj(A, B, C);
H = G; G = F; F = E; E = D + T1; D = C; C = B; B = A; A = T1 + T2;
}
digest[0+offset] += A;
digest[1+offset] += B;
digest[2+offset] += C;
digest[3+offset] += D;
digest[4+offset] += E;
digest[5+offset] += F;
digest[6+offset] += G;
digest[7+offset] += H;
// for (t = 0; t < 80; t++)
// {
// printf("W[%d]: %u\n",t,W[t]);
// }
}
// for (int t = 0; t < 8; t++)
// {
// printf("%x", digest[t]);
// }
// printf("\n");
}
I developed a small code to add two small vector using GPU by OpenCL library. The main code vectorAdd.cc is as follows:
#include <iostream>
#include <CL/cl.hpp>
#include <cassert>
#include <fstream>
#include <time.h>
#include <cmath>
void randomInit(float *data, int size)
{
for (unsigned int i = 0; i < size; ++i)
data[i] = rand() / (float)RAND_MAX;
}
int main()
{
//get all platforms (drivers)
std::vector<cl::Platform> platforms;
cl::Platform::get(&platforms);
assert(platforms.size() > 0);
cl::Platform myPlatform = platforms[0];
std::cout << "Using platform: "<<myPlatform.getInfo<CL_PLATFORM_NAME>()<<"\n";
//get default device of the default platform
std::vector<cl::Device> devices;
myPlatform.getDevices(CL_DEVICE_TYPE_ALL, &devices);
assert(devices.size() > 0);
cl::Device myDevice = devices[0];
std::cout<< "Using device: "<<myDevice.getInfo<CL_DEVICE_NAME>()<<"\n";
std::ifstream vectorAddFile("vector_add_kernel.cl" );
std::string src(std::istreambuf_iterator<char>(vectorAddFile), (std::istreambuf_iterator<char>()));
cl::Program::Sources sources(1, std::make_pair(src.c_str(), src.length() + 1));
cl::Context context(myDevice);
cl::Program program(context, sources);
int szVec = 10;
float* A = new float[szVec];
float* B = new float[szVec];
randomInit(A,szVec);
randomInit(B,szVec);
float* C = new float[szVec];
std::fill_n(C, szVec, 0);
// create buffers on the device
cl::Buffer buffer_A = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), A);
cl::Buffer buffer_B = cl::Buffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), B);
cl::Buffer buffer_C = cl::Buffer(context, CL_MEM_WRITE_ONLY|CL_MEM_COPY_HOST_PTR, szVec * sizeof(float), C);
//create queue to which we will push commands for the device.
cl::CommandQueue queue(context, myDevice);
//write arrays A and B to the device
//queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float) * szVec, A);
//queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float) * szVec, B);
auto err = program.build("cl.std.CL1.2");
// run the kernel
cl::Kernel kernel(program,"vector_add", &err);
kernel.setArg(0, buffer_A);
kernel.setArg(1, buffer_B);
kernel.setArg(2, buffer_C);
queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(szVec), cl::NullRange);
queue.finish();
//read result C from the device to array C
queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(float) * szVec, C);
std::cout<<" result: \n";
for(int i = 0; i < szVec; i++)
{
std::cout << A[i] << " + " << B[i] << " = " << C[i] << std::endl;
}
std::cout << std::endl;
return 0;
}
and the kernel code vector_add_kernel.cl is as follows:
__kernel void vector_add(__global float *A, __global float *B, __global float *C)
{
// Get the index of the current element
int i = get_global_id(0);
// Do the operation
C[i] = A[i] + B[i];
}
and the result i got is:
Using platform: NVIDIA CUDA
Using device: Tesla K20m
result:
0.840188 + 0.477397 = 0
0.394383 + 0.628871 = 0
0.783099 + 0.364784 = 0
0.79844 + 0.513401 = 0
0.911647 + 0.95223 = 0
0.197551 + 0.916195 = 0
0.335223 + 0.635712 = 0
0.76823 + 0.717297 = 0
0.277775 + 0.141603 = 0
0.55397 + 0.606969 = 0
The problem as you can see, the result is always what I initialized vector C, I do not understand why. I also initialized vectorC with some other values and again the result was the initial values.
It's probably just a syntax error.
auto err = program.build("cl.std.CL1.2");
should be
auto err = program.build("-cl-std=CL1.2");
The documentation on clBuildProgram has more information about the supported options.
The problem stems from building the program with this command
auto err = program.build("cl.std.CL1.2");
and by replacing the command above with
auto err = program.build();
The problem solved.
But still I do not know why this happened. Any idea?
If this sort of question has been asked I apologize, link me to the thread please!
Anyhow I am new to CUDA (I'm coming from OpenCL) and wanted to try generating an image with it. The relevant CUDA code is:
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
unsigned block_size = blockDim.x;
uint2 location = {blockIdx.x*block_size, blockIdx.y*block_size};
ulong2 pixel_location = {threadIdx.x, threadIdx.y};
ulong2 real_location = {location.x + pixel_location.x, location.y + pixel_location.y};
if (real_location.x >= width || real_location.y >= height)
return;
uint8_t *row = (uint8_t *)((char *)pixels + real_location.y * pitch);
row[real_location.x * 4+0] = 0;
row[real_location.x * 4+1] = 255;
row[real_location.x * 4+2] = 0;
row[real_location.x * 4+3] = 255;
}
cudaError_t err = cudaSuccess;
#define CUDA_ERR(e) \
if ((err = e) != cudaSuccess) { \
fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err)); \
exit(-1); \
}
int main(void) {
ulong2 dims = {1000, 1000};
unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);
dim3 remainders(dims.x % threads_per_block.x, dims.y % threads_per_block.y);
dim3 blocks(dims.x / threads_per_block.x + (remainders.x == 0 ? 0 : 1), dims.y / threads_per_block.y + (remainders.y == 0 ? 0 : 1));
size_t pitch;
uint8_t *pixels, *h_pixels = NULL;
CUDA_ERR(cudaMallocPitch(&pixels, &pitch, dims.x * 4 * sizeof(uint8_t), dims.y));
mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
h_pixels = (uint8_t *)malloc(dims.x * 4 * sizeof(uint8_t) * dims.y);
memset(h_pixels, 0, dims.x * 4 * sizeof(uint8_t) * dims.y);
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
save_png("out.png", h_pixels, dims.x, dims.y);
CUDA_ERR(cudaFree(pixels));
free(h_pixels);
CUDA_ERR(cudaDeviceReset());
puts("Success");
return 0;
}
The save_png function is a usual utility function I created for taking a block of data and saving it to a png:
void save_png(const char *filename, uint8_t *buffer, unsigned long width, unsigned long height) {
png_structp png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL);
if (!png_ptr) {
std::cerr << "Failed to create png write struct" << std::endl;
return;
}
png_infop info_ptr = png_create_info_struct(png_ptr);
if (!info_ptr) {
std::cerr << "Failed to create info_ptr" << std::endl;
png_destroy_write_struct(&png_ptr, NULL);
return;
}
FILE *fp = fopen(filename, "wb");
if (!fp) {
std::cerr << "Failed to open " << filename << " for writing" << std::endl;
png_destroy_write_struct(&png_ptr, &info_ptr);
return;
}
if (setjmp(png_jmpbuf(png_ptr))) {
png_destroy_write_struct(&png_ptr, &info_ptr);
std::cerr << "Error from libpng!" << std::endl;
return;
}
png_init_io(png_ptr, fp);
png_set_IHDR(png_ptr, info_ptr, width, height, 8, PNG_COLOR_TYPE_RGBA, PNG_INTERLACE_NONE, PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT);
png_write_info(png_ptr, info_ptr);
png_byte *row_pnts[height];
size_t i;
for (i = 0; i < height; i++) {
row_pnts[i] = buffer + width * 4 * i;
}
png_write_image(png_ptr, row_pnts);
png_write_end(png_ptr, info_ptr);
png_destroy_write_struct(&png_ptr, &info_ptr);
fclose(fp);
}
Anyways the image that's generated is a weird whiteish strip that's speckled with random colored pixels which can be seen here.
Is there something glaring I did wrong? I tried to follow the introduction documentation on the CUDA site. Otherwise can anyone help me out to fix this? Here I'm simply trying to fill the pixels buffer with green pixels.
I am using a MBP retina with an NVIDIA GeForce GT 650M discrete graphics card. I can run and paste the output to print_devices from the cuda sample code if need be.
EDIT: Note no errors or warnings during compilation with the following makefile:
all:
nvcc -c mandlebrot.cu -o mandlebrot.cu.o
nvcc mandlebrot.cu.o -o mandlebrot -lpng
and no errors at runtime.
It's better if you provide a complete code that someone can copy, paste, compile, and run, without adding anything or changing anything, Stripping off the include headers isn't helpful, in my opinion, and making your test code dependent on a png library that others may not have is also not productive, if you want help.
Your error checking on kernel launches is broken. You may want to review proper cuda error checking. If you had proper error checking, or ran your code with cuda-memcheck, you would discover an error 9 on the kernel launch. This is an invalid configuration. If you print out your blocks and threads_per_block variables, you'll see something like this:
blocks: 2, 2
threads: 500, 500
You are in fact setting threads per block to 500,500 here:
unsigned long block_size = 500;
dim3 threads_per_block(block_size, block_size);
That is illegal, as you are requesting 500x500 threads per block (i.e. 250000 threads) which exceeds the maximum limit of 1024 threads per block.
So your kernel is not running at all and you're getting garbage.
You can fix this error pretty simply by changing your block_size definition:
unsigned long block_size = 16;
After that there is still an issue, as you've misinterpreted the parameters for cudaMemcpy2D.:
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x, dims.y, cudaMemcpyDeviceToHost));
The documentation states for the 5th parameter:
width - Width of matrix transfer (columns in bytes)
but you've passed the width in elements (groups of 4 bytes) rather than bytes.
This will fix that:
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x*4, dims.y, cudaMemcpyDeviceToHost));
With the above changes, I was able to get good results with a test version of your code:
#include <stdio.h>
#include <stdint.h>
__global__
void mandlebrot(uint8_t *pixels, size_t pitch, unsigned long width, unsigned long height) {
unsigned block_size = blockDim.x;
uint2 location = {blockIdx.x*block_size, blockIdx.y*block_size};
ulong2 pixel_location = {threadIdx.x, threadIdx.y};
ulong2 real_location = {location.x + pixel_location.x, location.y + pixel_location.y};
if (real_location.x >= width || real_location.y >= height)
return;
uint8_t *row = (uint8_t *)((char *)pixels + real_location.y * pitch);
row[real_location.x * 4+0] = 0;
row[real_location.x * 4+1] = 255;
row[real_location.x * 4+2] = 0;
row[real_location.x * 4+3] = 255;
}
cudaError_t err = cudaSuccess;
#define CUDA_ERR(e) \
if ((err = e) != cudaSuccess) { \
fprintf(stderr, "Failed to allocate device vector A (error code %s)!\n", cudaGetErrorString(err)); \
exit(-1); \
}
int main(void) {
ulong2 dims = {1000, 1000};
dim3 threads_per_block(16, 16);
dim3 remainders(dims.x % threads_per_block.x, dims.y % threads_per_block.y);
dim3 blocks(dims.x / threads_per_block.x + (remainders.x == 0 ? 0 : 1), dims.y / threads_per_block.y + (remainders.y == 0 ? 0 : 1));
size_t pitch;
uint8_t *pixels, *h_pixels = NULL;
CUDA_ERR(cudaMallocPitch(&pixels, &pitch, dims.x * 4 * sizeof(uint8_t), dims.y));
printf("blocks: %u, %u\n", blocks.x, blocks.y);
printf("threads: %u, %u\n", threads_per_block.x, threads_per_block.y);
mandlebrot<<<blocks, threads_per_block>>>(pixels, pitch, dims.x, dims.y);
h_pixels = (uint8_t *)malloc(dims.x * 4 * sizeof(uint8_t) * dims.y);
memset(h_pixels, 0, dims.x * 4 * sizeof(uint8_t) * dims.y);
CUDA_ERR(cudaMemcpy2D(h_pixels, dims.x * 4 * sizeof(uint8_t), pixels, pitch, dims.x*4, dims.y, cudaMemcpyDeviceToHost));
// save_png("out.png", h_pixels, dims.x, dims.y);
for (int row = 0; row < dims.y; row++)
for (int col = 0; col < dims.x; col++){
if (h_pixels[(row*dims.x*4) + col*4 ] != 0) {printf("mismatch 0 at %u,%u: was: %u should be: %u\n", row,col, h_pixels[(row*dims.x)+col*4], 0); return 1;}
if (h_pixels[(row*dims.x*4) + col*4 +1] != 255) {printf("mismatch 1 at %u,%u: was: %u should be: %u\n", row,col, h_pixels[(row*dims.x)+col*4 +1], 255); return 1;}
if (h_pixels[(row*dims.x*4) + col*4 +2] != 0) {printf("mismatch 2: was: %u should be: %u\n", h_pixels[(row*dims.x)+col*4 +2], 0); return 1;}
if (h_pixels[(row*dims.x*4) + col*4 +3] != 255) {printf("mismatch 3: was: %u should be: %u\n", h_pixels[(row*dims.x)+col*4 +3 ], 255); return 1;}
}
CUDA_ERR(cudaFree(pixels));
free(h_pixels);
CUDA_ERR(cudaDeviceReset());
puts("Success");
return 0;
}
Note the above code is a complete code you can copy, paste, compile and run.