OpenCL SHA-256 GPU cannot compute more than 7670 hashes - c++

I am trying to rearrange this algorithm for SHA-256 I found online for use on multiple threads, so that I can compute many hashes really fast. It is basically just to show the difference between CPU and GPU work time. So far it works relatively fine, I just ran into a problem, that I cannot compute more than around 7670 hashes somehow. If I go bigger than that I will get a segfault.
My question is if someone maybe knows a fix to that problem.
Do I need a bigger buffer or do I have to set the globalworksize static? If so, how may I be able to manage that.
I am completely new to OpenCL and this is my first try on it. It does not have to be perfect, my goal is only to show that I can complete repetitive tasks faster on a GPU than on a CPU.
Thanks for your time and help!
Best regards.
My main:
#include <iostream>
#include "sha256.h"
#include <string>
#include <chrono>
#include <random>
#define STRING_LENGTH 8
using std::cout;
using std::cin;
using std::endl;
int TOTAL_HASHES;
std::string generate(int max_length) {
using namespace std;
string possible_characters = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
random_device rd;
mt19937 engine(rd());
uniform_int_distribution<> dist(0, possible_characters.size() - 1);
string ret = "";
for (int i = 0; i < max_length; i++) {
int random_index = dist(engine); //get index between 0 and possible_characters.size()-1
ret += possible_characters[random_index];
}
return ret;
}
void crypt(char* input)
{
char result[sizeof(cl_uint)*65*TOTAL_HASHES];
sha256_crypt(input, result, TOTAL_HASHES);
}
int main()
{
cout << "Wieviele Hashes?"; cin >> TOTAL_HASHES;
std::string str1;
char* cstr = new char[TOTAL_HASHES * (STRING_LENGTH + 1)];
for (int i = 0; i < TOTAL_HASHES; i++)
{
str1 = generate(STRING_LENGTH);
strcpy((cstr+i * STRING_LENGTH), str1.c_str());
}
std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
crypt((char*)cstr);
std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
std::cout << "Time elapsed = " << std::chrono::duration_cast<std::chrono::microseconds>(end - begin).count() << "[micros]" << std::endl;
std::cout << "Time elapsed = " << std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count() << "[ms]" << std::endl;
std::cout << "Time elapsed = " << std::chrono::duration_cast<std::chrono::seconds>(end - begin).count() << "[s]" << std::endl;
cout << "----------------------------------------------------------------------" << endl;
cout << endl;
}
My sha256.c
#include "sha256.h"
#include <time.h>
static cl_platform_id platform_id = NULL;
static cl_device_id device_id = NULL;
static cl_uint ret_num_devices;
static cl_uint ret_num_platforms;
static cl_context context;
static cl_int ret;
static char* source_str;
static size_t source_size;
static cl_program program;
static cl_kernel kernel;
static cl_command_queue command_queue;
static cl_mem pinned_saved_keys, pinned_partial_hashes, buffer_out, buffer_keys, data_info;
static cl_uint* partial_hashes;
static cl_uint* res_hashes;
static char* saved_plain;
static unsigned int datai[3];
static int have_full_hashes;
static size_t kpc = 4;
static size_t global_work_size = 1;
static size_t local_work_size = 1;
static size_t string_len;
void load_source();
void createDevice();
void createkernel();
void create_clobj();
void print_hashes();
void crypt_all();
void MeasureIt();
void sha256_init(size_t user_kpc)
{
kpc = user_kpc;
load_source();
createDevice();
createkernel();
create_clobj();
}
size_t total_hashes;
void print_hashes(char* input) {
int offset = -1 * datai[2]; // -8
for (int j = 0; j < total_hashes; j++)
{
offset += datai[2];
/*printf("%x\n", partial_hashes[0]);
printf("%x\n", partial_hashes[1]);*/
//printf("offset: %i\n", offset);
printf("input: ");
for (int k = 0; k < SHA256_RESULT_SIZE; k++)
{
printf("%c", input[k + offset]);
}
printf("\noutput: ");
for (int i = 0; i < SHA256_RESULT_SIZE; i++)
{
printf("%08X", partial_hashes[i + offset]);
}
printf("\n");
}
}
void sha256_crypt(char* input, char* output, int total)
{
string_len = strlen(input);
total_hashes = total;
global_work_size = total;
datai[0] = SHA256_PLAINTEXT_LENGTH; //64
datai[1] = total_hashes;
datai[2] = string_len/total_hashes;
sha256_init(2048);
memcpy(saved_plain, input, string_len + 1);
crypt_all();
//print_hashes(input);
}
void crypt_all()
{
//printf("%s\n",saved_plain);
ret = clEnqueueWriteBuffer(command_queue, data_info, CL_TRUE, 0, sizeof(unsigned int) * 3, datai, 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue, buffer_keys, CL_TRUE, 0, sizeof(char) * string_len, saved_plain, 0, NULL, NULL);
// printf("%s\n",buffer_keys);
//zeit messen und Kernel starten
clock_t start = clock();
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
clock_t end = clock();
float seconds = (float)(end - start) / CLOCKS_PER_SEC;
float milliseconds = seconds / 1000;
float microseconds = milliseconds / 1000/1000/1000/1000;
printf("Zeit in s: %f\n", seconds);
printf("Zeit in micros: %f\n", microseconds);
ret = clFinish(command_queue);
// read back partial hashes
ret = clEnqueueReadBuffer(command_queue, buffer_out, CL_TRUE, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE* total_hashes, partial_hashes, 0, NULL, NULL);
have_full_hashes = 0;
}
void load_source()
{
FILE* fp;
fp = fopen("sha256.cl", "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
}
void create_clobj() {
pinned_saved_keys = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, (SHA256_PLAINTEXT_LENGTH)*kpc, NULL, &ret);
saved_plain = (char*)clEnqueueMapBuffer(command_queue, pinned_saved_keys, CL_TRUE, CL_MAP_WRITE | CL_MAP_READ, 0, (SHA256_PLAINTEXT_LENGTH)*kpc, 0, NULL, NULL, &ret);
memset(saved_plain, 0, SHA256_PLAINTEXT_LENGTH * kpc);
res_hashes = (cl_uint*)malloc(sizeof(cl_uint) * SHA256_RESULT_SIZE);
memset(res_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE);
pinned_partial_hashes = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * SHA256_RESULT_SIZE* total_hashes, NULL, &ret);
partial_hashes = (cl_uint*)clEnqueueMapBuffer(command_queue, pinned_partial_hashes, CL_TRUE, CL_MAP_READ, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE* total_hashes, 0, NULL, NULL, &ret);
memset(partial_hashes, 0, sizeof(cl_uint) * SHA256_RESULT_SIZE* total_hashes);
buffer_keys = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(char) * string_len, NULL, &ret);
buffer_out = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * SHA256_RESULT_SIZE* total_hashes, NULL, &ret);
data_info = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned int) * 3, NULL, &ret);
clSetKernelArg(kernel, 0, sizeof(data_info), (void*)&data_info);
clSetKernelArg(kernel, 1, sizeof(buffer_keys), (void*)&buffer_keys);
clSetKernelArg(kernel, 2, sizeof(buffer_out), (void*)&buffer_out);
}
void createDevice()
{
ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 1, &device_id, &ret_num_devices);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
}
void createkernel()
{
program = clCreateProgramWithSource(context, 1, (const char**)&source_str, (const size_t*)&source_size, &ret);
ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
kernel = clCreateKernel(program, "sha256_crypt_kernel", &ret);
command_queue = clCreateCommandQueueWithProperties(context, device_id, 0, &ret);
}
And finally my sha256.cl
#ifndef uint32_t
#define uint32_t unsigned int
#endif
#define H0 0x6a09e667
#define H1 0xbb67ae85
#define H2 0x3c6ef372
#define H3 0xa54ff53a
#define H4 0x510e527f
#define H5 0x9b05688c
#define H6 0x1f83d9ab
#define H7 0x5be0cd19
uint rotr(uint x, int n) {
if (n < 32) return (x >> n) | (x << (32 - n));
return x;
}
uint ch(uint x, uint y, uint z) {
return (x & y) ^ (~x & z);
}
uint maj(uint x, uint y, uint z) {
return (x & y) ^ (x & z) ^ (y & z);
}
uint sigma0(uint x) {
return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
}
uint sigma1(uint x) {
return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
}
uint gamma0(uint x) {
return rotr(x, 7) ^ rotr(x, 18) ^ (x >> 3);
}
uint gamma1(uint x) {
return rotr(x, 17) ^ rotr(x, 19) ^ (x >> 10);
}
__kernel void sha256_crypt_kernel(__global uint *data_info,__global char *plain_key, __global uint *digest){
int workUnit = (int)get_global_id(0);
// printf("workUnit: %i\n", workUnit);
// printf("plain_key %s\n", plain_key);
char real_key[8+1];
int offset = 8*workUnit;
for (int i = 0; i < data_info[2]; i++){
real_key[i] = plain_key[i+offset];
}
// printf("real key: %s \n", real_key);
int t, gid, msg_pad;
int stop, mmod;
uint i, ulen, item, total;
uint W[80], temp, A,B,C,D,E,F,G,H,T1,T2;
uint num_keys = data_info[1];
int current_pad;
uint K[64]={
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
msg_pad=0;
ulen = data_info[2];
total = ulen%64>=56?2:1 + ulen/64;
//printf("ulen: %u total:%u\n", ulen, total);
digest[0+offset] = H0;
digest[1+offset] = H1;
digest[2+offset] = H2;
digest[3+offset] = H3;
digest[4+offset] = H4;
digest[5+offset] = H5;
digest[6+offset] = H6;
digest[7+offset] = H7;
for(item=0; item<total; item++)
{
A = digest[0+offset];
B = digest[1+offset];
C = digest[2+offset];
D = digest[3+offset];
E = digest[4+offset];
F = digest[5+offset];
G = digest[6+offset];
H = digest[7+offset];
#pragma unroll
for (t = 0; t < 80; t++){
W[t] = 0x00000000;
}
msg_pad=item*64;
if(ulen > msg_pad)
{
current_pad = (ulen-msg_pad)>64?64:(ulen-msg_pad);
}
else
{
current_pad =-1;
}
// printf("current_pad: %d\n",current_pad);
if(current_pad>0)
{
i=current_pad;
stop = i/4;
// printf("i:%d, stop: %d msg_pad:%d\n",i,stop, msg_pad);
for (t = 0 ; t < stop ; t++){
W[t] = ((uchar) real_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) real_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) real_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= (uchar) real_key[msg_pad + t * 4 + 3];
//printf("W[%u]: %u\n",t,W[t]);
}
mmod = i % 4;
if ( mmod == 3){
W[t] = ((uchar) real_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) real_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= ((uchar) real_key[msg_pad + t * 4 + 2]) << 8;
W[t] |= ((uchar) 0x80) ;
} else if (mmod == 2) {
W[t] = ((uchar) real_key[msg_pad + t * 4]) << 24;
W[t] |= ((uchar) real_key[msg_pad + t * 4 + 1]) << 16;
W[t] |= 0x8000 ;
} else if (mmod == 1) {
W[t] = ((uchar) real_key[msg_pad + t * 4]) << 24;
W[t] |= 0x800000 ;
} else /*if (mmod == 0)*/ {
W[t] = 0x80000000 ;
}
if (current_pad<56)
{
W[15] = ulen*8 ;
//printf("ulen avlue 2 :w[15] :%u\n", W[15]);
}
}
else if(current_pad <0)
{
if( ulen%64==0)
W[0]=0x80000000;
W[15]=ulen*8;
//printf("ulen avlue 3 :w[15] :%u\n", W[15]);
}
for (t = 0; t < 64; t++) {
if (t >= 16)
W[t] = gamma1(W[t - 2]) + W[t - 7] + gamma0(W[t - 15]) + W[t - 16];
T1 = H + sigma1(E) + ch(E, F, G) + K[t] + W[t];
T2 = sigma0(A) + maj(A, B, C);
H = G; G = F; F = E; E = D + T1; D = C; C = B; B = A; A = T1 + T2;
}
digest[0+offset] += A;
digest[1+offset] += B;
digest[2+offset] += C;
digest[3+offset] += D;
digest[4+offset] += E;
digest[5+offset] += F;
digest[6+offset] += G;
digest[7+offset] += H;
// for (t = 0; t < 80; t++)
// {
// printf("W[%d]: %u\n",t,W[t]);
// }
}
// for (int t = 0; t < 8; t++)
// {
// printf("%x", digest[t]);
// }
// printf("\n");
}

Related

How to manipulate audio data buffers correctly?

I have implemented recording and playing back audio from a microphone in C++. The next step is to process the audio data for speech recognition. For this I want to write them to large buffers so that there are no word breaks. To do this, I implemented copying to large buffers using the memcpy function. Unfortunately, it doesn't work because only part of words can be recognized. What is my mistake and can this buffer manipulation be done in a more convenient way?
My code:
#include <stdio.h>
#include <Windows.h>
#include <mmsystem.h>
#include <iostream>
#include <fstream>
using namespace std;
#pragma comment(lib, "winmm.lib")
#define Samples 16000
#define NUM_FRAMES Samples*2
#define Channels 1
const int NUM_BUF = 4;
int main()
{
HWAVEIN inStream;
HWAVEOUT outStream;
WAVEFORMATEX waveFormat;
WAVEHDR buffer[NUM_BUF];
waveFormat.cbSize = 0;
waveFormat.wFormatTag = WAVE_FORMAT_PCM;
waveFormat.nChannels = Channels;
waveFormat.nSamplesPerSec = Samples;
waveFormat.wBitsPerSample = 16;
waveFormat.nBlockAlign = waveFormat.nChannels * waveFormat.wBitsPerSample / 8;
waveFormat.nAvgBytesPerSec = waveFormat.nBlockAlign * waveFormat.nSamplesPerSec;
HANDLE event = CreateEventA(NULL, TRUE, FALSE, "waveout event");
MMRESULT res = MMSYSERR_NOERROR;
res = waveInOpen(&inStream, WAVE_MAPPER, &waveFormat, (unsigned long)event, 0, CALLBACK_EVENT);
if (res != MMSYSERR_NOERROR) {
printf("error in waveInOpen\n");
return -1;
}
res = waveOutOpen(&outStream, WAVE_MAPPER, &waveFormat, (unsigned long)event, 0, CALLBACK_EVENT);
if (res != MMSYSERR_NOERROR) {
printf("error in waveOutOpen\n");
return -2;
}
short int *_pBuf;
size_t bpbuff = 16000*2;
_pBuf = new short int [bpbuff * NUM_BUF];
for ( int i = 0; i < NUM_BUF; i++ )
{
buffer[i].lpData = (LPSTR)&_pBuf [i * bpbuff];
buffer[i].dwBufferLength = bpbuff*sizeof(*_pBuf);
buffer[i].dwFlags = 0L;
buffer[i].dwLoops = 0L;
waveInPrepareHeader(inStream, & buffer[i], sizeof(WAVEHDR));
}
ResetEvent(event);
for (int index = 0; index < NUM_BUF; index++) // queue all buffers for input
waveInAddBuffer(inStream, &buffer[index], sizeof(WAVEHDR));
waveInStart(inStream);
int len_buff = buffer[0].dwBufferLength*6 + 1;
int limit_buff = buffer[0].dwBufferLength*5 + 1;
int size = buffer[0].dwBufferLength;
int rl = 0;
int flagg = 0;
char * buff1 = new char[len_buff];
char * buff2 = new char[len_buff];
int flag_buf = 0;
int flag1 = 0, flag2 = 0;
int i = 0;
int inIndex = 0, outIndex = 0; // the next input and output to watch
while (true) {
if (buffer[inIndex].dwFlags & WHDR_DONE & flagg!=1)
{
flagg = 1;
waveInAddBuffer(inStream, &buffer[inIndex], sizeof(WAVEHDR));
inIndex = (inIndex + 1) % NUM_BUF;
}
if (buffer[outIndex].dwFlags & WHDR_DONE & flagg!=0) {
flagg = 0;
if (flag_buf == 0)
{
if (rl<limit_buff)
{
cout << rl << endl;
if (flag1 == 0)
{
//strcpy(buff1, buffer[outIndex].lpData);
memcpy(buff1, buffer[outIndex].lpData, size);
flag1 = 1;
rl = size + 1;
}
else
{
//strcat(buff1, buffer[outIndex].lpData);
memcpy(buff1 + rl, buffer[outIndex].lpData, size);
rl = rl + size;
}
}
else
{
//recognize buff1
flag_buf = 1;
flag1 = 0;
rl = 0;
}
}
else
{
if (rl<limit_buff)
{
if (flag2 == 0)
{
memcpy(buff2, buffer[outIndex].lpData, size);
flag2 = 1;
rl = size + 1;
}
else
{
memcpy(buff2 + rl, buffer[outIndex].lpData, size);
rl = rl + size;
}
}
else
{
//recognize buff2
flag_buf = 0;
flag2 = 0;
rl = 0;
}
}
waveOutWrite(outStream, &buffer[outIndex], sizeof(WAVEHDR));
outIndex = (outIndex + 1) % NUM_BUF;
printf("N_buff_%i %i\n",outIndex , i);
i++;
}
}
for (int index = 0; index < 4; index++)
waveInUnprepareHeader(inStream, &buffer[inIndex], sizeof(WAVEHDR));
free(buffer);
}

OpenCL method get_global_id() works incorrectly on GPU

I want to parallelize temperatures distribution, using OpenCL technology. I stocked on problem with my GPU - work item id for every other kernel function are the same. Instead of result, for example, from 0 to 1024, I got this result. What I did incorrectcly?
enter image description here
Source.cpp
include <iostream>
#include <string>
#include <fstream>
#include <omp.h>
#include <CL/cl.hpp>
float*** distributeOpenCL(float*** cuboid, int k, int m, int n)
{
// OpenCL init
int size = k * m * n;
float*** hResult = initCuboid(k, m, n);
cl_platform_id platform;
cl_device_id device;
cl_int error = 0;
std::ifstream file("program.cl");
std::string fileText = std::string(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>());
const char* srcText = fileText.data();
size_t srcLength = fileText.size();
cl_context context;
cl_program program;
cl_kernel kernel;
cl_command_queue queue;
cl_mem dCuboid, dRes;
size_t localSize[2] = { k,m };
size_t globalSize[2] = { ceil(size / (float)localSize[0]) * localSize[0], ceil(size / (float)localSize[1]) * localSize[1] };
// Get GPU
error |= clGetPlatformIDs(1, &platform, NULL);
error |= clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
// Compile and build
context = clCreateContext(NULL, 1, &device, NULL, NULL, &error);
program = clCreateProgramWithSource(context, 1, &srcText, &srcLength, &error);
error |= clBuildProgram(program, 1, &device, NULL, NULL, NULL);
// What funtion from file we have to run
kernel = clCreateKernel(program, "distributeKernel", &error);
// Add to Queue
queue = clCreateCommandQueueWithProperties(context, device, NULL, &error);
// Create buffer
dCuboid = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, NULL);
dRes = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, NULL);
// Write data to buffer
error |= clEnqueueWriteBuffer(queue, dCuboid, CL_TRUE, 0, sizeof(float) * size, cuboid, 0, NULL, NULL);
// Kernel args
error |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &dCuboid);
error |= clSetKernelArg(kernel, 1, sizeof(int), &k);
error |= clSetKernelArg(kernel, 2, sizeof(int), &m);
error |= clSetKernelArg(kernel, 3, sizeof(int), &n);
error |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &dRes);
// Start task
error |= clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, NULL);
// Wait execution
clFinish(queue);
// Read Result
error |= clEnqueueReadBuffer(queue, dRes, CL_TRUE, 0, sizeof(float) * size, hResult, 0, NULL, NULL);
//printCuboid(resP, k, m, n, resPFile);
// Deallocation
clReleaseKernel(kernel);
clReleaseMemObject(dCuboid);
clReleaseMemObject(dRes);
clReleaseCommandQueue(queue);
clReleaseProgram(program);
clReleaseContext(context);
return hResult;
}
int main(int argc, char* argv[])
{
std::ofstream filledFile("filled.txt");
std::ofstream resLFile("resL.txt");
std::ofstream resPFile("resP.txt");
double durationL, durationP, time1, time2;
int k = 5, m = 5, n = 5, temp1 = 10, temp2 = 15;
float*** cuboid, *** resL, *** resP;
if (argc > 1) {
k = atoi(argv[1]), m = atoi(argv[2]), n = atoi(argv[3]),
temp1 = atoi(argv[4]), temp2 = atoi(argv[5]);
}
// Linear
cuboid = initCuboid(k, m, n);
fillCuboid(cuboid, k, m, n, temp1, temp2);
printCuboidToFile(cuboid, k, m, n, filledFile);
time1 = omp_get_wtime();
resL = distribute(cuboid, k, m, n);
time2 = omp_get_wtime();
durationL = time2 - time1;
printCuboidToFile(resL, k, m, n, resLFile);
// Parallel
time1 = omp_get_wtime();
resP = distributeOpenCL(cuboid, k, m, n);
time2 = omp_get_wtime();
durationP = time2 - time1;
//printCuboidToFile(resP, k, m, n, resPFile);
std::cout << "Linear time: " << durationL << std::endl;
std::cout << "Parallel time: " << durationP << std::endl;
std::cout << "Parallel faster than linear on: " << durationL - durationP << std::endl;
// Delete 3d arrays, closing files
deleteCuboid(cuboid, k, m, n);
deleteCuboid(resL, k, m, n);
deleteCuboid(resP, k, m, n);
filledFile.close();
resLFile.close();
resPFile.close();
return 0;
}
program.cl
__kernel void distributeKernel(__global float*** cuboid, int k, int m, int n, __global float*** result)
{
int gz = get_global_id(0);
int gy = get_global_id(1);
printf("gy - %d \n", &gy);
printf("gz - %d \n", &gz);
bool isDissipated = false;
int size = k * m * n;
// Ends if temperatures in cube becomes balanced
while (!isDissipated) {
int dissipatedCount = 0;
for (int x = 0; x < n; x++) {
// Calc average temperature
float sum = 0;
int count = 0;
float average;
for (int zSum = gz - 1; zSum <= gz + 1; zSum++) {
for (int ySum = gy - 1; ySum <= gy + 1; ySum++) {
for (int xSum = x - 1; xSum <= x + 1; xSum++) {
if (zSum >= 0 && ySum >= 0 && xSum >= 0
&& zSum < k && ySum < m && xSum < n) {
count++;
sum += result[gz][gy][xSum];
}
}
}
}
average = round(sum / count * 100) / 100;
if (average == result[gz][gy][x]) {
dissipatedCount++;
}
else {
result[gz][gy][x] = average;
}
}
if (dissipatedCount == size) {
isDissipated = true;
}
}
}
To get the issue with the supposedly wrong get_global_id() fixed, start with a simple, minimal "Hello World"-style vector addition program and than advance forward to your temperature distribution application step-by-step.
With your code I see several issues:
You can only have 1D pointers (with a single *) in OpenCL.
__kernel void distributeKernel(__global float* cuboid, __global float* result)
Introduce a linear index to access more than 1 dimension: For 2D for example int n = x+y*get_global_size(0);
From what I see, k, m, n are lattice dimensions. Eliminate them from the kernel entirely. Get size via get_global_size(...).
The kernel looks rather complex with a lot of loops and branching. This could kill any performance benefit you hope to get from GPU parallelization. Get rid of loops and branching as far as possible. Also, there should not be any loop over one of the lattice dimensions since the lattice position is what you parallelize.
I would also advice to use only 1D parallelization in OpenCL and do the linear indexing yourself. This gives you more flexibility regarding workgroup size.

Half-precision PyTorch float tensors have same performance as single precision float tensors?

Background: I've implemented the antiobject/"field AI" pattern (https://home.cs.colorado.edu/~ralex/papers/PDF/OOPSLA06antiobjects.pdf) for single diffusion using LibTorch/PyTorch.
This works fine, but in the process of running it on the GPU and optimizing it, I've run into a problem. I have a Titan V, which I believe excels at half-precision float math. However, when I make the tensors torch::kHalf, the performance is the same. (I've also tried torch::kFloat16). Any ideas?
The code that I timed is in update():
#define SDL_MAIN_HANDLED
#include <simple2d.h>
#include <torch/torch.h>
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDAEvent.h>
#include <math.h>
#include <chrono>
#define DEBUG_NO_DRAW
torch::Device gpu(torch::kCUDA);
torch::Device cpu(torch::kCPU);
torch::Device device = gpu;
const int windowLength = 1000;
const int64_t length = 500;
const float diffusionRate = 0.25;
const int obstacleCount = 4000;
const int entityCount = 1000;
float cellLength = windowLength / length;
torch::Tensor scent = torch::zeros({ length, length }, device).to(torch::kHalf);
torch::Tensor up, down, left, right;
torch::Tensor topWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor bottomWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor leftWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor rightWallMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor obstaclesMask = torch::ones({ length, length }, device).to(torch::kHalf);
torch::Tensor entities = torch::zeros({ length, length }, device).to(torch::kHalf);
c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream();
std::time_t *lastFpsUpdate = NULL;
std::time_t *currentTime = new std::time_t();
int frameAccumulator = 0;
std::vector<long> updateDurations;
void update() {
torch::NoGradGuard no_grad;
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
auto startTime = std::chrono::high_resolution_clock::now();
down = scent.roll(1, 0) * obstaclesMask * topWallMask;
up = scent.roll(-1, 0) * obstaclesMask * bottomWallMask;
right = scent.roll(1, 1) * obstaclesMask * leftWallMask;
left = scent.roll(-1, 1) * obstaclesMask * rightWallMask;
scent = scent + ((down - scent) + (up - scent) + (right - scent) + (left - scent)) * diffusionRate;
scent = torch::max(scent, entities);
AT_CUDA_CHECK(cudaStreamSynchronize(stream));
auto endTime = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime);
updateDurations.push_back(duration.count());
}
void render() {
if (lastFpsUpdate == NULL) {
lastFpsUpdate = new std::time_t();
std::time(lastFpsUpdate);
}
torch::Tensor sqrtedScent = scent.sqrt().to(torch::kFloat).to(cpu); // just to make darker scents a little brighter for display
auto obstaclesMaskCPU = obstaclesMask.to(torch::kFloat).to(cpu);
auto sqrtedScentAccessor = sqrtedScent.accessor<float, 2>();
auto obstaclesMaskAccessor = obstaclesMaskCPU.accessor<float, 2>();
float r = 0, g = 0, b = 0, a = 0;
#ifndef DEBUG_NO_DRAW
S2D_DrawQuad(
0, 0, 0, 0, 0, 1,
windowLength, 0, 0, 0, 0, 1,
windowLength, windowLength, 0, 0, 0, 1,
0, windowLength, 0, 0, 0, 1);
#endif
for (int i = 0; i < length; i++) {
for(int j = 0; j < length; j++) {
if (obstaclesMaskAccessor[i][j] == 0) {
r = 1; g = 1; b = 1; a = 1;
}
else {
r = 1; g = 0; b = 0; a = sqrtedScentAccessor[i][j];
}
#ifndef DEBUG_NO_DRAW
S2D_DrawQuad(cellLength * j, cellLength * i, r, g, b, a,
cellLength * (j + 1), cellLength * i, r, g, b, a,
cellLength * (j + 1), cellLength * (i + 1), r, g, b, a,
cellLength * j, cellLength * (i + 1), r, g, b, a);
#endif
}
}
frameAccumulator++;
std::time(currentTime);
if (std::difftime(*currentTime, *lastFpsUpdate) > 1.0) {
std::cout << "FPS: " << frameAccumulator << std::endl;
frameAccumulator = 0;
*lastFpsUpdate = *currentTime;
int updateCount = updateDurations.size();
long totalUpdateTime = 0;
for (int i = 0; i < updateCount; i++) {
totalUpdateTime += updateDurations[i];
}
long averageUpdateTime = totalUpdateTime / updateCount;
std::cout << "AverageUpdateTime: " << averageUpdateTime << "us" << std::endl;
updateDurations.clear();
}
}
int main() {
if (torch::cuda::is_available()) {
std::cout << "CUDA is available!" << std::endl;
}
std::cout << "Using " << (device == cpu ? "CPU" : "GPU") << std::endl;
for (int i = 0; i < length; i++) {
topWallMask[0][i] = 0;
bottomWallMask[length - 1][i] = 0;
leftWallMask[i][0] = 0;
rightWallMask[i][length - 1] = 0;
}
for (int i = 0; i < obstacleCount; i++) {
int x = rand() % length;
int y = rand() % length;
obstaclesMask[x][y] = 0;
}
//std::cout << obstaclesMask << std::endl;
for (int i = 0; i < entityCount; i++) {
int x = rand() % length;
int y = rand() % length;
if (obstaclesMask[x][y].item() == 0)
continue;
entities[x][y] = 1;
}
S2D_Window* window = S2D_CreateWindow(
"Collab Diffuse", windowLength, windowLength, update, render, 0
);
S2D_Show(window);
return 0;
}
In both single precision and half precision versions of the code, update() takes about 2700 microseconds.
I'm using PyTorch/LibTorch 1.7.1.
Any other performance tips would be appreciated. (I'm aware drawing pixel by pixel is very slow, so I plan to switch from Simple2D to something else that can draw bitmaps from memory).

How can reverse a part of an array?

I am writing a program that takes a bitmap file to read into memory. But as I am reading it into memory I am making some changes. First I am inverting the colors of the pixels. I managed to get this working. Now I am trying to flip the image on the Y-Axis. I have tried using two for loops but would end up get segmentation faults and also I didn't like how messy it looked. On my second attempt I found a different approach that's cleaner due to it only using one loop and one condition vs 2 loops and 2 conditions. My code now produces no errors but doesn't perform the intended operation. Is there another algorithm I could possibly try?
Below is some code for part of my program. I am trying to reverse the pixel when I am reading them row by row.
fseek(fin, bfh.offset, SEEK_SET);
Pixel *p = new Pixel[bih.height * bih.width];
for (uint32_t i = 0; i < bih.height; i++) {
for (uint32_t j = 0; j < bih.width; j++) {
uint32_t index = i * bih.width + j;
fread(&p[index], 1, sizeof(p[0]), fin);
p[index].blue = 255 - p[index].blue;
p[index].green = 255 - p[index].green;
p[index].red = 255 - p[index].red;
}
uint32_t k = (bih.width * i) - 1;
uint32_t c = 0 + (i * bih.width);
if ( i == 0) {
k = bih.width - 1;
}
while( c < k)
{
temp = p[c];
p[c] = p[k];
p[k] = temp;
c++;
k--;
}
fseek(fin, padding_bytes, SEEK_CUR);
}
fclose(fin);
Below is my whole program if needed.
#include <cstdint>
#include <cstdio>
#pragma pack(push, 2)
struct BitmapFileHeader {
uint16_t type;
uint32_t size;
uint16_t reserved_1;
uint16_t reserved_2;
uint32_t offset;
};
struct BitmapInfoHeader {
uint32_t size;
uint32_t width;
uint32_t height;
uint16_t planes;
uint16_t bitcount;
uint32_t compression;
uint32_t imagesize;
uint32_t x_pixels_per_meter;
uint32_t y_pixels_per_meter;
uint32_t color_used;
uint32_t color_important;
};
#pragma pack(pop)
struct Pixel {
uint8_t blue;
uint8_t green;
uint8_t red;
};
int main(int argc, char* argv[])
{
if(argc != 3) {
printf("Usage : %s input_file output_file\n", argv[0]);
return 1;
}
FILE *fin;
FILE *fout;
BitmapFileHeader bfh;
BitmapInfoHeader bih;
Pixel temp;
fin = fopen(argv[1], "rb");
if (nullptr == fin) {
perror(argv[1]);
return -1;
}
if (sizeof(BitmapFileHeader) != fread(
&bfh,
1,
sizeof(bfh),
fin
)) {
printf("Unable to read bitmap file header. \n");
return -2;
}
if (sizeof(BitmapInfoHeader) != fread(
&bih,
1,
sizeof(bih),
fin
)) {
printf("Unable to read bitmap info header. \n");
return -3;
}
printf("Size of File Header = %lu\n", sizeof(BitmapFileHeader));
int8_t first = (bfh.type >> 8) & 0xff;
int8_t second = bfh.type & 0xff;
if ( (first != 'M') && (second != 'B') ){
printf("Input file is not a Bitmap file. \n");
return -4;
}
printf("File type = %c%c\n", first, second);
printf("File size = %u\n", bfh.size);
printf("File offset = %u\n", bfh.offset);
printf("File width = %u\n", bih.width);
printf("Info size = %u\n", bih.size);
uint32_t padding_bytes = 0;
uint32_t row_bytes_final = bih.width * sizeof(Pixel);
uint32_t row_bytes_initial = row_bytes_final;
do{
uint32_t rem = row_bytes_final % 4;
if (rem != 0) {
row_bytes_final += 1;
}
padding_bytes = row_bytes_final - row_bytes_initial;
} while( (row_bytes_final % 4) != 0);
fseek(fin, bfh.offset, SEEK_SET);
Pixel *p = new Pixel[bih.height * bih.width];
for (uint32_t i = 0; i < bih.height; i++) {
for (uint32_t j = 0; j < bih.width; j++) {
uint32_t index = i * bih.width + j;
fread(&p[index], 1, sizeof(p[0]), fin);
p[index].blue = 255 - p[index].blue;
p[index].green = 255 - p[index].green;
p[index].red = 255 - p[index].red;
}
uint32_t k = (bih.width * i) - 1;
uint32_t c = 0 + (i * bih.width);
if ( i == 0) {
k = bih.width - 1;
}
while( (c * bih.width) < (k * bih.width))
{
temp = p[c];
p[c] = p[k];
p[k] = temp;
c++;
k--;
}
fseek(fin, padding_bytes, SEEK_CUR);
}
fclose(fin);
fout = fopen(argv[2], "wb");
if(nullptr == fout) {
perror(argv[2]);
return -5;
}
if( sizeof(BitmapFileHeader) != fwrite(
&bfh,
1,
sizeof(bfh),
fout
)) {
printf("Unable to write bitmap file header.\n");
return -6;
}
if( sizeof(BitmapInfoHeader) != fwrite(
&bih,
1,
sizeof(bih),
fout
)) {
printf("Unable to write bitmap info header.\n");
return -7;
}
fseek(fout, bfh.offset, SEEK_SET);
for (uint32_t i = 0; i < bih.height; i++) {
for (uint32_t j = 0; j < bih.width; j++) {
uint32_t index = i * bih.width + j;
fwrite(&p[index], 1, sizeof(p[0]), fout);
}
fseek(fout, padding_bytes, SEEK_CUR);
}
if (padding_bytes > 0) {
fseek(fout, -1, SEEK_CUR);
fputc('\0', fout);
}
fclose(fout);
delete[] p;
return 0;
}
You got the bounds wrong, it should be c = i * bih.width; k = (i + 1) * bih.width - 1;.
You can also use std::reverse to do this:
std::reverse(p + i * bih.width, p + (i + 1) * bih.width); // Exclusive end, so no -1

What's wrong with this parallel algorithm?

I'm trying to write parallel algorithm in openCL for L-system Pythagoras Tree :
var:A,B;
const: (,);
axiom:A;
rules:(B->BB),(A->B[A]A)
But i can't get over 9th iteration. 10th iteration returns disordered string. Here is my kernel:
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_amd_printf : enable
__kernel void l_system(int string_lenght){}
__kernel void l_system_interation(int string_lenght, __global char *sentence, __local char *string, __global int * local_char_num)
{
int local_x = (int)get_local_id(0);
int local_size = (int)get_local_size(0);
int x = (int)get_global_id(0);
int size = (int)get_global_size(0);
int group = (int)get_group_id(0);
int local_mem_index;
if(x < string_lenght){
//local mem index - offset for next group, copy char to local
local_mem_index = local_x * 5;
string[local_mem_index] = sentence[x];
if(local_x == 0){
//reset counter
atomic_xchg(&local_char_num[group], 0);
//atomic_add(&local_char_num[0], group);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
if(x < string_lenght){
if(string[local_mem_index] == 'A'){
atomic_add(&local_char_num[group], 5);
string[local_mem_index] = 'B';
string[local_mem_index + 1] = '(';
string[local_mem_index + 2] = 'A';
string[local_mem_index + 3] = ')';
string[local_mem_index + 4] = 'A';
}
else if(string[local_mem_index] == 'B'){
atomic_add(&local_char_num[group], 2);
string[local_mem_index + 1] = 'B';
//reset 3rd char of local_mem
string[local_mem_index + 2] = '0';
}
else{
atomic_add(&local_char_num[group], 1);
//reset 3rd char of local_mem
string[local_mem_index + 2] = '0';
string[local_mem_index + 2] = '0';
}
}
barrier(CLK_LOCAL_MEM_FENCE);
barrier(CLK_GLOBAL_MEM_FENCE);
//1 compute unit for every char from src
if(x < string_lenght){
//local first compute unit writes to result whole group string
if(local_x == 0){
int j = 0;
//find offset for write to result string
if(x != 0){
for(int l = 1;l <= group; l++)
{
j += atomic_xchg(&local_char_num[group-l], local_char_num[group-l]);
//if(l == 0)
}
atomic_xchg(&local_char_num[99+group], local_char_num[group]);
}
for(int i = 0; i < local_size; i++){
//only valid chars
if(string_lenght > (x+i)){
local_mem_index = i * 5;
//B rule, copy (,)
if(string[local_mem_index+2] != 'A'){
sentence[j++] = string[local_mem_index];
if(string[local_mem_index] == 'B'){
sentence[j++] = string[local_mem_index+1];
}
continue;//B,(,); next index;
}
else{ // A rule
sentence[j++] = string[local_mem_index];
sentence[j++] = string[local_mem_index+1];
sentence[j++] = string[local_mem_index+2];
sentence[j++] = string[local_mem_index+3];
sentence[j++] = string[local_mem_index+4];
}//if 'A'
//sentence[j] = 0;
}//if x+i
}//for
}// lx == 0
}
barrier(CLK_GLOBAL_MEM_FENCE);
}
I think, that something overflow anywhere, but can't find where... Maybe there is something wrong with my code in main:
cl_int letter_count = 0;
cl_int next_letter_count = 1;
for (int i = 0; i < iter_count; i++)
{
//printf("%s\n", sentence_init);
letter_count = next_letter_count;
next_letter_count = STRING_LENGTH_PAR((i + 1));
printf("in count: %d out count: %d\n", letter_count, next_letter_count);
CheckOpenCLError(clSetKernelArg(kernel_iteration, 0, sizeof(cl_int), &letter_count), "clSetKernelArg: letter_count");
CheckOpenCLError(clSetKernelArg(kernel_iteration, 2, sizeof(cl_char)* (local * RULE_SIZE + 1), NULL), "clSetKernelArg: tmp_string");
CheckOpenCLError(clEnqueueNDRangeKernel(queue, kernel_iteration, 1, NULL, &global, &local, 0, NULL, &kernel_iteration_event), "clEnqueueNDRangeKernel: kernel_iteration");
CheckOpenCLError(clFinish(queue), "clFinish");
kernel_computing_time += getEventTime(kernel_iteration_event);
}
CheckOpenCLError(clEnqueueReadBuffer(queue, sentence_dev, CL_TRUE, 0, sizeof(cl_char)* (next_letter_count), sentence_result, 0, NULL, &result_iteration_event), "clEnqueueReadBuffer: result_iteration_event");
cl_int *p = (cl_int*)malloc(sizeof(cl_int)*(STRING_LENGTH_PAR(iter_count)));
CheckOpenCLError(clEnqueueReadBuffer(queue, p_dev, CL_TRUE, 0, sizeof(cl_int)* (STRING_LENGTH_PAR(iter_count)), p, 0, NULL, &result_iteration_event), "clEnqueueReadBuffer: result_iteration_event");