CUDA SHA calculation fails [closed] - c++

I want to write a SHA-1 function in CUDA, but when I execute it I get wrong results. When I run the same function on the CPU, I get correct results. My SHA function looks like this:
__device__ void SHA1_CUDA(uint8_t input_string[], int slen, uint32_t Hash_ptr[])
{
    printf("Input string is %s, input len is %d\n", input_string, slen);
    uint32_t K[80];
    uint32_t A, B, C, D, E, TEMP;
    int r, k, ln, t, l, i, j;

    Hash_ptr[0] = 0x67452301;
    Hash_ptr[1] = 0xefcdab89;
    Hash_ptr[2] = 0x98badcfe;
    Hash_ptr[3] = 0x10325476;
    Hash_ptr[4] = 0xc3d2e1f0;

    ln = slen;
    r = (int)((ln + 1) / 64);
    if (((ln + 1) % 64) > 56)
    {
        r = r + 1;
    }

    // initialize constants
    for (t = 0; t < 80; t++)
    {
        if (t < 20)
        {
            K[t] = 0x5a827999;
        }
        if ((t > 19) && (t < 40))
        {
            K[t] = 0x6ED9EBA1;
        }
        if ((t > 39) && (t < 60))
        {
            K[t] = 0x8F1BBCDC;
        }
        if (t > 59)
        {
            K[t] = 0xca62c1d6;
        }
    }

    for (l = 0; l <= r; l++)
    {
        uint32_t W[80] = {0};
        // initialize text
        for (i = 0; i < 16; i++)
        {
            for (j = 0; j < 4; j++)
            {
                if (4 * i + j <= ln)
                {
                    k = input_string[64 * l + 4 * i + j];
                }
                else
                {
                    k = 0;
                }
                if (k < 0)
                {
                    k = k + 256;
                }
                if (4 * i + j == ln)
                {
                    k = 0x80;
                }
                // W[i] = W[i] + k * (uint32_t)pow(256, (double)3 - j);
                W[i] = W[i] + k * expo_d[3 - j];
            }
        }
        if ((W[14] == 0) && (W[15] == 0))
        {
            W[15] = 8 * slen;
        }
        // hash cycle
        for (t = 16; t < 80; t++)
        {
            W[t] = Rol(W[t - 3] ^ W[t - 8] ^ W[t - 14] ^ W[t - 16], 1);
        }
        A = Hash_ptr[0];
        B = Hash_ptr[1];
        C = Hash_ptr[2];
        D = Hash_ptr[3];
        E = Hash_ptr[4];
        for (t = 0; t < 80; t++)
        {
            TEMP = (Rol(A, 5) + f(B, C, D, t) + E + W[t] + K[t]);
            E = D;
            D = C;
            C = Rol(B, 30);
            B = A;
            A = TEMP;
        }
        Hash_ptr[0] = Hash_ptr[0] + A;
        Hash_ptr[1] = Hash_ptr[1] + B;
        Hash_ptr[2] = Hash_ptr[2] + C;
        Hash_ptr[3] = Hash_ptr[3] + D;
        Hash_ptr[4] = Hash_ptr[4] + E;
        ln = ln - 64;
    }
}
(The host function is analogous, only with __host__ instead of __device__.)
My kernel function is
__global__ void test_sha(uint8_t pw[], int *pw_len, uint32_t H[])
{
    SHA1_CUDA(pw, *pw_len, H);
}
and I'm calling it like this:
printf("\nTesting SHA\n");
uint32_t * H_h = (uint32_t*)malloc(sizeof(uint32_t)*5);
memset(H_h, 0, sizeof(uint32_t) * 5);
uint32_t * H_d;
cudaMalloc(&H_d, sizeof(uint32_t)*5);
cudaMemcpy(H_d, H_h, 5*sizeof(uint32_t), cudaMemcpyHostToDevice);
test_sha<<<1, 1>>>(Pass_d, Pass_len_d, H_d);
cudaMemcpy(H_h, H_d, 5*sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaFree(H_d);
for(int i = 0; i < 5; i++)
printf("%x ", H_h[i]);
printf("\n\n");
printf("Comparing to CPU: \n");
SHA1_CUDA_h(Pass_h, Pass_len, H_h);
for(int i = 0; i < 5; i++)
printf("%x ", H_h[i]);
printf("\n\n");
free(H_h);
So the printf call in the SHA function tells me that everything has been transferred correctly, but I nevertheless get wrong results...
Where is my mistake?
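As a side note, none of the CUDA API calls above are checked, so an earlier failure would go unnoticed. A minimal checking pattern, as a sketch (the macro name is illustrative; the runtime calls themselves are standard CUDA):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with file/line and the CUDA error string on any API failure.
#define CUDA_CHECK(call)                                           \
    do {                                                           \
        cudaError_t err_ = (call);                                 \
        if (err_ != cudaSuccess) {                                 \
            fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                     \
            exit(1);                                               \
        }                                                          \
    } while (0)

// Usage:
// CUDA_CHECK(cudaMalloc(&H_d, sizeof(uint32_t) * 5));
// test_sha<<<1, 1>>>(Pass_d, Pass_len_d, H_d);
// CUDA_CHECK(cudaGetLastError());      // catches launch errors
// CUDA_CHECK(cudaDeviceSynchronize()); // catches execution errors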

Problem solved: the ROL function Rol_CUDA I was using in my function returned bad values, so nobody except me could have solved the problem.
For everyone who wants to use this function: in line 51 on pastebin the shift amount should be 32-y, not -y. With this correction everything works.
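For reference, a correct 32-bit rotate-left, the operation Rol_CUDA is meant to perform, can be sketched as follows (the signature is assumed; the pastebin source is not reproduced here):

#include <cstdint>

// Rotate x left by y bits, for 0 < y < 32 (SHA-1 only uses y = 1, 5, 30).
// The broken version shifted right by -y instead of 32 - y.
__host__ __device__ uint32_t Rol(uint32_t x, int y)
{
    return (x << y) | (x >> (32 - y));
}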

Related

"Enlarging" a 2D array by factor (m,n) [closed]

Say I have a 2D array of size h1 x w1.
I want to enlarge it (just like we enlarge an image) by a factor (m, n), so that my resulting array is of size h1*m x w1*n.
For example, I have an array arr1[2][2] of size (2, 2):
01
11
Now I enlarge it by a factor (3, 3), so my new array arr2[6][6] becomes:
000111
000111
000111
111111
111111
111111
Can you suggest an algorithm / iterative loop that could generate arr2 from the given information?
I use pointers to arrays to index contiguous memory as a 2D array, together with a VLA declarator in the function prototype.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdint.h>

#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
#define ERR_ON(expr, msg, ...) \
    do { if (expr) { fprintf(stderr, "%d: error %s failed: " msg "\n", __LINE__, #expr, ##__VA_ARGS__); exit(-1); } } while (0)

/**
 * Input: array p with x columns and y rows
 * Output: int arr[y * factory][x * factorx],
 *         a newly allocated array created from p, resized by factorx and factory
 */
void *arr2d_getenlarged(size_t x, size_t y, int p[y][x], size_t factorx, size_t factory)
{
    const size_t newx = x * factorx;
    const size_t newy = y * factory;
    int (* const ret)[newx] = malloc(sizeof(int) * newx * newy); /* newx * newy ints */
    ERR_ON(ret == NULL, "");
    for (size_t i = 0; i < y; ++i) {       /* rows of p */
        for (size_t j = 0; j < x; ++j) {   /* columns of p */
            const int val = p[i][j];
            for (size_t m = 0; m < factory; ++m) {
                for (size_t n = 0; n < factorx; ++n) {
                    ret[i * factory + m][j * factorx + n] = val;
                }
            }
        }
    }
    return ret;
}
void arr2d_print(size_t x, size_t y, int (*arr)[x])
{
    printf("--- %p ---\n", (void *)arr);
    for (size_t i = 0; i < y; ++i) {
        for (size_t j = 0; j < x; ++j) {
            printf("[%zu,%zu]=%d ", i, j, arr[i][j]);
        }
        printf("\n");
    }
}
int main(void)
{
    int (*arr)[2] = malloc(sizeof(*arr) * 2);
    ERR_ON(arr == NULL, "");
    memcpy(arr, (int[2][2]){{0, 1}, {1, 1}}, sizeof((int[2][2]){0}));
    arr2d_print(2, 2, arr);
    int (*arr3)[6] = arr2d_getenlarged(2, 2, arr, 3, 3);
    arr2d_print(6, 6, arr3);
    free(arr);
    free(arr3);
    printf("Hello World");
    return 0;
}
Example output:
--- 0x1203010 ---
[0,0]=0 [0,1]=1
[1,0]=1 [1,1]=1
--- 0x1203030 ---
[0,0]=0 [0,1]=0 [0,2]=0 [0,3]=1 [0,4]=1 [0,5]=1
[1,0]=0 [1,1]=0 [1,2]=0 [1,3]=1 [1,4]=1 [1,5]=1
[2,0]=0 [2,1]=0 [2,2]=0 [2,3]=1 [2,4]=1 [2,5]=1
[3,0]=1 [3,1]=1 [3,2]=1 [3,3]=1 [3,4]=1 [3,5]=1
[4,0]=1 [4,1]=1 [4,2]=1 [4,3]=1 [4,4]=1 [4,5]=1
[5,0]=1 [5,1]=1 [5,2]=1 [5,3]=1 [5,4]=1 [5,5]=1
Hello World
Live version available at onlinegdb.
#include <stdio.h>
#include <stdlib.h>

#define ENLARGEX 3

int src[2][2] = {{0, 1}, {1, 1}};

int main(int argc, const char *argv[])
{
    int **dest;
    int i, j;
    int sizesrc = sizeof(src) / sizeof(int) / 2;

    dest = (int **)malloc(sizeof(int *) * sizesrc * ENLARGEX);
    for (i = 0; i < sizesrc * ENLARGEX; i++) {
        dest[i] = (int *)malloc(sizeof(int) * sizesrc * ENLARGEX);
    }
    for (i = 0; i < sizesrc * ENLARGEX; i++) {
        for (j = 0; j < sizesrc * ENLARGEX; j++) {
            dest[i][j] = src[i / ENLARGEX][j / ENLARGEX];
            printf("%d ", dest[i][j]);
        }
        printf("\n");
    }
    for (i = 0; i < sizesrc * ENLARGEX; i++) {
        free(dest[i]);
    }
    free(dest);
    return 0;
}
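Both answers implement the same index mapping, dest[i][j] = src[i / m][j / n]. A compact sketch of just that idea in C++ (the function name and the vector-of-vectors layout are illustrative, taken from neither answer):

#include <vector>

// Each source cell is replicated into an m x n block of the destination.
std::vector<std::vector<int>> enlarge(const std::vector<std::vector<int>> &src,
                                      int m, int n)
{
    const int h = (int)src.size(), w = (int)src[0].size();
    std::vector<std::vector<int>> dst(h * m, std::vector<int>(w * n));
    for (int i = 0; i < h * m; ++i)
        for (int j = 0; j < w * n; ++j)
            dst[i][j] = src[i / m][j / n];
    return dst;
}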

Corrupted memory issue when deleting allocated memory

I am trying to store a sparse vector using a bit mask. I allocate a char* to represent the bit mask. However, when I delete [] the mask, I get a memory corruption error. Upon investigation, I'm seeing that it's because I'm freeing memory that I'm not supposed to. This is confusing, since I don't see how this could be the case.
When I run this on my data, it prints out "ALLOCATED" and "DEALLOCATING" but nothing further.
#include <cmath>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <vector>
using namespace std; // the snippet uses unqualified vector/ofstream/cout

void set_i_bit(char *mask, int i)
{
    int field_num = floor(i / 8);
    int bit_num = i % 8;
    mask[field_num] = (1 << bit_num) | mask[field_num];
}

int write_sparse_with_bitmask(vector<float> arr, ofstream *fout)
{
    int mx_sz = arr.size() - 1;
    float tol = 0.5;
    char *mask = 0;
    for (int i = arr.size() - 1; i >= 0; i -= 1) {
        if (fabs(arr[i]) > tol) break;
        mx_sz = i;
    }
    int sprse_cnt = 0;
    for (int i = 0; i <= mx_sz; i += 1) {
        if (fabs(arr[i]) < tol) sprse_cnt++;
    }
    int bitmask_sz = ceil(mx_sz / 8);
    if (sprse_cnt * sizeof(int16_t) + sizeof(int16_t) > bitmask_sz) {
        cout << "ALLOCATED" << endl;
        mask = new char[bitmask_sz];
        for (int i = 0; i < bitmask_sz; i++) mask[i] = 0;
        for (int i = 0; i <= mx_sz; i += 1) {
            if (fabs(arr[i]) > tol) { // 'coef_tol' in the original post; 'tol' is clearly meant
                set_i_bit(mask, i);
            }
        }
    } else {
        bitmask_sz = 0;
    }
    uint16_t sz = mx_sz + 1;
    uint16_t bt_msk = bitmask_sz + 1;
    char flag = 0;
    if (bitmask_sz > 0) {
        flag = flag | 1;
    }
    fout->write((char *)&sz, sizeof(uint16_t));
    fout->write((char *)&flag, sizeof(char));
    int w_size = sizeof(uint16_t) + sizeof(char);
    if (flag & 1) {
        fout->write((char *)&bt_msk, sizeof(uint16_t));
        fout->write(mask, sizeof(char) * bt_msk);
        cout << "DEALLOCATING" << endl;
        delete[] mask;
        cout << "THIS DOESN'T PRINT" << endl;
        w_size += sizeof(uint16_t) + sizeof(char) * bt_msk;
    }
    for (int i = 0; i <= mx_sz; i += 1) {
        if (fabs(arr[i]) > tol || !(flag & 1)) {
            int16_t vl = arr[i];
            fout->write((char *)&vl, sizeof(int16_t));
            w_size += sizeof(int16_t);
        }
    }
    return w_size;
}
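No answer survives here, but one likely culprit stands out, assuming the code is as posted: in bitmask_sz = ceil(mx_sz/8) the division is integer division, so it truncates before ceil ever runs. Bits 0 through mx_sz need mx_sz/8 + 1 bytes, so set_i_bit can write one byte past the allocation, and delete[] later trips over the corrupted heap metadata. (The write of bt_msk = bitmask_sz + 1 bytes from mask also reads past the buffer.) A sketch of the corrected sizing, under that reading:

// mx_sz + 1 bits must fit; round up in integer arithmetic.
int bitmask_sz = (mx_sz + 1 + 7) / 8; // equals mx_sz / 8 + 1, never too small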

Gradient descent converging towards the wrong value

I'm trying to implement a gradient descent algorithm in C++. Here's the code I have so far :
#include <iostream>

double X[] {163, 169, 158, 158, 161, 172, 156, 161, 154, 145};
double Y[] {52, 68, 49, 73, 71, 99, 50, 82, 56, 46};
double m, p;
int n = sizeof(X) / sizeof(X[0]);

void gradientStep(double alpha); // forward declarations so main() compiles
double Loss_function(void);

int main(void)
{
    double alpha = 0.00004; // 0.00007;
    m = (Y[1] - Y[0]) / (X[1] - X[0]);
    p = Y[0] - m * X[0];
    for (int i = 1; i <= 8; i++) {
        gradientStep(alpha);
    }
    return 0;
}

double Loss_function(void)
{
    double res = 0;
    double tmp;
    for (int i = 0; i < n; i++) {
        tmp = Y[i] - m * X[i] - p;
        res += tmp * tmp;
    }
    return res / 2.0 / (double)n;
}

void gradientStep(double alpha)
{
    double pg = 0, mg = 0;
    for (int i = 0; i < n; i++) {
        pg += Y[i] - m * X[i] - p;
        mg += X[i] * (Y[i] - m * X[i] - p);
    }
    p += alpha * pg / n;
    m += alpha * mg / n;
}
This code converges towards m = 2.79822, p = -382.666, with an error of 102.88. But if I use my calculator to work out the correct linear regression model, I find that the values of m and p should be 1.601 and -191.1 respectively.
I also noticed that the algorithm won't converge for alpha > 0.00007, which seems quite low, and that the value of p barely changes during the 8 iterations (or even after 2000 iterations).
What's wrong with my code?
Here's a good overview of the algorithm I'm trying to implement. The values of theta0 and theta1 are called p and m in my program.
Other implementation in python
More about the algorithm
This link gives a comprehensive view of the algorithm; it turns out I was following a completely wrong approach.
The following code does not work properly (and I have no plans to work on it further), but it should put anyone who's confronted with the same problem on the right track:
#include <vector>
#include <iostream>

typedef std::vector<double> vect;

std::vector<double> y, omega(2, 0), omega2(2, 0);
std::vector<std::vector<double>> X;
int n = 10;

double f_function(const std::vector<double> &x); // forward declarations
void gradientStep(double alpha);
void display(void);

int main(void)
{
    /* Initialize X so that each member contains (1, x_i) */
    /* Initialize y so that each member contains y_i */
    double alpha = 0.00001;
    display();
    for (int i = 1; i <= 8; i++) {
        gradientStep(alpha);
        display();
    }
    return 0;
}

double f_function(const std::vector<double> &x)
{
    double c = 0; // was uninitialized in the original
    for (unsigned int i = 0; i < omega.size(); i++) {
        c += omega[i] * x[i];
    }
    return c;
}

void gradientStep(double alpha)
{
    for (int i = 0; i < n; i++) {
        for (unsigned int j = 0; j < X[0].size(); j++) {
            omega2[j] -= alpha / (double)n * (f_function(X[i]) - y[i]) * X[i][j];
        }
    }
    omega = omega2;
}

void display(void)
{
    double res = 0, tmp = 0;
    for (int i = 0; i < n; i++) {
        tmp = y[i] - f_function(X[i]);
        res += tmp * tmp; // loss function
    }
    std::cout << "omega = ";
    for (unsigned int i = 0; i < omega.size(); i++) {
        std::cout << "[" << omega[i] << "] ";
    }
    std::cout << "\tError : " << res * .5 / (double)n << std::endl;
}
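A note for anyone trying to salvage the first version: its gradient step is actually a correct least-squares update; the trouble is conditioning. With X values around 150-170 the loss surface is extremely elongated, so alpha must stay tiny to keep m stable, and p then moves glacially. Centring X first is a standard remedy; a sketch using the globals from the first snippet (step size and iteration count are illustrative, not tuned):

// Centre X, descend, then map the intercept back.
double mean = 0;
for (int i = 0; i < n; i++) mean += X[i];
mean /= n;
for (int i = 0; i < n; i++) X[i] -= mean;

for (int it = 0; it < 500; it++) gradientStep(0.01);

// y = m * (X - mean) + p, so the intercept in the original coordinates is:
double p_orig = p - m * mean;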

Seeking knowledge on array of arrays memory performance

Context: Multichannel real time digital audio processing.
Access pattern: "Column-major", like so:
for (int sample = 0; sample < size; ++sample)
{
    for (int channel = 0; channel < channels; ++channel) // 'channels' is the channel count; the original said 'size' here
    {
        auto data = arr[channel][sample];
        // do some computations
    }
}
I'm seeking advice on how to make life easier for the CPU and memory in general. I realize interleaving the data would be better, but it's not possible.
My theory is that as long as you access memory sequentially for a while, the CPU will prefetch it. Will this hold for N (channel) buffers? What about the size of the buffers, are there any "breaking points"?
Will it be very beneficial to have the channels in contiguous memory (increasing locality), or does that only hold for very small buffers (around the size of cache lines)? The buffers could easily be more than 100 kB apart.
I guess there is also a point where the computational part dominates and makes memory optimizations negligible?
Is this a case where manual prefetching makes sense?
I could test/profile my own system, but I only have that: one system. Any design choices I make might therefore only benefit that particular system. Any knowledge on these matters is appreciated: links, literature, platform-specific knowledge.
Let me know if the question is too vague; I primarily thought it would be nice to have some wiki-ish experience / info on this area.
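On the manual-prefetching question, this is what an explicit hint looks like with the GCC/Clang builtin (MSVC has _mm_prefetch instead). The lookahead distance is an illustrative guess that would need tuning, and channels stands in for the channel count:

// Sketch: software-prefetch one cache line of floats ahead of the read position.
constexpr int lookahead = 16; // 16 floats = one 64-byte cache line

for (int sample = 0; sample < size; ++sample)
{
    for (int channel = 0; channel < channels; ++channel)
    {
        if (sample + lookahead < size)
            __builtin_prefetch(&arr[channel][sample + lookahead]);
        auto data = arr[channel][sample];
        // do some computations
    }
}

Whether this helps at all depends on whether the hardware prefetcher is already tracking each channel's stream, so measure before keeping it.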
edit:
I created a program that tests the three cases I mentioned (distant, adjacent and contiguous, in supposedly increasing performance order) on small and big data sets. Maybe people will run it and report anomalies.
#include <iostream>
#include <chrono>
#include <algorithm>
#include <numeric> // for std::accumulate, missing in the original

const int b = 196000;
const int s = 64 / sizeof(float);
const int extra_it = 16;

float sbuf1[s];
float bbuf1[b];

int main()
{
    float sbuf2[s];
    float bbuf2[b];
    float *sbuf3 = new float[s];
    float *bbuf3 = new float[b];
    float *sbuf4 = new float[s * 3];
    float *bbuf4 = new float[b * 3];
    float use = 0;
    while (1)
    {
        using namespace std;
        int c;
        bool sorb;
        cout << "small or big test (0/1)? ";
        if (!(cin >> sorb))
            return -1;
        cout << endl << "test distant buffers (0), contiguous access (1) or adjacent access (2)? ";
        if (!(cin >> c))
            return -1;
        auto t = std::chrono::high_resolution_clock::now();
        if (c == 0)
        {
            // "worst case scenario", 3 distant buffers constantly touched
            if (sorb)
            {
                for (int k = 0; k < b * extra_it; ++k)
                    for (int i = 0; i < s; ++i)
                    {
                        sbuf1[i] = k; // static memory
                        sbuf2[i] = k; // stack memory
                        sbuf3[i] = k; // heap memory
                    }
            }
            else
            {
                for (int k = 0; k < s * extra_it; ++k)
                    for (int i = 0; i < b; ++i)
                    {
                        bbuf1[i] = k; // static memory
                        bbuf2[i] = k; // stack memory
                        bbuf3[i] = k; // heap memory
                    }
            }
        }
        else if (c == 1)
        {
            // "best case scenario", only contiguous memory touched, interleaved
            if (sorb)
            {
                for (int k = 0; k < b * extra_it; ++k)
                    for (int i = 0; i < s * 3; i += 3)
                    {
                        sbuf4[i] = k;
                        sbuf4[i + 1] = k;
                        sbuf4[i + 2] = k;
                    }
            }
            else
            {
                for (int k = 0; k < s * extra_it; ++k)
                    for (int i = 0; i < b * 3; i += 3)
                    {
                        bbuf4[i] = k;
                        bbuf4[i + 1] = k;
                        bbuf4[i + 2] = k;
                    }
            }
        }
        else if (c == 2)
        {
            // "compromise", adjacent memory buffers touched
            if (sorb)
            {
                auto b1 = sbuf4;
                auto b2 = sbuf4 + s;
                auto b3 = sbuf4 + s * 2;
                for (int k = 0; k < b * extra_it; ++k)
                    for (int i = 0; i < s; ++i)
                    {
                        b1[i] = k;
                        b2[i] = k;
                        b3[i] = k;
                    }
            }
            else
            {
                auto b1 = bbuf4;
                auto b2 = bbuf4 + b;
                auto b3 = bbuf4 + b * 2;
                for (int k = 0; k < s * extra_it; ++k)
                    for (int i = 0; i < b; ++i)
                    {
                        b1[i] = k;
                        b2[i] = k;
                        b3[i] = k;
                    }
            }
        }
        else
            break;
        cout << chrono::duration_cast<chrono::milliseconds>(chrono::high_resolution_clock::now() - t).count() << " ms" << endl;
        // basically just touching the buffers, avoiding clever optimizations
        use += std::accumulate(sbuf1, sbuf1 + s, 0);
        use += std::accumulate(sbuf2, sbuf2 + s, 0);
        use += std::accumulate(sbuf3, sbuf3 + s, 0);
        use += std::accumulate(sbuf4, sbuf4 + s * 3, 0);
        use -= std::accumulate(bbuf1, bbuf1 + b, 0);
        use -= std::accumulate(bbuf2, bbuf2 + b, 0);
        use -= std::accumulate(bbuf3, bbuf3 + b, 0);
        use -= std::accumulate(bbuf4, bbuf4 + b * 3, 0);
    }
    std::cout << use;
    std::cin.get();
}
On my Intel i7-3740QM, surprisingly, the distant-buffers case consistently outperforms the more locality-friendly tests. It is close, however.

Compile time error: lvalue required [closed] - c++

This code produces a compile-time error:
#include <stdio.h>
#include <conio.h>

#define SIZE 7

int a[SIZE][SIZE], q[SIZE * SIZE], visited[SIZE][SIZE], n, i, j, f = 0, r = -1;
int parent[SIZE * SIZE], x_dest, y_dest, x_temp, y_temp;
int flag = 0;

void find_neighbours(int x, int y)
{
    if (( ((y+1)=0) && (a[x][y-1])) && (visited[x][y-1]))
    {
        q[++r] = x * n + (y - 1);
        parent[x * n + (y - 1)] = x * n + y;
        visited[x][y - 1] = 1;
    }
    if ((x+1)=0 && (a[x-1][y]) && !visited[x-1][y])
    {
        q[++r] = (x - 1) * n + (y);
        parent[(x - 1) * n + (y)] = x * n + y;
        visited[x - 1][y] = 1;
    }
}

void bfs(int x, int y)
{
    find_neighbours(x, y);
    if (f <= r)
    {
        x_temp = q[f] / n;
        y_temp = q[f] - n * x_temp;
        if (x_temp == x_dest && y_temp == y_dest)
        {
            flag = 1;
            return;
        }
        //visited[x_temp][y_temp] = 1;
        f++;
        bfs(x_temp, y_temp);
    }
}

int main()
{
    int x, y, val;
    int source_x, source_y;
    n = SIZE;
    for (i = 0; i < n; i++)
    {
        q[i] = 0;
    }
    for (i = 0; i < n; i++)
    {
        for (j = 0; j < n; j++)
        {
            visited[i][j] = 0;
        }
    }
    for (i = 0; i < n * n; i++)
        parent[i] = 0;
    printf("\n Enter graph data in matrix form:\n");
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++)
            scanf("%d", &a[i][j]);
    source_x = 0;
    source_y = 0;
    visited[source_x][source_y] = 1;
    x_dest = 6;
    y_dest = 6;
    bfs(0, 0);
    if (!flag)
    {
        printf("not reachable \n");
        return 0;
    }
    x = x_temp;
    y = y_temp;
    while (x != 0 || y != 0)
    {
        val = parent[x * n + y];
        x = val / n;
        y = val - x * n;
        printf("%d %d\n ", x, y);
    }
    return 0;
}
You have code like this all over the place:
if ((x+1)=0)
x+1 is not an lvalue yet you're trying to assign to it.
Presumably you meant ==.
As an aside, your code is generally very difficult to read. What prevented you from writing documenting comments and employing meaningful variable names?
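For anyone landing here via the error text, a two-line reproduction (the quoted message is GCC's wording; other compilers phrase it differently):

int x = 0;
// if ((x + 1) = 0) {} // error: lvalue required as left operand of assignment
if ((x + 1) == 0) {}    // comparison, which is presumably what was meant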