SSE addition and conversion - c++

Here's the thing, how can I add two unsigned char arrays and store the result in an unsigned short array by using SSE. Can anyone give me some help or hint. This is what I have done so far. I just don't know where the error is..need some help
using namespace std;
void sse_add(unsigned char * input1, unsigned char *input2, unsigned short *output, const int N)
unsigned char *op3 = new unsigned char[N];
unsigned char *op4 = new unsigned char[N];
__m128i *sse_op3 = (__m128i*)op3;
__m128i *sse_op4 = (__m128i*)op4;
__m128i *sse_result = (__m128i*)output;
for (int i = 0; i < N; i = i + 16)
__m128i src = _mm_loadu_si128((__m128i*)input1);
__m128i zero = _mm_setzero_si128();
__m128i higher = _mm_unpackhi_epi8(src, zero);
__m128i lower = _mm_unpacklo_epi8(src, zero);
_mm_storeu_si128(sse_op3, lower);
sse_op3 = sse_op3 + 1;
_mm_storeu_si128(sse_op3, higher);
sse_op3 = sse_op3 + 1;
input1 = input1 + 16;
for (int j = 0; j < N; j = j + 16)
__m128i src1 = _mm_loadu_si128((__m128i*)input2);
__m128i zero1 = _mm_setzero_si128();
__m128i higher1 = _mm_unpackhi_epi8(src1, zero1);
__m128i lower1 = _mm_unpacklo_epi8(src1, zero1);
_mm_storeu_si128(sse_op4, lower1);
sse_op4 = sse_op4 + 1;
_mm_storeu_si128(sse_op4, higher1);
sse_op4 = sse_op4 + 1;
input2 = input2 + 16;
__m128i *sse_op3_new = (__m128i*)op3;
__m128i *sse_op4_new = (__m128i*)op4;
for (int y = 0; y < N; y = y + 8)
*sse_result = _mm_adds_epi16(*sse_op3_new, *sse_op4_new);
sse_result = sse_result + 1;
sse_op3_new = sse_op3_new + 1;
sse_op4_new = sse_op4_new + 1;
void C_add(unsigned char * input1, unsigned char *input2, unsigned short *output, int N)
for (int i = 0; i < N; i++)
output[i] = (unsigned short)input1[i] + (unsigned short)input2[i];
int main()
int n = 1023;
unsigned char *p0 = new unsigned char[n];
unsigned char *p1 = new unsigned char[n];
unsigned short *p21 = new unsigned short[n];
unsigned short *p22 = new unsigned short[n];
for (int j = 0; j < n; j++)
p21[j] = rand() % 256;
p22[j] = rand() % 256;
C_add(p0, p1, p22, n);
cout << "C_add finished!" << endl;
sse_add(p0, p1, p21, n);
cout << "sse_add finished!" << endl;
for (int j = 0; j < n; j++)
if (p21[j] != p22[j])
cout << "diff!!!!!#######" << endl;
delete[] p0;
delete[] p1;
delete[] p21;
delete[] p22;
return 0;

Assuming everything is aligned to _Alignof(__m128i) and the size of the array is a multiple of sizeof(__m128i), something like this should work:
void addw(size_t size, uint16_t res[size], uint8_t a[size], uint8_t b[size]) {
__m128i* r = (__m128i*) res;
__m128i* ap = (__m128i*) a;
__m128i* bp = (__m128i*) b;
for (size_t i = 0 ; i < (size / sizeof(__m128i)) ; i++) {
r[(i * 2)] = _mm_add_epi16(_mm_cvtepu8_epi16(ap[i]), _mm_cvtepu8_epi16(bp[i]));
r[(i * 2) + 1] = _mm_add_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(ap[i], 8)), _mm_cvtepu8_epi16(_mm_srli_si128(bp[i], 8)));
FWIW, NEON would be a bit simpler (using vaddl_u8 and vaddl_high_u8).
If you're dealing with unaligned data you can use _mm_loadu_si128/_mm_storeu_si128. If size isn't a multiple of 16 you'll just have to do the remainder without SSE.
Note that this may be something your compiler can do automatically (I haven't checked). You may want to try something like this:
#pragma omp simd
for (size_t i = 0 ; i < size ; i++) {
res[i] = ((uint16_t) a[i]) + ((uint16_t) b[i]);
That uses OpenMP 4, but there is also Cilk++ (#pragma simd), clang (#pragma clang loop vectorize(enable)), gcc (#pragma GCC ivdep), or you could just hope the compiler is smart enough without the pragma hint.


Copy 80 bit hex number from char array to uint16_t vector or array

Say I have a text file containing the 80bit hex number
My C++ program reads that using fstream into a char array called buffer.
But then I want to store it in a uint16_t array such that:
uint16_t * key = {0xabcd, 0xef01, 0x2345, 0x6789, 0xabcd}
I have tried several approaches, but I continue to get decimal integers, for instance:
const std::size_t strLength = strlen(buffer);
std::vector<uint16_t> arr16bit((strLength / 2) + 1);
for (std::size_t i = 0; i < strLength; ++i)
arr16bit[i / 2] <<= 8;
arr16bit[i / 2] |= buffer[i];
arr16bit = {24930, 25444, 25958, 12337, 12851}
There must be an easy way to do this that I'm just not seeing.
Here is the full solution I came up with based on the comments:
int hex_char_to_int(char c) {
if (int(c) < 58) //numbers
return c - 48;
else if (int(c) < 91) //capital letters
return c - 65 + 10;
else if (int(c) < 123) //lower case letters
return c - 97 + 10;
uint16_t ints_to_int16(int i0, int i1, int i2, int i3) {
return (i3 * 16 * 16 * 16) + (i2 * 16 * 16) + (i1 * 16) + i0;
void readKey() {
const int bufferSize = 25;
char buffer[bufferSize] = { NULL };
ifstream* pStream = new ifstream("key.txt");
if (pStream->is_open() == true)
pStream->read(buffer, bufferSize);
cout << buffer << endl;
const size_t strLength = strlen(buffer);
int* hex_to_int = new int[strLength - 2];
for (int i = 2; i < strLength; i++) {
hex_to_int[i - 2] = hex_char_to_int(buffer[i]);
cout << endl;
uint16_t* key16 = new uint16_t[5];
int j = 0;
for (int i = 0; i < 5; i++) {
key16[i] = ints_to_int16(hex_to_int[j++], hex_to_int[j++], hex_to_int[j++], hex_to_int[j++]);
cout << "0x" << hex << key16[i] << " ";
cout << endl;
This outputs:
0xabcd 0xef01 0x2345 0x6789 0xabcd

Implementation of the SHA256 algorithm do not return the expected result

With the implementation below, based on the pseudo-code available here, I am trying to convert a string generated with the concatenation of the members from this class:
class BlockHeader
int version;
string hashPrevBlock;
string hashMerkleRoot;
int time;
int bits;
int nonce;
into a SHA256 hash, like what was done with the python code below, available here:
>>> import hashlib
>>> header_hex = ("01000000" +
"81cd02ab7e569e8bcd9317e2fe99f2de44d49ab2b8851ba4a308000000000000" +
"e320b6c2fffc8d750423db8b1eb942ae710e951ed797f7affc8892b0f1fc122b" +
"c7f5d74d" +
"f2b9441a" +
>>> header_bin = header_hex.decode('hex')
>>> hash = hashlib.sha256(hashlib.sha256(header_bin).digest()).digest()
>>> hash.encode('hex_codec')
>>> hash[::-1].encode('hex_codec')
I expect that my program would return the same result the program above returned, but instead, when I compile and run this:
int main() {
BlockHeader header;
Sha256 hash1(header.bytes());
array<BYTE, SHA256_BLOCK_SIZE> h1 = hash1.hash();
cout << "hash1: ";
for(int i=0; i<h1.size(); i++)
printf("%.2x", h1[i]);
Sha256 hash2(h1);
array<BYTE, SHA256_BLOCK_SIZE> h2 = hash2.hash();
cout << "hash2: ";
for(int i=0; i<h2.size(); i++)
printf("%.2x", h2[i]);
the result is that:
hash1: e2245204380a75c6bc6ac56f0000000040030901000000001100011000000000
hash2: 68a74f2a36c8906068c6cd6f00000000020000000000000080a7d06f00000000
I am aware the endianess in my program are not the same of the python result, but this I can fix later, when I get the correct result. Looking in the code below, can anyone give a hint of what I am missing here?
#define ROTLEFT(a,b) (((a) << (b)) | ((a) >> (32-(b))))
#define ROTRIGHT(a,b) (((a) >> (b)) | ((a) << (32-(b))))
#define CH(x,y,z) (((x) & (y)) ^ (~(x) & (z)))
#define MAJ(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define EP0(x) (ROTRIGHT(x,2) ^ ROTRIGHT(x,13) ^ ROTRIGHT(x,22))
#define EP1(x) (ROTRIGHT(x,6) ^ ROTRIGHT(x,11) ^ ROTRIGHT(x,25))
#define SIG0(x) (ROTRIGHT(x,7) ^ ROTRIGHT(x,18) ^ ((x) >> 3))
#define SIG1(x) (ROTRIGHT(x,17) ^ ROTRIGHT(x,19) ^ ((x) >> 10))
Sha256::Sha256(vector<BYTE> data) {
SIZE64 L = data.size() / 2;
SIZE64 K = 0;
while( (L + 1 + K + 8) % 64 != 0)
K = K + 1;
for(int i=0; i<L; i++) {
BYTE c = (data[i] % 32 + 9) % 25 * 16 + (data[i+1] % 32 + 9) % 25;
for(int i=0; i<K; i++)
SIZE64 x = L + 1 + K + 8;
for(int i=0; i<sizeof(x); i++)
source.push_back( x >> i*8 );
Sha256::Sha256(array<BYTE, SHA256_BLOCK_SIZE> data) {
SIZE64 L = data.size() / 2;
SIZE64 K = 0;
while( (L + 1 + K + 8) % 64 != 0)
K = K + 1;
for(int i=0; i<L; i++) {
BYTE c = (data[i] % 32 + 9) % 25 * 16 + (data[i+1] % 32 + 9) % 25;
for(int i=0; i<K; i++)
SIZE64 x = L + 1 + K + 8;
for(int i=0; i<sizeof(x); i++)
source.push_back( x >> i*8 );
array<BYTE, SHA256_BLOCK_SIZE> Sha256::hash() {
array<BYTE, SHA256_BLOCK_SIZE> result;
WORD32 h0 = 0x6a09e667, h1 = 0xbb67ae85, h2 = 0x3c6ef372, h3 = 0xa54ff53a, h4 = 0x510e527f, h5 = 0x9b05688c, h6 = 0x1f83d9ab, h7 = 0x5be0cd19;
WORD32 k[64] = {0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2};
WORD32 a, b, c, d, e, f, g, h, i, j, t1, t2, m[64];
for(int chunk=0; chunk<=source.size()/64; chunk++) {
for (i = 0, j = chunk*64; i < 16; ++i, j += 4)
m[i] = (source[j] << 24) | (source[j + 1] << 16) | (source[j + 2] << 8) | (source[j + 3]);
for ( ; i < 64; ++i)
m[i] = SIG1(m[i - 2]) + m[i - 7] + SIG0(m[i - 15]) + m[i - 16];
a = h0;
b = h1;
c = h2;
d = h3;
e = h4;
f = h5;
g = h6;
h = h7;
for (i = 0; i < 64; ++i) {
t1 = h + EP1(e) + CH(e,f,g) + k[i] + m[i];
t2 = EP0(a) + MAJ(a,b,c);
h = g;
g = f;
f = e;
e = d + t1;
d = c;
c = b;
b = a;
a = t1 + t2;
h0 += a;
h1 += b;
h2 += c;
h3 += d;
h4 += e;
h5 += f;
h6 += g;
h7 += h;
for(int i=0; i<4; i++) result[0] = h0 >> i;
for(int i=0; i<4; i++) result[1] = h1 >> i;
for(int i=0; i<4; i++) result[2] = h2 >> i;
for(int i=0; i<4; i++) result[3] = h3 >> i;
for(int i=0; i<4; i++) result[4] = h4 >> i;
for(int i=0; i<4; i++) result[5] = h5 >> i;
for(int i=0; i<4; i++) result[6] = h6 >> i;
for(int i=0; i<4; i++) result[7] = h7 >> i;
return result;
In the Sha256::hash function, result is a BYTE array, whereas h0 is a WORD32. You might want to split h0 into 4 BYTEs and store into the result array, but the for loop at the end of the function won't achieve your goal.
What you want to do is to concatenate h0 to h7, and then extract the bytes from h0 to h7 by shifting 24, 16, 8, 0 bits:
// concatenate h0 to h7
WORD32 hs[8] = {h0, h1, h2, h3, h4, h5, h6, h7};
// extract bytes from hs to result
for(int i=0; i<8; i++) { // loop from h0 to h7
result[i*4 ] = hs[i] >> 24; // the most significant byte of h_i
result[i*4+1] = hs[i] >> 16;
result[i*4+2] = hs[i] >> 8;
result[i*4+3] = hs[i]; // the least significant byte of h_i
After some testing, I found another error:
for(int chunk=0; chunk<=source.size()/64; chunk++) {
should be
for(int chunk=0; chunk<source.size()/64; chunk++) {
chuck starts from 0, so you should use < instead of <=.
For example, when source.size() is 64, you only have 1 chunk to process.
I fully tested your code and found two problems in the constructors of the Sha256 class.
Your code implies that you assume the vector<BYTE> passed to the constructor is a hex string. That is OK, but you use the same code for the array<BYTE, SHA256_BLOCK_SIZE> version, which is the return type of hash() function, which returns a BYTE array instead of hex string.
For a BYTE array, you can simply push the byte data[i] into the source. Also, L should be data.size() because every element has size 1 in a byte array.
Besides, you try to append the size of the input(x) to source, but x should not include the appended one and zeros, and it is the bit count of the input, so x should simply be L*8. Also, the size should be a big-endian integer, so you have to push the bigger byte first:
for(int i=0; i<sizeof(x); i++) // WRONG: little endian
for(int i=sizeof(SIZE64)-1; i>=0; i--) // Correct: big endian
I have made it execute correctly and output:
hash1: b9d751533593ac10cdfb7b8e03cad8babc67d8eaeac0a3699b82857dacac9390
hash2: 1dbd981fe6985776b644b173a4d0385ddc1aa2a829688d1e0000000000000000
If you encounter other problems, feel free to ask. You are very close to the correct answer. Hope you can fix all the bugs successfully :)
EDIT3: implementation of other function
struct BlockHeader {
int version;
string hashPrevBlock;
string hashMerkleRoot;
int time;
int bits;
int nonce;
vector<BYTE> bytes();
#define c2x(x) (x>='A' && x<='F' ? (x-'A'+10) : x>='a' && x<='f' ? (x-'a'+10) : x-'0')
vector<BYTE> BlockHeader::bytes() {
vector<BYTE> bytes;
for (int i=24; i>=0; i-=8) bytes.push_back(version>>i);
for (int i=0; i<hashPrevBlock.size(); i+=2)
bytes.push_back(c2x(hashPrevBlock[i])<<4 | c2x(hashPrevBlock[i+1]));
for (int i=0; i<hashMerkleRoot.size(); i+=2)
bytes.push_back(c2x(hashMerkleRoot[i])<<4 | c2x(hashMerkleRoot[i+1]));
for (int i=24; i>=0; i-=8) bytes.push_back(time>>i);
for (int i=24; i>=0; i-=8) bytes.push_back(bits>>i);
for (int i=24; i>=0; i-=8) bytes.push_back(nonce>>i);
return bytes; // return bytes instead of hex string
// exactly the same as the vector<BYTE> version
Sha256::Sha256(array<BYTE, SHA256_BLOCK_SIZE> data) {
SIZE64 L = data.size(); // <<
SIZE64 K = 0;
while( (L + 1 + K + 8) % 64 != 0)
K = K + 1;
// can be simplified to: int K = (128-1-8-L%64)%64;
// ** thanks to "chux - Reinstate Monica" pointing out i should be a SIZE64
for(SIZE64 i=0; i<L; i++) { // **
source.push_back(data[i]); // <<
for(int i=0; i<K; i++)
SIZE64 x = L*8; // <<
for(int i=sizeof(SIZE64)-1; i>=0; i--) { // big-endian
source.push_back(x >> i*8);
EDIT4: variable size in for loop
As "chux - Reinstate Monica" pointed out, it may be a problem if the size of the data is bigger than INT_MAX. All for-loop using a size as the upper limit should use a size_t type counter(instead of int) to prevent this problem.
// in BlockHeader::bytes()
for (size_t i=0; i<hashPrevBlock.size(); i+=2)
// in Sha256::hash()
for (size_t chunk=0; chunk<source.size()/64; chunk++)
// in main()
for (size_t i=0; i<h1.size(); i++)
for (size_t i=0; i<h2.size(); i++)
Notice that size_t is unsigned. The reverse version won't work because i is never less than 0.
for (size_t i=data.size()-1; i>=0; i--) // infinite loop

& operator for bitwise programming returns wrong value (CPP)

I have this piece of code in cpp on Visual Studio
((handrule1 - maskRule1[0]) & test)
All of the variables are unsigned int.
Their values are respectively
I keep getting value zero as the outcome for this line, which should not be possible.
How does this come?
I already tried working with long unsigned variables instead.
I am guessing that maybe I am doing something else wrong when choosing the data types.
Underneath you can find my full code.
Some of the variables are not defined but that's because they are already defined in another cpp-file we are not supposed to use.
void init(void) {
int aa, ab, l, x = 0;
for (int i = 4; i <= 13; i++) {
aa = 13 - i;
for (int j = (aa + 2) / 3; j <= i && j <= aa; j++) {
ab = aa - j;
for (int k = (ab + 1) / 2; k <= j && k <= ab; k++) {
adj[x] = i + j - 8;
l = ab - k; code[x++] = (((i - 4) * 7) + j) * 5 + k;
// printf("%d %d %d %d: %d\n", i, j, k, l, (((i-4)*7)+j)*5+k);
char countSetBits(long long unsigned n)
if (n == 0)
return 0;
return 1 + countSetBits(n & (n - 1));
void init_set(void) {
long long unsigned hand = 0;
char honorPoints = 0;
char nrOfSpades = 0;
char nrOfHearts = 0;
char nrOfDiamonds = 0;
char nrOfClubs = 0;
long long unsigned maskAces = 0x8004002001;
long long unsigned maskKings = 0x10008004002;
long long unsigned maskQueens = 0x20010008004;
long long unsigned maskJacks = 0x40020010008;
long long unsigned maskSpades = 0x1FFF;
long long unsigned maskHearts = 0x3FFE000;
long long unsigned maskDiamonds = 0x7FFC000000;
long long unsigned maskClubs = 0xFFF8000000000;
char upperbound[RU][4];
char lowerbound[RU][4];
unsigned int handrule1 = 0;
unsigned int handrule2 = 0;
unsigned int handrule3 = 0;
unsigned int maskRule1[RU];
unsigned int maskRule2[RU];
unsigned int maskRule3[RU];
unsigned int maskInverse = 0x00FF00FF;
unsigned int test = 0x80808080;
unsigned int result1 = 0;
bool applicableRule = false;
unsigned int fuck = 0xFF936636;
result1 = fuck & test;
for (int r = 0; r < nrr; r++)
for (int i = 0; i < 4; i++)
upperbound[r][i] = 13;
lowerbound[r][i] = 0;
for (int r = 0; r < nrr; r++)
if (res[r] != 0)
for (int i = 0; i < res[r]; i++)
upperbound[r][color[r][i]] = (char) nru[r][i];
lowerbound[r][color[r][i]] = (char) nrl[r][i];
maskRule1[r] = (((char) distl[r] << 24) | ((char) distu[r] << 16) | ((char) ahpl[r] << 8) | (char) ahpu[r]) ^ maskInverse;
maskRule2[r] = ((lowerbound[r][0] << 24) | (upperbound[r][0] << 16) | (lowerbound[r][1] << 8) | upperbound[r][1]) ^ maskInverse;
maskRule3[r] = ((lowerbound[r][2] << 24) | (upperbound[r][2] << 16) | (lowerbound[r][3] << 8) | upperbound[r][3]) ^ maskInverse;
int x[52], y, a;
for (int i = 0; i < 52; i++) x[i] = i;
for (int i = 0; i < CRD; i++) {
for (int j = 52; --j > 1;) {
y = rand() % j;
a = x[y]; x[y] = x[j]; x[j] = a;
for (int j = 0; j < 4; j++)
for (int k = 13 * j; k < 13 * (j + 1); k++)
hand |= 1LLU << x[k];
//Counting honorpoints
honorPoints = (countSetBits(hand & maskAces) * 4) + (countSetBits(hand & maskKings) * 3) + (countSetBits(hand & maskQueens) * 2) + (countSetBits(hand & maskJacks) * 1);
hp[i][j] = (char) honorPoints;
honorPoints = 0;
//Counting distributions
nrOfSpades = countSetBits(hand & maskSpades);
nrOfHearts = countSetBits(hand & maskHearts);
nrOfDiamonds = countSetBits(hand & maskDiamonds);
nrOfClubs = countSetBits(hand & maskClubs);
std::array<char, 4> arrayTest = { nrOfSpades, nrOfHearts, nrOfDiamonds, nrOfClubs };
std::sort(arrayTest.begin(), arrayTest.end());
char p = arrayTest[3];
char o = arrayTest[2];
char m = arrayTest[1];
int test = (((p - 4) * 7) + o) * 5 + m;
for (int x = 0; x < 39; x++)
if (code[x] == test)
dis[i][j] = (char) x;
//Counting opening bids
ahp[i][j] = hp[i][j] + adj[dis[i][j]];
if (ahp[i][j] < 0) ahp[i][j] = 0;
handrule1 = ((dis[i][j] << 24) | (dis[i][j] << 16) | (ahp[i][j] << 8) | ahp[i][j]) ^ maskInverse;
handrule2 = ((nrOfSpades << 24) | (nrOfSpades << 16) | (nrOfHearts << 8) | nrOfHearts) ^ maskInverse;
handrule3 = ((nrOfDiamonds << 24) | (nrOfDiamonds << 16) | (nrOfClubs << 8) | nrOfClubs) ^ maskInverse;
printf("%u \n", handrule1);
printf("%u \n", maskRule1[0]);
for (int r = 0; r < nrr; r++)
if ((((handrule1 - maskRule1[r]) & test) == 0) && (((handrule2 - maskRule2[r]) & test) == 0) && (((handrule3 - maskRule3[r]) & test) == 0))
applicableRule = true;
if (applicableRule == false)
applicableRule = false;
handrule1 = 0;
handrule2 = 0;
handrule3 = 0;
nrOfSpades = 0;
nrOfHearts = 0;
nrOfDiamonds = 0;
nrOfClubs = 0;
hand = 0;

Corrupted memory issue when deleting allocated memory

I am trying to store a sparse vector using a bit mask. I allocate a char* to represent the bit mask. However, when I delete [] the mask, I get a memory corruption error. Upon investigation, I'm seeing that it's because I'm freeing memory that I'm not supposed to. This is confusing, since I don't see how this could be the case.
When I run this on my case, it prints out "ALLOCATED" and "DEALLOCATING" but nothing further.
void set_i_bit(char* mask, int i) {
int field_num = floor(i/8);
int bit_num = i %8;
mask[field_num] = (1 << bit_num) | mask[field_num];
int write_sparse_with_bitmask(vector<float> arr, ofstream* fout) {
int mx_sz = arr.size() - 1;
float tol = 0.5;
char* mask = 0;
for(int i = arr.size() -1; i>=0; i-=1) {
if (fabs(arr[i]) > tol) break;
mx_sz = i;
int sprse_cnt = 0;
for(int i = 0; i<=mx_sz; i+=1) {
if (fabs(arr[i]) < tol) sprse_cnt++;
int bitmask_sz = ceil(mx_sz/8);
if (sprse_cnt*sizeof(int16_t) + sizeof(int16_t) > bitmask_sz) {
mask = new char[bitmask_sz];
for (int i =0; i<bitmask_sz; i++) mask[i] = 0;
for(int i = 0; i<=mx_sz; i+=1) {
if (fabs(arr[i]) > coef_tol) {
set_i_bit(mask, i);
else {
bitmask_sz = 0;
uint16_t sz = mx_sz + 1;
uint16_t bt_msk = bitmask_sz + 1;
char flag = 0;
if (bitmask_sz > 0) {
flag = flag | 1;
fout->write((char*)&sz, sizeof(uint16_t));
fout->write((char*)&flag, sizeof(char));
int w_size = sizeof(uint16_t) + sizeof(char);
if (flag & 1) {
fout->write((char*)&bt_msk, sizeof(uint16_t));
fout->write(mask, sizeof(char)*bt_msk);
delete [] mask;
cout<<"THIS DOESN'T PRINT"<<endl;
w_size += sizeof(uint16_t) + sizeof(char)*bt_msk;
for(int i = 0; i<=mx_sz; i+=1) {
if (fabs(arr[i]) > tol || !(flag & 1)) {
int16_t vl = arr[i];
fout->write((char*) &vl, sizeof(int16_t));
w_size += sizeof(int16_t);
return w_size;

mexcuda having breakpoint at delete[]() in .cu-file

I am having some trouble finding the error I made with my memory allocation. I am currently using Visual Studio 2013, Matlab 2015b and CUDA 7.0 on a GeForce GT 630 and I am quite a newbie to GPU-programming, CUDA and mex.
When I call my code from Matlab with mexcuda it goes fine until I add the small part with colIndexStepSize to the .cu-file. The program runs normally till delete. After informing me about having reached a breakpoint here, Matlab crashes.
When I remove the code lines in question, everything runs smoothly again.
I am quite sure that there is something wrong with my memory handling but I simpy cannot find the bug. Here is the code that is making trouble:
#include <cuda_runtime.h>
#include <cuda.h>
#include <cusparse.h>
#include <device_launch_parameters.h>
#include <curand.h>
#include <vector>
// Test-Makro : (Funktionieren die Zugriffe auf die GPU?)
#define gpuErrchk(ans){gpuAssert((ans), __FILE__, __LINE__);}
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true){
if (code != cudaSuccess){
fprintf(stderr, "GPUassert: %s%s%d\n", cudaGetErrorString(code), file, line);
__global__ void startEndIndex(int *ergArray, int *first, int *last, float *dxmax, unsigned int *length){
unsigned int index = threadIdx.x + blockIdx.x*blockDim.x;
if (index < *length){
first[index] = (*dxmax)*ergArray[index];
last[index] = (*dxmax)*ergArray[index + 1] - 1;
void rotateOSSARTrechnung(std::vector<float> *detektor, SparseMatrix<float, float, float> *systemMatrix_coo, Volumen<float, float, float> *volumen, unsigned int iterationen, std::vector<float> *deltaBIterationN, std::vector<float> *matdVoxelGrid, float projektionen,float dxmax, float detZellen, unsigned int threads_max_n, unsigned int threads_max_m, unsigned int threads_max_nnz){
unsigned int nnz = (unsigned int)systemMatrix_coo->nnz;
unsigned int n = (unsigned int)systemMatrix_coo->columnNumber;
unsigned int mNeu = detZellen;
float *measuredValues = 0; measuredValues = new float[mNeu]();
float *volumeN = 0; volumeN =new float[n]();
float *volumeAlt = 0; volumeAlt = new float[n]();
float *initValuesM = 0; initValuesM = new float[mNeu]();
float *volumeNInitZero = 0; volumeNInitZero = new float[n]();
float *initValuesMInitZero = 0; initValuesMInitZero = new float[mNeu]();
int *cooRowHostPtr=0; cooRowHostPtr = new int[nnz]();
int *cooColHostPtr=0; cooColHostPtr = new int[nnz]();
float *cooValuesHostPtr = 0; cooValuesHostPtr = new float[nnz]();
unsigned int *colIndex = 0; colIndex = new unsigned int[nnz]();
float *valIndex = 0; valIndex = new float[nnz]();
unsigned int *colIndexStepSize = 0; colIndexStepSize = new unsigned int[n]();
for (unsigned int i = 0; i < n; i++){
colIndexStepSize[i] = nnz;
unsigned int length = matdVoxelGrid->size();
int *ergArray = 0; ergArray = new int[length+1]();
int *first = 0; first = new int[length]();
int *last = 0; last = new int[length]();
int *cooHostColRot = 0; cooHostColRot = new int[nnz]();
int *d_cooColPtr;
int *d_cooRowPtr;
unsigned int *d_nnz;
int *d_colIndexPtr;
float *d_valIndexPtr;
unsigned int *d_colIndexStepSizePtr;
float *d_cooValuesPtr;
float *d_measuredValues;
float *d_volume_alt;
float *d_volume_neu;
int *d_ergArray;
float *d_dxmax;
unsigned int *d_length;
unsigned int *d_size;
int *d_first;
int *d_last;
int *d_cooColRotPtr;
unsigned int *d_count;
gpuErrchk(cudaMalloc((void**)&d_cooRowPtr, nnz*sizeof(int)));;
gpuErrchk(cudaMalloc((void**)&d_cooColPtr, nnz*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_cooValuesPtr, nnz*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_measuredValues, mNeu*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_volume_alt, n*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_volume_neu, n*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_nnz, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_colIndexPtr, (nnz)*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_valIndexPtr, (nnz)*sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_colIndexStepSizePtr, n*sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_ergArray, (length+1)*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_dxmax, sizeof(float)));
gpuErrchk(cudaMalloc((void**)&d_length, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_size, sizeof(unsigned int)));
gpuErrchk(cudaMalloc((void**)&d_first, length*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_last, length*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_cooColRotPtr, nnz*sizeof(int)));
gpuErrchk(cudaMalloc((void**)&d_count, sizeof(unsigned int)));
for (unsigned int i = 0; i < nnz; i++){
cooRowHostPtr[i] = systemMatrix_coo->cooRowInd->at(i);
cooColHostPtr[i] = systemMatrix_coo->cooColInd->at(i);
cooValuesHostPtr[i] = systemMatrix_coo->cooValues->at(i);
for (unsigned int j = 0; j < n; j++){
volumen->setValueAtElement(j, (float)cooColHostPtr[j]);
gpuErrchk(cudaMemcpy(d_nnz, &nnz, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_dxmax, &dxmax, sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_length, &length, sizeof(unsigned int), cudaMemcpyHostToDevice));
// (Initialwerte sind immer gleich)
gpuErrchk(cudaMemcpy(d_cooRowPtr, cooRowHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_cooValuesPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_cooColPtr, cooColHostPtr, nnz*sizeof(int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_valIndexPtr, cooValuesHostPtr, nnz*sizeof(float), cudaMemcpyHostToDevice));
unsigned int threads_nnz = threads_max_nnz;
unsigned int thread_length = length;
unsigned int block_length = 1;
unsigned int index = 0;
for (unsigned int s = 0; s < length; s++){
for (unsigned int t = 0; t <= s; t++){
index = s + 1;
ergArray[index] += (int)matdVoxelGrid->at(t);
gpuErrchk(cudaMemcpy(d_ergArray, ergArray, (length+1)*sizeof(int), cudaMemcpyHostToDevice));
startEndIndex <<< block_length, thread_length >>>(d_ergArray, d_first, d_last, d_dxmax, d_length);
gpuErrchk(cudaMemcpy(first, d_first, length*sizeof(int), cudaMemcpyDeviceToHost));
gpuErrchk(cudaMemcpy(last, d_last, length*sizeof(int), cudaMemcpyDeviceToHost));
for (unsigned int j = 0; j < length; j++){
volumen->setValueAtElement(j, (float)first[j]);
for (unsigned int j = 0; j < length; j++){
volumen->setValueAtElement(j, (float)last[j]);
unsigned int size = 0;
for (unsigned int iter = 0; iter < iterationen; iter++){
for (unsigned int proj = 1; proj <= projektionen; proj++){
unsigned int begin1 = (proj - 1)*mNeu;
unsigned int end1 = proj*mNeu;
for (unsigned int j = begin1; j < end1; j++){
measuredValues[j] = detektor->at(j);
gpuErrchk(cudaMemcpy(d_measuredValues, measuredValues, mNeu*sizeof(float), cudaMemcpyHostToDevice));
for (unsigned int u = 0; u < length; u++){
size = ceil(matdVoxelGrid->at(u)* (proj - 1) * dxmax / projektionen);
gpuErrchk(cudaMemcpy(d_size, &size, sizeof(unsigned int), cudaMemcpyHostToDevice));
gpuErrchk(cudaMemcpy(d_count, &u, sizeof(unsigned int), cudaMemcpyHostToDevice));
if (proj > 1){
for (unsigned int i = 0; i < nnz; i++) {//(first[u] <= cooCols[index] <= last[u]){
if (first[u] <= cooColHostPtr[i] && cooColHostPtr[i] <= last[u]){
cooHostColRot[i] = first[u] + (int)(cooColHostPtr[i] + size) % (last[u] - first[u] + 1);// (int)(cooColHostPtr[i] + size) % (last[u]); // (int)(first[u] + ((int)(cooColHostPtr[i] + dxmax) % (last[u] - first[u] + 1)));
for (unsigned int i = 0; i < nnz; i++) {
cooHostColRot[i] = cooColHostPtr[i];
// --------- troubling code starts HERE ----------------
unsigned int wert = 0, index = 0;
for (unsigned int i = 0; i < nnz; i++){
index = cooHostColRot[i];
wert = colIndexStepSize[index];
if (wert >= i){
colIndexStepSize[index] = i;
for (unsigned int j = 0; j < n; j++){
volumen->setValueAtElement(j, colIndexStepSize[j]);
gpuErrchk(cudaMemcpy(d_colIndexStepSizePtr, colIndexStepSize, n*sizeof(unsigned int), cudaMemcpyHostToDevice));
// --------- troubling code ends HERE ----------------
gpuErrchk(cudaMemcpy(d_colIndexPtr, cooHostColRot, nnz*sizeof(int), cudaMemcpyHostToDevice));
delete[](ergArray); ergArray = NULL;
delete[](measuredValues); measuredValues = NULL;
delete[](cooColHostPtr); cooColHostPtr = NULL;
delete[](cooRowHostPtr); cooRowHostPtr = NULL;
delete[](cooValuesHostPtr); cooValuesHostPtr = NULL;
delete[](volumeN); volumeN = NULL;
delete[](ergArray); ergArray = NULL;
delete[](initValuesM); initValuesM = NULL;
delete[](colIndex); colIndex = NULL;
delete[](valIndex); valIndex = NULL;
delete[](volumeAlt); volumeAlt = NULL;
delete[](volumeNInitZero); volumeNInitZero = NULL;
delete[](initValuesMInitZero); initValuesMInitZero = NULL;
delete[](colIndexStepSize); colIndexStepSize = NULL;
delete[](deltaBArray); deltaBArray = NULL;
delete[](first); first = NULL;
delete[](last); last = NULL;
delete[](cooHostColRot); cooHostColRot = NULL;
deltaB = NULL;
If somebody sees any mistake I made, please tell me, I am open to any advice.
Thanks in advance!
Best regards
#AnderBiguri was right, I made an out of bounds access to the array measuredValues. Here is the corrected part of the code in question:
for (unsigned int j = 0; j < mNeu; j++){
measuredValues[j] = detektor->at((proj-1)*mNeu+j);
measuredValues is only mNeu elements long but I did access some elements way behind this point.
So, thanks a lot for the help !
#AnderBiguri was right, I made an out of bounds access to the array measuredValues. Here is the corrected part of the code in question:
for (unsigned int j = 0; j < mNeu; j++){
measuredValues[j] = detektor->at((proj-1)*mNeu+j);
I just had to adjust the boundaries of the for loop and vector accessing to fit the bounds of the array.
Thanks a lot once again!