Why are there two variants of the implementation of CRCs in software?

I'm digging into the subtleties of CRCs. If I understand correctly, every CRC polynomial is provided in at least two representations, the normal one and the reversed one.
The normal one targets implementations where the content is processed from most significant bit to least significant bit and shifted to the left (like, for example, in this Wikipedia page).
The reversed one aims to handle LSb-to-MSb interfaces. If you process LSb to MSb with the reversed polynomial and shift to the right, you get the same CRC value (also encoded LSb to MSb). This is described, for example, here. This is convenient for LSb-to-MSb communication interfaces.
What I don't understand is what happens when you switch to software implementations. Why are there two variants of a software, i.e. byte-wise, implementation? (One for MSb-to-LSb bit order, and one for the opposite.)

You do not get the "same CRC value" (reflected or not) with the reflected calculation. It is an entirely different value, because the bits of the message are processed in the opposite order.
"when you switch": You simply use the CRC definition, reflected or not, that matches what the application is expecting. Whether the CRC is reflected is one of several parameters that define the CRC, along with the number of the bits in the CRC, the polynomial, the initial value, and the final exclusive or value. You can find the definition of over a hundred different CRCs here.
"why are there two": The forward implementation exists because that corresponds most closely to the mathematics, with the least significant term of the polynomial in the least significant bit of the binary representation of the polynomial. The reflected implementation exists because it was realized that it could be implemented in software a little more simply, with fewer instructions, but still have the same error-detection performance.
Here is an example for two common 32-bit CRCs that share the same polynomial. Forward CRC-32/BZIP2, bit-wise implementation:
uint32_t crc32bzip2_bit(uint32_t crc, void const *mem, size_t len) {
    unsigned char const *data = mem;
    if (data == NULL)
        return 0;
    crc = ~crc;
    for (size_t i = 0; i < len; i++) {
        crc ^= (uint32_t)data[i] << 24;
        for (unsigned k = 0; k < 8; k++) {
            crc = crc & 0x80000000 ? (crc << 1) ^ 0x4c11db7 : crc << 1;
        }
    }
    crc = ~crc;
    return crc;
}
Reflected CRC-32/ISO-HDLC (the zip CRC), bit-wise implementation:
uint32_t crc32iso_hdlc_bit(uint32_t crc, void const *mem, size_t len) {
    unsigned char const *data = mem;
    if (data == NULL)
        return 0;
    crc = ~crc;
    for (size_t i = 0; i < len; i++) {
        crc ^= data[i];
        for (unsigned k = 0; k < 8; k++) {
            crc = crc & 1 ? (crc >> 1) ^ 0xedb88320 : crc >> 1;
        }
    }
    crc = ~crc;
    return crc;
}
The main savings is one instruction: the shift up of the data byte, which you can get rid of with the reflected implementation. Also, the constant that you & with (1 vs. 0x80000000) is smaller, which may save an instruction or a register, or perhaps just result in a shorter instruction, depending on the size of immediate values supported by the instruction set.
The shift is avoided for byte-wise calculations as well:
uint32_t crc32bzip2_byte(uint32_t crc, void const *mem, size_t len) {
    unsigned char const *data = mem;
    if (data == NULL)
        return 0;
    for (size_t i = 0; i < len; i++) {
        crc = (crc << 8) ^
              table_byte[((crc >> 24) ^ data[i]) & 0xff];
    }
    return crc;
}
vs.
uint32_t crc32iso_hdlc_byte(uint32_t crc, void const *mem, size_t len) {
    unsigned char const *data = mem;
    if (data == NULL)
        return 0;
    for (size_t i = 0; i < len; i++) {
        crc = (crc >> 8) ^
              table_byte[(crc ^ data[i]) & 0xff];
    }
    return crc;
}
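The byte-wise routines above use a precomputed table_byte[] that is not shown. Here is a minimal sketch of how such tables could be generated, assuming each routine gets its own 256-entry table built with the matching polynomial and shift direction (the array names here are mine, for illustration only):
#include <stdint.h>

static uint32_t table_forward[256];    // would serve as table_byte for crc32bzip2_byte()
static uint32_t table_reflected[256];  // would serve as table_byte for crc32iso_hdlc_byte()

static void make_tables(void) {
    for (uint32_t n = 0; n < 256; n++) {
        uint32_t fwd = n << 24;   // forward: the index byte enters at the top of the register
        uint32_t ref = n;         // reflected: the index byte enters at the bottom
        for (int k = 0; k < 8; k++) {
            fwd = fwd & 0x80000000 ? (fwd << 1) ^ 0x4c11db7 : fwd << 1;
            ref = ref & 1 ? (ref >> 1) ^ 0xedb88320 : ref >> 1;
        }
        table_forward[n] = fwd;
        table_reflected[n] = ref;
    }
}
Each table entry is just the bit-wise CRC of its index byte, computed with the same inner loop as the corresponding bit-wise routine above.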

Related

Parallel CRC32 in C++

I am trying to implement the CRC32 calculation of a file by splitting it into parts. I used algorithms and ideas from CRC Calculation Of A Mostly Static Data Stream.
Unfortunately, my program gives an incorrect answer, although it returns the same CRC value regardless of the number of parts.
Please tell me where the mistake is and what I am doing wrong.
Here is the code of the program:
#include <iostream>
#include <fstream>
#include <stdint.h>
#include <string>
#include <sstream>
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>

using namespace std;

struct data {
    pthread_t id;
    uint8_t *buf;
    long int start, end;
    long int num_zeros;
    uint32_t crc;
};

//Straight function
uint32_t crc_32(ifstream& input) {
    input.seekg(0, input.end);
    size_t size = input.tellg();
    input.seekg(0, input.beg);
    uint32_t polynomial = 0xEDB88320;
    uint32_t table[256];
    for(uint32_t i=0; i<=0xff; i++) {
        uint32_t c = i;
        for (size_t j = 0; j < 8; j++)
        {
            if (c & 1) {
                c = polynomial ^ (c >> 1);
            }
            else {
                c >>= 1;
            }
        }
        table[i] = c;
    }
    uint32_t CRC = 0xffffffff;
    uint8_t buf;
    for(size_t i=0; i<size; i++) {
        input.read( (char *) &buf, sizeof(buf));
        CRC = (CRC>>8) ^ table[(CRC ^ buf) & 0xff ];
    }
    CRC ^= 0xffffffff;
    return CRC;
}

// generate crc
uint32_t GenCrc(data *work, long int beg, long int end, uint32_t *crctbl) {
    uint32_t init = 0x00000000;
    for(long int i = beg; i<end; i++) {
        init = (init<<8)^crctbl[ (init>>24) ^ work->buf[i] ];
    }
    return init;
}

// (a*b)%crc
uint32_t MpyModCrc(uint32_t a, uint32_t b) {
    uint32_t pd = 0;
    uint32_t i;
    for(i = 0; i < 32; i++){
        pd = (pd<<1)^((0-(pd>>31))&0x04c11db7);
        pd ^= (0-(b>>31))&a;
        b <<= 1;
    }
    return pd;
}

// pow(2,p)%crc
uint32_t PowModCrc(uint32_t p) {
    uint32_t prd = 0x1u;
    uint32_t sqr = 0x2u;
    while(p) {
        if(p&1)
            prd = MpyModCrc(prd, sqr);
        sqr = MpyModCrc(sqr, sqr);
        p >>= 1;
    }
    return prd;
}

void do_work(data *work) {
    //Generate lookup table:
    uint32_t polynomial = 0x04c11db7;
    uint32_t crctbl[256];
    uint32_t crc;
    uint32_t c;
    uint32_t i;
    for(c=0; c <256; c++) {
        crc = c<<24;
        /*
        for(i=0; i <8; i++) {
            if( (crc & 0x80000000) !=0) {
                crc <<= 1;
                crc ^= polynomial;
            }
            else {
                crc <<=1;
            }
        }
        */
        for(i=0; i<8; i++) {
            crc = (crc<<1)^(0-(crc>>31))&polynomial;
        }
        crctbl[c] = crc;
    }
    uint32_t pmc;
    uint32_t crf;
    crf = GenCrc(work, work->start, work->end, crctbl);
    if(work->num_zeros > 0) {
        pmc = PowModCrc((work->num_zeros)*8);
        crf = MpyModCrc(crf, pmc);
    }
    work->crc = crf;
}

void *do_stuff(void *d) {
    data *mydata = (data*)d;
    do_work(mydata);
    return 0;
}

int main(int argc, char** argv) {
    ifstream input("8733718.zip", ios::binary);
    if(!input) {
        cerr << "Can't open file." <<endl;
    }
    input.seekg(0, input.end);
    long int len = input.tellg();
    input.seekg(0, input.beg);
    uint8_t *buf = new uint8_t[len];
    input.read( (char *) buf, len);
    int k;
    cout << "Enter number of parts: ";
    if(!(cin >> k) || k<1) {
        cout << "Error. We need at least one part!" <<endl;
        return -1;
    }
    data *work = new data[k+1];
    for(int i=0; i < k; i++) {
        work[i].start = len*i;
        work[i].start /=k;
        work[i].end = len * (i+1);
        work[i].end /= k;
        work[i].num_zeros = len - work[i].end;
        work[i].buf = buf;
    }
    for(int i=0; i < k; i++) {
        void *tmp = (void*)&work[i];
        pthread_create(&work[i].id, 0, do_stuff, tmp);
    }
    for(int i=0; i<k; i++) {
        pthread_join(work[i].id, 0);
    }
    uint32_t crc = work[0].crc;
    for(int i=1; i<k; i++) {
        crc ^= work[i].crc;
    }
    delete [] buf;
    delete [] work;
    cout << "Straigth CRC_32 = ";
    uint32_t result;
    result = crc_32(input);
    cout << hex << result;
    cout <<endl <<endl;
    cout << "Parallel CRC_32 = ";
    uint32_t result2;
    result2 = crc;
    cout << hex << crc <<endl <<endl;
    cout <<endl <<endl;
    cout <<"=========================="<<endl<<endl;
    input.close();
    return 0;
}
"Straight" function gives the answer which coincides with the answer of, for example, website https://emn178.github.io/online-tools/crc32_checksum.html.
But "parallel" procedure gives another answer.
As rcgldr noted, you are mixing up reflected and non-reflected CRC calculations. The original CRC is reflected so you need to stick with that. You need to always be shifting right. You always need to use the reflected polynomial, as in the original, 0xedb88320.
So, GenCrc() needs to shift right, and use the low eight bits of the CRC (which you're calling init) instead of the high eight bits to get the table index.
MpyModCrc() needs to shift right, use the low bit instead of the high bit for the decisions, and use the correct polynomial.
PowModCrc() is starting off with the wrong initial polynomials. Since the polynomials are reflected, the initial values need to be as well. So 1 (x0) is 0x80000000, and x (x1) is 0x40000000.
In do_work(), you need to generate the table exactly as you did crc_32(). Of course, why you're generating the exact same table in every thread, I have no idea. Do that once.
Lastly, having done all that, your threads will be computing the CRC with a zero initial value and zero final exclusive or. That's ok, so long as you then exclusive-or with the CRC of len zero bytes with an initial value of 0xffffffff, and then exclusive-or the final result with 0xffffffff to get the same effect. That is:
crc ^= MpyModCrc(0xffffffff, PowModCrc(len * 8)) ^ 0xffffffff;
Alternatively, you could start the first work unit, and only that one, with an initial CRC of 0xffffffff. And then exclusive-or the final result with 0xffffffff.
Another improvement is to not calculate the CRC of so many zeros (even though it is an O(log n) calculation), and to only calculate the power function once. You can combine the CRCs as you join each thread, only needing PowModCrc() of your chunk size, calculating that just once and applying it each time, as sketched below. And once more for the final chunk, which may be smaller.
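A sketch of that join-time combination (my illustration, assuming the corrections above, that each thread now leaves work[i].crc as the plain zero-initial-value CRC of its own chunk with no zero-padding step, and a hypothetical fixed chunk_size in bytes for all chunks but possibly the last):
uint32_t adv = PowModCrc(chunk_size * 8);   // advance-by-one-chunk factor, computed once
uint32_t crc = 0;
for (int i = 0; i < k; i++) {
    pthread_join(work[i].id, 0);
    long int part_len = work[i].end - work[i].start;
    // Advance the combined CRC over this chunk's length, then fold the chunk's CRC in.
    crc = MpyModCrc(crc, part_len == chunk_size ? adv : PowModCrc(part_len * 8)) ^ work[i].crc;
}
crc ^= MpyModCrc(0xffffffff, PowModCrc(len * 8)) ^ 0xffffffff;   // restore the initial value and final xor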
You don't need to read in the entire file and then do the CRC calculation. You should be calculating in parallel with reading. Instead of deciding how many pieces, decide on a fixed chunk size. Then read chunks and fire off threads for CRC calculations as you read them. Combine the CRCs as the threads complete, joining them in order. Limit the number of threads to something like twice the number of cores. You want to keep the cores busy, but you don't want the overhead of too many threads and too much memory in use.
The final alternative would be to not do any of this, and simply use zlib, which provides the CRC combining functions for you.
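For example, a standalone sketch (not tied to the code above) that computes the CRC of each chunk with crc32() and stitches them together with crc32_combine(); link with -lz:
#include <stdio.h>
#include <zlib.h>   // crc32(), crc32_combine()

int main(void) {
    const unsigned char part1[] = "12345";
    const unsigned char part2[] = "6789";
    uLong crc1 = crc32(0L, part1, 5);             // CRC-32 of the first chunk
    uLong crc2 = crc32(0L, part2, 4);             // CRC-32 of the second chunk
    uLong whole = crc32_combine(crc1, crc2, 4);   // CRC-32 of "123456789"
    printf("%08lx\n", whole);                     // expect cbf43926, the standard check value
    return 0;
}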
crc_32() is a right-shifting CRC with initial CRC = 0xffffffff and final CRC ^= 0xffffffff, while do_work(), GenCrc(), ... use a left-shifting CRC.
Change do_work() and GenCrc() to be modified versions of crc_32(), with initial CRC = 0 and the final CRC not changed.
Changes for CRC32 (which is a reflected CRC):
#define CRCPOLY 0xEDB88320u
#define INITXOR 0xFFFFFFFFu
#define FINALXOR 0xFFFFFFFFu

static uint32_t crc_table[256];

void GenCrcTable()
{
    uint32_t crc;
    for (uint32_t byte = 0; byte <= 0xFFu; byte++ )
    {
        crc = byte;
        for (uint8_t bit = 0; bit < 8; bit++ )
            crc = (crc&1) ? (crc>>1)^CRCPOLY : (crc>>1);
        crc_table[byte] = crc;
    }
}

uint32_t Crc32(uint8_t *buffer, size_t length)
{
    uint32_t crc = INITXOR;
    for (size_t i = 0; i < length; ++i)
        crc = crc_table[(crc&0xFFu)^*buffer++]^(crc>>8);
    crc ^= FINALXOR;
    return crc;
}

// use this one for the multi-thread code
uint32_t Crc320(uint8_t *buffer, size_t length)
{
    uint32_t crc = 0;
    for (size_t i = 0; i < length; ++i)
        crc = crc_table[(crc&0xFFu)^*buffer++]^(crc>>8);
    return crc;
}

// carryless multiply modulo crc
uint32_t MpyModCrc(uint32_t a, uint32_t b) // (a*b)%crc
{
    uint32_t prd = 0;
    uint32_t i;
    for(i = 0; i < 32; i++){
        prd = (prd&1u) ? (prd>>1)^CRCPOLY : (prd>>1);
        prd ^= (b&1u) ? a : 0;
        b >>= 1;
    }
    return prd;
}

// exponentiate by repeated squaring modulo crc
uint32_t PowModCrc(uint32_t p) // pow(2,p)%crc
{
    uint32_t prd = 0x80000000u; // current product
    uint32_t sqr = 0x40000000u; // current square
    while(p){
        if(p&1)
            prd = MpyModCrc(prd, sqr);
        sqr = MpyModCrc(sqr, sqr);
        p >>= 1;
    }
    return prd;
}
After these changes try this code at the end (I haven't tested this yet):
uint32_t crc = work[0].crc;
for(int i=1; i<k; i++) {
    crc ^= work[i].crc;
}
uint32_t pmc = PowModCrc(len*8); // add initial CRC of 0xffffffff
crc ^= MpyModCrc(0xffffffff, pmc);
crc ^= 0xffffffff; // final_xor = 0xffffffff
If running on a PC in 64-bit mode, you could use a PCLMULQDQ-based CRC to speed things up (probably to the point that multi-threading won't help much). You can search github for examples of this. The assembly files are a bit over 500 lines. I have 6 sets of these for Visual Studio | MASM (ML64.EXE). Link to the code for a 32-bit reflected CRC below; if using my code, you need to change the #if defines from 0 to 1 to use CRC32 instead of CRC32C.
https://github.com/jeffareid/crc/tree/master/crc32r

How to generate CRC7 based on lookup table?

I am trying to implement the CRC-7/MMC checksum with a pregenerated lookup table. This is the code so far:
#include <iostream>
#include <string>
#include <cstdint>

using namespace std;

/* CRC-7/MMC
   Poly: 0x09
   Xorout: NO
   Init: 0x00
   Check value: 0x75 for "123456789"
*/

uint16_t CRC7_table[256];

void generate_crc7_table() {
    uint16_t byte;
    for (uint16_t i = 0; i < 256; i++) {
        byte = i;
        for (uint16_t bit = 0; bit < 8; bit++) {
            if (byte & 1) { //if lsb is 1
                byte >>= 1;
                byte ^= 0x09;
            }
            else
                byte >>= 1;
        }
        CRC7_table[i] = byte >> 1; //to drop off MSB
    }
}

uint16_t crc7(string input) {
    uint16_t reg = 0;
    uint16_t b;
    for (uint16_t i = 0; i < input.length(); i++) {
        b = (input[i] ^ reg) & 0xFF;
        reg = (reg >> 1) ^ CRC7_table[b];
    }
    return reg;
}

int main()
{
    generate_crc7_table();
    cout << hex << crc7("123456789") << endl;
    return 0;
}
But it gives the wrong output. I should get 0x75 but I get 0x07. I used this website to check the outputs.
Any suggestion or idea is highly appreciated. Thanks.
Note that the CRC definition you pointed to includes refin=false refout=false. That CRC is not reflected, so it is computed with left shifts, not right shifts.
Given that, and the fact that the CRC is less than eight bits in length, you will also want to keep the seven bits at the top of the byte being used for calculation, as opposed to the bottom. I.e. bits 1 to 7, as opposed to bits 0 to 6. The polynomial is then also shifted up by one for the table calculation.
This allows the table-driven, byte-wise calculation to simply exclusive-or each message byte into the byte being used for the calculations. If you want to return the CRC in the low seven bits, you can shift it down one at the end.
Example (0x12 is 0x09 shifted up one):
#include <cstdint>
#include <iostream>
#include <string>

uint8_t table[256];

void make_table() {
    uint8_t octet = 0;
    do {
        uint8_t crc = octet;
        for (int k = 0; k < 8; k++)
            crc = crc & 0x80 ? (crc << 1) ^ 0x12 : crc << 1;
        table[octet++] = crc;
    } while (octet);
}

uint8_t crc7(std::string const& input) {
    uint8_t crc = 0;
    for (auto octet : input)
        crc = table[crc ^ octet];
    return crc >> 1;
}

int main() {
    make_table();
    std::cout << std::hex << (unsigned)crc7("123456789") << '\n';
}
crcany will generate CRC code for you, given the definition. Since your CRC-7/MMC is in Greg's catalogue, crcany will generate that code out of the box, along with the other 100+ CRCs defined there. The generated code includes bit-wise, byte-wise, and word-wise calculations.

How to modify crc-32 to crc-32/mpeg-2

I am trying to code a function to match the CRC-32 output from a device to the actual CRC-32 sum that I calculate. Following is my code:
#include <iostream>
#include <string.h>

#define CRC32_POLYNOMIAL 0xEDB88320

using namespace std;

unsigned int crc32b(unsigned char *message, size_t l)
{
    int i, j;
    unsigned int byte, crc, mask;
    i = 0;
    crc = 0xFFFFFFFF;
    while (i<l) {
        byte = message[i]; // Get next byte.
        crc = crc ^ byte;
        for (j = 7; j >= 0; j--) { // Do eight times.
            mask = -(crc & 1);
            crc = (crc >> 1) ^ (0xEDB88320 & mask);
        }
        i = i + 1;
    }
    return ~crc;
}

int main()
{
    unsigned char Buff[] = {0x91,0xFF,0xFC,0xEA,0xFF,0xFF,0x70,0xFF,0xFD,0x87,0x00,0xFF,0xF9,0x1B,0xFF,0xF3,0x4E,0x00,0xFB,0x00,0x00,0x02,0x01,0xFB};
    unsigned long CRC = crc32b((unsigned char *)Buff,24);
    cout << hex << CRC <<endl;
    getchar();
    return 0;
}
This gives me the 32-bit CRC output of the following payload:
91FFFCEAFFFF70FFFD8700FFF91BFFF34E00FB00000201FB
as 1980AC80. However, the device is giving the checksum as 8059143D.
Upon further inspection using online CRC calculators, I found that the device is sending out the CRC-32/MPEG-2 checksum value. (Can be verified here.) I have browsed multiple sites but did not find any straightforward implementation of CRC-32/MPEG-2 that I can integrate into my code. Can anyone help?
As noted on the crcalc web page, CRC-32/MPEG-2 uses a left-shifting (not reflected) CRC along with the CRC polynomial 0x104C11DB7 and an initial CRC value of 0xFFFFFFFF, and is not post-complemented:
unsigned int crc32b(unsigned char *message, size_t l)
{
    size_t i, j;
    unsigned int crc, msb;
    crc = 0xFFFFFFFF;
    for(i = 0; i < l; i++) {
        // xor next byte to upper bits of crc
        crc ^= (((unsigned int)message[i])<<24);
        for (j = 0; j < 8; j++) { // Do eight times.
            msb = crc>>31;
            crc <<= 1;
            crc ^= (0 - msb) & 0x04C11DB7;
        }
    }
    return crc; // don't complement crc on output
}
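As a quick sanity check (my addition, assuming the crc32b() above is compiled in the same file): the catalogue check value of CRC-32/MPEG-2 for the nine ASCII bytes "123456789" is 0x0376e6e7, so a small driver like this should print 376e6e7:
#include <stdio.h>

int main(void)
{
    unsigned char check[] = "123456789";
    printf("%x\n", crc32b(check, 9));   // expected output: 376e6e7
    return 0;
}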

CRC midstream instead of at the end

Normally one would add a CRC to the end of the data stream. The CRC check would include the CRC itself and return 0 if the CRC is correct.
I need to add a CRC to verify my embedded code. It needs to be checked in place, but the top word in memory space is for an interrupt vector. Is it possible to place a key value midstream such that the CRC check returns 0 for the whole code? (or is this unsolvable?)
It's definitely possible. You can run a CRC backwards, which would be fast and easy. Below is example code.
In fact, you can give me the locations of bits scattered wherever in the stream, and if you give me enough of them, I can tell you what to set them to in order to get a zero CRC at the end, or any other CRC value for that matter. My spoof code solves the linear equations to come up with that answer.
However I would wonder why you'd want to do any of that. Why not just know where the CRC is stored and compute the CRC for everything but that, and then check the result against the stored CRC?
// Example of the generation of a "middle" CRC, which is inserted somewhere in
// the middle of a sequence, where the CRC is generated such that the CRC of
// the complete sequence will be zero. This particular CRC has no pre or post
// processing.
//
// Placed into the public domain by Mark Adler, 11 May 2016.

#include <stddef.h>     // for size_t
#include <stdint.h>     // for uint32_t and uint64_t

#define POLY 0xedb88320 // CRC polynomial

// Byte-wise CRC tables for forward and reverse calculations.
uint32_t crc_forward_table[256];
uint32_t crc_reverse_table[256];

// Fill in CRC tables using bit-wise calculations.
void crc32_make_tables(void) {
    for (uint32_t n = 0; n < 256; n++) {
        uint32_t crc = n;
        for (int k = 0; k < 8; k++)
            crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
        crc_forward_table[n] = crc;
        crc_reverse_table[crc >> 24] = (crc << 8) ^ n;
    }
}

// Return the forward CRC of buf[0..len-1], starting with crc at the front.
uint32_t crc32(uint32_t crc, unsigned char *buf, size_t len) {
    for (size_t n = 0; n < len; n++)
        crc = (crc >> 8) ^ crc_forward_table[(crc ^ buf[n]) & 0xff];
    return crc;
}

// Return the reverse CRC of buf[0..len-1], starting with crc at the end.
uint32_t crc32_reverse(uint32_t crc, unsigned char *buf, size_t len) {
    while (len)
        crc = (crc << 8) ^ crc_reverse_table[crc >> 24] ^ buf[--len];
    return crc;
}

// Put a 32-bit value into a byte buffer in little-endian order.
void put4(uint32_t word, unsigned char *pos) {
    pos[0] = word;
    pos[1] = word >> 8;
    pos[2] = word >> 16;
    pos[3] = word >> 24;
}

#include <stdlib.h>     // for random() and srandomdev()

// Fill dat[0..len-1] with uniformly random byte values. All of the bits from
// each random() call are used, except for possibly a few leftover at the end.
void ranfill(unsigned char *dat, size_t len) {
    uint64_t ran = 1;
    while (len) {
        if (ran < 0x100)
            ran = (ran << 31) + random();
        *dat++ = ran;
        ran >>= 8;
        len--;
    }
}

#include <stdio.h>      // for printf()

#define LEN 1024        // length of the message without the CRC

// Demonstrate the generation of a middle-CRC, using the forward and reverse
// CRC computations. Verify that the CRC of the resulting sequence is zero.
int main(void) {
    crc32_make_tables();
    srandomdev();
    unsigned char dat[LEN+4];
    ranfill(dat, LEN/2);
    put4(0, dat + LEN/2);       // put zeros where the CRC will go
    ranfill(dat + LEN/2 + 4, (LEN+1)/2);
    put4(crc32(0, dat, LEN/2) ^ crc32_reverse(0, dat + LEN/2, (LEN+1)/2 + 4),
         dat + LEN/2);          // replace the zeros with the CRC
    printf("%08x\n", crc32(0, dat, LEN+4));
    return 0;
}

Big Endian and Little Endian for Files in C++

I am trying to write some processor-independent code to write some files in big-endian order. I have a sample of code below and I can't understand why it doesn't work. All it is supposed to do is let byte hold each byte of data, one by one, in big-endian order. In my actual program I would then write each individual byte out to a file, so I get the same byte order in the file regardless of processor architecture.
#include <iostream>

int main (int argc, char * const argv[]) {
    long data = 0x12345678;
    long bitmask = (0xFF << (sizeof(long) - 1) * 8);
    char byte = 0;
    for(long i = 0; i < sizeof(long); i++) {
        byte = data & bitmask;
        data <<= 8;
    }
    return 0;
}
For some reason byte always has the value of 0. This confuses me; I am looking at the debugger and see this:
data = 00010010001101000101011001111000
bitmask = 11111111000000000000000000000000
I would think that data & bitmask would give 00010010, but it just makes byte 00000000 every time! How can this be? I have written some code for the little-endian order and this works great, see below:
#include <iostream>

int main (int argc, char * const argv[]) {
    long data = 0x12345678;
    long bitmask = 0xFF;
    char byte = 0;
    for(long i = 0; i < sizeof(long); i++) {
        byte = data & bitmask;
        data >>= 8;
    }
    return 0;
}
Why does the little endian one work and the big endian not? Thanks for any help :-)
You should use the standard functions ntohl() and kin for this. They operate on explicitly sized types (i.e. uint16_t and uint32_t) rather than the compiler-specific long, which is necessary for portability.
Some platforms also provide 64-bit versions in <endian.h>.
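For example, a minimal sketch of writing a 32-bit value to a file in big-endian order (assuming a POSIX system, where htonl() is declared in <arpa/inet.h>; on Windows it comes from <winsock2.h>):
#include <arpa/inet.h>   // htonl()
#include <cstdint>
#include <fstream>

int main() {
    uint32_t data = 0x12345678;
    uint32_t be = htonl(data);   // network byte order is big-endian on every host
    std::ofstream out("out.bin", std::ios::binary);
    out.write(reinterpret_cast<const char*>(&be), sizeof be);   // file bytes: 12 34 56 78
    return 0;
}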
In your example, data is 0x12345678.
Your first assignment to byte is therefore:
byte = 0x12000000;
which won't fit in a byte, so it gets truncated to zero.
Try:
byte = (data & bitmask) >> ((sizeof(long) - 1) * 8);
You're getting the shifting all wrong.
#include <iostream>

int main (int argc, char * const argv[]) {
    long data = 0x12345678;
    int shift = (sizeof(long) - 1) * 8;
    const unsigned long mask = 0xff;
    char byte = 0;
    for (long i = 0; i < sizeof(long); i++, shift -= 8) {
        byte = (data & (mask << shift)) >> shift;
    }
    return 0;
}
Now, I wouldn't recommend you do things this way. I would recommend instead writing some nice conversion functions. Many compilers have these as builtins. So you can write your functions to do it the hard way, then switch them to just forward to the compiler builtin when you figure out what it is.
#include <cstdint> // To get uint16_t, uint32_t and so on.

inline void to_bigendian(uint16_t val, char bytes[2])
{
    bytes[0] = (val >> 8) & 0xffu;
    bytes[1] = val & 0xffu;
}

inline void to_bigendian(uint32_t val, char bytes[4])
{
    bytes[0] = (val >> 24) & 0xffu;
    bytes[1] = (val >> 16) & 0xffu;
    bytes[2] = (val >> 8) & 0xffu;
    bytes[3] = val & 0xffu;
}
This code is simpler and easier to understand than your loop. It's also faster. And lastly, it is recognized by some compilers and automatically turned into the single byte swap operation that would be required on most CPUs.
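For instance, a sketch of forwarding to such a builtin on GCC or Clang, which provide __builtin_bswap32() and the __BYTE_ORDER__ predefined macros (my illustration, not part of the original answer):
#include <cstdint>
#include <cstring>

inline void to_bigendian_fast(uint32_t val, char bytes[4])
{
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
    uint32_t swapped = __builtin_bswap32(val);     // a single byte-swap instruction
    std::memcpy(bytes, &swapped, sizeof swapped);
#else
    bytes[0] = (val >> 24) & 0xffu;                // portable fallback: MSB first
    bytes[1] = (val >> 16) & 0xffu;
    bytes[2] = (val >> 8) & 0xffu;
    bytes[3] = val & 0xffu;
#endif
}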
Because you are masking off the top byte of the integer and then not shifting it back down 24 bits...
Change your loop to:
for(long i = 0; i < sizeof(long); i++) {
    byte = (data & bitmask) >> 24;
    data <<= 8;
}