Simplifying a boolean logical expression to make it faster - c++

Consider the following two functions:
// Returns true when n <= m and the distance (m - n) is either zero or still has
// at least one bit set after being ANDed with Mask.
// The defaulted third template parameter removes this overload for signed Type.
template <typename Type, Type Mask, class = typename std::enable_if<std::is_unsigned<Type>::value>::type>
inline bool function1(const Type n, const Type m)
{
// Unsigned subtraction wraps when n > m; that case is rejected by (n <= m) below.
const Type diff = m-n;
// Masking can only clear bits, so msk is never larger than diff.
const Type msk = Mask & diff;
return (n <= m) && ((!msk && !diff) || (msk && msk <= diff));
}
// Same predicate as function1, written as one expression with the temporaries
// (m-n) and (Mask & (m-n)) spelled out at every use.
template <typename Type, Type Mask, class = typename std::enable_if<std::is_unsigned<Type>::value>::type>
inline bool function2(const Type n, const Type m)
{
// (n <= m) short-circuits first, so the subtractions are only evaluated when m-n cannot wrap.
return (n <= m) && ((!(Mask & (m-n)) && !(m-n)) || ((Mask & (m-n)) && (Mask & (m-n)) <= (m-n)));
}
They do exactly the same thing, except that the first one is more readable due to the use of temporary values (function2 is function1 but where I replaced the temporaries by their original values).
It happens that function2 is a little faster than function1, and because I will call it billions of times on supercomputers, I would like to know whether there is a simpler boolean expression that produces exactly the same result (Type will always be an unsigned integral type).

The expression can be optimized as follows:
(!msk && !diff) can be rewritten to !diff, as the expression will be true if both are zero, and msk is zero if diff is zero.
Also, isn't diff always >= msk? That is, because using & cannot increase the value of msk. (This holds if Type is an unsigned integer)
I changed the order of !diff and msk as it seems plausible that msk is more often true than !diff.
The final expression is:
(n <= m) && (msk || !diff).
Another equivalent expression (suggested by anatolyg) is:
(n < m && (Mask & (m - n))) || (n == m)

A test might be flawed.
First Test:
#include <iostream>
#include <chrono>
// Variant 1 of the benchmarked predicate: both temporaries are computed up
// front, then combined. True when n <= m and (m - n) is zero or shares a bit
// with Mask.
template <unsigned Mask>
inline bool function1(const unsigned n, const unsigned m)
{
    const unsigned gap = m - n;        // wraps when n > m; rejected below
    const unsigned hits = Mask & gap;  // never larger than gap
    if (n > m) {
        return false;
    }
    return hits ? (hits <= gap) : (gap == 0u);
}
// Variant 2 of the benchmarked predicate: a single expression with no named
// temporaries; (m - n) and (Mask & (m - n)) are spelled out at each use.
template <unsigned Mask>
inline bool function2(const unsigned n, const unsigned m)
{
    return !(n > m)
        && ((Mask & (m - n)) ? ((Mask & (m - n)) <= (m - n))
                             : !(m - n));
}
// Variant 3 of the benchmarked predicate: explicit branches.
// n > m fails, n == m succeeds, otherwise the masked distance decides.
template <unsigned Mask>
inline bool function3(const unsigned n, const unsigned m)
{
    if (n > m) {
        return false;
    }
    if (n == m) {
        return true;
    }
    return (Mask & (m - n)) != 0u;
}
// Variant 4 of the benchmarked predicate: one boolean expression, equality
// tested first, then "strictly below and the masked distance is non-zero".
template <unsigned Mask>
inline bool function4(const unsigned n, const unsigned m)
{
    return (n == m) || ((n < m) && ((Mask & (m - n)) != 0u));
}
// Benchmark operands (a, b) and the sink for each call's return value.
// All three are volatile, so every read of a/b and every write of result in
// the timed loop has to be performed.
// NOTE(review): std::rand is declared in <cstdlib>, which this listing does not include.
volatile unsigned a = std::rand();
volatile unsigned b = std::rand();
volatile bool result;
// Converts the interval between two system_clock time points into seconds,
// dividing the tick count by the denominator of the clock's period.
inline double duration(
    std::chrono::system_clock::time_point start,
    std::chrono::system_clock::time_point end)
{
    typedef std::chrono::system_clock::period TickPeriod;
    const std::chrono::system_clock::duration elapsed = end - start;
    const double ticks = static_cast<double>(elapsed.count());
    return ticks / TickPeriod::den;
}
// First benchmark: times the four template variants (Mask fixed to 0x1234 at
// compile time) on the same pair of volatile operands and prints the
// accumulated time per variant in seconds.
int main() {
typedef bool (*Function)(const unsigned, const unsigned);
const unsigned N = 4;
// One accumulated duration per variant, value-initialised to zero.
std::chrono::system_clock::duration timing[N] = {};
// Calls go through function pointers, one instantiation per variant.
Function fn[] = {
function1<0x1234>,
function2<0x1234>,
function3<0x1234>,
function4<0x1234>,
};
// 10000 * 100 rounds; in each round every variant is timed over 100 calls,
// so the variants are interleaved rather than measured one after another.
for(unsigned i = 0; i < 10000; ++i) {
for(unsigned j = 0; j < 100; ++j) {
unsigned Loops = 100;
for(unsigned f = 0; f < N; ++f) {
auto start = std::chrono::system_clock::now();
for(unsigned loop = 0; loop < Loops; ++loop) {
// a and b are volatile globals, so they are re-read for every call.
result = fn[f](a, b);
}
auto end = std::chrono::system_clock::now();
timing[f] += (end-start);
}
}
}
// The literal 4 mirrors N above.
for(unsigned i = 0; i < 4; ++i) {
std::cout
<< "Timing " << i+1 << ": "
// tick count divided by the period denominator, i.e. seconds when period::num is 1
<< double(timing[i].count()) / std::chrono::system_clock::period::den
<< "\n";
}
}
compiled with g++ -std=c++11 -O3
shows:
Timing 1: 0.435909
Timing 2: 0.435438
Timing 3: 0.435435
Timing 4: 0.435523
Second Test:
#include <iostream>
#include <chrono>
// Second-test variant 1: Mask is a runtime argument instead of a template parameter.
// True when n <= m and (m - n) is zero or shares at least one bit with Mask.
inline bool function1(const unsigned Mask, const unsigned n, const unsigned m)
{
// Computed before the ordering test; wraps when n > m, which (n <= m) rejects.
const unsigned diff = m-n;
const unsigned msk = Mask & diff;
return (n <= m) && ((!msk && !diff) || (msk && msk <= diff));
}
// Second-test variant 2: the same predicate as function1 with the temporaries
// written out at every use; Mask is a runtime argument.
inline bool function2(const unsigned Mask, const unsigned n, const unsigned m)
{
return (n <= m) && ((!(Mask & (m-n)) && !(m-n)) || ((Mask & (m-n)) && (Mask & (m-n)) <= (m-n)));
}
// Second-test variant 3: explicit branches. m < n fails, m == n succeeds,
// otherwise the result is whether (m - n) has any bit in common with Mask.
inline bool function3(const unsigned Mask, const unsigned n, const unsigned m)
{
if(m < n) return false;
else if(m == n) return true;
// The unsigned AND result is converted to bool: non-zero means true.
else return Mask & (m-n);
}
// Second-test variant 4: single expression; true when n == m, or when n < m
// and (m - n) has any bit in common with Mask.
inline bool function4(const unsigned Mask, const unsigned n, const unsigned m)
{
return (n < m && (Mask & (m-n))) || (n == m);
}
// Seconds elapsed between two system_clock time points: the raw tick count
// divided by the denominator of the clock's period.
inline double duration(
    std::chrono::system_clock::time_point start,
    std::chrono::system_clock::time_point end)
{
    const auto ticks = (end - start).count();
    const auto ticksPerSecond = std::chrono::system_clock::period::den;
    return double(ticks) / ticksPerSecond;
}
// Second benchmark: times function1..function4 on pre-generated random
// (mask, a, b) triples and cross-checks that all four variants agree.
//
// Each outer iteration works on its own window of InnerLoops samples, so every
// generated sample is used exactly once per variant. For each window the
// number of `true` results is accumulated per variant and compared, which
// checks every call rather than only the last one of the window.
//
// Returns 0 on success, -1 (after printing "Different Results") on a mismatch;
// timings are only printed on success.
int main() {
    typedef bool (*Function)(const unsigned, const unsigned, const unsigned);
    const unsigned N = 4;
    std::chrono::system_clock::duration timing[N] = {};
    Function fn[N] = {
        function1,
        function2,
        function3,
        function4,
    };
    const unsigned OuterLoops = 1000000;
    const unsigned InnerLoops = 100;
    const unsigned Samples = OuterLoops * InnerLoops;
    // Three arrays of Samples values each (roughly 1.2 GB with a 4-byte unsigned).
    unsigned* M = new unsigned[Samples];
    unsigned* A = new unsigned[Samples];
    unsigned* B = new unsigned[Samples];
    for(unsigned i = 0; i < Samples; ++i) {
        M[i] = std::rand();
        A[i] = std::rand();
        B[i] = std::rand();
    }
    int status = 0;
    for(unsigned i = 0; i < OuterLoops && status == 0; ++i) {
        // First sample of this window; the largest index reached is Samples - 1.
        const unsigned base = i * InnerLoops;
        unsigned hits[N] = {};
        for(unsigned f = 0; f < N; ++f) {
            auto start = std::chrono::system_clock::now();
            for(unsigned j = 0; j < InnerLoops; ++j) {
                const unsigned index = base + j;
                hits[f] += fn[f](M[index], A[index], B[index]);
            }
            auto end = std::chrono::system_clock::now();
            timing[f] += (end-start);
        }
        for(unsigned f = 1; f < N; ++f) {
            if(hits[0] != hits[f]) {
                std::cerr << "Different Results\n";
                status = -1;
                break;
            }
        }
    }
    if(status == 0) {
        for(unsigned i = 0; i < N; ++i) {
            std::cout
                << "Timing " << i+1 << ": "
                << double(timing[i].count()) / std::chrono::system_clock::period::den
                << "\n";
        }
    }
    delete[] M;
    delete[] A;
    delete[] B;
    // Returning from main is equivalent to calling exit with the same status.
    return status;
}
compiled with g++ -std=c++11 -O3
shows:
Timing 1: 0.763875
Timing 2: 0.738105
Timing 3: 0.518714
Timing 4: 0.785299
Disassembly of the second functions (compiled without inline):
0000000000000000 <_Z9function1jjj>:
0: 31 c0 xor %eax,%eax
2: 39 f2 cmp %esi,%edx
4: 72 10 jb 16 <_Z9function1jjj+0x16>
6: 29 f2 sub %esi,%edx
8: 21 d7 and %edx,%edi
a: 89 f8 mov %edi,%eax
c: 09 d0 or %edx,%eax
e: 74 18 je 28 <_Z9function1jjj+0x28>
10: 39 d7 cmp %edx,%edi
12: 76 0c jbe 20 <_Z9function1jjj+0x20>
14: 31 c0 xor %eax,%eax
16: f3 c3 repz retq
18: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
1f: 00
20: 85 ff test %edi,%edi
22: 74 f0 je 14 <_Z9function1jjj+0x14>
24: 0f 1f 40 00 nopl 0x0(%rax)
28: b8 01 00 00 00 mov $0x1,%eax
2d: c3 retq
2e: 66 90 xchg %ax,%ax
0000000000000030 <_Z9function2jjj>:
30: 31 c0 xor %eax,%eax
32: 39 d6 cmp %edx,%esi
34: 77 0c ja 42 <_Z9function2jjj+0x12>
36: 89 d1 mov %edx,%ecx
38: 29 f1 sub %esi,%ecx
3a: 21 cf and %ecx,%edi
3c: 75 0a jne 48 <_Z9function2jjj+0x18>
3e: 39 f2 cmp %esi,%edx
40: 74 0a je 4c <_Z9function2jjj+0x1c>
42: f3 c3 repz retq
44: 0f 1f 40 00 nopl 0x0(%rax)
48: 39 f9 cmp %edi,%ecx
4a: 72 f6 jb 42 <_Z9function2jjj+0x12>
4c: b8 01 00 00 00 mov $0x1,%eax
51: c3 retq
52: 66 66 66 66 66 2e 0f data32 data32 data32 data32 nopw %cs:0x0(%rax,%rax,1)
59: 1f 84 00 00 00 00 00
0000000000000060 <_Z9function3jjj>:
60: 31 c0 xor %eax,%eax
62: 39 f2 cmp %esi,%edx
64: 72 0f jb 75 <_Z9function3jjj+0x15>
66: 74 08 je 70 <_Z9function3jjj+0x10>
68: 29 f2 sub %esi,%edx
6a: 85 fa test %edi,%edx
6c: 0f 95 c0 setne %al
6f: c3 retq
70: b8 01 00 00 00 mov $0x1,%eax
75: f3 c3 repz retq
77: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
7e: 00 00
0000000000000080 <_Z9function4jjj>:
80: 39 d6 cmp %edx,%esi
82: 73 0d jae 91 <_Z9function4jjj+0x11>
84: 89 d1 mov %edx,%ecx
86: b8 01 00 00 00 mov $0x1,%eax
8b: 29 f1 sub %esi,%ecx
8d: 85 f9 test %edi,%ecx
8f: 75 05 jne 96 <_Z9function4jjj+0x16>
91: 39 d6 cmp %edx,%esi
93: 0f 94 c0 sete %al
96: f3 c3 repz retq
Hardware:
Intel® Core™ i3-2310M CPU @ 2.10GHz × 4
7.7 GiB
My conclusion:
Analyze the algorithm properly (see @George's answer)
Express the optimized algorithm in simple code and leave fine tuning optimizations to the compiler.
Write a proper test case (measurement), but the kind of measurement will impact the result. (Here: The first and second show different results) -

The difference between f1 and f2 is probably because the compiler fails to delay the evaluation of diff and msk in the case n>m in f1.
Below is sample code to time your functions, with results in microseconds on my computer under VS2013. Also, as @George said, there are redundant evaluations, so I added f1b and f3.
f1 = 98201.7us
f1b = 95574.1us
f2 = 96613.1us
f3 = 94809.9us
And the code :
#include <iostream>
#include <vector>
#include <random>
#include <limits>
#include <chrono>
#include <algorithm>
#define NOMINMAX
#include <windows.h>
// Clock type exposing the member names the <chrono> clock concept expects
// (rep, period, duration, time_point, is_steady, now). Ticks are nanoseconds
// stored in a long long; now() is only declared here.
struct HighResClock {
    using rep = long long;
    using period = std::nano;
    using duration = std::chrono::duration<rep, period>;
    using time_point = std::chrono::time_point<HighResClock>;
    static const bool is_steady = true;
    static time_point now( );
};
namespace {
const long long g_Frequency = [] ( ) -> long long {
LARGE_INTEGER frequency;
QueryPerformanceFrequency( &frequency );
return frequency.QuadPart;
}( );
}
// Returns the current QueryPerformanceCounter reading converted to nanoseconds.
//
// The conversion is split into whole seconds and the sub-second remainder.
// Multiplying the raw tick count by period::den (1e9) first would overflow a
// signed 64-bit value once the counter exceeds about 9.2e9 ticks; with the
// split, only the remainder (always smaller than g_Frequency) is scaled.
// The result equals floor(ticks * period::den / g_Frequency), as before.
HighResClock::time_point HighResClock::now( ) {
    LARGE_INTEGER count;
    QueryPerformanceCounter( &count );
    const rep ticks = count.QuadPart;
    const rep nanosPerSecond = static_cast<rep>( period::den );
    const rep wholeSeconds = ticks / g_Frequency;
    const rep leftoverTicks = ticks % g_Frequency;
    const rep nanos = wholeSeconds * nanosPerSecond
                    + leftoverTicks * nanosPerSecond / g_Frequency;
    return time_point( duration( nanos ) );
}
// f1: computes both temporaries before testing the ordering of n and m.
// True when n <= m and (m - n) is zero or shares at least one bit with Mask.
template <typename Type, Type Mask>
inline bool function1( const Type n, const Type m ) {
    static_assert( std::is_unsigned<Type>::value, "Type must be unsigned" );
    const Type gap = m - n;       // wraps when n > m; rejected by `ordered`
    const Type hit = Mask & gap;  // never larger than gap
    const bool ordered = !( m < n );
    const bool matches = hit ? ( hit <= gap ) : ( gap == 0 );
    return ordered && matches;
}
// f1b: like function1, but rejects n > m before computing the temporaries.
template <typename Type, Type Mask>
inline bool function1b( const Type n, const Type m ) {
    static_assert( std::is_unsigned<Type>::value, "Type must be unsigned" );
    if ( m < n ) {
        return false;
    }
    const Type gap = m - n;
    const Type hit = Mask & gap;
    if ( hit == 0 ) {
        // Nothing survives the mask: only an exact match qualifies.
        return gap == 0;
    }
    return hit <= gap;
}
// f2: the predicate as a single expression with no named temporaries.
template <typename Type, Type Mask>
inline bool function2( const Type n, const Type m ) {
    static_assert( std::is_unsigned<Type>::value, "Type must be unsigned" );
    return !( n > m )
        && ( ( Mask & ( m - n ) ) ? ( ( Mask & ( m - n ) ) <= ( m - n ) )
                                  : !( m - n ) );
}
// f3: settles n == m and n > m first, then tests the masked distance.
template <typename Type, Type Mask>
inline bool function3( const Type n, const Type m ) {
    static_assert( std::is_unsigned<Type>::value, "Type must be unsigned" );
    if ( m <= n ) {
        // Equal operands match; m below n never does.
        return m == n;
    }
    const Type span = m - n;
    const Type selected = Mask & span;
    return selected != 0 && !( span < selected );
}
// Builds n pairs of random size_t values. A Mersenne Twister seeded from
// std::random_device feeds a uniform distribution over the full size_t range.
std::vector<std::pair<size_t, size_t>> fill( size_t n ) {
    std::random_device seedSource;
    std::mt19937 engine( seedSource( ) );
    std::uniform_int_distribution<size_t> anyValue( 0, std::numeric_limits<size_t>::max( ) );

    std::vector<std::pair<size_t, size_t>> pairs;
    pairs.reserve( n );
    for ( size_t produced = 0; produced != n; ++produced ) {
        // Drawn into locals so the first member is always generated first.
        const size_t first = anyValue( engine );
        const size_t second = anyValue( engine );
        pairs.push_back( std::pair<size_t, size_t>( first, second ) );
    }
    return pairs;
}
// Running total of every count_if result, so the timed work has an observable effect.
size_t ignoreOptim {};
// Times one std::count_if pass of predicate f over nms and returns the elapsed
// time in microseconds. The match count is added to ignoreOptim.
//
// nms is taken by const reference: the previous by-value parameter copied the
// whole vector on every call.
template <typename F>
std::chrono::microseconds foo( std::vector<std::pair<size_t, size_t>> const& nms, F &&f ) {
    // HighResClock is used instead of std::chrono::high_resolution_clock, which
    // the original author notes falls back to system_clock on their toolchain.
    using clock = HighResClock;
    auto t0 = clock::now( );
    auto f1 = std::count_if( begin( nms ), end( nms ), std::forward<F>( f ) );
    auto t1 = clock::now( );
    ignoreOptim += f1;
    return std::chrono::duration_cast<std::chrono::microseconds>( t1 - t0 );
}
// Runs foo 100 times with the same data and predicate and prints
// "<name> = <value>us", where value is the summed microseconds divided by 10.
// NOTE(review): dividing the sum of 100 runs by 10 is kept as in the original;
// confirm whether a per-run average (divide by 100) was intended.
//
// nms is taken by const reference to avoid copying the whole vector per call,
// and f is passed on as an lvalue so it is never forwarded (and possibly moved
// from) more than once.
template <typename F>
void bar( std::vector<std::pair<size_t, size_t>> const& nms, char const* name, F &&f ) {
    std::chrono::microseconds total {};
    for ( int run {}; run != 100; ++run )
        total += foo( nms, f );
    std::cout << name << " = " << float( total.count( ) ) / 10.f << "us" << std::endl;
}
// Generates 2^21 random pairs and benchmarks the four predicate variants on
// them, all with the mask 0x0003000000000000.
int main( ) {
    using Pair = std::pair<size_t, size_t>;
    const auto nms = fill( 1 << 21 );

    bar( nms, "f1", [] ( Pair nm ) {
        return function1<size_t, 0x0003000000000000ull>( nm.first, nm.second );
    } );
    bar( nms, "f1b", [] ( Pair nm ) {
        return function1b<size_t, 0x0003000000000000ull>( nm.first, nm.second );
    } );
    bar( nms, "f2", [] ( Pair nm ) {
        return function2<size_t, 0x0003000000000000ull>( nm.first, nm.second );
    } );
    bar( nms, "f3", [] ( Pair nm ) {
        return function3<size_t, 0x0003000000000000ull>( nm.first, nm.second );
    } );
    return 0;
}

Related

Dynamic stack buffer overflow error (Leetcode)

I'm getting a runtime error while submitting the code on Leetcode.
Code:
class Solution {
public:
    // Advances `board` by one Game of Life generation, in place.
    //
    // A cell counts as live when its value is 1. A dead cell (0) becomes live
    // with exactly three live neighbours; a live cell stays live only with two
    // or three. Neighbours outside the board are ignored, so boards with a
    // single row or column (including 1x1) are handled; an empty board is left
    // untouched.
    void gameOfLife(std::vector<std::vector<int>>& board) {
        if (board.empty() || board[0].empty()) {
            return;
        }
        const int rows = static_cast<int>(board.size());
        const int cols = static_cast<int>(board[0].size());

        // Snapshot of the current generation: neighbour counts must not see
        // cells that were already updated.
        const std::vector<std::vector<int>> previous = board;

        for (int r = 0; r < rows; ++r) {
            for (int c = 0; c < cols; ++c) {
                int live = 0;
                for (int dr = -1; dr <= 1; ++dr) {
                    for (int dc = -1; dc <= 1; ++dc) {
                        if (dr == 0 && dc == 0) {
                            continue;  // the cell itself is not a neighbour
                        }
                        const int nr = r + dr;
                        const int nc = c + dc;
                        if (nr < 0 || nr >= rows || nc < 0 || nc >= cols) {
                            continue;  // off the board
                        }
                        if (previous[nr][nc] == 1) {
                            ++live;
                        }
                    }
                }
                if (previous[r][c] == 0) {
                    if (live == 3) {
                        board[r][c] = 1;
                    }
                } else if (live != 2 && live != 3) {
                    board[r][c] = 0;
                }
            }
        }
    }
};
I don't have much clarity regarding the dynamic-stack-buffer-overflow error. I referenced the link: Error: dynamic-stack-buffer-overflow and got the idea it's due to an out-of-bound index reference. I tried dry running with the sample input but it didn't help much.
Error message:
=================================================================
==34==ERROR: AddressSanitizer: dynamic-stack-buffer-overflow on address 0x7ffc2a0e3024 at pc 0x000000347dd6 bp 0x7ffc2a0e2f90 sp 0x7ffc2a0e2f88
READ of size 4 at 0x7ffc2a0e3024 thread T0
#2 0x7f85967fb0b2 (/lib/x86_64-linux-gnu/libc.so.6+0x270b2)
Address 0x7ffc2a0e3024 is located in stack of thread T0
Shadow bytes around the buggy address:
0x1000054145b0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x1000054145c0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x1000054145d0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x1000054145e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x1000054145f0: 00 00 00 00 ca ca ca ca 04 cb cb cb cb cb cb cb
=>0x100005414600: ca ca ca ca[04]cb cb cb cb cb cb cb f1 f1 f1 f1
0x100005414610: 00 f2 f2 f2 f8 f3 f3 f3 00 00 00 00 00 00 00 00
0x100005414620: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x100005414630: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x100005414640: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
0x100005414650: 00 00 00 00 00 00 00 00 f1 f1 f1 f1 01 f2 04 f2
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
Freed heap region: fd
Stack left redzone: f1
Stack mid redzone: f2
Stack right redzone: f3
Stack after return: f5
Stack use after scope: f8
Global redzone: f9
Global init order: f6
Poisoned by user: f7
Container overflow: fc
Array cookie: ac
Intra object redzone: bb
ASan internal: fe
Left alloca redzone: ca
Right alloca redzone: cb
Shadow gap: cc
==34==ABORTING
Any suggestions as to which part of the code is throwing the error would be much appreciated.
You should really include the input for which the error was thrown. When the grid consists of only one row or one column, or is for example {{0}}, then i == 0 && j == 0 is true, but mat[1] is out of bounds and possibly filled with garbage, which then gets read. I recommend you clean up the bounds checking and neighbor counting so that it is not only more compact but also much easier to read. One example of how it can be rewritten is:
// Adds the values of the (up to eight) in-bounds neighbours of cell (i, j) to c.
for (int a = -1; a <= 1; ++a) {
    for (int b = -1; b <= 1; ++b) {
        const bool isSelf = (a == 0 && b == 0);
        // The negative checks come first, so the comparisons against size()
        // only ever see non-negative values.
        const bool rowOutside = (i + a < 0) || (i + a >= board.size());
        const bool colOutside = (j + b < 0) || (j + b >= board[i].size());
        if (!isSelf && !rowOutside && !colOutside) {
            c += board[i+a][j+b];
        }
    }
}
Another note, which I mentioned in the comments: int mat[m][n] with runtime sizes (a variable-length array) is not standard C++. Some compilers support it as an extension, some don't. If you want dynamically sized arrays, use new, or a C-style alloca (which is itself a non-standard extension rather than part of ISO C++), but generally STL containers are your friend, so you could also just use a std::vector<std::vector<int>>, like the problem does.
Note about alloca: As someone pointed out errors can be caused by placing arrays too large on the stack, so be careful. Also generally this is only useful to people who already know about it and know its limitations well. Mainly there as an example, use standard containers.

Convert char array to byte array

I have a string declared like so.
CHAR bkp[40] = "dc74699a8381da395f10b"; <- this value comes from querying a registry value
In memory (using VS memory window) I see..
0x00000071432FF918 64 63 37 34 36 39 39 61 38 33 38 31 64 61 33 39 35 66 31 30 62 00 .. .. .. ..
I am trying to convert the string to memory so that when I examine that memory address I see..
0x00000071432FF918 dc 74 69 9a 83 81 da 39 5f 10 0b .. .. .. ..
My project is in C++ but the function requires that it gets returned to a char *. So if the char array needs to be converted to a C++ string it can.
Simply iterate through the string, and for every 2-char pair, you can do some very simple calculations and bit shifts to extract the byte values. For example:
#include <stdexcept>  // std::invalid_argument; this snippet has no include block of its own

// Converts one hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its value 0..15.
// Throws std::invalid_argument for any other character, so the function never
// falls off its end without returning a value.
BYTE decodeHex(char c)
{
    if (c >= '0' && c <= '9')
        return static_cast<BYTE>(c - '0');
    if (c >= 'A' && c <= 'F')
        return static_cast<BYTE>((c - 'A') + 10);
    if (c >= 'a' && c <= 'f')
        return static_cast<BYTE>((c - 'a') + 10);
    throw std::invalid_argument("decodeHex: not a hexadecimal digit");
}
// Decodes the hex text in bkp into slen / 2 bytes.
CHAR bkp[] = "dc74699a8381da395f100b";
int slen = strlen(bkp);
BYTE *bytes = new BYTE[slen / 2];
int blen = 0;
// Only complete two-character pairs are decoded. With an odd length the last
// character is skipped, so neither the string terminator is decoded nor
// bytes[] written past its slen / 2 elements.
for(int i = 0; i + 1 < slen; i += 2)
{
    bytes[blen++] = (decodeHex(bkp[i]) << 4) | decodeHex(bkp[i+1]);
}
// use bytes up to blen as needed...
delete[] bytes;
You need to convert your character array to binary. Your input array is a hex string, so this is rather straightforward.
// Converts one hexadecimal digit character to its 4-bit value (0..15).
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'. Any other character returns 0xFF, a
// value no valid digit can produce, so callers can detect bad input.
unsigned char toBinary(char c)
{
    if (c >= '0' && c <= '9')
        return static_cast<unsigned char>(c - '0');
    if (c >= 'a' && c <= 'f')
        return static_cast<unsigned char>((c - 'a') + 10);
    if (c >= 'A' && c <= 'F')
        return static_cast<unsigned char>((c - 'A') + 10);
    return 0xFF;
}
// Decodes the hex text in bkp into b, two characters per output byte.
CHAR bkp[40] = "dc74699a8381da395f10b";
unsigned char b[20] = {};
int bi = 0;
// Stop at the string terminator instead of decoding all 40 array elements.
// A trailing unpaired digit is skipped.
for(int i = 0; i + 1 < static_cast<int>(sizeof(bkp)) && bkp[i] != '\0' && bkp[i+1] != '\0'; i += 2)
{
    unsigned char v = toBinary(bkp[i]) << 4;
    v += toBinary(bkp[i+1]);
    b[bi++] = v;
}
The array is a character string, so you'll have to convert from characters to hex.
Let's use the old fashioned method:
// Decodes the lower-case hex text in bkp into destination, one byte per pair
// of characters, by looking each character up in hex_digits.
const unsigned int length = sizeof(bkp);  // capacity of the array, not the text length
const std::string hex_digits = "0123456789abcdef";
std::vector<uint8_t> destination;
// Advance exactly two characters per byte and stop at the string terminator,
// so the padding after the text is not reported as invalid input.
for (unsigned int index = 0U;
     index + 1U < length && bkp[index] != '\0' && bkp[index + 1U] != '\0';
     index += 2U)
{
    const std::string::size_type high = hex_digits.find(bkp[index]);
    if (high == std::string::npos)
    {
        std::cerr << "invalid hex value at position " << index << "\n";
        break;
    }
    const std::string::size_type low = hex_digits.find(bkp[index + 1U]);
    if (low == std::string::npos)
    {
        std::cerr << "invalid hex value at position " << (index + 1U) << "\n";
        break;
    }
    // The high digit is worth 16, not 256: two 4-bit digits make one byte.
    destination.push_back(static_cast<uint8_t>((high << 4) | low));
}
Note: the above code uses C++ features since the original post was tagged as C++.
Just for some fun, you can perform the conversion using non-conditional operations.
In general:
'A' = 65, 'a' = 97, both of which have bit 6 set (value 64 decimal)
'0' = 48, so does not have the 6th bit set.
You can take the input, take the lower 4 bits to give us 0->9 for the digits and 1->6 for A->F or a->f, and then take bit 6 and use it as a multiplier to add 9 if needed (letters start at 1 in the low nibble, so 1 + 9 = 10).
#include <conio.h>
#include <stdio.h>
#include <string.h>
// Decodes the NUL-terminated hex string `in` into raw bytes at `out`, two
// characters per byte, without branching on the character class.
//
// Per character: the low nibble gives 0-9 for digits and 1-6 for letters;
// bit 6 is set only for letters, so (c >> 6) * 9 adds the missing 9.
// No validation is performed. An odd-length string pairs its last digit with
// the terminator, so that digit becomes the high nibble of the final byte.
// `out` must have room for (strlen(in) + 1) / 2 bytes.
//
// The length is measured once before the loop. Besides avoiding a strlen call
// per iteration, this lets the function decode in place (out aliasing in)
// even when a decoded byte is zero.
void HexStrToRaw(char* in, unsigned char* out)
{
    const size_t length = strlen(in);
    for (size_t pos = 0, written = 0; pos < length; pos += 2, ++written)
    {
        const int high = (in[pos] & 15) + ((in[pos] >> 6) * 9);
        const int low = (in[pos + 1] & 15) + ((in[pos + 1] >> 6) * 9);
        out[written] = static_cast<unsigned char>((high << 4) | low);
    }
}
// Decodes a sample 21-character hex string and prints all 20 bytes of the
// output buffer as "<index> -> 0x<byte>".
int main(int argc, char** argv)
{
    char in[40] = "dc74699a8381da395f10b";
    unsigned char out[20];
    HexStrToRaw(in, out);
    // Only the first 11 bytes are written by the decoder (21 characters, two
    // per byte); the remaining entries are printed without ever being set.
    const unsigned char* const stop = out + sizeof(out);
    for (const unsigned char* cursor = out; cursor != stop; ++cursor)
    {
        printf("%d -> 0x%02x\n", static_cast<int>(cursor - out), *cursor);
    }
    return 0;
}
The output becomes:
0 -> 0xdc
1 -> 0x74
2 -> 0x69
3 -> 0x9a
4 -> 0x83
5 -> 0x81
6 -> 0xda
7 -> 0x39
8 -> 0x5f
9 -> 0x10
10 -> 0xb0
11 -> 0xcc
12 -> 0xcc
13 -> 0xcc
14 -> 0xcc
15 -> 0xcc
16 -> 0xcc
17 -> 0xcc
18 -> 0xcc
19 -> 0xcc

Why _umul128 works slower than scalar code for mul128x64x2 function?

This is my second attempt at implementing a fast mul128x64x2 function. The first time, I asked the question without a comparison against the MSVC _umul128 version. Now I have made that comparison, and the results show that the _umul128 function is slower than both the native scalar code and the hand-made SIMD (AVX 1.0) code.
Below my test code:
#include <iostream>
#include <chrono>
#include <intrin.h>
#include <emmintrin.h>
#include <immintrin.h>
#pragma intrinsic(_umul128)
// Mask that keeps the low 32 bits of each 64-bit SIMD lane.
constexpr uint32_t LOW[4] = { 4294967295u, 0u, 4294967295u, 0u };
// SIMD variant: processes two word pairs of EFGH at once, one per 64-bit lane.
// Lane 0 starts from EFGH[0] (then EFGH[1] after the shift) and feeds OUT[0];
// lane 1 starts from EFGH[2] (then EFGH[3]) and feeds OUT[1].
// In each lane the pair is multiplied by all four words of ABCD and the 32-bit
// column sums of the partial products are chained together, carrying the
// upper half of each sum into the next column.
// Uses the MSVC-specific m128i_u32 member to read lanes back.
__forceinline void multiply128x128( const uint32_t ABCD[4], const uint32_t EFGH[4], uint32_t OUT[2][4] ) noexcept
{
__m128i L = _mm_lddqu_si128( reinterpret_cast< __m128i const* >( LOW ) );
__m128i IN = _mm_lddqu_si128( reinterpret_cast< __m128i const* >( EFGH ) );
// Broadcast each word of ABCD to every 32-bit element.
__m128i A = _mm_set1_epi32( ABCD[0] );
__m128i B = _mm_set1_epi32( ABCD[1] );
__m128i C = _mm_set1_epi32( ABCD[2] );
__m128i D = _mm_set1_epi32( ABCD[3] );
// _mm_mul_epu32 multiplies the low 32 bits of each 64-bit lane into a full
// 64-bit product, so the E* products use EFGH[0] (lane 0) and EFGH[2] (lane 1).
__m128i ED = _mm_mul_epu32( IN, D );
__m128i EC = _mm_mul_epu32( IN, C );
__m128i EB = _mm_mul_epu32( IN, B );
__m128i EA = _mm_mul_epu32( IN, A );
// Move the odd words down: the F* products use EFGH[1] (lane 0) and EFGH[3] (lane 1).
IN = _mm_srli_epi64( IN, 32 );
__m128i FD = _mm_mul_epu32( IN, D );
__m128i FC = _mm_mul_epu32( IN, C );
__m128i FB = _mm_mul_epu32( IN, B );
__m128i FA = _mm_mul_epu32( IN, A );
// Split every 64-bit product into its high (_H) and low (_L) 32-bit halves.
__m128i FD_H = _mm_srli_epi64( FD, 32 );
__m128i FD_L = _mm_and_si128 ( L, FD );
__m128i FC_H = _mm_srli_epi64( FC, 32 );
__m128i FC_L = _mm_and_si128 ( L, FC );
__m128i FB_H = _mm_srli_epi64( FB, 32 );
__m128i FB_L = _mm_and_si128 ( L, FB );
__m128i FA_H = _mm_srli_epi64( FA, 32 );
__m128i FA_L = _mm_and_si128 ( L, FA );
__m128i ED_H = _mm_srli_epi64( ED, 32 );
__m128i ED_L = _mm_and_si128 ( L, ED );
__m128i EC_H = _mm_srli_epi64( EC, 32 );
__m128i EC_L = _mm_and_si128 ( L, EC );
__m128i EB_H = _mm_srli_epi64( EB, 32 );
__m128i EB_L = _mm_and_si128 ( L, EB );
__m128i EA_H = _mm_srli_epi64( EA, 32 );
__m128i EA_L = _mm_and_si128 ( L, EA );
// Pairwise sums of halves that belong to the same 32-bit column.
__m128i SUM_FC_L_FD_H = _mm_add_epi64( FC_L, FD_H );
__m128i SUM_FB_L_FC_H = _mm_add_epi64( FB_L, FC_H );
__m128i SUM_FA_L_FB_H = _mm_add_epi64( FA_L, FB_H );
__m128i SUM_EC_L_ED_H = _mm_add_epi64( EC_L, ED_H );
__m128i SUM_EB_L_EC_H = _mm_add_epi64( EB_L, EC_H );
__m128i SUM_EA_L_EB_H = _mm_add_epi64( EA_L, EB_H );
// Complete column sums. FD_L (the lowest column) is not used anywhere below.
__m128i SUM_FC_L_FD_H_ED_L = _mm_add_epi64( SUM_FC_L_FD_H, ED_L );
__m128i SUM_FB_L_FC_H_EC_L_ED_H = _mm_add_epi64( SUM_FB_L_FC_H, SUM_EC_L_ED_H );
__m128i SUM_FA_L_FB_H_EB_L_EC_H = _mm_add_epi64( SUM_FA_L_FB_H, SUM_EB_L_EC_H );
__m128i SUM_FA_H_EA_L_EB_H = _mm_add_epi64( FA_H, SUM_EA_L_EB_H );
// Carry chain: the upper 32 bits of each column sum are added to the next column.
__m128i SUM_FC_L_FD_H_ED_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L, 32 );
SUM_FC_L_FD_H_ED_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L, SUM_FB_L_FC_H_EC_L_ED_H );
__m128i SUM_FC_L_FD_H_ED_L_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L, 32 );
SUM_FC_L_FD_H_ED_L_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L, SUM_FA_L_FB_H_EB_L_EC_H );
__m128i SUM_FC_L_FD_H_ED_L_L_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L_L, 32 );
SUM_FC_L_FD_H_ED_L_L_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L_L, SUM_FA_H_EA_L_EB_H );
__m128i SUM_FC_L_FD_H_ED_L_L_L_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L_L_L, 32 );
SUM_FC_L_FD_H_ED_L_L_L_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L_L_L, EA_H );
// Store the low 32 bits of the last four column sums: 32-bit element 0 is
// lane 0 (OUT[0]), element 2 is lane 1 (OUT[1]). Index 3 holds the lowest
// stored column, index 0 the highest.
OUT[0][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L.m128i_u32[0];
OUT[0][1] = SUM_FC_L_FD_H_ED_L_L_L_L.m128i_u32[0];
OUT[0][2] = SUM_FC_L_FD_H_ED_L_L_L.m128i_u32[0];
OUT[0][3] = SUM_FC_L_FD_H_ED_L_L.m128i_u32[0];
OUT[1][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L.m128i_u32[2];
OUT[1][1] = SUM_FC_L_FD_H_ED_L_L_L_L.m128i_u32[2];
OUT[1][2] = SUM_FC_L_FD_H_ED_L_L_L.m128i_u32[2];
OUT[1][3] = SUM_FC_L_FD_H_ED_L_L.m128i_u32[2];
}
// Scalar variant of the same computation. The E/F products (EFGH[0], EFGH[1])
// produce OUT[0]; the G/H products (EFGH[2], EFGH[3]) produce OUT[1].
// Every partial product is a 32x32 -> 64-bit multiply; 32-bit column sums are
// chained with the upper half of each sum carried into the next column.
__forceinline void multiply128x128_1( const uint32_t ABCD[4], const uint32_t EFGH[4], uint32_t OUT[2][4] ) noexcept
{
// All sixteen 64-bit partial products: first letter is the EFGH word (E..H =
// index 0..3), second letter is the ABCD word (A..D = index 0..3).
uint64_t ED = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t FD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t GD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t HD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[3] );
// Pairwise sums: low half of one product plus high half of its lower neighbour.
uint64_t SUM_FC_L_FD_H = ( FC & 0xFFFFFFFF ) + ( FD >> 32u );
uint64_t SUM_FB_L_FC_H = ( FB & 0xFFFFFFFF ) + ( FC >> 32u );
uint64_t SUM_FA_L_FB_H = ( FA & 0xFFFFFFFF ) + ( FB >> 32u );
uint64_t SUM_EC_L_ED_H = ( EC & 0xFFFFFFFF ) + ( ED >> 32u );
uint64_t SUM_EB_L_EC_H = ( EB & 0xFFFFFFFF ) + ( EC >> 32u );
uint64_t SUM_EA_L_EB_H = ( EA & 0xFFFFFFFF ) + ( EB >> 32u );
uint64_t SUM_HC_L_HD_H = ( HC & 0xFFFFFFFF ) + ( HD >> 32u );
uint64_t SUM_HB_L_HC_H = ( HB & 0xFFFFFFFF ) + ( HC >> 32u );
uint64_t SUM_HA_L_HB_H = ( HA & 0xFFFFFFFF ) + ( HB >> 32u );
uint64_t SUM_GC_L_GD_H = ( GC & 0xFFFFFFFF ) + ( GD >> 32u );
uint64_t SUM_GB_L_GC_H = ( GB & 0xFFFFFFFF ) + ( GC >> 32u );
uint64_t SUM_GA_L_GB_H = ( GA & 0xFFFFFFFF ) + ( GB >> 32u );
// Column sums and carry chain for the E/F pair. The low half of FD is never used.
uint64_t SUM_FC_L_FD_H_ED_L = SUM_FC_L_FD_H + ( ED & 0xFFFFFFFF );
uint64_t SUM_FB_L_FC_H_EC_L_ED_H = SUM_FB_L_FC_H + SUM_EC_L_ED_H;
uint64_t SUM_FA_L_FB_H_EB_L_EC_H = SUM_FA_L_FB_H + SUM_EB_L_EC_H;
uint64_t SUM_FA_H_EA_L_EB_H = SUM_EA_L_EB_H + ( FA >> 32u );
uint64_t SUM_FC_L_FD_H_ED_L_L = ( SUM_FC_L_FD_H_ED_L >> 32u ) + SUM_FB_L_FC_H_EC_L_ED_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L = ( SUM_FC_L_FD_H_ED_L_L >> 32u ) + SUM_FA_L_FB_H_EB_L_EC_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L_L = ( SUM_FC_L_FD_H_ED_L_L_L >> 32u ) + SUM_FA_H_EA_L_EB_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L_L_L = ( SUM_FC_L_FD_H_ED_L_L_L_L >> 32u ) + ( EA >> 32u );
// Same chain for the G/H pair. The low half of HD is never used.
uint64_t SUM_HC_L_HD_H_GD_L = SUM_HC_L_HD_H + ( GD & 0xFFFFFFFF );
uint64_t SUM_HB_L_HC_H_GC_L_GD_H = SUM_HB_L_HC_H + SUM_GC_L_GD_H;
uint64_t SUM_HA_L_HB_H_GB_L_GC_H = SUM_HA_L_HB_H + SUM_GB_L_GC_H;
uint64_t SUM_HA_H_GA_L_GB_H = SUM_GA_L_GB_H + ( HA >> 32u );
uint64_t SUM_HC_L_HD_H_GD_L_L = ( SUM_HC_L_HD_H_GD_L >> 32u ) + SUM_HB_L_HC_H_GC_L_GD_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L = ( SUM_HC_L_HD_H_GD_L_L >> 32u ) + SUM_HA_L_HB_H_GB_L_GC_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L_L = ( SUM_HC_L_HD_H_GD_L_L_L >> 32u ) + SUM_HA_H_GA_L_GB_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L_L_L = ( SUM_HC_L_HD_H_GD_L_L_L_L >> 32u ) + ( GA >> 32u );
// Each 64-bit sum is narrowed to uint32_t on assignment, keeping its low 32 bits.
OUT[0][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L;
OUT[0][1] = SUM_FC_L_FD_H_ED_L_L_L_L;
OUT[0][2] = SUM_FC_L_FD_H_ED_L_L_L;
OUT[0][3] = SUM_FC_L_FD_H_ED_L_L;
OUT[1][0] = SUM_HC_L_HD_H_GD_L_L_L_L_L;
OUT[1][1] = SUM_HC_L_HD_H_GD_L_L_L_L;
OUT[1][2] = SUM_HC_L_HD_H_GD_L_L_L;
OUT[1][3] = SUM_HC_L_HD_H_GD_L_L;
}
// _umul128 variant: multiplies the two 64-bit words of m by mul[0] (results in
// OUT[0]) and by mul[1] (results in OUT[1]) and stores the upper 128 bits of
// each combined product as four 32-bit words.
// _umul128 returns the low 64 bits of the product and writes the high 64 bits
// through its third argument.
__forceinline void mulShift( const uint64_t* const m, const uint64_t* const mul , uint32_t OUT[2][4]) noexcept
{
uint64_t B0[2];
uint64_t B2[2];
{
// B0 = m[1] * mul[0], B2 = m[0] * mul[0]; index 0 is the low half, 1 the high half.
B0[0] = _umul128( m[1], mul[0], &B0[1] );
B2[0] = _umul128( m[0], mul[0], &B2[1] );
// Middle word: high half of B0 plus low half of B2. B0[0] is not stored.
uint64_t S = B0[1] + B2[0];
OUT[0][2] = S >> 32;
OUT[0][3] = S & 0xFFFFFFFF;
// (S < B2[0]) is 1 exactly when the addition above wrapped, i.e. the carry.
uint64_t M = B2[1] + ( S < B2[0] );
OUT[0][1] = M & 0xFFFFFFFF;
OUT[0][0] = M >> 32;
}
{
// Same steps with mul[1], written to OUT[1].
B0[0] = _umul128( m[1], mul[1], &B0[1] );
B2[0] = _umul128( m[0], mul[1], &B2[1] );
uint64_t S = B0[1] + B2[0];
OUT[1][2] = S >> 32;
OUT[1][3] = S & 0xFFFFFFFF;
uint64_t M = B2[1] + ( S < B2[0] );
OUT[1][1] = M & 0xFFFFFFFF;
OUT[1][0] = M >> 32;
}
}
// Number of iterations of each timed loop.
constexpr uint32_t N = 1 << 28;
// Times the three implementations (A: SIMD, B: _umul128, C: scalar) over N
// iterations each and prints the elapsed milliseconds followed by the three
// checksums.
int main()
{
uint32_t OUT[2][4];
uint32_t ABCD[4] = { 4294967295u, 4294967295u, 4294967295u, 4294967295u };
uint32_t EFGH[4] = { 4294967295u, 4294967295u, 4294967295u, 4294967295u };
// One untimed call with all-ones inputs; its result is overwritten by the loops below.
multiply128x128_1( ABCD, EFGH, OUT );
// Checksums over all output words; printing them keeps the results observable.
uint64_t S_1 = 0u;
uint64_t S_2 = 0u;
uint64_t S_3 = 0u;
auto start_1 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < N; ++i )
{
// Both operands are derived from the loop counter and are identical to each other.
EFGH[0] = i;
EFGH[1] = i;
EFGH[2] = i + 1;
EFGH[3] = i + 1;
ABCD[0] = i;
ABCD[1] = i;
ABCD[2] = i + 1;
ABCD[3] = i + 1;
multiply128x128( ABCD, EFGH, OUT );
S_1 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_1 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_1 = std::chrono::high_resolution_clock::now();
std::cout << "Test A: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_1 - start_1 ).count() << '\n';
auto start_2 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < N; ++i )
{
EFGH[0] = i;
EFGH[1] = i;
EFGH[2] = i + 1;
EFGH[3] = i + 1;
ABCD[0] = i;
ABCD[1] = i;
ABCD[2] = i + 1;
ABCD[3] = i + 1;
// The uint32_t arrays written just above are re-read here as uint64_t.
// NOTE(review): this cast assumes the arrays are suitably aligned for uint64_t
// and that the compiler tolerates the type punning - confirm.
mulShift( reinterpret_cast<const uint64_t*>( ABCD ), reinterpret_cast<const uint64_t*>( EFGH ), OUT );
S_2 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_2 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_2 = std::chrono::high_resolution_clock::now();
std::cout << "Test B: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_2 - start_2 ).count() << '\n';
auto start_3 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < N; ++i )
{
EFGH[0] = i;
EFGH[1] = i;
EFGH[2] = i + 1;
EFGH[3] = i + 1;
ABCD[0] = i;
ABCD[1] = i;
ABCD[2] = i + 1;
ABCD[3] = i + 1;
multiply128x128_1( ABCD, EFGH, OUT );
S_3 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_3 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_3 = std::chrono::high_resolution_clock::now();
std::cout << "Test C: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_3 - start_3 ).count() << '\n';
std::cout << S_1 << " " << S_2 << " " << S_3 << '\n';
}
Why is _umul128 so slow? Did I perhaps make a mistake in my test code above?
My results:
Test A (simd): 4546ms.
Test B (_umul128): 6637ms.
Test C (scalar): 2333ms.
Tested on Windows 10, x64, MSVC 2019
The _umul128 version isn't really that slow but you're gimping it with store-forwarding stalls by messing around with 32-bit arrays that makes MSVC emit terrible asm.
Optimization is defeating your benchmark; the pure C version isn't really that fast.
Especially with the simple input data:
ABCD[0] = EFGH[0] = i;
ABCD[1] = EFGH[1] = i;
ABCD[2] = EFGH[2] = i + 1;
ABCD[3] = EFGH[3] = i + 1;
Initializing both inputs like this creates a huge amount of opportunity for optimization after inlining the pure C version. It does i*i 4 times, and i*(i+1) = i*i + i another 8 times, and also (i+1)*(i+1) 4 times. MSVC isn't dumb and notices this. This is called Common Subexpression Elimination (CSE).
You'll need to come up with a more sophisticated way to fake input if you want to see how slow the pure C really is. Maybe generate ahead of time then loop over memory containing inputs? Setting up inputs from a loop counter costs almost as much as a multiply.
MSVC's asm output confirms that much of the work optimized away for the pure C version. (Godbolt with MSVC 19.22 for x64)
...
$LL10#main:
lea r15, QWORD PTR [rax+1]
mov rcx, r15
mov r9, r15
imul rcx, rax # only 3, not 16, imul instructions.
imul rax, rax # (None appear later in this loop in the ... part)
imul r9, r15
mov edi, ecx
mov r14, rcx
mov r8d, eax
shr r14, 32 ; 00000020H
shr rax, 32 ; 00000020H
...
sub r13, 1
jne $LL10#main
MSVC is bad at optimizing intrinsics and does all 4 mul m64 instructions instead of noticing that ii * i1i1 is done twice.
More importantly, the _umul128 loop is hurt by store-forwarding stalls because it actually stores your array to memory with 32-bit stores and then uses 64-bit loads to feed mul m64.
Also, handling the output in 32-bit chunks just shoots yourself in the foot, introducing extra shifts and mov operations.
This is not complicated: literally just 3 instructions are needed — mul r64 and imul r64, r64, plus an add for the high half. GCC/clang easily emit the right thing, and the x86-64 System V calling convention can return a 128-bit int in registers.
On Godbolt: https://godbolt.org/z/DcZhSl
#include <stdint.h>
#ifdef __GNUC__
// 128-bit unsigned integer type provided as a GCC/clang extension.
using u128 = unsigned __int128;

// Multiplies a 128-bit value by a 64-bit value and returns the low 128 bits
// of the product (the multiplication wraps modulo 2^128).
u128 mul128x64( u128 a, uint64_t b) {
    const u128 widened_b = b;
    const u128 product = a * widened_b;
    return product;
}
#endif
# clang -O3 for the x86-64 System V ABI (Linux)
mul128x64(unsigned __int128, unsigned long): #
mov rax, rdi
imul rsi, rdx
mul rdx
add rdx, rsi
ret
For MSVC we have to do that ourselves, and the calling convention means the result is returned in memory.
#ifdef _MSC_VER
#include <intrin.h>
// 128-bit unsigned value stored as two 64-bit words: u64[0] is the low word,
// u64[1] the high word.
struct u128 { uint64_t u64[2]; };

// Multiplies the 128-bit value (a_hi:a_lo) by the 64-bit value b and returns
// the low 128 bits of the product.
u128 mul128x64( uint64_t a_lo, uint64_t a_hi, uint64_t b)
{
    // Full 64x64 -> 128-bit product of the low word; the upper half carries
    // into the high result word.
    uint64_t carry_into_high;
    const uint64_t low_word = _umul128( a_lo, b, &carry_into_high );

    // Only the low 64 bits of a_hi * b can land inside the 128-bit result.
    const uint64_t high_word = a_hi * b + carry_into_high;

    u128 product;
    product.u64[0] = low_word;
    product.u64[1] = high_word;
    return product;
}
#endif
# MSVC x64 -O2
u128 mul128x64(unsigned __int64,unsigned __int64,unsigned __int64) PROC
mov rax, r9
mul rdx
imul r8, r9
mov QWORD PTR [rcx], rax # store the retval into hidden pointer
mov rax, rcx
add r8, rdx
mov QWORD PTR [rcx+8], r8
ret 0
Your __m128i intrinsics version is unlikely to be a win. Modern x86 (mainstream Intel SnB-family, AMD Ryzen) has 1/clock throughput for mul and imul. (Except Ryzen where widening i/mul r64 has 2c throughput, but still 1/clock for imul r64,r64.)
So overall throughput for a 64 x 128-bit multiply on Sandybridge-family is one per 2 cycles (bottlenecked on port 1), if you implement in C that compiles to asm like this.
Given that you need more than 4 pmuludq instructions to implement a multiply, AVX1 is a non-starter. (Skylake has 0.5c throughput for pmuludq. Sandybridge has 1c throughput, so you'd need to get the job done in 2 pmuludq insns per multiply (on average) to compete with scalar. And that's without considering all the shift / shuffle / add work that needs doing.)
Possibly worth considering on Bulldozer-family where 64-bit scalar multiply is 4c throughput but pmuludq is 1c. (https://agner.org/optimize/) Producing 128 product bits per cycle (two 32x32 => 64-bit products) is better than producing 128 product bits per 4 cycles, if you can get them shifted and added without eating up too many extra cycles.
Again, MSVC is bad at constant-propagation or CSE optimization through intrinsics, so your intrinsics version doesn't benefit from anything.
Your test code also uses _mm_set1_epi32( ) from scalar integer loop variables, requiring vmovd and vpshufd instructions.
And you get scalar store / vector reload for the lddqu intrinsics on those arrays, so again you have store-forwarding stalls.
The only hope for this being good with SSE2 or AVX1 is if your data comes from memory, not registers. Or if you can keep your data in vector registers for a long time, not constantly moving it back and forth. Especially on Bulldozer-family where int <-> SIMD has high latency.

AES 256-cbc encryption C++ using OpenSSL

I am trying to create a function that takes a string key (I have another algorithm to generate the key) and a message string. The function should encrypt and decrypt the text using AES-256-CBC from the OpenSSL library.
#define AES_KEYLENGTH 256
string cipher_AES(string key, string message);
int main(int argc, char* argv[])
{
cipher_AES("115792089237316195423570985008687907853269984665640564039457583884239776304164", "Hello, how are you, you mad?");
return 0;
}
// Prints `len` bytes starting at `pv` to stdout as two-digit uppercase hex
// values, each followed by a space, then a newline. A null pointer prints the
// text "NULL" instead of any bytes.
static void hex_print(const void* pv, size_t len)
{
    if (pv != NULL)
    {
        const unsigned char* bytes = (const unsigned char*)pv;
        for (size_t idx = 0; idx < len; ++idx)
        {
            printf("%02X ", bytes[idx]);
        }
    }
    else
    {
        printf("NULL");
    }
    printf("\n");
}
/* Computes the ciphertext from plaintext and key using the AES256-CBC algorithm.
 *
 * key      Key material. The first AES_KEYLENGTH/8 bytes are used; a shorter
 *          key is zero-padded, a longer one is truncated.
 * message  Plaintext. A trailing partial block is zero-padded by
 *          AES_cbc_encrypt (this is NOT PKCS#7 padding).
 *
 * Returns the encryption output buffer as a string. Its length is the message
 * length rounded up to the NEXT multiple of AES_BLOCK_SIZE (so a message that
 * is already block-aligned yields one extra block of zero bytes, exactly as
 * the buffer was sized before).
 *
 * Side effects: prints the plaintext, the ciphertext buffer and the round-trip
 * decrypted bytes in hex to stdout.
 *
 * NOTE(review): uses an all-zero IV and the low-level AES_* API, which OpenSSL
 * 3.0 marks as deprecated in favour of the EVP interface; fine for a demo,
 * not for production use.
 */
string cipher_AES(string key, string message)
{
    const size_t inputslength = message.length();

    /* Key buffer holds exactly the number of bytes AES_set_*_key will read. */
    unsigned char aes_key[AES_KEYLENGTH / 8];
    memset(aes_key, 0, sizeof(aes_key));
    memcpy(aes_key, key.data(), key.size() < sizeof(aes_key) ? key.size() : sizeof(aes_key));

    /* The plaintext is read straight from the string; no copy is needed. */
    const unsigned char* aes_input = reinterpret_cast<const unsigned char*>(message.data());

    /* init vector */
    unsigned char iv[AES_BLOCK_SIZE];
    memset(iv, 0x00, AES_BLOCK_SIZE);

    /* encslength: size of the returned buffer (kept as before).
     * cipherlength: number of ciphertext bytes AES_cbc_encrypt really produces. */
    const size_t encslength = ((inputslength + AES_BLOCK_SIZE) / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;
    const size_t cipherlength = ((inputslength + AES_BLOCK_SIZE - 1) / AES_BLOCK_SIZE) * AES_BLOCK_SIZE;

    /* Buffers for encryption and decryption, zero-initialised. Both are sized
     * to encslength so that decrypting whole blocks can never overrun. */
    string enc_out(encslength, '\0');
    string dec_out(encslength, '\0');
    unsigned char* enc_bytes = reinterpret_cast<unsigned char*>(&enc_out[0]);
    unsigned char* dec_bytes = reinterpret_cast<unsigned char*>(&dec_out[0]);

    AES_KEY enc_key, dec_key;
    AES_set_encrypt_key(aes_key, AES_KEYLENGTH, &enc_key);
    AES_cbc_encrypt(aes_input, enc_bytes, inputslength, &enc_key, iv, AES_ENCRYPT);

    /* AES_cbc_encrypt overwrites iv with the last ciphertext block, so it must
     * be reset to the original value before decrypting; otherwise the first
     * plaintext block comes out wrong. */
    memset(iv, 0x00, AES_BLOCK_SIZE);
    AES_set_decrypt_key(aes_key, AES_KEYLENGTH, &dec_key);
    AES_cbc_encrypt(enc_bytes, dec_bytes, cipherlength, &dec_key, iv, AES_DECRYPT);

    printf("original:\t");
    hex_print(aes_input, inputslength);
    printf("encrypt:\t");
    hex_print(enc_bytes, encslength);
    printf("decrypt:\t");
    hex_print(dec_bytes, inputslength);

    return enc_out;
}
In the output it seems that something is the same but not all of it:
original: 48 65 6C 6C 6F 2C 20 68 6F 77 20 61 72 65 20 79 6F 75 2C 20 79 6F 75 20 69 64 69 6F 74 3F
encrypt: 25 C3 B4 4B 92 68 2E DA 61 B6 AB 19 97 D3 90 8A 5F 8B 3C 4B 78 13 FC E1 3A AF 2C B5 3F C8 2B D7
decrypt: 17 EE 50 27 17 3F DC 89 55 D8 0C D4 4D AD 0B AE 6F 75 2C 20 79 6F 75 20 69 64 69 6F 74 3F
Looking at your data, the first block (16 bytes) is wrong but following blocks are correct. This suggests that the wrong IV is being used when decrypting. A little testing (printing the IV before and after the first call to AES_cbc_encrypt) shows that the IV does indeed change during this call.
A little poking around the OpenSSL source shows that it changes the IV parameter to be the last block of the ciphertext when encrypting.
Resetting the IV before the decryption fixes it though, and you get the correct plaintext regenerated:
memset(iv, 0x00, AES_BLOCK_SIZE);
AES_cbc_encrypt(enc_out, dec_out, encslength, &dec_key, iv, AES_DECRYPT);
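Reading the source code of cbc128.c in OpenSSL, there is a loop in which iv points first at the caller's IV and then, after each block, at the block of output just produced. When the loop ends, that last output block is copied back into ivec, so the caller's IV buffer is overwritten.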
/*
 * CBC-mode encryption: each 16-byte input block is XORed with the previous
 * output block (the caller's ivec for the first block) and then passed through
 * the block function `block` with `key`.
 *
 * in    - input bytes
 * out   - output buffer; written in whole 16-byte blocks
 * len   - number of input bytes; 0 returns immediately without touching ivec
 * key   - opaque key handed to `block`
 * ivec  - 16-byte IV; OVERWRITTEN on return with the last output block
 * block - block function, called as (*block)(out, out, key)
 */
void CRYPTO_cbc128_encrypt(const unsigned char *in, unsigned char *out,
size_t len, const void *key,
unsigned char ivec[16], block128_f block)
{
size_t n;
/* iv tracks the chaining value: the caller's IV first, then the latest output block. */
const unsigned char *iv = ivec;
if (len == 0)
return;
/* The two whole-block loops below are compiled out when OPENSSL_SMALL_FOOTPRINT is defined. */
#if !defined(OPENSSL_SMALL_FOOTPRINT)
/* Byte-wise XOR when STRICT_ALIGNMENT is set and any of the three pointers
 * is not a multiple of sizeof(size_t). */
if (STRICT_ALIGNMENT &&
((size_t)in | (size_t)out | (size_t)ivec) % sizeof(size_t) != 0) {
while (len >= 16) {
for (n = 0; n < 16; ++n)
out[n] = in[n] ^ iv[n];
(*block) (out, out, key);
/* The block just written becomes the chaining value for the next one. */
iv = out;
len -= 16;
in += 16;
out += 16;
}
} else {
/* Otherwise XOR one size_t-sized word at a time. */
while (len >= 16) {
for (n = 0; n < 16; n += sizeof(size_t))
*(size_t_aX *)(out + n) =
*(size_t_aX *)(in + n) ^ *(size_t_aX *)(iv + n);
(*block) (out, out, key);
iv = out;
len -= 16;
in += 16;
out += 16;
}
}
#endif
/* Handles whatever is left: a trailing partial block after the loops above,
 * or the entire input when those loops are compiled out. */
while (len) {
for (n = 0; n < 16 && n < len; ++n)
out[n] = in[n] ^ iv[n];
/* Positions past the end of the input take iv[n] unchanged, i.e. the
 * missing input bytes are treated as zeros. */
for (; n < 16; ++n)
out[n] = iv[n];
(*block) (out, out, key);
iv = out;
/* Stop before subtracting so that len (a size_t) cannot wrap below zero. */
if (len <= 16)
break;
len -= 16;
in += 16;
out += 16;
}
/* Hand the final chaining value back to the caller through ivec. */
memcpy(ivec, iv, 16);
}

How do I print bytes as hexadecimal?

I know in C# you can use String.Format method. But how do you do this in C++? Is there a function that allows me to convert a byte to a Hex?? Just need to convert a 8 byte long data to Hex, how do I do that?
If you want to use C++ streams rather than C functions, you can do the following:
// Sample values to print, first in decimal and then in hexadecimal.
int ar[] = { 20, 30, 40, 50, 60, 70, 80, 90 };
// Element count derived from the array size.
const int siz_ar = sizeof(ar) / sizeof(int);
// Pass 1: default formatting.
for (int i = 0; i < siz_ar; ++i)
cout << ar[i] << " ";
cout << endl;
// Pass 2: hexadecimal, zero-padded to two digits. setw only affects the next
// insertion, so it is repeated for every element; hex and setfill stay in
// effect on cout after this loop.
for (int i = 0; i < siz_ar; ++i)
cout << hex << setfill('0') << setw(2) << ar[i] << " ";
cout << endl;
Very simple.
Output:
20 30 40 50 60 70 80 90
14 1e 28 32 3c 46 50 5a
Well, you can convert one byte (unsigned char) at a time into an array like so:
/* 8 bytes -> 16 hex digits, plus one byte for the terminating NUL. */
char buffer [17];
buffer[16] = 0;
/* NOTE(review): `j` and `data` are not declared in this fragment; presumably
 * `data` holds at least 8 unsigned char values — confirm against the caller. */
for(j = 0; j < 8; j++)
/* Writes two uppercase hex digits at offset 2*j (sprintf also writes a NUL
 * right after them, which the next iteration overwrites). */
sprintf(&buffer[2*j], "%02X", data[j]);
C:
/*
 * Writes `title` on its own line to stdout, then the `buf_len` bytes of `buf`
 * as two-digit uppercase hex values. Bytes are separated by a space, except
 * that every 16th byte is followed by "\r\n" instead.
 */
static void print_buf(const char *title, const unsigned char *buf, size_t buf_len)
{
    size_t pos;

    fprintf(stdout, "%s\n", title);
    for (pos = 0; pos < buf_len; ++pos) {
        const char *separator = " ";
        if ((pos + 1) % 16 == 0)
            separator = "\r\n";
        fprintf(stdout, "%02X%s", buf[pos], separator);
    }
}
C++:
// Writes `title` on its own line, then `dataLen` bytes of `data` as two-digit
// lowercase hex values, followed by a newline (the stream is flushed).
//
// out     - destination stream
// title   - heading printed before the bytes
// data    - bytes to dump
// dataLen - number of bytes in `data`
// format  - when true, each byte is followed by a space, or by '\n' after
//           every 16th byte; when false, the digits are written back to back
//
// The stream's format flags and fill character are restored before returning,
// so later output on `out` is not left in hex mode with '0' padding.
void print_bytes(std::ostream& out, const char *title, const unsigned char *data, size_t dataLen, bool format = true) {
    const std::ios_base::fmtflags savedFlags = out.flags();
    const char savedFill = out.fill();

    out << title << std::endl;
    out << std::hex << std::setfill('0');
    for (size_t i = 0; i < dataLen; ++i) {
        // setw applies to the next insertion only, so it is set per byte.
        out << std::setw(2) << static_cast<int>(data[i]);
        if (format) {
            out << (((i + 1) % 16 == 0) ? "\n" : " ");
        }
    }
    out << std::endl;

    out.flags(savedFlags);
    out.fill(savedFill);
}
You can do it with C++20 std::format which is similar to String.Format in C#:
std::string s = std::format("{:x}", std::byte(42)); // s == 2a
Until std::format is widely available you can use the {fmt} library, which std::format is based on (godbolt):
std::string s = fmt::format("{:x}", std::byte(42)); // s == 2a
Disclaimer: I'm the author of {fmt} and C++20 std::format.
Printing arbitrary structures in modern C++
All answers so far only tell you how to print an array of integers, but we can also print any arbitrary structure, given that we know its size. The example below creates such structure and iterates a pointer through its bytes, printing them to the output:
#include <iostream>
#include <iomanip>
#include <cstring>
using std::cout;
using std::endl;
using std::hex;
using std::setfill;
using std::setw;
using u64 = unsigned long long;
using u16 = unsigned short;
using f64 = double;
// Small header embedded at the start of Example.
struct Header {
// Version number of the message (main() sets it to 5).
u16 version;
// Size field; main() fills it with sizeof(Example) - sizeof(Header).
u16 msgSize;
};
// Sample structure with members of different sizes, used to show a raw byte
// dump. NOTE(review): the compiler may insert padding between members; the
// exact layout is platform-dependent.
struct Example {
Header header;
u64 someId;
u64 anotherId;
bool isFoo;
bool isBar;
f64 floatingPointValue;
};
int main () {
Example example;
// fill with zeros so padding regions don't contain garbage
memset(&example, 0, sizeof(Example));
example.header.version = 5;
example.header.msgSize = sizeof(Example) - sizeof(Header);
example.someId = 0x1234;
example.anotherId = 0x5678;
example.isFoo = true;
example.isBar = true;
example.floatingPointValue = 1.1;
cout << hex << setfill('0'); // needs to be set only once
auto *ptr = reinterpret_cast<unsigned char *>(&example);
for (int i = 0; i < sizeof(Example); i++, ptr++) {
if (i % sizeof(u64) == 0) {
cout << endl;
}
cout << setw(2) << static_cast<unsigned>(*ptr) << " ";
}
return 0;
}
And here's the output:
05 00 24 00 00 00 00 00
34 12 00 00 00 00 00 00
78 56 00 00 00 00 00 00
01 01 00 00 00 00 00 00
9a 99 99 99 99 99 f1 3f
Notice this example also illustrates memory alignment working. We see version occupying 2 bytes (05 00), followed by msgSize with 2 more bytes (24 00) and then 4 bytes of padding, after which comes someId (34 12 00 00 00 00 00 00) and anotherId (78 56 00 00 00 00 00 00). Then isFoo, which occupies 1 byte (01) and isBar, another byte (01), followed by 6 bytes of padding, finally ending with the IEEE 754 standard representation of the double field floatingPointValue.
Also notice that all values are represented as little endian (least significant bytes come first), since this was compiled and run on an Intel platform.
This is a modified version of the Nibble to Hex method
/*
 * Converts `infoLength` bytes of `info` into a newly allocated, NUL-terminated
 * string of uppercase hex digits (two per byte).
 *
 * info       - bytes to convert; may be NULL
 * infoLength - number of bytes in `info`
 * buffer     - out-parameter receiving the malloc'd string; the caller owns it
 *              and must free() it. Set to NULL when `info` is NULL,
 *              `infoLength` is 0, or the allocation fails.
 *
 * A NULL `buffer` is ignored (nothing is written).
 */
void hexArrayToStr(unsigned char* info, unsigned int infoLength, char **buffer) {
    static const char nibbleToHex[] = "0123456789ABCDEF";
    size_t hexLength;
    size_t i;
    char *out;

    if (buffer == NULL) {
        return;
    }
    *buffer = NULL;
    if (info == NULL || infoLength == 0) {
        return;
    }

    /* Do the doubling in size_t so it cannot wrap in unsigned int arithmetic
     * (on targets where size_t is wider than unsigned int). */
    hexLength = (size_t) infoLength * 2;
    out = (char *) malloc(hexLength + 1);
    if (out == NULL) {
        return; /* allocation failed; *buffer stays NULL */
    }

    for (i = 0; i < infoLength; i++) {
        out[2 * i] = nibbleToHex[info[i] >> 4];       /* high nibble */
        out[2 * i + 1] = nibbleToHex[info[i] & 0x0F]; /* low nibble */
    }
    out[hexLength] = '\0';
    *buffer = out;
}
I don't know of a better way than:
// Converts every byte of byData into two uppercase hex digits, stored in a
// heap-allocated, NUL-terminated buffer.
unsigned char byData[xxx]; // xxx: placeholder for the real array size
int nLength = sizeof(byData) * 2;
char *pBuffer = new char[nLength + 1]; // the owner must delete[] pBuffer
pBuffer[nLength] = 0;
for (size_t i = 0; i < sizeof(byData); i++)
{
    // sprintf needs the ADDRESS of the destination slot, not the char stored
    // there. Each call also writes a NUL after the two digits; the next
    // iteration overwrites it, and the last one lands on pBuffer[nLength].
    sprintf(&pBuffer[2 * i], "%02X", byData[i]);
}
You can speed it up by using a Nibble to Hex method
// Same conversion as a sprintf-per-byte loop, but using a 16-entry lookup
// table indexed by each 4-bit nibble.
unsigned char byData[xxx]; // xxx: placeholder for the real array size
// Must be a pointer to the digit string (not a single char), and its name has
// to match the uses below.
const char *pszNibbleToHex = "0123456789ABCDEF";
int nLength = sizeof(byData) * 2;
char *pBuffer = new char[nLength + 1]; // the owner must delete[] pBuffer
pBuffer[nLength] = 0;
for (size_t i = 0; i < sizeof(byData); i++)
{
    // High nibble: divide by 16.
    int nNibble = byData[i] >> 4;
    pBuffer[2 * i] = pszNibbleToHex[nNibble];
    // Low nibble: remainder modulo 16.
    nNibble = byData[i] & 0x0F;
    pBuffer[2 * i + 1] = pszNibbleToHex[nNibble];
}
Yet another answer, in case the byte array is defined as char[], uppercase and separated by spaces.
// Prints `len` bytes of `data` to std::cout as two-digit uppercase hex values,
// each followed by a space, then ends the line (flushing the stream).
//
// Both the format flags and the fill character of std::cout are restored
// afterwards. The fill character is not part of the format flags, so it has
// to be saved and restored separately.
void debugArray(const unsigned char* data, size_t len) {
    const std::ios_base::fmtflags savedFlags = std::cout.flags();
    const char savedFill = std::cout.fill();

    std::cout << std::uppercase << std::hex << std::setfill('0');
    for (size_t i = 0; i < len; ++i)
        std::cout << std::setw(2) << static_cast<int>(data[i]) << " ";
    std::cout << std::endl;

    std::cout.flags(savedFlags);
    std::cout.fill(savedFill);
}
Example:
unsigned char test[]={0x01, 0x02, 0x03, 0x04, 0x05, 0x06};
debugArray(test, sizeof(test));
Output:
01 02 03 04 05 06
Use C++ streams and restore state afterwards
This is a variation of How do I print bytes as hexadecimal? but:
runnable
considering that this alters the state of cout and trying to restore it at the end as asked at: Restore the state of std::cout after manipulating it
main.cpp
#include <iomanip>
#include <iostream>
// Prints a small array three times: with default formatting, with hex
// formatting, and again after the saved formatting state has been restored.
int main() {
    const int values[] = {0, 0x8, 0x10, 0x18};

    // Prints every value followed by a space, then ends the line.
    const auto dump_values = [&values]() {
        for (const int value : values)
            std::cout << value << " ";
        std::cout << std::endl;
    };

    // Sanity check decimal print.
    dump_values();

    // Snapshot the current formatting state of std::cout.
    std::ios saved_format(nullptr);
    saved_format.copyfmt(std::cout);

    // Hex print. The width set here applies only to the first value inserted;
    // hex and the fill character persist for the rest of the loop.
    std::cout << std::hex << std::setfill('0') << std::setw(2);
    dump_values();

    // Restore default formatting and check that it took effect.
    std::cout.copyfmt(saved_format);
    dump_values();
}
Compile and run:
g++ -o main.out -std=c++11 main.cpp
./main.out
Output:
0 8 16 24
00 8 10 18
0 8 16 24
Tested on Ubuntu 16.04, GCC 6.4.0.
Another C++17 alternative because why not!
// Hex output with '0' as fill character; both settings persist on std::cout.
std::cout<<std::hex<<std::setfill('0');
// Anonymous struct with default member initializers, used as sample data.
struct {
std::uint16_t first{666};
std::array<char,4> second{'a','b','c','d'};
} my_struct;
// View the object as raw bytes and copy all sizeof(my_struct) of them into a vector.
auto ptr = reinterpret_cast<std::byte*>(&my_struct);
auto buffer = std::vector<std::byte>(ptr, ptr + sizeof(my_struct));
// Print each byte as an integer, two characters wide, followed by a space.
std::for_each(std::begin(buffer),std::end(buffer),[](auto byte){
std::cout<<std::setw(2)<<std::to_integer<int>(byte)<<' ';
});
Executable code here.