Some years ago I needed a way to do some basic 128 bit integer math with Cuda:
128 bit integer on cuda?.
Now I am having the same problem, but this time I need to run some basic 128 bit arithmetics (sums, bitshifts and multiplications) on a 32 bit embedded system (Intel Edison) that does not support 128 bits of any kind. There are, however, 64 bit integers supported directly (unsigned long long int).
I tried naively to use the asm code that was answered to me last time on the CPU, but I got a bunch of errors. I am really not experienced with asm, so: what is the most efficient way, having 64 bit integers, to implement additions, multiplications and bit shifting in 128 bits?
Update: Since the OP hasn't accepted an answer yet <hint><hint>, I've attached a bit more code.
Using the libraries discussed above is probably a good idea. While you might only need a few functions today, eventually you may find that you need one more. Then one more after that. Until eventually you end up writing, debugging and maintaining your own 128bit math library. Which is a waste of your time and effort.
That said. If you are determined to roll your own:
1) The cuda question you asked previously already has c code for multiplication. Was there some problem with it?
2) The shift probably won't benefit from using asm, so a c solution makes sense to me here as well. Although if performance is really an issue here, I'd see if the Edison supports SHLD/SHRD, which might make this a bit faster. Otherwise, m Maybe an approach like this?
my_uint128_t lshift_uint128 (const my_uint128_t a, int b)
{
my_uint128_t res;
if (b < 32) {
res.x = a.x << b;
res.y = (a.y << b) | (a.x >> (32 - b));
res.z = (a.z << b) | (a.y >> (32 - b));
res.w = (a.w << b) | (a.z >> (32 - b));
} elseif (b < 64) {
...
}
return res;
}
Update: Since it appears that the Edison may support SHLD/SHRD, here's an alternative which might be more performant than the 'c' code above. As with all code purporting to be faster, you should test it.
inline
unsigned int __shld(unsigned int into, unsigned int from, unsigned int c)
{
unsigned int res;
if (__builtin_constant_p(into) &&
__builtin_constant_p(from) &&
__builtin_constant_p(c))
{
res = (into << c) | (from >> (32 - c));
}
else
{
asm("shld %b3, %2, %0"
: "=rm" (res)
: "0" (into), "r" (from), "ic" (c)
: "cc");
}
return res;
}
inline
unsigned int __shrd(unsigned int into, unsigned int from, unsigned int c)
{
unsigned int res;
if (__builtin_constant_p(into) &&
__builtin_constant_p(from) &&
__builtin_constant_p(c))
{
res = (into >> c) | (from << (32 - c));
}
else
{
asm("shrd %b3, %2, %0"
: "=rm" (res)
: "0" (into), "r" (from), "ic" (c)
: "cc");
}
return res;
}
my_uint128_t lshift_uint128 (const my_uint128_t a, unsigned int b)
{
my_uint128_t res;
if (b < 32) {
res.x = a.x << b;
res.y = __shld(a.y, a.x, b);
res.z = __shld(a.z, a.y, b);
res.w = __shld(a.w, a.z, b);
} else if (b < 64) {
res.x = 0;
res.y = a.x << (b - 32);
res.z = __shld(a.y, a.x, b - 32);
res.w = __shld(a.z, a.y, b - 32);
} else if (b < 96) {
res.x = 0;
res.y = 0;
res.z = a.x << (b - 64);
res.w = __shld(a.y, a.x, b - 64);
} else if (b < 128) {
res.x = 0;
res.y = 0;
res.z = 0;
res.w = a.x << (b - 96);
} else {
memset(&res, 0, sizeof(res));
}
return res;
}
my_uint128_t rshift_uint128 (const my_uint128_t a, unsigned int b)
{
my_uint128_t res;
if (b < 32) {
res.x = __shrd(a.x, a.y, b);
res.y = __shrd(a.y, a.z, b);
res.z = __shrd(a.z, a.w, b);
res.w = a.w >> b;
} else if (b < 64) {
res.x = __shrd(a.y, a.z, b - 32);
res.y = __shrd(a.z, a.w, b - 32);
res.z = a.w >> (b - 32);
res.w = 0;
} else if (b < 96) {
res.x = __shrd(a.z, a.w, b - 64);
res.y = a.w >> (b - 64);
res.z = 0;
res.w = 0;
} else if (b < 128) {
res.x = a.w >> (b - 96);
res.y = 0;
res.z = 0;
res.w = 0;
} else {
memset(&res, 0, sizeof(res));
}
return res;
}
3) The addition may benefit from asm. You could try this:
struct my_uint128_t
{
unsigned int x;
unsigned int y;
unsigned int z;
unsigned int w;
};
my_uint128_t add_uint128 (const my_uint128_t a, const my_uint128_t b)
{
my_uint128_t res;
asm ("addl %5, %[resx]\n\t"
"adcl %7, %[resy]\n\t"
"adcl %9, %[resz]\n\t"
"adcl %11, %[resw]\n\t"
: [resx] "=&r" (res.x), [resy] "=&r" (res.y),
[resz] "=&r" (res.z), [resw] "=&r" (res.w)
: "%0"(a.x), "irm"(b.x),
"%1"(a.y), "irm"(b.y),
"%2"(a.z), "irm"(b.z),
"%3"(a.w), "irm"(b.w)
: "cc");
return res;
}
I just dashed this off, so use at your own risk. I don't have an Edison, but this works with x86.
Update: If you are just doing accumulation (think to += from instead of the code above which is c = a + b), this code might serve you better:
inline
void addto_uint128 (my_uint128_t *to, const my_uint128_t from)
{
asm ("addl %[fromx], %[tox]\n\t"
"adcl %[fromy], %[toy]\n\t"
"adcl %[fromz], %[toz]\n\t"
"adcl %[fromw], %[tow]\n\t"
: [tox] "+&r"(to->x), [toy] "+&r"(to->y),
[toz] "+&r"(to->z), [tow] "+&r"(to->w)
: [fromx] "irm"(from.x), [fromy] "irm"(from.y),
[fromz] "irm"(from.z), [fromw] "irm"(from.w)
: "cc");
}
If using an external library is an option then have a look at this question. You can use TTMath which is a very simple header for big precision math. On 32-bit architectures ttmath:UInt<4> will create a 128-bit int type with four 32-bit limbs. Some other alternatives are (u)int128_t in Boost.Multiprecision or calccrypto/uint128_t
If you must write it your own then there are already a lot of solutions on SO and I'll summarize them here
For addition and subtraction, it's very easy and straightforward, simply add/subtract the words (which big int libraries often called limbs) from the lowest significant to higher significant, with carry of course.
typedef struct INT128 {
uint64_t H, L;
} my_uint128_t;
inline my_uint128_t add(my_uint128_t a, my_uint128_t b)
{
my_uint128_t c;
c.L = a.L + b.L;
c.H = a.H + b.H + (c.L < a.L); // c = a + b
return c;
}
The assembly output can be checked with Compiler Explorer
The compilers can already generate efficient code for double-word operations, but many aren't smart enough to use "add with carry" when compiling multi-word operations from high level languages as you can see in the question efficient 128-bit addition using carry flag. Therefore using 2 long longs like above will make it not only more readable but also easier for the compiler to emit a little more efficient code.
If that still doesn't suit your performance requirement, you must use intrinsic or write it in assembly. To add the 128-bit value stored in bignum to the 128-bit value in {eax, ebx, ecx, edx} you can use the following code
add edx, [bignum]
adc ecx, [bignum+4]
adc ebx, [bignum+8]
adc eax, [bignum+12]
The equivalent intrinsic will be like this for Clang
unsigned *x, *y, *z, carryin=0, carryout;
z[0] = __builtin_addc(x[0], y[0], carryin, &carryout);
carryin = carryout;
z[1] = __builtin_addc(x[1], y[1], carryin, &carryout);
carryin = carryout;
z[2] = __builtin_addc(x[2], y[2], carryin, &carryout);
carryin = carryout;
z[3] = __builtin_addc(x[3], y[3], carryin, &carryout);
You need to change the intrinsic to the one supported by your compiler, for example __builtin_uadd_overflow in gcc, or _addcarry_u32 for MSVC and ICC
For more information read these
Working with Big Numbers Using x86 Instructions
How can I add and subtract 128 bit integers in C or C++ if my compiler does not support them?
Producing good add with carry code from clang
multi-word addition using the carry flag
For bit shifts you can find the C solution in the question Bitwise shift operation on a 128-bit number. This is a simple left shift but you can unroll the recursive call for more performance
void shiftl128 (
unsigned int& a,
unsigned int& b,
unsigned int& c,
unsigned int& d,
size_t k)
{
assert (k <= 128);
if (k >= 32) // shifting a 32-bit integer by more than 31 bits is "undefined"
{
a=b;
b=c;
c=d;
d=0;
shiftl128(a,b,c,d,k-32);
}
else
{
a = (a << k) | (b >> (32-k));
b = (b << k) | (c >> (32-k));
c = (c << k) | (d >> (32-k));
d = (d << k);
}
}
The assembly for less-than-32-bit shifts can be found in the question 128-bit shifts using assembly language?
shld edx, ecx, cl
shld ecx, ebx, cl
shld ebx, eax, cl
shl eax, cl
Right shifts can be implemented similarly, or just copy from the above linked question
Multiplication and divisions are a lot more complex and you can reference the solution in the question Efficient Multiply/Divide of two 128-bit Integers on x86 (no 64-bit):
class int128_t
{
uint32_t dw3, dw2, dw1, dw0;
// Various constrctors, operators, etc...
int128_t& operator*=(const int128_t& rhs) __attribute__((always_inline))
{
int128_t Urhs(rhs);
uint32_t lhs_xor_mask = (int32_t(dw3) >> 31);
uint32_t rhs_xor_mask = (int32_t(Urhs.dw3) >> 31);
uint32_t result_xor_mask = (lhs_xor_mask ^ rhs_xor_mask);
dw0 ^= lhs_xor_mask;
dw1 ^= lhs_xor_mask;
dw2 ^= lhs_xor_mask;
dw3 ^= lhs_xor_mask;
Urhs.dw0 ^= rhs_xor_mask;
Urhs.dw1 ^= rhs_xor_mask;
Urhs.dw2 ^= rhs_xor_mask;
Urhs.dw3 ^= rhs_xor_mask;
*this += (lhs_xor_mask & 1);
Urhs += (rhs_xor_mask & 1);
struct mul128_t
{
int128_t dqw1, dqw0;
mul128_t(const int128_t& dqw1, const int128_t& dqw0): dqw1(dqw1), dqw0(dqw0){}
};
mul128_t data(Urhs,*this);
asm volatile(
"push %%ebp \n\
movl %%eax, %%ebp \n\
movl $0x00, %%ebx \n\
movl $0x00, %%ecx \n\
movl $0x00, %%esi \n\
movl $0x00, %%edi \n\
movl 28(%%ebp), %%eax #Calc: (dw0*dw0) \n\
mull 12(%%ebp) \n\
addl %%eax, %%ebx \n\
adcl %%edx, %%ecx \n\
adcl $0x00, %%esi \n\
adcl $0x00, %%edi \n\
movl 24(%%ebp), %%eax #Calc: (dw1*dw0) \n\
mull 12(%%ebp) \n\
addl %%eax, %%ecx \n\
adcl %%edx, %%esi \n\
adcl $0x00, %%edi \n\
movl 20(%%ebp), %%eax #Calc: (dw2*dw0) \n\
mull 12(%%ebp) \n\
addl %%eax, %%esi \n\
adcl %%edx, %%edi \n\
movl 16(%%ebp), %%eax #Calc: (dw3*dw0) \n\
mull 12(%%ebp) \n\
addl %%eax, %%edi \n\
movl 28(%%ebp), %%eax #Calc: (dw0*dw1) \n\
mull 8(%%ebp) \n\
addl %%eax, %%ecx \n\
adcl %%edx, %%esi \n\
adcl $0x00, %%edi \n\
movl 24(%%ebp), %%eax #Calc: (dw1*dw1) \n\
mull 8(%%ebp) \n\
addl %%eax, %%esi \n\
adcl %%edx, %%edi \n\
movl 20(%%ebp), %%eax #Calc: (dw2*dw1) \n\
mull 8(%%ebp) \n\
addl %%eax, %%edi \n\
movl 28(%%ebp), %%eax #Calc: (dw0*dw2) \n\
mull 4(%%ebp) \n\
addl %%eax, %%esi \n\
adcl %%edx, %%edi \n\
movl 24(%%ebp), %%eax #Calc: (dw1*dw2) \n\
mull 4(%%ebp) \n\
addl %%eax, %%edi \n\
movl 28(%%ebp), %%eax #Calc: (dw0*dw3) \n\
mull (%%ebp) \n\
addl %%eax, %%edi \n\
pop %%ebp \n"
:"=b"(this->dw0),"=c"(this->dw1),"=S"(this->dw2),"=D"(this->dw3)
:"a"(&data):"%ebp");
dw0 ^= result_xor_mask;
dw1 ^= result_xor_mask;
dw2 ^= result_xor_mask;
dw3 ^= result_xor_mask;
return (*this += (result_xor_mask & 1));
}
};
You can also find a lot of related questions with the 128bit tag
Related
I have compiled this code and got Run-Time Check Failure #2 - Stack around the variable 'result' was corrupted exception. But when I changed result array size from 2 to 4 exception disappeared. Can you explain why this happens?
Sorry, if you found this question too basic.
#include "stdafx.h"
string get_cpu_name()
{
uint32_t data[4] = { 0 };
_asm
{
cpuid;
mov data[0], ebx;
mov data[4], edx;
mov data[8], ecx;
}
return string((const char *)data);
}
void assembler()
{
cout << "CPU is " << get_cpu_name() << endl;
float f1[] = { 1.f , 22.f};
float f2[] = { 5.f , 3.f };
float result[2] = { 0.f };
/*float f1[] = { 1.f , 22.f , 1.f , 22.f };
float f2[] = { 5.f , 3.f , 1.f , 22.f };
float result[4] = { 0.f };*/
_asm
{
movups xmm1, f1;
movups xmm2, f2;
mulps xmm1, xmm2;
movups result, xmm1;
}
/*for (size_t i = 0; i < 4; i++)*/
for (size_t i = 0; i < 2; i++)
{
cout << result[i] << "\t";
}
cout << endl;
}
int main()
{
assembler();
getchar();
return 0;
}
The movups instruction writes 128 bits (16 bytes) to memory. You are writing this to the location of an 8-byte array (2*4 bytes, or 64 bits). The 8 bytes after the array will also be written to.
You should make sure there are at least 16 bytes of space to write the result, or you should make sure to write less than 16 bytes there.
I'm pondering at how to speed up bit testing in the following routine:
void histSubtractFromBits(uint64* cursor, uint16* hist){
//traverse each bit of the 256-bit-long bitstring by splitting up into 4 bitsets
std::bitset<64> a(*cursor);
std::bitset<64> b(*(cursor+1));
std::bitset<64> c(*(cursor+2));
std::bitset<64> d(*(cursor+3));
for(int bit = 0; bit < 64; bit++){
hist[bit] -= a.test(bit);
}
for(int bit = 0; bit < 64; bit++){
hist[bit+64] -= b.test(bit);
}
for(int bit = 0; bit < 64; bit++){
hist[bit+128] -= c.test(bit);
}
for(int bit = 0; bit < 64; bit++){
hist[bit+192] -= d.test(bit);
}
}
The actual gcc implementation does a range-check for the bit argument, then &-s with a bitmask. I could do it without the bitsets and with my own bit-shifting / masking, but I'm fairly certain that won't yield any significant speedup (tell me if I'm wrong and why).
I'm not really familiar with the x86-64 assembly, but I am aware of a certain bit test instruction, and I am aware that it's theoretically possible to do inline assembly with gcc.
1) Do you think it at all worthwhile to write an inline-assembly analogue for the above code?
2) If yes, then how would I go about doing it, i.e. could you show me some basic starter code / samples to point me in the right direction?
As far as I can tell, you basically iterate over each bit. As such, I'd imagine simply shifting and masking off the LSB every time should provide good performance. Something like:
uint64_t a = *cursor;
for(int bit = 0; a != 0; bit++, a >>= 1) {
hist[bit] -= (a & 1);
}
Alternatively, if you expect only very few bits to be set and are happy with gcc specific stuff, you could use __builtin_ffsll
uint64_t a = *cursor;
int next;
for(int bit = 0; (next = __builtin_ffsll(a)) != 0; ) {
bit += next;
hist[bit - 1] -= 1;
a >>= next;
}
The idea should be fine, but no warranty for the actual code :)
Update: code using vector extensions:
typedef short v8hi __attribute__ ((vector_size (16)));
static v8hi table[256];
void histSubtractFromBits(uint64_t* cursor, uint16_t* hist)
{
uint8_t* cursor_tmp = (uint8_t*)cursor;
v8hi* hist_tmp = (v8hi*)hist;
for(int i = 0; i < 32; i++, cursor_tmp++, hist_tmp++)
{
*hist_tmp -= table[*cursor_tmp];
}
}
void setup_table()
{
for(int i = 0; i < 256; i++)
{
for(int j = 0; j < 8; j++)
{
table[i][j] = (i >> j) & 1;
}
}
}
This will be compiled to SSE instructions if available, for example I get:
leaq 32(%rdi), %rdx
.p2align 4,,10
.p2align 3
.L2:
movzbl (%rdi), %eax
addq $1, %rdi
movdqa (%rsi), %xmm0
salq $4, %rax
psubw table(%rax), %xmm0
movdqa %xmm0, (%rsi)
addq $16, %rsi
cmpq %rdx, %rdi
jne .L2
Of course this approach relies on the table being in cache.
Another suggestion is to combine data caching, registers and loop unrolling:
// Assuming your processor has 64-bit words
void histSubtractFromBits(uint64_t const * cursor, uint16* hist)
{
register uint64_t a = *cursor++;
register uint64_t b = *cursor++;
register uint64_t c = *cursor++;
register uint64_t d = *cursor++;
register unsigned int i = 0;
for (i = 0; i < (sizeof(*cursor) * CHAR_BIT; ++i)
{
hist[i + 0] += a & 1;
hist[i + 64] += b & 1;
hist[i + 128] += c & 1;
hist[i + 192] += d & 1;
a >>= 1;
b >>= 1;
c >>= 1;
d >>= 1;
}
}
I'm not sure if you gain any more performance by reordering the instructions like this:
hist[i + 0] += a & 1;
a >>= 1;
You could try both ways and compare the assembly language for both.
One of the ideas here is to maximize the register usage. The values to test are loaded into registers and then the testing begins.
Profiling suggests that this function here is a real bottle neck for my application:
static inline int countEqualChars(const char* string1, const char* string2, int size) {
int r = 0;
for (int j = 0; j < size; ++j) {
if (string1[j] == string2[j]) {
++r;
}
}
return r;
}
Even with -O3 and -march=native, G++ 4.7.2 does not vectorize this function (I checked the assembler output). Now, I'm not an expert with SSE and friends, but I think that comparing more than one character at once should be faster. Any ideas on how to speed things up? Target architecture is x86-64.
Of course it can.
pcmpeqb compares two vectors of 16 bytes and produces a vector with zeros where they differed, and -1 where they match. Use this to compare 16 bytes at a time, adding the result to an accumulator vector (make sure to accumulate the results of at most 255 vector compares to avoid overflow). When you're done, there are 16 results in the accumulator. Sum them and negate to get the number of equal elements.
If the lengths are very short, it will be hard to get a significant speedup from this approach. If the lengths are long, then it will be worth pursuing.
Compiler flags for vectorization:
-ftree-vectorize
-ftree-vectorize -march=<your_architecture> (Use all instruction-set extensions available on your computer, not just baseline like SSE2 for x86-64). Use -march=native to optimize for the machine the compiler is running on.) -march=<foo> also sets -mtune=<foo>, which is also a good thing.
Using SSEx intrinsics:
Padd and align the buffer to 16 bytes (according to the vector size you're actually going to use)
Create an accumlator countU8 with _mm_set1_epi8(0)
For all n/16 input (sub) vectors, do:
Load 16 chars from both strings with _mm_load_si128 or _mm_loadu_si128 (for unaligned loads)
_mm_cmpeq_epi8
compare the octets in parallel. Each match yields 0xFF (-1), 0x00 otherwise.
Substract the above result vector from countU8 using _mm_sub_epi8 (minus -1 -> +1)
Always after 255 cycles, the 16 8bit counters must be extracted into a larger integer type to prevent overflows. See unpack and horizontal add in this nice answer for how to do that: https://stackoverflow.com/a/10930706/1175253
Code:
#include <iostream>
#include <vector>
#include <cassert>
#include <cstdint>
#include <climits>
#include <cstring>
#include <emmintrin.h>
#ifdef __SSE2__
#if !defined(UINTPTR_MAX) || !defined(UINT64_MAX) || !defined(UINT32_MAX)
# error "Limit macros are not defined"
#endif
#if UINTPTR_MAX == UINT64_MAX
#define PTR_64
#elif UINTPTR_MAX == UINT32_MAX
#define PTR_32
#else
# error "Current UINTPTR_MAX is not supported"
#endif
template<typename T>
void print_vector(std::ostream& out,const __m128i& vec)
{
static_assert(sizeof(vec) % sizeof(T) == 0,"Invalid element size");
std::cout << '{';
const T* const end = reinterpret_cast<const T*>(&vec)-1;
const T* const upper = end+(sizeof(vec)/sizeof(T));
for(const T* elem = upper;
elem != end;
--elem
)
{
if(elem != upper)
std::cout << ',';
std::cout << +(*elem);
}
std::cout << '}' << std::endl;
}
#define PRINT_VECTOR(_TYPE,_VEC) do{ std::cout << #_VEC << " : "; print_vector<_TYPE>(std::cout,_VEC); } while(0)
///#note SSE2 required (macro: __SSE2__)
///#warning Not tested!
size_t counteq_epi8(const __m128i* a_in,const __m128i* b_in,size_t count)
{
assert(a_in != nullptr && (uintptr_t(a_in) % 16) == 0);
assert(b_in != nullptr && (uintptr_t(b_in) % 16) == 0);
//assert(count > 0);
/*
//maybe not so good with all that branching and additional loop variables
__m128i accumulatorU8 = _mm_set1_epi8(0);
__m128i sum2xU64 = _mm_set1_epi8(0);
for(size_t i = 0;i < count;++i)
{
//this operation could also be unrolled, where multiple result registers would be accumulated
accumulatorU8 = _mm_sub_epi8(accumulatorU8,_mm_cmpeq_epi8(*a_in++,*b_in++));
if(i % 255 == 0)
{
//before overflow of uint8, the counter will be extracted
__m128i sum2xU16 = _mm_sad_epu8(accumulatorU8,_mm_set1_epi8(0));
sum2xU64 = _mm_add_epi64(sum2xU64,sum2xU16);
//reset accumulatorU8
accumulatorU8 = _mm_set1_epi8(0);
}
}
//blindly accumulate remaining values
__m128i sum2xU16 = _mm_sad_epu8(accumulatorU8,_mm_set1_epi8(0));
sum2xU64 = _mm_add_epi64(sum2xU64,sum2xU16);
//do a horizontal addition of the two counter values
sum2xU64 = _mm_add_epi64(sum2xU64,_mm_srli_si128(sum2xU64,64/8));
#if defined PTR_64
return _mm_cvtsi128_si64(sum2xU64);
#elif defined PTR_32
return _mm_cvtsi128_si32(sum2xU64);
#else
# error "macro PTR_(32|64) is not set"
#endif
*/
__m128i sum2xU64 = _mm_set1_epi32(0);
while(count--)
{
__m128i matches = _mm_sub_epi8(_mm_set1_epi32(0),_mm_cmpeq_epi8(*a_in++,*b_in++));
__m128i sum2xU16 = _mm_sad_epu8(matches,_mm_set1_epi32(0));
sum2xU64 = _mm_add_epi64(sum2xU64,sum2xU16);
#ifndef NDEBUG
PRINT_VECTOR(uint16_t,sum2xU64);
#endif
}
//do a horizontal addition of the two counter values
sum2xU64 = _mm_add_epi64(sum2xU64,_mm_srli_si128(sum2xU64,64/8));
#ifndef NDEBUG
std::cout << "----------------------------------------" << std::endl;
PRINT_VECTOR(uint16_t,sum2xU64);
#endif
#if !defined(UINTPTR_MAX) || !defined(UINT64_MAX) || !defined(UINT32_MAX)
# error "Limit macros are not defined"
#endif
#if defined PTR_64
return _mm_cvtsi128_si64(sum2xU64);
#elif defined PTR_32
return _mm_cvtsi128_si32(sum2xU64);
#else
# error "macro PTR_(32|64) is not set"
#endif
}
#endif
int main(int argc, char* argv[])
{
std::vector<__m128i> a(64); // * 16 bytes
std::vector<__m128i> b(a.size());
const size_t nBytes = a.size() * sizeof(std::vector<__m128i>::value_type);
char* const a_out = reinterpret_cast<char*>(a.data());
char* const b_out = reinterpret_cast<char*>(b.data());
memset(a_out,0,nBytes);
memset(b_out,0,nBytes);
a_out[1023] = 1;
b_out[1023] = 1;
size_t equalBytes = counteq_epi8(a.data(),b.data(),a.size());
std::cout << "equalBytes = " << equalBytes << std::endl;
return 0;
}
The fastest SSE implementation I got for large and small arrays:
size_t counteq_epi8(const __m128i* a_in,const __m128i* b_in,size_t count)
{
assert((count > 0 ? a_in != nullptr : true) && (uintptr_t(a_in) % sizeof(__m128i)) == 0);
assert((count > 0 ? b_in != nullptr : true) && (uintptr_t(b_in) % sizeof(__m128i)) == 0);
//assert(count > 0);
const size_t maxInnerLoops = 255;
const size_t nNestedLoops = count / maxInnerLoops;
const size_t nRemainderLoops = count % maxInnerLoops;
const __m128i zero = _mm_setzero_si128();
__m128i sum16xU8 = zero;
__m128i sum2xU64 = zero;
for(size_t i = 0;i < nNestedLoops;++i)
{
for(size_t j = 0;j < maxInnerLoops;++j)
{
sum16xU8 = _mm_sub_epi8(sum16xU8,_mm_cmpeq_epi8(*a_in++,*b_in++));
}
sum2xU64 = _mm_add_epi64(sum2xU64,_mm_sad_epu8(sum16xU8,zero));
sum16xU8 = zero;
}
for(size_t j = 0;j < nRemainderLoops;++j)
{
sum16xU8 = _mm_sub_epi8(sum16xU8,_mm_cmpeq_epi8(*a_in++,*b_in++));
}
sum2xU64 = _mm_add_epi64(sum2xU64,_mm_sad_epu8(sum16xU8,zero));
sum2xU64 = _mm_add_epi64(sum2xU64,_mm_srli_si128(sum2xU64,64/8));
#if UINTPTR_MAX == UINT64_MAX
return _mm_cvtsi128_si64(sum2xU64);
#elif UINTPTR_MAX == UINT32_MAX
return _mm_cvtsi128_si32(sum2xU64);
#else
# error "macro PTR_(32|64) is not set"
#endif
}
Auto-vectorization in current gcc is a matter of helping the compiler to understand that's easy to vectorize the code. In your case: it will understand the vectorization request if you remove the conditional and rewrite the code in a more imperative way:
static inline int count(const char* string1, const char* string2, int size) {
int r = 0;
bool b;
for (int j = 0; j < size; ++j) {
b = (string1[j] == string2[j]);
r += b;
}
return r;
}
In this case:
movdqa 16(%rsp), %xmm1
movl $.LC2, %esi
pxor %xmm2, %xmm2
movzbl 416(%rsp), %edx
movdqa .LC1(%rip), %xmm3
pcmpeqb 224(%rsp), %xmm1
cmpb %dl, 208(%rsp)
movzbl 417(%rsp), %eax
movl $1, %edi
pand %xmm3, %xmm1
movdqa %xmm1, %xmm5
sete %dl
movdqa %xmm1, %xmm4
movzbl %dl, %edx
punpcklbw %xmm2, %xmm5
punpckhbw %xmm2, %xmm4
pxor %xmm1, %xmm1
movdqa %xmm5, %xmm6
movdqa %xmm5, %xmm0
movdqa %xmm4, %xmm5
punpcklwd %xmm1, %xmm6
(etc.)
I'm trying to build a clock, so I'm working with ASM and an arduino. For most parts, plain C will be fine, but for preparing the time to be output to BCD to Decimal converters I decided to go with ASM. I wrote the following code in 8086 C++/ASM and it runs fine on my computer:
#include <iostream>
using namespace std;
int main(int argc, char **argv)
{
for(int num = 0; num < 16; num++) {
int bin[4];
bin[0] = bin[1] = bin[2] = bin[3] = 0;
asm("movl %%ebx, %%eax;"
"andl $8, %%eax;"
"cmp $0, %%eax;"
"je a;"
"movl $1, %0;"
"jmp b;"
"a: movl $0, %0;"
"b: movl %%ebx, %%eax;"
"andl $4, %%eax;"
"cmp $0, %%eax;"
"je c;"
"movl $1, %1;"
"jmp d;"
"c: movl $0, %1;"
"d: movl %%ebx, %%eax;"
"andl $2, %%eax;"
"cmp $0, %%eax;"
"je e;"
"movl $1, %2;"
"jmp f;"
"e: movl $0, %2;"
"f: movl %%ebx, %%eax;"
"andl $1, %%eax;"
"cmp $0, %%eax;"
"je g;"
"movl $1, %3;"
"jmp h;"
"g: movl $0, %3;"
"h: nop"
: "=r" (bin[0]), "=r" (bin[1]), "=r" (bin[2]), "=b" (bin[3])
: "b" (num)
: "%eax"
);
cout << num << ": ";
for(int i = 0; i < 4; i++) {
cout << bin[i];
}
cout << endl;
}
return 0;
}
However, when I modified it to run on the Arduino, things stop working entirely:
for(uint8_t num = 0; num < 16; num++) {
uint8_t bin[4];
bin[0] = bin[1] = bin[2] = bin[3] = 0;
asm("mov __tmp_reg__, %[val];"
"and __tmp_reg__, $8;"
"cmp __tmp_reg__, $0;"
"je a;"
"mov %[bit8], $1;"
"rjmp b;"
"a: mov $0, %[bit8];"
"b: mov %[val], __tmp_reg__;"
"and __tmp_reg__, $4;"
"cmp __tmp_reg__, $0;"
"je c;"
"mov %[bit4], $1;"
"rjmp d;"
"c: mov $0, %[bit4];"
"d: mov %[val], __tmp_reg__;"
"and __tmp_reg__, $2;"
"cmp __tmp_reg__, $0;"
"je e;"
"mov %[bit2], $1;"
"rjmp f;"
"e: mov $0, %[bit2];"
"f: mov %[val], __tmp_reg__;"
"and __tmp_reg__, $1;"
"cmp __tmp_reg__, $0;"
"je g;"
"mov %[bit1], $1;"
"rjmp h;"
"g: mov $0, %[bit1];"
"h: nop"
: [bit8] "=r" (bin[0]), [bit4] "=r" (bin[1]), [bit2] "=r" (bin[2]), [bit1] "=r" (bin[3])
: [val] "r" (num)
: "r0"
);
The 8086 code gives the output you'd expect:
0: 0000
1: 0001
2: 0010
3: 0011
...
13: 1101
14: 1110
15: 1111
But the code run on the Arduino gives a different output:
0: 5000
1: 0000
2: 0000
3: 0000
... (zeros continue)
13: 0000
14: 0000
15: 0000
As you can imagine, the code becomes useless if it returns... five. And I'm clueless as to how it could return 5 when nothing is anywhere close to 5 in the source. I'm at a loss as to what to do here, so I could really use some help.
I'm using the Arduino Leonardo, which has an ATMega32U processor. I've tried disassembling the executable generated by the Arduino software (which compiles it with AVR-GCC), but I can't seem to get anywhere in my efforts to find the code I put in.
Thanks for your time, Stack Overflow.
The code you have can EASILY be written in C++, like this:
int bin[4] = {};
bin[0] = !!(num & 8);
bin[1] = !!(num & 4);
bin[2] = !!(num & 2);
bin[3] = !!(num & 1);
or:
int bin[4];
int bit = 8;
for(int i = 0; i < 4; i++)
{
bin[i] = !!(num & bit);
bit >>= 1;
}
If you don't like !! (which makes "take the next value and make it either 0 [if it's false] or 1 [if it's true]), you could replace it with:
for(int i = 0; i < 4; i++)
{
bin[i] = (num >> 3 - i) & 1;
}
I take it you intentionally want the highest bit in the lowest bin index, rather than the usual case of highest bit in the highest index.
I have some critical branching code inside a loop that's run about 2^26 times. Branch prediction is not optimal because m is random. How would I remove the branching, possibly using bitwise operators?
bool m;
unsigned int a;
const unsigned int k = ...; // k >= 7
if(a == 0)
a = (m ? (a+1) : (k));
else if(a == k)
a = (m ? 0 : (a-1));
else
a = (m ? (a+1) : (a-1));
And here is the relevant assembly generated by gcc -O3:
.cfi_startproc
movl 4(%esp), %edx
movb 8(%esp), %cl
movl (%edx), %eax
testl %eax, %eax
jne L15
cmpb $1, %cl
sbbl %eax, %eax
andl $638, %eax
incl %eax
movl %eax, (%edx)
ret
L15:
cmpl $639, %eax
je L23
testb %cl, %cl
jne L24
decl %eax
movl %eax, (%edx)
ret
L23:
cmpb $1, %cl
sbbl %eax, %eax
andl $638, %eax
movl %eax, (%edx)
ret
L24:
incl %eax
movl %eax, (%edx)
ret
.cfi_endproc
The branch-free division-free modulo could have been useful, but testing shows that in practice, it isn't.
const unsigned int k = 639;
void f(bool m, unsigned int &a)
{
a += m * 2 - 1;
if (a == -1u)
a = k;
else if (a == k + 1)
a = 0;
}
Testcase:
unsigned a = 0;
f(false, a);
assert(a == 639);
f(false, a);
assert(a == 638);
f(true, a);
assert(a == 639);
f(true, a);
assert(a == 0);
f(true, a);
assert(a == 1);
f(false, a);
assert(a == 0);
Actually timing this, using a test program:
int main()
{
for (int i = 0; i != 10000; i++)
{
unsigned int a = k / 2;
while (a != 0) f(rand() & 1, a);
}
}
(Note: there's no srand, so results are deterministic.)
My original answer: 5.3s
The code in the question: 4.8s
Lookup table: 4.5s (static unsigned lookup[2][k+1];)
Lookup table: 4.3s (static unsigned lookup[k+1][2];)
Eric's answer: 4.2s
This version: 4.0s
The fastest I've found is now the table implementation
Timings I got (UPDATED for new measurement code)
HVD's most recent: 9.2s
Table version: 7.4s (with k=693)
Table creation code:
unsigned int table[2*k];
table_ptr = table;
for(int i = 0; i < k; i++){
unsigned int a = i;
f(0, a);
table[i<<1] = a;
a = i;
f(1, a);
table[i<<1 + 1] = a;
}
Table runtime loop:
void f(bool m, unsigned int &a){
a = table_ptr[a<<1 | m];
}
With HVD's measurement code, I saw the cost of the rand() dominating the runtime, so that the runtime for a branchless version was about the same range as these solutions. I changed the measurement code to this (UPDATED to keep random branch order, and pre-computing random values to prevent rand(), etc. from trashing the cache)
int main(){
unsigned int a = k / 2;
int m[100000];
for(int i = 0; i < 100000; i++){
m[i] = rand() & 1;
}
for (int i = 0; i != 10000; i++
{
for(int j = 0; j != 100000; j++){
f(m[j], a);
}
}
}
I don't think you can remove the branches entirely, but you can reduce the number by branching on m first.
if (m){
if (a==k) {a = 0;} else {++a;}
}
else {
if (a==0) {a = k;} else {--a;}
}
Adding to Antimony's rewrite:
if (a==k) {a = 0;} else {++a;}
looks like an increase with wraparound. You can write this as
a=(a+1)%k;
which, of course, only makes sense if divisions are actually faster than branches.
Not sure about the other one; too lazy to think about what the (~0)%k will be.
This has no branches. Because K is constant, compiler might be able to optimize the modulo depending on it's value. And if K is 'small' then a full lookup table solution would probably be even faster.
bool m;
unsigned int a;
const unsigned int k = ...; // k >= 7
const int inc[2] = {1, k};
a = a + inc[m] % (k+1);
If k isn't large enough to cause overflow, you could do something like this:
int a; // Note: not unsigned int
int plusMinus = 2 * m - 1;
a += plusMinus;
if(a == -1)
a = k;
else if (a == k+1)
a = 0;
Still branches, but the branch prediction should be better, since the edge conditions are rarer than m-related conditions.