Sparse array compression using SIMD (AVX2) - c++

I have a sparse array a (mostly zeroes):
unsigned char a[1000000];
and I would like to create an array b of indexes to non-zero elements of a using SIMD instructions on Intel x64 architecture with AVX2. I'm looking for tips on how to do it efficiently. Specifically, are there SIMD instruction(s) to get the positions of consecutive non-zero elements in a SIMD register, arranged contiguously?

Five methods to compute the indices of the nonzeros are:
1. Semi-vectorized loop: load a SIMD vector with chars, compare with zero and apply a movemask. Use a small scalar loop if any of the chars is nonzero (also suggested by @stgatilov). This works well for very sparse arrays. Function arr2ind_movmsk in the code below uses BMI1 instructions for the scalar loop.
2. Vectorized loop: Intel Haswell processors and newer support the BMI1 and BMI2 instruction sets. BMI2 contains the pext instruction (Parallel bits extract, see the Wikipedia link), which turns out to be useful here. See arr2ind_pext in the code below.
3. Classic scalar loop with if statement: arr2ind_if.
4. Scalar loop without branches: arr2ind_cmov.
5. Lookup table: @stgatilov shows that it is possible to use a lookup table instead of the pdep and other integer instructions. This might work well; however, the lookup table is quite large: it doesn't fit in the L1 cache. Not tested here. See also the discussion here.
/*
gcc -O3 -Wall -m64 -mavx2 -fopenmp -march=broadwell -std=c99 -falign-loops=16 sprs_char2ind.c
example: Test different methods with an array a of size 20000 and approximate 25/1024*100%=2.4% nonzeros:
./a.out 20000 25
*/
#include <stdio.h>
#include <immintrin.h>
#include <stdint.h>
#include <omp.h>
#include <string.h>
__attribute__ ((noinline)) int arr2ind_movmsk(const unsigned char * restrict a, int n, int * restrict ind, int * m){
int i, m0, k;
__m256i msk;
m0=0;
for (i=0;i<n;i=i+32){ /* Load 32 bytes and compare with zero: */
msk=_mm256_cmpeq_epi8(_mm256_load_si256((__m256i *)&a[i]),_mm256_setzero_si256());
k=_mm256_movemask_epi8(msk);
k=~k; /* Search for nonzero bits instead of zero bits. */
while (k){
ind[m0]=i+_tzcnt_u32(k); /* Count the number of trailing zero bits in k. */
m0++;
k=_blsr_u32(k); /* Clear the lowest set bit in k. */
}
}
*m=m0;
return 0;
}
__attribute__ ((noinline)) int arr2ind_pext(const unsigned char * restrict a, int n, int * restrict ind, int * m){
int i, m0;
uint64_t cntr_const = 0xFEDCBA9876543210;
__m256i shft = _mm256_set_epi64x(0x04,0x00,0x04,0x00);
__m256i vmsk = _mm256_set1_epi8(0x0F);
__m256i cnst16 = _mm256_set1_epi32(16);
__m256i shf_lo = _mm256_set_epi8(0x80,0x80,0x80,0x0B, 0x80,0x80,0x80,0x03, 0x80,0x80,0x80,0x0A, 0x80,0x80,0x80,0x02,
0x80,0x80,0x80,0x09, 0x80,0x80,0x80,0x01, 0x80,0x80,0x80,0x08, 0x80,0x80,0x80,0x00);
__m256i shf_hi = _mm256_set_epi8(0x80,0x80,0x80,0x0F, 0x80,0x80,0x80,0x07, 0x80,0x80,0x80,0x0E, 0x80,0x80,0x80,0x06,
0x80,0x80,0x80,0x0D, 0x80,0x80,0x80,0x05, 0x80,0x80,0x80,0x0C, 0x80,0x80,0x80,0x04);
__m128i pshufbcnst = _mm_set_epi8(0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80, 0x0E,0x0C,0x0A,0x08,0x06,0x04,0x02,0x00);
__m256i i_vec = _mm256_setzero_si256();
m0=0;
for (i=0;i<n;i=i+16){
__m128i v = _mm_load_si128((__m128i *)&a[i]); /* Load 16 bytes. */
__m128i msk = _mm_cmpeq_epi8(v,_mm_setzero_si128()); /* Generate 16x8 bit mask. */
msk = _mm_srli_epi64(msk,4); /* Pack 16x8 bit mask to 16x4 bit mask. */
msk = _mm_shuffle_epi8(msk,pshufbcnst); /* Pack 16x8 bit mask to 16x4 bit mask. */
msk = _mm_xor_si128(msk,_mm_set1_epi32(-1)); /* Invert 16x4 mask. */
uint64_t msk64 = _mm_cvtsi128_si64x(msk); /* _mm_popcnt_u64 and _pext_u64 work on 64-bit general-purpose registers, not on simd registers.*/
int p = _mm_popcnt_u64(msk64)>>2; /* p is the number of nonzeros in 16 bytes of a. */
uint64_t cntr = _pext_u64(cntr_const,msk64); /* parallel bits extract. cntr contains p 4-bit integers. The 16 4-bit integers in cntr_const are shuffled to the p 4-bit integers that we want */
/* The next 7 intrinsics unpack these p 4-bit integers to p 32-bit integers. */
__m256i cntr256 = _mm256_set1_epi64x(cntr);
cntr256 = _mm256_srlv_epi64(cntr256,shft);
cntr256 = _mm256_and_si256(cntr256,vmsk);
__m256i cntr256_lo = _mm256_shuffle_epi8(cntr256,shf_lo);
__m256i cntr256_hi = _mm256_shuffle_epi8(cntr256,shf_hi);
cntr256_lo = _mm256_add_epi32(i_vec,cntr256_lo);
cntr256_hi = _mm256_add_epi32(i_vec,cntr256_hi);
_mm256_storeu_si256((__m256i *)&ind[m0],cntr256_lo); /* Note that the stores of iteration i and i+16 may overlap. */
_mm256_storeu_si256((__m256i *)&ind[m0+8],cntr256_hi); /* Array ind has to be large enough to avoid segfaults. At most 16 integers are written more than strictly necessary */
m0 = m0+p;
i_vec = _mm256_add_epi32(i_vec,cnst16);
}
*m=m0;
return 0;
}
__attribute__ ((noinline)) int arr2ind_if(const unsigned char * restrict a, int n, int * restrict ind, int * m){
int i, m0;
m0=0;
for (i=0;i<n;i++){
if (a[i]!=0){
ind[m0]=i;
m0=m0+1;
}
}
*m=m0;
return 0;
}
__attribute__((noinline)) int arr2ind_cmov(const unsigned char * restrict a, int n, int * restrict ind, int * m){
int i, m0;
m0=0;
for (i=0;i<n;i++){
ind[m0]=i;
m0=(a[i]==0)? m0 : m0+1; /* Compiles to cmov instruction. */
}
*m=m0;
return 0;
}
__attribute__ ((noinline)) int print_nonz(const unsigned char * restrict a, const int * restrict ind, const int m){
int i;
for (i=0;i<m;i++) printf("i=%d, ind[i]=%d a[ind[i]]=%u\n",i,ind[i],a[ind[i]]);
printf("\n"); fflush( stdout );
return 0;
}
__attribute__ ((noinline)) int print_chk(const unsigned char * restrict a, const int * restrict ind, const int m){
int i; /* Compute a hash to compare the results of different methods. */
unsigned int chk=0;
for (i=0;i<m;i++){
chk=((chk<<1)|(chk>>31))^(ind[i]);
}
printf("chk = %10X\n",chk);
return 0;
}
int main(int argc, char **argv){
int n, i, m;
unsigned int j, k, d;
unsigned char *a;
int *ind;
double t0,t1;
int meth, nrep;
char txt[30];
sscanf(argv[1],"%d",&n); /* Length of array a. */
n=n>>5; /* Adjust n to a multiple of 32. */
n=n<<5;
sscanf(argv[2],"%u",&d); /* The approximate fraction of nonzeros in a is: d/1024 */
printf("n=%d, d=%u\n",n,d);
a=_mm_malloc(n*sizeof(char),32);
ind=_mm_malloc(n*sizeof(int),32);
/* Generate a pseudo random array a. */
j=73659343;
for (i=0;i<n;i++){
j=j*653+1;
k=(j & 0x3FF00)>>8; /* k is a pseudo random number between 0 and 1023 */
if (k<d){
a[i] = (j&0xFE)+1; /* Set a[i] to nonzero. */
}else{
a[i] = 0;
}
}
/* for (i=0;i<n;i++){if (a[i]!=0){printf("i=%d, a[i]=%u\n",i,a[i]);}} printf("\n"); */ /* Uncomment this line to print the nonzeros in a. */
char txt0[]="arr2ind_movmsk: ";
char txt1[]="arr2ind_pext: ";
char txt2[]="arr2ind_if: ";
char txt3[]="arr2ind_cmov: ";
nrep=10000; /* Repeat a function nrep times to make relatively accurate timings possible. */
/* With nrep=1000000: ./a.out 10016 4 ; ./a.out 10016 48 ; ./a.out 10016 519 */
/* With nrep=10000: ./a.out 1000000 5 ; ./a.out 1000000 52 ; ./a.out 1000000 513 */
printf("nrep = %d \n\n",nrep);
arr2ind_movmsk(a,n,ind,&m); /* Make sure that the arrays a and ind are read and/or written at least one time before benchmarking. */
for (meth=0;meth<4;meth++){
t0=omp_get_wtime();
switch (meth){
case 0: for(i=0;i<nrep;i++) arr2ind_movmsk(a,n,ind,&m); strcpy(txt,txt0); break;
case 1: for(i=0;i<nrep;i++) arr2ind_pext(a,n,ind,&m); strcpy(txt,txt1); break;
case 2: for(i=0;i<nrep;i++) arr2ind_if(a,n,ind,&m); strcpy(txt,txt2); break;
case 3: for(i=0;i<nrep;i++) arr2ind_cmov(a,n,ind,&m); strcpy(txt,txt3); break;
default: ;
}
t1=omp_get_wtime();
printf("method = %s ",txt);
/* print_chk(a,ind,m); */
printf(" elapsed time = %6.2f\n",t1-t0);
}
print_nonz(a, ind, 2); /* Do something with the results */
printf("density = %f %% \n\n",((double)m)/((double)n)*100); /* Actual nonzero density of array a. */
/* print_nonz(a, ind, m); */ /* Uncomment this line to print the indices of the nonzeros. */
return 0;
}
/*
With nrep=1000000:
./a.out 10016 4 ; ./a.out 10016 4 ; ./a.out 10016 48 ; ./a.out 10016 48 ; ./a.out 10016 519 ; ./a.out 10016 519
With nrep=10000:
./a.out 1000000 5 ; ./a.out 1000000 5 ; ./a.out 1000000 52 ; ./a.out 1000000 52 ; ./a.out 1000000 513 ; ./a.out 1000000 513
*/
The code was tested with array size of n=10016 (the data fits in L1 cache) and n=1000000, with
different nonzero densities of about 0.5%, 5% and 50%. For accurate timing the functions were called 1000000
and 10000 times, respectively.
Time in seconds, size n=10016, 1e6 function calls. Intel Core i5-6500
                  0.53%    5.1%   50.0%
arr2ind_movmsk:    0.27    0.53    4.89
arr2ind_pext:      1.44    1.59    1.45
arr2ind_if:        5.93    8.95   33.82
arr2ind_cmov:      6.82    6.83    6.82
Time in seconds, size n=1000000, 1e4 function calls.
                  0.49%    5.1%   50.1%
arr2ind_movmsk:    0.57    2.03    5.37
arr2ind_pext:      1.47    1.47    1.46
arr2ind_if:        5.88    8.98   38.59
arr2ind_cmov:      6.82    6.81    6.81
In these examples the vectorized loops are faster than the scalar loops.
The performance of arr2ind_movmsk depends a lot on the density of a. It is only
faster than arr2ind_pext if the density is sufficiently small. The break-even point also depends on the array size n.
Function 'arr2ind_if' clearly suffers from failing branch prediction at 50% nonzero density.

If you expect the number of nonzero elements to be very low (i.e. much less than 1%), then you can simply check each 16-byte chunk for being nonzero:
int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(reg, _mm_setzero_si128()));
if (mask != 65535) {
//handle the zero bits of mask (i.e. the nonzero bytes) with scalar code
}
If the percentage of good elements is sufficiently small, the cost of mispredicted branches and the cost of the slow scalar code inside the 'if' would be negligible.
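A minimal sketch of that chunk test plus the scalar fallback (untested; it assumes BMI1 for _tzcnt_u32/_blsr_u32, and the helper name append_nonzero_positions is only for illustration):
#include <immintrin.h>
#include <stdint.h>
/* For one 16-byte chunk starting at array offset 'base', append the positions
   of the nonzero bytes to ind[] and return the updated count. */
static int append_nonzero_positions(__m128i chunk, int base, int *ind, int m){
    int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, _mm_setzero_si128()));
    if (mask != 65535){                             /* at least one nonzero byte   */
        unsigned nz = ~(unsigned)mask & 0xFFFFu;    /* 1 bit per nonzero byte      */
        while (nz){
            ind[m++] = base + (int)_tzcnt_u32(nz);  /* index of that nonzero byte  */
            nz = _blsr_u32(nz);                     /* clear the lowest set bit    */
        }
    }
    return m;
}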
As for a good general solution, first consider an SSE implementation of stream compaction. It removes all zero elements from the byte array (idea taken from here):
__m128i shuf [65536]; //must be precomputed
char cnt [65536]; //must be precomputed
int compress(const char *src, int len, char *dst) {
char *ptr = dst;
for (int i = 0; i < len; i += 16) {
__m128i reg = _mm_load_si128((__m128i*)&src[i]);
__m128i zeroMask = _mm_cmpeq_epi8(reg, _mm_setzero_si128());
int mask = _mm_movemask_epi8(zeroMask);
__m128i compressed = _mm_shuffle_epi8(reg, shuf[mask]);
_mm_storeu_si128((__m128i*)ptr, compressed);
ptr += cnt[mask]; //alternative: ptr += 16-_mm_popcnt_u32(mask);
}
return ptr - dst;
}
As you see, (_mm_shuffle_epi8 + lookup table) can do wonders. I don't know any other way of vectorizing structurally complex code like stream compaction.
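The shuf and cnt tables are not shown above; here is a hedged sketch of how they might be precomputed (untested, and note that the 65536-entry shuf table alone takes 1 MB):
#include <emmintrin.h>
/* Same tables as declared above: for every 16-bit zero-mask, shuf[mask] gathers
   the nonzero byte positions to the front and cnt[mask] counts them. */
__m128i shuf[65536];
char cnt[65536];
void init_compress_luts(void){
    for (int mask = 0; mask < 65536; ++mask){
        char idx[16];
        int n = 0;
        for (int i = 0; i < 16; ++i)
            if (!(mask & (1 << i)))       /* bit clear -> byte was nonzero        */
                idx[n++] = (char)i;
        cnt[mask] = (char)n;
        for (int i = n; i < 16; ++i)
            idx[i] = (char)0x80;          /* 0x80 makes pshufb write a zero byte  */
        shuf[mask] = _mm_loadu_si128((const __m128i *)idx);
    }
}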
Now the only remaining problem with your request is that you want to get indices. Each index must be stored in a 4-byte value, so a chunk of 16 input bytes may produce up to 64 bytes of output, which do not fit into a single SSE register.
One way to handle this is to honestly unpack the output to 64 bytes. So you replace reg with the constant (0,1,2,3,4,...,15) in the code, then unpack the SSE register into 4 registers, and add a register with four i values. This would take many more instructions: 6 unpack instructions, 4 adds, and 3 stores (one is already there). To me, that is a huge overhead, especially if you expect less than 25% of nonzero elements.
Alternatively, you can limit the number of nonzero bytes processed by a single loop iteration to 4, so that one register is always enough for the output.
Here is the sample code:
__m128i shufMask [65536]; //must be precomputed
char srcMove [65536]; //must be precomputed
char dstMove [65536]; //must be precomputed
int compress_ids(const char *src, int len, int *dst) {
const char *ptrSrc = src;
int *ptrDst = dst;
__m128i offsets = _mm_setr_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
__m128i base = _mm_setzero_si128();
while (ptrSrc < src + len) {
__m128i reg = _mm_loadu_si128((__m128i*)ptrSrc);
__m128i zeroMask = _mm_cmpeq_epi8(reg, _mm_setzero_si128());
int mask = _mm_movemask_epi8(zeroMask);
__m128i ids8 = _mm_shuffle_epi8(offsets, shufMask[mask]);
__m128i ids32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(ids8, _mm_setzero_si128()), _mm_setzero_si128());
ids32 = _mm_add_epi32(ids32, base);
_mm_storeu_si128((__m128i*)ptrDst, ids32);
ptrDst += dstMove[mask]; //alternative: ptrDst += min(16-_mm_popcnt_u32(mask), 4);
ptrSrc += srcMove[mask]; //no alternative without LUT
base = _mm_add_epi32(base, _mm_set1_epi32(srcMove[mask]));
}
return ptrDst - dst;
}
One drawback of this approach is that now each subsequent loop iteration cannot start until the line ptrSrc += srcMove[mask]; is executed on the previous iteration. So the critical path has increased dramatically. Hardware hyperthreading or its manual emulation can remove this penalty.
So, as you see, there are many variations of this basic idea, all of which solve your problem with different degree of efficiency. You can also reduce size of LUT if you don't like it (again, at the cost of decreasing throughput performance).
This approach cannot be fully extended to wider registers (i.e. AVX2 and AVX-512), but you can try to combine instructions of several consecutive iterations into single AVX2 or AVX-512 instruction, thus slightly increasing throughput.
Note: I didn't test any code (because precomputing LUT correctly requires noticeable effort).
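For what it's worth, a hedged sketch of how those three tables might be precomputed (untested; it follows the semantics implied by the loop above: at most 4 indices per iteration, and the source pointer advances just past the 4th nonzero byte, or by 16 if there are fewer than 4):
#include <emmintrin.h>
/* Same tables as declared above. */
__m128i shufMask[65536];
char srcMove[65536];
char dstMove[65536];
void init_id_luts(void){
    for (int mask = 0; mask < 65536; ++mask){
        char idx[16];
        int taken = 0, consumed = 16;
        for (int i = 0; i < 16; ++i)
            idx[i] = (char)0x80;                  /* default: shuffle in zero bytes */
        for (int i = 0; i < 16 && taken < 4; ++i){
            if (!(mask & (1 << i))){              /* bit clear -> nonzero byte      */
                idx[taken++] = (char)i;
                if (taken == 4) consumed = i + 1; /* stop right after the 4th one   */
            }
        }
        dstMove[mask] = (char)taken;
        srcMove[mask] = (char)consumed;
        shufMask[mask] = _mm_loadu_si128((const __m128i *)idx);
    }
}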

Although the AVX2 instruction set has many GATHER instructions, their performance is too slow for this. The most effective way to do it is to process the array manually.

Related

Creating bitmask in O(1) with SIMD operations

I am new to C++ and working with SIMD.
What I am trying to do is create a bitmask from the input in constant time.
For example:
input 1,3,4 => output 13 (or 26, the indexing scheme does not matter): 1101 (1st, 3rd and 4th bits are 1)
input 2,5,7 => output 82: 1010010 (2nd, 5th and 7th bits are 1)
The input type does not matter; it can be an array, for example.
I accomplished this with a for loop, which is not wanted. Is there a function to create a bitmask in constant time?
Constant time cannot work if you have a variable number of inputs. You have to iterate over the values at least once, right?
In any case, you can use intrinsics to minimize the number of operations. You have not specified your target architecture or the integer size. So I assume AVX2 and 64 bit integers as output. Also, for convenience, I assume that the inputs are 64 bit.
If your inputs are smaller-sized integers than the output, you have to add some zero-extensions.
#include <immintrin.h>
#include <array>
#include <cstdint>
#include <cstdio>
std::uint64_t accum(const std::uint64_t* bitpositions, std::size_t n)
{
// 2 x 64 bit integers set to 1
const __m128i ones2 = _mm_set1_epi64(_m_from_int64(1));
// 4 x 64 bit integers set to 1
const __m256i ones4 = _mm256_broadcastsi128_si256(ones2);
// 4 x 64 bit integers serving as partial bit masks
__m256i accum4 = _mm256_setzero_si256();
std::size_t i;
for(i = 0; i + 4 <= n; i += 4) {
// may be replaced with aligned load
__m256i positions = _mm256_loadu_si256((const __m256i*)(bitpositions + i));
// vectorized (1 << position) bit shift
__m256i shifted = _mm256_sllv_epi64(ones4, positions);
// add new bits to accumulator
accum4 = _mm256_or_si256(accum4, shifted);
}
// reduce 4 to 2 64 bit integers
__m128i accum2 = _mm256_castsi256_si128(accum4);
__m128i high2 = _mm256_extracti128_si256(accum4, 1);
if(i + 2 <= n) {
// zero or one iteration with 2 64 bit integers
__m128i positions = _mm_loadu_si128((const __m128i*)(bitpositions + i));
__m128i shifted = _mm_sllv_epi64(ones2, positions);
accum2 = _mm_or_si128(accum2, shifted);
i += 2;
}
// high2 folded in with delay to account for extract latency
accum2 = _mm_or_si128(accum2, high2);
// reduce to 1 64 bit integer
__m128i high1 = _mm_unpackhi_epi64(accum2, accum2);
accum2 = _mm_or_si128(accum2, high1);
std::uint64_t accum1 = static_cast<std::uint64_t>(_mm_cvtsi128_si64(accum2));
if(i < n)
accum1 |= std::uint64_t(1) << bitpositions[i];
return accum1;
}
EDIT
I have just seen that your example inputs use 1-based indexing. So bit 1 would be set for value 1 and input value 0 is probably undefined behavior. I suggest switching to zero-based indexing. But if you are stuck with that notation, just add a _mm256_sub_epi64(positions, ones4) or _mm_sub_epi64(positions, ones2) before the shift.
With smaller input size
And here is a version for byte-sized input integers.
std::uint64_t accum(const std::uint8_t* bitpositions, std::size_t n)
{
const __m128i ones2 = _mm_set1_epi64(_m_from_int64(1));
const __m256i ones4 = _mm256_broadcastsi128_si256(ones2);
__m256i accum4 = _mm256_setzero_si256();
std::size_t i;
for(i = 0; i + 4 <= n; i += 4) {
/*
* As far as I can see, there is no point in loading a full 128 or 256 bit
* vector. To zero-extend more values, we would need to use many shuffle
* instructions and those have a lower throughput than repeated
* 32 bit loads
*/
__m128i positions = _mm_cvtsi32_si128(*(const int*)(bitpositions + i));
__m256i extended = _mm256_cvtepu8_epi64(positions);
__m256i shifted = _mm256_sllv_epi64(ones4, extended);
accum4 = _mm256_or_si256(accum4, shifted);
}
__m128i accum2 = _mm256_castsi256_si128(accum4);
__m128i high2 = _mm256_extracti128_si256(accum4, 1);
accum2 = _mm_or_si128(accum2, high2);
/*
* Until AVX512, there is no single instruction to load 2 byte into a vector
* register. So we don't bother. Instead, the scalar code below will run up
* to 3 times
*/
__m128i high1 = _mm_unpackhi_epi64(accum2, accum2);
accum2 = _mm_or_si128(accum2, high1);
std::uint64_t accum1 = static_cast<std::uint64_t>(_mm_cvtsi128_si64(accum2));
/*
* We use a separate accumulator to avoid the long dependency chain through
* the reduction above
*/
std::uint64_t tail = 0;
/*
* Compilers create a ton of code if we give them a simple loop because they
* think they can vectorize. So we unroll the loop, even if it is stupid
*/
if(i + 2 <= n) {
tail = std::uint64_t(1) << bitpositions[i++];
tail |= std::uint64_t(1) << bitpositions[i++];
}
if(i < n)
tail |= std::uint64_t(1) << bitpositions[i];
return accum1 | tail;
}

Looking for an index of an element in array via SIMD. A fast way

I need to find the index/position of an 8-bit value element N in an array ARR via SIMD. It must be done in a fast fashion.
For now the algorithm is that I'd load the 8-bit values of ARR into one SIMD register and the character code of N into another SIMD register.
Then I'd use negation and check which byte is successful with popcnt.
Is there a faster way?
Saturated operations may be used if needed.
Which instruction set/architecture are you using? That will somewhat impact the 'correct' answer to this question.
in SSE:
#include <immintrin.h>
#include <stdio.h>
int byteIndex(__m128i ARR, __m128i N)
{
__m128i cmp = _mm_cmpeq_epi8(ARR, N);
int mask = _mm_movemask_epi8(cmp);
return _tzcnt_u32(mask);
}
int main()
{
__m128i ARR = _mm_setr_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
// test case that will work
__m128i N = _mm_set1_epi8(3);
printf("%d\n", byteIndex(ARR, N)); ///< prints '3'
// test case that will fail
__m128i F = _mm_set1_epi8(16);
printf("%d\n", byteIndex(ARR, F)); ///< prints '32'
return 1;
}

Rotating (by 90°) a bit matrix (up to 8x8 bits) within a 64-bit integer

I have a bit matrix (of size 6x6, or 7x7, or 8x8) stored within one single 64-bit integer.
I am looking for c++ code that rotates these matrices by 90, 180, 270 degrees, as well as c++ code for shifting (horizontally and vertically) and mirroring these matrices. The output must be again a 64-bit integer.
Using some of the advanced CPU instruction sets would probably be okay, as well as using hash tables or similar techniques - speed is of highest importance, and RAM is available. I will run this on an AMD Ryzen 7 1700 eight-core PC. I am not familiar with these instruction sets (e.g. SSE2), but I have used __popcnt64() and _rotl64() within C++.
Can anybody point me in the right direction? I have written my own code for the 7x7 matrix but I now need the code for 6x6 and 8x8 and wonder whether anybody has published anything on this topic, perhaps in a more clever way than my 7x7 approach.
By the way, the 6x6 and 7x7 matrices are stored in the least significant 36 and 49 bits, respectively, with the remaining bits set to zero.
In principle AVX2 can be quite useful here. For example, to rotate 90 degrees, you can do:
#include <stdio.h>
#include <immintrin.h>
#include <stdint.h>
/* gcc -O3 -Wall -m64 -mfma -mavx2 -march=skylake rot_bit_mat.c */
int print_bitmat(uint64_t k);
uint64_t bitmat_rot_90(uint64_t x){ /* 0xFEDCBA9876543210 */
__m256i mask1 = _mm256_set_epi64x(0x1010101010101010, 0x2020202020202020, 0x4040404040404040, 0x8080808080808080);
__m256i mask2 = _mm256_set_epi64x(0x0101010101010101, 0x0202020202020202, 0x0404040404040404, 0x0808080808080808);
__m256i x_bc = _mm256_set1_epi64x(x); /* Broadcast x */
__m256i r_lo = _mm256_and_si256(x_bc,mask1); /* Extract the right bits within bytes */
r_lo = _mm256_cmpeq_epi8(r_lo,mask1); /* Test if bits within bytes are set */
uint64_t t_lo = (uint32_t)_mm256_movemask_epi8(r_lo); /* Move 32 bytes to a 32 bit mask; the cast avoids sign extension. */
__m256i r_hi = _mm256_and_si256(x_bc,mask2);
r_hi = _mm256_cmpeq_epi8(r_hi,mask2);
uint64_t t_hi = (uint32_t)_mm256_movemask_epi8(r_hi);
return t_lo | (t_hi << 32);
}
int main(int argc, char **argv){
/* 0xFEDCBA9876543210 */
uint64_t k = 0xA49B17E63298D5C3;
print_bitmat(k);
printf("\n");
print_bitmat(bitmat_rot_90(k));
printf("\n\n");
return 0;
}
int print_bitmat(uint64_t k){
uint64_t i,j;
for (i = 0; i < 8; i++){
for (j = 0; j < 8; j++){
printf("%llu",1ull & (k >> (i * 8ull + j)));
}
printf("\n");
}
return 0;
}
The output is:
$ ./a.out
11000011
10101011
00011001
01001100
01100111
11101000
11011001
00100101
11101011
11001000
00011001
01110110
00100010
01001101
10011110
11000110
It is likely that similar techniques can be used for other transformations. Although it may take some time to figure out the right bit masks.
The comments on the question give directions for other transformations:
AVX2 bit reversal of bytes is of interest here, see here
and here. Although the latter answer bit reverses
32 bit ints, while in your case bit reversal of 64 bit ints is relevant; so, it needs some modifications.
The _bswap64() intrinsic can be used to mirror the bit matrix upside down.
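For the 8x8 case, the upside-down mirror really is just a byte swap, since each row occupies one byte; a minimal sketch (for the 6x6/7x7 layouts in the low bits an extra shift would be needed):
#include <stdint.h>
uint64_t bitmat_flip_vertical(uint64_t x){
    return __builtin_bswap64(x);   /* _bswap64(x) with the Intel intrinsics header */
}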

How would you transpose a binary matrix?

I have binary matrices in C++ that I represent with a vector of 8-bit values.
For example, the following matrix:
1 0 1 0 1 0 1
0 1 1 0 0 1 1
0 0 0 1 1 1 1
is represented as:
const uint8_t matrix[] = {
0b01010101,
0b00110011,
0b00001111,
};
The reason why I'm doing it this way is because then computing the product of such a matrix and an 8-bit vector becomes really simple and efficient (just one bitwise AND and a parity computation, per row), which is much better than calculating each bit individually.
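For reference, a minimal sketch of that row-times-vector product (bitwise AND per row, then the parity of the result); __builtin_popcount is a GCC/Clang builtin and the function name is only illustrative:
#include <stdint.h>
/* Multiply the bit-packed matrix (one row per byte) by an 8-bit vector over GF(2). */
uint8_t matvec(const uint8_t *matrix, int rows, uint8_t v){
    uint8_t result = 0;
    for (int r = 0; r < rows; ++r){
        uint8_t bits = matrix[r] & v;                              /* elementwise AND    */
        result |= (uint8_t)((__builtin_popcount(bits) & 1) << r);  /* parity = dot mod 2 */
    }
    return result;
}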
I'm now looking for an efficient way to transpose such a matrix, but I haven't been able to figure out how to do it without having to manually calculate each bit.
Just to clarify, for the above example, I'd like to get the following result from the transposition:
const uint8_t transposed[] = {
0b00000000,
0b00000100,
0b00000010,
0b00000110,
0b00000001,
0b00000101,
0b00000011,
0b00000111,
};
NOTE: I would prefer an algorithm that can calculate this with arbitrary-sized matrices but am also interested in algorithms that can only handle certain sizes.
I've spent more time looking for a solution, and I've found some good ones.
The SSE2 way
On a modern x86 CPU, transposing a binary matrix can be done very efficiently with SSE2 instructions. Using such instructions it is possible to process a 16×8 matrix.
This solution is inspired by this blog post by mischasan and is vastly superior to every suggestion I've got so far to this question.
The idea is simple:
#include <emmintrin.h>
Pack 16 uint8_t variables into an __m128i
Use _mm_movemask_epi8 to get the MSBs of each byte, producing a uint16_t
Use _mm_slli_epi64 to shift the 128-bit register by one
Repeat until you've got all 8 uint16_t values (a sketch of this loop follows below)
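A minimal sketch of those four steps (untested; here bit 7 of a row byte is treated as column 7, so bit r of cols[c] ends up equal to bit c of rows[r]):
#include <emmintrin.h>
#include <stdint.h>
/* Transpose a 16x8 bit block: 16 rows of 8 bits in, 8 columns of 16 bits out. */
void transpose16x8(const uint8_t rows[16], uint16_t cols[8]){
    __m128i x = _mm_loadu_si128((const __m128i *)rows);  /* pack 16 bytes              */
    for (int c = 7; c >= 0; --c){
        cols[c] = (uint16_t)_mm_movemask_epi8(x);        /* MSB of each byte -> column */
        x = _mm_slli_epi64(x, 1);                        /* next bit becomes the MSB   */
    }
}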
A generic 32-bit solution
Unfortunately, I also need to make this work on ARM. After implementing the SSE2 version, it would be easy to just find the NEON equivalents, but the Cortex-M CPU (contrary to the Cortex-A) does not have SIMD capabilities, so NEON isn't too useful for me at the moment.
NOTE: Because the Cortex-M doesn't have native 64-bit arithmetic, I could not use the ideas in any answers that suggest doing it by treating an 8x8 block as a uint64_t. Most microcontrollers that have a Cortex-M CPU also don't have much memory, so I prefer to do all this without a lookup table.
After some thinking, the same algorithm can be implemented using plain 32-bit arithmetic and some clever coding. This way, I can work with 4×8 blocks at a time. It was suggested by a colleague, and the magic lies in the way 32-bit multiplication works: you can find a 32-bit number to multiply with so that the MSBs of the four bytes end up next to each other in the upper 32 bits of the result.
Pack 4 uint8_ts in a 32-bit variable
Mask the 1st bit of each byte (using 0x80808080)
Multiply it with 0x02040810
Take the 4 LSBs of the upper 32 bits of the multiplication
Generally, you can mask the Nth bit in each byte (shift the mask right by N bits) and multiply with the magic number, shifted left by N bits. The advantage here is that if your compiler is smart enough to unroll the loop, both the mask and the 'magic number' become compile-time constants so shifting them does not incur any performance penalty whatsoever. There's some trouble with the last series of 4 bits, because then one LSB is lost, so in that case I needed to shift the input left by 8 bits and use the same method as the first series of 4-bits.
If you do this with two 4×8 blocks, then you can get an 8x8 block done and arrange the resulting bits so that everything goes into the right place.
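A hedged sketch of the core of that trick for one 4x8 block (untested). It extracts one output column at a time; for clarity the shifted magic constant is kept in 64 bits here, which sidesteps the lost-LSB special case mentioned above, and the widening 32x32->64 multiply corresponds to a single UMULL on a Cortex-M3/M4:
#include <stdint.h>
/* rows packs 4 rows of 8 bits, one row per byte.  Returns a nibble whose bit r
   is bit c (0 = LSB ... 7 = MSB) of row r. */
static inline uint32_t column_of_4x8(uint32_t rows, int c){
    uint32_t masked = rows & (0x80808080u >> (7 - c));               /* isolate bit c of each byte  */
    uint64_t prod   = (uint64_t)masked * (0x02040810ull << (7 - c)); /* gather the 4 bits together  */
    return (uint32_t)(prod >> 32) & 0xFu;                            /* 4 LSBs of the upper 32 bits */
}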
My suggestion is that you don't do the transposition; rather, you add one bit of information to your matrix data, indicating whether the matrix is transposed or not.
Now, if you want to multiply a transposed matrix with a vector, it will be the same as multiplying the vector by the matrix from the left (and then transposing). This is easy: just some xor operations on your 8-bit numbers.
This however makes some other operations complicated (e.g. adding two matrices). But in the comment you say that multiplication is exactly what you want to optimize.
Here is the text of Jay Foad's email to me regarding fast Boolean matrix
transpose:
The heart of the Boolean transpose algorithm is a function I'll call transpose8x8 which transposes an 8x8 Boolean matrix packed in a 64-bit word (in row major order from MSB to LSB). To transpose any rectangular matrix whose width and height are multiples of 8, break it down into 8x8 blocks, transpose each one individually and store them at the appropriate place in the output. To load an 8x8 block you have to load 8 individual bytes and shift and OR them into a 64-bit word. Same kinda thing for storing.
A plain C implementation of transpose8x8 relies on the fact that all the bits on any diagonal line parallel to the leading diagonal move the same distance up/down and left/right. For example, all the bits just above the leading diagonal have to move one place left and one place down, i.e. 7 bits to the right in the packed 64-bit word. This leads to an algorithm like this:
transpose8x8(word) {
return
(word & 0x0100000000000000) >> 49 // top right corner
| (word & 0x0201000000000000) >> 42
| ...
| (word & 0x4020100804020100) >> 7 // just above diagonal
| (word & 0x8040201008040201) // leading diagonal
| (word & 0x0080402010080402) << 7 // just below diagonal
| ...
| (word & 0x0000000000008040) << 42
| (word & 0x0000000000000080) << 49; // bottom left corner
}
This runs about 10x faster than the previous implementation, which copied each bit individually from the source byte in memory and merged it into the destination byte in memory.
Alternatively, if you have PDEP and PEXT instructions you can implement a perfect shuffle, and use that to do the transpose as mentioned in Hacker's Delight. This is significantly faster (but I don't have timings handy):
shuffle(word) {
return pdep(word >> 32, 0xaaaaaaaaaaaaaaaa) | pdep(word, 0x5555555555555555);
} // outer perfect shuffle
transpose8x8(word) { return shuffle(shuffle(shuffle(word))); }
POWER's vgbbd instruction effectively implements the whole of transpose8x8 in a single instruction (and since it's a 128-bit vector instruction it does it twice, independently, on the low 64 bits and the high 64 bits). This gave about 15% speed-up over the plain C implementation. (Only 15% because, although the bit twiddling is much faster, the overall run time is now dominated by the time it takes to load 8 bytes and assemble them into the argument to transpose8x8, and to take the result and store it as 8 separate bytes.)
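For completeness, the perfect-shuffle version above maps directly onto the BMI2 intrinsics on x86; a minimal sketch (untested):
#include <immintrin.h>
#include <stdint.h>
static inline uint64_t outer_shuffle(uint64_t w){        /* outer perfect shuffle        */
    return _pdep_u64(w >> 32, 0xAAAAAAAAAAAAAAAAull)
         | _pdep_u64(w,       0x5555555555555555ull);
}
static inline uint64_t transpose8x8_bmi2(uint64_t w){    /* three shuffles = transpose   */
    return outer_shuffle(outer_shuffle(outer_shuffle(w)));
}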
My suggestion would be to use a lookup table to speed up the processing.
Another thing to note is with the current definition of your matrix the maximum size will be 8x8 bits. This fits into a uint64_t so we can use this to our advantage especially when using a 64-bit platform.
I have worked out a simple example using a lookup table which you can find below and run using: http://www.tutorialspoint.com/compile_cpp11_online.php online compiler.
Example code
#include <iostream>
#include <bitset>
#include <stdint.h>
#include <assert.h>
using std::cout;
using std::endl;
using std::bitset;
/* Static lookup table */
static uint64_t lut[256];
/* Helper function to print array */
template<int N>
void print_arr(const uint8_t (&arr)[N]){
for(int i=0; i < N; ++i){
cout << bitset<8>(arr[i]) << endl;
}
}
/* Transpose function */
template<int N>
void transpose_bitmatrix(const uint8_t (&matrix)[N], uint8_t (&transposed)[8]){
assert(N <= 8);
uint64_t value = 0;
for(int i=0; i < N; ++i){
value = (value << 1) + lut[matrix[i]];
}
/* Ensure safe copy to prevent misalignment issues */
/* Can be removed if input array can be treated as uint64_t directly */
for(int i=0; i < 8; ++i){
transposed[i] = (value >> (i * 8)) & 0xFF;
}
}
/* Calculate lookup table */
void calculate_lut(void){
/* For all byte values */
for(uint64_t i = 0; i < 256; ++i){
auto b = std::bitset<8>(i);
auto v = std::bitset<64>(0);
/* For all bits in current byte */
for(int bit=0; bit < 8; ++bit){
if(b.test(bit)){
v.set((7 - bit) * 8);
}
}
lut[i] = v.to_ullong();
}
}
int main()
{
calculate_lut();
const uint8_t matrix[] = {
0b01010101,
0b00110011,
0b00001111,
};
uint8_t transposed[8];
transpose_bitmatrix(matrix, transposed);
print_arr(transposed);
return 0;
}
How it works
Your 3x8 matrix will be transposed to an 8x3 matrix, represented in an 8x8 array.
The issue is that you want to convert bits from your "horizontal" representation to a vertical one, divided over several bytes.
As I mentioned above, the output (8x8) will always fit into a uint64_t. We will use this to our advantage because now we can use a uint64_t to write the 8-byte array, but we can also use it to add, xor, etc., because we can perform basic arithmetic operations on a 64-bit integer.
Each entry in your 3x8 matrix (input) is 8 bits wide. To optimize processing we first generate a 256-entry lookup table (one entry per byte value). The entry itself is a uint64_t and contains a rotated version of the bits.
example:
byte = 0b01001111 = 0x4F
lut[0x4F] = 0x0101010100000100 = (uint8_t[]){ 0, 1, 0, 0, 1, 1, 1, 1 }
Now for the calculation:
For the calculations we use the uint64_t, but keep in mind that under the hood it represents a uint8_t[8] array. We simply shift the current value (starting with 0), look up our first byte and add it to the current value.
The 'magic' here is that each byte of the uint64_t in the lookup table is either 1 or 0, so it only sets the least significant bit of each byte. Shifting the uint64_t shifts each byte, and as long as we make sure we do not do this more than 8 times, we can do operations on each byte individually.
Issues
As someone noted in the comments: Transpose(Transpose(M)) != M, so if you need this you need some additional work.
Performance can be improved by directly mapping uint64_t's instead of uint8_t[8] arrays, since that omits the "safe copy" that prevents alignment issues.
I have added a new answer instead of editing my original one to make this more visible (no comment rights unfortunately).
In your own answer you add an additional requirement not present in the first one: it has to work on an ARM Cortex-M.
I did come up with an alternative solution for ARM in my original answer but omitted it as it was not part of the question and seemed off topic (mostly because of the C++ tag).
ARM Specific solution Cortex-M:
Some or most Cortex-M 3/4 have a bit banding region which can be used for exactly what you need, it expands bits into 32-bit fields, this region can be used to perform atomic bit operations.
If you put your array in a bitbanded region it will have an 'exploded' mirror in the bitband region where you can just use move operations on the bits itself. If you make a loop the compiler will surely be able to unroll and optimize to just move operations.
If you really want to, you can even setup a DMA controller to process an entire batch of transpose operations with a bit of effort and offload it entirely from the cpu :)
Perhaps this might still help you.
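A hedged sketch of what that could look like on a Cortex-M3/M4 (untested; it assumes both arrays live in the bit-banded SRAM region starting at 0x20000000, whose alias region starts at 0x22000000 on typical parts):
#include <stdint.h>
/* Word-sized alias of bit 'bit' of the byte at 'addr' (SRAM bit-band region). */
#define BITBAND_SRAM(addr, bit) \
    (*(volatile uint32_t *)(0x22000000u + (((uint32_t)(uintptr_t)(addr) - 0x20000000u) * 32u) + ((bit) * 4u)))
/* Transpose an 8x8 bit matrix, one bit-band access per bit. */
static void transpose8x8_bitband(volatile uint8_t src[8], volatile uint8_t dst[8]){
    for (int r = 0; r < 8; ++r)
        for (int c = 0; c < 8; ++c)
            BITBAND_SRAM(&dst[c], r) = BITBAND_SRAM(&src[r], c);
}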
This is a bit late, but I just stumbled across this interchange today.
If you look at Hacker's Delight, 2nd Edition, there are several algorithms for efficiently transposing Boolean arrays, starting on page 141.
They are quite efficient: a colleague of mine obtained a speedup of about 10X compared to naive coding, on an x86.
Here's what I posted on github (mischasan/sse2/ssebmx.src).
Changing INP() and OUT() to use induction vars saves an IMUL each.
AVX256 does it twice as fast.
AVX512 is not an option, because there is no _mm512_movemask_epi8().
#include <stdint.h>
#include <emmintrin.h>
#define INP(x,y) inp[(x)*ncols/8 + (y)/8]
#define OUT(x,y) out[(y)*nrows/8 + (x)/8]
void ssebmx(char const *inp, char *out, int nrows, int ncols)
{
int rr, cc, i, h;
union { __m128i x; uint8_t b[16]; } tmp;
// Do the main body in [16 x 8] blocks:
for (rr = 0; rr <= nrows - 16; rr += 16)
for (cc = 0; cc < ncols; cc += 8) {
for (i = 0; i < 16; ++i)
tmp.b[i] = INP(rr + i, cc);
for (i = 8; i--; tmp.x = _mm_slli_epi64(tmp.x, 1))
*(uint16_t*)&OUT(rr, cc + i) = _mm_movemask_epi8(tmp.x);
}
if (rr == nrows) return;
// The remainder is a row of [8 x 16]* [8 x 8]?
// Do the [8 x 16] blocks:
for (cc = 0; cc <= ncols - 16; cc += 16) {
for (i = 8; i--;)
tmp.b[i] = h = *(uint16_t const*)&INP(rr + i, cc),
tmp.b[i + 8] = h >> 8;
for (i = 8; i--; tmp.x = _mm_slli_epi64(tmp.x, 1))
OUT(rr, cc + i) = h = _mm_movemask_epi8(tmp.x),
OUT(rr, cc + i + 8) = h >> 8;
}
if (cc == ncols) return;
// Do the remaining [8 x 8] block:
for (i = 8; i--;)
tmp.b[i] = INP(rr + i, cc);
for (i = 8; i--; tmp.x = _mm_slli_epi64(tmp.x, 1))
OUT(rr, cc + i) = _mm_movemask_epi8(tmp.x);
}
HTH.
Inspired by Robert's answer, polynomial multiplication in Arm Neon can be utilised to scatter the bits --
inline poly8x16_t mull_lo(poly8x16_t a) {
auto b = vget_low_p8(a);
return vreinterpretq_p8_p16(vmull_p8(b,b));
}
inline poly8x16_t mull_hi(poly8x16_t a) {
auto b = vget_high_p8(a);
return vreinterpretq_p8_p16(vmull_p8(b,b));
}
auto a = mull_lo(word);
auto b = mull_lo(a), c = mull_hi(a);
auto d = mull_lo(b), e = mull_hi(b);
auto f = mull_lo(c), g = mull_hi(c);
Then the vsli can be used to combine the bits pairwise.
auto ab = vsli_p8(vget_high_p8(d), vget_low_p8(d), 1);
auto cd = vsli_p8(vget_high_p8(e), vget_low_p8(e), 1);
auto ef = vsli_p8(vget_high_p8(f), vget_low_p8(f), 1);
auto gh = vsli_p8(vget_high_p8(g), vget_low_p8(g), 1);
auto abcd = vsli_p8(ab, cd, 2);
auto efgh = vsli_p8(ef, gh, 2);
return vsli_p8(abcd, efgh, 4);
Clang optimizes this code to avoid vmull2 instructions, making heavy use of ext q0,q0,#8 for vget_high_p8.
An iterative approach would possibly be not only faster, but would also use fewer registers and also SIMDify for 2x or more throughput.
// transpose bits in 2x2 blocks, first 4 rows
// x = a b|c d|e f|g h a i|c k|e m|g o | byte 0
// i j|k l|m n|o p b j|d l|f n|h p | byte 1
// q r|s t|u v|w x q A|s C|u E|w G | byte 2
// A B|C D|E F|G H              r B|t D|v F|x H   | byte 3 ...
// ----------------------
auto a = (x & 0x00aa00aa00aa00aaull);
auto b = (x & 0x5500550055005500ull);
auto c = (x & 0xaa55aa55aa55aa55ull) | (a << 7) | (b >> 7);
// transpose 2x2 blocks (first 4 rows shown)
// aa bb cc dd aa ii cc kk
// ee ff gg hh -> ee mm gg oo
// ii jj kk ll bb jj dd ll
// mm nn oo pp ff nn hh pp
auto d = (c & 0x0000cccc0000ccccull);
auto e = (c & 0x3333000033330000ull);
auto f = (c & 0xcccc3333cccc3333ull) | (d << 14) | (e >> 14);
// Final transpose of 4x4 bit blocks
auto g = (f & 0x00000000f0f0f0f0ull);
auto h = (f & 0x0f0f0f0f00000000ull);
x = (f & 0xf0f0f0f00f0f0f0full) | (g << 28) | (h >> 28);
In ARM each step can now be composed with 3 instructions:
auto tmp = vrev16_u8(x);
tmp = vshl_u8(tmp, plus_minus_1); // 0xff01ff01ff01ff01ull
x = vbsl_u8(mask_1, x, tmp); // 0xaa55aa55aa55aa55ull
tmp = vrev32_u16(x);
tmp = vshl_u16(tmp, plus_minus_2); // 0xfefe0202fefe0202ull
x = vbsl_u8(mask_2, x, tmp); // 0xcccc3333cccc3333ull
tmp = vrev64_u32(x);
tmp = vshl_u32(tmp, plus_minus_4); // 0xfcfcfcfc04040404ull
x = vbsl_u8(mask_4, x, tmp); // 0xf0f0f0f00f0f0f0full

Extract set bytes position from SIMD vector

I run a bench of computations using SIMD instructions. These instructions return a vector of 16 bytes as result, named compare, with each byte being 0x00 or 0xff:
index:      0    1    2    3    4    5    6    7   ...   14   15
compare: 0x00 0x00 0x00 0x00 0xff 0x00 0x00 0x00  ...  0x00 0xff
Bytes set to 0xff mean I need to run the function do_operation(i) with i being the position of the byte.
For instance, the above compare vector mean, I need to run this sequence of operations :
do_operation(4);
do_operation(15);
Here is the fastest solution I came up with until now :
for(...) {
//
// SIMD computations
//
__m128i compare = ... // Result of SIMD computations
// Extract high and low quadwords for compare vector
std::uint64_t cmp_low = (_mm_cvtsi128_si64(compare));
std::uint64_t cmp_high = (_mm_extract_epi64(compare, 1));
// Process low quadword
if (cmp_low) {
const std::uint64_t low_possible_positions = 0x0706050403020100;
const std::uint64_t match_positions = _pext_u64(
low_possible_positions, cmp_low);
const int match_count = _popcnt64(cmp_low) / 8;
const std::uint8_t* match_pos_array =
reinterpret_cast<const std::uint8_t*>(&match_positions);
for (int i = 0; i < match_count; ++i) {
do_operation(match_pos_array[i]);
}
}
// Process high quadword (similarly)
if (cmp_high) {
const std::uint64_t high_possible_positions = 0x0f0e0d0c0b0a0908;
const std::uint64_t match_positions = _pext_u64(
high_possible_positions, cmp_high);
const int match_count = _popcnt64(cmp_high) / 8;
const std::uint8_t* match_pos_array =
reinterpret_cast<const std::uint8_t*>(&match_positions);
for(int i = 0; i < match_count; ++i) {
do_operation(match_pos_array[i]);
}
}
}
I start with extracting the first and second 64 bits integers of the 128 bits vector (cmp_low and cmp_high). Then I use popcount to compute the number of bytes set to 0xff (number of bits set to 1 divided by 8). Finally, I use pext to get positions, without zeros, like this :
0x0706050403020100
0x000000ff00ff0000
|
PEXT
|
0x0000000000000402
I would like to find a faster solution to extract the positions of the bytes set to 0xff in the compare vector. More precisely, there are very often only 0, 1 or 2 bytes set to 0xff in the compare vector and I would like to use this information to avoid some branches.
Here's a quick outline of how you could reduce the number of tests (a sketch follows below):
First use a function to project all the lsbs or msbs of each byte of your 128-bit integer into a 16-bit value (for instance, there's an SSE2 instruction for that on x86 CPUs: pmovmskb, which is exposed on Intel and MS compilers as the _mm_movemask_epi8 intrinsic; gcc also has a builtin: __builtin_ia32_pmovmskb128);
Then split that value in 4 nibbles;
define functions to handle each possible value of a nibble (from 0 to 15) and put these in an array;
Finally call the function indexed by each nibble (with extra parameters to indicate which nibble in the 16 bits it is).
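A minimal sketch of that dispatch (untested; do_operation and the handler names are placeholders, and each handler simply enumerates the set bits of one nibble of the movemask):
#include <emmintrin.h>
#include <stdio.h>
static void do_operation(int i){ printf("i = %d\n", i); }   /* placeholder */
typedef void (*nibble_handler)(int base);                   /* base = bit offset of the nibble */
static void h0(int b){ (void)b; }
static void h1(int b){ do_operation(b); }
static void h2(int b){ do_operation(b+1); }
static void h3(int b){ do_operation(b); do_operation(b+1); }
static void h4(int b){ do_operation(b+2); }
static void h5(int b){ do_operation(b); do_operation(b+2); }
static void h6(int b){ do_operation(b+1); do_operation(b+2); }
static void h7(int b){ do_operation(b); do_operation(b+1); do_operation(b+2); }
static void h8(int b){ do_operation(b+3); }
static void h9(int b){ do_operation(b); do_operation(b+3); }
static void h10(int b){ do_operation(b+1); do_operation(b+3); }
static void h11(int b){ do_operation(b); do_operation(b+1); do_operation(b+3); }
static void h12(int b){ do_operation(b+2); do_operation(b+3); }
static void h13(int b){ do_operation(b); do_operation(b+2); do_operation(b+3); }
static void h14(int b){ do_operation(b+1); do_operation(b+2); do_operation(b+3); }
static void h15(int b){ do_operation(b); do_operation(b+1); do_operation(b+2); do_operation(b+3); }
static const nibble_handler handlers[16] = { h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11,h12,h13,h14,h15 };
void process(__m128i compare){
    int mask = _mm_movemask_epi8(compare);   /* one bit per 0xff byte          */
    for (int n = 0; n < 4; ++n)              /* dispatch each of the 4 nibbles */
        handlers[(mask >> (4 * n)) & 0xF](4 * n);
}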
Since in your case very often only 0, 1 or 2 bytes are set to 0xff in the compare vector, a short
while-loop on the bitmask might be more efficient than a solution based on the pext
instruction. See also my answer on a similar question.
/*
gcc -O3 -Wall -m64 -mavx2 -march=broadwell esbsimd.c
*/
#include <stdio.h>
#include <immintrin.h>
int do_operation(int i){ /* some arbitrary do_operation() */
printf("i = %d\n",i);
return 0;
}
int main(){
__m128i compare = _mm_set_epi8(0xFF,0,0,0, 0,0,0,0, 0,0,0,0xFF, 0,0,0,0); /* Take some random value for compare */
int k = _mm_movemask_epi8(compare);
while (k){
int i=_tzcnt_u32(k); /* Count the number of trailing zero bits in k. BMI1 instruction set, Haswell or newer. */
do_operation(i);
k=_blsr_u32(k); /* Clear the lowest set bit in k. */
}
return 0;
}
/*
Output:
i = 4
i = 15
*/