packing 10 bit values into a byte stream with SIMD [duplicate] - c++

This question already has answers here:
Keep only the 10 useful bits in 16-bit words
(2 answers)
Closed 2 years ago.
I'm trying to pack 10-bit pixels into a contiguous byte stream, using SIMD instructions. The code below does it "in principle", but the SIMD version is slower than the scalar version.
The problem seems to be that I can't find good gather/scatter operations that load the registers efficiently.
Any suggestions for improvement?
// SIMD_test.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include "Windows.h"
#include <tmmintrin.h>
#include <stdint.h>
#include <string.h>
// reference non-SIMD implementation that "works"
// 4 uint16 at a time as input, and 5 uint8 as output per loop iteration
void packSlow(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
for(uint32_t j=0;j<NCOL;j+=4)
{
streamBuffer[0] = (uint8_t)(ptr[0]);
streamBuffer[1] = (uint8_t)(((ptr[0]&0x3FF)>>8) | ((ptr[1]&0x3F) <<2));
streamBuffer[2] = (uint8_t)(((ptr[1]&0x3FF)>>6) | ((ptr[2]&0x0F) <<4));
streamBuffer[3] = (uint8_t)(((ptr[2]&0x3FF)>>4) | ((ptr[3]&0x03) <<6));
streamBuffer[4] = (uint8_t)((ptr[3]&0x3FF)>>2) ;
streamBuffer += 5;
ptr += 4;
}
}
// poorly written SIMD implementation. Attempts to do the same
// as the packSlow, but 8 iterations at a time
void packFast(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
const __m128i maskb = _mm_set_epi16(0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F);
const __m128i maskc = _mm_set_epi16(0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F,0x0F);
const __m128i maskd = _mm_set_epi16(0x03,0x03,0x03,0x03,0x03,0x03,0x03,0x03);
for(uint32_t j=0;j<NCOL;j+=4*8)
{
_mm_prefetch((const char*)(ptr+j),_MM_HINT_T0);
}
for(uint32_t j=0;j<NCOL;j+=4*8)
{
// this "fetch" stage is costly. Each term takes 2 cycles
__m128i ptr0 = _mm_set_epi16(ptr[0],ptr[4],ptr[8],ptr[12],ptr[16],ptr[20],ptr[24],ptr[28]);
__m128i ptr1 = _mm_set_epi16(ptr[1],ptr[5],ptr[9],ptr[13],ptr[17],ptr[21],ptr[25],ptr[29]);
__m128i ptr2 = _mm_set_epi16(ptr[2],ptr[6],ptr[10],ptr[14],ptr[18],ptr[22],ptr[26],ptr[30]);
__m128i ptr3 = _mm_set_epi16(ptr[3],ptr[7],ptr[11],ptr[15],ptr[19],ptr[23],ptr[27],ptr[31]);
// I think this part is fairly well optimized
__m128i streamBuffer0 = ptr0;
__m128i streamBuffer1 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr0 , maska), _mm_set_epi32(0, 0, 0,8)) , _mm_sll_epi16 (_mm_and_si128 (ptr1 , maskb) , _mm_set_epi32(0, 0, 0,2)));
__m128i streamBuffer2 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr1 , maska), _mm_set_epi32(0, 0, 0,6)) , _mm_sll_epi16 (_mm_and_si128 (ptr2 , maskc) , _mm_set_epi32(0, 0, 0,4)));
__m128i streamBuffer3 = _mm_or_si128(_mm_srl_epi16 (_mm_and_si128 (ptr2 , maska), _mm_set_epi32(0, 0, 0,4)) , _mm_sll_epi16 (_mm_and_si128 (ptr3 , maskd) , _mm_set_epi32(0, 0, 0,6)));
__m128i streamBuffer4 = _mm_srl_epi16 (_mm_and_si128 (ptr3 , maska), _mm_set_epi32(0, 0, 0,2)) ;
// this again is terribly slow. ~2 cycles per byte output
for(int j=15;j>=0;j-=2)
{
streamBuffer[0] = streamBuffer0.m128i_u8[j];
streamBuffer[1] = streamBuffer1.m128i_u8[j];
streamBuffer[2] = streamBuffer2.m128i_u8[j];
streamBuffer[3] = streamBuffer3.m128i_u8[j];
streamBuffer[4] = streamBuffer4.m128i_u8[j];
streamBuffer += 5;
}
ptr += 32;
}
}
int _tmain(int argc, _TCHAR* argv[])
{
uint16_t pixels[512];
uint8_t packed1[512*10/8];
uint8_t packed2[512*10/8];
for(int i=0;i<512;i++)
{
pixels[i] = i;
}
LARGE_INTEGER t0,t1,t2;
QueryPerformanceCounter(&t0);
for(int k=0;k<1000;k++) packSlow(pixels,packed1,512);
QueryPerformanceCounter(&t1);
for(int k=0;k<1000;k++) packFast(pixels,packed2,512);
QueryPerformanceCounter(&t2);
printf("%d %d\n",t1.QuadPart-t0.QuadPart,t2.QuadPart-t1.QuadPart);
if (memcmp(packed1,packed2,sizeof(packed1)))
{
printf("failed\n");
}
return 0;
}

On re-reading your code, it looks like you are almost definitely murdering your load/store unit, which wouldn't even get complete relief with the new AVX2 VGATHER[D/Q]P[D/S] instruction family. Even Haswell's architecture still requires a uop per load element, each hitting the L1D TLB and cache, regardless of locality, with efficiency improvements showing in Skylake ca. 2016 at earliest.
Your best recourse at present is probably to do 16B register reads and manually construct your streamBuffer values with register copies, _mm_shuffle_epi8(), and _mm_or_si128() calls, and the inverse for the finishing stores.
In the near future, AVX2 will provide (and newer desktop parts already do) the VPS[LL/RL/RA]V[D/Q] instructions, which allow variable per-element shifting; combined with a horizontal add, that could do this packing pretty quickly. In that case, you could use simple MOVDQU instructions for loading your values, since you could process contiguous uint16_t input values in a single xmm register.
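As a rough sketch of that building block (my own illustration, assuming AVX2; this is not a complete packer, and the helper name is made up): four 10-bit samples are widened to 64-bit lanes and moved to their packed bit offsets with a single variable shift, after which the four lanes still have to be OR-ed together (the "horizontal" step).
#include <immintrin.h>
// Hypothetical helper: place samples 0..3 at bit offsets 0, 10, 20, 30 of their own 64-bit lanes.
static inline __m256i position_four_samples(__m128i four_u16) // 4 samples in the low 64 bits
{
    __m256i lanes = _mm256_cvtepu16_epi64(four_u16);                   // widen to 4 x 64-bit lanes
    lanes = _mm256_and_si256(lanes, _mm256_set1_epi64x(0x3FF));        // keep 10 bits per sample
    return _mm256_sllv_epi64(lanes, _mm256_set_epi64x(30, 20, 10, 0)); // VPSLLVQ: per-lane shift
}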
Also, consider reworking your prefetching. Your loop over NCOL processes 64 B (one cache line) per iteration, so you should probably do a single prefetch of ptr + 32 at the beginning of your second loop's body. You might even consider omitting it, since this is a simple forward scan that the hardware prefetcher will detect and handle for you after a very small number of iterations anyway.
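For illustration, a minimal sketch of that prefetch placement (my own; pack_block32() is a hypothetical stand-in for the body of packFast's main loop):
void pack_block32(const uint16_t* ptr, uint8_t* out); // hypothetical: packs ptr[0..31] into 40 bytes
void packFastPrefetch(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
    for (uint32_t j = 0; j < NCOL; j += 4 * 8)
    {
        _mm_prefetch((const char*)(ptr + 32), _MM_HINT_T0); // fetch the next line while this one is packed
        pack_block32(ptr, streamBuffer);
        ptr += 32;
        streamBuffer += 40;
    }
}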

I have no specific experience with SSE, but I would try to optimize the code as follows.
// warning. This routine requires streamBuffer to have at least 3 extra spare bytes
// at the end to be used as scratch space. It will write 0's to those bytes.
// for example, streamBuffer needs to be 640+3 bytes of allocated memory if
// 512 10-bit samples are output.
void packSlow1(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
for(uint32_t j=0;j<NCOL;j+=4*4)
{
uint64_t *dst;
uint64_t src[4][4];
// __m128i s01 = _mm_set_epi64(ptr[0], ptr[1]);
// __m128i s23 = _mm_set_epi64(ptr[2], ptr[3]);
// ---- or ----
// __m128i s0123 = _mm_load_si128(ptr[0])
// __m128i s01 = _?????_(s0123) // some instruction to extract s01 from s0123
// __m128i s23 = _?????_(s0123) // some instruction to extract s23
src[0][0] = ptr[0] & 0x3ff;
src[0][1] = ptr[1] & 0x3ff;
src[0][2] = ptr[2] & 0x3ff;
src[0][3] = ptr[3] & 0x3ff;
src[1][0] = ptr[4] & 0x3ff;
src[1][1] = ptr[5] & 0x3ff;
src[1][2] = ptr[6] & 0x3ff;
src[1][3] = ptr[7] & 0x3ff;
src[2][0] = ptr[8] & 0x3ff;
src[2][1] = ptr[9] & 0x3ff;
src[2][2] = ptr[10] & 0x3ff;
src[2][3] = ptr[11] & 0x3ff;
src[3][0] = ptr[12] & 0x3ff;
src[3][1] = ptr[13] & 0x3ff;
src[3][2] = ptr[14] & 0x3ff;
src[3][3] = ptr[15] & 0x3ff;
// looks like _mm_maskmoveu_si128 can store result efficiently
dst = (uint64_t*)streamBuffer;
dst[0] = src[0][0] | (src[0][1] << 10) | (src[0][2] << 20) | (src[0][3] << 30);
dst = (uint64_t*)(streamBuffer + 5);
dst[0] = src[1][0] | (src[1][1] << 10) | (src[1][2] << 20) | (src[1][3] << 30);
dst = (uint64_t*)(streamBuffer + 10);
dst[0] = src[2][0] | (src[2][1] << 10) | (src[2][2] << 20) | (src[2][3] << 30);
dst = (uint64_t*)(streamBuffer + 15);
dst[0] = src[3][0] | (src[3][1] << 10) | (src[3][2] << 20) | (src[3][3] << 30);
streamBuffer += 5 * 4;
ptr += 4 * 4;
}
}
UPDATE:
Benchmarks:
Ubuntu 12.04, x86_64 GNU/Linux, gcc v4.6.3 (Virtual Box)
Intel Core i7 (Macbook pro)
compiled with -O3
5717633386 (1X): packSlow
3868744491 (1.4X): packSlow1 (version from the post)
4471858853 (1.2X): packFast2 (from Mark Lakata's post)
1820784764 (3.1X): packFast3 (version from the post)
Windows 8.1, x64, VS2012 Express
Intel Core i5 (Asus)
compiled with standard 'Release' options and SSE2 enabled
00413185 (1X) packSlow
00782005 (0.5X) packSlow1
00236639 (1.7X) packFast2
00148906 (2.8X) packFast3
I see completely different results on an Asus notebook with Windows 8.1 and VS Express 2012 (code compiled with -O2). packSlow1 is 2x slower than the original packSlow, while packFast2 is only 1.7X (not 2.9X) faster than packSlow. After investigating the problem, I understood the reason: the VC compiler was unable to keep all the constants for packFast2 in XMM registers, so it inserted additional memory accesses into the loop (see the generated assembly). These slow memory accesses explain the performance degradation.
In order to get more stable results I increased the pixels buffer to 256x512 and increased the loop counter from 1000 to 10000000/256.
Here is my version of SSE optimized function.
// warning. This routine requires streamBuffer to have at least 6 extra spare bytes
// at the end to be used as scratch space, because each 16-byte store only advances
// the output pointer by 10. It will write 0's to those bytes. For example, streamBuffer
// needs to be 640+6 bytes of allocated memory if 512 10-bit samples are output.
void packFast3(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
const __m128i m0 = _mm_set_epi16(0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF);
const __m128i m1 = _mm_set_epi16(0x3FF, 0, 0x3FF, 0, 0x3FF, 0, 0x3FF, 0);
const __m128i m2 = _mm_set_epi32(0, 0xFFFFFFFF, 0, 0xFFFFFFFF);
const __m128i m3 = _mm_set_epi32(0xFFFFFFFF, 0, 0xFFFFFFFF, 0);
const __m128i m4 = _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
const __m128i m5 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
__m128i s0, t0, r0, x0, x1;
// unrolled and normal loop gives the same result
for(uint32_t j=0;j<NCOL;j+=8)
{
// load 8 samples into s0
s0 = _mm_loadu_si128((__m128i*)ptr); // s0=00070006_00050004_00030002_00010000
// join 16-bit samples into 32-bit words
x0 = _mm_and_si128(s0, m0); // x0=00000006_00000004_00000002_00000000
x1 = _mm_and_si128(s0, m1); // x1=00070000_00050000_00030000_00010000
t0 = _mm_or_si128(x0, _mm_srli_epi32(x1, 6)); // t0=00001c06_00001404_00000c02_00000400
// join 32-bit words into 64-bit dwords
x0 = _mm_and_si128(t0, m2); // x0=00000000_00001404_00000000_00000400
x1 = _mm_and_si128(t0, m3); // x1=00001c06_00000000_00000c02_00000000
t0 = _mm_or_si128(x0, _mm_srli_epi64(x1, 12)); // t0=00000001_c0601404_00000000_c0200400
// join 64-bit dwords
x0 = _mm_and_si128(t0, m4); // x0=00000000_00000000_00000000_c0200400
x1 = _mm_and_si128(t0, m5); // x1=00000001_c0601404_00000000_00000000
r0 = _mm_or_si128(x0, _mm_srli_si128(x1, 3)); // r0=00000000_000001c0_60140400_c0200400
// and store result
_mm_storeu_si128((__m128i*)streamBuffer, r0);
streamBuffer += 10;
ptr += 8;
}
}
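A usage sketch (my own) that satisfies the buffer requirement noted above:
uint16_t pixels[512];
uint8_t packed[512*10/8 + 6]; // spare bytes absorb the tail of the final 16-byte store
packFast3(pixels, packed, 512);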

I came up with a "better" solution using SIMD, but it doesn't leverage parallelization much; it just uses more efficient loads and stores (I think).
I'm posting it here for reference; it's not necessarily the best answer.
The benchmarks are (in arbitrary ticks)
gcc 4.8.1 -O3    VS2012 /O2     Implementation
----------------------------------------------
369 (1X)         3394 (1X)      packSlow (original code)
212 (1.7X)       2010 (1.7X)    packSlow1 (from @alexander)
147 (2.5X)       1178 (2.9X)    packFast2 (below)
Here's the code. It is essentially @alexander's code, except using 128-bit registers instead of 64-bit registers, and unrolled 2x instead of 4x.
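Note that, like packSlow1 and packFast3 above, the unaligned 16-byte stores below run past the last 10 valid output bytes, so streamBuffer needs a few spare bytes at the end (6 by my count, i.e. 640+6 for 512 samples).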
void packFast2(uint16_t* ptr, uint8_t* streamBuffer, uint32_t NCOL)
{
const __m128i maska = _mm_set_epi16(0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF,0x3FF);
const __m128i mask0 = _mm_set_epi16(0,0,0,0,0,0,0,0x3FF);
const __m128i mask1 = _mm_set_epi16(0,0,0,0,0,0,0x3FF,0);
const __m128i mask2 = _mm_set_epi16(0,0,0,0,0,0x3FF,0,0);
const __m128i mask3 = _mm_set_epi16(0,0,0,0,0x3FF,0,0,0);
const __m128i mask4 = _mm_set_epi16(0,0,0,0x3FF,0,0,0,0);
const __m128i mask5 = _mm_set_epi16(0,0,0x3FF,0,0,0,0,0);
const __m128i mask6 = _mm_set_epi16(0,0x3FF,0,0,0,0,0,0);
const __m128i mask7 = _mm_set_epi16(0x3FF,0,0,0,0,0,0,0);
for(uint32_t j=0;j<NCOL;j+=16)
{
__m128i s = _mm_load_si128((__m128i*)ptr); // load 8 16 bit values
__m128i s2 = _mm_load_si128((__m128i*)(ptr+8)); // load 8 16 bit values
__m128i a = _mm_and_si128(s,mask0);
a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask1),6));
a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask2),12));
a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask3),18));
a = _mm_or_si128( a, _mm_srli_si128 (_mm_and_si128(s, mask4),24/8)); // special: shift 24 bits right, straddling the middle of the register; luckily this is a whole-byte shift (24/8 = 3 bytes)
a = _mm_or_si128( a, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s, mask5),6),24/8)); // special: net shift of 30 bits; first shift 6 bits, then 3 bytes
a = _mm_or_si128( a, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s, mask6),4),32/8)); // special: net shift of 36 bits; first shift 4 bits, then 4 bytes (32 bits)
a = _mm_or_si128( a, _mm_srli_epi64 (_mm_and_si128(s, mask7),42));
_mm_storeu_si128((__m128i*)streamBuffer, a);
__m128i a2 = _mm_and_si128(s2,mask0);
a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask1),6));
a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask2),12));
a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask3),18));
a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_and_si128(s2, mask4),24/8)); // special: shift 24 bits right, straddling the middle of the register; luckily this is a whole-byte shift (24/8 = 3 bytes)
a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s2, mask5),6),24/8)); // special: net shift of 30 bits; first shift 6 bits, then 3 bytes
a2 = _mm_or_si128( a2, _mm_srli_si128 (_mm_srli_epi64 (_mm_and_si128(s2, mask6),4),32/8)); // special: net shift of 36 bits; first shift 4 bits, then 4 bytes (32 bits)
a2 = _mm_or_si128( a2, _mm_srli_epi64 (_mm_and_si128(s2, mask7),42));
_mm_storeu_si128((__m128i*)(streamBuffer+10), a2);
streamBuffer += 20 ;
ptr += 16 ;
}
}

Related

AVX2 expand contiguous elements to a sparse vector based on a condition? (like AVX512 VPEXPANDD)

Does anyone know how to vectorize the following code?
uint32_t r[8];
uint16_t* ptr;
for (int j = 0; j < 8; ++j)
if (r[j] < C)
r[j] = *(ptr++);
It's basically a masked gather operation, and the auto-vectorizer can't deal with it. If ptr were a uint32_t* it should be directly realizable with _mm256_mask_i32gather_epi32. But even then, how do you generate the correct index vector? And wouldn't it be faster to just use a packed load and shuffle the result anyway (requiring a similar index vector)?
Updated answer: The main piece of code has been rewritten as a function and a solution
suitable for AMD processors has been added.
As Peter Cordes mentioned in the comments, the AVX-512 instruction vpexpandd would be useful here.
The functions _mm256_mask_expand_epi32_AVX2_BMI() and _mm256_mask_expand_epi32_AVX2() below more
or less emulate this instruction. The AVX2_BMI variant is suitable for Intel Haswell processors and newer.
The _mm256_mask_expand_epi32_AVX2() function is suitable for AMD processors with a slow or
missing pdep instruction, such as the Ryzen processor.
In that function a few high-throughput instructions,
such as shifts and simple arithmetic operations, are used instead of the pdep instruction.
Another possibility for AMD processors would be
to process only 4 elements at a time and use a tiny (16 entry) lookup table to retrieve
the shuf_mask; a sketch of that idea is appended at the end of this answer.
Below these two functions it is shown how they can be used to vectorize your scalar code.
The answer uses a similar idea as in this answer by Peter Cordes,
which discusses left packing based on a mask. In that answer the BMI2
instruction pext is used to compute the permutation vector.
Here we use the pdep instruction instead, to compute the permutation vector.
Function _mm256_mask_expand_epi32_AVX2() finds the permutation vector
in a different way by computing
a prefix sum on the r<C mask.
Because of the unsigned uint32_t, I used Paul R's idea for epu32 unsigned comparisons.
/* gcc -O3 -m64 -Wall -mavx2 -march=broadwell mask_expand_avx.c */
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h> /* for malloc */
__m256i _mm256_mask_expand_epi32_AVX2_BMI(__m256i src, __m256i mask, __m256i insert_vals, int* nonz){
/* Scatter the insert_vals to the positions indicated by mask. */
/* Blend the src with these scattered insert_vals. */
/* Return also the number of nonzeros in mask (which is inexpensive here */
/* because _mm256_movemask_epi8(mask) has to be computed anyway.) */
/* This code is suitable for Intel Haswell and newer processors. */
/* This code is less suitable for AMD Ryzen processors, due to the */
/* slow pdep instruction on those processors, see _mm256_mask_expand_epi32_AVX2 */
uint32_t all_indx = 0x76543210;
uint32_t mask_int32 = _mm256_movemask_epi8(mask); /* Packed mask of 8 nibbles */
uint32_t wanted_indx = _pdep_u32(all_indx, mask_int32); /* Select the right nibbles from all_indx */
uint64_t expand_indx = _pdep_u64(wanted_indx, 0x0F0F0F0F0F0F0F0F); /* Expand the nibbles to bytes */
__m128i shuf_mask_8bit = _mm_cvtsi64_si128(expand_indx); /* Move to AVX-128 register */
__m256i shuf_mask = _mm256_cvtepu8_epi32(shuf_mask_8bit); /* Expand bytes to 32-bit integers */
__m256i insert_vals_exp = _mm256_permutevar8x32_epi32(insert_vals, shuf_mask); /* Expand insert_vals to the right positions */
__m256i dst = _mm256_blendv_epi8(src, insert_vals_exp, mask); /* src is replaced by insert_vals_exp at the positions indicated by mask */
*nonz = _mm_popcnt_u32(mask_int32) >> 2;
return dst;
}
__m256i _mm256_mask_expand_epi32_AVX2(__m256i src, __m256i mask, __m256i insert_vals, int* nonz){
/* Scatter the insert_vals to the positions indicated by mask. */
/* Blend the src with these scattered insert_vals. */
/* Return also the number of nonzeros in mask. */
/* This code is an alternative for the _mm256_mask_expand_epi32_AVX2_BMI function. */
/* In contrast to that code, this code doesn't use the BMI instruction pdep. */
/* Therefore, this code is suitable for AMD processors. */
__m128i mask_lo = _mm256_castsi256_si128(mask);
__m128i mask_hi = _mm256_extracti128_si256(mask, 1);
__m128i mask_hi_lo = _mm_packs_epi32(mask_lo, mask_hi); /* Compressed 128-bits (8 x 16-bits) mask */
*nonz = _mm_popcnt_u32(_mm_movemask_epi8(mask_hi_lo)) >> 1;
__m128i prefix_sum = mask_hi_lo;
__m128i prefix_sum_shft = _mm_slli_si128(prefix_sum, 2); /* The permutation vector is based on the */
prefix_sum = _mm_add_epi16(prefix_sum, prefix_sum_shft); /* Prefix sum of the mask. */
prefix_sum_shft = _mm_slli_si128(prefix_sum, 4);
prefix_sum = _mm_add_epi16(prefix_sum, prefix_sum_shft);
prefix_sum_shft = _mm_slli_si128(prefix_sum, 8);
prefix_sum = _mm_add_epi16(prefix_sum, prefix_sum_shft);
__m128i shuf_mask_16bit = _mm_sub_epi16(_mm_set1_epi16(-1), prefix_sum);
__m256i shuf_mask = _mm256_cvtepu16_epi32(shuf_mask_16bit); /* Expand 16-bit integers to 32-bit integers */
__m256i insert_vals_exp = _mm256_permutevar8x32_epi32(insert_vals, shuf_mask); /* Expand insert_vals to the right positions */
__m256i dst = _mm256_blendv_epi8(src, insert_vals_exp, mask); /* src is replaced by insert_vals_exp at the positions indicated by mask */
return dst;
}
/* Unsigned integer compare _mm256_cmplt_epu32 doesn't exist */
/* The next two lines are based on Paul R's answer https://stackoverflow.com/a/32945715/2439725 */
#define _mm256_cmpge_epu32(a, b) _mm256_cmpeq_epi32(_mm256_max_epu32(a, b), a)
#define _mm256_cmplt_epu32(a, b) _mm256_xor_si256(_mm256_cmpge_epu32(a, b), _mm256_set1_epi32(-1))
int print_input(uint32_t* r, uint32_t C, uint16_t* ptr);
int print_output(uint32_t* r, uint16_t* ptr);
int main(){
int nonz;
uint32_t r[8] = {6, 3, 1001, 2, 1002, 7, 5, 1003};
uint32_t r_new[8];
uint32_t C = 9;
uint16_t* ptr = malloc(8*2); /* allocate 16 bytes for 8 uint16_t's */
ptr[0] = 11; ptr[1] = 12; ptr[2] = 13;ptr[3] = 14; ptr[4] = 15; ptr[5] = 16; ptr[6] = 17; ptr[7] = 18;
uint16_t* ptr_new;
printf("Test values:\n");
print_input(r,C,ptr);
__m256i src = _mm256_loadu_si256((__m256i *)r);
__m128i ins = _mm_loadu_si128((__m128i *)ptr);
__m256i insert_vals = _mm256_cvtepu16_epi32(ins);
__m256i mask_C = _mm256_cmplt_epu32(src,_mm256_set1_epi32(C));
printf("Output _mm256_mask_expand_epi32_AVX2_BMI:\n");
__m256i output = _mm256_mask_expand_epi32_AVX2_BMI(src, mask_C, insert_vals, &nonz);
_mm256_storeu_si256((__m256i *)r_new,output);
ptr_new = ptr + nonz;
print_output(r_new,ptr_new);
printf("Output _mm256_mask_expand_epi32_AVX2:\n");
output = _mm256_mask_expand_epi32_AVX2(src, mask_C, insert_vals, &nonz);
_mm256_storeu_si256((__m256i *)r_new,output);
ptr_new = ptr + nonz;
print_output(r_new,ptr_new);
printf("Output scalar loop:\n");
for (int j = 0; j < 8; ++j)
if (r[j] < C)
r[j] = *(ptr++);
print_output(r,ptr);
return 0;
}
int print_input(uint32_t* r, uint32_t C, uint16_t* ptr){
printf("r[0]..r[7] = %4u %4u %4u %4u %4u %4u %4u %4u \n",r[0],r[1],r[2],r[3],r[4],r[5],r[6],r[7]);
printf("Threshold value C = %4u %4u %4u %4u %4u %4u %4u %4u \n",C,C,C,C,C,C,C,C);
printf("ptr[0]..ptr[7] = %4hu %4hu %4hu %4hu %4hu %4hu %4hu %4hu \n\n",ptr[0],ptr[1],ptr[2],ptr[3],ptr[4],ptr[5],ptr[6],ptr[7]);
return 0;
}
int print_output(uint32_t* r, uint16_t* ptr){
printf("r[0]..r[7] = %4u %4u %4u %4u %4u %4u %4u %4u \n",r[0],r[1],r[2],r[3],r[4],r[5],r[6],r[7]);
printf("ptr = %p \n\n",ptr);
return 0;
}
The output is:
$ ./a.out
Test values:
r[0]..r[7] = 6 3 1001 2 1002 7 5 1003
Threshold value C = 9 9 9 9 9 9 9 9
ptr[0]..ptr[7] = 11 12 13 14 15 16 17 18
Output _mm256_mask_expand_epi32_AVX2_BMI:
r[0]..r[7] = 11 12 1001 13 1002 14 15 1003
ptr = 0x92c01a
Output _mm256_mask_expand_epi32_AVX2:
r[0]..r[7] = 11 12 1001 13 1002 14 15 1003
ptr = 0x92c01a
Output scalar loop:
r[0]..r[7] = 11 12 1001 13 1002 14 15 1003
ptr = 0x92c01a
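Finally, here is a hedged sketch (my own, not benchmarked) of the "4 elements at a time plus a tiny lookup table" alternative mentioned above for AMD processors. The 4-bit comparison mask indexes one of 16 precomputed pshufb controls that spread the packed insert values to the masked slots; the blend keeps src in the unmasked slots, so those shuffle bytes don't matter.
#include <immintrin.h>
#include <stdint.h>
static uint8_t expand_lut[16][16]; /* pshufb controls, one per 4-bit mask */
static void init_expand_lut(void){
    for (int m = 0; m < 16; ++m){
        int k = 0; /* index of the next packed insert element */
        for (int slot = 0; slot < 4; ++slot){
            int src_elem = ((m >> slot) & 1) ? k++ : 0; /* unmasked slot: value is irrelevant */
            for (int byte = 0; byte < 4; ++byte)
                expand_lut[m][4 * slot + byte] = (uint8_t)(4 * src_elem + byte);
        }
    }
}
/* 4-wide analogue of the functions above (SSSE3/SSE4.1/POPCNT, no pdep) */
__m128i _mm_mask_expand_epi32_lut(__m128i src, __m128i mask, __m128i insert_vals, int* nonz){
    int m = _mm_movemask_ps(_mm_castsi128_ps(mask)); /* 4-bit mask */
    *nonz = _mm_popcnt_u32((unsigned)m);
    __m128i shuf = _mm_loadu_si128((const __m128i*)expand_lut[m]); /* tiny 16-entry LUT */
    __m128i insert_vals_exp = _mm_shuffle_epi8(insert_vals, shuf);
    return _mm_blendv_epi8(src, insert_vals_exp, mask);
}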

Square root of a OpenCV's grey image using SSE

given a grey cv::Mat (CV_8UC1), I want to return another cv::Mat containing the square root of the elements (CV_32FC1), and I want to do it with SSE2 intrinsics. I am having some problems with the conversion from 8-bit values to 32-bit float values to perform the square root. I would really appreciate any help. This is my code for now (it does not give correct values):
uchar *source = (uchar *)cv::alignPtr(image.data, 16);
float *sqDataPtr = cv::alignPtr((float *)Squared.data, 16);
for (x = 0; x < (pixels - 16); x += 16) {
__m128i a0 = _mm_load_si128((__m128i *)(source + x));
__m128i first8 = _mm_unpacklo_epi8(a0, _mm_set1_epi8(0));
__m128i last8 = _mm_unpackhi_epi8(a0, _mm_set1_epi8(0));
__m128i first4i = _mm_unpacklo_epi16(first8, _mm_set1_epi16(0));
__m128i second4i = _mm_unpackhi_epi16(first8, _mm_set1_epi16(0));
__m128 first4 = _mm_cvtepi32_ps(first4i);
__m128 second4 = _mm_cvtepi32_ps(second4i);
__m128i third4i = _mm_unpacklo_epi16(last8, _mm_set1_epi16(0));
__m128i fourth4i = _mm_unpackhi_epi16(last8, _mm_set1_epi16(0));
__m128 third4 = _mm_cvtepi32_ps(third4i);
__m128 fourth4 = _mm_cvtepi32_ps(fourth4i);
// Store
_mm_store_ps(sqDataPtr + x, _mm_sqrt_ps(first4));
_mm_store_ps(sqDataPtr + x + 4, _mm_sqrt_ps(second4));
_mm_store_ps(sqDataPtr + x + 8, _mm_sqrt_ps(third4));
_mm_store_ps(sqDataPtr + x + 12, _mm_sqrt_ps(fourth4));
}
The SSE code looks OK, except that you're not processing the last 16 pixels:
for (x = 0; x < (pixels - 16); x += 16)
should be:
for (x = 0; x <= (pixels - 16); x += 16)
Note that if your image width is not a multiple of 16 then you will need to take care of any remaining pixels after the last full vector.
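One simple way to handle that tail (my own sketch, using sqrtf from <cmath>) is a scalar loop after the vector loop:
for (; x < pixels; ++x)
    sqDataPtr[x] = sqrtf((float)source[x]);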
Also note that you are taking the sqrt of values in the range 0..255. It may be that you want normalised values in the range 0..1.0, in which case you'll want to scale the values accordingly.
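If so, one way (again just a sketch) is to scale before the square root, e.g. for the first group of four:
const __m128 scale = _mm_set1_ps(1.0f / 255.0f);
_mm_store_ps(sqDataPtr + x, _mm_sqrt_ps(_mm_mul_ps(first4, scale))); // sqrt(value/255), result in 0..1.0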
I have no experience with SSE2, but I think that if performance is the issue you should use a look-up table. Creating the look-up table is fast, since there are only 256 possible values. Copying 4 bytes from the look-up table into the destination matrix should be a very efficient operation.
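A sketch of that idea (my own illustration, reusing the names from the question):
float sqrtLUT[256];
for (int i = 0; i < 256; ++i)
    sqrtLUT[i] = sqrtf((float)i); // built once; only 256 possible inputs
for (int x = 0; x < pixels; ++x)
    sqDataPtr[x] = sqrtLUT[source[x]]; // copy one 4-byte float per pixel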

How can I add together two SSE registers

I have two SSE registers (128 bits is one register) and I want to add them up. I know how I can add corresponding words in them, for example I can do it with _mm_add_epi16 if I use 16bit words in registers, but what I want is something like _mm_add_epi128 (which does not exist), which would use register as one big word.
Is there any way to perform this operation, even if multiple instructions are needed?
I was thinking about using _mm_add_epi64, detecting overflow in the right word and then adding 1 to the left word in register if needed, but I would also like this approach to work for 256bit registers (AVX2), and this approach seems too complicated for that.
To add two 128-bit numbers x and y to give z with SSE you can do it like this
z = _mm_add_epi64(x,y);
c = _mm_unpacklo_epi64(_mm_setzero_si128(), unsigned_lessthan(z,x));
z = _mm_sub_epi64(z,c);
This is based on this link how-can-i-add-and-subtract-128-bit-integers-in-c-or-c.
The function unsigned_lessthan is defined below. It's complicated without AMD XOP (actually I found a simpler version for SSE4.2 if XOP is not available - see the end of my answer). Probably some of the other people here can suggest a better method. Here is some code showing this works.
#include <stdint.h>
#include <x86intrin.h>
#include <stdio.h>
inline __m128i unsigned_lessthan(__m128i a, __m128i b) {
#ifdef __XOP__ // AMD XOP instruction set
return _mm_comgt_epu64(b,a);
#else // SSE2 instruction set
__m128i sign32 = _mm_set1_epi32(0x80000000); // sign bit of each dword
__m128i aflip = _mm_xor_si128(b,sign32); // b with sign bits flipped
__m128i bflip = _mm_xor_si128(a,sign32); // a with sign bits flipped
__m128i equal = _mm_cmpeq_epi32(b,a); // a == b, dwords
__m128i bigger = _mm_cmpgt_epi32(aflip,bflip); // a < b (unsigned), dwords
__m128i biggerl = _mm_shuffle_epi32(bigger,0xA0); // a < b, low dwords copied to high dwords
__m128i eqbig = _mm_and_si128(equal,biggerl); // high part equal and low part less
__m128i hibig = _mm_or_si128(bigger,eqbig); // high part less, or high part equal and low part less
__m128i big = _mm_shuffle_epi32(hibig,0xF5); // result copied to low part
return big;
#endif
}
int main() {
__m128i x,y,z,c;
x = _mm_set_epi64x(3,0xffffffffffffffffll);
y = _mm_set_epi64x(1,0x2ll);
z = _mm_add_epi64(x,y);
c = _mm_unpacklo_epi64(_mm_setzero_si128(), unsigned_lessthan(z,x));
z = _mm_sub_epi64(z,c);
int out[4];
//int64_t out[2];
_mm_storeu_si128((__m128i*)out, z);
printf("%d %d\n", out[2], out[0]);
}
Edit:
The only potentially efficient way to add 128-bit or 256-bit numbers with SSE is with XOP. The only option with AVX would be XOP2, which does not exist yet. And even if you have XOP it may only be efficient to add two 128-bit or 256-bit numbers in parallel (you could do four with AVX if XOP2 existed), to avoid horizontal instructions such as _mm_unpacklo_epi64.
The best solution in general is to push the registers onto the stack and use scalar arithmetic. Assuming you have two 256-bit registers x4 and y4 you can add them like this:
__m256i x4, y4, z4;
uint64_t x[4], y[4], z[4];
_mm256_storeu_si256((__m256i*)x, x4);
_mm256_storeu_si256((__m256i*)y, y4);
add_u256(x,y,z);
z4 = _mm256_loadu_si256((__m256i*)z);
void add_u256(uint64_t x[4], uint64_t y[4], uint64_t z[4]) {
uint64_t c1 = 0, c2 = 0, tmp;
//add low 128-bits
z[0] = x[0] + y[0];
z[1] = x[1] + y[1];
c1 += z[1]<x[1];
tmp = z[1];
z[1] += z[0]<x[0];
c1 += z[1]<tmp;
//add high 128-bits + carry from low 128-bits
z[2] = x[2] + y[2];
c2 += z[2]<x[2];
tmp = z[2];
z[2] += c1;
c2 += z[2]<tmp;
z[3] = x[3] + y[3] + c2;
}
int main() {
uint64_t x[4], y[4], z[4];
x[0] = -1; x[1] = -1; x[2] = 1; x[3] = 1;
y[0] = 1; y[1] = 1; y[2] = 1; y[3] = 1;
//z = x + y (x3,x2,x1,x0) = (2,3,1,0)
//x[0] = -1; x[1] = -1; x[2] = 1; x[3] = 1;
//y[0] = 1; y[1] = 0; y[2] = 1; y[3] = 1;
//z = x + y (x3,x2,x1,x0) = (2,3,0,0)
add_u256(x,y,z);
for(int i=3; i>=0; i--) printf("%llu ", (unsigned long long)z[i]); printf("\n");
}
Edit: based on a comment by Stephen Canon at saturated-substraction-avx-or-sse4-2 I discovered there is a more efficient way to compare unsigned 64-bit numbers with SSE4.2 if XOP is not available.
__m128i a,b;
__m128i sign64 = _mm_set1_epi64x(0x8000000000000000L);
__m128i aflip = _mm_xor_si128(a, sign64);
__m128i bflip = _mm_xor_si128(b, sign64);
__m128i cmp = _mm_cmpgt_epi64(aflip,bflip);
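Plugged into the unsigned_lessthan() function above, that looks like this (my sketch, assuming SSE4.2 is available):
inline __m128i unsigned_lessthan_sse42(__m128i a, __m128i b) {
    const __m128i sign64 = _mm_set1_epi64x(0x8000000000000000L);
    // a < b (unsigned) == (b ^ sign) > (a ^ sign) (signed)
    return _mm_cmpgt_epi64(_mm_xor_si128(b, sign64), _mm_xor_si128(a, sign64));
}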

Horizontal minimum and maximum using SSE

I have a function using SSE to do a lot of stuff, and the profiler shows me that the code portion I use to compute the horizontal minimum and maximum consumes most of the time.
I have been using the following implementation for the minimum for instance:
static inline int16_t hMin(__m128i buffer) {
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi8(buffer, m1));
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi8(buffer, m2));
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi8(buffer, m3));
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi8(buffer, m4));
return ((int8_t*) ((void *) &buffer))[0];
}
I need to compute the minimum and the maximum of 16 1-byte integers, as you see.
Any good suggestions are highly appreciated :)
Thanks
SSE 4.1 has an instruction that does almost what you want. Its name is PHMINPOSUW, C/C++ intrinsic is _mm_minpos_epu16. It is limited to 16-bit unsigned values and cannot give maximum, but these problems could be easily solved.
1. If you need to find the minimum of non-negative bytes, do nothing. If bytes may be negative, add 128 to each. If you need the maximum, subtract each from 127.
2. Use either _mm_srli_epi16 or _mm_shuffle_epi8, and then _mm_min_epu8 to get 8 pairwise minimum values in even bytes and zeros in odd bytes of some XMM register. (These zeros are produced by the shift/shuffle instruction and should remain in place after _mm_min_epu8.)
3. Use _mm_minpos_epu16 to find the minimum among these values.
4. Extract the resulting minimum value with _mm_cvtsi128_si32.
5. Undo the effect of step 1 to get the original byte value.
Here is an example that returns maximum of 16 signed bytes:
static inline int16_t hMax(__m128i buffer)
{
__m128i tmp1 = _mm_sub_epi8(_mm_set1_epi8(127), buffer);
__m128i tmp2 = _mm_min_epu8(tmp1, _mm_srli_epi16(tmp1, 8));
__m128i tmp3 = _mm_minpos_epu16(tmp2);
return (int8_t)(127 - _mm_cvtsi128_si32(tmp3));
}
I suggest two changes:
Replace ((int8_t*) ((void *) &buffer))[0] with _mm_cvtsi128_si32.
Replace _mm_shuffle_epi8 with _mm_shuffle_epi32/_mm_shufflelo_epi16 which have lower latency on recent AMD processors and Intel Atom, and will save you memory load operations:
static inline int16_t hMin(__m128i buffer)
{
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi32(buffer, _MM_SHUFFLE(3, 2, 3, 2)));
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi32(buffer, _MM_SHUFFLE(1, 1, 1, 1)));
buffer = _mm_min_epi8(buffer, _mm_shufflelo_epi16(buffer, _MM_SHUFFLE(1, 1, 1, 1)));
buffer = _mm_min_epi8(buffer, _mm_srli_epi16(buffer, 8));
return (int8_t)_mm_cvtsi128_si32(buffer);
}
Here's an implementation without shuffle; shuffle is slow on the AMD Ryzen 7 5000 series for some reason. (These are member functions of a Vector4 wrapper whose __m128 member is mm; they work on 4 floats rather than 16 bytes.)
float max_elem3() const {
__m128 a = _mm_unpacklo_ps(mm, mm); // x x y y
__m128 b = _mm_unpackhi_ps(mm, mm); // z z w w
__m128 c = _mm_max_ps(a, b); // ..., max(x, z), ..., ...
Vector4 res = _mm_max_ps(mm, c); // ..., max(y, max(x, z)), ..., ...
return res.y;
}
float min_elem3() const {
__m128 a = _mm_unpacklo_ps(mm, mm); // x x y y
__m128 b = _mm_unpackhi_ps(mm, mm); // z z w w
__m128 c = _mm_min_ps(a, b); // ..., min(x, z), ..., ...
Vector4 res = _mm_min_ps(mm, c); // ..., min(y, min(x, z)), ..., ...
return res.y;
}

Compute norm between two integers interpreted as 4 bytes

I would like to write a function norm2 which computes
uint32_t norm2(uint32_t a, uint32_t b) {
return sqd( a & 0x000000FF , b & 0x000000FF )
+ sqd((a & 0x0000FF00)>> 8, (b & 0x0000FF00)>> 8)
+ sqd((a & 0x00FF0000)>>16, (b & 0x00FF0000)>> 16)
+ sqd((a & 0xFF000000)>>24, (b & 0xFF000000)>> 24);
}
uint32_t sqd(uint32_t a, uint32_t b) {
uint32_t x = (a > b) ? a - b : b - a;
return x*x;
}
What is the fastest way to do so under GCC? For example using assembler, SSE or similar.
Very simple to do the whole thing in a few instructions using SSE:
#include <immintrin.h>
#include <stdint.h>
uint32_t norm2(uint32_t a, uint32_t b) {
const __m128i vec_zero = _mm_setzero_si128();
__m128i vec_a = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a), vec_zero);
__m128i vec_b = _mm_unpacklo_epi8(_mm_cvtsi32_si128(b), vec_zero);
__m128i vec_diff = _mm_sub_epi16(vec_a, vec_b);
__m128i vec_dsq = _mm_madd_epi16(vec_diff, vec_diff);
return _mm_cvtsi128_si32(_mm_hadd_epi32(vec_dsq, vec_dsq));
}
What we’re doing here is “unpacking” both a and b with a zero vector to expand the individual bytes into vectors of 16-bit integers. We then subtract them (as 16-bit integers, avoiding risk of overflow), and multiply and accumulate them (as 32-bit integers, again avoiding risk of overflow).
I don’t have GCC installed to test with, but the above generates near-optimal assembly with clang; it shouldn’t be necessary to drop into assembly for such a simple task.
If you can read a and b in groups of 4, this can be done most cleanly/elegantly/efficiently by operating on 4-tuples, because it more fully saturates some of the instructions, so that all parts of the computation contribute to the solution. The solution below uses up to SSSE3. Of course you will be better off pulling this out of the function, initializing constants up front, and finding the most efficient way to put the values into the __m128i registers, depending on how the surrounding code is structured.
// a, b, and out, must all point to 4 integers
void norm2x4(const unsigned *a, const unsigned *b, unsigned *out) {
// load up registers a and b, in practice this should probably not be in a function,
// initialization of zero can happen outside of a loop,
// and a and b can be loaded directly from memory into __m128i registers
__m128i const zero = _mm_setzero_si128();
__m128i alo = _mm_loadu_si128((__m128i*)a); // this can also be adapted to aligned read instructions if you ensure an aligned buffer
__m128i blo = _mm_loadu_si128((__m128i*)b);
// everything is already in the register where we need it except it
// needs to be expanded to 2-byte ints for computations to work correctly
__m128i ahi = _mm_unpackhi_epi8(alo, zero);
__m128i bhi = _mm_unpackhi_epi8(blo, zero);
alo = _mm_unpacklo_epi8(alo, zero);
blo = _mm_unpacklo_epi8(blo, zero);
alo = _mm_sub_epi16(alo, blo); // don't care whether it's a - b or b - a; the "wrong" order just gives a
ahi = _mm_sub_epi16(ahi, bhi); // negation, which the square below corrects
alo = _mm_madd_epi16(alo, alo); // square each 16-bit element and add adjacent pairs
ahi = _mm_madd_epi16(ahi, ahi);
alo = _mm_hadd_epi32(alo, ahi); // add horizontal elements; alo now contains 4 ints which are your results
// store the result to output; this can be adapted to an aligned store if you ensure an aligned buffer
// or the individual values can be extracted directly to 32-bit registers using _mm_extract_epi32
_mm_storeu_si128((__m128i*)out, alo);
}
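For example (my own illustrative values):
unsigned a[4] = { 0x04030201u, 0xFFFFFFFFu, 0u, 0x01010101u };
unsigned b[4] = { 0x01010101u, 0x00000000u, 0u, 0x01010101u };
unsigned out[4];
norm2x4(a, b, out); // out = { 14, 4*255*255, 0, 0 }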
a branchless version (as square(-x) == square(x)):
uint32_t sqd(int32_t a, int32_t b) {
int32_t x = a - b;
return x * x;
}
uint32_t norm2(uint32_t a, uint32_t b) {
return sqd( a & 0x000000FF , b & 0x000000FF )
+ sqd((a & 0x0000FF00) >> 8, (b & 0x0000FF00) >> 8)
+ sqd((a & 0x00FF0000) >> 16, (b & 0x00FF0000) >> 16)
+ sqd((a & 0xFF000000) >> 24, (b & 0xFF000000) >> 24);
}