Grayscale bilinear patch extraction - SSE optimization - C++

My program makes intensive use of small sub-images extracted from larger grayscale images using bilinear interpolation.
I am using the following function for this purpose:
bool extract_patch_bilin(const cv::Point2f &patch_ctr, const cv::Mat_<uchar> &img, cv::Mat_<uchar> &patch)
{
const int hsize = patch.rows/2;
// ...
// Precondition checks: patch is a preallocated square matrix and both patch and image have continuous buffers
// ...
int floorx=(int)floor(patch_ctr.x)-hsize, floory=(int)floor(patch_ctr.y)-hsize;
if(floorx<0 || img.cols-1<floorx+patch.cols || floory<0 || img.rows-1<floory+patch.rows)
return false;
float x=patch_ctr.x-hsize-floorx;
float y=patch_ctr.y-hsize-floory;
float xy = x*y;
float w00=1-x-y+xy, w01=x-xy, w10=y-xy, w11=xy;
int img_stride = img.cols-patch.cols;
uchar* buff_img0 = (uchar*)img.data+img.cols*floory+floorx;
uchar* buff_img1 = buff_img0+img.cols;
uchar* buff_patch = (uchar*)patch.data;
for(int v=0; v<patch.rows; ++v,buff_img0+=img_stride,buff_img1+=img_stride) {
for(int u=0; u<patch.cols; ++u,++buff_patch,++buff_img0,++buff_img1)
buff_patch[0] = cv::saturate_cast<uchar>(buff_img0[0]*w00+buff_img0[1]*w01+buff_img1[0]*w10+buff_img1[1]*w11);
}
return true;
}
Long story short, I am already using parallelization in other parts of the program, and I am considering using SSE to optimize the execution of this function, because I am mostly using 8x8 patches and it seems like a good idea to process bunches of 8 pixels at a time using SSE.
However, I am not sure how to deal with the multiplication by the float interpolation weights (i.e. w00, w01, w10 and w11). These weights are necessarily positive and smaller than 1, hence the multiplication cannot overflow the unsigned char datatype.
Does anyone know how to proceed?
EDIT:
I tried to do this as follows (assuming 16x16 patches), but there is no significant speed-up:
bool extract_patch_bilin_16x16(const cv::Point2f& patch_ctr, const cv::Mat_<uchar> &img, cv::Mat_<uchar> &patch)
{
// ...
// Precondition checks
// ...
const int hsize = patch.rows/2;
int floorx=(int)floor(patch_ctr.x)-hsize, floory=(int)floor(patch_ctr.y)-hsize;
// Check that the full extracted patch is inside the image
if(floorx<0 || img.cols-1<floorx+patch.cols || floory<0 || img.rows-1<floory+patch.rows)
return false;
// Compute the constant bilinear weights
float x=patch_ctr.x-hsize-floorx;
float y=patch_ctr.y-hsize-floory;
float xy = x*y;
float w00=1-x-y+xy, w01=x-xy, w10=y-xy, w11=xy;
// Prepare image resampling loop
int img_stride = img.cols-patch.cols;
uchar* buff_img0 = (uchar*)img.data+img.cols*floory+floorx;
uchar* buff_img1 = buff_img0+img.cols;
uchar* buff_patch = (uchar*)patch.data;
// Precompute weighting variables
const __m128i CONST_0 = _mm_setzero_si128();
__m128i w00x256_32i = _mm_set1_epi32(cvRound(w00*256));
__m128i w01x256_32i = _mm_set1_epi32(cvRound(w01*256));
__m128i w10x256_32i = _mm_set1_epi32(cvRound(w10*256));
__m128i w11x256_32i = _mm_set1_epi32(cvRound(w11*256));
__m128i w00x256_16i = _mm_packs_epi32(w00x256_32i,w00x256_32i);
__m128i w01x256_16i = _mm_packs_epi32(w01x256_32i,w01x256_32i);
__m128i w10x256_16i = _mm_packs_epi32(w10x256_32i,w10x256_32i);
__m128i w11x256_16i = _mm_packs_epi32(w11x256_32i,w11x256_32i);
// Process pixels
int ngroups = patch.rows>>4;
for(int v=0; v<patch.rows; ++v,buff_img0+=img_stride,buff_img1+=img_stride) {
for(int g=0; g<ngroups; ++g,buff_patch+=16,buff_img0+=16,buff_img1+=16) {
////////////////////////////////
// Load the data (16 pixels in one load)
////////////////////////////////
__m128i val00 = _mm_loadu_si128((__m128i*)buff_img0);
__m128i val01 = _mm_loadu_si128((__m128i*)(buff_img0+1));
__m128i val10 = _mm_loadu_si128((__m128i*)buff_img1);
__m128i val11 = _mm_loadu_si128((__m128i*)(buff_img1+1));
////////////////////////////////
// Process the lower 8 values
////////////////////////////////
// Unpack into 16-bits integers
__m128i val00_lo = _mm_unpacklo_epi8(val00,CONST_0);
__m128i val01_lo = _mm_unpacklo_epi8(val01,CONST_0);
__m128i val10_lo = _mm_unpacklo_epi8(val10,CONST_0);
__m128i val11_lo = _mm_unpacklo_epi8(val11,CONST_0);
// Multiply with the integer weights
__m128i w256val00_lo = _mm_mullo_epi16(val00_lo,w00x256_16i);
__m128i w256val01_lo = _mm_mullo_epi16(val01_lo,w01x256_16i);
__m128i w256val10_lo = _mm_mullo_epi16(val10_lo,w10x256_16i);
__m128i w256val11_lo = _mm_mullo_epi16(val11_lo,w11x256_16i);
// Divide by 256 to get the approximate result of the multiplication with floating-point weights
__m128i wval00_lo = _mm_srli_epi16(w256val00_lo,8);
__m128i wval01_lo = _mm_srli_epi16(w256val01_lo,8);
__m128i wval10_lo = _mm_srli_epi16(w256val10_lo,8);
__m128i wval11_lo = _mm_srli_epi16(w256val11_lo,8);
// Add pairwise
__m128i sum0_lo = _mm_add_epi16(wval00_lo,wval01_lo);
__m128i sum1_lo = _mm_add_epi16(wval10_lo,wval11_lo);
__m128i final_lo = _mm_add_epi16(sum0_lo,sum1_lo);
////////////////////////////////
// Process the higher 8 values
////////////////////////////////
// Unpack into 16-bits integers
__m128i val00_hi = _mm_unpackhi_epi8(val00,CONST_0);
__m128i val01_hi = _mm_unpackhi_epi8(val01,CONST_0);
__m128i val10_hi = _mm_unpackhi_epi8(val10,CONST_0);
__m128i val11_hi = _mm_unpackhi_epi8(val11,CONST_0);
// Multiply with the integer weights
__m128i w256val00_hi = _mm_mullo_epi16(val00_hi,w00x256_16i);
__m128i w256val01_hi = _mm_mullo_epi16(val01_hi,w01x256_16i);
__m128i w256val10_hi = _mm_mullo_epi16(val10_hi,w10x256_16i);
__m128i w256val11_hi = _mm_mullo_epi16(val11_hi,w11x256_16i);
// Divide by 256 to get the approximate result of the multiplication with floating-point weights
__m128i wval00_hi = _mm_srli_epi16(w256val00_hi,8);
__m128i wval01_hi = _mm_srli_epi16(w256val01_hi,8);
__m128i wval10_hi = _mm_srli_epi16(w256val10_hi,8);
__m128i wval11_hi = _mm_srli_epi16(w256val11_hi,8);
// Add pairwise
__m128i sum0_hi = _mm_add_epi16(wval00_hi,wval01_hi);
__m128i sum1_hi = _mm_add_epi16(wval10_hi,wval11_hi);
__m128i final_hi = _mm_add_epi16(sum0_hi,sum1_hi);
////////////////////////////////
// Repack all values
////////////////////////////////
__m128i final_val = _mm_packus_epi16(final_lo,final_hi);
_mm_storeu_si128((__m128i*)buff_patch,final_val);
}
}
return true;
}
Any idea what could be done to improve the speed-up?

I would consider sticking to integers: your weights are multiples of 1/64, so working in 8.6 fixed-point is enough, and that fits in 16-bit numbers.
Bilinear interpolation is best done as three linear ones (two on Y then one on X; you can reuse the second Y interpolation for the neighboring patch).
To perform a linear interpolation between two values, you will pre-store, once and for all, the interpolation weights P and Q (8 down to 1 and 0 up to 7), and multiply and add them in pairs like V0.P[i]+V1.Q[i]. This is done efficiently with the PMADDUBSW instruction (after appropriate data interleaving and replication of the values V0 and V1, with PUNPCKLBW and the like).
In the end, divide by the total weight (PSRLW), rescale to bytes (PACKUSWB). (This step can be performed once only, combining the two interpolations.)
You could think of doubling all weights, so that the final scaling is by 8 bits, and PACKUSWB would suffice, but unfortunately it saturates the values and there is no unsaturated equivalent.
It could be that precomputing all 64 interpolation weights and summing the four bilinear terms is better.
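For illustration, here is a minimal, untested sketch of one such linear interpolation with PMADDUBSW, assuming the 8-to-1 / 0-to-7 weight pairs described above and SSSE3 support; the helper name and data layout are mine:
#include <tmmintrin.h>  // SSSE3, for _mm_maddubs_epi16
// Pre-stored weight pairs: P[i] = 8 - i and Q[i] = i, interleaved byte-wise.
static const signed char kPQ[16] = { 8,0, 7,1, 6,2, 5,3, 4,4, 3,5, 2,6, 1,7 };
// v0 and v1 carry the two source values in their low 8 bytes.
static inline __m128i lerp8(__m128i v0, __m128i v1)
{
    __m128i pairs   = _mm_unpacklo_epi8(v0, v1);             // V0[0],V1[0],V0[1],V1[1],...
    __m128i weights = _mm_loadu_si128((const __m128i*)kPQ);  // P[0],Q[0],P[1],Q[1],...
    // Each 16-bit lane now holds V0[i]*P[i] + V1[i]*Q[i], still scaled by 8.
    return _mm_maddubs_epi16(pairs, weights);
}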
UPDATE:
If the goal is to interpolate with fixed coefficients for all pixel quads (actually achieving a subpixel translation), the strategy is different.
You will load a run of 8 (or 16?) pixels corresponding to the upper-left corners, a run of 8 shifted one pixel to the right (corresponding to the upper-right corners), and similarly for the next row (the bottom corners); multiply and add in pairs (PMADDUBSW) the pixel values with the corresponding interpolation weights, and combine the pairs (PADDW). Store the weights with replication.
Another option would be to avoid PMADD and perform separate multiplies (PMULLW) and adds (PADDW). This would simplify the reorganization scheme.
After scaling (as above), you end up with a run of 8 interpolated values.
This can work as well for variable interpolation weights, as long as you interpolate exactly one pixel per quad.
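For concreteness, a rough, untested sketch of this fixed-weight scheme for one row of an 8-wide patch, with the bilinear weights pre-scaled to 6-bit fixed point so that w00+w01+w10+w11 == 64 (the helper and its names are mine, not from the question):
#include <emmintrin.h>   // SSE2
#include <tmmintrin.h>   // SSSE3, for _mm_maddubs_epi16
static inline void interp_row8(const unsigned char* row0, const unsigned char* row1,
                               unsigned char* dst, int w00, int w01, int w10, int w11)
{
    __m128i top  = _mm_loadl_epi64((const __m128i*)row0);        // 8 upper-left pixels
    __m128i topR = _mm_loadl_epi64((const __m128i*)(row0 + 1));  // shifted right by one
    __m128i bot  = _mm_loadl_epi64((const __m128i*)row1);
    __m128i botR = _mm_loadl_epi64((const __m128i*)(row1 + 1));
    // Interleave left/right neighbours so each 16-bit lane holds one (left, right) pair.
    __m128i topPairs = _mm_unpacklo_epi8(top, topR);
    __m128i botPairs = _mm_unpacklo_epi8(bot, botR);
    // Replicated weight pairs: low byte multiplies the left pixel, high byte the right one.
    __m128i wTop = _mm_set1_epi16((short)((w01 << 8) | w00));
    __m128i wBot = _mm_set1_epi16((short)((w11 << 8) | w10));
    __m128i acc = _mm_add_epi16(_mm_maddubs_epi16(topPairs, wTop),
                                _mm_maddubs_epi16(botPairs, wBot));
    acc = _mm_srli_epi16(_mm_add_epi16(acc, _mm_set1_epi16(32)), 6); // round, divide by 64
    _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(acc, acc));     // back to 8 bytes
}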

Related

Can I speed up more than _mm256_i32gather_epi32?

I wrote gamma conversion code for 4K video:
/** gamma0
input range : 0 ~ 1,023
output range : 0 ~ ?
*/
v00 = _mm256_unpacklo_epi16(v0, _mm256_setzero_si256());
v01 = _mm256_unpackhi_epi16(v0, _mm256_setzero_si256());
v10 = _mm256_unpacklo_epi16(v1, _mm256_setzero_si256());
v11 = _mm256_unpackhi_epi16(v1, _mm256_setzero_si256());
v20 = _mm256_unpacklo_epi16(v2, _mm256_setzero_si256());
v21 = _mm256_unpackhi_epi16(v2, _mm256_setzero_si256());
v00 = _mm256_i32gather_epi32(csv->gamma0LUT, v00, 4);
v01 = _mm256_i32gather_epi32(csv->gamma0LUT, v01, 4);
v10 = _mm256_i32gather_epi32(csv->gamma0LUTc, v10, 4);
v11 = _mm256_i32gather_epi32(csv->gamma0LUTc, v11, 4);
v20 = _mm256_i32gather_epi32(csv->gamma0LUTc, v20, 4);
v21 = _mm256_i32gather_epi32(csv->gamma0LUTc, v21, 4);
I want to implement a "10-bit input to 10~13-bit output" LUT (look-up table), but AVX2 gathers only support 32-bit (or larger) elements.
So the indices were unavoidably widened to 32 bits and the look-up implemented with the _mm256_i32gather_epi32 instruction.
The performance bottleneck here is the most severe. Is there any way to improve this?
Since the context of your question is still a bit vague to me, here are just some general ideas you could try (some may be only slightly better, or even worse, than what you have at the moment; all code below is untested):
LUT with 16 bit values using _mm256_i32gather_epi32
Even though it loads 32-bit values, you can still use a multiplier of 2 as the last argument of _mm256_i32gather_epi32. You should make sure that the 2 bytes before and after your LUT are readable.
static const int16_t LUT[1024+2] = { 0, val0, val1, ..., val1022, val1023, 0};
__m256i high_idx = _mm256_srli_epi32(v, 16);
__m256i low_idx = _mm256_blend_epi16(v, _mm256_setzero_si256(), 0xAA);
__m256i high_val = _mm256_i32gather_epi32((int const*)(LUT+0), high_idx, 2);
__m256i low_val = _mm256_i32gather_epi32((int const*)(LUT+1), low_idx, 2);
__m256i values = _mm256_blend_epi16(low_val, high_val, 0xAA);
Join two values into one LUT-entry
For small-ish LUTs, you could calculate an index from two neighboring indexes as (idx_hi << 10) + idx_low and look up the corresponding tuple directly. However, instead of 2KiB you would have a 4 MiB LUT in your case, which likely hurts caching -- but you only have half the number of gather instructions.
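A very rough sketch of that joined-index idea (untested; pairLUT is a hypothetical table of 1<<20 packed entries, and v holds two 10-bit indices per 32-bit lane, as in the snippet above):
// const uint32_t pairLUT[1 << 20];  // each entry packs the two 16-bit outputs for (idx_hi, idx_lo)
__m256i hi     = _mm256_srli_epi32(v, 16);                       // upper 10-bit index of each lane
__m256i lo     = _mm256_and_si256(v, _mm256_set1_epi32(0x3FF));  // lower 10-bit index
__m256i joined = _mm256_or_si256(_mm256_slli_epi32(hi, 10), lo); // (idx_hi << 10) + idx_low
__m256i both   = _mm256_i32gather_epi32((int const*)pairLUT, joined, 4); // one gather per pair of values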
Polynomial approximation
Mathematically, any continuous function on a finite interval can be approximated by a polynomial. You could either convert your values to float, evaluate the polynomial, and convert back, or do it directly with fixed-point multiplications (note that _mm256_mulhi_epi16/_mm256_mulhi_epu16 compute (a * b) >> 16, which is convenient if one factor is actually in [0, 1)).
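As a minimal, untested sketch of the fixed-point variant: a Horner evaluation on 16-bit lanes, staying in Q0.16 throughout. x10 holds the 10-bit inputs in 16-bit lanes, and the coefficients are placeholders, not fitted to any real gamma curve:
__m256i t  = _mm256_slli_epi16(x10, 6);     // 10-bit input -> Q0.16 fraction in [0, 1)
__m256i c0 = _mm256_set1_epi16(0x0800);     // placeholder coefficients in Q0.16
__m256i c1 = _mm256_set1_epi16(0x7000);
__m256i c2 = _mm256_set1_epi16(0x2000);
__m256i acc = _mm256_mulhi_epu16(c2, t);    // c2*t, every product is (a*b) >> 16
acc = _mm256_adds_epu16(acc, c1);           // c2*t + c1
acc = _mm256_mulhi_epu16(acc, t);           // (c2*t + c1)*t
acc = _mm256_adds_epu16(acc, c0);           // + c0, result still in Q0.16
// Shift/scale acc to the 10~13-bit output range you need.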
8 bit, 16 entry LUT with linear interpolation
SSE/AVX2 provides the pshufb instruction, which can be used as an 8-bit LUT with 16 entries (and an implicit 0 entry).
Proof-of-concept implementation:
__m256i idx = _mm256_srli_epi16(v, 6); // shift highest 4 bits to the right
idx = _mm256_mullo_epi16(idx, _mm256_set1_epi16(0x0101)); // duplicate idx, maybe _mm256_shuffle_epi8 is better?
idx = _mm256_sub_epi8(idx, _mm256_set1_epi16(0x0001)); // subtract 1 from lower idx, 0 is mapped to 0xff
__m256i lut_vals = _mm256_shuffle_epi8(LUT, idx); // implicitly: LUT[-1] = 0
// get fractional part of input value:
__m256i dv = _mm256_and_si256(v, _mm256_set1_epi8(0x3f)); // lowest 6 bits
dv = _mm256_mullo_epi16(dv, _mm256_set1_epi16(0xff01)); // dv = [-dv, dv]
dv = _mm256_add_epi8(dv, _mm256_set1_epi16(0x4000)); // dv = [0x40-(v&0x3f), (v&0x3f)];
__m256i res = _mm256_maddubs_epi16(lut_vals, dv); // switch order depending on whether LUT values are (un)signed.
// probably shift res to the right, depending on the scale of your LUT values
You could also combine this with first doing a linear or quadratic approximation and just calculating the difference to your target function.

How would you convert a "while" loop into SIMD instructions?

This is the scalar code I originally had, which I've replicated (x4), storing the data for SIMD:
waveTable *waveTables[4];
for (int i = 0; i < 4; i++) {
int waveTableIindex = 0;
while ((waveTableIindex < kNumWaveTableSlots) && (phaseIncrement[i] >= mWaveTables[waveTableIindex].mTopFreq)) {
waveTableIindex++;
}
waveTables[i] = &mWaveTables[waveTableIindex];
}
Its not "faster" at all, of course. How would you do the same with simd, saving cpu? Any tips/starting point?
I'm with SSE2.
Here's the context of the computation.
The topFreq for each wave table is calculated starting from the maximum harmonic count (x2, due to Nyquist), and is then multiplied by 2 for every subsequent wave table (while the number of harmonics available for each table is halved):
double topFreq = 1.0 / (maxHarmonic * 2);
while (maxHarmonic) {
// fill the table in with the needed harmonics
// ... makeWaveTable() code
// prepare for next table
topFreq *= 2;
maxHarmonic >>= 1;
}
Then, during processing, for each sample I need to pick the correct wave table to use based on the oscillator's frequency (i.e. the phase increment):
freq = clamp(freq, 20.0f, 22050.0f);
phaseIncrement = freq * vSampleTime;
So, for example (with vSampleTime = 1/44100 and maxHarmonic = 500), 30 Hz is wavetable 0, 50 Hz is wavetable 1, and so on.
Assuming your values are FP32, I would do it like this. Untested.
const __m128 phaseIncrements = _mm_loadu_ps( phaseIncrement );
__m128i indices = _mm_setzero_si128();
__m128i activeIndices = _mm_set1_epi32( -1 );
for( size_t idx = 0; idx < kNumWaveTableSlots; idx++ )
{
// Broadcast the mTopFreq value into FP32 vector. If you build this for AVX1, will become 1 very fast instruction.
const __m128 topFreq = _mm_set1_ps( mWaveTables[ idx ].mTopFreq );
// Compare for phaseIncrements >= topFreq
const __m128 cmp_f32 = _mm_cmpge_ps( phaseIncrements, topFreq );
// The following line compiles into no instruction, it's only to please the type checker
__m128i cmp = _mm_castps_si128( cmp_f32 );
// Bitwise AND with activeIndices
cmp = _mm_and_si128( cmp, activeIndices );
// The following line increments the indices vector by 1, only the lanes where cmp was TRUE
indices = _mm_sub_epi32( indices, cmp );
// Update the set of active lane indices
activeIndices = cmp;
// The vector may become completely zero, meaning all 4 lanes have encountered at least 1 value where topFreq < phaseIncrements
if( 0 == _mm_movemask_epi8( activeIndices ) )
break;
}
// Indices vector keeps 4 32-bit integers
// Each lane contains index of the first table entry less than the corresponding lane of phaseIncrements
// Or maybe kNumWaveTableSlots if not found
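One possible (untested) way to consume the resulting indices vector and rebuild the waveTables[4] pointer array from the question; the clamp is only a safety net for the not-found case:
alignas(16) int lane[4];
_mm_store_si128((__m128i*)lane, indices);
for (size_t i = 0; i < 4; i++)
{
    // Clamp kNumWaveTableSlots (not found) to the last valid table.
    int j = lane[i] < (int)kNumWaveTableSlots ? lane[i] : (int)kNumWaveTableSlots - 1;
    waveTables[i] = &mWaveTables[j];
}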
There is no standard way to write SIMD instructions in C++. A compiler may produce SIMD instructions when appropriate, as long as you've configured it to target a CPU that supports such instructions and enabled the relevant optimisations. You can use standard algorithms with std::execution::unsequenced_policy to help the compiler understand that SIMD is appropriate.
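For example (C++20; this is only an illustration of the policy, not the wavetable search itself):
#include <algorithm>
#include <execution>
#include <vector>
void scale(std::vector<float>& v, float k)
{
    std::transform(std::execution::unseq, v.begin(), v.end(), v.begin(),
                   [k](float x) { return x * k; });  // the compiler is allowed to vectorise this loop
}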
If you are using GCC/G++ or Clang, there is a non-standard language extension for vector types, using __attribute__ ((vector_size (xx))). See the GCC manual for details:
https://gcc.gnu.org/onlinedocs/gcc-11.2.0/gcc/Vector-Extensions.html#Vector-Extensions
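A tiny illustration of that extension (GCC/Clang only; compile with optimisations enabled):
typedef float v4sf __attribute__((vector_size(16)));  // four packed floats
v4sf madd(v4sf a, v4sf b, v4sf c)
{
    return a * b + c;  // element-wise; the compiler emits SIMD instructions where available
}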

AVX, Horizontal Sum of Single Precision Complex Numbers?

I have a 256-bit AVX register containing 4 single-precision complex numbers stored as real, imaginary, real, imaginary, etc. I'm currently writing the entire 256-bit register back to memory and summing it there, but that seems inefficient.
How can the complex number horizontal sum be performed using AVX (or AVX2) intrinsics? I would accept an answer using assembly if there is not an answer with comparable efficiency using intrinsics.
Edit: To clarify, if the register contains AR, AI, BR, BI, CR, CI, DR, DI, I want to compute the complex number (AR + BR + CR + DR, AI + BI + CI + DI). If the result is in a 256-bit register, I can extract the 2 single-precision floating-point numbers.
Edit2: Potential solution, though not necessarily optimal...
float hsum_ps_sse3(__m128 v) {
__m128 shuf = _mm_movehdup_ps(v); // broadcast elements 3,1 to 2,0
__m128 sums = _mm_add_ps(v, shuf);
shuf = _mm_movehl_ps(shuf, sums); // high half -> low half
sums = _mm_add_ss(sums, shuf);
return _mm_cvtss_f32(sums);
}
float sumReal = 0.0;
float sumImaginary = 0.0;
__m256i mask = _mm256_set_epi32 (7, 5, 3, 1, 6, 4, 2, 0);
// Separate real and imaginary.
__m256 permutedSum = _mm256_permutevar8x32_ps(sseSum0, mask);
__m128 realSum = _mm256_extractf128_ps(permutedSum , 0);
__m128 imaginarySum = _mm256_extractf128_ps(permutedSum , 1);
// Horizontally sum real and imaginary.
sumReal = hsum_ps_sse3(realSum);
sumImaginary = hsum_ps_sse3(imaginarySum);
One fairly straightforward solution which requires only AVX (not AVX2):
__m128 v0 = _mm256_castps256_ps128(v); // get low 2 complex values
__m128 v1 = _mm256_extractf128_ps(v, 1); // get high 2 complex values
v0 = _mm_add_ps(v0, v1); // add high and low
v1 = _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(1, 0, 3, 2));
v0 = _mm_add_ps(v0, v1); // combine two halves of result
The result will be in v0 as { sum.re, sum.im, sum.re, sum.im }.
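The two scalars can then be extracted like this (untested):
float re = _mm_cvtss_f32(v0);
float im = _mm_cvtss_f32(_mm_shuffle_ps(v0, v0, _MM_SHUFFLE(1, 1, 1, 1)));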

Square root of a OpenCV's grey image using SSE

Given a grey cv::Mat (CV_8UC1), I want to return another cv::Mat containing the square root of the elements (CV_32FC1), and I want to do it with SSE2 intrinsics. I am having some problems with the conversion from 8-bit values to 32-bit float values needed to perform the square root. I would really appreciate any help. This is my code for now (it does not give correct values):
uchar *source = (uchar *)cv::alignPtr(image.data, 16);
float *sqDataPtr = cv::alignPtr((float *)Squared.data, 16);
for (x = 0; x < (pixels - 16); x += 16) {
__m128i a0 = _mm_load_si128((__m128i *)(source + x));
__m128i first8 = _mm_unpacklo_epi8(a0, _mm_set1_epi8(0));
__m128i last8 = _mm_unpackhi_epi8(a0, _mm_set1_epi8(0));
__m128i first4i = _mm_unpacklo_epi16(first8, _mm_set1_epi16(0));
__m128i second4i = _mm_unpackhi_epi16(first8, _mm_set1_epi16(0));
__m128 first4 = _mm_cvtepi32_ps(first4i);
__m128 second4 = _mm_cvtepi32_ps(second4i);
__m128i third4i = _mm_unpacklo_epi16(last8, _mm_set1_epi16(0));
__m128i fourth4i = _mm_unpackhi_epi16(last8, _mm_set1_epi16(0));
__m128 third4 = _mm_cvtepi32_ps(third4i);
__m128 fourth4 = _mm_cvtepi32_ps(fourth4i);
// Store
_mm_store_ps(sqDataPtr + x, _mm_sqrt_ps(first4));
_mm_store_ps(sqDataPtr + x + 4, _mm_sqrt_ps(second4));
_mm_store_ps(sqDataPtr + x + 8, _mm_sqrt_ps(third4));
_mm_store_ps(sqDataPtr + x + 12, _mm_sqrt_ps(fourth4));
}
The SSE code looks OK, except that you're not processing the last 16 pixels:
for (x = 0; x < (pixels - 16); x += 16)
should be:
for (x = 0; x <= (pixels - 16); x += 16)
Note that if your image width is not a multiple of 16 then you will need to take care of any remaining pixels after the last full vector.
Also note that you are taking the sqrt of values in the range 0..255. It may be that you want normalised values in the range 0..1.0, in which case you'll need to scale the values accordingly.
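For example (untested), reusing the names from the loop above, you could scale before taking the root to get sqrt(v/255) in the range 0..1.0:
const __m128 scale = _mm_set1_ps(1.0f / 255.0f);
_mm_store_ps(sqDataPtr + x, _mm_sqrt_ps(_mm_mul_ps(first4, scale)));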
I have no experience with SSE2, but I think that if performance is the issue you should use a look-up table. Creating the look-up table is fast since you have only 256 possible values. Copying 4 bytes from the look-up table into the destination matrix should be a very efficient operation.
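A plain scalar sketch of that idea (untested), reusing source, sqDataPtr and pixels from the question and assuming <cmath> is included:
float lut[256];
for (int i = 0; i < 256; ++i)
    lut[i] = std::sqrt((float)i);
for (int i = 0; i < pixels; ++i)
    sqDataPtr[i] = lut[source[i]];  // one table read and one 4-byte store per pixel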

Horizontal minimum and maximum using SSE

I have a function using SSE to do a lot of stuff, and the profiler shows me that the code portion I use to compute the horizontal minimum and maximum consumes most of the time.
I have been using the following implementation for the minimum for instance:
static inline int16_t hMin(__m128i buffer) {
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi8(buffer, m1));
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi8(buffer, m2));
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi8(buffer, m3));
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi8(buffer, m4));
return ((int8_t*) ((void *) &buffer))[0];
}
I need to compute the minimum and the maximum of 16 1-byte integers, as you see.
Any good suggestions are highly appreciated :)
Thanks
SSE4.1 has an instruction that does almost what you want. Its name is PHMINPOSUW and the C/C++ intrinsic is _mm_minpos_epu16. It is limited to 16-bit unsigned values and cannot give a maximum, but these problems are easily solved:
1. If you need to find the minimum of non-negative bytes, do nothing. If the bytes may be negative, add 128 to each. If you need the maximum, subtract each from 127.
2. Use either _mm_srli_epi16 or _mm_shuffle_epi8, and then _mm_min_epu8, to get 8 pairwise minimum values in the even bytes and zeros in the odd bytes of some XMM register. (These zeros are produced by the shift/shuffle instruction and should remain in place after _mm_min_epu8.)
3. Use _mm_minpos_epu16 to find the minimum among these values.
4. Extract the resulting minimum value with _mm_cvtsi128_si32.
5. Undo the effect of step 1 to get the original byte value.
Here is an example that returns maximum of 16 signed bytes:
static inline int16_t hMax(__m128i buffer)
{
__m128i tmp1 = _mm_sub_epi8(_mm_set1_epi8(127), buffer);
__m128i tmp2 = _mm_min_epu8(tmp1, _mm_srli_epi16(tmp1, 8));
__m128i tmp3 = _mm_minpos_epu16(tmp2);
return (int8_t)(127 - _mm_cvtsi128_si32(tmp3));
}
I suggest two changes:
1. Replace ((int8_t*) ((void *) &buffer))[0] with _mm_cvtsi128_si32.
2. Replace _mm_shuffle_epi8 with _mm_shuffle_epi32/_mm_shufflelo_epi16, which have lower latency on recent AMD processors and Intel Atom, and will save you memory load operations:
static inline int16_t hMin(__m128i buffer)
{
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi32(buffer, _MM_SHUFFLE(3, 2, 3, 2)));
buffer = _mm_min_epi8(buffer, _mm_shuffle_epi32(buffer, _MM_SHUFFLE(1, 1, 1, 1)));
buffer = _mm_min_epi8(buffer, _mm_shufflelo_epi16(buffer, _MM_SHUFFLE(1, 1, 1, 1)));
buffer = _mm_min_epi8(buffer, _mm_srli_epi16(buffer, 8));
return (int8_t)_mm_cvtsi128_si32(buffer);
}
Here's an implementation without shuffle; shuffle is slow on AMD Ryzen 7 5000-series CPUs for some reason:
float max_elem3() const {
__m128 a = _mm_unpacklo_ps(mm, mm); // x x y y
__m128 b = _mm_unpackhi_ps(mm, mm); // z z w w
__m128 c = _mm_max_ps(a, b); // ..., max(x, z), ..., ...
Vector4 res = _mm_max_ps(mm, c); // ..., max(y, max(x, z)), ..., ...
return res.y;
}
float min_elem3() const {
__m128 a = _mm_unpacklo_ps(mm, mm); // x x y y
__m128 b = _mm_unpackhi_ps(mm, mm); // z z w w
__m128 c = _mm_min_ps(a, b); // ..., min(x, z), ..., ...
Vector4 res = _mm_min_ps(mm, c); // ..., min(y, min(x, z)), ..., ...
return res.y;
}