I have a Paeth Prediction function which operates on arrays:
std::array<std::uint8_t,4> birunji::paeth_prediction
    (const std::array<std::uint8_t,4>& a,
     const std::array<std::uint8_t,4>& b,
     const std::array<std::uint8_t,4>& c)
{
    std::array<std::int16_t,4> pa;
    std::array<std::int16_t,4> pb;
    std::array<std::int16_t,4> pc;
    std::array<std::uint8_t,4> results;
    for(std::size_t i = 0; i < 4; ++i)
    {
        pa[i] = b[i] - c[i];
        pb[i] = a[i] - c[i];
        pc[i] = pa[i] + pb[i];
        pa[i] = std::abs(pa[i]);
        pb[i] = std::abs(pb[i]);
        pc[i] = std::abs(pc[i]);
        if(pa[i] <= pb[i] && pa[i] <= pc[i])
            results[i] = a[i];
        else if(pb[i] <= pc[i])
            results[i] = b[i];
        else
            results[i] = c[i];
    }
    return results;
}
I'm attempting to use intrinsics manually to vectorise the code (for learning purposes).
__m128i birunji::paeth_prediction(const __m128i& a,
                                  const __m128i& b,
                                  const __m128i& c)
{
    __m128i pa = _mm_sub_epi16(b, c);
    __m128i pb = _mm_sub_epi16(a, c);
    __m128i pc = _mm_add_epi16(pa, pb);
    pa = _mm_abs_epi16(pa);
    pb = _mm_abs_epi16(pb);
    pc = _mm_abs_epi16(pc);
    __m128i pa_le_pb = _mm_cmpgt_epi16(pb, pa);
    __m128i pa_le_pc = _mm_cmpgt_epi16(pc, pa);
    __m128i pb_le_pc = _mm_cmpgt_epi16(pc, pb);
    return
        _mm_and_si128(_mm_and_si128(pa_le_pb, pa_le_pc),
                      _mm_and_si128(_mm_and_si128(pb_le_pc, b), a));
}
The trouble I'm having is with the conditional statements. How do I successfully vectorize these? I'm not sure if my attempt above is correct.
_mm_cmpgt_epi16 can be used for the comparisons. Note that _mm_cmpgt_epi16(a, b) = !(a <= b), but _mm_cmpgt_epi16(b, a) != (a <= b), because it is not a greater-or-equal comparison but a strict greater-than comparison. So the masks come out inverted, but that is equally useful in this case; an explicit inversion won't be necessary.
This function should not return a condition itself, it should select from a and b and c according to the conditions. If SSE4.1 is available, _mm_blendv_epi8 can be used to implement that selection. For example (not tested):
__m128i paeth_prediction(__m128i a, __m128i b, __m128i c)
{
    __m128i pa = _mm_sub_epi16(b, c);
    __m128i pb = _mm_sub_epi16(a, c);
    __m128i pc = _mm_add_epi16(pa, pb);
    pa = _mm_abs_epi16(pa);
    pb = _mm_abs_epi16(pb);
    pc = _mm_abs_epi16(pc);
    __m128i not_pa_le_pb = _mm_cmpgt_epi16(pa, pb);
    __m128i not_pa_le_pc = _mm_cmpgt_epi16(pa, pc);
    __m128i not_pb_le_pc = _mm_cmpgt_epi16(pb, pc);
    __m128i not_take_a = _mm_or_si128(not_pa_le_pb, not_pa_le_pc);
    __m128i t = _mm_blendv_epi8(b, c, not_pb_le_pc);
    return _mm_blendv_epi8(a, t, not_take_a);
}
The last two lines implement logic like:
if PB is not less-than-or-equal-to PC, take C, otherwise take B.
if PA is not less-than-or-equal-to PB or PA is not less-than-or-equal-to PC, take the result from the previous step, otherwise take A.
Without SSE4.1, the blends could be implemented using AND/ANDNOT/OR.
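For illustration, a minimal untested sketch of such an SSE2-only blend; the helper name blendv_sse2 is just for this example, and it relies on the compare results having every byte either 0x00 or 0xFF:
static inline __m128i blendv_sse2(__m128i a, __m128i b, __m128i mask)
{
    // mask ? b : a, per byte; valid because cmpgt produces all-zeros or all-ones lanes
    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a));
}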
I've changed the signature of the function so it takes the vectors by value; taking them by const reference is unnecessary (vectors are trivial to copy) and can add overhead from an indirection, though such overhead is likely to be removed if the function ends up being inlined by the compiler.
As a variant, _mm_min_epi16 could be used to implement part of the logic:
__m128i paeth_prediction(__m128i a, __m128i b, __m128i c)
{
    __m128i pa = _mm_sub_epi16(b, c);
    __m128i pb = _mm_sub_epi16(a, c);
    __m128i pc = _mm_add_epi16(pa, pb);
    pa = _mm_abs_epi16(pa);
    pb = _mm_abs_epi16(pb);
    pc = _mm_abs_epi16(pc);
    __m128i not_pb_le_pc = _mm_cmpgt_epi16(pb, pc);
    __m128i take_a = _mm_cmpeq_epi16(pa, _mm_min_epi16(pa, _mm_min_epi16(pb, pc)));
    __m128i t = _mm_blendv_epi8(b, c, not_pb_le_pc);
    return _mm_blendv_epi8(t, a, take_a);
}
Because the condition pa <= pb && pa <= pc is equivalent to pa == min(pa, pb, pc).
The resulting assembly code looks a bit better, but I did not test it in any way, including performance.
You can simplify your calculations by completely avoiding any conversion to int16_t.
First of all, note that pa<=pc as well as pb<=pc is true if and only if a<=c<=b or b<=c<=a. If c is smaller than or equal to both a and b, max(a,b) will be returned; if c is larger than or equal to both, min(a,b) is returned.
So we can first "sort" a, b using a min and max operation,
A = min(a,b)
B = max(a,b)
which leaves three possible cases:
A<=B<=c --> A
c<=A<=B --> B
A< c< B --> c
This means in C++ code
std::array<std::uint8_t,4> birunji::paeth_prediction
    (const std::array<std::uint8_t,4>& a,
     const std::array<std::uint8_t,4>& b,
     const std::array<std::uint8_t,4>& c)
{
    std::array<std::uint8_t,4> results;
    for(std::size_t i = 0; i < 4; ++i)
    {
        uint8_t A = std::min(a[i],b[i]);
        uint8_t B = std::max(a[i],b[i]);
        if (B<=c[i])     results[i] = A;
        else if(c[i]<=A) results[i] = B;
        else             results[i] = c[i];
    }
    return results;
}
Unfortunately, there is no unsigned SIMD comparison (before AVX-512), but we can simulate this using (x<=y) == (max(x,y)==y) (or by doing a saturated subtraction and comparing with zero).
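For instance, the saturating-subtraction variant of an unsigned byte comparison x <= y could look like this untested one-liner (x and y are just placeholder vectors here):
__m128i x_le_y = _mm_cmpeq_epi8(_mm_subs_epu8(x, y), _mm_setzero_si128()); // x - y saturates to 0 exactly when x <= y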
Possible (untested) SIMD implementation (this would also work for arbitrarily many elements -- but you can just load four elements into the lowest 32 bits and ignore the rest of the result):
__m128i paeth_prediction(__m128i a, __m128i b, __m128i c)
{
    __m128i A = _mm_min_epu8(a, b);
    __m128i B = _mm_max_epu8(a, b);
    __m128i A_greater_equal_c = _mm_cmpeq_epi8(_mm_max_epu8(A, c), A);
    __m128i B_less_equal_c    = _mm_cmpeq_epi8(_mm_min_epu8(B, c), B);
    // if you don't have SSE 4.1, this can be done using bitwise and/or operations:
    __m128i t = _mm_blendv_epi8(c, B, A_greater_equal_c);
    return _mm_blendv_epi8(t, A, B_less_equal_c);
}
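If you want to keep the original std::array interface, a hypothetical (untested) wrapper could load the four bytes into the low 32 bits as mentioned above; the name paeth_prediction_4 is only illustrative, and it assumes <array>, <cstdint>, <cstring> and the intrinsics header are included:
std::array<std::uint8_t,4> paeth_prediction_4(const std::array<std::uint8_t,4>& a,
                                              const std::array<std::uint8_t,4>& b,
                                              const std::array<std::uint8_t,4>& c)
{
    std::uint32_t ua, ub, uc;
    std::memcpy(&ua, a.data(), 4);
    std::memcpy(&ub, b.data(), 4);
    std::memcpy(&uc, c.data(), 4);
    // the upper 96 bits are zero and their results are simply ignored
    __m128i r = paeth_prediction(_mm_cvtsi32_si128(int(ua)),
                                 _mm_cvtsi32_si128(int(ub)),
                                 _mm_cvtsi32_si128(int(uc)));
    std::uint32_t ur = std::uint32_t(_mm_cvtsi128_si32(r));
    std::array<std::uint8_t,4> results;
    std::memcpy(results.data(), &ur, 4);
    return results;
}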
Related
I've recently found that my program spends most of its time in the following simple function:
void SumOfSquaredDifference(
    const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    *sum = 0;
    for(size_t row = 0; row < height; ++row)
    {
        int rowSum = 0;
        for(size_t col = 0; col < width; ++col)
        {
            int d = a[col] - b[col];
            rowSum += d*d;
        }
        *sum += rowSum;
        a += aStride;
        b += bStride;
    }
}
This function finds the sum of squared differences of two 8-bit gray images.
I think there is a way to improve its performance using SSE, but I don't have any experience in this area.
Could anybody help me?
Of course, you can improve your code.
This is an example of optimizing your function using SSE2:
const __m128i Z = _mm_setzero_si128();
const size_t A = sizeof(__m128i);

inline __m128i SquaredDifference(__m128i a, __m128i b)
{
    const __m128i aLo = _mm_unpacklo_epi8(a, Z);
    const __m128i bLo = _mm_unpacklo_epi8(b, Z);
    const __m128i dLo = _mm_sub_epi16(aLo, bLo);
    const __m128i aHi = _mm_unpackhi_epi8(a, Z);
    const __m128i bHi = _mm_unpackhi_epi8(b, Z);
    const __m128i dHi = _mm_sub_epi16(aHi, bHi);
    return _mm_add_epi32(_mm_madd_epi16(dLo, dLo), _mm_madd_epi16(dHi, dHi));
}

inline __m128i HorizontalSum32(__m128i a)
{
    return _mm_add_epi64(_mm_unpacklo_epi32(a, Z), _mm_unpackhi_epi32(a, Z));
}

inline uint64_t ExtractSum64(__m128i a)
{
    uint64_t _a[2];
    _mm_storeu_si128((__m128i*)_a, a);
    return _a[0] + _a[1];
}
void SumOfSquaredDifference(
    const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    assert(width%A == 0 && width < 0x10000);
    __m128i fullSum = Z;
    for(size_t row = 0; row < height; ++row)
    {
        __m128i rowSum = Z;
        for(size_t col = 0; col < width; col += A)
        {
            const __m128i a_ = _mm_loadu_si128((__m128i*)(a + col));
            const __m128i b_ = _mm_loadu_si128((__m128i*)(b + col));
            rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_));
        }
        fullSum = _mm_add_epi64(fullSum, HorizontalSum32(rowSum));
        a += aStride;
        b += bStride;
    }
    *sum = ExtractSum64(fullSum);
}
This example is somewhat simplified (it doesn't work if the image width isn't a multiple of 16).
Full version of the algorithm is here.
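In case that link is not at hand, here is one untested sketch of how the simplified version above could handle widths that are not a multiple of 16, reusing the helpers defined earlier; SumOfSquaredDifferenceAnyWidth is just an illustrative name:
void SumOfSquaredDifferenceAnyWidth(
    const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    assert(width < 0x10000); // keeps the 32-bit lanes of rowSum from overflowing
    const size_t alignedWidth = width - width%A; // largest multiple of 16 not exceeding width
    __m128i fullSum = Z;
    uint64_t tailSum = 0;
    for(size_t row = 0; row < height; ++row)
    {
        __m128i rowSum = Z;
        for(size_t col = 0; col < alignedWidth; col += A)
        {
            const __m128i a_ = _mm_loadu_si128((__m128i*)(a + col));
            const __m128i b_ = _mm_loadu_si128((__m128i*)(b + col));
            rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_));
        }
        fullSum = _mm_add_epi64(fullSum, HorizontalSum32(rowSum));
        for(size_t col = alignedWidth; col < width; ++col) // scalar tail for the leftover columns
        {
            int d = a[col] - b[col];
            tailSum += d*d;
        }
        a += aStride;
        b += bStride;
    }
    *sum = ExtractSum64(fullSum) + tailSum;
}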
And some magic from SSSE3 version:
const __m128i K_1FF = _mm_set1_epi16(0x1FF);

inline __m128i SquaredDifference(__m128i a, __m128i b)
{
    const __m128i lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(a, b), K_1FF);
    const __m128i hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(a, b), K_1FF);
    return _mm_add_epi32(_mm_madd_epi16(lo, lo), _mm_madd_epi16(hi, hi));
}
The magic description (see _mm_maddubs_epi16):
K_1FF -> {-1, 1, -1, 1, ...};
_mm_unpacklo_epi8(a, b) -> {a0, b0, a1, b1, ...};
_mm_maddubs_epi16(_mm_unpacklo_epi8(a, b), K_1FF) -> {b0 - a0, b1 - a1, ...};
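For one byte pair, say a0 = 3 and b0 = 10, the multiply-add yields 3*(-1) + 10*1 = 7 = b0 - a0 (the first _mm_maddubs_epi16 operand is treated as unsigned bytes, the second as signed bytes), and the following _mm_madd_epi16(lo, lo) then squares these signed 16-bit differences and sums adjacent pairs into 32-bit lanes, just as in the SSE2 version.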
GCC has switches that encourage it to vectorize the code. For example, the -mfma switch gives me about a 25% speed increase on simple loops like this, using doubles. I imagine it's even better with 8-bit integers. I prefer that over hand-written optimizations because your code stays readable.
That said, there are a few old tricks that can speed up your loop:
Don't index; increment your pointer in every loop iteration. You do this in the outer loop, so do the same in the inner loop. You can create a new pointer before going into the inner loop, so the += stride stays valid.
Don't assign into the sum pointer inside your loop; use a local variable to accumulate and copy it to the output when done. You use rowSum, but only in the inner loop. Use such a variable across both loops instead.
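A rough, untested sketch of the scalar loop restructured along those lines:
void SumOfSquaredDifference(
    const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    uint64_t total = 0;                 // local accumulator instead of writing through *sum
    for(size_t row = 0; row < height; ++row)
    {
        const uint8_t * pa = a;         // fresh pointers for this row
        const uint8_t * pb = b;
        int rowSum = 0;
        for(size_t col = 0; col < width; ++col)
        {
            int d = *pa++ - *pb++;      // increment pointers instead of indexing
            rowSum += d*d;
        }
        total += rowSum;
        a += aStride;
        b += bStride;
    }
    *sum = total;                       // single store at the end
}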
I tried to enable vectorization of an often-used function to improve the performance.
The algorithm should do the following and is called ~4,000,000 times!
Input: double* cellvalue
Output: int8* output (8-bit integer, C++ char)
Algo:
if (cellvalue > upper_threshold )
*output = 1;
else if (cellvalue < lower_threshold)
*output = -1;
else
*output = 0;
My first vectorization approach to compute 2 doubles in parallel looks like:
__m128d lowerThresh = _mm_set1_pd(m_lowerThreshold);
__m128d upperThresh = _mm_set1_pd(m_upperThreshold);
__m128d vec = _mm_load_pd(cellvalue);
__m128d maskLower = _mm_cmplt_pd(vec, lowerThresh); // less than
__m128d maskUpper = _mm_cmpgt_pd(vec, upperThresh); // greater than
static const tInt8 negOne = -1;
static const tInt8 posOne = 1;
output[0] = (negOne & *((tInt8*)&maskLower.m128d_f64[0])) | (posOne & *((tInt8*)&maskUpper.m128d_f64[0]));
output[1] = (negOne & *((tInt8*)&maskLower.m128d_f64[1])) | (posOne & *((tInt8*)&maskUpper.m128d_f64[1]));
Does this make sense to you? It works, but I think the last part that creates the output is very complicated. Is there any faster method to do this?
Also, I tried to compute 8 values at once with nearly the same code. Will this perform better? Does the order of instructions make sense?
__m128d lowerThresh = _mm_set1_pd(m_lowerThreshold);
__m128d upperThresh = _mm_set1_pd(m_upperThreshold);
// load 4 times
__m128d vec0 = _mm_load_pd(cellValue);
__m128d vec1 = _mm_load_pd(cellValue + 2);
__m128d vec2 = _mm_load_pd(cellValue + 4);
__m128d vec3 = _mm_load_pd(cellValue + 6);
__m128d maskLower0 = _mm_cmplt_pd(vec0, lowerThresh); // less than
__m128d maskLower1 = _mm_cmplt_pd(vec1, lowerThresh); // less than
__m128d maskLower2 = _mm_cmplt_pd(vec2, lowerThresh); // less than
__m128d maskLower3 = _mm_cmplt_pd(vec3, lowerThresh); // less than
__m128d maskUpper0 = _mm_cmpgt_pd(vec0, upperThresh); // greater than
__m128d maskUpper1 = _mm_cmpgt_pd(vec1, upperThresh); // greater than
__m128d maskUpper2 = _mm_cmpgt_pd(vec2, upperThresh); // greater than
__m128d maskUpper3 = _mm_cmpgt_pd(vec3, upperThresh); // greater than
static const tInt8 negOne = -1;
static const tInt8 posOne = 1;
output[0] = (negOne & *((tInt8*)&maskLower0.m128d_f64[0])) | (posOne & *((tInt8*)&maskUpper0.m128d_f64[0]));
output[1] = (negOne & *((tInt8*)&maskLower0.m128d_f64[1])) | (posOne & *((tInt8*)&maskUpper0.m128d_f64[1]));
output[2] = (negOne & *((tInt8*)&maskLower1.m128d_f64[0])) | (posOne & *((tInt8*)&maskUpper1.m128d_f64[0]));
output[3] = (negOne & *((tInt8*)&maskLower1.m128d_f64[1])) | (posOne & *((tInt8*)&maskUpper1.m128d_f64[1]));
output[4] = (negOne & *((tInt8*)&maskLower2.m128d_f64[0])) | (posOne & *((tInt8*)&maskUpper2.m128d_f64[0]));
output[5] = (negOne & *((tInt8*)&maskLower2.m128d_f64[1])) | (posOne & *((tInt8*)&maskUpper2.m128d_f64[1]));
output[6] = (negOne & *((tInt8*)&maskLower3.m128d_f64[0])) | (posOne & *((tInt8*)&maskUpper3.m128d_f64[0]));
output[7] = (negOne & *((tInt8*)&maskLower3.m128d_f64[1])) | (posOne & *((tInt8*)&maskUpper3.m128d_f64[1]));
Hopefully you can help me to understand the vectorization thing a bit better ;)
_mm_cmplt_pd and _mm_cmpgt_pd produce a result that is already either 0 or -1; anding it with -1 does nothing, and anding it with 1 is equivalent to negating it. Thus, if upper_threshold > lower_threshold (so that the two conditions are never true at the same time), you can just write*:
_mm_storeu_si128((__m128i*)output, _mm_sub_epi64(maskLower, maskUpper));
(*) it's unclear what an "int8" is in your code; that's not a standard type in C++. It could be an 8-byte int, which is the behavior I've used here. If it's an 8-bit int instead, you'll want to pack up a bunch of results to store together.
The questioner clarifies that int8 is intended to be an 8-bit integer. In that case, you can do the following for a quick implementation:
__m128i result = _mm_sub_epi64(maskLower, maskUpper);
output[0] = result.m128i_i64[0]; // .m128i_i64 is an oddball MSVC-ism, so
output[1] = result.m128i_i64[1]; // I'm not 100% sure about the syntax here.
but you may also want to try packing eight result vectors together and storing them with a single store operation.
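As a hypothetical (untested) sketch of that packing step, suppose r0..r7 are eight such result vectors, each holding two 64-bit values in {-1, 0, +1}; pack_results is an illustrative name:
static inline __m128i pack_results(__m128i r0, __m128i r1, __m128i r2, __m128i r3,
                                   __m128i r4, __m128i r5, __m128i r6, __m128i r7)
{
    // Keep only the low 32 bits of each 64-bit result (the values -1/0/+1 fit easily).
    __m128i p01 = _mm_unpacklo_epi64(_mm_shuffle_epi32(r0, _MM_SHUFFLE(2,0,2,0)),
                                     _mm_shuffle_epi32(r1, _MM_SHUFFLE(2,0,2,0))); // 4 x int32
    __m128i p23 = _mm_unpacklo_epi64(_mm_shuffle_epi32(r2, _MM_SHUFFLE(2,0,2,0)),
                                     _mm_shuffle_epi32(r3, _MM_SHUFFLE(2,0,2,0)));
    __m128i p45 = _mm_unpacklo_epi64(_mm_shuffle_epi32(r4, _MM_SHUFFLE(2,0,2,0)),
                                     _mm_shuffle_epi32(r5, _MM_SHUFFLE(2,0,2,0)));
    __m128i p67 = _mm_unpacklo_epi64(_mm_shuffle_epi32(r6, _MM_SHUFFLE(2,0,2,0)),
                                     _mm_shuffle_epi32(r7, _MM_SHUFFLE(2,0,2,0)));
    __m128i w0 = _mm_packs_epi32(p01, p23); // 8 x int16, order preserved
    __m128i w1 = _mm_packs_epi32(p45, p67);
    return _mm_packs_epi16(w0, w1);         // 16 x int8, ready for one _mm_storeu_si128
}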
If you change the code not to branch, then a modern compiler will do the vectorization for you.
Here's the test I ran:
#include <stdint.h>
#include <iostream>
#include <random>
#include <vector>
#include <chrono>

using Clock = std::chrono::steady_clock;
using std::chrono::milliseconds;

typedef double Scalar;
typedef int8_t Integer;

const Scalar kUpperThreshold = .5;
const Scalar kLowerThreshold = .2;

void compute_comparisons1(int n, const Scalar* xs, Integer* ys) {
#pragma simd
    for (int i=0; i<n; ++i) {
        Scalar x = xs[i];
        ys[i] = (x > kUpperThreshold) - (x < kLowerThreshold);
    }
}

void compute_comparisons2(int n, const Scalar* xs, Integer* ys) {
    for (int i=0; i<n; ++i) {
        Scalar x = xs[i];
        Integer& y = ys[i];
        if (x > kUpperThreshold)
            y = 1;
        else if(x < kLowerThreshold)
            y = -1;
        else
            y = 0;
    }
}

const int N = 4000000;
auto random_generator = std::mt19937{0};

int main() {
    std::vector<Scalar> xs(N);
    std::vector<Integer> ys1(N);
    std::vector<Integer> ys2(N);

    std::uniform_real_distribution<Scalar> dist(0, 1);
    for (int i=0; i<N; ++i)
        xs[i] = dist(random_generator);

    auto time0 = Clock::now();
    compute_comparisons1(N, xs.data(), ys1.data());
    auto time1 = Clock::now();
    compute_comparisons2(N, xs.data(), ys2.data());
    auto time2 = Clock::now();

    std::cout << "v1: " << std::chrono::duration_cast<milliseconds>(time1 - time0).count() << "\n";
    std::cout << "v2: " << std::chrono::duration_cast<milliseconds>(time2 - time1).count() << "\n";

    for (int i=0; i<N; ++i) {
        if (ys1[i] != ys2[i]) {
            std::cout << "Error!\n";
            return -1;
        }
    }
    return 0;
}
If you compile with a recent version of gcc (I used 4.8.3) and use the flags "-O3 -std=c++11 -march=native -S", you can verify by looking at the assembly that it vectorizes the code. And it runs much faster (3 milliseconds vs 16 milliseconds on my machine).
Also, I'm not sure what your requirements are, but if you can live with less precision, then using float instead of double will further improve the speed (double takes 1.8x as long on my machine).
I have three unsigned 32-bit integers, say a, b, and c. If the lowest bit of b is 1, I want to XOR c with a, and store the result into c. We can do this in the following way:
#include <cassert>
int main()
{
// Some values for a and c
unsigned a = 16;
unsigned c = 25;
unsigned b = 5; // 101_2
if(b & 1)
{
c ^= a;
}
assert(c == 9);
}
Can I do this conditionally without branching, that is, with no if-statements?
There are lots of ways to do this.
Here's another, no multiply, only 4 operations.
c ^= a&(-(b&1));
This should work
c ^= a * ( b & 1 );
This is without an if statement and without branching, but to be sure of that you have to check the assembly output of your compiler:
c ^= ~((b & 1) - 1) & a;
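As a quick, untested sanity check, all three branch-free variants agree with the branching original for the sample values from the question:
#include <cassert>

int main()
{
    unsigned a = 16, b = 5;
    unsigned c0 = 25, c1 = 25, c2 = 25, c3 = 25;
    if (b & 1) c0 ^= a;          // original, branching
    c1 ^= a & (-(b & 1));        // mask built from the low bit of b
    c2 ^= a * (b & 1);           // multiply by 0 or 1
    c3 ^= ~((b & 1) - 1) & a;    // (b & 1) - 1 is 0 or all-ones, then inverted
    assert(c0 == 9 && c1 == 9 && c2 == 9 && c3 == 9);
}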
With your help, I have used SSE in my code (sample below) with a significant performance boost, and I was wondering if this boost could be improved further by using the 256-bit registers of AVX.
int result[4] __attribute__((aligned(16))) = {0};
__m128i vresult = _mm_set1_epi32(0);
__m128i v1, v2, vmax;

for (int k = 0; k < limit; k += 4) {
    v1 = _mm_load_si128((__m128i *) & myVector[positionNodeId + k]);
    v2 = _mm_load_si128((__m128i *) & myVector2[k]);
    vmax = _mm_add_epi32(v1, v2);
    vresult = _mm_max_epi32(vresult, vmax);
}
_mm_store_si128((__m128i *) result, vresult);
return max(max(max(result[0], result[1]), result[2]), result[3]);
So, I have 3 questions: How would the above rather simple SSE code be converted to AVX? What header should I include for that? And what flag should I pass to my gcc compiler (instead of -msse4.1) for AVX to work?
Thanks in advance for your help.
1.) This code can be easily converted to AVX2 (see below)
2.) #include <x86intrin.h>
3.) compile with -mavx2
You will need a CPU that supports AVX2. Currently only Intel Haswell processors support this. I don't have a Haswell processor (yet) so I could not test the code.
int result[8] __attribute__((aligned(32))) = {0};
__m256i vresult = _mm256_set1_epi32(0);
__m256i v1, v2, vmax;

for (int k = 0; k < limit; k += 8) {
    v1 = _mm256_load_si256((__m256i *) & myVector[positionNodeId + k]);
    v2 = _mm256_load_si256((__m256i *) & myVector2[k]);
    vmax = _mm256_add_epi32(v1, v2);
    vresult = _mm256_max_epi32(vresult, vmax);
}
return horizontal_max_Vec8i(vresult);
//_mm256_store_si256((__m256i *) result, vresult);
//int mymax = result[0];
//for(int k=1; k<8; k++) {
//    if(result[k]>mymax) mymax = result[k];
//}
//return mymax;
Edit: I suspect that, since you are only running over 64 elements, the horizontal max has a small but not insignificant computation time. I came up with a horizontal_max_Vec4i function for SSE and a horizontal_max_Vec8i function for AVX (it does not need AVX2). Try replacing max(max(max(result[0], result[1]), result[2]), result[3]) with horizontal_max_Vec4i.
int horizontal_max_Vec4i(__m128i x) {
    __m128i max1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(0,0,3,2));
    __m128i max2 = _mm_max_epi32(x, max1);
    __m128i max3 = _mm_shuffle_epi32(max2, _MM_SHUFFLE(0,0,0,1));
    __m128i max4 = _mm_max_epi32(max2, max3);
    return _mm_cvtsi128_si32(max4);
}

int horizontal_max_Vec8i(__m256i x) {
    __m128i low  = _mm256_castsi256_si128(x);
    __m128i high = _mm256_extractf128_si256(x, 1);
    return horizontal_max_Vec4i(_mm_max_epi32(low, high));
}
I'm new to SSE, and my knowledge is limited. I'm trying to vectorize my code (C++, using gcc), which is actually quite simple.
I have an array of unsigned ints, and I only check for elements that are >= or <= some constant. As a result, I need an array with the elements that passed the condition.
I'm thinking of using _mm_cmpge_ps as a mask, but this construct works on floats, not ints!? :(
Any suggestion or help is very much appreciated.
It's pretty easy to just mask out (i.e. set to 0) all non-matching ints, e.g.:
#include <emmintrin.h> // SSE2 intrinsics

for (int i = 0; i < N; i += 4)
{
    __m128i v = _mm_load_si128((__m128i *)&a[i]);
    __m128i vcmp0 = _mm_cmpgt_epi32(v, _mm_set1_epi32(MIN_VAL - 1));
    __m128i vcmp1 = _mm_cmplt_epi32(v, _mm_set1_epi32(MAX_VAL + 1));
    __m128i vcmp = _mm_and_si128(vcmp0, vcmp1);
    v = _mm_and_si128(v, vcmp);
    _mm_store_si128((__m128i *)&a[i], v);
}
Note that a needs to be 16-byte aligned and N needs to be a multiple of 4 - if these constraints are a problem then it's not too hard to extend the code to cope with this.
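For instance, a hypothetical, untested variant could drop both constraints by switching to unaligned loads/stores and adding a scalar tail (same MIN_VAL/MAX_VAL masking as above):
int i = 0;
for (; i + 4 <= N; i += 4)
{
    __m128i v = _mm_loadu_si128((__m128i *)&a[i]);   // unaligned load, no alignment requirement
    __m128i vcmp0 = _mm_cmpgt_epi32(v, _mm_set1_epi32(MIN_VAL - 1));
    __m128i vcmp1 = _mm_cmplt_epi32(v, _mm_set1_epi32(MAX_VAL + 1));
    v = _mm_and_si128(v, _mm_and_si128(vcmp0, vcmp1));
    _mm_storeu_si128((__m128i *)&a[i], v);
}
for (; i < N; ++i)                                    // scalar tail when N is not a multiple of 4
    if (a[i] < MIN_VAL || a[i] > MAX_VAL)
        a[i] = 0;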
Here you go. Here are three functions.
The first function, foo_v1, is based on Paul R's answer.
The second function, foo_v2, is based on a popular question today: Fastest way to determine if an integer is between two integers (inclusive) with known sets of values.
The third function, foo_v3, uses Agner Fog's vectorclass, which I added only to show how much easier and cleaner it is to use his class. If you don't have the class, just comment out the #include "vectorclass.h" line and the foo_v3 function. I used Vec8ui, which means it will use AVX2 if available and break the operation into two Vec4ui otherwise, so you don't have to change your code to get the benefit of AVX2.
#include <stdio.h>
#include <nmmintrin.h> // SSE4.2
#include "vectorclass.h"

void foo_v1(const int N, int *a, const int MAX_VAL, const int MIN_VAL) {
    for (int i = 0; i < N; i += 4) {
        __m128i v = _mm_load_si128((const __m128i*)&a[i]);
        __m128i vcmp0 = _mm_cmpgt_epi32(v, _mm_set1_epi32(MIN_VAL - 1));
        __m128i vcmp1 = _mm_cmplt_epi32(v, _mm_set1_epi32(MAX_VAL + 1));
        __m128i vcmp = _mm_and_si128(vcmp0, vcmp1);
        v = _mm_and_si128(v, vcmp);
        _mm_store_si128((__m128i*)&a[i], v);
    }
}

void foo_v2(const int N, int *a, const int MAX_VAL, const int MIN_VAL) {
    //if ((unsigned)(number-lower) < (upper-lower))
    for (int i = 0; i < N; i += 4) {
        __m128i v = _mm_load_si128((const __m128i*)&a[i]);
        __m128i dv = _mm_sub_epi32(v, _mm_set1_epi32(MIN_VAL));
        __m128i min_ab = _mm_min_epu32(dv, _mm_set1_epi32(MAX_VAL-MIN_VAL));
        __m128i vcmp = _mm_cmpeq_epi32(dv, min_ab);
        v = _mm_and_si128(v, vcmp);
        _mm_store_si128((__m128i*)&a[i], v);
    }
}

void foo_v3(const int N, int *a, const int MAX_VAL, const int MIN_VAL) {
    //if ((unsigned)(number-lower) < (upper-lower))
    for (int i = 0; i < N; i += 8) {
        Vec8ui va = Vec8ui().load(&a[i]);
        va &= (va - MIN_VAL) <= (MAX_VAL-MIN_VAL);
        va.store(&a[i]);
    }
}

int main() {
    const int N = 16;
    int* a = (int*)_mm_malloc(sizeof(int)*N, 16);
    for(int i=0; i<N; i++) {
        a[i] = i;
    }
    foo_v2(N, a, 7, 3);
    for(int i=0; i<N; i++) {
        printf("%d ", a[i]);
    } printf("\n");
    _mm_free(a);
}
The first place to look might be the Intel® Intrinsics Guide.