I recently found that my program spends most of its time in the following simple function:
void SumOfSquaredDifference(
const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
size_t width, size_t height, uint64_t * sum)
{
*sum = 0;
for(size_t row = 0; row < height; ++row)
{
int rowSum = 0;
for(size_t col = 0; col < width; ++col)
{
int d = a[col] - b[col];
rowSum += d*d;
}
*sum += rowSum;
a += aStride;
b += bStride;
}
}
This function computes the sum of squared differences of two 8-bit grayscale images.
I think there is a way to improve its performance using SSE, but I don't have any experience in this area.
Could anybody help me?
Of course, you can improve your code.
This is an example of optimizing your function using SSE2:
const __m128i Z = _mm_setzero_si128();
const size_t A = sizeof(__m128i);
inline __m128i SquaredDifference(__m128i a, __m128i b)
{
const __m128i aLo = _mm_unpacklo_epi8(a, Z);
const __m128i bLo = _mm_unpacklo_epi8(b, Z);
const __m128i dLo = _mm_sub_epi16(aLo, bLo);
const __m128i aHi = _mm_unpackhi_epi8(a, Z);
const __m128i bHi = _mm_unpackhi_epi8(b, Z);
const __m128i dHi = _mm_sub_epi16(aHi, bHi);
return _mm_add_epi32(_mm_madd_epi16(dLo, dLo), _mm_madd_epi16(dHi, dHi));
}
inline __m128i HorizontalSum32(__m128i a)
{
return _mm_add_epi64(_mm_unpacklo_epi32(a, Z), _mm_unpackhi_epi32(a, Z));
}
inline uint64_t ExtractSum64(__m128i a)
{
uint64_t _a[2];
_mm_storeu_si128((__m128i*)_a, a);
return _a[0] + _a[1];
}
void SumOfSquaredDifference(
const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
size_t width, size_t height, uint64_t * sum)
{
assert(width%A == 0 && width < 0x10000);
__m128i fullSum = Z;
for(size_t row = 0; row < height; ++row)
{
__m128i rowSum = Z;
for(size_t col = 0; col < width; col += A)
{
const __m128i a_ = _mm_loadu_si128((__m128i*)(a + col));
const __m128i b_ = _mm_loadu_si128((__m128i*)(b + col));
rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_));
}
fullSum = _mm_add_epi64(fullSum, HorizontalSum32(rowSum));
a += aStride;
b += bStride;
}
*sum = ExtractSum64(fullSum);
}
This example is slightly simplified (it doesn't work if the image width isn't a multiple of 16).
The full version of the algorithm is here.
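If the width is not a multiple of 16, one simple workaround (an untested sketch, not the code from the full version) is to vectorize the largest multiple of 16 columns and finish each row with scalar code:
void SumOfSquaredDifferenceAnyWidth(
    const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    assert(width < 0x10000); // same overflow bound on the 32-bit row sums as above
    const size_t alignedWidth = width & ~(A - 1); // largest multiple of 16
    __m128i fullSum = Z;
    uint64_t tailSum = 0;
    for(size_t row = 0; row < height; ++row)
    {
        __m128i rowSum = Z;
        for(size_t col = 0; col < alignedWidth; col += A)
        {
            const __m128i a_ = _mm_loadu_si128((__m128i*)(a + col));
            const __m128i b_ = _mm_loadu_si128((__m128i*)(b + col));
            rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_));
        }
        fullSum = _mm_add_epi64(fullSum, HorizontalSum32(rowSum));
        for(size_t col = alignedWidth; col < width; ++col) // scalar tail
        {
            int d = a[col] - b[col];
            tailSum += uint64_t(d*d);
        }
        a += aStride;
        b += bStride;
    }
    *sum = ExtractSum64(fullSum) + tailSum;
}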
And some magic from the SSSE3 version:
const __m128i K_1FF = _mm_set1_epi16(0x1FF);
inline __m128i SquaredDifference(__m128i a, __m128i b)
{
const __m128i lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(a, b), K_1FF);
const __m128i hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(a, b), K_1FF);
return _mm_add_epi32(_mm_madd_epi16(lo, lo), _mm_madd_epi16(hi, hi));
}
The explanation of the magic (see _mm_maddubs_epi16):
K_1FF -> {-1, 1, -1, 1, ...};
_mm_unpacklo_epi8(a, b) -> {a0, b0, a1, b1, ...};
_mm_maddubs_epi16(_mm_unpacklo_epi8(a, b), K_1FF) -> {b0 - a0, b1 - a1, ...};
GCC has switches that encourage it to vectorize the code. For example, the -mfma switch gives me about a 25% speed increase on simple loops like this, using doubles. I imagine it's even better with 8-bit integers. I prefer that over hand-written optimizations because your code stays readable.
That said, there are a few old tricks that can speed up your loop:
Don't index; increment your pointer in every loop iteration. You do this in the outer loop, and you should do the same in the inner loop. You can create a new pointer before entering the inner loop, so the += stride stays valid.
Don't assign through the sum pointer inside your loop; use a local variable to accumulate and copy it to the output when done. You use rowSum, but only in the inner loop. Use that variable across both loops instead, as in the sketch below.
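A minimal sketch of the scalar function with both suggestions applied (the local accumulator is widened to uint64_t so it can safely span both loops; assuming the same signature as above):
void SumOfSquaredDifference(
    const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    uint64_t total = 0; // local accumulator, written to *sum only once
    for(size_t row = 0; row < height; ++row)
    {
        const uint8_t * pa = a; // fresh pointers for the inner loop
        const uint8_t * pb = b;
        for(size_t col = 0; col < width; ++col)
        {
            int d = *pa++ - *pb++;
            total += uint64_t(d*d);
        }
        a += aStride;
        b += bStride;
    }
    *sum = total;
}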
Related
I am currently learning how to work with SIMD intrinsics. I know that an AVX 256-bit vector can contain four doubles, eight floats, or eight 32-bit integers. How do we use AVX to process arrays whose lengths aren't a multiple of these numbers?
For example, how would you add two std::vectors of 53 integers each? Would we slice off as many elements as fit in the SIMD vector and just process the remainder manually? Is there a better way to do this?
Would we slice off as many elements as fit in the SIMD vector and just process the remainder manually? Is there a better way to do this?
Pretty much this. Here is a basic example that processes all numbers in batches of 8 and uses maskload/maskstore to handle the remainder.
void add(int* const r, const int* const a, const int* const b, const unsigned count) {
// how many blocks of 8, and how many left over
const unsigned c8 = count & ~0x7U;
const unsigned cr = count & 0x7U;
// process blocks of 8
for(unsigned i = 0; i < c8; i += 8) {
__m256i _a = _mm256_loadu_si256((__m256i*)(a + i));
__m256i _b = _mm256_loadu_si256((__m256i*)(b + i));
__m256i _c = _mm256_add_epi32(_a, _b);
_mm256_storeu_si256((__m256i*)(r + i), _c);
}
const __m128i temp[5] = {
_mm_setr_epi32(0, 0, 0, 0),
_mm_setr_epi32(-1, 0, 0, 0),
_mm_setr_epi32(-1, -1, 0, 0),
_mm_setr_epi32(-1, -1, -1, 0),
_mm_setr_epi32(-1, -1, -1, -1)
};
// I'm using mask load / mask store for the remainder here.
// (this is not the only approach)
__m256i mask;
if(cr >= 4) {
mask = _mm256_set_m128i(temp[cr&3], temp[4]);
} else {
mask = _mm256_set_m128i(temp[0], temp[cr]);
}
__m256i _a = _mm256_maskload_epi32((a + c8), mask);
__m256i _b = _mm256_maskload_epi32((b + c8), mask);
__m256i _c = _mm256_add_epi32(_a, _b);
_mm256_maskstore_epi32((r + c8), mask, _c);
}
Of course, if you happen to use your own containers (or provide your own allocators), then you can avoid most of this mess by simply ensuring all container allocations occur in multiples of 256 bits.
// yes, this class is missing a lot...
class MyIntArray {
public:
MyIntArray(unsigned count, const int* data) {
// bump capacity to next multiple of 8
unsigned cap = count & 7;
if(cap) cap = 8 - cap;
capacity = cap + count;
// note: new[] does not guarantee 32-byte alignment, so add() below uses unaligned loads
alloc = new int[capacity];
size = count;
memcpy(alloc, data, sizeof(int) * size);
}
MyIntArray(unsigned count) {
// bump capacity to next multiple of 8
unsigned cap = count & 7;
if(cap) cap = 8 - cap;
capacity = cap + count;
// note: new[] does not guarantee 32-byte alignment, so add() below uses unaligned loads
alloc = new int[capacity];
size = count;
}
unsigned capacity;
unsigned size;
int* alloc;
int* begin() { return alloc; }
int* end() { return alloc + size; }
const int* begin() const { return alloc; }
const int* end() const { return alloc + size; }
};
void add(MyIntArray r, const MyIntArray a, const MyIntArray b) {
// process blocks of 8.
// we may be stamping beyond the end of the array, but not beyond the
// end of the capacity allocation....
// (probably also want to check that the sizes match!).
for(unsigned i = 0; i < r.size; i += 8) {
__m256i _a = _mm256_loadu_si256((__m256i*)(a.alloc + i));
__m256i _b = _mm256_loadu_si256((__m256i*)(b.alloc + i));
__m256i _c = _mm256_add_epi32(_a, _b);
_mm256_storeu_si256((__m256i*)(r.alloc + i), _c);
}
}
How to implement convolution algorithm with SSE?
My goal is to implement exactly that algorithm (a discrete convolution) using only the CPU, with SSE:
My arrays' sizes are a multiple of 4 and they are aligned:
const int INPUT_SIGNAL_ARRAY_SIZE = 256896;
const int IMPULSE_RESPONSE_ARRAY_SIZE = 318264;
const int OUTPUT_SIGNAL_ARRAY_SIZE = INPUT_SIGNAL_ARRAY_SIZE + IMPULSE_RESPONSE_ARRAY_SIZE;
__declspec(align(16)) float inputSignal_dArray[INPUT_SIGNAL_ARRAY_SIZE];
__declspec(align(16)) float impulseResponse_dArray[IMPULSE_RESPONSE_ARRAY_SIZE];
__declspec(align(16)) float outputSignal_dArray[OUTPUT_SIGNAL_ARRAY_SIZE];
I have written the CPU version and it works correctly:
//#pragma optimize( "", off )
void computeConvolutionOutputCPU(float* inputSignal, float* impulseResponse, float* outputSignal) {
float* pInputSignal = inputSignal;
float* pImpulseResponse = impulseResponse;
float* pOutputSignal = outputSignal;
#pragma loop(no_vector)
for (int i = 0; i < OUTPUT_SIGNAL_ARRAY_SIZE; i++)
{
*(pOutputSignal + i) = 0;
#pragma loop(no_vector)
for (int j = 0; j < IMPULSE_RESPONSE_ARRAY_SIZE; j++)
{
if (i - j >= 0 && i - j < INPUT_SIGNAL_ARRAY_SIZE)
{
*(pOutputSignal + i) = *(pOutputSignal + i) + *(pImpulseResponse + j) * (*(pInputSignal + i - j));
}
}
}
}
//#pragma optimize( "", on )
On the other hand, I need a version that uses SSE. I tried the following code:
void computeConvolutionOutputSSE(float* inputSignal, float* impulseResponse, float* outputSignal) {
__m128* pInputSignal = (__m128*) inputSignal;
__m128* pImpulseResponse = (__m128*) impulseResponse;
__m128* pOutputSignal = (__m128*) outputSignal;
int nOuterLoop = OUTPUT_SIGNAL_ARRAY_SIZE / 4;
int nInnerLoop = IMPULSE_RESPONSE_ARRAY_SIZE / 4;
int quarterOfInputSignal = INPUT_SIGNAL_ARRAY_SIZE / 4;
__m128 m0 = _mm_set_ps1(0);
for (int i = 0; i < nOuterLoop; i++)
{
*(pOutputSignal + i) = m0;
for (int j = 0; j < nInnerLoop; j++)
{
if ((i - j) >= 0 && (i - j) < quarterOfInputSignal)
{
*(pOutputSignal + i) = _mm_add_ps(
*(pOutputSignal + i),
_mm_mul_ps(*(pImpulseResponse + j), *(pInputSignal + i - j))
);
}
}
}
}
The function above does not work correctly and does not produce the same values as the CPU version.
The problem was pointed out on Stack Overflow with the following comments:
*(pInputSignal + i - j) is incorrect in the case of SSE, because it's not an i-j offset away from the current value, it's (i-j) * 4. The thing is, as I remember it, the idea of using a pointer that way is incorrect unless intrinsics have changed since then - in my time one had to "load" values into an instance of __m128 in this case, as H(J) and X(I-J) are at unaligned locations (and the sequence breaks).
and
Since you care about individual floats and their order, probably best
to use const float*, with _mm_loadu_ps instead of just dereferencing
(which is like _mm_load_ps). That way you can easily do unaligned
loads that get the floats you want into the vector element positions
you want, and the pointer math works the same as for scalar. You just
have to take into account that load(ptr) actually gets you a vector of
elements from ptr+0..3.
But I can't use this information, because I have no idea how to properly access the array with SSE in this case.
You need a 128-bit vector of float32 values, not an MSVC float.
See _mm_broadcast_ss.
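For illustration, here is one possible way to apply that advice (an untested sketch; the function name is just for illustration): keep plain float pointers, broadcast a single impulse-response coefficient with _mm_set1_ps, and load four consecutive input samples with _mm_loadu_ps, so four outputs are accumulated per iteration. Lanes that would read outside the input fall back to scalar code.
#include <xmmintrin.h> // SSE

void computeConvolutionOutputSSE2(float* inputSignal, float* impulseResponse, float* outputSignal) {
    // OUTPUT_SIGNAL_ARRAY_SIZE is a multiple of 4, so the final store stays in bounds
    for (int i = 0; i < OUTPUT_SIGNAL_ARRAY_SIZE; i += 4)
    {
        __m128 acc = _mm_setzero_ps(); // outputs i, i+1, i+2, i+3
        for (int j = 0; j < IMPULSE_RESPONSE_ARRAY_SIZE; j++)
        {
            int k = i - j; // index of the first of the four input samples
            if (k >= 0 && k + 3 < INPUT_SIGNAL_ARRAY_SIZE)
            {
                __m128 h = _mm_set1_ps(impulseResponse[j]); // broadcast h[j]
                __m128 x = _mm_loadu_ps(inputSignal + k);   // x[k..k+3]
                acc = _mm_add_ps(acc, _mm_mul_ps(h, x));
            }
            else
            {
                // partially (or fully) out of range: update only the valid lanes in scalar code
                float tmp[4];
                _mm_storeu_ps(tmp, acc);
                for (int lane = 0; lane < 4; lane++)
                {
                    int idx = k + lane;
                    if (idx >= 0 && idx < INPUT_SIGNAL_ARRAY_SIZE)
                        tmp[lane] += impulseResponse[j] * inputSignal[idx];
                }
                acc = _mm_loadu_ps(tmp);
            }
        }
        _mm_storeu_ps(outputSignal + i, acc);
    }
}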
I have a Paeth Prediction function which operates on arrays:
std::array<std::uint8_t,4> birunji::paeth_prediction
(const std::array<std::uint8_t,4>& a,
const std::array<std::uint8_t,4>& b,
const std::array<std::uint8_t,4>& c)
{
std::array<std::int16_t,4> pa;
std::array<std::int16_t,4> pb;
std::array<std::int16_t,4> pc;
std::array<std::uint8_t,4> results;
for(std::size_t i = 0; i < 4; ++i)
{
pa[i] = b[i] - c[i];
pb[i] = a[i] - c[i];
pc[i] = pa[i] + pb[i];
pa[i] = std::abs(pa[i]);
pb[i] = std::abs(pb[i]);
pc[i] = std::abs(pc[i]);
if(pa[i] <= pb[i] && pa[i] <= pc[i])
results[i] = a[i];
else if(pb[i] <= pc[i])
results[i] = b[i];
else
results[i] = c[i];
}
return results;
}
I'm attempting to use intrinsics manually to vectorise the code (for learning purposes).
__m128i birunji::paeth_prediction(const __m128i& a,
const __m128i& b,
const __m128i& c)
{
__m128i pa = _mm_sub_epi16(b, c);
__m128i pb = _mm_sub_epi16(a, c);
__m128i pc = _mm_add_epi16(pa, pb);
pa = _mm_abs_epi16(pa);
pb = _mm_abs_epi16(pb);
pc = _mm_abs_epi16(pc);
__m128i pa_le_pb = _mm_cmpgt_epi16(pb, pa);
__m128i pa_le_pc = _mm_cmpgt_epi16(pc, pa);
__m128i pb_le_pc = _mm_cmpgt_epi16(pc, pb);
return
_mm_and_si128(_mm_and_si128(pa_le_pb, pa_le_pc),
_mm_and_si128(_mm_and_si128(pb_le_pc,b),a));
}
The trouble I'm having is the conditional statements. How do I successfully vectorize these? I'm not sure if my attempt above is correct.
_mm_cmpgt_epi16 can be used for the comparisons. Note that _mm_cmpgt_epi16(a, b) = !(a <= b); however, _mm_cmpgt_epi16(b, a) != (a <= b), because it is not a greater-or-equal comparison but a strict greater-than comparison. So the masks come out inverted, but that's equally useful in this case; an explicit inversion won't be necessary.
This function should not return a condition itself, it should select from a and b and c according to the conditions. If SSE4.1 is available, _mm_blendv_epi8 can be used to implement that selection. For example (not tested):
__m128i paeth_prediction(__m128i a, __m128i b, __m128i c)
{
__m128i pa = _mm_sub_epi16(b, c);
__m128i pb = _mm_sub_epi16(a, c);
__m128i pc = _mm_add_epi16(pa, pb);
pa = _mm_abs_epi16(pa);
pb = _mm_abs_epi16(pb);
pc = _mm_abs_epi16(pc);
__m128i not_pa_le_pb = _mm_cmpgt_epi16(pa, pb);
__m128i not_pa_le_pc = _mm_cmpgt_epi16(pa, pc);
__m128i not_pb_le_pc = _mm_cmpgt_epi16(pb, pc);
__m128i not_take_a = _mm_or_si128(not_pa_le_pb, not_pa_le_pc);
__m128i t = _mm_blendv_epi8(b, c, not_pb_le_pc);
return _mm_blendv_epi8(a, t, not_take_a);
}
The last two lines implement logic like:
if PB is not less-than-or-equal-to PC, take C, otherwise take B.
if PA is not less-than-or-equal-to PB or PA is not less-than-or-equal-to PC, take the result from the previous step, otherwise take A.
Without SSE4.1, the blends could be implemented using AND/ANDNOT/OR.
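For reference, a minimal sketch of such an SSE2-only blend; it assumes the mask bytes are all-zeros or all-ones, which is what the compare intrinsics above produce:
// selects bytes from b where the mask is set, from a otherwise (SSE2 only)
static inline __m128i blendv_epi8_sse2(__m128i a, __m128i b, __m128i mask)
{
    return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}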
I've changed the signature of the function so it takes the vectors by value; taking them by const reference is unnecessary (vectors are trivial to copy) and can add overhead from an indirection, though such overhead is likely to be removed if the function ends up being inlined by the compiler.
As a variant, _mm_min_epi16 could be used to implement part of the logic:
__m128i paeth_prediction(__m128i a, __m128i b, __m128i c)
{
__m128i pa = _mm_sub_epi16(b, c);
__m128i pb = _mm_sub_epi16(a, c);
__m128i pc = _mm_add_epi16(pa, pb);
pa = _mm_abs_epi16(pa);
pb = _mm_abs_epi16(pb);
pc = _mm_abs_epi16(pc);
__m128i not_pb_le_pc = _mm_cmpgt_epi16(pb, pc);
__m128i take_a = _mm_cmpeq_epi16(pa, _mm_min_epi16(pa, _mm_min_epi16(pb, pc)));
__m128i t = _mm_blendv_epi8(b, c, not_pb_le_pc);
return _mm_blendv_epi8(t, a, take_a);
}
Because the condition pa <= pb && pa <= pc is equivalent to pa == min(pa, pb, pc).
The resulting assembly code looks a bit better, but I did not test it in any way, including performance.
You can simplify your calculations by completely avoiding any conversion to int16_t.
First of all, note that pa<=pc and pb<=pc both hold if and only if c <= min(a,b) or c >= max(a,b). If c is smaller than or equal to both, max(a,b) will be returned; if c is larger than or equal to both, min(a,b) is returned.
So we can first "sort" a, b using a min and max operation,
A = min(a,b)
B = max(a,b)
which leaves three possible cases:
A<=B<=c --> A
c<=A<=B --> B
A< c< B --> c
This means in C++ code
std::array<std::uint8_t,4> birunji::paeth_prediction
(const std::array<std::uint8_t,4>& a,
const std::array<std::uint8_t,4>& b,
const std::array<std::uint8_t,4>& c)
{
std::array<std::uint8_t,4> results;
for(std::size_t i = 0; i < 4; ++i)
{
uint8_t A = std::min(a[i],b[i]);
uint8_t B = std::max(a[i],b[i]);
if (B<=c[i]) results[i] = A;
else if(c[i]<=A) results[i] = B;
else results[i] = c[i];
}
return results;
}
Unfortunately, there is no unsigned SIMD comparison (before AVX-512), but we can simulate it using (x<=y) == (max(x,y)==y) (or by doing a saturating subtraction and comparing with zero).
Possible (untested) SIMD implementation (this would also work for arbitrarily many elements -- but you can just load four elements into the lowest 32 bits and ignore the rest of the result):
__m128i paeth_prediction(__m128i a, __m128i b, __m128i c)
{
__m128i A = _mm_min_epu8(a, b);
__m128i B = _mm_max_epu8(a, b);
__m128i A_greater_equal_c = _mm_cmpeq_epi8(_mm_max_epu8(A, c), A);
__m128i B_less_equal_c = _mm_cmpeq_epi8(_mm_min_epu8(B, c), B);
// if you don't have SSE 4.1, this can be done using bitwise and/or operations:
__m128i t = _mm_blendv_epi8(c, B, A_greater_equal_c);
return _mm_blendv_epi8(t, A, B_less_equal_c);
}
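For completeness, the saturating-subtraction formulation of the unsigned comparison mentioned above could look like this (an untested sketch): _mm_subs_epu8(x, y) is zero exactly when x <= y.
// x <= y for unsigned bytes, via saturating subtraction
static inline __m128i cmple_epu8(__m128i x, __m128i y)
{
    return _mm_cmpeq_epi8(_mm_subs_epu8(x, y), _mm_setzero_si128());
}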
I was trying to optimize the following code (sum of squared differences for two arrays):
inline float Square(float value)
{
return value*value;
}
float SquaredDifferenceSum(const float * a, const float * b, size_t size)
{
float sum = 0;
for(size_t i = 0; i < size; ++i)
sum += Square(a[i] - b[i]);
return sum;
}
So I optimized it using the CPU's SSE instructions:
inline void SquaredDifferenceSum(const float * a, const float * b, size_t i, __m128 & sum)
{
__m128 _a = _mm_loadu_ps(a + i);
__m128 _b = _mm_loadu_ps(b + i);
__m128 _d = _mm_sub_ps(_a, _b);
sum = _mm_add_ps(sum, _mm_mul_ps(_d, _d));
}
inline float ExtractSum(__m128 a)
{
float _a[4];
_mm_storeu_ps(_a, a);
return _a[0] + _a[1] + _a[2] + _a[3];
}
float SquaredDifferenceSum(const float * a, const float * b, size_t size)
{
size_t i = 0, alignedSize = size/4*4;
__m128 sums = _mm_setzero_ps();
for(; i < alignedSize; i += 4)
SquaredDifferenceSum(a, b, i, sums);
float sum = ExtractSum(sums);
for(; i < size; ++i)
sum += Square(a[i] - b[i]);
return sum;
}
This code works fine if the size of the arrays is not too large.
But if the size is big enough, there is a large discrepancy between the results given by the base function and its optimized version.
So my question is: where is the bug in the SSE-optimized code that leads to this computing error?
The error follows from the finite precision of floating-point numbers.
Each addition of two floating-point numbers has a rounding error proportional to the magnitude of the result.
In your scalar version of the algorithm the resulting sum becomes much greater than each added term (if the arrays are big enough, of course).
So it leads to the accumulation of a big computing error.
In the SSE version of the algorithm there are actually four partial sums accumulating the result, and each of them is about four times smaller relative to the terms than the single scalar sum.
So this leads to a smaller computing error.
There are two ways to reduce this error:
1) Use double-precision floating-point numbers for the accumulating sum (see the sketch after this list).
2) Use the Kahan summation algorithm (also known as compensated summation), which significantly reduces the numerical error in the total obtained by adding a sequence of finite-precision floating-point numbers, compared to the obvious approach.
https://en.wikipedia.org/wiki/Kahan_summation_algorithm
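A minimal sketch of option 1 (untested): the squared differences are still computed in single precision, but the four lanes are widened and accumulated into two double-precision __m128d sums.
inline void SquaredDifferenceSumD(const float * a, const float * b, size_t i,
    __m128d & sumLo, __m128d & sumHi)
{
    __m128 _a = _mm_loadu_ps(a + i);
    __m128 _b = _mm_loadu_ps(b + i);
    __m128 _d = _mm_sub_ps(_a, _b);
    __m128 sq = _mm_mul_ps(_d, _d);
    sumLo = _mm_add_pd(sumLo, _mm_cvtps_pd(sq));                    // lanes 0 and 1
    sumHi = _mm_add_pd(sumHi, _mm_cvtps_pd(_mm_movehl_ps(sq, sq))); // lanes 2 and 3
}
inline double ExtractSum(__m128d a)
{
    double _a[2];
    _mm_storeu_pd(_a, a);
    return _a[0] + _a[1];
}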
With the Kahan summation algorithm, your scalar code will look like:
inline void KahanSum(float value, float & sum, float & correction)
{
float term = value - correction;
float temp = sum + term;
correction = (temp - sum) - term;
sum = temp;
}
float SquaredDifferenceKahanSum(const float * a, const float * b, size_t size)
{
float sum = 0, correction = 0;
for(size_t i = 0; i < size; ++i)
KahanSum(Square(a[i] - b[i]), sum, correction);
return sum;
}
And the SSE-optimized code will look as follows:
inline void SquaredDifferenceKahanSum(const float * a, const float * b, size_t i,
__m128 & sum, __m128 & correction)
{
__m128 _a = _mm_loadu_ps(a + i);
__m128 _b = _mm_loadu_ps(b + i);
__m128 _d = _mm_sub_ps(_a, _b);
__m128 term = _mm_sub_ps(_mm_mul_ps(_d, _d), correction);
__m128 temp = _mm_add_ps(sum, term);
correction = _mm_sub_ps(_mm_sub_ps(temp, sum), term);
sum = temp;
}
float SquaredDifferenceKahanSum(const float * a, const float * b, size_t size)
{
size_t i = 0, alignedSize = size/4*4;
__m128 sums = _mm_setzero_ps(), corrections = _mm_setzero_ps();
for(; i < alignedSize; i += 4)
SquaredDifferenceKahanSum(a, b, i, sums, corrections);
float sum = ExtractSum(sums), correction = 0;
for(; i < size; ++i)
KahanSum(Square(a[i] - b[i]), sum, correction);
return sum;
}
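A small usage sketch (the array size and values are arbitrary, and it assumes one definition of each function above is compiled, since the scalar and SSE variants share names). With this many identical terms the naive float sum drifts noticeably, while the Kahan sum stays close to a double-precision reference:
#include <cstdio>
#include <vector>

int main()
{
    const size_t size = 1 << 24;
    std::vector<float> a(size, 1.001f), b(size, 1.000f);
    double reference = 0;
    for (size_t i = 0; i < size; ++i)
        reference += Square(a[i] - b[i]);
    printf("naive : %.3f\n", SquaredDifferenceSum(a.data(), b.data(), size));
    printf("Kahan : %.3f\n", SquaredDifferenceKahanSum(a.data(), b.data(), size));
    printf("double: %.3f\n", reference);
    return 0;
}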
I'm new to SSE and limited in knowledge. I'm trying to vectorize my code (C++, using gcc), which is actually quite simple.
I have an array of unsigned ints, and I only check for elements that are >= or <= some constant. As a result, I need an array with the elements that passed the condition.
I'm thinking of using '_mm_cmpge_ps' as a mask, but this construct works on floats, not ints!? :(
Any suggestion or help is very much appreciated.
It's pretty easy to just mask out (i.e. set to 0) all non-matching ints. e.g.
#include <emmintrin.h> // SSE2 intrinsics
for (int i = 0; i < N; i += 4)
{
__m128i v = _mm_load_si128((const __m128i*)&a[i]);
__m128i vcmp0 = _mm_cmpgt_epi32(v, _mm_set1_epi32(MIN_VAL - 1));
__m128i vcmp1 = _mm_cmplt_epi32(v, _mm_set1_epi32(MAX_VAL + 1));
__m128i vcmp = _mm_and_si128(vcmp0, vcmp1);
v = _mm_and_si128(v, vcmp);
_mm_store_si128((__m128i*)&a[i], v);
}
Note that a needs to be 16-byte aligned and N needs to be a multiple of 4 - if these constraints are a problem then it's not too hard to extend the code to cope with this, as sketched below.
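For example, here is an untested sketch of the same loop without those constraints: unaligned loads/stores for the full vectors and a scalar loop for the last few elements (the function and parameter names are just for illustration).
#include <emmintrin.h> // SSE2 intrinsics

void zero_out_of_range(int* a, int N, int MIN_VAL, int MAX_VAL)
{
    // note: MIN_VAL - 1 and MAX_VAL + 1 must not overflow, as in the code above
    const __m128i vlo = _mm_set1_epi32(MIN_VAL - 1);
    const __m128i vhi = _mm_set1_epi32(MAX_VAL + 1);
    int i = 0;
    for (; i + 4 <= N; i += 4)
    {
        __m128i v = _mm_loadu_si128((const __m128i*)&a[i]);
        __m128i keep = _mm_and_si128(_mm_cmpgt_epi32(v, vlo),
                                     _mm_cmplt_epi32(v, vhi));
        _mm_storeu_si128((__m128i*)&a[i], _mm_and_si128(v, keep));
    }
    for (; i < N; ++i) // scalar tail for the last 0..3 elements
        if (a[i] < MIN_VAL || a[i] > MAX_VAL)
            a[i] = 0;
}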
Here you go - three functions.
The first function, foo_v1, is based on Paul R's answer.
The second function, foo_v2, is based on a popular question from today, Fastest way to determine if an integer is between two integers (inclusive) with known sets of values.
The third function, foo_v3, uses Agner Fog's vectorclass, which I added only to show how much easier and cleaner it is to use his class. If you don't have the class, just comment out the #include "vectorclass.h" line and the foo_v3 function. I used Vec8ui, which means it will use AVX2 if available and break the operation into two Vec4ui otherwise, so you don't have to change your code to get the benefit of AVX2.
#include <stdio.h>
#include <nmmintrin.h> // SSE4.2
#include "vectorclass.h"
void foo_v1(const int N, int *a, const int MAX_VAL, const int MIN_VAL) {
for (int i = 0; i < N; i += 4) {
__m128i v = _mm_load_si128((const __m128i*)&a[i]);
__m128i vcmp0 = _mm_cmpgt_epi32(v, _mm_set1_epi32(MIN_VAL - 1));
__m128i vcmp1 = _mm_cmplt_epi32(v, _mm_set1_epi32(MAX_VAL + 1));
__m128i vcmp = _mm_and_si128(vcmp0, vcmp1);
v = _mm_and_si128(v, vcmp);
_mm_store_si128((__m128i*)&a[i], v);
}
}
void foo_v2(const int N, int *a, const int MAX_VAL, const int MIN_VAL) {
//if ((unsigned)(number-lower) <= (upper-lower))
for (int i = 0; i < N; i += 4) {
__m128i v = _mm_load_si128((const __m128i*)&a[i]);
__m128i dv = _mm_sub_epi32(v, _mm_set1_epi32(MIN_VAL));
__m128i min_ab = _mm_min_epu32(dv,_mm_set1_epi32(MAX_VAL-MIN_VAL));
__m128i vcmp = _mm_cmpeq_epi32(dv,min_ab);
v = _mm_and_si128(v, vcmp);
_mm_store_si128((__m128i*)&a[i], v);
}
}
void foo_v3(const int N, int *a, const int MAX_VAL, const int MIN_VAL) {
//if ((unsigned)(number-lower) <= (upper-lower))
for (int i = 0; i < N; i += 8) {
Vec8ui va = Vec8ui().load(&a[i]);
va &= (va - MIN_VAL) <= (MAX_VAL-MIN_VAL);
va.store(&a[i]);
}
}
int main() {
const int N = 16;
int* a = (int*)_mm_malloc(sizeof(int)*N, 16);
for(int i=0; i<N; i++) {
a[i] = i;
}
foo_v2(N, a, 7, 3);
for(int i=0; i<N; i++) {
printf("%d ", a[i]);
} printf("\n");
_mm_free(a);
}
The first place to look might be the Intel® Intrinsics Guide.