Vectorization of modulo multiplication - C++

I have a function:
void Func(const int * a, const int * b, size_t size, int p, int * c)
{
    for (size_t i = 0; i < size; ++i)
        c[i] = (a[i] * b[i]) % p;
}
This function performs many modulo multiplications on arrays of integers.
All integers are positive.
And I need to improve its performance.
I thought about SSE and AVX, but they don't have an instruction for vectorized modulo multiplication.
Or am I wrong?
Does anybody know a way to solve this problem?

First I want to note that the modulo operation can be expressed using floating-point numbers:
d % p = d - int(float(d)/float(p))*p.
Although the right-hand side contains more operations than the left, it is preferable because it can be vectorized with SSE/AVX.
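As a scalar sketch (my own illustration, not part of the vectorized code below), the identity looks like this; note that float has only a 24-bit mantissa, so the trick is only exact while the product d stays small enough for the division to be computed accurately:
// Scalar sketch of the float-based remainder; assumes d = a*b fits the float precision budget.
inline int ModViaFloat(int d, int p)
{
    int q = int(float(d) / float(p)); // truncation toward zero, like C integer conversion
    return d - q * p;
}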
Here is an implementation with SSE4.1 (needed for the 32x32 => 32-bit integer multiplication _mm_mullo_epi32). Note that the conversion from float back to integer is done with round-to-nearest; use truncation toward zero (_mm_cvttps_epi32) if you want semantics like C's float-to-integer conversion.
void Func(const int * a, const int * b, size_t size, int p, int * c)
{
    __m128 _k = _mm_set1_ps(1.0f / p);
    __m128i _p = _mm_set1_epi32(p);
    for (size_t i = 0; i < size; i += 4)
    {
        __m128i _a = _mm_loadu_si128((__m128i*)(a + i));
        __m128i _b = _mm_loadu_si128((__m128i*)(b + i));
        __m128i _d = _mm_mullo_epi32(_a, _b);
        __m128i _e = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_d), _k)); // e = int(float(d)/float(p));
        __m128i _c = _mm_sub_epi32(_d, _mm_mullo_epi32(_e, _p));
        _mm_storeu_si128((__m128i*)(c + i), _c);
    }
}
An implementation using AVX2 (the 256-bit integer instructions require AVX2):
void Func(const int * a, const int * b, size_t size, int p, int * c)
{
    __m256 _k = _mm256_set1_ps(1.0f / p);
    __m256i _p = _mm256_set1_epi32(p);
    for (size_t i = 0; i < size; i += 8)
    {
        __m256i _a = _mm256_loadu_si256((__m256i*)(a + i));
        __m256i _b = _mm256_loadu_si256((__m256i*)(b + i));
        __m256i _d = _mm256_mullo_epi32(_a, _b);
        __m256i _e = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(_d), _k)); // e = int(float(d)/float(p));
        __m256i _c = _mm256_sub_epi32(_d, _mm256_mullo_epi32(_e, _p));
        _mm256_storeu_si256((__m256i*)(c + i), _c);
    }
}
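Both loops assume that size is a multiple of the vector width (4 or 8 elements). If that is not guaranteed, a plain scalar tail loop (a sketch of my own, not part of the original answer) can finish the remainder:
// Sketch: handle the last size % 8 elements after the AVX2 loop
// (use size / 4 * 4 as the starting point for the SSE version).
size_t i = size / 8 * 8;
for (; i < size; ++i)
    c[i] = (a[i] * b[i]) % p;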

Actually there is an intrinsic that performs this operation (note that it is an SVML function, so it compiles to a library call rather than a single instruction):
_mm256_irem_epi32
https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_irem_epi32
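A minimal usage sketch of my own, assuming the element-wise signed 32-bit remainder signature shown in the linked guide, a compiler that ships SVML (e.g. the Intel compiler), and size being a multiple of 8:
// Sketch only; requires SVML support.
void FuncSvml(const int * a, const int * b, size_t size, int p, int * c)
{
    __m256i _p = _mm256_set1_epi32(p);
    for (size_t i = 0; i < size; i += 8)
    {
        __m256i _d = _mm256_mullo_epi32(_mm256_loadu_si256((__m256i*)(a + i)),
                                        _mm256_loadu_si256((__m256i*)(b + i)));
        _mm256_storeu_si256((__m256i*)(c + i), _mm256_irem_epi32(_d, _p));
    }
}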

Related

Fast pyrDown image with AVX instructions

I have two pyrDown implementations, one using SSE2 and one using AVX instructions. They differ, and the AVX implementation produces a wrong image result. The AVX implementation is also slower than the SSE2 one, which is strange. What is wrong with the AVX implementation, and how can it be made faster?
// SSE2 implementation
static __inline __m128i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
    __m128i v0 = _mm_load_si128((const __m128i *)src);
    __m128i v1 = _mm_load_si128((const __m128i *)&src[srcStep]);
    return _mm_avg_epu8(v0, v1);
}
// SSSE3 version
// I used `__restrict__` to give the compiler more flexibility in unrolling
void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t* __restrict__ dst,
                  size_t srcStep,
                  size_t size)
{
    const __m128i vk1 = _mm_set1_epi8(1);
    const __m128i add2 = _mm_set1_epi16(2);
    size_t dstsize = size/2;
    for (size_t i = 0; i < dstsize - 15; i += 16)
    {
        const size_t ii = i*2;
        // based on https://stackoverflow.com/a/45564565/820795
        __m128i left  = average2RowsSingle(src+ii, srcStep);
        __m128i right = average2RowsSingle(src+ii+16, srcStep);
        __m128i w0 = _mm_maddubs_epi16(left, vk1);  // unpack and horizontal add
        __m128i w1 = _mm_maddubs_epi16(right, vk1);
        w0 = _mm_srli_epi16(w0, 1);                 // divide by 2
        w1 = _mm_srli_epi16(w1, 1);
        w0 = _mm_packus_epi16(w0, w1);              // pack
        _mm_storeu_si128((__m128i *)&dst[i], w0);
    }
}
// AVX implementation
static __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
    auto v0 = _mm256_load_si256((const __m256i*)src);
    auto v1 = _mm256_load_si256((const __m256i*)&src[srcStep]);
    return _mm256_avg_epu8(v0, v1);
}
void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t* __restrict__ dst,
                  size_t srcStep,
                  size_t size) {
    const __m128i vk1 = _mm_set1_epi8(1);
    size_t dstsize = size/2;
    const signed char o = -1; // make shuffle zero
    const __m256i vec_r_i16 = _mm256_set_epi8(o,30, o,28, o,26, o,24, o,22, o,20, o,18, o,16,
                                              o,14, o,12, o,10, o, 8, o, 6, o, 4, o, 2, o, 0);
    const __m256i vec_l_i16 = _mm256_set_epi8(o,31, o,29, o,27, o,25, o,23, o,21, o,19, o,17,
                                              o,15, o,13, o,11, o, 9, o, 7, o, 5, o, 3, o, 1);
    for (size_t i = 0; i < dstsize - 31; i += 32)
    {
        const size_t ii = i * 2;
        auto left  = average2RowsSingle(src + ii, srcStep);
        auto right = average2RowsSingle(src + ii + 32, srcStep);
        auto w0 = _mm256_shuffle_epi8(left, vec_r_i16);
        auto w1 = _mm256_shuffle_epi8(left, vec_l_i16);
        left = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);
        w0 = _mm256_shuffle_epi8(right, vec_r_i16);
        w1 = _mm256_shuffle_epi8(right, vec_l_i16);
        right = _mm256_srli_epi16(_mm256_add_epi16(w0, w1), 1);
        left = _mm256_packus_epi16(left, right);
        _mm256_storeu_si256((__m256i *) &dst[i], left);
    }
}
Wrong result of the AVX implementation:
With the help of @chtz I came up with this code:
inline __m256i average2RowsSingle(const uint8_t* __restrict__ src, size_t srcStep) {
    auto v0 = _mm256_loadu_si256((const __m256i *)src);
    auto v1 = _mm256_loadu_si256((const __m256i *)&src[srcStep]);
    return _mm256_avg_epu8(v0, v1);
}
void average2Rows(const uint8_t* __restrict__ src,
                  uint8_t* __restrict__ dst,
                  size_t srcStep,
                  size_t size) {
    const auto vk1 = _mm256_set1_epi8(1);
    const size_t dstSize = size/2;
    for (size_t i = 0; i < dstSize - 31; i += 32)
    {
        const size_t ii = i * 2;
        // based on https://stackoverflow.com/a/45564565/820795
        auto left  = average2RowsSingle(src + ii, srcStep);
        auto right = average2RowsSingle(src + ii + 32, srcStep);
        auto w0 = _mm256_maddubs_epi16(left, vk1);  // unpack and horizontal add
        auto w1 = _mm256_maddubs_epi16(right, vk1);
        w0 = _mm256_srli_epi16(w0, 1);              // divide by 2
        w1 = _mm256_srli_epi16(w1, 1);
        w0 = _mm256_packus_epi16(w0, w1);           // pack (operates within 128-bit lanes)
        w0 = _mm256_permute4x64_epi64(w0, 0xd8);    // shuffle to restore order across lanes
        _mm256_storeu_si256((__m256i *)&dst[i], w0);
    }
}
Result image:

Add and subtract integers of arbitrary size

I am tasked with implementing, from scratch, addition and subtraction of signed integers of arbitrary size. Such an integer is stored in an array of 64-bit unsigned integers; the least significant bit of the array's first element is the least significant bit of the whole integer. An array of size d represents a signed integer of at most 64 * d - 1 bits. The format of the integer must not be changed.
I came up with the following:
// Add huge integers: z[] = x[] + y[]
template<typename ing>
inline void addHint(uint64_t *z, uint64_t *x, uint64_t *y, ing d)
{
    uint64_t carry = 0;
    for (ing i = 0; i < d; ++i)
    {
        uint64_t zi = x[i] + y[i];
        // uint64_t carryNew = zi < x[i] or zi < y[i];
        uint64_t carryNew = zi < x[i]; // zi < y[i] unnecessary.
        z[i] = zi + carry;
        carry = carryNew or z[i] < zi;
    }
}
// Subtract huge integers: x[] = z[] - y[]
template<typename ing>
inline void subHint(uint64_t *x, uint64_t *z, uint64_t *y, ing d)
{
    uint64_t carry = 0;
    for (ing i = 0; i < d; ++i)
    {
        uint64_t xi = z[i] - y[i];
        // uint64_t carryNew = z[i] < y[i];
        uint64_t carryNew = z[i] < xi; // Somehow x86-64 g++ 8.3 -O2 emits fewer assembly lines than the above, according to godbolt.org
        x[i] = xi - carry;
        carry = carryNew or xi < x[i];
    }
}
My team is unsatisfied with the speed. Can it be improved?
Thanks!
mp_limb_t mpn_add_n (mp_limb_t *rp, const mp_limb_t *s1p, const mp_limb_t *s2p, mp_size_t n)
and
mp_limb_t mpn_sub_n (mp_limb_t *rp, const mp_limb_t *s1p, const mp_limb_t *s2p, mp_size_t n)
in GMP are what you are looking for.
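A minimal usage sketch of my own (assuming mp_limb_t is 64 bits wide, as it is on typical 64-bit GMP builds), mapping the low-level mpn functions onto the uint64_t arrays:
#include <gmp.h>
#include <cstdint>

// Sketch: z[] = x[] + y[] and x[] = z[] - y[] via GMP's low-level mpn layer.
// Assumes mp_limb_t is 64 bits, so the uint64_t arrays can be passed directly.
void addViaGmp(uint64_t *z, const uint64_t *x, const uint64_t *y, mp_size_t d)
{
    mpn_add_n(reinterpret_cast<mp_limb_t*>(z),
              reinterpret_cast<const mp_limb_t*>(x),
              reinterpret_cast<const mp_limb_t*>(y), d); // return value is the carry out
}
void subViaGmp(uint64_t *x, const uint64_t *z, const uint64_t *y, mp_size_t d)
{
    mpn_sub_n(reinterpret_cast<mp_limb_t*>(x),
              reinterpret_cast<const mp_limb_t*>(z),
              reinterpret_cast<const mp_limb_t*>(y), d); // return value is the borrow out
}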

Inner product of two unsigned byte vectors using AVX2 in C/C++

I'd like to implement a fast correlation coefficient computation using SSE/AVX2. The operands are two unsigned char vectors. The function should be mostly equivalent to this:
float correlate_simple(const unsigned char* vec1, const unsigned char* vec2, size_t length)
{
    int sum1 = 0;
    int sum2 = 0;
    int sum11 = 0;
    int sum22 = 0;
    int sum12 = 0;
    for (size_t i = length; i > 0; --i, ++vec1, ++vec2) {
        sum1 += *vec1;
        sum2 += *vec2;
        sum11 += *vec1 * *vec1;
        sum22 += *vec2 * *vec2;
        sum12 += *vec1 * *vec2;
    }
    double mean1 = double(sum1) / double(length);
    double mean2 = double(sum2) / double(length);
    double mean11 = double(sum11) / double(length);
    double mean22 = double(sum22) / double(length);
    double mean12 = double(sum12) / double(length);
    double b = (mean11 - mean1 * mean1) * (mean22 - mean2 * mean2);
    if (b <= 0.0)
        return 0.0f;
    double a = (mean12 - mean1 * mean2);
    return float(a / sqrt(b));
}
The parameter length ranges from 1 to less than 1000.
In order to do this, I researched how to implement an inner product of two unsigned byte arrays. However, I could not come up with a solution that does not involve converting all the unsigned 8-bit values to signed 16-bit values.
The intrinsic _mm256_maddubs_epi16(a, b) expects b to hold signed bytes. This would not be a problem here, since subtracting some constant (here: 127) from b does not change the correlation coefficient. Unfortunately, I could not find an intrinsic that would let me subtract 127 from unsigned bytes and produce signed bytes (without relying on some two's complement magic).
// vec: const unsigned char*
auto x = _mm256_load_si256((const __m256i*) vec);
auto v = _mm256_set1_epi8(127);
// wrong if vec[i] is less than 127:
auto x_centered = _mm256_sub_epi8 (x, v);
What would be the best approach here to compute inner products (and finally a correlation coefficient)?
Addendum:
Below is my current implementation of a pure inner product. I decided to convert to 16-bit integers to avoid overflow errors.
Update: Changed from reading 128 bits at a time to 256 bits.
int accumulate_i32(__m256i x)
{
    auto tmp1 = _mm256_srli_si256(x, 8);
    x = _mm256_add_epi32(x, tmp1);
    auto tmp2 = _mm256_extractf128_si256(x, 1);
    tmp2 = _mm_add_epi32(tmp2, _mm256_castsi256_si128(x));
    return _mm_cvtsi128_si32(tmp2) + _mm_extract_epi32(tmp2, 1);
}
int inner_product_avx(const unsigned char* vec1, const unsigned char* vec2, unsigned int length)
{
    constexpr unsigned int memoryAlignmentBytes = 32;
    constexpr unsigned int bytesPerPack = 256 / 8;
    assert((reinterpret_cast<std::uintptr_t>(vec1) % memoryAlignmentBytes) == 0);
    assert((reinterpret_cast<std::uintptr_t>(vec2) % memoryAlignmentBytes) == 0);
    // compute middle part via AVX2
    unsigned int packCount = length / bytesPerPack;
    const __m256i zeros = _mm256_setzero_si256();
    auto sumlo = _mm256_setzero_si256();
    auto sumhi = _mm256_setzero_si256();
    for (unsigned int packIdx = 0; packIdx < packCount; ++packIdx) {
        auto x1 = _mm256_load_si256((const __m256i*)vec1);
        auto x2 = _mm256_load_si256((const __m256i*)vec2);
        auto x1lo = _mm256_unpacklo_epi8(x1, zeros);
        auto x1hi = _mm256_unpackhi_epi8(x1, zeros);
        auto x2lo = _mm256_unpacklo_epi8(x2, zeros);
        auto x2hi = _mm256_unpackhi_epi8(x2, zeros);
        auto tmplo = _mm256_madd_epi16(x1lo, x2lo);
        auto tmphi = _mm256_madd_epi16(x1hi, x2hi);
        sumlo = _mm256_add_epi32(sumlo, tmplo);
        sumhi = _mm256_add_epi32(sumhi, tmphi);
        vec1 += bytesPerPack;
        vec2 += bytesPerPack;
    }
    int sum = accumulate_i32(sumlo) + accumulate_i32(sumhi);
    // compute remaining part that cannot be represented as a
    // whole packed integer
    unsigned int packRestCount = length % bytesPerPack;
    for (size_t i = packRestCount; i > 0; --i, ++vec1, ++vec2)
        sum += int(*vec1) * int(*vec2);
    return sum;
}
This takes roughly 20% of the time of the simple C++ implementation (see below). Considering that the AVX code works on 16 16-bit integers simultaneously, I would have expected a higher gain. Is this reasonable, or did I miss something?
Unrolling the last loop in the AVX code did not decrease computation time.
int inner_product_simple(const unsigned char* vec1, const unsigned char* vec2, size_t length)
{
    int sum = 0;
    for (size_t i = length; i > 0; --i, ++vec1, ++vec2)
        sum += int(*vec1) * int(*vec2);
    return sum;
}
I would start from something like this. It uses 32-bit accumulators, just like your current code. Untested.
namespace
{
    // Compute sum of the 64-bit lanes, convert to double
    inline double hadd_epi64( const __m256i i64 )
    {
        __m128i res = _mm256_castsi256_si128( i64 );
        res = _mm_add_epi64( res, _mm256_extractf128_si256( i64, 1 ) );
        res = _mm_add_epi64( res, _mm_unpackhi_epi64( res, res ) );
        return (double)_mm_cvtsi128_si64( res );
    }
    // Convert 32-bit lanes into 64-bit, compute sum of the 8, convert to double
    inline double hadd_epu32( __m256i x )
    {
        const __m256i zero = _mm256_setzero_si256();
        __m256i i64 = _mm256_unpacklo_epi32( x, zero );
        i64 = _mm256_add_epi64( i64, _mm256_unpackhi_epi32( x, zero ) );
        return hadd_epi64( i64 );
    }
}
class InnerProduct
{
    // These fields are interpreted as 64-bit integers
    __m256i a, b;
    // These fields are interpreted as 32-bit integers
    __m256i aa, bb, ab;
    // Accumulate products of 16-bit numbers, 16 of them at once
    inline void add16( __m256i x, __m256i y )
    {
        const __m256i x2 = _mm256_madd_epi16( x, x );
        const __m256i y2 = _mm256_madd_epi16( y, y );
        const __m256i prod = _mm256_madd_epi16( x, y );
        aa = _mm256_add_epi32( aa, x2 );
        bb = _mm256_add_epi32( bb, y2 );
        ab = _mm256_add_epi32( ab, prod );
    }
public:
    InnerProduct()
    {
        a = b = aa = bb = ab = _mm256_setzero_si256();
    }
    // Handle 32 bytes
    inline void addBytes( __m256i x, __m256i y )
    {
        // Accumulate values
        const __m256i zero = _mm256_setzero_si256();
        a = _mm256_add_epi64( a, _mm256_sad_epu8( x, zero ) );
        b = _mm256_add_epi64( b, _mm256_sad_epu8( y, zero ) );
        // Split the vectors into 2 sets of 16-bit numbers, accumulate products
        const __m256i z = _mm256_unpacklo_epi8( x, zero );
        const __m256i w = _mm256_unpacklo_epi8( y, zero );
        add16( z, w );
        x = _mm256_unpackhi_epi8( x, zero );
        y = _mm256_unpackhi_epi8( y, zero );
        add16( x, y );
    }
    // Compute the result
    float compute( size_t count ) const
    {
        const double div = (double)count;
        const double mean1 = hadd_epi64( a ) / div;
        const double mean2 = hadd_epi64( b ) / div;
        const double mean11 = hadd_epu32( aa ) / div;
        const double mean22 = hadd_epu32( bb ) / div;
        const double mean12 = hadd_epu32( ab ) / div;
        const double b = ( mean11 - mean1 * mean1 ) * ( mean22 - mean2 * mean2 );
        if( b <= 0 )
            return 0;
        const double a = ( mean12 - mean1 * mean2 );
        return float( a / sqrt( b ) );
    }
};
// Load 1-31 bytes into AVX register, zero out unused higher bytes
inline __m256i loadPartial( const uint8_t* p, size_t length )
{
    alignas( 32 ) std::array<uint8_t, 32> arr{};
    memcpy( arr.data(), p, length );
    return _mm256_load_si256( ( const __m256i* )arr.data() );
}
float correlate_simple( const uint8_t* vec1, const uint8_t* vec2, const size_t length )
{
    InnerProduct ip;
    const uint8_t* const vec1End = vec1 + ( ( length / 32 ) * 32 );
    for( ; vec1 < vec1End; vec1 += 32, vec2 += 32 )
    {
        const __m256i x = _mm256_loadu_si256( ( const __m256i * )vec1 );
        const __m256i y = _mm256_loadu_si256( ( const __m256i * )vec2 );
        ip.addBytes( x, y );
    }
    const size_t remainder = length % 32;
    if( remainder > 0 )
    {
        const __m256i x = loadPartial( vec1, remainder );
        const __m256i y = loadPartial( vec2, remainder );
        ip.addBytes( x, y );
    }
    return ip.compute( length );
}
Unfortunately I could not find an intrinsic that would allow me to subtract 127 from unsigned bytes producing signed bytes (not relying on some two's complement magic).
Intel x86 CPUs use a two's complement representation of signed numbers, which is why there are no separate signed/unsigned versions of SIMD intrinsics such as packed addition and subtraction: the result bit pattern is the same either way. Intel SIMD intrinsics are outside the scope of the C++ standard and have a specific, well-defined behaviour.
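A minimal sketch of that point (my own illustration, using 128 rather than 127 so every centered value fits exactly in [-128, 127]):
// Sketch: center unsigned bytes with a plain wrap-around subtraction. The resulting
// bit patterns are exactly the signed values in [-128, 127] thanks to two's complement.
__m256i center_bytes(__m256i x_u8)
{
    return _mm256_sub_epi8(x_u8, _mm256_set1_epi8((char)128));
}
// Caveat: _mm256_maddubs_epi16 saturates the sum of each adjacent pair of products
// to 16 bits (255 * 127 * 2 would already overflow), which is why the answer above
// widens to 16-bit values and uses _mm256_madd_epi16 instead.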

How to use intrinsics to elementwise multiply two char arrays and sum up the multiplications into int?

I am not familiar with x86_64 intrinsics. I'd like to implement the following operation using 256-bit vector registers.
I was using _mm256_maddubs_epi16(a, b); however, it seems that this instruction has an overflow issue, since the sum of two adjacent char*char products can exceed the 16-bit maximum value (the instruction saturates). I also have trouble understanding _mm256_unpackhi_epi32 and related instructions.
Can anyone explain this and point me in the right direction? Thank you!
int sumup_char_arrays(char *A, char *B, int size) {
    assert(size % 32 == 0);
    int sum = 0;
    for (int i = 0; i < size; i++) {
        sum += A[i] * B[i];
    }
    return sum;
}
I've figured out a solution; any ideas to improve it, especially the final reduction stage?
int sumup_char_arrays(char *A, char *B, int size) {
    assert(size % 32 == 0);
    int sum = 0;
    __m256i sum_tmp = _mm256_setzero_si256(); // accumulator must start at zero
    for (int i = 0; i < size; i += 32) {
        __m256i ma_l = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)(A + i)));
        __m256i ma_h = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)(A + i + 16)));
        __m256i mb_l = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)(B + i)));
        __m256i mb_h = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)(B + i + 16)));
        __m256i mc = _mm256_madd_epi16(ma_l, mb_l);
        mc = _mm256_add_epi32(mc, _mm256_madd_epi16(ma_h, mb_h));
        sum_tmp = _mm256_add_epi32(mc, sum_tmp);
        //sum += A[i]*B[i];
    }
    sum_tmp = _mm256_add_epi32(sum_tmp, _mm256_permute2x128_si256(sum_tmp, sum_tmp, 0x81));
    sum_tmp = _mm256_add_epi32(sum_tmp, _mm256_srli_si256(sum_tmp, 8));
    sum_tmp = _mm256_add_epi32(sum_tmp, _mm256_srli_si256(sum_tmp, 4));
    sum = _mm256_extract_epi32(sum_tmp, 0);
    return sum;
}
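On the final reduction: a common alternative (a sketch of my own, not from the original post) is to fold the 256-bit accumulator down to 128 bits first and finish with two in-register shuffles:
// Sketch: horizontal sum of the eight 32-bit lanes of an __m256i.
inline int hsum_epi32_avx2(__m256i v)
{
    __m128i lo = _mm256_castsi256_si128(v);
    __m128i hi = _mm256_extracti128_si256(v, 1);
    __m128i s  = _mm_add_epi32(lo, hi);                                  // 8 -> 4 lanes
    s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(1, 0, 3, 2))); // 4 -> 2 lanes
    s = _mm_add_epi32(s, _mm_shuffle_epi32(s, _MM_SHUFFLE(2, 3, 0, 1))); // 2 -> 1 lane
    return _mm_cvtsi128_si32(s);
}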

Fast implementation of covariance of two 8-bit arrays

I need to compare a large number of similar images of small size (up to 200x200).
So I am trying to implement the SSIM (structural similarity, see https://en.wikipedia.org/wiki/Structural_similarity) algorithm.
SSIM requires calculating the covariance of two 8-bit gray images.
A trivial implementation looks like this:
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
    float sum = 0;
    for(size_t i = 0; i < size; ++i)
        sum += (x[i] - averageX) * (y[i] - averageY);
    return sum / size;
}
But it has poor performance.
So I hope to improve it using SIMD or CUDA (I heard that this can be done).
Unfortunately I have no experience with this.
What would it look like, and where should I start?
I have another nice solution!
First I want to mention some mathematical formulas:
averageX = Sum(x[i])/size;
averageY = Sum(y[i])/size;
And therefore:
Sum((x[i] - averageX)*(y[i] - averageY))/size =
Sum(x[i]*y[i])/size - Sum(x[i]*averageY)/size -
Sum(averageX*y[i])/size + Sum(averageX*averageY)/size =
Sum(x[i]*y[i])/size - averageY*Sum(x[i])/size -
averageX*Sum(y[i])/size + averageX*averageY*Sum(1)/size =
Sum(x[i]*y[i])/size - averageY*averageX -
averageX*averageY + averageX*averageY =
Sum(x[i]*y[i])/size - averageY*averageX;
This allows us to modify the algorithm:
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
    uint32_t sum = 0; // If the images are larger than 256x256, you have to use uint64_t.
    for(size_t i = 0; i < size; ++i)
        sum += x[i] * y[i];
    return float(sum) / size - averageY * averageX;
}
And only after that do we apply SIMD (I used SSE2):
#include <emmintrin.h>

inline __m128i SigmaXY(__m128i x, __m128i y)
{
    __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()), _mm_unpacklo_epi8(y, _mm_setzero_si128()));
    __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(x, _mm_setzero_si128()), _mm_unpackhi_epi8(y, _mm_setzero_si128()));
    return _mm_add_epi32(lo, hi);
}
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
    uint32_t sum = 0;
    size_t i = 0, alignedSize = size/16*16;
    if(size >= 16)
    {
        __m128i sums = _mm_setzero_si128();
        for(; i < alignedSize; i += 16)
        {
            __m128i _x = _mm_loadu_si128((__m128i*)(x + i));
            __m128i _y = _mm_loadu_si128((__m128i*)(y + i));
            sums = _mm_add_epi32(sums, SigmaXY(_x, _y));
        }
        uint32_t _sums[4];
        _mm_storeu_si128((__m128i*)_sums, sums);
        sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
    }
    for(; i < size; ++i)
        sum += x[i] * y[i];
    return float(sum) / size - averageY * averageX;
}
There is a SIMD implementation of the algorithm (I used SSE4.1):
#include <smmintrin.h>

template <int shift> inline __m128 SigmaXY(const __m128i & x, const __m128i & y, __m128 & averageX, __m128 & averageY)
{
    __m128 _x = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(x, shift)));
    __m128 _y = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(y, shift)));
    return _mm_mul_ps(_mm_sub_ps(_x, averageX), _mm_sub_ps(_y, averageY));
}
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
    float sum = 0;
    size_t i = 0, alignedSize = size/16*16;
    if(size >= 16)
    {
        __m128 sums = _mm_setzero_ps();
        __m128 avgX = _mm_set1_ps(averageX);
        __m128 avgY = _mm_set1_ps(averageY);
        for(; i < alignedSize; i += 16)
        {
            __m128i _x = _mm_loadu_si128((__m128i*)(x + i));
            __m128i _y = _mm_loadu_si128((__m128i*)(y + i));
            sums = _mm_add_ps(sums, SigmaXY<0>(_x, _y, avgX, avgY));
            sums = _mm_add_ps(sums, SigmaXY<4>(_x, _y, avgX, avgY));
            sums = _mm_add_ps(sums, SigmaXY<8>(_x, _y, avgX, avgY));
            sums = _mm_add_ps(sums, SigmaXY<12>(_x, _y, avgX, avgY));
        }
        float _sums[4];
        _mm_storeu_ps(_sums, sums);
        sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
    }
    for(; i < size; ++i)
        sum += (x[i] - averageX) * (y[i] - averageY);
    return sum / size;
}
I hope it will be useful for you.