C++ convert SSE code to AVX - c++

With the help of YOU, I have used SSE in my code (sample below) with significant performance boost and I was wondering if this boost could be improved by using 256bit registers of AVX.
// Scratch buffer used to spill the four 32-bit lanes of the vector maximum.
int result[4] __attribute__((aligned(16))) = {0};
// Running per-lane maximum; it starts at 0, so the final answer is never below 0.
__m128i vresult = _mm_set1_epi32(0);
__m128i v1, v2, vmax;
// Four ints are processed per iteration (k += 4).
for (int k = 0; k < limit; k += 4) {
// _mm_load_si128 is an aligned load: both addresses must be 16-byte aligned.
v1 = _mm_load_si128((__m128i *) & myVector[positionNodeId + k]);
v2 = _mm_load_si128((__m128i *) & myVector2[k]);
// Despite its name, vmax holds the element-wise sum v1 + v2.
vmax = _mm_add_epi32(v1, v2);
vresult = _mm_max_epi32(vresult, vmax);
}
// Spill the four lane maxima and reduce them to one scalar.
_mm_store_si128((__m128i *) result, vresult);
return max(max(max(result[0], result[1]), result[2]), result[3]);
So, I have 3 questions: How could the above rather simple SSE code be converted to AVX? What header should I include for that? And what flag should I pass to my gcc compiler (instead of -msse4.1) for AVX to work?
Thanks in advance for your help.

1.) This code can be easily converted to AVX2 (see below)
2.) #include <x86intrin.h>
3.) compile with -mavx2
You will need a CPU that supports AVX2. Currently only Intel Haswell processors support this. I don't have a Haswell processor (yet) so I could not test the code.
// Scratch buffer; only the disabled store-and-scan reduction at the bottom uses it.
int result[8] __attribute__((aligned(32))) = {0};
// Running per-lane maximum over eight 32-bit lanes; it starts at 0, so the answer is never below 0.
__m256i vresult = _mm256_set1_epi32(0);
__m256i v1, v2, vmax;
// Eight ints are processed per iteration (k += 8).
for (int k = 0; k < limit; k += 8) {
// _mm256_load_si256 is an aligned load: both addresses must be 32-byte aligned.
v1 = _mm256_load_si256((__m256i *) & myVector[positionNodeId + k]);
v2 = _mm256_load_si256((__m256i *) & myVector2[k]);
// Despite its name, vmax holds the element-wise sum v1 + v2.
vmax = _mm256_add_epi32(v1, v2);
vresult = _mm256_max_epi32(vresult, vmax);
}
// Reduce the eight lanes to a single scalar maximum.
return horizontal_max_Vec8i(vresult);
// Disabled alternative reduction: spill the vector to memory and scan the eight values.
//_mm256_store_si256((__m256i *) result, vresult);
//int mymax = result[0];
//for(int k=1; k<8; k++) {
// if(result[k]>mymax) mymax = result[k];
//}
//return mymax;
Edit: I suspect that, since you are only running over 64 elements, the horizontal max accounts for a small but not insignificant part of the computation time. I came up with a horizontal_max_Vec4i function for SSE and a horizontal_max_Vec8i function for AVX (it does not need AVX2). Try replacing max(max(max(result[0], result[1]), result[2]), result[3]) with horizontal_max_Vec4i(vresult).
int horizontal_max_Vec4i(__m128i x) {
__m128i max1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(0,0,3,2));
__m128i max2 = _mm_max_epi32(x,max1);
__m128i max3 = _mm_shuffle_epi32(max2, _MM_SHUFFLE(0,0,0,1));
__m128i max4 = _mm_max_epi32(max2,max3);
return _mm_cvtsi128_si32(max4);
}
// Returns the largest of the eight signed 32-bit lanes of x.
int horizontal_max_Vec8i(__m256i x) {
    // Fold the upper 128-bit half onto the lower half, then finish with the 4-lane reduction.
    const __m128i upper_half = _mm256_extractf128_si256(x, 1);
    const __m128i lower_half = _mm256_castsi256_si128(x);
    return horizontal_max_Vec4i(_mm_max_epi32(lower_half, upper_half));
}

Related

SSE optimization of sum of squared differences

I've recently found that my program spend most time in the following simple function:
// Computes the sum of squared differences between two 8-bit images.
//
// a, b             - pointers to the first row of each image
// aStride, bStride - distance in bytes between the starts of consecutive rows
// width, height    - image size in pixels
// sum              - receives the total; written once, after all rows are processed
//
// The total is accumulated in 64 bits. A per-row `int` accumulator would overflow
// (undefined behaviour) once a row holds more than about 33,000 pixels of maximum
// difference, because 255 * 255 * 33026 exceeds INT_MAX.
void SumOfSquaredDifference(
    const uint8_t * a, size_t aStride, const uint8_t * b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    uint64_t total = 0;
    for (size_t row = 0; row < height; ++row)
    {
        for (size_t col = 0; col < width; ++col)
        {
            // Operands promote to int, so d lies in [-255, 255] and d * d fits easily.
            const int d = a[col] - b[col];
            total += (uint64_t)(d * d);
        }
        a += aStride;
        b += bStride;
    }
    *sum = total;
}
This function finds a sum of squared difference of two 8-bit gray images.
I think that there is the way to improve its performance with using SSE, but I don't have an experience in this area.
Could anybody help me?
Of course, you can improve your code.
This is an example of optimizing your function using SSE2:
// All-zero vector; used as the upper half when widening unsigned lanes and as the initial accumulator.
const __m128i Z = _mm_setzero_si128();
// Size of one SSE register in bytes; the column loop advances by this many pixels.
const size_t A = sizeof(__m128i);
// Returns four 32-bit partial sums of (a[i] - b[i])^2 over the 16 unsigned bytes of a and b.
inline __m128i SquaredDifference(__m128i a, __m128i b)
{
    // Zero-extend each byte to 16 bits so the subtraction keeps its sign.
    const __m128i diffLow = _mm_sub_epi16(_mm_unpacklo_epi8(a, Z), _mm_unpacklo_epi8(b, Z));
    const __m128i diffHigh = _mm_sub_epi16(_mm_unpackhi_epi8(a, Z), _mm_unpackhi_epi8(b, Z));
    // madd multiplies 16-bit lanes and adds adjacent products into 32-bit lanes,
    // so squaring a vector against itself yields sums of two squares per lane.
    const __m128i squaresLow = _mm_madd_epi16(diffLow, diffLow);
    const __m128i squaresHigh = _mm_madd_epi16(diffHigh, diffHigh);
    return _mm_add_epi32(squaresLow, squaresHigh);
}
// Widens the four 32-bit lanes of a to unsigned 64-bit values and adds them pairwise,
// producing two 64-bit sums: lane0 + lane2 and lane1 + lane3.
inline __m128i HorizontalSum32(__m128i a)
{
    const __m128i lowerLanes = _mm_unpacklo_epi32(a, Z);
    const __m128i upperLanes = _mm_unpackhi_epi32(a, Z);
    return _mm_add_epi64(lowerLanes, upperLanes);
}
// Adds the two 64-bit lanes of a and returns the scalar total.
inline uint64_t ExtractSum64(__m128i a)
{
    uint64_t halves[2];
    // Unaligned store: the local array carries no 16-byte alignment guarantee.
    _mm_storeu_si128((__m128i*)halves, a);
    uint64_t total = halves[0];
    total += halves[1];
    return total;
}
// SSE2 sum of squared differences between two 8-bit images.
//
// a, b             - pointers to the first row of each image (no alignment required)
// aStride, bStride - distance in bytes between the starts of consecutive rows
// width, height    - image size in pixels; width must be a multiple of 16 and below 65536
// sum              - receives the 64-bit total
void SumOfSquaredDifference(
    const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    assert(width%A == 0 && width < 0x10000);
    const uint8_t *rowA = a;
    const uint8_t *rowB = b;
    __m128i total = Z;
    for (size_t remaining = height; remaining != 0; --remaining)
    {
        // Per-row sums stay in 32-bit lanes; they are widened to 64 bits once per row.
        __m128i rowAcc = Z;
        for (size_t offset = 0; offset < width; offset += A)
        {
            const __m128i pixelsA = _mm_loadu_si128((const __m128i*)(rowA + offset));
            const __m128i pixelsB = _mm_loadu_si128((const __m128i*)(rowB + offset));
            rowAcc = _mm_add_epi32(rowAcc, SquaredDifference(pixelsA, pixelsB));
        }
        total = _mm_add_epi64(total, HorizontalSum32(rowAcc));
        rowA += aStride;
        rowB += bStride;
    }
    *sum = ExtractSum64(total);
}
This example is slightly simplified (it doesn't work if the image width isn't a multiple of 16).
Full version of the algorithm is here.
And some magic from SSSE3 version:
const __m128i K_1FF = _mm_set1_epi16(0x1FF);
inline __m128i SquaredDifference(__m128i a, __m128i b)
{
const __m128i lo = _mm_maddubs_epi16(_mm_unpacklo_epi8(a, b), K_1FF);
const __m128i hi = _mm_maddubs_epi16(_mm_unpackhi_epi8(a, b), K_1FF);
return _mm_add_epi32(_mm_madd_epi16(lo, lo), _mm_madd_epi16(hi, hi));
}
The magic description (see _mm_maddubs_epi16):
K_1FF -> {-1, 1, -1, 1, ...};
_mm_unpacklo_epi8(a, b) -> {a0, b0, a1, b1, ...};
_mm_maddubs_epi16(_mm_unpacklo_epi8(a, b), K_1FF) -> {b0 - a0, b1 - a1, ...};
GCC has switches that encourage it to vectorize the code. For example, the -mfma switch gives me about 25% speed increase on simple loops like this, using doubles. I imagine it's even better with 8-bit integers. I prefer that over hand-written optimizations because your code stays readable.
That said, there are a few old tricks that can speed up your loop:
Don't index, increment your pointer in every loop iteration. You do this in the outer loop, you should do the same in the inner loop. You can create a new pointer before going into the inner loop, so the +=stride stays valid.
Don't assign into the sum pointer inside your loop, use a local variable to accumulate and copy to the output when done. You use rowSum, but only in the inner loop. Use that variable across both loops instead.

Convert SSE matrix-vector multiplication code to AVX

I'm trying to convert my SSE function to AVX. The function does vector-matrix multiplication, here's my working SSE code:
void multiply_matrix_by_vector_SSE(float* m, float* v, float* result, unsigned const int vector_dims)
{
size_t i, j;
for (i = 0; i < vector_dims; ++i)
{
__m128 acc = _mm_setzero_ps();
for (j = 0; j < vector_dims; j += 4)
{
__m128 vec = _mm_load_ps(&v[j]);
__m128 mat = _mm_load_ps(&m[j + vector_dims * i]);
//acc = _mm_add_ps(acc, _mm_mul_ps(mat, vec));
acc = _mm_fmadd_ps(mat, vec, acc);
}
acc = _mm_hadd_ps(acc, acc);
acc = _mm_hadd_ps(acc, acc);
_mm_store_ss(&result[i], acc);
}
}
And here's what I've come up with as for AVX:
// AVX attempt at result = m * v, eight floats per step, exactly as posted in the question.
// The code is left unchanged because its defects are the subject of the question;
// they are flagged in the NOTE(review) comments below.
void multiply_matrix_by_vector_AVX(float* m, float* v, float* result, unsigned const int vector_dims)
{
size_t i, j;
for (i = 0; i < vector_dims; ++i)
{
__m256 acc = _mm256_setzero_ps();
for (j = 0; j < vector_dims; j += 8)
{
// Aligned 256-bit loads: both addresses must be 32-byte aligned.
__m256 vec = _mm256_load_ps(&v[j]);
__m256 mat = _mm256_load_ps(&m[j + vector_dims * i]);
acc = _mm256_fmadd_ps(mat, vec, acc);
}
// NOTE(review): _mm256_hadd_ps works on each 128-bit half independently, so repeating it
// never adds the lower half to the upper half; the two half-sums remain separate.
acc = _mm256_hadd_ps(acc, acc);
acc = _mm256_hadd_ps(acc, acc);
acc = _mm256_hadd_ps(acc, acc);
acc = _mm256_hadd_ps(acc, acc);
// NOTE(review): this is an aligned store of eight floats starting at &result[i]. That
// address is 32-byte aligned only for some i, and the store writes result[i..i+7]
// rather than the single element intended.
_mm256_store_ps(&result[i], acc);
}
}
however, the AVX code crashes (Access violation reading location 0xFFFFFFFFFFFFFFFF).
Could anyone help me to make my AVX function work properly?
PS: the sizes of the matrices and vectors that I pass to my functions are always multiples of 8. Also, the arrays I pass to my SSE function are 16-byte aligned (__declspec(align(16))float* = generate_matrix(256);) and the arrays I pass to my AVX function are 32-byte aligned (__declspec(align(32))float* = generate_matrix(256););
Unfortunately using horizontal adds like that does not trivially extend to 256 bit, because the instruction (and most others) is "laned" - it acts like two haddps's in parallel, one on the top half and one on the bottom half, with no mixing, so the bottom and top halves will not get summed together.
Also, it is, of course, still not a packed result, and that packed store there is an aligned store writing to some unaligned address and will fail (that error is a bit weird but whatever).
Anyway let's fix the horizontal sum: (not tested)
// Step 1 (unchanged): two horizontal adds reduce each 128-bit half on its own,
// leaving every lane of a half equal to that half's sum.
acc = _mm256_hadd_ps(acc, acc);
acc = _mm256_hadd_ps(acc, acc);
// Step 2 (new): pull out the lower and upper halves and add their lowest lanes,
// which combines the two half-sums into the full sum.
__m128 acc1 = _mm256_extractf128_ps(acc, 0);
__m128 acc2 = _mm256_extractf128_ps(acc, 1);
acc1 = _mm_add_ss(acc1, acc2);
// Scalar store: only result[i] is written, so no alignment is required.
_mm_store_ss(&result[i], acc1);
By the way that inner loop needs 10 independent chains (and 10 accumulators) in order to maximize the throughput on Haswell.

C++ vectorization of conditional code with intrinsics

I tried to enable vectorization of an often-used function to improve the performance.
The algorithm should do the following and is called ~4.000.000 times!
Input: double* cellvalue
Output: int8* Output (8 bit integer, c++ char)
Algo:
if (cellvalue > upper_threshold )
*output = 1;
else if (cellvalue < lower_threshold)
*output = -1;
else
*output = 0;
My first vectorization approach to compute 2 doubles in parallel looks like:
// Broadcast each threshold into both 64-bit lanes.
__m128d lowerThresh = _mm_set1_pd(m_lowerThreshold);
__m128d upperThresh = _mm_set1_pd(m_upperThreshold);
// Aligned load of two doubles; cellvalue must be 16-byte aligned.
__m128d vec = _mm_load_pd(cellvalue);
// Each compare yields an all-ones lane where the condition holds and an all-zeros lane otherwise.
__m128d maskLower = _mm_cmplt_pd(vec, lowerThresh); // less than
__m128d maskUpper = _mm_cmpgt_pd(vec, upperThresh); // greater than
static const tInt8 negOne = -1;
static const tInt8 posOne = 1;
// Read the first byte of each mask lane (0x00 or 0xFF) through the MSVC-specific m128d_f64
// member and combine: below lower -> -1, above upper -> +1, otherwise 0.
output[0] = (negOne & *((tInt8*)&maskLower.m128d_f64[0])) | (posOne & *((tInt8*)&maskUpper.m128d_f64[0]));
output[1] = (negOne & *((tInt8*)&maskLower.m128d_f64[1])) | (posOne & *((tInt8*)&maskUpper.m128d_f64[1]));
Does this make sense to you? It works, but I think the last part to create the output is very complicated. Is there any faster method to do this?
Also I tried to compute 8 values at once with nearly the same code. Will this perform better? Does the order of instructions make sense?
// Same scheme as the two-value version, unrolled to eight doubles (four 128-bit vectors).
__m128d lowerThresh = _mm_set1_pd(m_lowerThreshold);
__m128d upperThresh = _mm_set1_pd(m_upperThreshold);
// load 4 times
// Aligned loads; cellValue must be 16-byte aligned.
__m128d vec0 = _mm_load_pd(cellValue);
__m128d vec1 = _mm_load_pd(cellValue + 2);
__m128d vec2 = _mm_load_pd(cellValue + 4);
__m128d vec3 = _mm_load_pd(cellValue + 6);
// Each compare yields an all-ones lane where the condition holds and an all-zeros lane otherwise.
__m128d maskLower0 = _mm_cmplt_pd(vec0, lowerThresh); // less than
__m128d maskLower1 = _mm_cmplt_pd(vec1, lowerThresh); // less than
__m128d maskLower2 = _mm_cmplt_pd(vec2, lowerThresh); // less than
__m128d maskLower3 = _mm_cmplt_pd(vec3, lowerThresh); // less than
__m128d maskUpper0 = _mm_cmpgt_pd(vec0, upperThresh); // greater than
__m128d maskUpper1 = _mm_cmpgt_pd(vec1, upperThresh); // greater than
__m128d maskUpper2 = _mm_cmpgt_pd(vec2, upperThresh); // greater than
__m128d maskUpper3 = _mm_cmpgt_pd(vec3, upperThresh); // greater than
static const tInt8 negOne = -1;
static const tInt8 posOne = 1;
// Results are still extracted one byte at a time through the MSVC-specific m128d_f64 member:
// below lower -> -1, above upper -> +1, otherwise 0.
output[0] = (negOne & *((tInt8*)&maskLower0.m128d_f64[0])) | (posOne & *((tInt8*)&maskUpper0.m128d_f64[0]));
output[1] = (negOne & *((tInt8*)&maskLower0.m128d_f64[1])) | (posOne & *((tInt8*)&maskUpper0.m128d_f64[1]));
output[2] = (negOne & *((tInt8*)&maskLower1.m128d_f64[0])) | (posOne & *((tInt8*)&maskUpper1.m128d_f64[0]));
output[3] = (negOne & *((tInt8*)&maskLower1.m128d_f64[1])) | (posOne & *((tInt8*)&maskUpper1.m128d_f64[1]));
output[4] = (negOne & *((tInt8*)&maskLower2.m128d_f64[0])) | (posOne & *((tInt8*)&maskUpper2.m128d_f64[0]));
output[5] = (negOne & *((tInt8*)&maskLower2.m128d_f64[1])) | (posOne & *((tInt8*)&maskUpper2.m128d_f64[1]));
output[6] = (negOne & *((tInt8*)&maskLower3.m128d_f64[0])) | (posOne & *((tInt8*)&maskUpper3.m128d_f64[0]));
output[7] = (negOne & *((tInt8*)&maskLower3.m128d_f64[1])) | (posOne & *((tInt8*)&maskUpper3.m128d_f64[1]));
Hopefully you can help me to understand the vectorization thing a bit better ;)
_mm_cmplt_pd and _mm_cmpgt_pd produce a result that is already either 0 or -1; anding it with -1 does nothing, and anding it with 1 is equivalent to negating it. Thus, if upper_threshold > lower_threshold (so that both conditions are never true), you can just write*:
// The compare masks are __m128d, so reinterpret them as integer vectors (a cast only, no
// conversion) before the 64-bit subtract. Per lane: below lower -> -1, above upper -> +1, else 0.
_mm_storeu_si128((__m128i *)output,
    _mm_sub_epi64(_mm_castpd_si128(maskLower), _mm_castpd_si128(maskUpper)));
(*) it's unclear what an "int8" is in your code; that's not a standard type in C++. It could be an 8-byte int, which is the behavior I've used here. If it's an 8-bit int instead, you'll want to pack up a bunch of results to store together.
Questioner clarifies that they intend int8 to be an 8-bit integer. In that case, you can do the following for a quick implementation:
// Reinterpret the __m128d compare masks as integers, then subtract per 64-bit lane:
// below lower -> -1, above upper -> +1, otherwise 0. Each lane is narrowed on assignment.
__m128i result = _mm_sub_epi64(_mm_castpd_si128(maskLower), _mm_castpd_si128(maskUpper));
output[0] = result.m128i_i64[0]; // .m128i_i64 is an oddball MSVC-ism, so
output[1] = result.m128i_i64[1]; // I'm not 100% sure about the syntax here.
but you may also want to try packing eight result vectors together and store them with a single store operation.
If you change the code not to branch, then a modern compiler will do the vectorization for you.
Here's the test I ran:
#include <stdint.h>
#include <iostream>
#include <random>
#include <vector>
#include <chrono>
// Clock and duration unit used for the timings printed by main.
using Clock = std::chrono::steady_clock;
using std::chrono::milliseconds;
// Element types under test: double inputs, 8-bit signed outputs.
typedef double Scalar;
typedef int8_t Integer;
// Classification thresholds: above the upper one -> 1, below the lower one -> -1, otherwise 0.
const Scalar kUpperThreshold = .5;
const Scalar kLowerThreshold = .2;
void compute_comparisons1(int n, const Scalar* xs, Integer* ys) {
#pragma simd
for (int i=0; i<n; ++i) {
Scalar x = xs[i];
ys[i] = (x > kUpperThreshold) - (x < kLowerThreshold);
}
}
void compute_comparisons2(int n, const Scalar* xs, Integer* ys) {
for (int i=0; i<n; ++i) {
Scalar x = xs[i];
Integer& y = ys[i];
if (x > kUpperThreshold)
y = 1;
else if(x < kLowerThreshold)
y = -1;
else
y = 0;
}
}
// Number of samples classified by each implementation.
const int N = 4000000;
// Mersenne Twister seeded with 0, so every run generates the same inputs.
auto random_generator = std::mt19937{0};
int main() {
std::vector<Scalar> xs(N);
std::vector<Integer> ys1(N);
std::vector<Integer> ys2(N);
std::uniform_real_distribution<Scalar> dist(0, 1);
for (int i=0; i<N; ++i)
xs[i] = dist(random_generator);
auto time0 = Clock::now();
compute_comparisons1(N, xs.data(), ys1.data());
auto time1 = Clock::now();
compute_comparisons2(N, xs.data(), ys2.data());
auto time2 = Clock::now();
std::cout << "v1: " << std::chrono::duration_cast<milliseconds>(time1 - time0).count() << "\n";
std::cout << "v2: " << std::chrono::duration_cast<milliseconds>(time2 - time1).count() << "\n";
for (int i=0; i<N; ++i) {
if (ys1[i] != ys2[i]) {
std::cout << "Error!\n";
return -1;
}
}
return 0;
}
If you compile with a recent version of gcc (I used 4.8.3) and use the flags "-O3 -std=c++11 -march=native -S", you can verify by looking at the assembly that it vectorizes the code. And it runs much faster (3 milliseconds vs 16 milliseconds on my machine.)
Also, I'm not sure what your requirements are; but if you can live with less precision, then using float instead of double will further improve the speed (double takes 1.8x as long on my machine)

how to use SSE to process array of ints, using a condition

I'm new to SSE, and limited in knowledge. I'm trying to vectorize my code (C++, using gcc), which is actually quite simple.
I have an array of unsigned ints, and I only check for elements that are >=, or <= than some constant. As result, I need an array with elements that passed condition.
I'm thinking of using '_mm_cmpge_ps' as a mask, but this construct works on floats, not ints!? :(
any suggestion, help is very much appreciated.
It's pretty easy to just mask out (i.e. set to 0) all non-matching ints. e.g.
#include <emmintrin.h> // SSE2 intrinsics
for (int i = 0; i < N; i += 4)
{
__m128i v = _mm_load_si128(&a[i]);
__m128i vcmp0 = _mm_cmpgt_epi32(v, _mm_set1_epi32(MIN_VAL - 1));
__m128i vcmp1 = _mm_cmplt_epi32(v, _mm_set1_epi32(MAX_VAL + 1));
__m128i vcmp = _mm_and_si128(vcmp0, vcmp1);
v = _mm_and_si128(v, vcmp);
_mm_store_si128(&a[i], v);
}
Note that a needs to be 16 byte aligned and N needs to be a multiple of 4 - if these constraints are a problem then it's not too hard to extend the code to cope with this.
Here you go. Here are three functions.
The first function, foo_v1, is based on Paul R's answer.
The second function, foo_v2, is based on a popular question today: Fastest way to determine if an integer is between two integers (inclusive) with known sets of values
The third function, foo_v3 uses Agner Fog's vectorclass which I added only to show how much easier and cleaner it is to use his class. If you don't have the class then just comment out the #include "vectorclass.h" line and the foo_v3 function. I used Vec8ui which means it will use AVX2 if available and break it into two Vec4ui otherwise so you don't have to change your code to get the benefit of AVX2.
#include <stdio.h>
#include <nmmintrin.h> // SSE4.2
#include "vectorclass.h"
// Zeroes, in place, every element of a[0..N) that lies outside [MIN_VAL, MAX_VAL] (signed compare).
// a must be 16-byte aligned; four elements are processed per iteration.
void foo_v1(const int N, int *a, const int MAX_VAL, const int MIN_VAL) {
    for (int base = 0; base < N; base += 4) {
        __m128i *slot = (__m128i*)&a[base];
        const __m128i values = _mm_load_si128((const __m128i*)slot);
        // x >= MIN_VAL is written as x > MIN_VAL - 1, and x <= MAX_VAL as x < MAX_VAL + 1.
        const __m128i atLeastMin = _mm_cmpgt_epi32(values, _mm_set1_epi32(MIN_VAL - 1));
        const __m128i atMostMax = _mm_cmplt_epi32(values, _mm_set1_epi32(MAX_VAL + 1));
        const __m128i keep = _mm_and_si128(atLeastMin, atMostMax);
        _mm_store_si128(slot, _mm_and_si128(values, keep));
    }
}
void foo_v2(const int N, int *a, const int MAX_VAL, const int MIN_VAL) {
//if ((unsigned)(number-lower) < (upper-lower))
for (int i = 0; i < N; i += 4) {
__m128i v = _mm_load_si128((const __m128i*)&a[i]);
__m128i dv = _mm_sub_epi32(v, _mm_set1_epi32(MIN_VAL));
__m128i min_ab = _mm_min_epu32(dv,_mm_set1_epi32(MAX_VAL-MIN_VAL));
__m128i vcmp = _mm_cmpeq_epi32(dv,min_ab);
v = _mm_and_si128(v, vcmp);
_mm_store_si128((__m128i*)&a[i], v);
}
}
// Range filter written with the Vec8ui type from "vectorclass.h", eight elements per iteration.
// NOTE(review): behaviour depends on Vec8ui's load/store and operator overloads, which are not
// shown here; presumably equivalent to foo_v2 with N a multiple of 8 — confirm against the library.
void foo_v3(const int N, int *a, const int MAX_VAL, const int MIN_VAL) {
//if ((unsigned)(number-lower) < (upper-lower))
for (int i = 0; i < N; i += 8) {
Vec8ui va = Vec8ui().load(&a[i]);
// Keep va where (va - MIN_VAL) <= (MAX_VAL - MIN_VAL); the comparison result is used as a mask.
va &= (va - MIN_VAL) <= (MAX_VAL-MIN_VAL);
va.store(&a[i]);
}
}
// Demo: fills a[i] = i for 16 elements, keeps only values in [3, 7] using foo_v2, and prints
// the result ("0 0 0 3 4 5 6 7 0 ..."). Returns 1 if the aligned allocation fails.
int main() {
    const int N = 16;
    // foo_v2 uses aligned loads and stores, so the buffer must be 16-byte aligned.
    int* a = (int*)_mm_malloc(sizeof(int)*N, 16);
    if (a == NULL) {
        fprintf(stderr, "_mm_malloc failed\n");
        return 1;
    }
    for(int i=0; i<N; i++) {
        a[i] = i;
    }
    // Argument order is (N, a, MAX_VAL, MIN_VAL).
    foo_v2(N, a, 7, 3);
    for(int i=0; i<N; i++) {
        printf("%d ", a[i]);
    }
    printf("\n");
    _mm_free(a);
    return 0;
}
First place to look might be the Intel® Intrinsics Guide

SIMD-able code?

What is the strict definition of what code can utilise SIMD instruction set? Is it anything where you can run calculations in parallel?
So if I had:
// Plain scalar reduction: every iteration adds into the same accumulator.
for(int i=0; i<100; i++){
sum += array[i];
}
this could take advantage of SIMD because we could run:
// Same reduction unrolled by four: each of the four accumulators sums every fourth element,
// so the four additions in one iteration do not depend on each other.
for(int i=0; i<100;i=i+4){
sum0 += array[i];
sum1 += array[i+1];
sum2 += array[i+2];
sum3 += array[i+3];
}
// Combine the four partial sums into the final result.
sum = sum0 + sum1 + sum2 + sum3;
?
Does it have to be float types, or could it be double and integer?
Assuming you're talking about x86 (SSE et al) then the supported types for arithmetic are 8, 16, 32 and 64 bit integers, and single and double precision floats. Note however that not all arithmetic operations are supported for all data types - SSE lacks orthogonality in this regard.
Assuming 32 bit ints and suitably aligned arrays (16 byte aligned) then you could implement your above loop example as:
#include <emmintrin.h> // SSE2 intrinsics
int32_t a[100] __attribute__ ((aligned(16)));
// suitably aligned array
__m128i vsum = _mm_set1_epi32(0); // init vsum = { 0, 0, 0, 0 }
for (int i = 0; i < 100; i += 4)
{
    // _mm_load_si128 takes a __m128i pointer, so the element address must be cast
    __m128i v = _mm_load_si128((const __m128i *)&a[i]); // load 4 ints from a[i]..a[i+3]
    vsum = _mm_add_epi32(vsum, v); // accumulate 4 partial sums
}
// final horizontal sum of partial sums: two shift-and-add steps are needed to fold
// four lanes into lane 0 (with only the first step, lane 0 holds just lanes 0 + 2)
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8)); // lanes {0,1} += lanes {2,3}
vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4)); // lane 0 += lane 1
int32_t sum = _mm_cvtsi128_si32(vsum); // sum = scalar sum of a[]