SSE inline assembly and possible g++ optimization bug - c++

Let's start with the code. I have two structures, one for vectors and the other for matrices.
struct AVector
{
explicit AVector(float x=0.0f, float y=0.0f, float z=0.0f, float w=0.0f):
x(x), y(y), z(z), w(w) {}
AVector(const AVector& a):
x(a.x), y(a.y), z(a.z), w(a.w) {}
AVector& operator=(const AVector& a) {x=a.x; y=a.y; z=a.z; w=a.w; return *this;}
float x, y, z, w;
};
struct AMatrix
{
// Row-major
explicit AMatrix(const AVector& a=AVector(), const AVector& b=AVector(), const AVector& c=AVector(), const AVector& d=AVector())
{row[0]=a; row[1]=b; row[2]=c; row[3]=d;}
AMatrix(const AMatrix& m) {row[0]=m.row[0]; row[1]=m.row[1]; row[2]=m.row[2]; row[3]=m.row[3];}
AMatrix& operator=(const AMatrix& m) {row[0]=m.row[0]; row[1]=m.row[1]; row[2]=m.row[2]; row[3]=m.row[3]; return *this;}
AVector row[4];
};
Next, the code performing calculations on those structures. The dot product, using inline assembly and SSE instructions:
inline AVector AVectorDot(const AVector& a, const AVector& b)
{
// XXX
/*const double v=a.x*b.x+a.y*b.y+a.z*b.z+a.w*b.w;
return AVector(v, v, v, v);*/
AVector c;
asm volatile(
"movups (%1), %%xmm0\n\t"
"movups (%2), %%xmm1\n\t"
"mulps %%xmm1, %%xmm0\n\t" // xmm0 -> (a1+b1, , , )
"movaps %%xmm0, %%xmm1\n\t" // xmm1 = xmm0
"shufps $0xB1, %%xmm1, %%xmm1\n\t" // 0xB1 = 10110001
"addps %%xmm1, %%xmm0\n\t" // xmm1 -> (x, y, z, w)+(y, x, w, z)=(x+y, x+y, z+w, z+w)
"movaps %%xmm0, %%xmm1\n\t" // xmm1 = xmm0
"shufps $0x0A, %%xmm1, %%xmm1\n\t" // 0x0A = 00001010
"addps %%xmm1, %%xmm0\n\t" // xmm1 -> (x+y+z+w, , , )
"movups %%xmm0, %0\n\t"
: "=m"(c)
: "r"(&a), "r"(&b)
);
return c;
}
Matrix transposition:
inline AMatrix AMatrixTranspose(const AMatrix& m)
{
AMatrix c(
AVector(m.row[0].x, m.row[1].x, m.row[2].x, m.row[3].x),
AVector(m.row[0].y, m.row[1].y, m.row[2].y, m.row[3].y),
AVector(m.row[0].z, m.row[1].z, m.row[2].z, m.row[3].z),
AVector(m.row[0].w, m.row[1].w, m.row[2].w, m.row[3].w));
// XXX
/*printf("AMcrix c:\n [%5.2f %5.2f %5.2f %5.2f]\n [%5.2f %5.2f %5.2f %5.2f]\n [%5.2f %5.2f %5.2f %5.2f]\n [%5.2f %5.2f %5.2f %5.2f]\n",
c.row[0].x, c.row[0].y, c.row[0].z, c.row[0].w,
c.row[1].x, c.row[1].y, c.row[1].z, c.row[1].w,
c.row[2].x, c.row[2].y, c.row[2].z, c.row[2].w,
c.row[3].x, c.row[3].y, c.row[3].z, c.row[3].w);*/
return c;
}
Matrix-matrix multiplication - I transpose the first matrix, because once it's stored column-major and the second one stays row-major, I can perform the multiplication using dot products.
inline AMatrix AMatrixMultiply(const AMatrix& a, const AMatrix& b)
{
AMatrix c;
const AMatrix at=AMatrixTranspose(a);
// XXX
/*printf("AMatrix at:\n [%5.2f %5.2f %5.2f %5.2f]\n [%5.2f %5.2f %5.2f %5.2f]\n [%5.2f %5.2f %5.2f %5.2f]\n [%5.2f %5.2f %5.2f %5.2f]\n",
at.row[0].x, at.row[0].y, at.row[0].z, at.row[0].w,
at.row[1].x, at.row[1].y, at.row[1].z, at.row[1].w,
at.row[2].x, at.row[2].y, at.row[2].z, at.row[2].w,
at.row[3].x, at.row[3].y, at.row[3].z, at.row[3].w);*/
for(int i=0; i<4; ++i)
{
c.row[i].x=AVectorDot(at.row[0], b.row[i]).w;
c.row[i].y=AVectorDot(at.row[1], b.row[i]).w;
c.row[i].z=AVectorDot(at.row[2], b.row[i]).w;
c.row[i].w=AVectorDot(at.row[3], b.row[i]).w;
}
return c;
}
Now, time for the main (pun intended) part:
int main(int argc, char *argv[])
{
AMatrix a(
AVector(0, 1, 0, 0),
AVector(1, 0, 0, 0),
AVector(0, 0, 0, 1),
AVector(0, 0, 1, 0)
);
AMatrix b(
AVector(1, 0, 0, 0),
AVector(0, 2, 0, 0),
AVector(0, 0, 3, 0),
AVector(0, 0, 0, 4)
);
AMatrix c=AMatrixMultiply(a, b);
printf("AMatrix c:\n [%5.2f %5.2f %5.2f %5.2f]\n [%5.2f %5.2f %5.2f %5.2f]\n [%5.2f %5.2f %5.2f %5.2f]\n [%5.2f %5.2f %5.2f %5.2f]\n",
c.row[0].x, c.row[0].y, c.row[0].z, c.row[0].w,
c.row[1].x, c.row[1].y, c.row[1].z, c.row[1].w,
c.row[2].x, c.row[2].y, c.row[2].z, c.row[2].w,
c.row[3].x, c.row[3].y, c.row[3].z, c.row[3].w);
AVector v(1, 2, 3, 4);
AVector w(1, 1, 1, 1);
printf("Dot product: %f (1+2+3+4 = 10)\n", AVectorDot(v, w).w);
return 0;
}
In the above code I make two matrices, multiply them and print the resulting matrix.
It works fine if I don't use any compiler optimizations (g++ main.cpp -O0 -msse). With optimizations enabled (g++ main.cpp -O1 -msse) the resulting matrix is empty (all fields are zeros).
Uncommenting any of the blocks marked with XXX makes the program print the correct result.
It seems to me that GCC optimizes out the matrix at in the AMatrixMultiply function, because it wrongly assumes it isn't used in AVectorDot, which is written with inline SSE assembly.
The last few lines check whether the dot-product function really works, and yes, it does.
So, the question is: did I do or understand something wrong, or is this some kind of bug in GCC? My guess is a 7:3 mix of the two.
I'm using GCC version 5.1.0 (tdm-1).

This is also a very inefficient way of multiplying matrices using SSE. With so much floating-point throughput available on modern CPUs, I'd be surprised if it was much faster than a scalar implementation. A better method is outlined here, with no explicit transpose needed:
AMatrix & operator *= (AMatrix & m0, const AMatrix & m1)
{
__m128 r0 = _mm_load_ps(& m1[0][x]);
__m128 r1 = _mm_load_ps(& m1[1][x]);
__m128 r2 = _mm_load_ps(& m1[2][x]);
__m128 r3 = _mm_load_ps(& m1[3][x]);
for (int i = 0; i < 4; i++)
{
__m128 ti = _mm_load_ps(& m0[i][x]), t0, t1, t2, t3;
t0 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(0, 0, 0, 0));
t1 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(1, 1, 1, 1));
t2 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(2, 2, 2, 2));
t3 = _mm_shuffle_ps(ti, ti, _MM_SHUFFLE(3, 3, 3, 3));
ti = t0 * r0 + t1 * r1 + t2 * r2 + t3 * r3;
_mm_store_ps(& m0[i][x], ti);
}
return m0;
}
On modern compilers, like gcc and clang, t0 * r0 + t1 * r1 + t2 * r2 + t3 * r3 is actually operating on __m128 types; though you can replace these with _mm_mul_ps and _mm_add_ps intrinsics if you want.
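For compilers or settings where those operators aren't available on __m128, a minimal equivalent of that line using the intrinsics mentioned above would be:
ti = _mm_add_ps(_mm_add_ps(_mm_mul_ps(t0, r0), _mm_mul_ps(t1, r1)),
                _mm_add_ps(_mm_mul_ps(t2, r2), _mm_mul_ps(t3, r3)));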
Return by value is then just a matter of adding a function like:
inline AMatrix operator * (const AMatrix & m0, const AMatrix & m1)
{
AMatrix lhs (m0); return (lhs *= m1);
}
Personally, I'd just replace the float x, y, z, w; with alignas (16) float _s[4] = {}; or similar - so you get a 'zero vector' by default - along with a defaulted constructor:
constexpr AVector () = default;
as well as nice constructors, like:
constexpr AVector (float x, float y, float z, float w)
: _s {x, y, z, w} {}
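Put together, a minimal sketch of such an AVector (the subscript operator is just an assumption of mine for convenience, not something required above):
struct AVector
{
constexpr AVector() = default; // zero vector by default
constexpr AVector(float x, float y, float z, float w) : _s{x, y, z, w} {}
float& operator[](int i) { return _s[i]; }
const float& operator[](int i) const { return _s[i]; }
alignas(16) float _s[4] = {}; // 16-byte aligned, so _mm_load_ps/_mm_store_ps are safe
};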

Your inline assembly lacks some constraints:
asm volatile(
"movups (%1), %%xmm0\n\t"
"movups (%2), %%xmm1\n\t"
"mulps %%xmm1, %%xmm0\n\t" // xmm0 -> (a1+b1, , , )
"movaps %%xmm0, %%xmm1\n\t" // xmm1 = xmm0
"shufps $0xB1, %%xmm1, %%xmm1\n\t" // 0xB1 = 10110001
"addps %%xmm1, %%xmm0\n\t" // xmm1 -> (x, y, z, w)+(y, x, w, z)=(x+y, x+y, z+w, z+w)
"movaps %%xmm0, %%xmm1\n\t" // xmm1 = xmm0
"shufps $0x0A, %%xmm1, %%xmm1\n\t" // 0x0A = 00001010
"addps %%xmm1, %%xmm0\n\t" // xmm1 -> (x+y+z+w, , , )
"movups %%xmm0, %0\n\t"
: "=m"(c)
: "r"(&a), "r"(&b)
);
GCC does not know that this assembler fragment clobbers %xmm0 and %xmm1, so it may keep its own values in those registers across the fragment and assume they still hold them afterwards. Some additional clobbers might be missing as well - in particular, since a and b are only passed in as pointers, GCC is never told that the fragment reads the memory they point to.
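As a sketch of what a more fully constrained version could look like - the register clobbers plus a "memory" clobber, which tells GCC that the fragment also reads the data behind the pointer operands:
asm volatile(
"movups (%1), %%xmm0\n\t"
"movups (%2), %%xmm1\n\t"
"mulps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, %%xmm1\n\t"
"shufps $0xB1, %%xmm1, %%xmm1\n\t"
"addps %%xmm1, %%xmm0\n\t"
"movaps %%xmm0, %%xmm1\n\t"
"shufps $0x0A, %%xmm1, %%xmm1\n\t"
"addps %%xmm1, %%xmm0\n\t"
"movups %%xmm0, %0\n\t"
: "=m"(c)
: "r"(&a), "r"(&b)
: "xmm0", "xmm1", "memory"
);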


Counter-intuitive results while playing with intrinsics

I'm new to the world of intrinsics, and I got here because I saw them as a way to achieve transparent code compilation, i.e. what you see is what you get. Also, reproducibility: for a system supporting e.g. AVX2, I know I'll end up with the same instructions in the end, given that I use AVX2 intrinsics. This is an important step towards writing HPC libraries which make use of SIMD. Feel free to correct my way of thinking.
Now, I have implemented a 3D vector dot product function in three variants in a micro-benchmarking setting. The code has been compiled using the GNU compiler v11.1.0 and run on a machine with an Intel(R) Core(TM) i5-8400 CPU @ 2.80GHz and 32 GiB of DDR4 RAM. The single-thread read-write memory bandwidth of said system has been measured at ~34 GiB/s by running a DAXPY benchmark.
First, let me present the elementary structures.
template<class Treal_t>
struct vector3
{
Treal_t data[3] = {};
inline Treal_t& operator()(const std::size_t& index) { return data[index]; }
inline const Treal_t& operator()(const std::size_t& index) const { return data[index]; }
inline Treal_t l2_norm_sq() const { return data[0] * data[0] + data[1] * data[1] + data[2] * data[2]; }
};
// strictly speaking, the following is a class of its own that implements a subset of
// the functionality of the std::vector. The motivation is to be able to allocate memory
// without "touching" the data, a requirement that is crucial for "cold" microbenchmarking.
template<class Treal_t>
using vector3_array = std::vector<vector3<Treal_t>>;
The first is my scalar code. I'm compiling it with the flag "-O0".
void dot_product_novec(const vector3_array<float>& varray, std::vector<float>& dot_products)
{
static constexpr auto inc = 6;
static constexpr auto dot_products_per_inc = inc / 3;
const auto stream_size_div = varray.size() * 3 / inc * inc;
const auto* float_stream = reinterpret_cast<const float*>(&varray[0](0));
auto dot_product_index = std::size_t{};
for (auto index = std::size_t{}; index < stream_size_div; index += inc, dot_product_index += dot_products_per_inc)
{
dot_products[dot_product_index] = float_stream[index] * float_stream[index] + float_stream[index + 1] * float_stream[index + 1]
+ float_stream[index + 2] * float_stream[index + 2];
dot_products[dot_product_index + 1] = float_stream[index + 3] * float_stream[index + 3]
+ float_stream[index + 4] * float_stream[index + 4] + float_stream[index + 5] * float_stream[index + 5];
}
for (auto index = dot_product_index; index < varray.size(); ++index)
{
dot_products[index] = varray[index].l2_norm_sq();
}
}
Next up is my auto-vectorized loop. Here I'm strongly hinting at auto-vectorization to the compiler using the corresponding OpenMP 4.0 directive. Compiled with the flags "-O3;-ffast-math;-march=native;-fopenmp".
void dot_product_auto(const vector3_array<float>& varray, std::vector<float>& dot_products)
{
#pragma omp simd safelen(16)
for (auto index = std::size_t{}; index < varray.size(); ++index)
{
dot_products[index] = varray[index].l2_norm_sq();
}
}
Finally, here's my version which has been vectorized using intrinsics. Compiled using "-O3;-ffast-math;-march=native;-mfma;-mavx2".
void dot_product(const vector3_array<float>& varray, std::vector<float>& dot_products)
{
static constexpr auto inc = 6;
static constexpr auto dot_products_per_inc = inc / 3;
const auto stream_size_div = varray.size() * 3 / inc * inc;
const auto* float_stream = reinterpret_cast<const float*>(&varray[0](0));
auto dot_product_index = std::size_t{};
static const auto load_mask = _mm256_setr_epi32(-1, -1, -1, -1, -1, -1, 0, 0);
static const auto permute_mask0 = _mm256_setr_epi32(0, 1, 2, 7, 3, 4, 5, 6);
static const auto permute_mask1 = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 4, 0);
static const auto store_mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
for (auto index = std::size_t{}; index < stream_size_div; index += inc, dot_product_index += dot_products_per_inc)
{
// 1. load and permute the vectors
const auto point_packed = _mm256_maskload_ps(float_stream + index, load_mask);
const auto point_permuted_packed = _mm256_permutevar8x32_ps(point_packed, permute_mask0);
// 2. do a multiply
const auto point_permuted_elementwise_sq_packed = _mm256_mul_ps(point_permuted_packed, point_permuted_packed);
// 3. do 2 horizontal additions
const auto hadd1 = _mm256_hadd_ps(point_permuted_elementwise_sq_packed, point_permuted_elementwise_sq_packed);
const auto hadd2 = _mm256_hadd_ps(hadd1, hadd1);
// 4. permute to target position
const auto result_packed = _mm256_permutevar8x32_ps(hadd2, permute_mask1);
// 5. store
_mm256_maskstore_ps(&dot_products[dot_product_index], store_mask, result_packed);
}
for (auto index = dot_product_index; index < varray.size(); ++index) // no opt for remainder loop
{
dot_products[index] = varray[index].l2_norm_sq();
}
}
I've tested the code, so I know it works.
Now, brief details about the microbenchmarking:
I use a small library which I've written for this purpose: https://gitlab.com/anxiousprogrammer/tixl.
20 warm up runs, 100 timed runs.
Fresh allocations in each run for cold microbenchmarking; a first touch (zeroing the first datum in each memory page) of the test data prevents measuring page faults.
I'm modelling the dot product as follows: 5 * size FLOPs for 5 * size * sizeof(float) bytes transferred, i.e. a code balance of 4 bytes/FLOP or a computational intensity of 0.25 FLOP/byte. Using this information, here are the performance results in terms of effective bandwidth:
no-vec: 18.6 GB/s
auto-vec: 21.3 GB/s
intrinsic-vec: 16.4 GB/s
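For reference, the code balance and intensity quoted above follow directly from that model, with sizeof(float) = 4:
$$B_c = \frac{5 \times 4\ \text{bytes}}{5\ \text{FLOP}} = 4\ \text{bytes/FLOP}, \qquad I = \frac{1}{B_c} = 0.25\ \text{FLOP/byte}$$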
Questions:
Is my motivation (mentioned in paragraph 1) a sensible one?
Why is my version slower than the scalar code?
Why are they all far from the peak read-write BW of 34 GiB/s?
Please excuse the lack of a minimal reproducer; the amount of code would be too much. Thanks a lot for your thoughts and inputs.
Your manually-vectorized code is not particularly efficient.
Try to benchmark the following 2 versions instead.
This one is simpler, and only requires the SSE 4.1 instruction set.
inline __m128 loadFloat3( const float* rsi )
{
__m128 xy = _mm_castpd_ps( _mm_load_sd( (const double*)rsi ) );
// Compilers should merge following 2 lines into single INSERTPS with a memory operand
__m128 z = _mm_load_ss( rsi + 2 );
return _mm_insert_ps( xy, z, 0x20 );
}
// Simple version which uses DPPS instruction from SSE 4.1 set
void dotProductsSimple( float* rdi, size_t length, const float* rsi )
{
const float* const rsiEndMinusOne = rsi + ( (ptrdiff_t)length - 1 ) * 3;
const float* const rsiEnd = rsi + length * 3;
for( ; rsi < rsiEndMinusOne; rsi += 3, rdi++ )
{
// Load complete 16 byte vector, discard the W
__m128 v = _mm_loadu_ps( rsi );
v = _mm_dp_ps( v, v, 0b01110001 );
_mm_store_ss( rdi, v );
}
if( rsi < rsiEnd )
{
// For the last vector, load exactly 12 bytes.
// Avoids potential crash when loading from out of bounds
__m128 v = loadFloat3( rsi );
v = _mm_dp_ps( v, v, 0b01110001 );
_mm_store_ss( rdi, v );
}
}
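A minimal sketch of wiring this into the containers from the question (assuming the float-based vector3_array defined there):
vector3_array<float> varray; // filled elsewhere with the 3D vectors to process
std::vector<float> dot_products(varray.size());
dotProductsSimple(dot_products.data(), varray.size(), &varray[0](0));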
This one is more complicated and requires AVX1 support. It's probably going to be slightly faster on most processors.
void dotProductTransposed( float* rdi, size_t length, const float* rsi )
{
constexpr size_t maskAlign8 = ~(size_t)7;
const float* const rsiEndAligned = rsi + ( length & maskAlign8 ) * 3;
const float* const rsiEndMinusOne = rsi + ( (ptrdiff_t)length - 1 ) * 3;
const float* const rsiEnd = rsi + length * 3;
while( rsi < rsiEndAligned )
{
// Load lower halves
__m256 m03, m14, m25;
m03 = _mm256_castps128_ps256( _mm_loadu_ps( rsi ) );
m14 = _mm256_castps128_ps256( _mm_loadu_ps( rsi + 4 ) );
m25 = _mm256_castps128_ps256( _mm_loadu_ps( rsi + 8 ) );
// Load upper halves; VINSERTF128 supports memory operand for the second argument.
m03 = _mm256_insertf128_ps( m03, _mm_loadu_ps( rsi + 12 ), 1 );
m14 = _mm256_insertf128_ps( m14, _mm_loadu_ps( rsi + 16 ), 1 );
m25 = _mm256_insertf128_ps( m25, _mm_loadu_ps( rsi + 20 ), 1 );
rsi += 24;
// Transpose these SIMD vectors
__m256 xy = _mm256_shuffle_ps( m14, m25, _MM_SHUFFLE( 2, 1, 3, 2 ) );
__m256 yz = _mm256_shuffle_ps( m03, m14, _MM_SHUFFLE( 1, 0, 2, 1 ) );
__m256 x = _mm256_shuffle_ps( m03, xy, _MM_SHUFFLE( 2, 0, 3, 0 ) );
__m256 y = _mm256_shuffle_ps( yz, xy, _MM_SHUFFLE( 3, 1, 2, 0 ) );
__m256 z = _mm256_shuffle_ps( yz, m25, _MM_SHUFFLE( 3, 0, 3, 1 ) );
// Now we have 3 SIMD vectors with gathered x/y/z fields of 8 source 3D vectors
// Compute squares
x = _mm256_mul_ps( x, x );
y = _mm256_mul_ps( y, y );
z = _mm256_mul_ps( z, z );
// Add squares
x = _mm256_add_ps( x, y );
x = _mm256_add_ps( x, z );
// Store 8 values
_mm256_storeu_ps( rdi, x );
rdi += 8;
}
// Handle the remainder
for( ; rsi < rsiEndMinusOne; rsi += 3, rdi++ )
{
__m128 v = _mm_loadu_ps( rsi );
v = _mm_dp_ps( v, v, 0b01110001 );
_mm_store_ss( rdi, v );
}
if( rsi < rsiEnd )
{
__m128 v = loadFloat3( rsi );
v = _mm_dp_ps( v, v, 0b01110001 );
_mm_store_ss( rdi, v );
}
}

How to make use of SIMD capability for sum of squared differences between 8-bit components of RGBA pixels?

The code below tries to extract the red, green, blue (and alpha) channels of a pixel value and perform arithmetic against another set of RGBA values.
It seems that the code is slow around the logic where it's trying to perform the squaring and addition.
How could this be replaced with a faster version? This logic doesn't seem to be using SIMD capabilities at all.
typedef struct {
unsigned char b, g, r, a;
} pixel;
register pixel *pPixel;
register int i, red1, green1, blue1, alpha1;
register int red2, green2, blue2, alpha2;
register long oldD, newD;
red1 = GetRed( *pPixel );
green1 = GetGreen( *pPixel );
blue1 = GetBlue( *pPixel );
alpha1 = GetAlpha( *pPixel );
oldD = 2000000000;
for ( i = 0; i < newcolors; ++i ) {
red2 = GetRed( mycolormap[i].acolor );
green2 = GetGreen( mycolormap[i].acolor );
blue2 = GetBlue( mycolormap[i].acolor );
alpha2 = GetAlpha( mycolormap[i].acolor );
newD = ( red1 - red2 ) * ( red1 - red2 ) +
( green1 - green2 ) * ( green1 - green2 ) +
( blue1 - blue2 ) * ( blue1 - blue2 ) +
( alpha1 - alpha2 ) * ( alpha1 - alpha2 );
if ( newD < oldD ) {
oldD = newD;
}
}
The below section of code seems to require improvement:
newD = ( red1 - red2 ) * ( red1 - red2 ) +
( green1 - green2 ) * ( green1 - green2 ) +
( blue1 - blue2 ) * ( blue1 - blue2 ) +
( alpha1 - alpha2 ) * ( alpha1 - alpha2 );
It's harder than it seems. Unfortunately for you, automatic vectorizers in C++ compilers very rarely do a good job on integer arithmetic like you have there.
The following implementation only needs SSE4.1. If you have AVX2, it's possible to improve it substantially by upgrading all these vectors to 32-byte ones; however, this will complicate a couple of things, namely the remainder handling and the final reduction.
I assumed you want not only the minimum dot product but also the index of the pixel. If you only want the minimum dot product, remove the bestIndices field and the code which handles that field.
struct alignas( 4 ) Pixel
{
uint8_t b, g, r, a;
};
// Define __SSE4_1__ macro when building with MSVC for AVX1 or newer ISA
#if defined( _MSC_VER ) && defined( __AVX__ ) && !defined( __SSE4_1__ )
#define __SSE4_1__ 1
#endif
size_t findClosestPixel( const Pixel& ref, const Pixel* rsi, size_t length, int& bestValue )
{
if( 0 == length )
{
bestValue = INT_MAX;
return ~(size_t)0;
}
class Acc
{
// The reference pixel we're after, broadcasted and split into low/high pieces in 16-bit lanes
__m128i lowRef, highRef;
// The best dot product so far
__m128i bestSquares = _mm_set1_epi32( INT_MAX );
// Index of the pixels currently in bestSquares
__m128i bestIndices = _mm_set1_epi32( -1 );
const __m128i lowMask = _mm_set1_epi16( 0xFF );
// For lanes where dp < bestSquares, update bestSquares and bestIndices vectors
void updateFields( __m128i dp, __m128i indices )
{
const __m128i lt = _mm_cmplt_epi32( dp, bestSquares );
#ifndef __SSE4_1__
bestSquares = _mm_or_si128( _mm_and_si128( lt, dp ), _mm_andnot_si128( lt, bestSquares ) );
bestIndices = _mm_or_si128( _mm_and_si128( lt, indices ), _mm_andnot_si128( lt, bestIndices ) );
#else
bestSquares = _mm_min_epi32( dp, bestSquares );
bestIndices = _mm_blendv_epi8( bestIndices, indices, lt );
#endif
}
public:
Acc( const Pixel& ref )
{
__m128i tmp = _mm_set1_epi32( *(const int*)( &ref ) );
lowRef = _mm_and_si128( tmp, lowMask );
highRef = _mm_srli_epi16( tmp, 8 );
}
// Update the accumulator with another 4 pixels
void update( __m128i pixels, __m128i indices )
{
// Split into two vectors with 16-bit lanes:
// low contains blue and red channels, high contains green and alpha
__m128i low = _mm_and_si128( pixels, lowMask );
__m128i high = _mm_srli_epi16( pixels, 8 );
// Compute difference with the reference value we're after
low = _mm_sub_epi16( low, lowRef );
high = _mm_sub_epi16( high, highRef );
// Compute squares as 32-bit numbers, add adjacent pairs
low = _mm_madd_epi16( low, low );
high = _mm_madd_epi16( high, high );
// Adding them results in the dot product (sum of squares) for all 4 channels
__m128i dp = _mm_add_epi32( low, high );
// Update the state
updateFields( dp, indices );
}
// Compute horizontal minimum across lanes in these accumulators
uint32_t reduce( int& bestDp )
{
// Swap low/high halves
__m128i s2 = _mm_shuffle_epi32( bestSquares, _MM_SHUFFLE( 1, 0, 3, 2 ) );
__m128i i2 = _mm_shuffle_epi32( bestIndices, _MM_SHUFFLE( 1, 0, 3, 2 ) );
updateFields( s2, i2 );
// Swap even/odd lanes
s2 = _mm_shuffle_epi32( bestSquares, _MM_SHUFFLE( 2, 3, 0, 1 ) );
i2 = _mm_shuffle_epi32( bestIndices, _MM_SHUFFLE( 2, 3, 0, 1 ) );
updateFields( s2, i2 );
// Return lowest lanes from both vectors
bestDp = _mm_cvtsi128_si32( bestSquares );
return (uint32_t)_mm_cvtsi128_si32( bestIndices );
}
};
Acc impl{ ref };
const size_t lengthAligned = ( length / 4 ) * 4;
size_t i;
__m128i currentIndices = _mm_setr_epi32( 0, 1, 2, 3 );
for( i = 0; i < lengthAligned; i += 4 )
{
// Load 4 source pixels
__m128i src = _mm_loadu_si128( ( const __m128i* )( rsi + i ) );
// Update things
impl.update( src, currentIndices );
// Increment index vector by 4 pixels
currentIndices = _mm_add_epi32( currentIndices, _mm_set1_epi32( 4 ) );
}
const size_t remainder = length % 4;
if( remainder == 0 )
{
// The input was a multiple of 4 pixels
return impl.reduce( bestValue );
}
const int* const pi = (const int*)( rsi + i );
__m128i rv;
if( lengthAligned > 0 )
{
// We had at least 4 elements on input, can do unaligned load with negative offset
size_t offset = 4 - remainder;
currentIndices = _mm_sub_epi32( currentIndices, _mm_set1_epi32( (int)offset ) );
rv = _mm_loadu_si128( ( const __m128i* )( pi - offset ) );
}
else
{
// Less than 4 elements on input, doing partial load and broadcasting the last element
const size_t remainder = length % 4;
switch( remainder )
{
case 1:
rv = _mm_set1_epi32( pi[ 0 ] );
break;
case 2:
rv = _mm_loadl_epi64( ( const __m128i* )pi );
rv = _mm_shuffle_epi32( rv, _MM_SHUFFLE( 1, 1, 1, 0 ) );
break;
case 3:
rv = _mm_loadl_epi64( ( const __m128i* )pi );
#ifndef __SSE4_1__
rv = _mm_unpacklo_epi64( rv, _mm_set1_epi32( pi[ 2 ] ) );
#else
rv = _mm_insert_epi32( rv, pi[ 2 ], 2 );
rv = _mm_shuffle_epi32( rv, _MM_SHUFFLE( 2, 2, 1, 0 ) );
#endif
break;
}
}
impl.update( rv, currentIndices );
return impl.reduce( bestValue );
}
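A minimal usage sketch - the palette values here are made up; in the question's setting the Pixel array would be built from mycolormap:
std::vector<Pixel> palette = { { 10, 20, 30, 0 }, { 40, 50, 60, 0 }, { 200, 100, 50, 0 } };
Pixel ref{ 12, 22, 33, 0 };
int bestDistance;
size_t bestIndex = findClosestPixel( ref, palette.data(), palette.size(), bestDistance );
// bestIndex == 0, bestDistance == 2*2 + 2*2 + 3*3 + 0*0 == 17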

Accelerating matrix vector multiplication with ARM Neon Intrinsics on Raspberry Pi 4

I need to optimize a matrix-vector multiplication. The data looks like the following:
Vector has 81 columns
Matrix has 90,000 rows and 81 columns and is already transposed. So row-wise dot product can be used.
The output is hence a vector with 90,000 rows
Everything lies in 1D float arrays.
Some non-functional requirements also have to be met for this routine:
As few as possible standard libraries should be used (no std::vector for example)
No third-party library should be used (so no Eigen or Blas for me, either)
This is my code, simplified for the sake of readability (I assume the input is perfectly blocked):
// input_height = 90000
// input_width = 81
for (uint32_t y = 0; y < input_height; y += 4) {
float32x4_t sum0 = vmovq_n_f32(0);
float32x4_t sum1 = vmovq_n_f32(0);
float32x4_t sum2 = vmovq_n_f32(0);
float32x4_t sum3 = vmovq_n_f32(0);
for (uint32_t x = 0; x < input_width; x += 16) {
float32x4x4_t A = load_matrix_transpose(kernel + x);
float32x4x4_t B0 = load_matrix_transpose(input + y * input_width + x);
float32x4x4_t B1 = load_matrix_transpose(input + (y + 1) * input_width + x);
float32x4x4_t B2 = load_matrix_transpose(input + (y + 2) * input_width + x);
float32x4x4_t B3 = load_matrix_transpose(input + (y + 3) * input_width + x);
matrix_element_wise_multiplication(A, B0, sum0);
matrix_element_wise_multiplication(A, B1, sum1);
matrix_element_wise_multiplication(A, B2, sum2);
matrix_element_wise_multiplication(A, B3, sum3);
}
output[y] = vaddvq_f32(sum0);
output[y + 1] = vaddvq_f32(sum1);
output[y + 2] = vaddvq_f32(sum2);
output[y + 3] = vaddvq_f32(sum3);
}
where load_matrix_transpose and matrix_element_wise_multiplication are the following functions:
inline float32x4x4_t load_matrix_transpose(float *a) {
float32x4x4_t ret;
ret.val[0] = vld1q_f32(a);
ret.val[1] = vld1q_f32(a + 4);
ret.val[2] = vld1q_f32(a + 8);
ret.val[3] = vld1q_f32(a + 12);
return ret;
}
inline void matrix_element_wise_multiplication(float32x4x4_t & A, float32x4x4_t & B, float32x4_t & C) {
C = vmlaq_f32(C, A.val[0], B.val[0]);
C = vmlaq_f32(C, A.val[1], B.val[1]);
C = vmlaq_f32(C, A.val[2], B.val[2]);
C = vmlaq_f32(C, A.val[3], B.val[3]);
}
On my Raspberry Pi 4 (ARMv8, 8GB RAM, 4 cores), the code takes about 60ms with optimization level -O3.
Over a long run (many loops), the Neon register version is exactly twice as fast as the normal code.
My question is: is there any way to optimize the code further? I have tried many things but cannot make any improvement over the normal code.
Data locality is the highest priority when it comes to optimizations, and you should be aware of the register capacity since registers are BY FAR the fastest and most scarce resource.
aarch64: 32x128bit neon registers (512 bytes)
aarch32: 16x128bit neon registers (256 bytes)
An 81x90000 matrix, when transposed, requires holding 90000 intermediate values to do the multiplication, and since those 360000 bytes don't fit into a register bank of 512 bytes, there will be TONS of memory swapping, which translates into HUGE performance hits.
On the other hand, the 4*81 bytes of the vector fit nicely into those 512 bytes.
void matVecMult81x90000(float *pDst, float *pMat, float *pVec)
{
register float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
register float32x4_t mat0, mat1, mat2, mat3, mat4, rslt;
register float32x2_t drslt;
register uint32_t nRows = 90000;
vec80 = vdupq_n_f32(0.0f);
mat4 =vdupq_n_f32(0.0f);
vec0_3 = vld1q_f32(pVec); pVec += 4;
vec4_7 = vld1q_f32(pVec); pVec += 4;
vec8_11 = vld1q_f32(pVec); pVec += 4;
vec12_15 = vld1q_f32(pVec); pVec += 4;
vec16_19 = vld1q_f32(pVec); pVec += 4;
vec20_23 = vld1q_f32(pVec); pVec += 4;
vec24_27 = vld1q_f32(pVec); pVec += 4;
vec28_31 = vld1q_f32(pVec); pVec += 4;
vec32_35 = vld1q_f32(pVec); pVec += 4;
vec36_39 = vld1q_f32(pVec); pVec += 4;
vec40_43 = vld1q_f32(pVec); pVec += 4;
vec44_47 = vld1q_f32(pVec); pVec += 4;
vec48_51 = vld1q_f32(pVec); pVec += 4;
vec52_55 = vld1q_f32(pVec); pVec += 4;
vec56_59 = vld1q_f32(pVec); pVec += 4;
vec60_63 = vld1q_f32(pVec); pVec += 4;
vec64_67 = vld1q_f32(pVec); pVec += 4;
vec68_71 = vld1q_f32(pVec); pVec += 4;
vec72_75 = vld1q_f32(pVec); pVec += 4;
vec76_79 = vld1q_f32(pVec); pVec += 4;
vec80 = vld1q_lane_f32(pVec, vec80, 0);
do {
mat0 = vld1q_f32(pMat); pMat += 4;
mat1 = vld1q_f32(pMat); pMat += 4;
mat2 = vld1q_f32(pMat); pMat += 4;
mat3 = vld1q_f32(pMat); pMat += 4;
rslt = vmulq_f32(mat0, vec0_3);
rslt += vmulq_f32(mat1, vec4_7);
rslt += vmulq_f32(mat2, vec8_11);
rslt += vmulq_f32(mat3, vec12_15);
mat0 = vld1q_f32(pMat); pMat += 4;
mat1 = vld1q_f32(pMat); pMat += 4;
mat2 = vld1q_f32(pMat); pMat += 4;
mat3 = vld1q_f32(pMat); pMat += 4;
rslt += vmulq_f32(mat0, vec16_19);
rslt += vmulq_f32(mat1, vec20_23);
rslt += vmulq_f32(mat2, vec24_27);
rslt += vmulq_f32(mat3, vec28_31);
mat0 = vld1q_f32(pMat); pMat += 4;
mat1 = vld1q_f32(pMat); pMat += 4;
mat2 = vld1q_f32(pMat); pMat += 4;
mat3 = vld1q_f32(pMat); pMat += 4;
rslt += vmulq_f32(mat0, vec32_35);
rslt += vmulq_f32(mat1, vec36_39);
rslt += vmulq_f32(mat2, vec40_43);
rslt += vmulq_f32(mat3, vec44_47);
mat0 = vld1q_f32(pMat); pMat += 4;
mat1 = vld1q_f32(pMat); pMat += 4;
mat2 = vld1q_f32(pMat); pMat += 4;
mat3 = vld1q_f32(pMat); pMat += 4;
rslt += vmulq_f32(mat0, vec48_51);
rslt += vmulq_f32(mat1, vec52_55);
rslt += vmulq_f32(mat2, vec56_59);
rslt += vmulq_f32(mat3, vec60_63);
mat0 = vld1q_f32(pMat); pMat += 4;
mat1 = vld1q_f32(pMat); pMat += 4;
mat2 = vld1q_f32(pMat); pMat += 4;
mat3 = vld1q_f32(pMat); pMat += 4;
mat4 = vld1q_lane_f32(pMat, mat4, 0); pMat += 1;
rslt += vmulq_f32(mat0, vec64_67);
rslt += vmulq_f32(mat1, vec68_71);
rslt += vmulq_f32(mat2, vec72_75);
rslt += vmulq_f32(mat3, vec76_79);
rslt += vmulq_f32(mat4, vec80);
*pDst++ = vaddvq_f32(rslt);
} while (--nRows);
}
Unfortunately, compilers don't play along nicely (both GCC and Clang).
The generated code shows some stack swapping on the vector inside the loop.
Below is the same function in hand written assembly without any stack swapping:
.arch armv8-a
.global matVecMult81x90000_asm
.text
.balign 64
.func
matVecMult81x90000_asm:
// init loop counter
mov w3, #90000 & 0xffff
movk w3, #90000>>16, lsl #16
// preserve registers
stp d8, d9, [sp, #-48]!
stp d10, d11, [sp, #1*16]
stp d12, d13, [sp, #2*16]
// load vectors
ldp q0, q1, [x2, #0*32]
ldp q2, q3, [x2, #1*32]
ldp q4, q5, [x2, #2*32]
ldp q6, q7, [x2, #3*32]
ldp q8, q9, [x2, #4*32]
ldp q10, q11, [x2, #5*32]
ldp q12, q13, [x2, #6*32]
ldp q16, q17, [x2, #7*32]
ldp q18, q19, [x2, #8*32]
ldp q20, q21, [x2, #9*32]
ldr s22, [x2, #10*32]
// loop
.balign 64
1:
ldp q24, q25, [x1, #0*32]
ldp q26, q27, [x1, #1*32]
ldp q28, q29, [x1, #2*32]
ldp q30, q31, [x1, #3*32]
subs w3, w3, #1
fmul v23.4s, v24.4s, v0.4s
fmla v23.4s, v25.4s, v1.4s
fmla v23.4s, v26.4s, v2.4s
fmla v23.4s, v27.4s, v3.4s
fmla v23.4s, v28.4s, v4.4s
fmla v23.4s, v29.4s, v5.4s
fmla v23.4s, v30.4s, v6.4s
fmla v23.4s, v31.4s, v7.4s
ldp q24, q25, [x1, #4*32]
ldp q26, q27, [x1, #5*32]
ldp q28, q29, [x1, #6*32]
ldp q30, q31, [x1, #7*32]
fmla v23.4s, v24.4s, v8.4s
fmla v23.4s, v25.4s, v9.4s
fmla v23.4s, v26.4s, v10.4s
fmla v23.4s, v27.4s, v11.4s
fmla v23.4s, v28.4s, v12.4s
fmla v23.4s, v29.4s, v13.4s
fmla v23.4s, v30.4s, v16.4s
fmla v23.4s, v31.4s, v17.4s
ldp q24, q25, [x1, #8*32]
ldp q26, q27, [x1, #9*32]
ldr s28, [x1, #10*32]
fmla v23.4s, v24.4s, v18.4s
fmla v23.4s, v25.4s, v19.4s
fmla v23.4s, v26.4s, v20.4s
fmla v23.4s, v27.4s, v21.4s
fmla v23.4s, v28.4s, v22.4s
add x1, x1, #81*4
faddp v23.4s, v23.4s, v23.4s
faddp v23.2s, v23.2s, v23.2s
str s23, [x0], #4
b.ne 1b
.balign 8
//restore registers
ldp d10, d11, [sp, #1*16]
ldp d12, d13, [sp, #2*16]
ldp d8, d9, [sp], #48
// return
ret
.endfunc
.end
Test results on RK3368:
Clang intrinsics: 10.41ms
assembly: 9.59ms
The compilers didn't perform that badly in this case, but more often than not they are unbelievably stupid. I strongly recommend learning assembly.
Here’s an optimization of Jake’s answer.
Using 4 accumulators instead of a single one helps because FMA instructions have a latency much higher than their throughput. According to the Cortex-A72 optimization guide, the latency of the FMLA instruction is 7 cycles for the complete thing, or 3 cycles when the dependency is on the accumulator (if you wonder what the hell Q-form and D-form are, Q is for 16-byte vectors and D is for 8-byte vectors). The throughput is much higher: it's 1 cycle, so the CPU can run one FMA every cycle.
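As a rough back-of-the-envelope check, assuming the 3-cycle accumulator-forwarding latency and 1-per-cycle throughput quoted above:
$$\text{1 accumulator: } \frac{1\ \text{FMLA}}{3\ \text{cycles}} \approx 0.33\ \text{FMLA/cycle}, \qquad \text{4 accumulators: } \min\!\left(\tfrac{4}{3},\, 1\right) = 1\ \text{FMLA/cycle}$$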
The following version uses 4 independent accumulators instead of a single one, which should improve the throughput despite the 3 extra instructions needed at the end of the loop to sum the accumulators.
I've also used a few macros to help with repetitive code. Untested.
void matVecMult81( float *pDst, const float *pMat, const float *pVec, size_t nRows = 90000 )
{
// 30 vector registers in total; ARM64 has 32 of them, so we're good.
float32x4_t vec0_3, vec4_7, vec8_11, vec12_15, vec16_19, vec20_23, vec24_27, vec28_31, vec32_35, vec36_39, vec40_43, vec44_47, vec48_51, vec52_55, vec56_59, vec60_63, vec64_67, vec68_71, vec72_75, vec76_79, vec80;
float32x4_t mat0, mat1, mat2, mat3, mat4;
float32x4_t res0, res1, res2, res3;
vec80 = mat4 = vdupq_n_f32( 0.0f );
// Load 16 numbers from pVec into 3 vector registers, incrementing the source pointer
#define LOAD_VEC_16( v0, v1, v2, v3 ) \
v0 = vld1q_f32( pVec ); pVec += 4; \
v1 = vld1q_f32( pVec ); pVec += 4; \
v2 = vld1q_f32( pVec ); pVec += 4; \
v3 = vld1q_f32( pVec ); pVec += 4
// Load the complete vector into registers using the above macro
LOAD_VEC_16( vec0_3, vec4_7, vec8_11, vec12_15 );
LOAD_VEC_16( vec16_19, vec20_23, vec24_27, vec28_31 );
LOAD_VEC_16( vec32_35, vec36_39, vec40_43, vec44_47 );
LOAD_VEC_16( vec48_51, vec52_55, vec56_59, vec60_63 );
LOAD_VEC_16( vec64_67, vec68_71, vec72_75, vec76_79 );
// Load the final scalar of the vector
vec80 = vld1q_lane_f32( pVec, vec80, 0 );
#undef LOAD_VEC_16
// Load 16 numbers from pMat into mat0 - mat3, incrementing the source pointer
#define LOAD_MATRIX_16() \
mat0 = vld1q_f32( pMat ); pMat += 4; \
mat1 = vld1q_f32( pMat ); pMat += 4; \
mat2 = vld1q_f32( pMat ); pMat += 4; \
mat3 = vld1q_f32( pMat ); pMat += 4
// Multiply 16 numbers in mat0 - mat3 by the specified pieces of the vector, and accumulate into res0 - res3
// Multiple accumulators are critical for performance; the 4 instructions produced by this macro don't have data dependencies between them.
#define HANDLE_BLOCK_16( v0, v1, v2, v3 ) \
res0 = vfmaq_f32( res0, mat0, v0 ); \
res1 = vfmaq_f32( res1, mat1, v1 ); \
res2 = vfmaq_f32( res2, mat2, v2 ); \
res3 = vfmaq_f32( res3, mat3, v3 )
const float* const pMatEnd = pMat + nRows * 81;
while( pMat < pMatEnd )
{
// Initial 16 elements only need multiplication.
LOAD_MATRIX_16();
res0 = vmulq_f32( mat0, vec0_3 );
res1 = vmulq_f32( mat1, vec4_7 );
res2 = vmulq_f32( mat2, vec8_11 );
res3 = vmulq_f32( mat3, vec12_15 );
// Handle the rest of the row using FMA instructions.
LOAD_MATRIX_16();
HANDLE_BLOCK_16( vec16_19, vec20_23, vec24_27, vec28_31 );
LOAD_MATRIX_16();
HANDLE_BLOCK_16( vec32_35, vec36_39, vec40_43, vec44_47 );
LOAD_MATRIX_16();
HANDLE_BLOCK_16( vec48_51, vec52_55, vec56_59, vec60_63 );
// The final block of the row has 17 scalars instead of 16
LOAD_MATRIX_16();
mat4 = vld1q_lane_f32( pMat, mat4, 0 ); pMat++;
HANDLE_BLOCK_16( vec64_67, vec68_71, vec72_75, vec76_79 );
res0 = vfmaq_f32( res0, mat4, vec80 );
// Vertically add 4 accumulators into res0
res1 = vaddq_f32( res1, res2 );
res0 = vaddq_f32( res3, res0 );
res0 = vaddq_f32( res1, res0 );
// Store the horizontal sum of the accumulator
*pDst = vaddvq_f32( res0 );
pDst++;
}
#undef LOAD_MATRIX_16
#undef HANDLE_BLOCK_16
}
The assembly generated from that source with GCC 10.1 looks more or less OK.
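A minimal driver sketch for the intrinsics version above (sizes taken from the question; the data is just placeholder values):
#include <vector>
int main()
{
const size_t rows = 90000, cols = 81;
std::vector<float> mat(rows * cols, 1.0f); // the already-transposed 90000x81 matrix, row-major
std::vector<float> vec(cols, 1.0f);        // the 81-element vector
std::vector<float> dst(rows);
matVecMult81(dst.data(), mat.data(), vec.data(), rows);
// with all-ones inputs, every dst[i] should now be 81.0f
return 0;
}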

c++: Is table lookup vectorizable for small lookup-table

I want to vectorize the following snippet of code with SIMD intrinsics. Is this possible?
unsigned char chain[3][3] = {
3, 2, 1, // y --> x
4, -1, 0, // |
5, 6, 7 // |
}; // v
std::vector<int> x;
std::vector<int> y;
//initialize x, y
std::vector<int> chain_code(x.size());
for(std::size_t i = 0; i < x.size(); ++i)
chain_code[i] = chain[x[i]][y[i]];
EDIT:
Support for: SSE - SSE4.2 and AVX
Architecture: Sandy Bridge i5 2500
If you make your x, y, chain_code 8-bit integers (instead of 32-bit ones), then you can process 16 values at once.
Here is the code using SSSE3:
std::vector<uint8_t> x;
std::vector<uint8_t> y;
...
int n = x.size();
std::vector<uint8_t> chain_code(n);
//initialize table register
__m128i table = _mm_setr_epi8(
chain[0][0], chain[0][1], chain[0][2], 99,
chain[1][0], chain[1][1], chain[1][2], 99,
chain[2][0], chain[2][1], chain[2][2], 99,
99, 99, 99, 99
);
int b = (n / 16) * 16;
for (int i = 0; i < b; i += 16) {
//load 16 X/Y bytes
__m128i regX = _mm_loadu_si128((__m128i*)&x[i]);
__m128i regY = _mm_loadu_si128((__m128i*)&y[i]);
//shift all X values left by 2 bits (as 16-bit integers)
__m128i regX4 = _mm_slli_epi16(regX, 2);
//calculate linear indices (x * 4 + y)
__m128i indices = _mm_add_epi8(regX4, regY);
//perform 16 lookups
__m128i res = _mm_shuffle_epi8(table, indices);
//store results
_mm_storeu_si128((__m128i*)&chain_code[i], res);
}
for (int i = b; i < n; i++)
chain_code[i] = chain[x[i]][y[i]];
The fully working version of this code is here. The generated assembly is quite simple (MSVC2013 x64):
movdqu xmm1, XMMWORD PTR [rdi+rax]
movdqu xmm0, XMMWORD PTR [rax]
psllw xmm1, 2
paddb xmm1, xmm0
movdqa xmm0, xmm6
pshufb xmm0, xmm1
movdqu XMMWORD PTR [rsi+rax], xmm0
P.S. I guess you'll have various performance issues with std::vector containers. Perhaps unaligned accesses are no longer expensive, but filling the vector with zeros will certainly happen, and it can take more time than the vectorized code.
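To illustrate that last point, a small sketch (the raw-array alternative is just an assumption on my part, not something this answer requires):
std::vector<uint8_t> chain_code(n); // value-initializes: a memset of n bytes runs before the loop
std::unique_ptr<uint8_t[]> raw(new uint8_t[n]); // needs <memory>; leaves the bytes uninitialized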

SIMD optimization of cvtColor using ARM NEON intrinsics

I'm working on a SIMD optimization of BGR to grayscale conversion which is equivalent to OpenCV's cvtColor() function. There is an Intel SSE version of this function and I'm referring to it. (What I'm doing is basically converting SSE code to NEON code.)
I've almost finished writing the code, and can compile it with g++, but I can't get the proper output. Does anyone have any ideas what the error could be?
What I'm getting (incorrect) vs. what I should be getting: (output image comparison omitted)
Here's my code:
#include <opencv/cv.hpp>
#include <opencv/highgui.h>
#include <arm_neon.h>
//#include <iostream>
using namespace std;
//using namespace cv;
#define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
void cvtBGR2GrayNEON(cv::Mat& src, cv::Mat& dest)
{
const int size = src.size().area()*src.channels();
uchar* s = src.ptr<uchar>(0);
uchar* d = dest.ptr<uchar>(0);
const int8x16_t mask1 = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
const int8x16_t smask1 = {6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15};
const int8x16_t ssmask1 = {11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10};
const int8x16_t mask2 = {0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13};
const int8x16_t ssmask2 = {0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10};
const int8x16_t bmask1 = {255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0};
const int8x16_t bmask2 = {255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0};
const int8x16_t bmask3 = {255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0};
const int8x16_t bmask4 = {255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0};
const int shift = 8;
const int amp = 1<<shift;
const int16_t _R_ = (int16_t)(amp*0.299);
const int16_t _G_ = (int16_t)(amp*0.587);
const int16_t _B_ = (int16_t)(amp*0.114);
const int16x8_t R = vdupq_n_s16(_R_);
const int16x8_t G = vdupq_n_s16(_G_);
const int16x8_t B = vdupq_n_s16(_B_);
const int8x16_t zero = vdupq_n_s8(0);
for(int i = 0; i < size; i += 48)
{
int8x16_t a = vld1q_s8((int8_t *) s + i);
int8x16_t b = vld1q_s8((int8_t *) s + i + 16);
int8x16_t c = vld1q_s8((int8_t *) s + i + 32);
a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a),vget_low_s8(mask1)),vtbl2_s8(int8x16_to_8x8x2(a),vget_high_s8(mask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(mask2)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(mask2)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(mask2)));
//BBBBBB
const int8x16_t aaaa = vbslq_s8(c, vbslq_s8(b, a, bmask1), bmask2);
a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(smask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(smask1)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(smask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(smask1)));
//GGGGGG
const int8x16_t bbbb = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask2);
a = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(a), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(a), vget_high_s8(ssmask1)));
c = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(c), vget_low_s8(ssmask1)), vtbl2_s8(int8x16_to_8x8x2(c), vget_high_s8(ssmask1)));
b = vcombine_s8(vtbl2_s8(int8x16_to_8x8x2(b), vget_low_s8(ssmask2)), vtbl2_s8(int8x16_to_8x8x2(b), vget_high_s8(ssmask2)));
//RRRRRR
const int8x16_t cccc = vbslq_s8(c, vbslq_s8(b, a, bmask3), bmask4);
/*
int8x8x2_t a1 = vzip_s8(vget_high_s8(aaaa), vget_high_s8(zero));
int8x8x2_t a2 = vzip_s8(vget_low_s8(aaaa), vget_low_s8(zero));
*/
int8x16_t a1 = aaaa;
int8x16_t a2 = zero;
int8x16x2_t temp1 = vzipq_s8(a1, a2);
a1 = temp1.val[0];
a2 = temp1.val[1];
int16x8_t aa1 = vmulq_s16((int16x8_t)a2, B);
int16x8_t aa2 = vmulq_s16((int16x8_t)a1, B);
int8x16_t b1 = bbbb;
int8x16_t b2 = zero;
int8x16x2_t temp2 = vzipq_s8(b1, b2);
b1 = temp2.val[0];
b2 = temp2.val[1];
int16x8_t bb1 = vmulq_s16((int16x8_t)b2, G);
int16x8_t bb2 = vmulq_s16((int16x8_t)b1, G);
int8x16_t c1 = cccc;
int8x16_t c2 = zero;
int8x16x2_t temp3 = vzipq_s8(c1, c2);
c1 = temp3.val[0];
c2 = temp3.val[1];
int16x8_t cc1 = vmulq_s16((int16x8_t)c2, R);
int16x8_t cc2 = vmulq_s16((int16x8_t)c1, R);
aa1 = vaddq_s16(aa1, bb1);
aa1 = vaddq_s16(aa1, cc1);
aa2 = vaddq_s16(aa2, bb2);
aa2 = vaddq_s16(aa2, cc2);
const int shift1 = 8;
aa1 = vshrq_n_s16(aa1, shift1);
aa2 = vshrq_n_s16(aa2, shift1);
uint8x8_t aaa1 = vqmovun_s16(aa1);
uint8x8_t aaa2 = vqmovun_s16(aa2);
uint8x16_t result = vcombine_u8(aaa1, aaa2);
vst1q_u8((uint8_t *)(d), result);
d+=16;
}
}
int main()
{
cv::Mat src = cv::imread("Lenna.bmp");
cv::Mat dest(src.rows, src.cols, CV_8UC1);
cvtBGR2GrayNEON(src, dest);
cv::imwrite("grey.jpg", dest);
return 0;
}
Here is equivalent SSE code (from here):
void cvtBGR2GraySSEShort(Mat& src, Mat& dest)
{
const int size = src.size().area()*src.channels();
uchar* s = src.ptr<uchar>(0);
uchar* d = dest.ptr<uchar>(0);
//data structure
//BGR BGR BGR BGR BGR B
//GR BGR BGR BGR BGR BG
//R BGR BGR BGR BGR BGR
//shuffle to BBBBBBGGGGGRRRRR
const __m128i mask1 = _mm_setr_epi8(0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14);
const __m128i smask1 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);
const __m128i ssmask1 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);
//shuffle to GGGGGGBBBBBRRRRR
const __m128i mask2 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);
//const __m128i smask2 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,11,12,13,14,15);same as smask1
const __m128i ssmask2 = _mm_setr_epi8(0,1,2,3,4,11,12,13,14,15,5,6,7,8,9,10);
//shuffle to RRRRRRGGGGGBBBBB
//__m128i mask3 = _mm_setr_epi8(0,3,6,9,12,15, 2,5,8,11,14,1,4,7,10,13);//same as mask2
//const __m128i smask3 = _mm_setr_epi8(6,7,8,9,10,0,1,2,3,4,5,6,7,8,9,10);//same as smask1
//const __m128i ssmask3 = _mm_setr_epi8(11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10);//same as ssmask1
//blend mask
const __m128i bmask1 = _mm_setr_epi8
(255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0);
const __m128i bmask2 = _mm_setr_epi8
(255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0);
const __m128i bmask3 = _mm_setr_epi8
(255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0);
const __m128i bmask4 = _mm_setr_epi8
(255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0);
const int shift = 8;
const int amp = 1<<shift;
const int _R_=(int)(amp*0.299);
const int _G_=(int)(amp*0.587);
const int _B_=(int)(amp*0.114);
const __m128i R = _mm_set1_epi16(_R_);
const __m128i G = _mm_set1_epi16(_G_);
const __m128i B = _mm_set1_epi16(_B_);
const __m128i zero = _mm_setzero_si128();
for(int i=0;i<size;i+=48)
{
__m128i a = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i)),mask1);
__m128i b = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+16)),mask2);
__m128i c = _mm_shuffle_epi8(_mm_load_si128((__m128i*)(s+i+32)),mask2);
const __m128i aaaa = _mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask1),bmask2);
a = _mm_shuffle_epi8(a,smask1);
b = _mm_shuffle_epi8(b,smask1);
c = _mm_shuffle_epi8(c,smask1);
const __m128i bbbb =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask2);
a = _mm_shuffle_epi8(a,ssmask1);
c = _mm_shuffle_epi8(c,ssmask1);
b = _mm_shuffle_epi8(b,ssmask2);
const __m128i cccc =_mm_blendv_epi8(c,_mm_blendv_epi8(b,a,bmask3),bmask4);
__m128i a1 = _mm_unpackhi_epi8(aaaa,zero);
__m128i a2 = _mm_unpacklo_epi8(aaaa,zero);
a1 = _mm_mullo_epi16(a1,B);
a2 = _mm_mullo_epi16(a2,B);
__m128i b1 = _mm_unpackhi_epi8(bbbb,zero);
__m128i b2 = _mm_unpacklo_epi8(bbbb,zero);
b1 = _mm_mullo_epi16(b1,G);
b2 = _mm_mullo_epi16(b2,G);
__m128i c1 = _mm_unpackhi_epi8(cccc,zero);
__m128i c2 = _mm_unpacklo_epi8(cccc,zero);
c1 = _mm_mullo_epi16(c1,R);
c2 = _mm_mullo_epi16(c2,R);
a1 = _mm_add_epi16(a1,b1);
a1 = _mm_add_epi16(a1,c1);
a2 = _mm_add_epi16(a2,b2);
a2 = _mm_add_epi16(a2,c2);
a1 = _mm_srli_epi16(a1,8);
a2 = _mm_srli_epi16(a2,8);
a = _mm_packus_epi16(a1,a2);
_mm_stream_si128((__m128i*)(d),a);
d+=16;
}
}
OK, below is a FULLY OPTIMIZED version of that function I just wrote. (Beware that this function simply returns if size is smaller than 32.)
/*
* Created on: 2014. 7. 27.
* Author: Jake Lee
* Project FANIC - Fastest ARM NEON Implementaion Challenge
*/
// void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
// Y = 0.114*B + 0.587*G + 0.299*R
.text
.arm
.global fanicCvtBGR2GrayNEON
pDst .req r0
pSrc .req r1
size .req r2
.align 5
.func
fanicCvtBGR2GrayNEON:
pld [pSrc]
subs size, size, #32
pld [pSrc, #64]
bxmi lr
pld [pSrc, #64*2]
vmov.i8 d0, #29
vmov.i8 d1, #150
vmov.i8 d2, #77
.align 5
1:
vld3.8 {d20, d21, d22}, [pSrc]!
vld3.8 {d23, d24, d25}, [pSrc]!
vld3.8 {d26, d27, d28}, [pSrc]!
vld3.8 {d29, d30, d31}, [pSrc]!
vmull.u8 q8, d20, d0
vmlal.u8 q8, d21, d1
vmlal.u8 q8, d22, d2
vmull.u8 q9, d23, d0
vmlal.u8 q9, d24, d1
vmlal.u8 q9, d25, d2
vmull.u8 q10, d26, d0
vmlal.u8 q10, d27, d1
vmlal.u8 q10, d28, d2
vmull.u8 q11, d29, d0
vmlal.u8 q11, d30, d1
vmlal.u8 q11, d31, d2
vrshrn.u16 d24, q8, #8
vrshrn.u16 d25, q9, #8
vrshrn.u16 d26, q10, #8
vrshrn.u16 d27, q11, #8
subs size, size, #32
pld [pSrc, #64*3]
pld [pSrc, #64*4]
vst1.8 {q12, q13}, [pDst]!
bpl 1b
cmp size, #-32
add pSrc, pSrc, size
bxle lr
add pSrc, pSrc, size, lsl #1
add pDst, pDst, size
b 1b
.endfunc
.end
As you can see, it's so much easier and shorter to write NEON code in assembly than in intrinsics, despite the heavy unrolling.
Have fun.
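For completeness, a hypothetical way to call it from the question's C++ code - assuming the .s file is assembled and linked in, and that size is the pixel count, which is what the 32-pixels-per-iteration loop consumes:
extern "C" void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
cv::Mat src = cv::imread("Lenna.bmp");
cv::Mat dest(src.rows, src.cols, CV_8UC1);
fanicCvtBGR2GrayNEON(dest.ptr<uchar>(0), src.ptr<uchar>(0), (unsigned int)src.size().area());
cv::imwrite("grey.jpg", dest);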