Why does _umul128 work slower than scalar code for a mul128x64x2 function? - c++

This is my second attempt at implementing a fast mul128x64x2 function. The first time I asked the question without a comparison against MSVC's _umul128. Now I have made such a comparison, and the results show that the _umul128 function is slower than both the native scalar and the handmade SIMD AVX 1.0 code.
Below is my test code:
#include <iostream>
#include <chrono>
#include <intrin.h>
#include <emmintrin.h>
#include <immintrin.h>
#pragma intrinsic(_umul128)
constexpr uint32_t LOW[4] = { 4294967295u, 0u, 4294967295u, 0u };
__forceinline void multiply128x128( const uint32_t ABCD[4], const uint32_t EFGH[4], uint32_t OUT[2][4] ) noexcept
{
__m128i L = _mm_lddqu_si128( reinterpret_cast< __m128i const* >( LOW ) );
__m128i IN = _mm_lddqu_si128( reinterpret_cast< __m128i const* >( EFGH ) );
__m128i A = _mm_set1_epi32( ABCD[0] );
__m128i B = _mm_set1_epi32( ABCD[1] );
__m128i C = _mm_set1_epi32( ABCD[2] );
__m128i D = _mm_set1_epi32( ABCD[3] );
__m128i ED = _mm_mul_epu32( IN, D );
__m128i EC = _mm_mul_epu32( IN, C );
__m128i EB = _mm_mul_epu32( IN, B );
__m128i EA = _mm_mul_epu32( IN, A );
IN = _mm_srli_epi64( IN, 32 );
__m128i FD = _mm_mul_epu32( IN, D );
__m128i FC = _mm_mul_epu32( IN, C );
__m128i FB = _mm_mul_epu32( IN, B );
__m128i FA = _mm_mul_epu32( IN, A );
__m128i FD_H = _mm_srli_epi64( FD, 32 );
__m128i FD_L = _mm_and_si128 ( L, FD );
__m128i FC_H = _mm_srli_epi64( FC, 32 );
__m128i FC_L = _mm_and_si128 ( L, FC );
__m128i FB_H = _mm_srli_epi64( FB, 32 );
__m128i FB_L = _mm_and_si128 ( L, FB );
__m128i FA_H = _mm_srli_epi64( FA, 32 );
__m128i FA_L = _mm_and_si128 ( L, FA );
__m128i ED_H = _mm_srli_epi64( ED, 32 );
__m128i ED_L = _mm_and_si128 ( L, ED );
__m128i EC_H = _mm_srli_epi64( EC, 32 );
__m128i EC_L = _mm_and_si128 ( L, EC );
__m128i EB_H = _mm_srli_epi64( EB, 32 );
__m128i EB_L = _mm_and_si128 ( L, EB );
__m128i EA_H = _mm_srli_epi64( EA, 32 );
__m128i EA_L = _mm_and_si128 ( L, EA );
__m128i SUM_FC_L_FD_H = _mm_add_epi64( FC_L, FD_H );
__m128i SUM_FB_L_FC_H = _mm_add_epi64( FB_L, FC_H );
__m128i SUM_FA_L_FB_H = _mm_add_epi64( FA_L, FB_H );
__m128i SUM_EC_L_ED_H = _mm_add_epi64( EC_L, ED_H );
__m128i SUM_EB_L_EC_H = _mm_add_epi64( EB_L, EC_H );
__m128i SUM_EA_L_EB_H = _mm_add_epi64( EA_L, EB_H );
__m128i SUM_FC_L_FD_H_ED_L = _mm_add_epi64( SUM_FC_L_FD_H, ED_L );
__m128i SUM_FB_L_FC_H_EC_L_ED_H = _mm_add_epi64( SUM_FB_L_FC_H, SUM_EC_L_ED_H );
__m128i SUM_FA_L_FB_H_EB_L_EC_H = _mm_add_epi64( SUM_FA_L_FB_H, SUM_EB_L_EC_H );
__m128i SUM_FA_H_EA_L_EB_H = _mm_add_epi64( FA_H, SUM_EA_L_EB_H );
__m128i SUM_FC_L_FD_H_ED_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L, 32 );
SUM_FC_L_FD_H_ED_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L, SUM_FB_L_FC_H_EC_L_ED_H );
__m128i SUM_FC_L_FD_H_ED_L_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L, 32 );
SUM_FC_L_FD_H_ED_L_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L, SUM_FA_L_FB_H_EB_L_EC_H );
__m128i SUM_FC_L_FD_H_ED_L_L_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L_L, 32 );
SUM_FC_L_FD_H_ED_L_L_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L_L, SUM_FA_H_EA_L_EB_H );
__m128i SUM_FC_L_FD_H_ED_L_L_L_L_L = _mm_srli_epi64( SUM_FC_L_FD_H_ED_L_L_L_L, 32 );
SUM_FC_L_FD_H_ED_L_L_L_L_L = _mm_add_epi64 ( SUM_FC_L_FD_H_ED_L_L_L_L_L, EA_H );
OUT[0][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L.m128i_u32[0];
OUT[0][1] = SUM_FC_L_FD_H_ED_L_L_L_L.m128i_u32[0];
OUT[0][2] = SUM_FC_L_FD_H_ED_L_L_L.m128i_u32[0];
OUT[0][3] = SUM_FC_L_FD_H_ED_L_L.m128i_u32[0];
OUT[1][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L.m128i_u32[2];
OUT[1][1] = SUM_FC_L_FD_H_ED_L_L_L_L.m128i_u32[2];
OUT[1][2] = SUM_FC_L_FD_H_ED_L_L_L.m128i_u32[2];
OUT[1][3] = SUM_FC_L_FD_H_ED_L_L.m128i_u32[2];
}
__forceinline void multiply128x128_1( const uint32_t ABCD[4], const uint32_t EFGH[4], uint32_t OUT[2][4] ) noexcept
{
uint64_t ED = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t EA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[0] );
uint64_t FD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t FA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[1] );
uint64_t GD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t GA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[2] );
uint64_t HD = static_cast<uint64_t>( ABCD[3] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HC = static_cast<uint64_t>( ABCD[2] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HB = static_cast<uint64_t>( ABCD[1] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t HA = static_cast<uint64_t>( ABCD[0] ) * static_cast<uint64_t>( EFGH[3] );
uint64_t SUM_FC_L_FD_H = ( FC & 0xFFFFFFFF ) + ( FD >> 32u );
uint64_t SUM_FB_L_FC_H = ( FB & 0xFFFFFFFF ) + ( FC >> 32u );
uint64_t SUM_FA_L_FB_H = ( FA & 0xFFFFFFFF ) + ( FB >> 32u );
uint64_t SUM_EC_L_ED_H = ( EC & 0xFFFFFFFF ) + ( ED >> 32u );
uint64_t SUM_EB_L_EC_H = ( EB & 0xFFFFFFFF ) + ( EC >> 32u );
uint64_t SUM_EA_L_EB_H = ( EA & 0xFFFFFFFF ) + ( EB >> 32u );
uint64_t SUM_HC_L_HD_H = ( HC & 0xFFFFFFFF ) + ( HD >> 32u );
uint64_t SUM_HB_L_HC_H = ( HB & 0xFFFFFFFF ) + ( HC >> 32u );
uint64_t SUM_HA_L_HB_H = ( HA & 0xFFFFFFFF ) + ( HB >> 32u );
uint64_t SUM_GC_L_GD_H = ( GC & 0xFFFFFFFF ) + ( GD >> 32u );
uint64_t SUM_GB_L_GC_H = ( GB & 0xFFFFFFFF ) + ( GC >> 32u );
uint64_t SUM_GA_L_GB_H = ( GA & 0xFFFFFFFF ) + ( GB >> 32u );
uint64_t SUM_FC_L_FD_H_ED_L = SUM_FC_L_FD_H + ( ED & 0xFFFFFFFF );
uint64_t SUM_FB_L_FC_H_EC_L_ED_H = SUM_FB_L_FC_H + SUM_EC_L_ED_H;
uint64_t SUM_FA_L_FB_H_EB_L_EC_H = SUM_FA_L_FB_H + SUM_EB_L_EC_H;
uint64_t SUM_FA_H_EA_L_EB_H = SUM_EA_L_EB_H + ( FA >> 32u );
uint64_t SUM_FC_L_FD_H_ED_L_L = ( SUM_FC_L_FD_H_ED_L >> 32u ) + SUM_FB_L_FC_H_EC_L_ED_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L = ( SUM_FC_L_FD_H_ED_L_L >> 32u ) + SUM_FA_L_FB_H_EB_L_EC_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L_L = ( SUM_FC_L_FD_H_ED_L_L_L >> 32u ) + SUM_FA_H_EA_L_EB_H;
uint64_t SUM_FC_L_FD_H_ED_L_L_L_L_L = ( SUM_FC_L_FD_H_ED_L_L_L_L >> 32u ) + ( EA >> 32u );
uint64_t SUM_HC_L_HD_H_GD_L = SUM_HC_L_HD_H + ( GD & 0xFFFFFFFF );
uint64_t SUM_HB_L_HC_H_GC_L_GD_H = SUM_HB_L_HC_H + SUM_GC_L_GD_H;
uint64_t SUM_HA_L_HB_H_GB_L_GC_H = SUM_HA_L_HB_H + SUM_GB_L_GC_H;
uint64_t SUM_HA_H_GA_L_GB_H = SUM_GA_L_GB_H + ( HA >> 32u );
uint64_t SUM_HC_L_HD_H_GD_L_L = ( SUM_HC_L_HD_H_GD_L >> 32u ) + SUM_HB_L_HC_H_GC_L_GD_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L = ( SUM_HC_L_HD_H_GD_L_L >> 32u ) + SUM_HA_L_HB_H_GB_L_GC_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L_L = ( SUM_HC_L_HD_H_GD_L_L_L >> 32u ) + SUM_HA_H_GA_L_GB_H;
uint64_t SUM_HC_L_HD_H_GD_L_L_L_L_L = ( SUM_HC_L_HD_H_GD_L_L_L_L >> 32u ) + ( GA >> 32u );
OUT[0][0] = SUM_FC_L_FD_H_ED_L_L_L_L_L;
OUT[0][1] = SUM_FC_L_FD_H_ED_L_L_L_L;
OUT[0][2] = SUM_FC_L_FD_H_ED_L_L_L;
OUT[0][3] = SUM_FC_L_FD_H_ED_L_L;
OUT[1][0] = SUM_HC_L_HD_H_GD_L_L_L_L_L;
OUT[1][1] = SUM_HC_L_HD_H_GD_L_L_L_L;
OUT[1][2] = SUM_HC_L_HD_H_GD_L_L_L;
OUT[1][3] = SUM_HC_L_HD_H_GD_L_L;
}
__forceinline void mulShift( const uint64_t* const m, const uint64_t* const mul , uint32_t OUT[2][4]) noexcept
{
uint64_t B0[2];
uint64_t B2[2];
{
B0[0] = _umul128( m[1], mul[0], &B0[1] );
B2[0] = _umul128( m[0], mul[0], &B2[1] );
uint64_t S = B0[1] + B2[0];
OUT[0][2] = S >> 32;
OUT[0][3] = S & 0xFFFFFFFF;
uint64_t M = B2[1] + ( S < B2[0] );
OUT[0][1] = M & 0xFFFFFFFF;
OUT[0][0] = M >> 32;
}
{
B0[0] = _umul128( m[1], mul[1], &B0[1] );
B2[0] = _umul128( m[0], mul[1], &B2[1] );
uint64_t S = B0[1] + B2[0];
OUT[1][2] = S >> 32;
OUT[1][3] = S & 0xFFFFFFFF;
uint64_t M = B2[1] + ( S < B2[0] );
OUT[1][1] = M & 0xFFFFFFFF;
OUT[1][0] = M >> 32;
}
}
constexpr uint32_t N = 1 << 28;
int main()
{
uint32_t OUT[2][4];
uint32_t ABCD[4] = { 4294967295u, 4294967295u, 4294967295u, 4294967295u };
uint32_t EFGH[4] = { 4294967295u, 4294967295u, 4294967295u, 4294967295u };
multiply128x128_1( ABCD, EFGH, OUT );
uint64_t S_1 = 0u;
uint64_t S_2 = 0u;
uint64_t S_3 = 0u;
auto start_1 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < N; ++i )
{
EFGH[0] = i;
EFGH[1] = i;
EFGH[2] = i + 1;
EFGH[3] = i + 1;
ABCD[0] = i;
ABCD[1] = i;
ABCD[2] = i + 1;
ABCD[3] = i + 1;
multiply128x128( ABCD, EFGH, OUT );
S_1 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_1 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_1 = std::chrono::high_resolution_clock::now();
std::cout << "Test A: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_1 - start_1 ).count() << '\n';
auto start_2 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < N; ++i )
{
EFGH[0] = i;
EFGH[1] = i;
EFGH[2] = i + 1;
EFGH[3] = i + 1;
ABCD[0] = i;
ABCD[1] = i;
ABCD[2] = i + 1;
ABCD[3] = i + 1;
mulShift( reinterpret_cast<const uint64_t*>( ABCD ), reinterpret_cast<const uint64_t*>( EFGH ), OUT );
S_2 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_2 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_2 = std::chrono::high_resolution_clock::now();
std::cout << "Test B: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_2 - start_2 ).count() << '\n';
auto start_3 = std::chrono::high_resolution_clock::now();
for ( uint32_t i = 0; i < N; ++i )
{
EFGH[0] = i;
EFGH[1] = i;
EFGH[2] = i + 1;
EFGH[3] = i + 1;
ABCD[0] = i;
ABCD[1] = i;
ABCD[2] = i + 1;
ABCD[3] = i + 1;
multiply128x128_1( ABCD, EFGH, OUT );
S_3 += OUT[0][0] + OUT[0][1] + OUT[0][2] + OUT[0][3];
S_3 += OUT[1][0] + OUT[1][1] + OUT[1][2] + OUT[1][3];
}
auto stop_3 = std::chrono::high_resolution_clock::now();
std::cout << "Test C: " << std::chrono::duration_cast<std::chrono::milliseconds>( stop_3 - start_3 ).count() << '\n';
std::cout << S_1 << " " << S_2 << " " << S_3 << '\n';
}
Why is _umul128 so slow? Maybe I made some mistakes in my test code above?
My results:
Test A (simd): 4546ms.
Test B (_umul128): 6637ms.
Test C (scalar): 2333ms.
Tested on Windows 10, x64, MSVC 2019

The _umul128 version isn't really that slow, but you're gimping it with store-forwarding stalls by messing around with 32-bit arrays, which makes MSVC emit terrible asm.
Optimization is defeating your benchmark; the pure C version isn't really that fast.
Especially with the simple input data:
ABCD[0] = EFGH[0] = i;
ABCD[1] = EFGH[1] = i;
ABCD[2] = EFGH[2] = i + 1;
ABCD[3] = EFGH[3] = i + 1;
Initializing both inputs like this creates a huge amount of opportunity for optimization after inlining the pure C version. It does i*i 4 times, and i*(i+1) = i*i + i another 8 times, and also (i+1)*(i+1) 4 times. MSVC isn't dumb and notices this. This is called Common Subexpression Elimination (CSE).
You'll need to come up with a more sophisticated way to fake input if you want to see how slow the pure C really is. Maybe generate the inputs ahead of time and then loop over memory containing them? Setting up inputs from a loop counter costs almost as much as a multiply.
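For example, here is a minimal sketch of that idea (make_inputs is my own hypothetical helper, not part of the question's code; sizes and PRNG seed are arbitrary choices): fill a buffer with pseudo-random operands before timing and only read from it inside the timed loop, so the compiler can't fold the multiplies back into the loop counter.
#include <cstdint>
#include <random>
#include <vector>
// Sketch: pre-generate inputs so CSE can't remove the multiplies.
std::vector<uint32_t> make_inputs( size_t blocks )
{
    std::mt19937 gen( 42 );                        // fixed seed for repeatability
    std::uniform_int_distribution<uint32_t> dist;  // full 32-bit range
    std::vector<uint32_t> v( blocks * 4 );         // four uint32_t per 128-bit operand
    for ( auto& x : v ) x = dist( gen );
    return v;
}
// Inside the timed loop, feed operands from memory instead of from i:
//   multiply128x128_1( &inputs[ 4 * ( i % blocks ) ],
//                      &inputs[ 4 * ( ( i + 1 ) % blocks ) ], OUT );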
MSVC's asm output confirms that much of the work is optimized away for the pure C version. (Godbolt with MSVC 19.22 for x64)
...
$LL10@main:
lea r15, QWORD PTR [rax+1]
mov rcx, r15
mov r9, r15
imul rcx, rax # only 3, not 16, imul instructions.
imul rax, rax # (None appear later in this loop in the ... part)
imul r9, r15
mov edi, ecx
mov r14, rcx
mov r8d, eax
shr r14, 32 ; 00000020H
shr rax, 32 ; 00000020H
...
sub r13, 1
jne $LL10@main
MSVC is bad at optimizing intrinsics and does all 4 mul m64 instructions instead of noticing that ii * i1i1 is done twice.
More importantly, the _umul128 loop is hurt by store-forwarding stalls because it actually stores your array to memory with 32-bit stores and then uses 64-bit loads to feed mul m64.
Also, handling the output in 32-bit chunks just shoots you in the foot, introducing extra shifts and mov operations.
This is not complicated: literally just 3 instructions, mul r64 and imul r64, r64 plus an add for the high half, is all that's needed. GCC/clang easily emit the right thing, and the x86-64 System V calling convention can return a 128-bit int in registers.
On Godbolt: https://godbolt.org/z/DcZhSl
#include <stdint.h>
#ifdef __GNUC__
typedef unsigned __int128 u128;
u128 mul128x64( u128 a, uint64_t b) {
return a * b;
}
#endif
# clang -O3 for the x86-64 System V ABI (Linux)
mul128x64(unsigned __int128, unsigned long): #
mov rax, rdi
imul rsi, rdx
mul rdx
add rdx, rsi
ret
For MSVC we have to do that ourselves, and the calling convention means the result is returned in memory.
#ifdef _MSC_VER
#include <intrin.h>
struct u128 { uint64_t u64[2]; };
u128 mul128x64( uint64_t a_lo, uint64_t a_hi, uint64_t b)
{
uint64_t lolo_high;
uint64_t lolo = _umul128( a_lo, b, &lolo_high );
uint64_t lohi = a_hi * b;
return {{lolo, lohi + lolo_high}};
}
#endif
# MSVC x64 -O2
u128 mul128x64(unsigned __int64,unsigned __int64,unsigned __int64) PROC
mov rax, r9
mul rdx
imul r8, r9
mov QWORD PTR [rcx], rax # store the retval into hidden pointer
mov rax, rcx
add r8, rdx
mov QWORD PTR [rcx+8], r8
ret 0
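Applying the same idea back to the original mulShift, here is a sketch of my own variant (untested against the benchmark) that keeps the inputs and outputs as 64-bit halves, so MSVC neither has to reassemble the operands from 32-bit stores nor split the result into 32-bit chunks. The math is identical to the original mulShift:
__forceinline void mulShift64( const uint64_t* const m, const uint64_t* const mul,
                               uint64_t OUT64[2][2] ) noexcept
{
    for ( int j = 0; j < 2; ++j )
    {
        uint64_t lo_hi, hi_hi;
        _umul128( m[1], mul[j], &lo_hi );                  // only the high half of m[1]*mul[j] is used
        uint64_t hi_lo = _umul128( m[0], mul[j], &hi_hi );
        uint64_t S = lo_hi + hi_lo;                        // same S as in the original mulShift
        uint64_t M = hi_hi + ( S < hi_lo );                // same M, carry included
        OUT64[j][1] = S;   // what the original split into OUT[j][2] and OUT[j][3]
        OUT64[j][0] = M;   // what the original split into OUT[j][0] and OUT[j][1]
    }
}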
Your __m128i intrinsics version is unlikely to be a win. Modern x86 (mainstream Intel SnB-family, AMD Ryzen) has 1/clock throughput for mul and imul. (Except Ryzen where widening i/mul r64 has 2c throughput, but still 1/clock for imul r64,r64.)
So overall throughput for a 64 x 128-bit multiply on Sandybridge-family is one per 2 cycles (bottlenecked on port 1), if you implement in C that compiles to asm like this.
Given that you need more than 4 pmuludq instructions to implement a multiply, AVX1 is a non-starter. (Skylake has 0.5c throughput for pmuludq; Sandybridge has 1c throughput, so you'd need to get the job done in 2 pmuludq insns per multiply (on average) to compete with scalar. And that's without considering all the shift / shuffle / add work that needs doing.)
Possibly worth considering on Bulldozer-family where 64-bit scalar multiply is 4c throughput but pmuludq is 1c. (https://agner.org/optimize/) Producing 128 product bits per cycle (two 32x32 => 64-bit products) is better than producing 128 product bits per 4 cycles, if you can get them shifted and added without eating up too many extra cycles.
Again, MSVC is bad at constant-propagation or CSE optimization through intrinsics, so your intrinsics version doesn't benefit from anything.
Your test code also uses _mm_set1_epi32( ) from scalar integer loop variables, requiring vmovd and vpshufd instructions.
And you get scalar store / vector reload for the lddqu intrinsics on those arrays, so again you have store-forwarding stalls.
The only hope for this being good with SSE2 or AVX1 is if your data comes from memory, not registers. Or if you can keep your data in vector registers for a long time, not constantly moving it back and forth. Especially on Bulldozer-family where int <-> SIMD has high latency.

Related

Inner product of two unsigned byte vectors using AVX2 in C/C++

I'd like to implement a fast correlation coefficient computation using SSE/AVX2. The operands are two unsigned char vectors. The function should be mostly equivalent to this:
float correlate_simple(const unsigned char* vec1, const unsigned char* vec2, size_t length)
{
int sum1 = 0;
int sum2 = 0;
int sum11 = 0;
int sum22 = 0;
int sum12 = 0;
for (size_t i = length; i > 0; --i, ++vec1, ++vec2) {
sum1 += *vec1;
sum2 += *vec2;
sum11 += *vec1 * *vec1;
sum22 += *vec2 * *vec2;
sum12 += *vec1 * *vec2;
}
double mean1 = double(sum1) / double(length);
double mean2 = double(sum2) / double(length);
double mean11 = double(sum11) / double(length);
double mean22 = double(sum22) / double(length);
double mean12 = double(sum12) / double(length);
double b = (mean11 - mean1 * mean1) * (mean22 - mean2 * mean2);
if (b <= 0.0)
return 0.0f;
double a = (mean12 - mean1 * mean2);
return float(a / sqrt(b));
}
The parameter length ranges from 1 to less than 1000.
In order to do this, I researched how to implement an inner product of two unsigned byte arrays. However, I could not come up with a solution that does not involve converting all the unsigned 8-bit values to signed 16-bit values.
The intrinsic _mm256_maddubs_epi16(a, b) expects b to be a signed byte. This would not be a problem in this case since subtracting some constant (here: 127) from b does not change the correlation coefficient. Unfortunately I could not find an intrinsic that would allow me to subtract 127 from unsigned bytes producing signed bytes (not relying on some two's complement magic).
// vec: const unsigned char*
auto x = _mm256_load_si256((const __m256i*) vec);
auto v = _mm256_set1_epi8(127);
// wrong if vec[i] is less than 127:
auto x_centered = _mm256_sub_epi8 (x, v);
What would be the best approach here to compute inner products (and finally a correlation coefficient)?
Addendum:
Below is my current implementation of a pure inner product. I decided to convert to 16 bit integer to avoid overflow errors.
Update: Changed from reading 128 bits to 256 bits at a time.
int accumulate_i32(__m256i x)
{
auto tmp1 = _mm256_srli_si256(x, 8);
x = _mm256_add_epi32(x, tmp1);
auto tmp2 = _mm256_extractf128_si256(x, 1);
tmp2 = _mm_add_epi32(tmp2, _mm256_castsi256_si128(x));
return _mm_cvtsi128_si32(tmp2) + _mm_extract_epi32(tmp2, 1);
}
int inner_product_avx(const unsigned char* vec1, const unsigned char* vec2, unsigned int length)
{
constexpr unsigned int memoryAlignmentBytes = 32;
constexpr unsigned int bytesPerPack = 256 / 8;
assert((reinterpret_cast<std::uintptr_t>(vec1) % memoryAlignmentBytes) == 0);
assert((reinterpret_cast<std::uintptr_t>(vec2) % memoryAlignmentBytes) == 0);
// compute middle part via AVX2
unsigned int packCount = length / bytesPerPack;
const __m256i zeros = _mm256_setzero_si256();
auto sumlo = _mm256_setzero_si256();
auto sumhi = _mm256_setzero_si256();
for (unsigned int packIdx = 0; packIdx < packCount; ++packIdx) {
auto x1 = _mm256_load_si256((const __m256i*)vec1);
auto x2 = _mm256_load_si256((const __m256i*)vec2);
auto x1lo = _mm256_unpacklo_epi8(x1, zeros);
auto x1hi = _mm256_unpackhi_epi8(x1, zeros);
auto x2lo = _mm256_unpacklo_epi8(x2, zeros);
auto x2hi = _mm256_unpackhi_epi8(x2, zeros);
auto tmplo = _mm256_madd_epi16(x1lo, x2lo);
auto tmphi = _mm256_madd_epi16(x1hi, x2hi);
sumlo = _mm256_add_epi32(sumlo, tmplo);
sumhi = _mm256_add_epi32(sumhi, tmphi);
vec1 += bytesPerPack;
vec2 += bytesPerPack;
}
int sum = accumulate_i32(sumlo) + accumulate_i32(sumhi);
// compute remaining part that cannot be represented as a
// whole packed integer
unsigned int packRestCount = length % bytesPerPack;
for (size_t i = packRestCount; i > 0; --i, ++vec1, ++vec2)
sum += int(*vec1) * int(*vec2);
return sum;
}
This takes roughly 20 % of the time of the simple C++ implementation (see below). Considering that the AVX code works on 16 16-bit integers simultaneously, I would have expected a higher gain. Is this reasonable, or did I miss something?
Unrolling the last loop in the AVX code did not decrease the computation time.
int inner_product_simple(const unsigned char* vec1, const unsigned char* vec2, size_t length)
{
int sum = 0;
for (size_t i = length; i > 0; --i, ++vec1, ++vec2)
sum += int(*vec1) * int(*vec2);
return sum;
}
I would start from something like that. It uses 32-bit accumulators just like your current code. Untested.
namespace
{
// Compute sum of the 64-bit lanes, convert to double
inline double hadd_epi64( const __m256i i64 )
{
__m128i res = _mm256_castsi256_si128( i64 );
res = _mm_add_epi64( res, _mm256_extractf128_si256( i64, 1 ) );
res = _mm_add_epi64( res, _mm_unpackhi_epi64( res, res ) );
return (double)_mm_cvtsi128_si64( res );
}
// Convert 32-bit lanes into 64-bit, compute sum of the 8, convert to double
inline double hadd_epu32( __m256i x )
{
const __m256i zero = _mm256_setzero_si256();
__m256i i64 = _mm256_unpacklo_epi32( x, zero );
i64 = _mm256_add_epi64( i64, _mm256_unpackhi_epi32( x, zero ) );
return hadd_epi64( i64 );
};
}
class InnerProduct
{
// These fields are interpreted as 64-bit integers
__m256i a, b;
// These fields are interpreted as 32-bit integers
__m256i aa, bb, ab;
// Accumulate products of 16-bit numbers, 16 of them at once
inline void add16( __m256i x, __m256i y )
{
const __m256i x2 = _mm256_madd_epi16( x, x );
const __m256i y2 = _mm256_madd_epi16( y, y );
const __m256i prod = _mm256_madd_epi16( x, y );
aa = _mm256_add_epi32( aa, x2 );
bb = _mm256_add_epi32( bb, y2 );
ab = _mm256_add_epi32( ab, prod );
}
public:
InnerProduct()
{
a = b = aa = bb = ab = _mm256_setzero_si256();
}
// Handle 32 bytes
inline void addBytes( __m256i x, __m256i y )
{
// Accumulate values
const __m256i zero = _mm256_setzero_si256();
a = _mm256_add_epi64( a, _mm256_sad_epu8( x, zero ) );
b = _mm256_add_epi64( b, _mm256_sad_epu8( y, zero ) );
// Split the vectors into 2 sets of 16-bit numbers, accumulate products
const __m256i z = _mm256_unpacklo_epi8( x, zero );
const __m256i w = _mm256_unpacklo_epi8( y, zero );
add16( z, w );
x = _mm256_unpackhi_epi8( x, zero );
y = _mm256_unpackhi_epi8( y, zero );
add16( x, y );
}
// Compute the result
float compute( size_t count ) const
{
const double div = (double)count;
const double mean1 = hadd_epi64( a ) / div;
const double mean2 = hadd_epi64( b ) / div;
const double mean11 = hadd_epu32( aa ) / div;
const double mean22 = hadd_epu32( bb ) / div;
const double mean12 = hadd_epu32( ab ) / div;
const double b = ( mean11 - mean1 * mean1 ) * ( mean22 - mean2 * mean2 );
if( b <= 0 )
return 0;
const double a = ( mean12 - mean1 * mean2 );
return float( a / sqrt( b ) );
}
};
// Load 1-31 bytes into AVX register, zero out unused higher bytes
inline __m256i loadPartial( const uint8_t* p, size_t length )
{
alignas( 32 ) std::array<uint8_t, 32> arr{};
memcpy( arr.data(), p, length );
return _mm256_load_si256( ( const __m256i* )arr.data() );
}
float correlate_simple( const uint8_t* vec1, const uint8_t* vec2, const size_t length )
{
InnerProduct ip;
const uint8_t* const vec1End = vec1 + ( ( length / 32 ) * 32 );
for( ; vec1 < vec1End; vec1 += 32, vec2 += 32 )
{
const __m256i x = _mm256_loadu_si256( ( const __m256i * )vec1 );
const __m256i y = _mm256_loadu_si256( ( const __m256i * )vec2 );
ip.addBytes( x, y );
}
const size_t remainder = length % 32;
if( remainder > 0 )
{
const __m256i x = loadPartial( vec1, remainder );
const __m256i y = loadPartial( vec2, remainder );
ip.addBytes( x, y );
}
return ip.compute( length );
}
Unfortunately I could not find an intrinsic that would allow me to subtract 127 from unsigned bytes producing signed bytes (not relying on some two's complement magic).
Intel x86 CPUs use two's complement representation of signed numbers, which is why many SIMD intrinsics (such as addition and subtraction of packed integers) do not come in separate signed/unsigned versions: the result bit pattern is the same either way. Intel SIMD intrinsics are outside the scope of the C++ standard and have a specific, well-defined behaviour.
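As a concrete, hedged illustration of that point (my own sketch, assuming AVX2; centre_u8 is a hypothetical helper, not code from the question): a plain _mm256_sub_epi8 on unsigned bytes already produces the two's complement bit pattern that _mm256_maddubs_epi16 interprets as signed bytes, with the caveat that the shift constant has to keep every result inside the signed-byte range.
#include <immintrin.h>
// Sketch: centre unsigned bytes around zero before using them as the signed
// operand of maddubs. Subtracting 128 maps [0,255] onto [-128,127]; subtracting
// 127 (as in the question) would wrap an input of 255 to -128, so 128 is the
// safer shift. (Note that _mm256_maddubs_epi16 saturates its pairwise sums to
// int16, which still needs care for large inputs.)
inline __m256i centre_u8( __m256i x )
{
    return _mm256_sub_epi8( x, _mm256_set1_epi8( (char)0x80 ) );
}
// Hypothetical usage: x stays unsigned, the centred y is the signed operand.
//   __m256i prod16 = _mm256_maddubs_epi16( x, centre_u8( y ) );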

AVX512 Vector Multiplication Speed

I have a function like this:
#define SPLIT(zmm, ymmA, ymmB) \
ymmA = _mm512_castsi512_si256(zmm); \
ymmB = _mm512_extracti32x8_epi32(zmm, 1)
#define PAIR_AND_BLEND(src1, src2, dst1, dst2) \
dst1 = _mm256_blend_epi32(src1, src2, 0b11110000); \
dst2 = _mm256_permute2x128_si256(src1, src2, 0b00100001);
#define OPERATE_ROW_2(i, ymmA, ymmB) \
zmm##i = _mm512_maddubs_epi16(zmm30, zmm##i); \
zmm##i = _mm512_madd_epi16(zmm##i, zmm31); \
SPLIT(zmm##i, ymmA, ymmB);
/*
* Multiply query by each code in codes.
* @param n: number of codes
* @param query: 64 x uint8_t array data
* @param codes: 64 x n x uint8_t array data
* @param output: n x int32_t array data, to store output data.
*/
void avx_IP_distance_64_2(size_t n,
const uint8_t *query,
const uint8_t *codes,
int32_t *output){
__m512i zmm0, zmm1, zmm2, zmm3,
zmm4, zmm5, zmm6, zmm7,
zmm8, zmm9, zmm10, zmm11,
zmm12, zmm13, zmm14, zmm15,
zmm16, zmm17, zmm18, zmm19,
zmm20, zmm21, zmm22, zmm23,
zmm24, zmm25, zmm26, zmm27,
zmm28, zmm29, zmm30, zmm31;
__m256i ymm0, ymm1, ymm2, ymm3,
ymm4, ymm5, ymm6, ymm7,
ymm8, ymm9, ymm10, ymm11,
ymm12, ymm13, ymm14, ymm15;
zmm30 = _mm512_loadu_si512(query);
zmm31 = _mm512_set1_epi16(1);
int k_8 = n / 8;
int left = n % 8;
for (int i = 0; i < k_8; ++i){
zmm0 = _mm512_loadu_si512(codes);
zmm1 = _mm512_loadu_si512(codes + 64 * 1);
zmm2 = _mm512_loadu_si512(codes + 64 * 2);
zmm3 = _mm512_loadu_si512(codes + 64 * 3);
zmm4 = _mm512_loadu_si512(codes + 64 * 4);
zmm5 = _mm512_loadu_si512(codes + 64 * 5);
zmm6 = _mm512_loadu_si512(codes + 64 * 6);
zmm7 = _mm512_loadu_si512(codes + 64 * 7);
OPERATE_ROW_2(0, ymm0, ymm1);
OPERATE_ROW_2(1, ymm2, ymm3);
OPERATE_ROW_2(2, ymm4, ymm5);
OPERATE_ROW_2(3, ymm6, ymm7);
OPERATE_ROW_2(4, ymm8, ymm9);
OPERATE_ROW_2(5, ymm10, ymm11);
OPERATE_ROW_2(6, ymm12, ymm13);
OPERATE_ROW_2(7, ymm14, ymm15);
ymm0 = _mm256_add_epi32(ymm0, ymm1);
ymm2 = _mm256_add_epi32(ymm2, ymm3);
ymm4 = _mm256_add_epi32(ymm4, ymm5);
ymm6 = _mm256_add_epi32(ymm6, ymm7);
ymm8 = _mm256_add_epi32(ymm8, ymm9);
ymm10 = _mm256_add_epi32(ymm10, ymm11);
ymm12 = _mm256_add_epi32(ymm12, ymm13);
ymm14 = _mm256_add_epi32(ymm14, ymm15);
PAIR_AND_BLEND(ymm0, ymm8, ymm1, ymm9);
PAIR_AND_BLEND(ymm2, ymm10, ymm3, ymm11);
PAIR_AND_BLEND(ymm4, ymm12, ymm5, ymm13);
PAIR_AND_BLEND(ymm6, ymm14, ymm7, ymm15);
ymm1 = _mm256_add_epi32(ymm1, ymm9);
ymm3 = _mm256_add_epi32(ymm3, ymm11);
ymm5 = _mm256_add_epi32(ymm5, ymm13);
ymm7 = _mm256_add_epi32(ymm7, ymm15);
ymm1 = _mm256_hadd_epi32(ymm1, ymm3);
ymm5 = _mm256_hadd_epi32(ymm5, ymm7);
ymm1 = _mm256_hadd_epi32(ymm1, ymm5);
_mm256_storeu_si256((__m256i *)(output), ymm1);
codes += 8 * 64;
output += 8;
}
for (int i = 0; i < left; ++i){
OPERATE_ROW_1(0);
}
}
#define LOOP 10
int main(){
int d = 64;
int q = 1;
int n = 100000;
std::mt19937 rng;
std::uniform_real_distribution<> distrib;
uint8_t *codes = new uint8_t[d * n];
uint8_t *query = new uint8_t[d * q];
int32_t *output = new int32_t[n];
for (int i = 0; i < n; ++i){
for (int j = 0; j < d; ++j){
// codes[d*i+j] = j;
codes[d * i + j] = int(distrib(rng)) * 127;
}
}
for (int i = 0; i < q; ++i){
for (int j = 0; j < d; ++j){
// query[d*i+j] = j;
query[d * i + j] = int(distrib(rng)) * 127 - 64;
}
}
Timer timer;
timer.start();
for (int i = 0; i < LOOP; ++i){
avx_IP_distance_64_2(n, query, codes, output);
}
timer.end("Second type");
return 0;
}
When n = 10k, time duration is: 0.143917 ms
When n = 100k, time duration is: 3.2002 ms
When N is less than 10k, the time consumption basically increases linearly.
I suspect it’s a caching problem, but I’m not sure.
I want to know why the time consumption does not increase linearly with n?

Hex-rays, MATLAB, MEX, casting uintptr_t fails

I decompiled some DLL using Hex-Rays and then tried to compile it again as a MATLAB MEX. Since Hex-Rays decompiles double * as int, and I'm using Win 7 64-bit, I cast the
double * using uintptr_t. Unfortunately I sometimes get a 'segmentation fault' with it.
Here is my code
#include "mex.h"
#include <stdint.h>
double __stdcall dzSell(int a1, double a2, int a3, int a4, int a5)
{
int v5; // ecx#1
double v6; // st7#3
double result; // st7#4
int v8; // ebp#5
double v9; // st6#5
double v10; // st5#5
int v11; // edx#5
int v12; // esi#6
int v13; // ecx#6
double v14; // rt0#10
double v15; // st5#10
double v16; // st6#10
double v17; // rt1#12
double v18; // rt2#15
double v19; // st5#15
double v20; // st6#15
double v21; // rtt#17
double v22; // rt0#20
double v23; // st5#20
double v24; // st6#20
double v25; // rt1#22
double v26; // rt2#25
double v27; // st5#25
double v28; // st6#25
double v29; // rtt#27
int v30; // esi#29
int v31; // ecx#29
double v32; // rt0#33
double v33; // st5#33
double v34; // st6#33
double v35; // rt1#35
signed int v36; // edi#36
double v37; // st3#36
double v38; // st4#36
double v39; // st2#37
double v40; // st2#38
double v41; // rt1#38
double v42; // st2#38
double v43; // rt2#38
signed int v44; // esi#40
double v45; // st1#40
int v46; // edx#41
int v47; // ecx#41
double v48; // rt0#43
double v49; // st1#43
double v50; // st4#43
double v51; // rtt#45
double v52; // st5#47
double v53; // st3#47
double v54; // rtt#48
double v55; // rt1#48
double v56; // st3#48
double v57; // rt2#48
double v58; // [sp+4h] [bp-8h]#3
int v59; // [sp+20h] [bp+14h]#1
int v60; // [sp+20h] [bp+14h]#5
v5 = a4 - a5;
v59 = a4 - a5;
if ( v59 >= a3 )
v59 = a3;
v6 = (double)v59;
v58 = v6;
if ( v6 >= 1.0 )
{
v8 = v5 - 1;
v9 = *(double *)(a1 + 8 * (v5 - 1));
v10 = v9;
v60 = a1 + 8 * (v5 - 1);
v11 = 1;
if ( a3 - 1 < 4 )
{
LABEL_28:
if ( v11 < a3 )
{
v30 = v8 - v11;
v31 = a1 + 8 * (v8 - v11);
do
{
if ( v30 < 0 )
break;
if ( v10 < *(double *)v31 )
v10 = *(double *)v31;
v32 = v10;
v33 = v9;
v34 = v32;
if ( v33 > *(double *)v31 )
v33 = *(double *)v31;
++v11;
v35 = v33;
v10 = v34;
v9 = v35;
v31 -= 8;
--v30;
}
while ( v11 < a3 );
}
}
else
{
v12 = v5 - 4;
v13 = a1 + 8 * (v5 - 4);
while ( v12 + 2 >= 0 )
{
if ( v10 < *(double *)(v13 + 16) )
v10 = *(double *)(v13 + 16);
v14 = v10;
v15 = v9;
v16 = v14;
if ( v15 > *(double *)(v13 + 16) )
v15 = *(double *)(v13 + 16);
v17 = v15;
v10 = v16;
v9 = v17;
if ( v12 + 1 < 0 )
break;
if ( v10 < *(double *)(v13 + 8) )
v10 = *(double *)(v13 + 8);
v18 = v10;
v19 = v9;
v20 = v18;
if ( v19 > *(double *)(v13 + 8) )
v19 = *(double *)(v13 + 8);
v21 = v19;
v10 = v20;
v9 = v21;
if ( v12 < 0 )
break;
if ( v10 < *(double *)v13 )
v10 = *(double *)v13;
v22 = v10;
v23 = v9;
v24 = v22;
if ( v23 > *(double *)v13 )
v23 = *(double *)v13;
v25 = v23;
v10 = v24;
v9 = v25;
if ( v12 - 1 < 0 )
break;
if ( v10 < *(double *)(v13 - 8) )
v10 = *(double *)(v13 - 8);
v26 = v10;
v27 = v9;
v28 = v26;
if ( v27 > *(double *)(v13 - 8) )
v27 = *(double *)(v13 - 8);
v11 += 4;
v29 = v27;
v10 = v28;
v9 = v29;
v13 -= 32;
v12 -= 4;
if ( v11 >= a3 - 3 )
goto LABEL_28;
}
}
v36 = 50;
v37 = 0.5;
v38 = (v10 + v9) * 0.5;
if ( v38 - v9 <= 0.005 )
{
result = v38;
}
else
{
v39 = 0.0;
while ( v36 > 0 )
{
v44 = 0;
v45 = v39;
if ( v39 < v6 )
{
v46 = v60;
v47 = v8;
do
{
if ( v47 < 0 )
break;
v48 = v45;
v49 = v38;
v50 = v48;
if ( v49 < *(double *)v46 )
v39 = v39 + 1.0;
++v44;
v46 -= 8;
--v47;
v51 = v49;
v45 = v50;
v38 = v51;
}
while ( (double)v44 < v6 );
}
if ( a2 - 0.001 <= v39 / v6 )
{
v54 = (v38 + v10) * v37;
v9 = v38;
v55 = v37;
v56 = v10;
v52 = v55;
v57 = v56;
v53 = v54;
v38 = v57;
}
else
{
v52 = v37;
v53 = (v38 + v9) * v37;
}
--v36;
if ( v53 - v9 <= 0.005 )
return v53;
v6 = v58;
v40 = v52;
v10 = v38;
v41 = v40;
v42 = v53;
v37 = v41;
v43 = v42;
v39 = v45;
v38 = v43;
}
result = v38;
}
}
else
{
result = 2147483647.0;
}
return result;
}
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
double *buffer;
double Probability;
int DzLookBackBars;
int Bars;
int i;
double result;
// Check inputs
if (nrhs != 5) {//if the user has given anything but one arg, then error
mexErrMsgIdAndTxt("Numerical:myGetPr:nrhs", "Require 5 args");
}
if (!mxIsNumeric(prhs[0])) {//if input is not numeric (for example a string), then errror
mexErrMsgIdAndTxt("Numerical:myGetPr:isnumeric", "Require args that is numeric");
}
if(nlhs!=1) {
mexErrMsgIdAndTxt("MyToolbox:arrayProduct:nlhs","One output required.");
}
/* make sure the first input argument is type double */
if( !mxIsDouble(prhs[0]) ||
mxIsComplex(prhs[0])) {
mexErrMsgIdAndTxt("MyToolbox:arrayProduct:notDouble","Input matrix must be type double.");
}
// Check output
int dims[] = {1,1}; //dimensions of output
buffer = mxGetPr(prhs[0]);
Probability = mxGetScalar(prhs[1]);
DzLookBackBars = mxGetScalar(prhs[2]);
Bars = mxGetScalar(prhs[3]);
i = mxGetScalar(prhs[4]);
result = dzSell ((uintptr_t)buffer, Probability, DzLookBackBars, Bars, i);
plhs[0] = mxCreateDoubleScalar(result);
}
and here are screenshots which show the wrong casting
Any idea how to deal with it?
the example of decompiling arrays by Hex-Rays is here
Hex-Rays decompiles array to int
and here is disassembly file
http://www.mediafire.com/view/w82l6somwr7o450/dissasembly.txt
Krzysztof
Consider the decompiler as giving you a head start, not producing good code.
In particular, the code it creates is not at all portable. To be able to compile for a different pointer size, you will need to fix the type information.
The solution was to use uintptr_t and intptr_t 64-bit integers both in the function definition and inside the decompiled function. The decompiled DLL was from a 32-bit system, but my function is supposed to work under Win 7 64-bit. I used IDA 64 with the decompiler, but the generated pseudocode was 32-bit.
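A minimal sketch of what that change looks like (read_element is a reduced hypothetical example, not the full dzSell body):
#include <stdint.h>
/* Before: double __stdcall dzSell(int a1, ...) truncated the 64-bit buffer
   address. After: hold the address in a pointer-sized integer instead. */
static double read_element(uintptr_t a1, int v5)
{
    /* a1 + 8 * (v5 - 1) now stays 64-bit, as in the decompiled body */
    return *(double *)(a1 + 8 * (uintptr_t)(v5 - 1));
}
/* The interior temporaries that hold addresses (v13, v31, v46, v60 in the
   decompiled code) have to become uintptr_t / intptr_t as well, and the call
   stays result = dzSell((uintptr_t)buffer, ...). */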

SSE Directshow filter

Context
I've made a DirectShow filter to change the contrast and brightness of my video. I want to speed it up.
Working filter without SSE
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
...
BYTE *pData; // Pointer to the actual image buffer
pMediaSample->GetPointer(&pData);
int numPixels = cxImage * cyImage;
...
prgb = (RGBTRIPLE*) pData;
for (int iPixel=0; iPixel < numPixels; iPixel++ ) {
RGBTRIPLE *ppixel = prgb + iPixel;
ppixel->rgbtGreen = ppixel->rgbtGreen * _contrastPower + _brightnessPower;
ppixel->rgbtBlue = ppixel->rgbtBlue * _contrastPower + _brightnessPower;
ppixel->rgbtRed = ppixel->rgbtRed * _contrastPower + _brightnessPower;
if(ppixel->rgbtGreen>255) ppixel->rgbtGreen = 255;
if(ppixel->rgbtBlue>255) ppixel->rgbtBlue = 255;
if(ppixel->rgbtRed>255) ppixel->rgbtRed = 255;
}
...
}
Not working filter with SSE
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
BYTE *pData; // Pointer to the actual image buffer
long lDataLen; // Holds length of any given sample
int iPixel; // Used to loop through the image pixels
RGBTRIPLE *prgb; // Holds a pointer to the current pixel
AM_MEDIA_TYPE* pType = &m_pInput->CurrentMediaType();
VIDEOINFOHEADER *pvi = (VIDEOINFOHEADER *) pType->pbFormat;
ASSERT(pvi);
CheckPointer(pMediaSample,E_POINTER);
pMediaSample->GetPointer(&pData);
lDataLen = pMediaSample->GetSize();
// Get the image properties from the BITMAPINFOHEADER
int cxImage = pvi->bmiHeader.biWidth;
int cyImage = pvi->bmiHeader.biHeight;
int numPixels = cxImage * cyImage;
prgb = (RGBTRIPLE*) pData;
double dcontrast = 0.7;
__m128d cStore = _mm_set1_pd(dcontrast);
BYTE *pDataOutput = new BYTE[lDataLen];
for (iPixel=0; iPixel < numPixels; iPixel += 4 ) {
//unpack to 32 bits
__m128i current = _mm_unpacklo_epi8( _mm_loadu_si128( (__m128i*)( prgb+iPixel ) ), _mm_setzero_si128());
__m128d image = _mm_cvtepi32_pd(_mm_unpacklo_epi16(current, _mm_setzero_si128()));
//vector operations
__m128d result = _mm_mul_pd(cStore, image);
//pack back to 8 bits
__m128i pack_32 = _mm_cvtpd_epi32 (result);
__m128i pack_16 = _mm_packs_epi32 (pack_32, pack_32);
__m128i pack_8 = _mm_packus_epi16(pack_16, pack_16);
//store the new pixel in pDataOutput
_mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);
//also tried to store the result in the original array
//_mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8); // blacks out the whole video
}
//assign the original pointer to point at the start of the new data array
pData = pDataOutput;
return NOERROR;
}
Problems
This code does nothing to the original stream:
//store the new pixel in pDataOutput
_mm_storeu_si128((__m128i*)(pDataOutput+iPixel), pack_8);
....
pData = pDataOutput;
This code blacks out the whole video:
_mm_storeu_si128((__m128i*)(prgb+iPixel), pack_8);
Questions
Am I using the SSE instructions correctly?
How do I assign the modified data to the original media sample pointer?
Maybe this example will be useful to you:
void Filter(const uint8_t * src, size_t width, size_t height, double contrast, double brightness, uint8_t * dst)
{
const int shift = 8;
size_t size = width*height*3;
__m128i _contrast16 = _mm_set1_epi16(int16_t(contrast*(1 << shift)));
__m128i _brightness16 = _mm_set1_epi16(int16_t(brightness*(1 << shift)));
for(size_t i = 0; i < size; i += sizeof(__m128i))
{
__m128i _src8 = _mm_load_si128((__m128i*)(src + i));
__m128i _srcLo16 = _mm_unpacklo_epi8(_src8, _mm_setzero_si128());
__m128i _srcHi16 = _mm_unpackhi_epi8(_src8, _mm_setzero_si128());
__m128i _dstLo16 = _mm_srai_epi16(_mm_add_epi16(_brightness16, _mm_mullo_epi16(_contrast16, _srcLo16)), shift);
__m128i _dstHi16 = _mm_srai_epi16(_mm_add_epi16(_brightness16, _mm_mullo_epi16(_contrast16, _srcHi16)), shift);
_mm_store_si128((__m128i*)(dst + i), _mm_packus_epi16(_dstLo16, _dstHi16));
}
}
If you use individual coefficients for each channel:
inline void Filter(const uint8_t * src, const __m128i & contrastLo, const __m128i & contrastHi,
const __m128i & brightnessLo, const __m128i & brightnessHi, int shift, uint8_t * dst)
{
__m128i _src8 = _mm_load_si128((__m128i*)src);
__m128i _srcLo16 = _mm_unpacklo_epi8(_src8, _mm_setzero_si128());
__m128i _srcHi16 = _mm_unpackhi_epi8(_src8, _mm_setzero_si128());
__m128i _dstLo16 = _mm_srai_epi16(_mm_add_epi16(brightnessLo, _mm_mullo_epi16(contrastLo, _srcLo16)), shift);
__m128i _dstHi16 = _mm_srai_epi16(_mm_add_epi16(brightnessHi, _mm_mullo_epi16(contrastHi, _srcHi16)), shift);
_mm_store_si128((__m128i*)dst, _mm_packus_epi16(_dstLo16, _dstHi16));
}
void Filter(const uint8_t * src, size_t width, size_t height, double contrast[3], double brightness[3], uint8_t * dst)
{
const int shift = 8;
size_t size = width*height*3;
const int16_t
c0 = int16_t(contrast[0]*(1 << shift)),
c1 = int16_t(contrast[1]*(1 << shift)),
c2 = int16_t(contrast[2]*(1 << shift));
const int16_t
b0 = int16_t(brightness[0]*(1 << shift)),
b1 = int16_t(brightness[1]*(1 << shift)),
b2 = int16_t(brightness[2]*(1 << shift));
__m128i _contrast[3], _brightness[3];
_contrast[0] = _mm_setr_epi16(c0, c1, c2, c0, c1, c2, c0, c1);
_contrast[1] = _mm_setr_epi16(c2, c0, c1, c2, c0, c1, c2, c0);
_contrast[2] = _mm_setr_epi16(c1, c2, c0, c1, c2, c0, c1, c2);
_brightness[0] = _mm_setr_epi16(b0, b1, b2, b0, b1, b2, b0, b1);
_brightness[1] = _mm_setr_epi16(b2, b0, b1, b2, b0, b1, b2, b0);
_brightness[2] = _mm_setr_epi16(b1, b2, b0, b1, b2, b0, b1, b2);
for(size_t i = 0; i < size;)
{
Filter(src + i, _contrast[0], _contrast[1], _brightness[0], _brightness[1], shift, dst + i);
i += sizeof(__m128i);
Filter(src + i, _contrast[2], _contrast[0], _brightness[2], _brightness[0], shift, dst + i);
i += sizeof(__m128i);
Filter(src + i, _contrast[1], _contrast[2], _brightness[1], _brightness[2], shift, dst + i);
i += sizeof(__m128i);
}
}
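On the second question above: the modified pixels have to be written back into the buffer returned by GetPointer, not into a locally allocated array that is dropped when Transform returns. Here is one hedged sketch of calling the per-channel Filter above in place from Transform; it assumes 24-bit RGB data whose total byte count is a multiple of 16 and a sufficiently aligned sample buffer (otherwise the unaligned load/store intrinsics would be needed), and it reuses _contrastPower and _brightnessPower from the question's code:
HRESULT CBrightness::Transform(IMediaSample *pMediaSample)
{
    CheckPointer(pMediaSample, E_POINTER);
    BYTE *pData;
    pMediaSample->GetPointer(&pData);
    VIDEOINFOHEADER *pvi = (VIDEOINFOHEADER *)m_pInput->CurrentMediaType().pbFormat;
    double contrast[3]   = { _contrastPower,   _contrastPower,   _contrastPower };
    double brightness[3] = { _brightnessPower, _brightnessPower, _brightnessPower };
    // src == dst: process the sample buffer in place, so downstream filters
    // see the modified pixels and no pointer reassignment is needed.
    Filter(pData, pvi->bmiHeader.biWidth, pvi->bmiHeader.biHeight,
           contrast, brightness, pData);
    return NOERROR;
}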

Speed up a short to float cast?

I have a short to float cast in C++ that is bottlenecking my code.
The code translates from a hardware device buffer which is natively shorts; this represents the input from a fancy photon counter.
float factor= 1.0f/value;
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
int value = source[i];//ushort -> int
destination[i] = value*factor;//int*float->float
}
A few details
Value should go from 0 to 2^16-1, it represents the pixel values of a highly sensitive camera
I'm on a multicore x86 machine with an i7 processor (i7 960 which is SSE 4.2 and 4.1).
Source is aligned to an 8 bit boundary (a requirement of the hardware device)
W*H is always divisible by 8, most of the time W and H are divisible by 8
This makes me sad; is there anything I can do about it?
I am using Visual Studio 2012...
Here's a basic SSE4.1 implementation:
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < W*H; i += 8)
{
// Load 8 16-bit ushorts.
// vi = {a,b,c,d,e,f,g,h}
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
// vi0 = {a,0,b,0,c,0,d,0}
// vi1 = {e,0,f,0,g,0,h,0}
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
This assumes:
source and destination are both aligned to 16 bytes.
W*H is a multiple of 8.
It's possible to do better by further unrolling this loop. (see below)
The idea here is as follows:
Load 8 shorts into a single SSE register.
Split the register into two: One with the bottom 4 shorts and the other with the top 4 shorts.
Zero-extend both registers into 32-bit integers.
Convert them both to floats.
Multiply by the factor.
Store them into destination.
EDIT :
It's been a while since I've done this type of optimization, so I went ahead and unrolled the loops.
Core i7 920 @ 3.5 GHz
Visual Studio 2012 - Release x64:
Original Loop : 4.374 seconds
Vectorize no unroll: 1.665
Vectorize unroll 2 : 1.416
Further unrolling resulted in diminishing returns.
Here's the test code:
#include <smmintrin.h>
#include <time.h>
#include <iostream>
#include <malloc.h>
using namespace std;
void default_loop(float *destination,const short* source,float value,int size){
float factor = 1.0f / value;
for (int i = 0; i < size; i++)
{
int value = source[i];
destination[i] = value*factor;
}
}
void vectorize8_unroll1(float *destination,const short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 8)
{
// Load 8 16-bit ushorts.
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2(float *destination,const short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
// Split into two registers
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
// Convert to 32-bit integers
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
// Convert to float
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
// Multiply
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
// Store
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
void print_sum(const float *destination,int size){
float sum = 0;
for (int i = 0; i < size; i++){
sum += destination[i];
}
cout << sum << endl;
}
int main(){
int size = 8000;
short *source = (short*)_mm_malloc(size * sizeof(short), 16);
float *destination = (float*)_mm_malloc(size * sizeof(float), 16);
for (int i = 0; i < size; i++){
source[i] = i;
}
float value = 1.1;
int iterations = 1000000;
clock_t start;
// Default Loop
start = clock();
for (int it = 0; it < iterations; it++){
default_loop(destination,source,value,size);
}
cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
print_sum(destination,size);
// Vectorize 8, no unroll
start = clock();
for (int it = 0; it < iterations; it++){
vectorize8_unroll1(destination,source,value,size);
}
cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
print_sum(destination,size);
// Vectorize 8, unroll 2
start = clock();
for (int it = 0; it < iterations; it++){
vectorize8_unroll2(destination,source,value,size);
}
cout << (double)(clock() - start) / CLOCKS_PER_SEC << endl;
print_sum(destination,size);
_mm_free(source);
_mm_free(destination);
system("pause");
}
I believe I have the best answer. My results are much faster than Mystical's. They only require SSE2 but take advantage of SSE3, SSE4, AVX, and even AVX2 if available. You don't have to change any code; you only have to recompile.
I ran over three sizes: 8008, 64000, and 2560*1920 = 4915200. I tried several different variations. I list the most important ones below. The function vectorize8_unroll2 is Mystical's function. I made an improved version of it called vectorize8_unroll2_parallel. The functions vec16_loop_unroll2_fix and vec16_loop_unroll2_parallel_fix are my functions, which I believe are better than Mystical's. These functions will automatically use AVX if you compile with AVX, but they work fine on SSE4 and even SSE2.
Additionally, you wrote "W*H is always divisible by 8, most of the time W and H are divisible by 8".
So we can't assume W*H is divisible by 16 in all cases. Mystical's function vectorize8_unroll2 has a bug when size is not a multiple of 16 (try size=8008 in his code and you will see what I mean). My code has no such bug.
I'm using Agner Fog's vectorclass for the vectorization. It's not a lib or dll file; it's just a few header files. I use OpenMP for the parallelization. Here are some of the results:
Intel Xeon E5630 @ 2.53 GHz (supports up to SSE4.2)
size 8008, size2 8032, iterations 1000000
default_loop time: 7.935 seconds, diff 0.000000
vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 1.878 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 1.253 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 1.151 seconds, diff 0.000000
size 64000, size2 64000, iterations 100000
default_loop time: 6.387 seconds, diff 0.000000
vectorize8_unroll2 time: 1.875 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.195 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 0.439 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 0.432 seconds, diff 0.000000
size 4915200, size2 4915200, iterations 1000
default_loop time: 5.125 seconds, diff 0.000000
vectorize8_unroll2 time: 3.496 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 3.490 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 3.119 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 3.127 seconds, diff 0.000000
Edit: I added the results on a system with AVX using GCC at the end of this answer.
Below is the code. The code only looks long because I do lots of cross checks and test many variations. Download the vectorclass at
http://www.agner.org/optimize/#vectorclass . Copy the header files (vectorclass.h, instrset.h, vectorf128.h, vectorf256.h, vectorf256e.h, vectori128.h, vectori256.h, vectori256e.h) into the directory you compile from. Add /D__SSE4_2__ under C++/Command Line. Compile in release mode. If you have a CPU with AVX then put /arch:AVX instead. Add OpenMP support under C++ properties/language.
In GCC
SSE4.2: g++ foo.cpp -o foo_gcc -O3 -msse4.2 -fopenmp
AVX: g++ foo.cpp -o foo_gcc -O3 -mavx -fopenmp
In the code below, the function vec16_loop_unroll2_parallel requires the array size to be a multiple of 32. You can change the array size to be a multiple of 32 (that's what size2 refers to), or if that's not possible you can just use the function vec16_loop_unroll2_parallel_fix, which has no such restriction. It's just as fast anyway.
#include <stdio.h>
#include "vectorclass.h"
#include "omp.h"
#define ROUND_DOWN(x, s) ((x) & ~((s)-1))
inline void* aligned_malloc(size_t size, size_t align) {
void *result;
#ifdef _MSC_VER
result = _aligned_malloc(size, align);
#else
if(posix_memalign(&result, align, size)) result = 0;
#endif
return result;
}
inline void aligned_free(void *ptr) {
#ifdef _MSC_VER
_aligned_free(ptr);
#else
free(ptr);
#endif
}
void default_loop(float *destination, const unsigned short* source, float value, int size){
float factor = 1.0f/value;
for (int i = 0; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void default_loop_parallel(float *destination, const unsigned short* source, float value, int size){
float factor = 1.0f / value;
#pragma omp parallel for
for (int i = 0; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vec8_loop(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 8) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 4);
}
}
void vec8_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 16) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 4);
Vec8us vi_new = Vec8us().load(source + i + 8);
Vec4ui vi2 = extend_low(vi_new);
Vec4ui vi3 = extend_high(vi_new);
Vec4f vf2 = to_float(vi2);
Vec4f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 8);
vf3.store(destination + i + 12);
}
}
void vec8_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 8) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 4);
}
}
void vec8_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 16) {
Vec8us vi = Vec8us().load(source + i);
Vec4ui vi0 = extend_low(vi);
Vec4ui vi1 = extend_high(vi);
Vec4f vf0 = to_float(vi0);
Vec4f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 4);
Vec8us vi_new = Vec8us().load(source + i + 8);
Vec4ui vi2 = extend_low(vi_new);
Vec4ui vi3 = extend_high(vi_new);
Vec4f vf2 = to_float(vi2);
Vec4f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 8);
vf3.store(destination + i + 12);
}
}
void vec16_loop(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 16) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 8);
}
}
void vec16_loop_unroll2(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
for (int i = 0; i < size; i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
}
void vec16_loop_unroll2_fix(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
int i = 0;
for (; i <ROUND_DOWN(size, 32); i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
for (; i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vec16_loop_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 16) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i);
vf1.store(destination + i + 8);
}
}
void vec16_loop_unroll2_parallel(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < size; i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
}
void vec16_loop_unroll2_parallel_fix(float *destination, const unsigned short* source, float value, int size) {
float factor= 1.0f/value;
int i = 0;
#pragma omp parallel for
for (int i=0; i <ROUND_DOWN(size, 32); i += 32) {
Vec16us vi = Vec16us().load(source + i);
Vec8ui vi0 = extend_low(vi);
Vec8ui vi1 = extend_high(vi);
Vec8f vf0 = to_float(vi0);
Vec8f vf1 = to_float(vi1);
vf0*=factor;
vf1*=factor;
vf0.store(destination + i + 0);
vf1.store(destination + i + 8);
Vec16us vi_new = Vec16us().load(source + i + 16);
Vec8ui vi2 = extend_low(vi_new);
Vec8ui vi3 = extend_high(vi_new);
Vec8f vf2 = to_float(vi2);
Vec8f vf3 = to_float(vi3);
vf2*=factor;
vf3*=factor;
vf2.store(destination + i + 16);
vf3.store(destination + i + 24);
}
for(int i = ROUND_DOWN(size, 32); i < size; i++) {
int value = source[i];
destination[i] = value*factor;
}
}
void vectorize8_unroll1(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 8)
{
// Load 8 16-bit ushorts.
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
// Split into two registers
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
// Convert to 32-bit integers
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
// Convert to float
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
// Multiply
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
// Store
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
void vectorize8_unroll1_parallel(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
#pragma omp parallel for
for (int i = 0; i < size; i += 8)
{
// Load 8 16-bit ushorts.
__m128i vi = _mm_load_si128((const __m128i*)(source + i));
// Convert to 32-bit integers
__m128i vi0 = _mm_cvtepu16_epi32(vi);
__m128i vi1 = _mm_cvtepu16_epi32(_mm_unpackhi_epi64(vi,vi));
// Convert to float
__m128 vf0 = _mm_cvtepi32_ps(vi0);
__m128 vf1 = _mm_cvtepi32_ps(vi1);
// Multiply
vf0 = _mm_mul_ps(vf0,factor);
vf1 = _mm_mul_ps(vf1,factor);
// Store
_mm_store_ps(destination + i + 0,vf0);
_mm_store_ps(destination + i + 4,vf1);
}
}
void vectorize8_unroll2_parallel(float *destination,const unsigned short* source,float value,int size){
__m128 factor = _mm_set1_ps(1.0f / value);
#pragma omp parallel for
for (int i = 0; i < size; i += 16)
{
__m128i a0 = _mm_load_si128((const __m128i*)(source + i + 0));
__m128i a1 = _mm_load_si128((const __m128i*)(source + i + 8));
// Split into two registers
__m128i b0 = _mm_unpackhi_epi64(a0,a0);
__m128i b1 = _mm_unpackhi_epi64(a1,a1);
// Convert to 32-bit integers
a0 = _mm_cvtepu16_epi32(a0);
b0 = _mm_cvtepu16_epi32(b0);
a1 = _mm_cvtepu16_epi32(a1);
b1 = _mm_cvtepu16_epi32(b1);
// Convert to float
__m128 c0 = _mm_cvtepi32_ps(a0);
__m128 d0 = _mm_cvtepi32_ps(b0);
__m128 c1 = _mm_cvtepi32_ps(a1);
__m128 d1 = _mm_cvtepi32_ps(b1);
// Multiply
c0 = _mm_mul_ps(c0,factor);
d0 = _mm_mul_ps(d0,factor);
c1 = _mm_mul_ps(c1,factor);
d1 = _mm_mul_ps(d1,factor);
// Store
_mm_store_ps(destination + i + 0,c0);
_mm_store_ps(destination + i + 4,d0);
_mm_store_ps(destination + i + 8,c1);
_mm_store_ps(destination + i + 12,d1);
}
}
void copy_arrays(float* a, float*b, const int size) {
for(int i=0; i<size; i++) {
b[i] = a[i];
}
}
float compare_arrays(float* a, float*b, const int size) {
float sum = 0;
for(int i=0; i<size; i++) {
float diff = a[i] - b[i];
if(diff!=0) {
printf("i %d, a[i] %f, b[i] %f, diff %f\n", i, a[i], b[i], diff);
break;
}
sum += diff;
}
return sum;
}
void randomize_array(unsigned short* a, const int size) {
for(int i=0; i<size; i++) {
float r = (float)rand()/RAND_MAX;
a[i] = (int)(65536*r);
}
}
void run(int size, int iterations) {
int rd = ROUND_DOWN(size, 32);
int size2 = rd == size ? size : rd + 32;
float value = 1.1f;
printf("size %d, size2 %d, iterations %d\n", size, size2, iterations);
unsigned short* source = (unsigned short*)aligned_malloc(size2*sizeof(short), 16);
float* destination = (float*)aligned_malloc(size2*sizeof(float), 16);
float* destination_old = (float*)aligned_malloc(size2*sizeof(float), 16);
float* destination_ref = (float*)aligned_malloc(size2*sizeof(float), 16);
void (*fp[16])(float *destination, const unsigned short* source, float value, int size);
fp[0] = default_loop;
fp[1] = vec8_loop;
fp[2] = vec8_loop_unroll2;
fp[3] = vec16_loop;
fp[4] = vec16_loop_unroll2;
fp[5] = vec16_loop_unroll2_fix;
fp[6] = vectorize8_unroll1;
fp[7] = vectorize8_unroll2;
fp[8] = default_loop_parallel;
fp[9] = vec8_loop_parallel;
fp[10] = vec8_loop_unroll2_parallel;
fp[11] = vec16_loop_parallel;
fp[12] = vec16_loop_unroll2_parallel;
fp[13] = vec16_loop_unroll2_parallel_fix;
fp[14] = vectorize8_unroll1_parallel;
fp[15] = vectorize8_unroll2_parallel;
char* func_str[] = {"default_loop", "vec8_loop", "vec8_loop_unrool2", "vec16_loop", "vec16_loop_unroll2", "vec16_loop_unroll2_fix", "vectorize8_unroll1", "vectorize8_unroll2",
"default_loop_parallel", "vec8_loop_parallel", "vec8_loop_unroll2_parallel","vec16_loop_parallel", "vec16_loop_unroll2_parallel", "vec16_loop_unroll2_parallel_fix",
"vectorize8_unroll1_parallel", "vectorize8_unroll2_parallel"};
randomize_array(source, size2);
copy_arrays(destination_old, destination_ref, size);
fp[0](destination_ref, source, value, size);
for(int i=0; i<16; i++) {
copy_arrays(destination_old, destination, size);
double dtime = omp_get_wtime();
for (int it = 0; it < iterations; it++){
fp[i](destination, source, value, size);
}
dtime = omp_get_wtime() - dtime;
float diff = compare_arrays(destination, destination_ref, size);
printf("%40s time: %.3f seconds, diff %f\n", func_str[i], dtime, diff);
}
printf("\n");
aligned_free(source);
aligned_free(destination);
aligned_free(destination_old);
aligned_free(destination_ref);
}
int main() {
run(8008, 1000000);
run(64000, 100000);
run(2560*1920, 1000);
}
Results using GCC on a system with AVX. GCC auto-vectorizes the loop (Visual Studio fails to vectorize because of the short element type, but succeeds if you try int). You gain very little with hand-written vectorization code. However, using multiple threads can help, depending on the array size. For the small array size 8008, OpenMP gives a worse result. For the larger array size 128000, OpenMP gives much better results. For the largest array size 4915200, the loop is entirely memory bound and OpenMP does not help.
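As an aside on the MSVC point above, here is a minimal sketch of what "works if you try int" presumably refers to: the same scalar loop, but reading from an int buffer instead of unsigned short. The function name default_loop_int is hypothetical and not part of the benchmark above.
void default_loop_int(float *destination, const int *source, float value, int size) {
    float factor = 1.0f / value;
    for (int i = 0; i < size; i++) {
        destination[i] = source[i] * factor;  // int -> float conversion, which MSVC reportedly vectorizes
    }
}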
i7-2600k @ 4.4GHz
size 8008, size2 8032, iterations 1000000
default_loop time: 1.319 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 1.167 seconds, diff 0.000000
vectorize8_unroll2 time: 1.227 seconds, diff 0.000000
vec16_loop_unroll2_parallel time: 1.528 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 1.381 seconds, diff 0.000000
size 128000, size2 128000, iterations 100000
default_loop time: 2.902 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.838 seconds, diff 0.000000
vectorize8_unroll2 time: 2.844 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 0.706 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 0.672 seconds, diff 0.000000
size 4915200, size2 4915200, iterations 1000
default_loop time: 2.313 seconds, diff 0.000000
vec16_loop_unroll2_fix time: 2.309 seconds, diff 0.000000
vectorize8_unroll2 time: 2.318 seconds, diff 0.000000
vec16_loop_unroll2_parallel_fix time: 2.353 seconds, diff 0.000000
vectorize8_unroll2_parallel time: 2.349 seconds, diff 0.000000
Using SSE intrinsics on my machine [quad-core Athlon, 3.3 GHz, 16 GB of RAM] with g++ -O2 optimisation [1] gives about a 2.5-3x speed-up. I also wrote a function to do the same thing in inline assembler, but it is not noticeably faster (again, this applies to my machine; feel free to run it on other machines).
I tried a variety of sizes of H * W, and they all give approximately the same results.
[1] Using g++ -O3 gives the same time for all four functions, as apparently -O3 automatically vectorises the code. So the whole thing was a bit of a waste of time, assuming your compiler supports similar auto-vectorisation functionality.
Results
convert_naive sum=4373.98 t=7034751 t/n=7.03475
convert_naive sum=4373.98 t=7266738 t/n=7.26674
convert_naive sum=4373.98 t=7006154 t/n=7.00615
convert_naive sum=4373.98 t=6815329 t/n=6.81533
convert_naive sum=4373.98 t=6820318 t/n=6.82032
convert_unroll4 sum=4373.98 t=8103193 t/n=8.10319
convert_unroll4 sum=4373.98 t=7276156 t/n=7.27616
convert_unroll4 sum=4373.98 t=7028181 t/n=7.02818
convert_unroll4 sum=4373.98 t=7074258 t/n=7.07426
convert_unroll4 sum=4373.98 t=7081518 t/n=7.08152
convert_sse_intrinsic sum=4373.98 t=3377290 t/n=3.37729
convert_sse_intrinsic sum=4373.98 t=3227018 t/n=3.22702
convert_sse_intrinsic sum=4373.98 t=3007898 t/n=3.0079
convert_sse_intrinsic sum=4373.98 t=3253366 t/n=3.25337
convert_sse_intrinsic sum=4373.98 t=5576068 t/n=5.57607
convert_sse_inlineasm sum=4373.98 t=3470887 t/n=3.47089
convert_sse_inlineasm sum=4373.98 t=2838492 t/n=2.83849
convert_sse_inlineasm sum=4373.98 t=2828556 t/n=2.82856
convert_sse_inlineasm sum=4373.98 t=2789052 t/n=2.78905
convert_sse_inlineasm sum=4373.98 t=3176522 t/n=3.17652
Code
#include <iostream>
#include <iomanip>
#include <cstdlib>
#include <cstring>
#include <xmmintrin.h>
#include <emmintrin.h>
#define W 1000
#define H 1000
static __inline__ unsigned long long rdtsc(void)
{
unsigned hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 );
}
void convert_naive(short *source, float *destination)
{
float factor= 1.0f/32767;
for (int i = 0; i < W*H; i++)
{
int value = source[i];
destination[i] = value*factor;
}
}
void convert_unroll4(short *source, float *destination)
{
float factor= 1.0f/32767;
for (int i = 0; i < W*H; i+=4)
{
int v1 = source[i];
int v2 = source[i+1];
int v3 = source[i+2];
int v4 = source[i+3];
destination[i] = v1*factor;
destination[i+1] = v2*factor;
destination[i+2] = v3*factor;
destination[i+3] = v4*factor;
}
}
void convert_sse_intrinsic(short *source, float *destination)
{
__m128 factor = { 1.0f/32767, 1.0f/32767, 1.0f/32767, 1.0f/32767 };
__m64 zero1 = { 0,0 };
__m128i zero2 = { 0,0 };
__m64 *ps = reinterpret_cast<__m64 *>(source);
__m128 *pd = reinterpret_cast<__m128 *>(destination);
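// Each iteration handles 4 shorts: widen to 32-bit lanes (the shift pair below sign-extends), convert to float, scale by factor.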
for (int i = 0; i < W*H; i+=4)
{
__m128i value = _mm_unpacklo_epi16(_mm_set_epi64(zero1, *ps), zero2);
value = _mm_srai_epi32(_mm_slli_epi32(value, 16), 16);
__m128 fval = _mm_cvtepi32_ps(value);
*pd = _mm_mul_ps(fval, factor); // destination[0,1,2,3] = value[0,1,2,3] * factor;
pd++;
ps++;
}
}
void convert_sse_inlineasm(short *source, float *destination)
{
__m128 factor = { 1.0f/32767, 1.0f/32767, 1.0f/32767, 1.0f/32767 };
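// Processes 16 shorts per iteration: four 8-byte loads at source + rax, each widened, sign-extended, converted and scaled, then stored as four 16-byte float vectors at destination + rax*2.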
__asm__ __volatile__(
"\t pxor %%xmm1, %%xmm1\n"
"\t movaps %3, %%xmm2\n"
"\t mov $0, %%rax\n"
"1:"
"\t movq (%1, %%rax), %%xmm0\n"
"\t movq 8(%1, %%rax), %%xmm3\n"
"\t movq 16(%1, %%rax), %%xmm4\n"
"\t movq 24(%1, %%rax), %%xmm5\n"
"\t punpcklwd %%xmm1, %%xmm0\n"
"\t pslld $16, %%xmm0\n"
"\t psrad $16, %%xmm0\n"
"\t cvtdq2ps %%xmm0, %%xmm0\n"
"\t mulps %%xmm2, %%xmm0\n"
"\t punpcklwd %%xmm1, %%xmm3\n"
"\t pslld $16, %%xmm3\n"
"\t psrad $16, %%xmm3\n"
"\t cvtdq2ps %%xmm3, %%xmm3\n"
"\t mulps %%xmm2, %%xmm3\n"
"\t punpcklwd %%xmm1, %%xmm4\n"
"\t pslld $16, %%xmm4\n"
"\t psrad $16, %%xmm4\n"
"\t cvtdq2ps %%xmm4, %%xmm4\n"
"\t mulps %%xmm2, %%xmm4\n"
"\t punpcklwd %%xmm1, %%xmm5\n"
"\t pslld $16, %%xmm5\n"
"\t psrad $16, %%xmm5\n"
"\t cvtdq2ps %%xmm5, %%xmm5\n"
"\t mulps %%xmm2, %%xmm5\n"
"\t movaps %%xmm0, (%0, %%rax, 2)\n"
"\t movaps %%xmm3, 16(%0, %%rax, 2)\n"
"\t movaps %%xmm4, 32(%0, %%rax, 2)\n"
"\t movaps %%xmm5, 48(%0, %%rax, 2)\n"
"\t addq $32, %%rax\n"
"\t cmpq %2, %%rax\n"
"\t jbe 1b\n"
: /* no outputs */
: "r" (destination), "r" (source), "i"(sizeof(*source) * H * W), "m"(factor):
"rax", "xmm0", "xmm1", "xmm3");
}
short inbuffer[W * H] __attribute__ ((aligned (16)));
float outbuffer[W * H + 16] __attribute__ ((aligned (16)));
#ifdef DEBUG
float outbuffer2[W * H];
#endif
typedef void (*func)(short *source, float *destination);
struct BmEntry
{
const char *name;
func fn;
};
void bm(BmEntry& e)
{
memset(outbuffer, 0, sizeof(outbuffer));
unsigned long long t = rdtsc();
e.fn(inbuffer, outbuffer);
t = rdtsc() - t;
float sum = 0;
for(int i = 0; i < W * H; i++)
{
sum += outbuffer[i];
}
#if DEBUG
convert_naive(inbuffer, outbuffer2);
for(int i = 0; i < W * H; i++)
{
if (outbuffer[i] != outbuffer2[i])
{
std::cout << i << ":: " << inbuffer[i] << ": "
<< outbuffer[i] << " != " << outbuffer2[i]
<< std::endl;
}
}
#endif
std::cout << std::left << std::setw(30) << e.name << " sum=" << sum << " t=" << t <<
" t/n=" << (double)t / (W * H) << std::endl;
}
#define BM(x) { #x, x }
BmEntry table[] =
{
BM(convert_naive),
BM(convert_unroll4),
BM(convert_sse_intrinsic),
BM(convert_sse_inlineasm),
};
int main()
{
for(int i = 0; i < W * H; i++)
{
inbuffer[i] = (short)i;
}
for(int i = 0; i < sizeof(table)/sizeof(table[0]); i++)
{
for(int j = 0; j < 5; j++)
bm(table[i]);
}
return 0;
}
Not sure whether the condition expression in the loop is evaluated only once.
You can try:
float factor= 1.0f/value;
for (int i = 0, count = W*H; i < count; ++i)//25% of time is spent doing this
{
int value = source[i];//short -> int
destination[i] = value*factor;//int->float
}
This is not a valid answer, so don't take it as one, but I'm actually wondering how the code would behave using a 256 KB look-up table (basically a 'short to float' table with 65536 entries).
A Core i7 has about 8 megabytes of cache, I believe, so the look-up table would fit in the data cache.
I really wonder how that would impact the performance :)
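A rough sketch of that idea, just for illustration (the function name convert_lut is made up, and filling the table on every call assumes the cost is amortised over a large array):
void convert_lut(float *destination, const unsigned short *source, float value, int size)
{
    static float lut[65536];              // 256 KB: one float per possible ushort value
    float factor = 1.0f / value;
    for (int v = 0; v < 65536; v++)       // fill cost is amortised over the array conversion
        lut[v] = v * factor;
    for (int i = 0; i < size; i++)
        destination[i] = lut[source[i]];  // one table read per element
}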
You can use OpenMP to put every core of your CPU to work, and it is simple; just do the following:
#include <omp.h>
float factor= 1.0f/value;
#pragma omp parallel for
for (int i = 0; i < W*H; i++)//25% of time is spent doing this
{
int value = source[i];//ushort -> int
destination[i] = value*factor;//int*float->float
}
Here is the result based on the previous program; just add the pragma like this:
#pragma omp parallel for
for (int it = 0; it < iterations; it++){
...
}
and then here is the result:
beta@beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -fopenmp
beta@beta-PC ~
$ opt
0.748
2.90873e+007
0.484
2.90873e+007
0.796
2.90873e+007
beta@beta-PC ~
$ g++ -o opt.exe opt.c -msse4.1 -O3
beta@beta-PC ~
$ opt
1.404
2.90873e+007
1.404
2.90873e+007
1.404
2.90873e+007
The result shows a 100% improvement with OpenMP. Visual C++ supports OpenMP too.
You could try to approximate the expression
float factor = 1.0f/value;
by a fraction numerator/denominator, where both numerator and denominator are ints. This can be done to the precision you need in your application, like
int denominator = 10000;
int numerator = factor * denominator;
Then you can do your computation in integer arithmetic, like:
int value = source[i];
destination[i] = (value * numerator) / denominator;
You have to take care of overflows; perhaps you need to switch to long (or even long long on 64-bit systems) for the calculation.
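A hypothetical sketch that pulls these fragments together, using a power-of-two denominator so the division becomes a shift, and a 64-bit intermediate to avoid the overflow mentioned above. Note that the result is truncated to a whole number, so this is only usable if that precision really is acceptable for the application:
void convert_fixed_point(float *destination, const unsigned short *source, float value, int size)
{
    const int shift = 16;                                            // denominator = 2^16
    const long long numerator = (long long)((1.0 / value) * (1LL << shift));
    for (int i = 0; i < size; i++)
    {
        long long v = source[i];
        destination[i] = (float)((v * numerator) >> shift);          // integer multiply, then shift instead of divide
    }
}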