VC++ optimization double loads from one _mm256_loadu_ps intrinsic

VC++ optimization double loads from one _mm256_loadu_ps intrinsic - c++

Here's a source:
// Same as multiplyComplex, multiplies 4 numbers
__forceinline __m256 multiplyComplex_x4( const __m256 x, const __m256 y )
{
// If the inputs are [ a, b ] and [ c, d ] the formula is [ ac - bd, ad + bc ]
const __m256 x2 = _mm256_movehdup_ps( x ); // [ b, b ]
const __m256 yRev = _mm256_permute_ps( y, shuffleMask_rev64q ); // [ d, c ]
const __m256 prod2 = _mm256_mul_ps( x2, yRev ); // [ bd, bc ]
const __m256 x1 = _mm256_moveldup_ps( x ); // [ a, a ]
return _mm256_fmaddsub_ps( x1, y, prod2 );
}
// Same as fftMainLoop, handles 4 complex numbers
__forceinline void fftMainLoop_x4( const __m256 omega, complex* a1c, complex* a2c )
{
const __m256 a1 = _mm256_loadu_ps( (const float*)a1c );
const __m256 a2 = _mm256_loadu_ps( (const float*)a2c );
const __m256 product = multiplyComplex_x4( omega, a2 );
_mm256_storeu_ps( (float*)a1c, _mm256_add_ps( a1, product ) );
_mm256_storeu_ps( (float*)a2c, _mm256_sub_ps( a1, product ) );
}
Here's the assembly made by VC++ 2017, the complete loop that calls that function:
00007FF7CC3F11B0 lea eax,[r8+4]
00007FF7CC3F11B4 mov ecx,r8d
00007FF7CC3F11B7 vpermilps ymm0,ymmword ptr [rdi+rax*8],0B1h
00007FF7CC3F11BE vmulps ymm2,ymm0,ymm4
00007FF7CC3F11C2 vfmaddsub231ps ymm2,ymm5,ymmword ptr [rdi+rax*8]
00007FF7CC3F11C8 vmovups ymm3,ymmword ptr [rdi+rcx*8]
00007FF7CC3F11CD vaddps ymm0,ymm2,ymm3
00007FF7CC3F11D1 add r8d,8
00007FF7CC3F11D5 vsubps ymm1,ymm3,ymm2
00007FF7CC3F11D9 vmovups ymmword ptr [rdi+rcx*8],ymm0
00007FF7CC3F11DE vmovups ymmword ptr [rdi+rax*8],ymm1
00007FF7CC3F11E3 cmp r8d,esi
00007FF7CC3F11E6 jb fft_run+1B0h (07FF7CC3F11B0h)
The code reads from memory 50% more often than I would like it to.
The two instructions
vpermilps ymm0,ymmword ptr [rdi+rax*8],0B1h
and
vfmaddsub231ps ymm2,ymm5,ymmword ptr [rdi+rax*8]
both access the same memory address.
Is there a way to convince the compiler to emit vmovups for my _mm256_loadu_ps, instead of trying to merge the load?

Related

C++ Optimize Memory Read Speed

I'm creating an int (32 bit) vector with 1024 * 1024 * 1024 elements like so:
std::vector<int> nums;
for (size_t i = 0; i < 1024 * 1024 * 1024; i++) {
nums.push_back(rand() % 1024);
}
which holds 4 GB of random data at that point. And then I'm simply summing up all the elements in the vector like so:
uint64_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn++) {
total += *cn;
}
This takes about ~0.18 seconds which means the data is processed at around 22.2 GB/s. I'm running this on an M1 with a much higher memory bandwidth of about 60GB/s. Is there a way to make the above code run faster on a single core?
EDIT:
Manual SIMD version:
int32x4_t simd_total = vmovq_n_s32(0);
for (auto cn = nums.begin(); cn < nums.end()-3; cn +=4) {
const int32_t v[4] = {cn[0], cn[1], cn[2], cn[3]}
simd_total = vaddq_s32(simd_total, vld1q_s32(v));
}
return vaddvq_s32(simd_total);
The SIMD version has the same performance as the non-manual-SIMD version.
EDIT 2:
Alright, so I changed the vector elements to uint32_t and also changed the result type to uint32_t(as suggested by #Peter Cordes):
uint32_t sum_ints_32(const std::vector<uint32_t>& nums) {
uint32_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn++) {
total += *cn;
}
return total;
}
This runs much faster (~45 GB/s). This is the disassembly:
0000000100002218 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE>:
100002218: a940200c ldp x12, x8, [x0]
10000221c: eb08019f cmp x12, x8
100002220: 54000102 b.cs 100002240 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x28> // b.hs, b.nlast
100002224: aa2c03e9 mvn x9, x12
100002228: 8b090109 add x9, x8, x9
10000222c: f1006d3f cmp x9, #0x1b
100002230: 540000c8 b.hi 100002248 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x30> // b.pmore
100002234: 52800000 mov w0, #0x0 // #0
100002238: aa0c03e9 mov x9, x12
10000223c: 14000016 b 100002294 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x7c>
100002240: 52800000 mov w0, #0x0 // #0
100002244: d65f03c0 ret
100002248: d342fd29 lsr x9, x9, #2
10000224c: 9100052a add x10, x9, #0x1
100002250: 927ded4b and x11, x10, #0x7ffffffffffffff8
100002254: 8b0b0989 add x9, x12, x11, lsl #2
100002258: 9100418c add x12, x12, #0x10
10000225c: 6f00e400 movi v0.2d, #0x0
100002260: aa0b03ed mov x13, x11
100002264: 6f00e401 movi v1.2d, #0x0
100002268: ad7f8d82 ldp q2, q3, [x12, #-16]
10000226c: 4ea08440 add v0.4s, v2.4s, v0.4s
100002270: 4ea18461 add v1.4s, v3.4s, v1.4s
100002274: 9100818c add x12, x12, #0x20
100002278: f10021ad subs x13, x13, #0x8
10000227c: 54ffff61 b.ne 100002268 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x50> // b.any
100002280: 4ea08420 add v0.4s, v1.4s, v0.4s
100002284: 4eb1b800 addv s0, v0.4s
100002288: 1e260000 fmov w0, s0
10000228c: eb0b015f cmp x10, x11
100002290: 540000a0 b.eq 1000022a4 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x8c> // b.none
100002294: b840452a ldr w10, [x9], #4
100002298: 0b000140 add w0, w10, w0
10000229c: eb08013f cmp x9, x8
1000022a0: 54ffffa3 b.cc 100002294 <__Z11sum_ints_32RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x7c> // b.lo, b.ul, b.last
1000022a4: d65f03c0 ret
I also rewrote the Manual-SIMD version:
uint32_t sum_ints_simd_2(const std::vector<uint32_t>& nums) {
uint32x4_t simd_total = vmovq_n_u32(0);
for (auto cn = nums.begin(); cn < nums.end()-3; cn +=4) {
const uint32_t v[4] = { cn[0], cn[1], cn[2], cn[3] };
simd_total = vaddq_u32(simd_total, vld1q_u32(v));
}
return vaddvq_u32(simd_total);
}
which still runs 2x slower than the non-manual-SIMD version and results in the following disassembly:
0000000100002464 <__Z15sum_ints_simd_2RKNSt3__16vectorIjNS_9allocatorIjEEEE>:
100002464: a9402408 ldp x8, x9, [x0]
100002468: d1003129 sub x9, x9, #0xc
10000246c: 6f00e400 movi v0.2d, #0x0
100002470: eb09011f cmp x8, x9
100002474: 540000c2 b.cs 10000248c <__Z15sum_ints_simd_2RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x28> // b.hs, b.nlast
100002478: 6f00e400 movi v0.2d, #0x0
10000247c: 3cc10501 ldr q1, [x8], #16
100002480: 4ea08420 add v0.4s, v1.4s, v0.4s
100002484: eb09011f cmp x8, x9
100002488: 54ffffa3 b.cc 10000247c <__Z15sum_ints_simd_2RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x18> // b.lo, b.ul, b.last
10000248c: 4eb1b800 addv s0, v0.4s
100002490: 1e260000 fmov w0, s0
100002494: d65f03c0 ret
To reach the same speed as the auto-vectorized version, we can use a uint32x4x2 instead of uint32x4 for our manual-SIMD version:
uint32_t sum_ints_simd_3(const std::vector<uint32_t>& nums) {
uint32x4x2_t simd_total;
simd_total.val[0] = vmovq_n_u32(0);
simd_total.val[1] = vmovq_n_u32(0);
for (auto cn = nums.begin(); cn < nums.end()-7; cn +=8) {
const uint32_t v[4] = { cn[0], cn[1], cn[2], cn[3] };
const uint32_t v2[4] = { cn[4], cn[5], cn[6], cn[7] };
simd_total.val[0] = vaddq_u32(simd_total.val[0], vld1q_u32(v));
simd_total.val[1] = vaddq_u32(simd_total.val[1], vld1q_u32(v2));
}
return vaddvq_u32(simd_total.val[0]) + vaddvq_u32(simd_total.val[1]);
}
And to gain even more speed we can leverage uint32x4x4 (which gets us about ~53 GB/s):
uint32_t sum_ints_simd_4(const std::vector<uint32_t>& nums) {
uint32x4x4_t simd_total;
simd_total.val[0] = vmovq_n_u32(0);
simd_total.val[1] = vmovq_n_u32(0);
simd_total.val[2] = vmovq_n_u32(0);
simd_total.val[3] = vmovq_n_u32(0);
for (auto cn = nums.begin(); cn < nums.end()-15; cn +=16) {
const uint32_t v[4] = { cn[0], cn[1], cn[2], cn[3] };
const uint32_t v2[4] = { cn[4], cn[5], cn[6], cn[7] };
const uint32_t v3[4] = { cn[8], cn[9], cn[10], cn[11] };
const uint32_t v4[4] = { cn[12], cn[13], cn[14], cn[15] };
simd_total.val[0] = vaddq_u32(simd_total.val[0], vld1q_u32(v));
simd_total.val[1] = vaddq_u32(simd_total.val[1], vld1q_u32(v2));
simd_total.val[2] = vaddq_u32(simd_total.val[2], vld1q_u32(v3));
simd_total.val[3] = vaddq_u32(simd_total.val[3], vld1q_u32(v4));
}
return vaddvq_u32(simd_total.val[0])
+ vaddvq_u32(simd_total.val[1])
+ vaddvq_u32(simd_total.val[2])
+ vaddvq_u32(simd_total.val[3]);
}
which gets us the following disassembly:
0000000100005e34 <__Z15sum_ints_simd_4RKNSt3__16vectorIjNS_9allocatorIjEEEE>:
100005e34: a9402408 ldp x8, x9, [x0]
100005e38: d100f129 sub x9, x9, #0x3c
100005e3c: 6f00e403 movi v3.2d, #0x0
100005e40: 6f00e402 movi v2.2d, #0x0
100005e44: 6f00e401 movi v1.2d, #0x0
100005e48: 6f00e400 movi v0.2d, #0x0
100005e4c: eb09011f cmp x8, x9
100005e50: 540001c2 b.cs 100005e88 <__Z15sum_ints_simd_4RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x54> // b.hs, b.nlast
100005e54: 6f00e400 movi v0.2d, #0x0
100005e58: 6f00e401 movi v1.2d, #0x0
100005e5c: 6f00e402 movi v2.2d, #0x0
100005e60: 6f00e403 movi v3.2d, #0x0
100005e64: ad401504 ldp q4, q5, [x8]
100005e68: ad411d06 ldp q6, q7, [x8, #32]
100005e6c: 4ea38483 add v3.4s, v4.4s, v3.4s
100005e70: 4ea284a2 add v2.4s, v5.4s, v2.4s
100005e74: 4ea184c1 add v1.4s, v6.4s, v1.4s
100005e78: 4ea084e0 add v0.4s, v7.4s, v0.4s
100005e7c: 91010108 add x8, x8, #0x40
100005e80: eb09011f cmp x8, x9
100005e84: 54ffff03 b.cc 100005e64 <__Z15sum_ints_simd_4RKNSt3__16vectorIjNS_9allocatorIjEEEE+0x30> // b.lo, b.ul, b.last
100005e88: 4eb1b863 addv s3, v3.4s
100005e8c: 1e260068 fmov w8, s3
100005e90: 4eb1b842 addv s2, v2.4s
100005e94: 1e260049 fmov w9, s2
100005e98: 0b080128 add w8, w9, w8
100005e9c: 4eb1b821 addv s1, v1.4s
100005ea0: 1e260029 fmov w9, s1
100005ea4: 0b090108 add w8, w8, w9
100005ea8: 4eb1b800 addv s0, v0.4s
100005eac: 1e260009 fmov w9, s0
100005eb0: 0b090100 add w0, w8, w9
100005eb4: d65f03c0 ret
Crazy stuff

Does -march=native help? IDK if there are any SIMD features that Apple clang won't already take advantage on the first generation of AArch64 MacOS CPUs, but clang might just be taking baseline AArch64 in general.
Can you go faster if you use uint32_t sums, so the compiler doesn't have to widen each element before adding? That means each SIMD instruction can only handle half as much data from memory as with same-sized accumulators.
https://godbolt.org/z/7c19913jE shows that Thomas Matthews' unrolling suggestion does actually get clang11 -O3 -march=apple-a13 to unroll the SIMD-vectorized asm loops it makes. That source change is not a win in general, e.g. much worse for x86-64 clang -O3 -march=haswell, but it does help here.
Another possibility is that a single core can't saturate memory bandwidth. But benchmark results published by Anandtech for example seem to rule that out: they found that even a single core can achieve 59GB/s, although that was probably running an optimize memcpy function.
(They say The fact that a single Firestorm core can almost saturate the memory controllers is astounding and something we’ve never seen in a design before. That sounds a bit weird; desktop / laptop Intel CPUs come pretty close, unlike their "server" chips. Maybe not as close as Apple?
M1 has pretty low memory latency compared to modern x86, so that probably helps a single core be able to track the incoming loads to keep the necessary latency x bandwidth product in flight, even with its high memory bandwidth.

Here are some techniques.
Loop Unrolling
uint64_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn += 4)
{
total += cn[0];
total += cn[1];
total += cn[2];
total += cn[3];
}
Register Prefetch
uint64_t total = 0;
for (auto cn = nums.begin(); cn < nums.end(); cn += 4)
{
const uint64 n0 = cn[0];
const uint64 n1 = cn[1];
const uint64 n2 = cn[2];
const uint64 n3 = cn[3];
total += n0;
total += n1;
total += n2;
total += n3;
}
You should print the assembly language for each of these at high optimization level and compare them.
Also, your processor may have some specialized instructions that you could. For example, the ARM processor can load multiple registers from memory with one instruction.
Also, look up SIMD instructions or search the internet for "C++ SIMD read memory".
I've argued with compilers (on embedded systems) and found out that the compiler's optimization strategies may be better or equal to instruction specialization or other techniques (timings were performed using Test Points and oscilloscope).
You'll have to remember that your task, on a one core machine, will most likely be swapped out more often that with a system with multiple cores or a specialized (embedded) system.

Consider precalculating as much as you can and using built-in STL functions, this will lead to as much optimal code as possible before trying SIMD or assembly approaches. If it's still too slow, then try the SIMD/assembly versions:
Avoid calling push_back on unreserved std::vectors: this causes the system to allocate more space when the capacity limit is reached. Since you know the size of the array before hand, reserve the space ahead of time: (for non-built-in types, consider emplace_back as well).
Additionally, the STL functions can reduce the boilerplate code down to two function calls.
Also, avoid rand().
const std::size_t GB = 1024 * 1024 * 1024;
std::vector<int> nums(4 * GB);
std::generate(std::begin(nums), std::end(nums), [](){ return rand() % 1024; });
//...
const auto sum = std::accumulate(std::begin(nums), std::end(nums), 0);

Efficiently compute absolute values of std::complex<float> vector with AVX2

For some real-time DSP application I need to compute the absolute values of a complex valued vector.
The straightforward implementation would look like that
computeAbsolute (std::complex<float>* complexSourceVec,
float* realValuedDestinationVec,
int vecLength)
{
for (int i = 0; i < vecLength; ++i)
realValuedDestinationVec[i] = std::abs (complexSourceVec[i]);
}
I want to replace this implementation with an AVX2 optimized version, based on AVX2 instrincts. What would be the most efficient way to implement it that way?
Note: The source data is handed to me by an API I have no access to, so there is no chance to change the layout of the complex input vector for better efficiency.

Inspired by the answer of Dan M. I first implemented his version with some tweaks:
First changed it to use the wider 256 Bit registers, then marked the temporary re and im arrays with __attribute__((aligned (32))) to be able to use aligned load
void computeAbsolute1 (const std::complex<float>* cplxIn, float* absOut, const int length)
{
for (int i = 0; i < length; i += 8)
{
float re[8] __attribute__((aligned (32))) = {cplxIn[i].real(), cplxIn[i + 1].real(), cplxIn[i + 2].real(), cplxIn[i + 3].real(), cplxIn[i + 4].real(), cplxIn[i + 5].real(), cplxIn[i + 6].real(), cplxIn[i + 7].real()};
float im[8] __attribute__((aligned (32))) = {cplxIn[i].imag(), cplxIn[i + 1].imag(), cplxIn[i + 2].imag(), cplxIn[i + 3].imag(), cplxIn[i + 4].imag(), cplxIn[i + 5].imag(), cplxIn[i + 6].imag(), cplxIn[i + 7].imag()};
__m256 x4 = _mm256_load_ps (re);
__m256 y4 = _mm256_load_ps (im);
__m256 b4 = _mm256_sqrt_ps (_mm256_add_ps (_mm256_mul_ps (x4,x4), _mm256_mul_ps (y4,y4)));
_mm256_storeu_ps (absOut + i, b4);
}
}
However manually shuffling the values this way seemed like a task that could be speeded up somehow. Now this is the solution I came up with, that runs 2 - 3 times faster in a quick test compiled by clang with full optimization:
#include <complex>
#include <immintrin.h>
void computeAbsolute2 (const std::complex<float>* __restrict cplxIn, float* __restrict absOut, const int length)
{
for (int i = 0; i < length; i += 8)
{
// load 8 complex values (--> 16 floats overall) into two SIMD registers
__m256 inLo = _mm256_loadu_ps (reinterpret_cast<const float*> (cplxIn + i ));
__m256 inHi = _mm256_loadu_ps (reinterpret_cast<const float*> (cplxIn + i + 4));
// seperates the real and imaginary part, however values are in a wrong order
__m256 re = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE (2, 0, 2, 0));
__m256 im = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE (3, 1, 3, 1));
// do the heavy work on the unordered vectors
__m256 abs = _mm256_sqrt_ps (_mm256_add_ps (_mm256_mul_ps (re, re), _mm256_mul_ps (im, im)));
// reorder values prior to storing
__m256d ordered = _mm256_permute4x64_pd (_mm256_castps_pd(abs), _MM_SHUFFLE(3,1,2,0));
_mm256_storeu_ps (absOut + i, _mm256_castpd_ps(ordered));
}
}
I think I'll go with that implementation if no one comes up with a faster solution
This compiles efficiently with gcc and clang (on the Godbolt compiler explorer).

It's really hard (if possible) to write "highly optimized AVX2" version of complex abs since the way complex numbers are defined in the standard prevents (specifically due to all inf/nan corner cases) a lot of optimization.
However, if you don't care about the correctness you can just use -ffast-math and some compilers would optimize the code for you. See gcc output: https://godbolt.org/z/QbZlBI
You can also take this output and create your own abs function with inline assembly.
But yes, as was already mentioned, if you really need performance, you probably want to swap std::complex for something else.
I was able to get a decent output for your specific case with all the required shuffles by manually filling small re and im arrays. See: https://godbolt.org/z/sWAAXo
This could be trivially extended for ymm registers.
Anyway, here is the ultimate solution adapted from this SO answer which uses intrinsics in combination with clever compiler optimizations:
#include <complex>
#include <cassert>
#include <immintrin.h>
static inline void cabs_soa4(const float *re, const float *im, float *b) {
__m128 x4 = _mm_loadu_ps(re);
__m128 y4 = _mm_loadu_ps(im);
__m128 b4 = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(x4,x4), _mm_mul_ps(y4,y4)));
_mm_storeu_ps(b, b4);
}
void computeAbsolute (const std::complex<float>* src,
float* realValuedDestinationVec,
int vecLength)
{
for (int i = 0; i < vecLength; i += 4) {
float re[4] = {src[i].real(), src[i + 1].real(), src[i + 2].real(), src[i + 3].real()};
float im[4] = {src[i].imag(), src[i + 1].imag(), src[i + 2].imag(), src[i + 3].imag()};
cabs_soa4(re, im, realValuedDestinationVec);
}
}
which compiles to simple
_Z15computeAbsolutePKSt7complexIfEPfi:
test edx, edx
jle .L5
lea eax, [rdx-1]
shr eax, 2
sal rax, 5
lea rax, [rdi+32+rax]
.L3:
vmovups xmm0, XMMWORD PTR [rdi]
vmovups xmm2, XMMWORD PTR [rdi+16]
add rdi, 32
vshufps xmm1, xmm0, xmm2, 136
vmulps xmm1, xmm1, xmm1
vshufps xmm0, xmm0, xmm2, 221
vfmadd132ps xmm0, xmm1, xmm0
vsqrtps xmm0, xmm0
vmovups XMMWORD PTR [rsi], xmm0
cmp rax, rdi
jne .L3
.L5:
ret
https://godbolt.org/z/Yu64Wg

Floating point maxing out loop doesn't terminate in D, works in C++

I have two similar programs one in C++ and another in D.
The compilation is on on Windows7 64bit, to 64bit binaries.
C++ version, VS 2013:
#include <iostream>
#include <string>
int main(int argc, char* argv[])
{
float eps = 1.0f;
float f = 0.0f;
while (f + eps != f)
f += 1.0f;
std::cout << "eps = " + std::to_string(eps) + ", max_f = " + std::to_string(f) << std::endl;
return 0;
}
D version, DMD v2.066.1:
import std.stdio;
import std.conv;
int main(string[] argv)
{
float eps = 1.0f;
float f = 0.0f;
while (f + eps != f)
f += 1.0f;
writeln("eps = " ~ to!string(eps) ~ ", max_f = " ~ to!string(f));
return 0;
}
C++ version works as expected and finds that f + e == f when f = 16777216.
But D version hungs forever. When I put breakpoint I see that in D version f also 16777216 (after running for some time) and Watch window (I use VisualD) shows that (f + e != f) is 'false' so the loop should be terminate but it's not the case during runtime.
I think assembly could give the answer but I'm not very good with it.
I'm new to D, so it should be the case that I misused the language/compiler (compiled with DMD just as 'dmd test.d' without additional options and also from VS with VisualD with default options). Any ideas what could be wrong with D version of the program? Thanks!
Disassembly:
C++:
000000013F7D1410 mov rax,rsp
000000013F7D1413 push rbp
000000013F7D1414 lea rbp,[rax-5Fh]
000000013F7D1418 sub rsp,0E0h
000000013F7D141F mov qword ptr [rbp+17h],0FFFFFFFFFFFFFFFEh
000000013F7D1427 mov qword ptr [rax+8],rbx
000000013F7D142B movaps xmmword ptr [rax-18h],xmm6
000000013F7D142F xorps xmm1,xmm1
float eps = 1.0f;
float f = 0.0f;
000000013F7D1432 movss xmm6,dword ptr [__real#3f800000 (013F7D67E8h)]
000000013F7D143A nop word ptr [rax+rax]
f += 1.0f;
000000013F7D1440 addss xmm1,xmm6
while (f + eps != f)
000000013F7D1444 movaps xmm0,xmm1
000000013F7D1447 addss xmm0,xmm6
000000013F7D144B ucomiss xmm0,xmm1
000000013F7D144E jp main+30h (013F7D1440h)
000000013F7D1450 jne main+30h (013F7D1440h)
D:
000000013F761002 mov ebp,esp
000000013F761004 sub rsp,50h
{
float eps = 1.0f;
000000013F761008 xor eax,eax
000000013F76100A mov dword ptr [rbp-50h],eax
000000013F76100D movss xmm0,dword ptr [rbp-50h]
000000013F761012 movss dword ptr [f],xmm0
float f = 0.0f;
while (f + eps != f)
f += 1.0f;
000000013F761017 movss xmm1,dword ptr [__NULL_IMPORT_DESCRIPTOR+1138h (013F7C3040h)]
000000013F76101F movss xmm2,dword ptr [f]
000000013F761024 addss xmm2,xmm1
000000013F761028 movss dword ptr [f],xmm2
000000013F76102D fld dword ptr [f]
000000013F761030 fadd dword ptr [__NULL_IMPORT_DESCRIPTOR+1138h (013F7C3040h)]
000000013F761036 fld dword ptr [f]
000000013F761039 fucomip st,st(1)
000000013F76103B fstp st(0)
000000013F76103D jne D main+17h (013F761017h)
000000013F76103F jp D main+17h (013F761017h)
Summary
Accept harold's answer that program behavior is due to the mixed FPU and SSE usage.
Here's a summary what happens in D assembly snippet. In fact the loop will run forever.
SSE behaves strictly according to IEEE-754 when f reaches 16777216.0 and we add 1.0 to this value (f += 1.0f) we still obtain 16777216.0 in xmm2 register, then we store it to memory.
(f + eps != f) expression is computed on the FPU. Since FPU registers have enough precision (f+eps) results in 16777217.0. If we stored this result back to memory into float variable then we would get expected value 16777216.0 (since 16777217.0 is not represented as float). And (f + eps != f) would be 'false' and loop would terminate. But we do not store any numbers back to memory and perform comparison on the FPU (since we have both operands). It means that we compare one number that is computed strictly according to IEEE-754 (f) and another that is computed with 80bit accuracy (f+eps). 16777216.0 != 16777217.0 and the loop runs forever.
I'm not an expert in this area but for me it looks like that doing floating point with SSE instructions is more robust as was demonstrated in C++ version of the program.
Update
I had a discussion on the D forum http://forum.dlang.org/thread/ucnayusylmpvkpcnbhgh#forum.dlang.org
It turned out that program behaves correctly - it's according to the language specification that intermediate calculations can be performed with higher accuracy.
The robust implementation for any D compiler is:
import std.stdio;
int main()
{
const float eps = 1.0f;
const float step = 1.0;
float f = 0.0f;
float fPlusEps = f + eps;
while (f != fPlusEps)
{
f += step;
fPlusEps = f + eps;
}
writeln("eps = ", eps, ", max_f = ", f);
return 0;
}

Mixed FPU and SSE code, that's .. really strange. I see absolutely no reason to implement it this way.
But they have, and the result is that f + eps != f is evaluated with 80bit extended precision, while
f += 1.0f is evaluated using 32bit floats.
That means the loop can never end, since f will stop going up before the value that makes
f + eps != f false (which, in 80bit precision, is huge) is reached.

Trying to break a loop with != or == with floating point values is looking for troubles.
The different behavior is mot likely due to the float to double to 80-bits internal floating point conversion compiler may adopt when passing values to the FPU.
When extending the mantissa, in particular- some compilers or optimizer can decide to let the less significant bit "random" instead of zeroed. So 1.0f, when given to the FPU may become 1.000000000000000000000012134432 that -according to a float- precision, is still 1.0, but wen 1.000000000000000000000012134432 and 1.000000000000000000000089544455 (the two tail are random) are compared by the FPU, look different.
You should verify how C++ and D compiler treat the floating point extension/reduction and eventually configure the appropriate switches: if the two compilers are not from the same manufacturer, thay had probably made different choices for their respective defaults.

SSE Bilinear interpolation

I'm implementing bilinear interpolation in a tight loop and trying to optimize it with SSE, but I get zero speed-up from it.
Here is the code, the non-SIMD version uses a simple vector structure which could be defined as struct Vec3f { float x, y, z; } with implemented multiplication and addition operators:
#ifdef USE_SIMD
const Color c11 = pixelCache[y1 * size.x + x1];
const Color c12 = pixelCache[y2 * size.x + x1];
const Color c22 = pixelCache[y2 * size.x + x2];
const Color c21 = pixelCache[y1 * size.x + x2];
__declspec(align(16)) float mc11[4] = { 1.0, c11.GetB(), c11.GetG(), c11.GetR() };
__declspec(align(16)) float mc12[4] = { 1.0, c12.GetB(), c12.GetG(), c12.GetR() };
__declspec(align(16)) float mc22[4] = { 1.0, c22.GetB(), c22.GetG(), c22.GetR() };
__declspec(align(16)) float mc21[4] = { 1.0, c21.GetB(), c21.GetG(), c21.GetR() };
// scalars in vector form for SSE
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);
__declspec(align(16)) float ms11[4] = {1.0, s11, s11, s11};
__declspec(align(16)) float ms12[4] = {1.0, s12, s12, s12};
__declspec(align(16)) float ms22[4] = {1.0, s22, s22, s22};
__declspec(align(16)) float ms21[4] = {1.0, s21, s21, s21};
__asm {
movaps xmm0, mc11
movaps xmm1, mc12
movaps xmm2, mc22
movaps xmm3, mc21
movaps xmm4, ms11
movaps xmm5, ms12
movaps xmm6, ms22
movaps xmm7, ms21
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
addps xmm0, xmm3
movaps mc11, xmm0
}
#else
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
const Vec3f colour =
c11*(x2-x)*(y2-y) +
c21*(x-x1)*(y2-y) +
c12*(x2-x)*(y-y1) +
c22*(x-x1)*(y-y1);
#endif
Rearranging the asm code to reuse the registers(ended up with just three xmm registers) didn't give any effect. I've also tried using intrinsics:
// perform bilinear interpolation
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
// scalars in vector form for SSE
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);
__m128 mc11 = _mm_set_ps(1.f, c11.b, c11.g, c11.r);
__m128 mc12 = _mm_set_ps(1.f, c12.b, c12.g, c12.r);
__m128 mc22 = _mm_set_ps(1.f, c22.b, c22.g, c22.r);
__m128 mc21 = _mm_set_ps(1.f, c21.b, c21.g, c21.r);
__m128 ms11 = _mm_set_ps(1.f, s11, s11, s11);
__m128 ms12 = _mm_set_ps(1.f, s12, s12, s12);
__m128 ms22 = _mm_set_ps(1.f, s22, s22, s22);
__m128 ms21 = _mm_set_ps(1.f, s21, s21, s21);
mc11 = _mm_mul_ps(mc11, ms11);
mc12 = _mm_mul_ps(mc12, ms12);
mc22 = _mm_mul_ps(mc22, ms22);
mc21 = _mm_mul_ps(mc21, ms21);
mc11 = _mm_add_ps(mc11, mc12);
mc11 = _mm_add_ps(mc11, mc22);
mc11 = _mm_add_ps(mc11, mc21);
Vec3f colour;
_mm_storeu_ps(colour.array, mc11);
And to no avail. Am I missing something, or it is impossible to gain any extra speed here?

Why floating point?
Given packed pixel argb for a, b, c, d, and xerr, yerr in the range 0-256, a simple example is:
// =================================================================================================================
// xs_Bilerp
// =================================================================================================================
finline uint32 xs_Bilerp (uint32 a, uint32 b, uint32 c, uint32 d, uint32 xerr, uint32 yerr)
{
#define xs_rbmask 0x00ff00ff
#define xs_agmask 0xff00ff00
if (a==b && c==d && a==d) return a;
const uint32 arb = a & xs_rbmask;
const uint32 crb = c & xs_rbmask;
const uint32 aag = a & xs_agmask;
const uint32 cag = c & xs_agmask;
const uint32 rbdx1 = (b & xs_rbmask) - arb;
const uint32 rbdx2 = (d & xs_rbmask) - crb;
const uint32 agdx1 = ((b & xs_agmask)>>8) - (aag >> 8);
const uint32 agdx2 = ((d & xs_agmask)>>8) - (cag >> 8);
const uint32 rb1 = (arb + ((rbdx1 * xerr) >> 8)) & xs_rbmask;
const uint32 ag1 = (aag + ((agdx1 * xerr) )) & xs_agmask;
const uint32 rbdy = ((crb + ((rbdx2 * xerr) >> 8)) & xs_rbmask) - rb1;
const uint32 agdy = (((cag + ((agdx2 * xerr) )) & xs_agmask)>>8) - (ag1 >> 8);
const uint32 rb = (rb1 + ((rbdy * yerr) >> 8)) & xs_rbmask;
const uint32 ag = (ag1 + ((agdy * yerr) )) & xs_agmask;
return ag | rb;
}

optimization, branching elimination

float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
if(inputLevel < 0.0 && mixValue < 0.0)
{
mixValue = (mixValue + inputLevel) + (mixValue*inputLevel);
}
else
{
mixValue = (mixValue + inputLevel) - (mixValue*inputLevel);
}
}
just a simple question, can we calculate mixValue without branching? or any other optimization suggestion, such as using SIMD?
edit:
just for more information, I ended up
using this solution, based on chosen answer:
const float sign[] = {-1, 1};
float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
unsigned a = *(unsigned*)(&mixValue);
unsigned b = *(unsigned*)(&inputLevel);
float mulValue = mixValue * inputLevel * sign[(a & b) >> (8*sizeof(unsigned)-1)];
float addValue = mixValue + inputLevel;
mixValue = addValue + mulValue;
}
thank you.

How about this:
const float sign[] = {-1, 1};
float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
int bothNegative = (inputLevel < 0.0) & (mixValue < 0.0);
mixValue = (mixValue + inputLevel) + (sign[bothNegative]*mixValue*inputLevel);
}
Edit: Mike was correct that && would introduce a branch and thanks for Pedro for proving it. I changed && to & and now GCC (version 4.4.0) generates branch-free code.

Inspired by Roku's answer (which on MSVC++10 branches), this doesn't seem to branch:
#include <iostream>
using namespace std;
const float sign[] = {-1, 1};
int main() {
const int N = 10;
float mixValue = -0.5F;
for(int i = 0; i < N; i++) {
volatile float inputLevel = -0.3F;
int bothNegative = ((((unsigned char*)&inputLevel)[3] & 0x80) & (((unsigned char*)&mixValue)[3] & 0x80)) >> 7;
mixValue = (mixValue + inputLevel) + (sign[bothNegative]*mixValue*inputLevel);
}
std::cout << mixValue << std::endl;
}
Here's the disassembly, as analyzed by IDA Pro (compiled on MSVC++10, Release mode):
Disassembly http://img248.imageshack.us/img248/6865/floattestbranchmine.png

float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
float mulValue = mixValue * inputLevel;
float addValue = mixValue + inputLevel;
__int32 a = *(__int32*)(&mixValue);
__int32 b = *(__int32*)(&inputLevel);
__int32 c = *(__int32*)(&mulValue);
__int32 d = c & ((a ^ b) | 0x7FFFFFFF);
mixValue = addValue + *(float*)(&d);
}

Just off the top of my head (I'm sure it can be reduced):
mixValue = (mixValue + inputLevel) + (((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1) / fabs(((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1)))*-1*(mixValue*inputLevel);
Just to clarify a bit, I'll calculate sign separately:
float sign = (((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1) / fabs(((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1)))*-1;
mixValue = (mixValue + inputLevel) + sign*(mixValue*inputLevel);
This is floating point math, so you'll likely need to correct for some rounding issues, but that should set you on the right path I think.

If you are worried about excessive branching, look at Duff's Device. This should help unwind the loop somewhat. Truth be told, loop unwinding is something that will be done by the optimizer, so trying to do it by hand may be a waste of time. Check the assembly output to find out.
SIMD will definitely be of assistance provided you a performing the exact same operation to each item in your array. Be aware than not all hardware supports SIMD but some compilers like gcc do provide intrinsics for SIMD which will save your from dipping into assembler.
If you are using gcc to compile ARM code, the SIMD intrinsics can be found here

Have you benchmarked the loop with and without the branch ?
At least you could remove one part of the branch, since mixValue is outside of the loop.
float multiplier(float a, float b){
unsigned char c1Neg = reinterpret_cast<unsigned char *>(&a)[3] & 0x80;
unsigned char c2Neg = reinterpret_cast<unsigned char *>(&b)[3] & 0x80;
unsigned char multiplierIsNeg = c1Neg & c2Neg;
float one = 1;
reinterpret_cast<unsigned char *>(&one)[3] |= multiplierIsNeg;
return -one;
}
cout << multiplier(-1,-1) << endl; // +1
cout << multiplier( 1,-1) << endl; // -1
cout << multiplier( 1, 1) << endl; // -1
cout << multiplier(-1, 1) << endl; // -1

Looking at your code, you see that you will always add the absolute value of mixValue and inputLevel, except when both are positive.
With some bit-fiddling and IEEE floatingpoint knowledge, you may get rid of the conditional:
// sets the first bit of f to zero => makes it positive.
void absf( float& f ) {
assert( sizeof( float ) == sizeof( int ) );
reinterpret_cast<int&>( f ) &= ~0x80000000;
}
// returns a first-bit = 1 if f is positive
int pos( float& f ) {
return ~(reinterpret_cast<int&>(f) & 0x80000000) & 0x80000000;
}
// returns -fabs( f*g ) if f>0 and g>0, fabs(f*g) otherwise.
float prod( float& f, float& g ) {
float p = f*g;
float& rp=p;
int& ri = reinterpret_cast<int&>(rp);
absf(p);
ri |= ( pos(f) & pos(g) & 0x80000000); // first bit = + & +
return p;
}
int main(){
struct T { float f, g, r;
void test() {
float p = prod(f,g);
float d = (p-r)/r;
assert( -1e-15 < d && d < 1e-15 );
}
};
T vals[] = { {1,1,-1},{1,-1,1},{-1,1,1},{-1,-1,1} };
for( T* val=vals; val != vals+4; ++val ) {
val->test();
}
}
And finally: your loop
for( ... ) {
mixedResult += inputLevel + prod(mixedResult,inputLevel);
}
Note: the dimensions of your accumulation don't match. The inputLevel is a dimensionless quantity, while mixedResult is your... result (e.g. in Pascal, in Volts, ...). You cannot add two quantities with different dimensions. Probably you want mixedResult += prod( mixedResult, inputLevel ) as your accumulator.

Some compilers (ie MSC) would also require manual sign checking.
Source:
volatile float mixValue;
volatile float inputLevel;
float u = mixValue*inputLevel;
float v = -u;
float a[] = { v, u };
mixValue = (mixValue + inputLevel) + a[ (inputLevel<0.0) & (mixValue<0.0) ];
IntelC 11.1:
movss xmm1, DWORD PTR [12+esp]
mulss xmm1, DWORD PTR [16+esp]
movss xmm6, DWORD PTR [12+esp]
movss xmm2, DWORD PTR [16+esp]
movss xmm3, DWORD PTR [16+esp]
movss xmm5, DWORD PTR [12+esp]
xorps xmm4, xmm4
movaps xmm0, xmm4
subss xmm0, xmm1
movss DWORD PTR [esp], xmm0
movss DWORD PTR [4+esp], xmm1
addss xmm6, xmm2
xor eax, eax
cmpltss xmm3, xmm4
movd ecx, xmm3
neg ecx
cmpltss xmm5, xmm4
movd edx, xmm5
neg edx
and ecx, edx
addss xmm6, DWORD PTR [esp+ecx*4]
movss DWORD PTR [12+esp], xmm6
gcc 4.5:
flds 32(%esp)
flds 16(%esp)
fmulp %st, %st(1)
fld %st(0)
fchs
fstps (%esp)
fstps 4(%esp)
flds 32(%esp)
flds 16(%esp)
flds 16(%esp)
flds 32(%esp)
fxch %st(2)
faddp %st, %st(3)
fldz
fcomi %st(2), %st
fstp %st(2)
fxch %st(1)
seta %dl
xorl %eax, %eax
fcomip %st(1), %st
fstp %st(0)
seta %al
andl %edx, %eax
fadds (%esp,%eax,4)
xorl %eax, %eax
fstps 32(%esp)

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js