For a real-time DSP application I need to compute the absolute values of a complex-valued vector.
The straightforward implementation would look like this:
void computeAbsolute (std::complex<float>* complexSourceVec,
float* realValuedDestinationVec,
int vecLength)
{
for (int i = 0; i < vecLength; ++i)
realValuedDestinationVec[i] = std::abs (complexSourceVec[i]);
}
I want to replace this implementation with an AVX2-optimized version, based on AVX2 intrinsics. What would be the most efficient way to implement it?
Note: The source data is handed to me by an API I have no access to, so there is no chance to change the layout of the complex input vector for better efficiency.
Inspired by Dan M.'s answer, I first implemented his version with some tweaks:
First I changed it to use the wider 256-bit registers, then I marked the temporary re and im arrays with __attribute__((aligned (32))) (alignas(32) in portable C++11) to be able to use aligned loads.
void computeAbsolute1 (const std::complex<float>* cplxIn, float* absOut, const int length)
{
for (int i = 0; i < length; i += 8)
{
float re[8] __attribute__((aligned (32))) = {cplxIn[i].real(), cplxIn[i + 1].real(), cplxIn[i + 2].real(), cplxIn[i + 3].real(), cplxIn[i + 4].real(), cplxIn[i + 5].real(), cplxIn[i + 6].real(), cplxIn[i + 7].real()};
float im[8] __attribute__((aligned (32))) = {cplxIn[i].imag(), cplxIn[i + 1].imag(), cplxIn[i + 2].imag(), cplxIn[i + 3].imag(), cplxIn[i + 4].imag(), cplxIn[i + 5].imag(), cplxIn[i + 6].imag(), cplxIn[i + 7].imag()};
__m256 x4 = _mm256_load_ps (re);
__m256 y4 = _mm256_load_ps (im);
__m256 b4 = _mm256_sqrt_ps (_mm256_add_ps (_mm256_mul_ps (x4,x4), _mm256_mul_ps (y4,y4)));
_mm256_storeu_ps (absOut + i, b4);
}
}
However, manually shuffling the values this way seemed like a task that could be sped up somehow. This is the solution I came up with; it runs 2-3 times faster in a quick test, compiled by clang with full optimization:
#include <complex>
#include <immintrin.h>
void computeAbsolute2 (const std::complex<float>* __restrict cplxIn, float* __restrict absOut, const int length)
{
for (int i = 0; i < length; i += 8)
{
// load 8 complex values (--> 16 floats overall) into two SIMD registers
__m256 inLo = _mm256_loadu_ps (reinterpret_cast<const float*> (cplxIn + i ));
__m256 inHi = _mm256_loadu_ps (reinterpret_cast<const float*> (cplxIn + i + 4));
// separate the real and imaginary parts, however the values end up in the wrong order
__m256 re = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE (2, 0, 2, 0));
__m256 im = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE (3, 1, 3, 1));
// do the heavy work on the unordered vectors
__m256 abs = _mm256_sqrt_ps (_mm256_add_ps (_mm256_mul_ps (re, re), _mm256_mul_ps (im, im)));
// reorder values prior to storing
__m256d ordered = _mm256_permute4x64_pd (_mm256_castps_pd(abs), _MM_SHUFFLE(3,1,2,0));
_mm256_storeu_ps (absOut + i, _mm256_castpd_ps(ordered));
}
}
I think I'll go with this implementation unless someone comes up with a faster solution.
This compiles efficiently with gcc and clang (on the Godbolt compiler explorer).
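A possible further tweak, as a sketch only: on AVX2 hardware FMA is practically always available (compile with -mfma), so the multiply and add can be fused, and a scalar tail can handle lengths that are not a multiple of 8. The function name computeAbsolute2_fma is made up; note the tail computes plain sqrt(re*re + im*im) to match the vector path rather than std::abs's corner-case handling:

#include <complex>
#include <cmath>
#include <immintrin.h>

void computeAbsolute2_fma (const std::complex<float>* __restrict cplxIn, float* __restrict absOut, const int length)
{
    int i = 0;
    for (; i + 8 <= length; i += 8)
    {
        __m256 inLo = _mm256_loadu_ps (reinterpret_cast<const float*> (cplxIn + i));
        __m256 inHi = _mm256_loadu_ps (reinterpret_cast<const float*> (cplxIn + i + 4));
        __m256 re = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE (2, 0, 2, 0));
        __m256 im = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE (3, 1, 3, 1));
        // im*im + re*re computed with one fused multiply-add
        __m256 abs = _mm256_sqrt_ps (_mm256_fmadd_ps (im, im, _mm256_mul_ps (re, re)));
        __m256d ordered = _mm256_permute4x64_pd (_mm256_castps_pd (abs), _MM_SHUFFLE (3, 1, 2, 0));
        _mm256_storeu_ps (absOut + i, _mm256_castpd_ps (ordered));
    }
    // scalar tail for the last (length % 8) elements
    for (; i < length; ++i)
        absOut[i] = std::sqrt (cplxIn[i].real() * cplxIn[i].real() + cplxIn[i].imag() * cplxIn[i].imag());
}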
It's really hard (if even possible) to write a "highly optimized AVX2" version of complex abs, since the way complex numbers are defined in the standard prevents a lot of optimization (specifically due to all the inf/nan corner cases).
However, if you don't care about that level of correctness, you can just use -ffast-math and some compilers will optimize the code for you. See the gcc output: https://godbolt.org/z/QbZlBI
You can also take this output and create your own abs function with inline assembly.
But yes, as was already mentioned, if you really need performance, you probably want to swap std::complex for something else.
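For illustration, the "something else" could be a split (structure-of-arrays) layout; the type name below is made up:

#include <vector>

// All real parts in one contiguous array, all imaginary parts in another.
// The abs kernel then reduces to sqrt(re[i]*re[i] + im[i]*im[i]) over two
// plain float arrays, which vectorizes without any shuffling.
struct SplitComplexVec
{
    std::vector<float> re;
    std::vector<float> im;
};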
I was able to get a decent output for your specific case with all the required shuffles by manually filling small re and im arrays. See: https://godbolt.org/z/sWAAXo
This could be trivially extended to ymm registers.
Anyway, here is the ultimate solution, adapted from this SO answer, which uses intrinsics in combination with clever compiler optimizations:
#include <complex>
#include <cassert>
#include <immintrin.h>
static inline void cabs_soa4(const float *re, const float *im, float *b) {
__m128 x4 = _mm_loadu_ps(re);
__m128 y4 = _mm_loadu_ps(im);
__m128 b4 = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(x4,x4), _mm_mul_ps(y4,y4)));
_mm_storeu_ps(b, b4);
}
void computeAbsolute (const std::complex<float>* src,
float* realValuedDestinationVec,
int vecLength)
{
for (int i = 0; i < vecLength; i += 4) {
float re[4] = {src[i].real(), src[i + 1].real(), src[i + 2].real(), src[i + 3].real()};
float im[4] = {src[i].imag(), src[i + 1].imag(), src[i + 2].imag(), src[i + 3].imag()};
cabs_soa4(re, im, realValuedDestinationVec + i);
}
}
which compiles to this simple loop:
_Z15computeAbsolutePKSt7complexIfEPfi:
test edx, edx
jle .L5
lea eax, [rdx-1]
shr eax, 2
sal rax, 5
lea rax, [rdi+32+rax]
.L3:
vmovups xmm0, XMMWORD PTR [rdi]
vmovups xmm2, XMMWORD PTR [rdi+16]
add rdi, 32
vshufps xmm1, xmm0, xmm2, 136
vmulps xmm1, xmm1, xmm1
vshufps xmm0, xmm0, xmm2, 221
vfmadd132ps xmm0, xmm1, xmm0
vsqrtps xmm0, xmm0
vmovups XMMWORD PTR [rsi], xmm0
cmp rax, rdi
jne .L3
.L5:
ret
https://godbolt.org/z/Yu64Wg
I need to build a single-precision floating-point inner-product routine for mixed single/double-precision floating-point vectors, exploiting the AVX instruction set and its 256-bit SIMD registers.
Problem: one input vector is float (x), while the other is double (yD).
Hence, before computing the actual inner-product operations, I need to convert my input yD vector data from double to float.
Using the SSE2 instruction set, I was able to implement very fast code doing what I needed, with performance very close to the case where both vectors x and y were float:
void vector_operation(const size_t i)
{
// x, yD and the __m128 accumulator res are visible from the enclosing scope
__m128 X = _mm_load_ps(x + i);
// convert two doubles at a time, then pack both halves into one register
__m128 Y = _mm_movelh_ps(_mm_cvtpd_ps(_mm_load_pd(yD + i + 0)), _mm_cvtpd_ps(_mm_load_pd(yD + i + 2)));
//inner-products accumulation
res = _mm_add_ps(res, _mm_mul_ps(X, Y));
}
Now, hoping for a further speed-up, I implemented a corresponding version with the AVX instruction set:
inline void vector_operation(const size_t i)
{
__m256 X = _mm256_load_ps(x + i);
__m128 yD1 = _mm_cvtpd_ps(_mm_load_pd(yD + i + 0));
__m128 yD2 = _mm_cvtpd_ps(_mm_load_pd(yD + i + 2));
__m128 yD3 = _mm_cvtpd_ps(_mm_load_pd(yD + i + 4));
__m128 yD4 = _mm_cvtpd_ps(_mm_load_pd(yD + i + 6));
__m128 Ylow = _mm_movelh_ps(yD1, yD2);
__m128 Yhigh = _mm_movelh_ps(yD3, yD4);
//Pack __m128 data inside __m256
__m256 Y = _mm256_permute2f128_ps(_mm256_castps128_ps256(Ylow), _mm256_castps128_ps256(Yhigh), 0x20);
//inner-products accumulation
res = _mm256_add_ps(res, _mm256_mul_ps(X, Y));
}
I also tested other AVX implementations using, for example, casting and insertion operations instead of permuting data. Performance was comparably poor relative to the case where both x and y vectors were float.
The problem with the AVX code is that no matter how I implemented it, its performance is far inferior to what is achieved using only float x and y vectors (i.e., when no double-to-float conversion is needed).
The conversion from double to float for the yD vector seems pretty fast, while a lot of time is lost in the line where the data is inserted into the __m256 Y register.
Do you know if this is a well-known issue with AVX?
Do you have a solution that could preserve good performances?
Thanks in advance!
I rewrote your function and took better advantage of what AVX has to offer. I also used fused multiply-add at the end; if you can't use FMA, just replace that line with addition and multiplication. I only now see that I wrote an implementation that uses unaligned loads and yours uses aligned loads, but I'm not gonna lose any sleep over it. :)
__m256 foo(float*x, double* yD, const size_t i, __m256 res_prev)
{
__m256 X = _mm256_loadu_ps(x + i);
__m128 yD21 = _mm256_cvtpd_ps(_mm256_loadu_pd(yD + i + 0));
__m128 yD43 = _mm256_cvtpd_ps(_mm256_loadu_pd(yD + i + 4));
__m256 Y = _mm256_set_m128(yD43, yD21);
return _mm256_fmadd_ps(X, Y, res_prev);
}
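For reference, a hypothetical driver for this function (the accumulate name is made up, foo from above is assumed to be in scope, and length is assumed to be a multiple of 8); the final reduction of res to a scalar is shown further below:

#include <cstddef>

__m256 accumulate(float* x, double* yD, size_t length)
{
    __m256 res = _mm256_setzero_ps();
    for (size_t i = 0; i < length; i += 8)
        res = foo(x, yD, i, res);
    return res; // still holds eight partial sums
}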
I did a quick benchmark and compared the running times of your implementation and mine. I tried two different benchmark approaches with several repetitions, and every time my code was around 15% faster. I used the MSVC 14.1 compiler and compiled the program with the /O2 and /arch:AVX2 flags.
EDIT: this is the disassembly of the function:
vcvtpd2ps xmm3,ymmword ptr [rdx+r8*8+20h]
vcvtpd2ps xmm2,ymmword ptr [rdx+r8*8]
vmovups ymm0,ymmword ptr [rcx+r8*4]
vinsertf128 ymm3,ymm2,xmm3,1
vfmadd213ps ymm0,ymm3,ymmword ptr [r9]
EDIT 2: this is the disassembly of your AVX implementation of the same algorithm:
vcvtpd2ps xmm0,xmmword ptr [rdx+r8*8+30h]
vcvtpd2ps xmm1,xmmword ptr [rdx+r8*8+20h]
vmovlhps xmm3,xmm1,xmm0
vcvtpd2ps xmm0,xmmword ptr [rdx+r8*8+10h]
vcvtpd2ps xmm1,xmmword ptr [rdx+r8*8]
vmovlhps xmm2,xmm1,xmm0
vperm2f128 ymm3,ymm2,ymm3,20h
vmulps ymm0,ymm3,ymmword ptr [rcx+r8*4]
vaddps ymm0,ymm0,ymmword ptr [r9]
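Either way, to finish the inner product, the eight partial sums in the accumulator still have to be reduced to a single float. A minimal sketch (the name hsum256_ps is made up; _mm_hadd_ps requires SSE3):

#include <immintrin.h>

static inline float hsum256_ps(__m256 v)
{
    __m128 lo = _mm256_castps256_ps128(v);   // lanes 0..3
    __m128 hi = _mm256_extractf128_ps(v, 1); // lanes 4..7
    __m128 s  = _mm_add_ps(lo, hi);          // 4 partial sums
    s = _mm_hadd_ps(s, s);                   // 2 partial sums
    s = _mm_hadd_ps(s, s);                   // total in lane 0
    return _mm_cvtss_f32(s);
}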
Consider the following code:
Matrix4x4 perspective(const ViewFrustum &frustum) {
float l = frustum.l;
float r = frustum.r;
float b = frustum.b;
float t = frustum.t;
float n = frustum.n;
float f = frustum.f;
return {
{ 2 * n / (r - l), 0, (r + l) / (r - l), 0 },
{ 0, 2 * n / (t - b), (t + b) / (t - b), 0 },
{ 0, 0, -((f + n) / (f - n)), -(2 * n * f / (f - n)) },
{ 0, 0, -1, 0 }
};
}
In order to improve the readability of the matrix construction, I have to make either copies of the values from the frustum struct, or references to them. However, I don't actually need either copies or indirection.
Is it possible to have some kind of "reference" that is resolved at compile time, kind of like a symbolic link? It would have the same effect as:
Matrix4x4 perspective(const ViewFrustum &frustum) {
#define l frustum.l
#define r frustum.r
#define b frustum.b
#define t frustum.t
#define n frustum.n
#define f frustum.f
return {
{ 2 * n / (r - l), 0, (r + l) / (r - l), 0 },
{ 0, 2 * n / (t - b), (t + b) / (t - b), 0 },
{ 0, 0, -((f + n) / (f - n)), -(2 * n * f / (f - n)) },
{ 0, 0, -1, 0 }
};
#undef l
#undef r
#undef b
#undef t
#undef n
#undef f
}
Without the preprocessor (or is that acceptable?). I suppose it isn't really needed, and could be avoided in this particular case by making those 6 values direct arguments of the function (though having to call the function that way would be a bit irritating; but even then, I could make an inline proxy function).
But I was just wondering whether this is somehow possible in general? I could not find anything like it. I think it would come in handy for locally shortening descriptive names that are going to be used a lot, without actually having to lose the original names.
Well, that's what C++ references are for:
const float &l = frustum.l;
const float &r = frustum.r;
const float &b = frustum.b;
const float &t = frustum.t;
const float &n = frustum.n;
const float &f = frustum.f;
Most modern compilers will optimize out the references and use the values from the frustum object directly in the following expression, resolving the references at compile time.
Obligatory disclaimer: do not prematurely optimize.
Let me compare your naive perspective function, containing
float l = frustum.l;
float r = frustum.r;
float b = frustum.b;
float t = frustum.t;
float n = frustum.n;
float f = frustum.f;
With the #define version and Sam Varshavchik's solution with references.
We assume that our compiler is optimizing, and optimizing at least decently.
Assembly output for all three versions: https://godbolt.org/g/G06Bx8.
You can notice that the reference and define versions are exactly the same, as expected. But the naive version differs a lot. It first loads all the values from memory:
movss (%rdi), %xmm2 # xmm2 = mem[0],zero,zero,zero
movss 4(%rdi), %xmm1 # xmm1 = mem[0],zero,zero,zero
movss 8(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
movss %xmm0, 12(%rsp) # 4-byte Spill
movss 12(%rdi), %xmm0 # xmm0 = mem[0],zero,zero,zero
movss %xmm0, 8(%rsp) # 4-byte Spill
movss 16(%rdi), %xmm3 # xmm3 = mem[0],zero,zero,zero
movaps %xmm3, 16(%rsp) # 16-byte Spill
movss 20(%rdi), %xmm0
And then it never again references the %rdi (frustum) memory. The reference and define versions, on the other hand, load values as they are needed.
This happens because the implementation of the Vector4 constructor is hidden from the optimizer: it can't assume that the constructor doesn't modify frustum, so in the reference and define versions it must insert loads again and again, even when such loads are redundant.
So the naive version can, under certain circumstances, even be faster than the "optimized" one.
In general, you can use plain references, as long as you are in the local scope. Modern compilers "see through them" and just treat them as aliases (notice that this actually applies even to pointers).
However, when dealing with stuff on the small side, copying to a local variable is, if anything, generally beneficial. frustum.r is one layer of indirection away (the frustum reference is actually a pointer under the hood), so accessing it is costlier than it may seem, and if you have function calls in the middle of your function, the compiler may not be able to prove that its value isn't changing, so the access may need to be repeated.
Local variables, instead, normally live directly on the stack (cheap) or straight in registers (cheapest), and, most importantly, given that they usually have no interaction with "the outside", the compiler has an easier time reasoning about them, so it can be more aggressive with optimizations; besides, when actually performing the computations, those values are going to be copied into registers and onto the stack anyway.
So go ahead and use copies: at worst the compiler will probably do the same, and at best you may have helped it optimize.
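If C++17 is available, there is also a middle ground worth mentioning: structured bindings give the same named copies in one declaration. This is a sketch under the assumption that ViewFrustum is an aggregate whose only non-static data members are these six floats, in this order:

Matrix4x4 perspective(const ViewFrustum &frustum) {
    // copies, like the naive version, but declared in one line
    const auto [l, r, b, t, n, f] = frustum;
    return {
        { 2 * n / (r - l), 0, (r + l) / (r - l), 0 },
        { 0, 2 * n / (t - b), (t + b) / (t - b), 0 },
        { 0, 0, -((f + n) / (f - n)), -(2 * n * f / (f - n)) },
        { 0, 0, -1, 0 }
    };
}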
I have two similar programs, one in C++ and the other in D.
Compilation is on Windows 7 64-bit, to 64-bit binaries.
C++ version, VS 2013:
#include <iostream>
#include <string>
int main(int argc, char* argv[])
{
float eps = 1.0f;
float f = 0.0f;
while (f + eps != f)
f += 1.0f;
std::cout << "eps = " + std::to_string(eps) + ", max_f = " + std::to_string(f) << std::endl;
return 0;
}
D version, DMD v2.066.1:
import std.stdio;
import std.conv;
int main(string[] argv)
{
float eps = 1.0f;
float f = 0.0f;
while (f + eps != f)
f += 1.0f;
writeln("eps = " ~ to!string(eps) ~ ", max_f = " ~ to!string(f));
return 0;
}
The C++ version works as expected and finds that f + eps == f when f = 16777216.
But the D version hangs forever. When I set a breakpoint, I see that in the D version f is also 16777216 (after running for some time), and the Watch window (I use VisualD) shows that (f + eps != f) is 'false', so the loop should terminate; but it doesn't at runtime.
I think the assembly could give the answer, but I'm not very good with it.
I'm new to D, so it may be that I misused the language or the compiler (I compiled with DMD just as 'dmd test.d' without additional options, and also from VS with VisualD with default options). Any ideas what could be wrong with the D version of the program? Thanks!
Disassembly:
C++:
000000013F7D1410 mov rax,rsp
000000013F7D1413 push rbp
000000013F7D1414 lea rbp,[rax-5Fh]
000000013F7D1418 sub rsp,0E0h
000000013F7D141F mov qword ptr [rbp+17h],0FFFFFFFFFFFFFFFEh
000000013F7D1427 mov qword ptr [rax+8],rbx
000000013F7D142B movaps xmmword ptr [rax-18h],xmm6
000000013F7D142F xorps xmm1,xmm1
float eps = 1.0f;
float f = 0.0f;
000000013F7D1432 movss xmm6,dword ptr [__real#3f800000 (013F7D67E8h)]
000000013F7D143A nop word ptr [rax+rax]
f += 1.0f;
000000013F7D1440 addss xmm1,xmm6
while (f + eps != f)
000000013F7D1444 movaps xmm0,xmm1
000000013F7D1447 addss xmm0,xmm6
000000013F7D144B ucomiss xmm0,xmm1
000000013F7D144E jp main+30h (013F7D1440h)
000000013F7D1450 jne main+30h (013F7D1440h)
D:
000000013F761002 mov ebp,esp
000000013F761004 sub rsp,50h
{
float eps = 1.0f;
000000013F761008 xor eax,eax
000000013F76100A mov dword ptr [rbp-50h],eax
000000013F76100D movss xmm0,dword ptr [rbp-50h]
000000013F761012 movss dword ptr [f],xmm0
float f = 0.0f;
while (f + eps != f)
f += 1.0f;
000000013F761017 movss xmm1,dword ptr [__NULL_IMPORT_DESCRIPTOR+1138h (013F7C3040h)]
000000013F76101F movss xmm2,dword ptr [f]
000000013F761024 addss xmm2,xmm1
000000013F761028 movss dword ptr [f],xmm2
000000013F76102D fld dword ptr [f]
000000013F761030 fadd dword ptr [__NULL_IMPORT_DESCRIPTOR+1138h (013F7C3040h)]
000000013F761036 fld dword ptr [f]
000000013F761039 fucomip st,st(1)
000000013F76103B fstp st(0)
000000013F76103D jne D main+17h (013F761017h)
000000013F76103F jp D main+17h (013F761017h)
Summary
I accepted harold's answer: the program's behavior is due to the mixed FPU and SSE usage.
Here's a summary of what happens in the D assembly snippet. In fact, the loop will run forever.
SSE behaves strictly according to IEEE-754: when f reaches 16777216.0 and we add 1.0 to this value (f += 1.0f), we still obtain 16777216.0 in the xmm2 register, and we store it to memory.
The (f + eps != f) expression is computed on the FPU. Since FPU registers have enough precision, (f + eps) results in 16777217.0. If we stored this result back to memory into a float variable, we would get the expected value 16777216.0 (since 16777217.0 is not representable as a float), (f + eps != f) would be 'false', and the loop would terminate. But we do not store any numbers back to memory and perform the comparison on the FPU (since we have both operands there). It means that we compare one number computed strictly according to IEEE-754 single precision (f) and another computed with 80-bit accuracy (f + eps). 16777216.0 != 16777217.0, and the loop runs forever.
I'm not an expert in this area, but to me it looks like doing floating point with SSE instructions is more robust, as demonstrated in the C++ version of the program.
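A tiny standalone illustration of the mechanism (C++, but the same idea underlies the D workaround below): forcing the sum through a 32-bit store rounds away the extra precision.

#include <iostream>

int main()
{
    float f = 16777216.0f;           // 2^24
    volatile float sum = f + 1.0f;   // the store rounds: 16777217.0 is not
                                     // representable as a float, and
                                     // round-to-even gives back 16777216.0
    std::cout << (sum != f) << '\n'; // prints 0
}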
Update
I had a discussion on the D forum http://forum.dlang.org/thread/ucnayusylmpvkpcnbhgh#forum.dlang.org
It turned out that the program behaves correctly: per the language specification, intermediate calculations may be performed with higher accuracy.
The robust implementation for any D compiler is:
import std.stdio;
int main()
{
const float eps = 1.0f;
const float step = 1.0;
float f = 0.0f;
float fPlusEps = f + eps;
while (f != fPlusEps)
{
f += step;
fPlusEps = f + eps;
}
writeln("eps = ", eps, ", max_f = ", f);
return 0;
}
Mixed FPU and SSE code, that's... really strange. I see absolutely no reason to implement it this way.
But they have, and the result is that f + eps != f is evaluated with 80-bit extended precision, while f += 1.0f is evaluated using 32-bit floats.
That means the loop can never end, since f stops going up before reaching the value that would make f + eps != f false (which, in 80-bit precision, is huge).
Trying to break a loop with != or == on floating-point values is asking for trouble.
The different behavior is most likely due to the float-to-double-to-80-bit internal floating-point conversion the compiler may adopt when passing values to the FPU.
In particular, when extending the mantissa, some compilers or optimizers can decide to leave the least significant bits "random" instead of zeroed. So 1.0f, when given to the FPU, may become 1.000000000000000000000012134432, which at float precision is still 1.0; but when 1.000000000000000000000012134432 and 1.000000000000000000000089544455 (the two tails being random) are compared by the FPU, they look different.
You should verify how the C++ and D compilers treat the floating-point extension/reduction and, if necessary, configure the appropriate switches: if the two compilers are not from the same manufacturer, they have probably made different choices for their respective defaults.
float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
if(inputLevel < 0.0 && mixValue < 0.0)
{
mixValue = (mixValue + inputLevel) + (mixValue*inputLevel);
}
else
{
mixValue = (mixValue + inputLevel) - (mixValue*inputLevel);
}
}
Just a simple question: can we calculate mixValue without branching? Or do you have any other optimization suggestions, such as using SIMD?
Edit:
Just for more information, I ended up using this solution, based on the chosen answer:
const float sign[] = {-1, 1};
float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
unsigned a = *(unsigned*)(&mixValue); // bit pattern of mixValue
unsigned b = *(unsigned*)(&inputLevel); // bit pattern of inputLevel
// the top bit of (a & b) is set only when both values are negative
float mulValue = mixValue * inputLevel * sign[(a & b) >> (8*sizeof(unsigned)-1)];
float addValue = mixValue + inputLevel;
mixValue = addValue + mulValue;
}
thank you.
How about this:
const float sign[] = {-1, 1};
float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
int bothNegative = (inputLevel < 0.0) & (mixValue < 0.0);
mixValue = (mixValue + inputLevel) + (sign[bothNegative]*mixValue*inputLevel);
}
Edit: Mike was correct that && would introduce a branch, and thanks to Pedro for proving it. I changed && to & (which evaluates both comparisons without short-circuiting, so no conditional jump is needed), and now GCC (version 4.4.0) generates branch-free code.
Inspired by Roku's answer (which on MSVC++10 branches), this doesn't seem to branch:
#include <iostream>
using namespace std;
const float sign[] = {-1, 1};
int main() {
const int N = 10;
float mixValue = -0.5F;
for(int i = 0; i < N; i++) {
volatile float inputLevel = -0.3F;
int bothNegative = ((((unsigned char*)&inputLevel)[3] & 0x80) & (((unsigned char*)&mixValue)[3] & 0x80)) >> 7;
mixValue = (mixValue + inputLevel) + (sign[bothNegative]*mixValue*inputLevel);
}
std::cout << mixValue << std::endl;
}
Here's the disassembly, as analyzed by IDA Pro (compiled on MSVC++10, Release mode):
Disassembly http://img248.imageshack.us/img248/6865/floattestbranchmine.png
float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
float mulValue = mixValue * inputLevel;
float addValue = mixValue + inputLevel;
__int32 a = *(__int32*)(&mixValue);
__int32 b = *(__int32*)(&inputLevel);
__int32 c = *(__int32*)(&mulValue);
// |mulValue|, with the sign bit set only when both inputs are positive
// (the one case where the product must be subtracted)
__int32 d = (c & 0x7FFFFFFF) | (~(a | b) & 0x80000000);
mixValue = addValue + *(float*)(&d);
}
Just off the top of my head (I'm sure it can be reduced):
mixValue = (mixValue + inputLevel) + (((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1) / fabs(((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1)))*-1*(mixValue*inputLevel);
Just to clarify a bit, I'll calculate sign separately:
float sign = (((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1) / fabs(((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1)))*-1;
mixValue = (mixValue + inputLevel) + sign*(mixValue*inputLevel);
This is floating-point math, so you'll likely need to correct for some rounding issues (and note that x / fabs(x) breaks down when x is zero), but that should set you on the right path, I think.
If you are worried about excessive branching, look at Duff's Device. It should help unwind the loop somewhat. Truth be told, loop unwinding is something the optimizer will usually do for you, so trying to do it by hand may be a waste of time. Check the assembly output to find out.
SIMD will definitely be of assistance, provided you are performing the exact same operation on each item in your array. Be aware that not all hardware supports SIMD, but some compilers, like gcc, provide intrinsics for SIMD, which will save you from dipping into assembler.
If you are using gcc to compile ARM code, the SIMD intrinsics can be found here
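One caveat: the recurrence is serial in mixValue (each iteration needs the previous result), so SIMD only pays off if you mix several independent channels side by side, one per lane. For x86, here is a minimal SSE sketch of the same sign trick discussed above (the name mixStep4 is made up):

#include <xmmintrin.h>

// One update step for four independent (mixValue, inputLevel) pairs.
__m128 mixStep4(__m128 mix, __m128 input)
{
    const __m128 signMask = _mm_set1_ps(-0.0f); // only the sign bit set, per lane
    // sign bit set in a lane iff both mix and input are negative there
    __m128 bothNeg = _mm_and_ps(_mm_and_ps(mix, input), signMask);
    __m128 prod = _mm_mul_ps(mix, input);
    // negate the product, then undo the negation in lanes where both were negative
    prod = _mm_xor_ps(_mm_xor_ps(prod, signMask), bothNeg);
    return _mm_add_ps(_mm_add_ps(mix, input), prod);
}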
Have you benchmarked the loop with and without the branch?
At least you could remove one part of the branch, since mixValue is outside of the loop.
float multiplier(float a, float b){
// sign bits sit in the top byte of the float on little-endian machines
unsigned char c1Neg = reinterpret_cast<unsigned char *>(&a)[3] & 0x80;
unsigned char c2Neg = reinterpret_cast<unsigned char *>(&b)[3] & 0x80;
unsigned char multiplierIsNeg = c1Neg & c2Neg;
float one = 1;
reinterpret_cast<unsigned char *>(&one)[3] |= multiplierIsNeg;
return -one; // +1 if both a and b are negative, -1 otherwise
}
cout << multiplier(-1,-1) << endl; // +1
cout << multiplier( 1,-1) << endl; // -1
cout << multiplier( 1, 1) << endl; // -1
cout << multiplier(-1, 1) << endl; // -1
Looking at your code, you can see that you will always add the absolute value of mixValue*inputLevel, except when both values are positive.
With some bit-fiddling and IEEE floating-point knowledge, you may get rid of the conditional:
// sets the first bit of f to zero => makes it positive.
void absf( float& f ) {
assert( sizeof( float ) == sizeof( int ) );
reinterpret_cast<int&>( f ) &= ~0x80000000;
}
// returns a first-bit = 1 if f is positive
int pos( float& f ) {
return ~(reinterpret_cast<int&>(f) & 0x80000000) & 0x80000000;
}
// returns -fabs( f*g ) if f>0 and g>0, fabs(f*g) otherwise.
float prod( float& f, float& g ) {
float p = f*g;
float& rp=p;
int& ri = reinterpret_cast<int&>(rp);
absf(p);
ri |= ( pos(f) & pos(g) & 0x80000000); // first bit = + & +
return p;
}
int main(){
struct T { float f, g, r;
void test() {
float p = prod(f,g);
float d = (p-r)/r;
assert( -1e-15 < d && d < 1e-15 );
}
};
T vals[] = { {1,1,-1},{1,-1,1},{-1,1,1},{-1,-1,1} };
for( T* val=vals; val != vals+4; ++val ) {
val->test();
}
}
And finally: your loop
for( ... ) {
mixedResult += inputLevel + prod(mixedResult,inputLevel);
}
Note: the dimensions in your accumulation don't match. inputLevel is a dimensionless quantity, while mixedResult is your... result (e.g., in pascals, in volts, ...). You cannot add two quantities with different dimensions. Probably you want mixedResult += prod(mixedResult, inputLevel) as your accumulator.
Some compilers (e.g., MSC) would also require manual sign checking.
Source:
volatile float mixValue;
volatile float inputLevel;
float u = mixValue*inputLevel;
float v = -u;
float a[] = { v, u };
mixValue = (mixValue + inputLevel) + a[ (inputLevel<0.0) & (mixValue<0.0) ];
IntelC 11.1:
movss xmm1, DWORD PTR [12+esp]
mulss xmm1, DWORD PTR [16+esp]
movss xmm6, DWORD PTR [12+esp]
movss xmm2, DWORD PTR [16+esp]
movss xmm3, DWORD PTR [16+esp]
movss xmm5, DWORD PTR [12+esp]
xorps xmm4, xmm4
movaps xmm0, xmm4
subss xmm0, xmm1
movss DWORD PTR [esp], xmm0
movss DWORD PTR [4+esp], xmm1
addss xmm6, xmm2
xor eax, eax
cmpltss xmm3, xmm4
movd ecx, xmm3
neg ecx
cmpltss xmm5, xmm4
movd edx, xmm5
neg edx
and ecx, edx
addss xmm6, DWORD PTR [esp+ecx*4]
movss DWORD PTR [12+esp], xmm6
gcc 4.5:
flds 32(%esp)
flds 16(%esp)
fmulp %st, %st(1)
fld %st(0)
fchs
fstps (%esp)
fstps 4(%esp)
flds 32(%esp)
flds 16(%esp)
flds 16(%esp)
flds 32(%esp)
fxch %st(2)
faddp %st, %st(3)
fldz
fcomi %st(2), %st
fstp %st(2)
fxch %st(1)
seta %dl
xorl %eax, %eax
fcomip %st(1), %st
fstp %st(0)
seta %al
andl %edx, %eax
fadds (%esp,%eax,4)
xorl %eax, %eax
fstps 32(%esp)