I'm implementing bilinear interpolation in a tight loop and trying to optimize it with SSE, but I get zero speed-up from it.
Here is the code. The non-SIMD version uses a simple vector structure, which could be defined as struct Vec3f { float x, y, z; } with multiplication and addition operators implemented (a minimal sketch follows the snippet):
#ifdef USE_SIMD
const Color c11 = pixelCache[y1 * size.x + x1];
const Color c12 = pixelCache[y2 * size.x + x1];
const Color c22 = pixelCache[y2 * size.x + x2];
const Color c21 = pixelCache[y1 * size.x + x2];
__declspec(align(16)) float mc11[4] = { 1.0, c11.GetB(), c11.GetG(), c11.GetR() };
__declspec(align(16)) float mc12[4] = { 1.0, c12.GetB(), c12.GetG(), c12.GetR() };
__declspec(align(16)) float mc22[4] = { 1.0, c22.GetB(), c22.GetG(), c22.GetR() };
__declspec(align(16)) float mc21[4] = { 1.0, c21.GetB(), c21.GetG(), c21.GetR() };
// scalars in vector form for SSE
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);
__declspec(align(16)) float ms11[4] = {1.0, s11, s11, s11};
__declspec(align(16)) float ms12[4] = {1.0, s12, s12, s12};
__declspec(align(16)) float ms22[4] = {1.0, s22, s22, s22};
__declspec(align(16)) float ms21[4] = {1.0, s21, s21, s21};
__asm {
movaps xmm0, mc11
movaps xmm1, mc12
movaps xmm2, mc22
movaps xmm3, mc21
movaps xmm4, ms11
movaps xmm5, ms12
movaps xmm6, ms22
movaps xmm7, ms21
mulps xmm0, xmm4
mulps xmm1, xmm5
mulps xmm2, xmm6
mulps xmm3, xmm7
addps xmm0, xmm1
addps xmm0, xmm2
addps xmm0, xmm3
movaps mc11, xmm0
}
#else
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
const Vec3f colour =
c11*(x2-x)*(y2-y) +
c21*(x-x1)*(y2-y) +
c12*(x2-x)*(y-y1) +
c22*(x-x1)*(y-y1);
#endif
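For reference, here is a minimal sketch of the Vec3f type assumed by the scalar path (the exact operator set is an assumption based on the usage above):
struct Vec3f
{
    float x, y, z;
    Vec3f operator* (float s)        const { return Vec3f{ x * s, y * s, z * s }; }
    Vec3f operator+ (const Vec3f& o) const { return Vec3f{ x + o.x, y + o.y, z + o.z }; }
};
The intrinsics version further down also accesses .r/.g/.b and an .array member, so the real type presumably carries more than this sketch shows.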
Rearranging the asm code to reuse registers (I ended up with just three xmm registers) had no effect. I've also tried using intrinsics:
// perform bilinear interpolation
const Vec3f c11 = toFloat(pixelCache[y1 * size.x + x1]);
const Vec3f c12 = toFloat(pixelCache[y2 * size.x + x1]);
const Vec3f c22 = toFloat(pixelCache[y2 * size.x + x2]);
const Vec3f c21 = toFloat(pixelCache[y1 * size.x + x2]);
// scalars in vector form for SSE
const float s11 = (x2-x)*(y2-y);
const float s12 = (x2-x)*(y-y1);
const float s22 = (x-x1)*(y-y1);
const float s21 = (x-x1)*(y2-y);
__m128 mc11 = _mm_set_ps(1.f, c11.b, c11.g, c11.r);
__m128 mc12 = _mm_set_ps(1.f, c12.b, c12.g, c12.r);
__m128 mc22 = _mm_set_ps(1.f, c22.b, c22.g, c22.r);
__m128 mc21 = _mm_set_ps(1.f, c21.b, c21.g, c21.r);
__m128 ms11 = _mm_set_ps(1.f, s11, s11, s11);
__m128 ms12 = _mm_set_ps(1.f, s12, s12, s12);
__m128 ms22 = _mm_set_ps(1.f, s22, s22, s22);
__m128 ms21 = _mm_set_ps(1.f, s21, s21, s21);
mc11 = _mm_mul_ps(mc11, ms11);
mc12 = _mm_mul_ps(mc12, ms12);
mc22 = _mm_mul_ps(mc22, ms22);
mc21 = _mm_mul_ps(mc21, ms21);
mc11 = _mm_add_ps(mc11, mc12);
mc11 = _mm_add_ps(mc11, mc22);
mc11 = _mm_add_ps(mc11, mc21);
Vec3f colour;
_mm_storeu_ps(colour.array, mc11);
And to no avail. Am I missing something, or is it impossible to gain any extra speed here?
Why floating point?
Given packed ARGB pixels a, b, c, d, and xerr, yerr in the range 0-256, a simple example is:
// =================================================================================================================
// xs_Bilerp
// =================================================================================================================
finline uint32 xs_Bilerp (uint32 a, uint32 b, uint32 c, uint32 d, uint32 xerr, uint32 yerr)
{
#define xs_rbmask 0x00ff00ff
#define xs_agmask 0xff00ff00
if (a==b && c==d && a==d) return a;
const uint32 arb = a & xs_rbmask;
const uint32 crb = c & xs_rbmask;
const uint32 aag = a & xs_agmask;
const uint32 cag = c & xs_agmask;
const uint32 rbdx1 = (b & xs_rbmask) - arb;
const uint32 rbdx2 = (d & xs_rbmask) - crb;
const uint32 agdx1 = ((b & xs_agmask)>>8) - (aag >> 8);
const uint32 agdx2 = ((d & xs_agmask)>>8) - (cag >> 8);
const uint32 rb1 = (arb + ((rbdx1 * xerr) >> 8)) & xs_rbmask;
const uint32 ag1 = (aag + ((agdx1 * xerr) )) & xs_agmask;
const uint32 rbdy = ((crb + ((rbdx2 * xerr) >> 8)) & xs_rbmask) - rb1;
const uint32 agdy = (((cag + ((agdx2 * xerr) )) & xs_agmask)>>8) - (ag1 >> 8);
const uint32 rb = (rb1 + ((rbdy * yerr) >> 8)) & xs_rbmask;
const uint32 ag = (ag1 + ((agdy * yerr) )) & xs_agmask;
return ag | rb;
}
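To show how the 0-256 weights tie in, here is a hedged usage sketch (uint32 is the same typedef used above; the sampleBilinear function, the derivation of xerr/yerr as 8-bit fixed point, and the lack of edge clamping are all illustrative assumptions):
uint32 sampleBilinear (const uint32* pixels, int stride, float x, float y)
{
    const int    x1   = (int)x;
    const int    y1   = (int)y;
    const uint32 xerr = (uint32)((x - (float)x1) * 256.0f);  // horizontal fraction, 0..256
    const uint32 yerr = (uint32)((y - (float)y1) * 256.0f);  // vertical fraction, 0..256
    const uint32 a = pixels[ y1      * stride + x1    ];     // top-left
    const uint32 b = pixels[ y1      * stride + x1 + 1];     // top-right
    const uint32 c = pixels[(y1 + 1) * stride + x1    ];     // bottom-left
    const uint32 d = pixels[(y1 + 1) * stride + x1 + 1];     // bottom-right (no edge clamping here)
    return xs_Bilerp (a, b, c, d, xerr, yerr);
}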
Here's the source:
// Same as multiplyComplex, multiplies 4 numbers
__forceinline __m256 multiplyComplex_x4( const __m256 x, const __m256 y )
{
// If the inputs are [ a, b ] and [ c, d ] the formula is [ ac - bd, ad + bc ]
const __m256 x2 = _mm256_movehdup_ps( x ); // [ b, b ]
const __m256 yRev = _mm256_permute_ps( y, shuffleMask_rev64q ); // [ d, c ]
const __m256 prod2 = _mm256_mul_ps( x2, yRev ); // [ bd, bc ]
const __m256 x1 = _mm256_moveldup_ps( x ); // [ a, a ]
return _mm256_fmaddsub_ps( x1, y, prod2 );
}
// Same as fftMainLoop, handles 4 complex numbers
__forceinline void fftMainLoop_x4( const __m256 omega, complex* a1c, complex* a2c )
{
const __m256 a1 = _mm256_loadu_ps( (const float*)a1c );
const __m256 a2 = _mm256_loadu_ps( (const float*)a2c );
const __m256 product = multiplyComplex_x4( omega, a2 );
_mm256_storeu_ps( (float*)a1c, _mm256_add_ps( a1, product ) );
_mm256_storeu_ps( (float*)a2c, _mm256_sub_ps( a1, product ) );
}
Here's the assembly generated by VC++ 2017 for the complete loop that calls that function:
00007FF7CC3F11B0 lea eax,[r8+4]
00007FF7CC3F11B4 mov ecx,r8d
00007FF7CC3F11B7 vpermilps ymm0,ymmword ptr [rdi+rax*8],0B1h
00007FF7CC3F11BE vmulps ymm2,ymm0,ymm4
00007FF7CC3F11C2 vfmaddsub231ps ymm2,ymm5,ymmword ptr [rdi+rax*8]
00007FF7CC3F11C8 vmovups ymm3,ymmword ptr [rdi+rcx*8]
00007FF7CC3F11CD vaddps ymm0,ymm2,ymm3
00007FF7CC3F11D1 add r8d,8
00007FF7CC3F11D5 vsubps ymm1,ymm3,ymm2
00007FF7CC3F11D9 vmovups ymmword ptr [rdi+rcx*8],ymm0
00007FF7CC3F11DE vmovups ymmword ptr [rdi+rax*8],ymm1
00007FF7CC3F11E3 cmp r8d,esi
00007FF7CC3F11E6 jb fft_run+1B0h (07FF7CC3F11B0h)
The code reads from memory 50% more often than I would like it to.
The two instructions
vpermilps ymm0,ymmword ptr [rdi+rax*8],0B1h
and
vfmaddsub231ps ymm2,ymm5,ymmword ptr [rdi+rax*8]
both access the same memory address.
Is there a way to convince the compiler to emit vmovups for my _mm256_loadu_ps, instead of trying to merge the load?
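For what it's worth, one trick that works on GCC and Clang (I'm not aware of an MSVC equivalent) is to route the loaded value through an empty inline-asm statement: the "+x" constraint pins it in a register, so the compiler can no longer fold the load back into a memory operand. A hedged sketch, reusing the types and helper above:
static inline void fftMainLoop_x4_noFold( const __m256 omega, complex* a1c, complex* a2c )
{
    __m256 a2 = _mm256_loadu_ps( (const float*)a2c );
    asm volatile( "" : "+x"( a2 ) );  // value barrier: a2 now lives in a ymm register (GCC/Clang only)
    const __m256 a1 = _mm256_loadu_ps( (const float*)a1c );
    const __m256 product = multiplyComplex_x4( omega, a2 );
    _mm256_storeu_ps( (float*)a1c, _mm256_add_ps( a1, product ) );
    _mm256_storeu_ps( (float*)a2c, _mm256_sub_ps( a1, product ) );
}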
For a real-time DSP application I need to compute the absolute values of a complex-valued vector.
The straightforward implementation would look like this:
void computeAbsolute (std::complex<float>* complexSourceVec,
float* realValuedDestinationVec,
int vecLength)
{
for (int i = 0; i < vecLength; ++i)
realValuedDestinationVec[i] = std::abs (complexSourceVec[i]);
}
I want to replace this implementation with an AVX2-optimized version based on AVX2 intrinsics. What would be the most efficient way to implement it?
Note: The source data is handed to me by an API I have no access to, so there is no chance to change the layout of the complex input vector for better efficiency.
Inspired by Dan M.'s answer, I first implemented his version with some tweaks:
I changed it to use the wider 256-bit registers and marked the temporary re and im arrays with __attribute__((aligned (32))) so that aligned loads can be used.
void computeAbsolute1 (const std::complex<float>* cplxIn, float* absOut, const int length)
{
for (int i = 0; i < length; i += 8)
{
float re[8] __attribute__((aligned (32))) = {cplxIn[i].real(), cplxIn[i + 1].real(), cplxIn[i + 2].real(), cplxIn[i + 3].real(), cplxIn[i + 4].real(), cplxIn[i + 5].real(), cplxIn[i + 6].real(), cplxIn[i + 7].real()};
float im[8] __attribute__((aligned (32))) = {cplxIn[i].imag(), cplxIn[i + 1].imag(), cplxIn[i + 2].imag(), cplxIn[i + 3].imag(), cplxIn[i + 4].imag(), cplxIn[i + 5].imag(), cplxIn[i + 6].imag(), cplxIn[i + 7].imag()};
__m256 x4 = _mm256_load_ps (re);
__m256 y4 = _mm256_load_ps (im);
__m256 b4 = _mm256_sqrt_ps (_mm256_add_ps (_mm256_mul_ps (x4,x4), _mm256_mul_ps (y4,y4)));
_mm256_storeu_ps (absOut + i, b4);
}
}
However, manually shuffling the values this way seemed like something that could be sped up. This is the solution I came up with; it runs 2-3 times faster in a quick test compiled by clang with full optimization:
#include <complex>
#include <immintrin.h>
void computeAbsolute2 (const std::complex<float>* __restrict cplxIn, float* __restrict absOut, const int length)
{
for (int i = 0; i < length; i += 8)
{
// load 8 complex values (--> 16 floats overall) into two SIMD registers
__m256 inLo = _mm256_loadu_ps (reinterpret_cast<const float*> (cplxIn + i ));
__m256 inHi = _mm256_loadu_ps (reinterpret_cast<const float*> (cplxIn + i + 4));
// separate the real and imaginary parts; the values end up in the wrong order
__m256 re = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE (2, 0, 2, 0));
__m256 im = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE (3, 1, 3, 1));
// do the heavy work on the unordered vectors
__m256 abs = _mm256_sqrt_ps (_mm256_add_ps (_mm256_mul_ps (re, re), _mm256_mul_ps (im, im)));
// reorder values prior to storing
__m256d ordered = _mm256_permute4x64_pd (_mm256_castps_pd(abs), _MM_SHUFFLE(3,1,2,0));
_mm256_storeu_ps (absOut + i, _mm256_castpd_ps(ordered));
}
}
I think I'll go with that implementation if no one comes up with a faster solution.
This compiles efficiently with gcc and clang (on the Godbolt compiler explorer).
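One caveat worth noting: both loops step i by 8, so they assume length is a multiple of 8. A minimal wrapper with a scalar tail (a sketch; the name computeAbsoluteAny is made up) could look like this:
#include <complex>
// Assumes computeAbsolute2 from above: process the largest multiple-of-8 prefix
// with AVX2, then fall back to std::abs for the remaining 0-7 elements.
void computeAbsoluteAny (const std::complex<float>* cplxIn, float* absOut, const int length)
{
    const int simdEnd = length & ~7;             // largest multiple of 8 <= length
    computeAbsolute2 (cplxIn, absOut, simdEnd);  // vectorised main part
    for (int i = simdEnd; i < length; ++i)       // scalar remainder
        absOut[i] = std::abs (cplxIn[i]);
}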
It's really hard (if possible at all) to write a "highly optimized AVX2" version of complex abs, since the way complex numbers are defined in the standard (specifically all the inf/nan corner cases) prevents a lot of optimization.
However, if you don't care about that level of correctness, you can just use -ffast-math and some compilers will optimize the code for you. See the gcc output: https://godbolt.org/z/QbZlBI
You can also take this output and create your own abs function with inline assembly.
But yes, as was already mentioned, if you really need performance, you probably want to swap std::complex for something else.
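For illustration, "something else" could be a split (structure-of-arrays) layout. A minimal sketch (the type and function names are made up), where the loop becomes a plain vertical operation that auto-vectorises cleanly:
#include <cmath>
#include <cstddef>
// Hedged sketch of a split-complex (SoA) layout: separate re/im arrays.
struct SplitComplexF
{
    const float* re;
    const float* im;
    std::size_t  n;
};
void computeAbsoluteSoA (const SplitComplexF& in, float* absOut)
{
    for (std::size_t i = 0; i < in.n; ++i)
        absOut[i] = std::sqrt (in.re[i] * in.re[i] + in.im[i] * in.im[i]);
}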
I was able to get a decent output for your specific case with all the required shuffles by manually filling small re and im arrays. See: https://godbolt.org/z/sWAAXo
This could be trivially extended for ymm registers.
Anyway, here is the ultimate solution, adapted from this SO answer, which uses intrinsics in combination with clever compiler optimizations:
#include <complex>
#include <cassert>
#include <immintrin.h>
static inline void cabs_soa4(const float *re, const float *im, float *b) {
__m128 x4 = _mm_loadu_ps(re);
__m128 y4 = _mm_loadu_ps(im);
__m128 b4 = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(x4,x4), _mm_mul_ps(y4,y4)));
_mm_storeu_ps(b, b4);
}
void computeAbsolute (const std::complex<float>* src,
float* realValuedDestinationVec,
int vecLength)
{
for (int i = 0; i < vecLength; i += 4) {
float re[4] = {src[i].real(), src[i + 1].real(), src[i + 2].real(), src[i + 3].real()};
float im[4] = {src[i].imag(), src[i + 1].imag(), src[i + 2].imag(), src[i + 3].imag()};
cabs_soa4(re, im, realValuedDestinationVec + i); // note: advance the destination along with the source
}
}
which compiles to a simple loop:
_Z15computeAbsolutePKSt7complexIfEPfi:
test edx, edx
jle .L5
lea eax, [rdx-1]
shr eax, 2
sal rax, 5
lea rax, [rdi+32+rax]
.L3:
vmovups xmm0, XMMWORD PTR [rdi]
vmovups xmm2, XMMWORD PTR [rdi+16]
add rdi, 32
vshufps xmm1, xmm0, xmm2, 136
vmulps xmm1, xmm1, xmm1
vshufps xmm0, xmm0, xmm2, 221
vfmadd132ps xmm0, xmm1, xmm0
vsqrtps xmm0, xmm0
vmovups XMMWORD PTR [rsi], xmm0
cmp rax, rdi
jne .L3
.L5:
ret
https://godbolt.org/z/Yu64Wg
I'm just starting out on the path of using SIMD intrinsics. My profiler has shown that a significant amount of time is being spent on vertex interpolation. I am targeting AVX2 and am trying to find an optimization for the following: given that I have 3 Vector2s that need interpolation, I imagine I should be able to load them into a single __m256 and do the multiply and add efficiently. Here is the code I am trying to convert; is it worth doing it as a 256-bit operation? The vectors are unaligned.
Vector2 Interpolate( Vector3 uvw, Vector2 v0, Vector2 v1, Vector2 v2 )
{
Vector2 out;
out = v0 * uvw.x;
out += v1 * uvw.y;
out += v2 * uvw.z;
return out;
}
struct Vector2 { float x; float y; } ;
struct Vector3 { float x; float y; float z; } ;
My question is: how do I load three unaligned Vector2s into a single 256-bit register so I can do the multiply and add?
I am using VS2013.
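For the literal loading part, here is a hedged sketch using set/shuffle intrinsics (it reuses the Vector2/Vector3 structs above; whether it actually beats the scalar version is doubtful, and the code below takes a different, SoA-style route):
#include <immintrin.h>
Vector2 InterpolateAVX( Vector3 uvw, Vector2 v0, Vector2 v1, Vector2 v2 )
{
    // lanes: [v0.x, v0.y, v1.x, v1.y, v2.x, v2.y, 0, 0]
    __m256 v = _mm256_setr_ps( v0.x, v0.y, v1.x, v1.y, v2.x, v2.y, 0.0f, 0.0f );
    // matching weights, each duplicated for the x and y lane
    __m256 w = _mm256_setr_ps( uvw.x, uvw.x, uvw.y, uvw.y, uvw.z, uvw.z, 0.0f, 0.0f );
    __m256 p = _mm256_mul_ps( v, w );
    // horizontal reduction: out.x = p0+p2+p4, out.y = p1+p3+p5
    __m128 lo = _mm256_castps256_ps128( p );                // [p0 p1 p2 p3]
    __m128 hi = _mm256_extractf128_ps( p, 1 );              // [p4 p5 0 0]
    __m128 s  = _mm_add_ps( lo, _mm_movehl_ps( lo, lo ) );  // [p0+p2, p1+p3, -, -]
    s = _mm_add_ps( s, hi );                                // [p0+p2+p4, p1+p3+p5, -, -]
    Vector2 out;
    out.x = _mm_cvtss_f32( s );
    out.y = _mm_cvtss_f32( _mm_shuffle_ps( s, s, _MM_SHUFFLE( 1, 1, 1, 1 ) ) );
    return out;
}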
I was bored, so I wrote it. It's not tested, but it compiles, and both Clang and GCC make reasonable code from it:
void interpolateAll(int n, float* scales, float* vin, float* vout)
{
// preconditions:
// (n & 7 == 0) (not really, but vout must be padded)
// scales & 31 == 0
// vin & 31 == 0
// vout & 31 == 0
// vin format:
// float v0x[8]
// float v0y[8]
// float v1x[8]
// float v1y[8]
// float v2x[8]
// float v2y[8]
// scales format:
// float scale0[8]
// float scale1[8]
// float scale2[8]
// vout format:
// float vx[8]
// float vy[8]
for (int i = 0; i < n; i += 8) {
__m256 scale_0 = _mm256_load_ps(scales + i * 3);
__m256 scale_1 = _mm256_load_ps(scales + i * 3 + 8);
__m256 scale_2 = _mm256_load_ps(scales + i * 3 + 16);
__m256 v0x = _mm256_load_ps(vin + i * 6);
__m256 v0y = _mm256_load_ps(vin + i * 6 + 8);
__m256 v1x = _mm256_load_ps(vin + i * 6 + 16);
__m256 v1y = _mm256_load_ps(vin + i * 6 + 24);
__m256 v2x = _mm256_load_ps(vin + i * 6 + 32);
__m256 v2y = _mm256_load_ps(vin + i * 6 + 40);
__m256 x = _mm256_mul_ps(scale_0, v0x);
__m256 y = _mm256_mul_ps(scale_0, v0y);
x = _mm256_fmadd_ps(scale_1, v1x, x);
y = _mm256_fmadd_ps(scale_1, v1y, y);
x = _mm256_fmadd_ps(scale_2, v2x, x);
y = _mm256_fmadd_ps(scale_2, v2y, y);
_mm256_store_ps(vout + i * 2, x);
_mm256_store_ps(vout + i * 2 + 8, y);
}
}
Uses Z boson's format, if I understood him correctly. In any case it's a nice format from a SIMD perspective, if slightly inconvenient from a C++ perspective.
The FMAs do serialize the multiplies unnecessarily, but that shouldn't matter since they're not part of a loop-carried dependency.
The predicted throughput (assuming a small enough array) is 2 iterations per 9 cycles, bottlenecked by the loads: each iteration does 9 loads (3 scales + 6 inputs), and with two load ports that works out to 4.5 cycles per iteration. In practice it's probably slightly worse; there has been some talk about simple stores occasionally stealing p2 or p3, that sort of thing, I'm not really sure. Anyway, 9 cycles is enough time for 18 "FMAs" on two FMA ports, but the loop only has 12 per two iterations (8 FMAs and 4 mulps), so it may be useful to move some extra computation in here if there is any.
I have a C++ function that looks like this:
inline unsigned short function_name(float x, float y, someStruct *cfg)
{
int x_pos = (int)(x*2 + 0.5f);
int y_pos = (int)(y*2 + 0.5f);
int dict_index = x_pos + (y_pos * cfg->subdivisions_adj);
[...]
while someStruct is declared as:
struct someStruct {
int subdivisions;
int subdivisions_adj;
[...]
};
The generated assembly for the third line (int dict_index = [...]) is:
cvttss2si edi,xmm3
imul edi,[ecx+04h]
movss xmm3,[ecx+0ch]
movaps xmm4,xmm3
mulss xmm4,xmm0
addss xmm4,xmm1
cvttss2si eax,xmm4
add edi,eax
(See also the analysis from AMD CodeAnalyst.)
Can anyone explain what this assembly does? I don't know why cvttss2si and movaps are used at all; aren't those for floating-point numbers?
I am using Visual Studio 2008 on Windows 7, with the SSE2 instruction set enabled.
What you are seeing is simply that the compiler merges the first three lines together into one intermingled sequence of instructions.
cvttss2si edi,xmm3
Converts the float in xmm3 to a 32-bit int (with truncation). Presumably xmm3 holds the float value used for y_pos, and this is the (int) cast in the calculation of y_pos.
imul edi,[ecx+04h]
Multiplies by cfg->subdivisions_adj (ecx = cfg, and subdivisions_adj is at offset 4).
movss xmm3,[ecx+0ch]
Loads something from offset 0x0c of cfg, which would be part of the [...] in your struct, I suppose.
movaps xmm4,xmm3
mulss xmm4,xmm0
addss xmm4,xmm1
calculate x_pos = x * 2 + 0.5
cvttss2si eax,xmm4
(int) x_pos;
add edi,eax
Add x_pos to y_pos * cfg->subdivisions_adj;
float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
if(inputLevel < 0.0 && mixValue < 0.0)
{
mixValue = (mixValue + inputLevel) + (mixValue*inputLevel);
}
else
{
mixValue = (mixValue + inputLevel) - (mixValue*inputLevel);
}
}
Just a simple question: can we calculate mixValue without branching? Or are there any other optimization suggestions, such as using SIMD?
Edit: just for more information, I ended up using this solution, based on the chosen answer:
const float sign[] = {-1, 1};
float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
unsigned a = *(unsigned*)(&mixValue);   // raw bits of mixValue
unsigned b = *(unsigned*)(&inputLevel); // raw bits of inputLevel
// (a & b) shifted right by 31 is 1 only when both sign bits are set, i.e. both values are negative
float mulValue = mixValue * inputLevel * sign[(a & b) >> (8*sizeof(unsigned)-1)];
float addValue = mixValue + inputLevel;
mixValue = addValue + mulValue;
}
thank you.
How about this:
const float sign[] = {-1, 1};
float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
int bothNegative = (inputLevel < 0.0) & (mixValue < 0.0);
mixValue = (mixValue + inputLevel) + (sign[bothNegative]*mixValue*inputLevel);
}
Edit: Mike was correct that && would introduce a branch, and thanks to Pedro for proving it. I changed && to & and now GCC (version 4.4.0) generates branch-free code.
Inspired by Roku's answer (which on MSVC++10 branches), this doesn't seem to branch:
#include <iostream>
using namespace std;
const float sign[] = {-1, 1};
int main() {
const int N = 10;
float mixValue = -0.5F;
for(int i = 0; i < N; i++) {
volatile float inputLevel = -0.3F;
int bothNegative = ((((unsigned char*)&inputLevel)[3] & 0x80) & (((unsigned char*)&mixValue)[3] & 0x80)) >> 7;
mixValue = (mixValue + inputLevel) + (sign[bothNegative]*mixValue*inputLevel);
}
std::cout << mixValue << std::endl;
}
Here's the disassembly, as analyzed by IDA Pro (compiled on MSVC++10, Release mode):
Disassembly http://img248.imageshack.us/img248/6865/floattestbranchmine.png
float mixValue = ... //in range -1.0f to 1.0f
for(... ; ... ; ... ) //long loop
{
float inputLevel = ... //in range -1.0f to 1.0f
float mulValue = mixValue * inputLevel;
float addValue = mixValue + inputLevel;
__int32 a = *(__int32*)(&mixValue);
__int32 b = *(__int32*)(&inputLevel);
__int32 c = *(__int32*)(&mulValue);
__int32 d = c ^ (~(a & b) & 0x80000000); // flip the product's sign unless both inputs are negative
mixValue = addValue + *(float*)(&d);
}
Just off the top of my head (I'm sure it can be reduced):
mixValue = (mixValue + inputLevel) + (((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1) / fabs(((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1)))*-1*(mixValue*inputLevel);
Just to clarify a bit, I'll calculate sign separately:
float sign = (((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1) / fabs(((mixValue / fabs(mixValue)) + (inputLevel / fabs(inputLevel))+1)))*-1;
mixValue = (mixValue + inputLevel) + sign*(mixValue*inputLevel);
This is floating point math, so you'll likely need to correct for some rounding issues, but that should set you on the right path I think.
If you are worried about excessive branching, look at Duff's Device. This should help unroll the loop somewhat. Truth be told, loop unrolling is something the optimizer will do for you, so trying to do it by hand may be a waste of time. Check the assembly output to find out.
SIMD will definitely be of assistance, provided you are performing exactly the same operation on each item in your array. Be aware that not all hardware supports SIMD, but some compilers, such as gcc, provide SIMD intrinsics which will save you from dipping into assembler.
If you are using gcc to compile ARM code, the NEON SIMD intrinsics are available via the arm_neon.h header.
Have you benchmarked the loop with and without the branch?
At least you could remove one part of the branch, since mixValue is outside of the loop.
float multiplier(float a, float b){
unsigned char c1Neg = reinterpret_cast<unsigned char *>(&a)[3] & 0x80;
unsigned char c2Neg = reinterpret_cast<unsigned char *>(&b)[3] & 0x80;
unsigned char multiplierIsNeg = c1Neg & c2Neg;
float one = 1;
reinterpret_cast<unsigned char *>(&one)[3] |= multiplierIsNeg;
return -one;
}
cout << multiplier(-1,-1) << endl; // +1
cout << multiplier( 1,-1) << endl; // -1
cout << multiplier( 1, 1) << endl; // -1
cout << multiplier(-1, 1) << endl; // -1
Looking at your code, you can see that you always add the absolute value of mixValue*inputLevel, except when both values are positive (in which case you subtract it).
With some bit-fiddling and IEEE floating-point knowledge, you can get rid of the conditional:
// clears the sign bit of f => makes it positive.
void absf( float& f ) {
assert( sizeof( float ) == sizeof( int ) );
reinterpret_cast<int&>( f ) &= ~0x80000000;
}
// returns 0x80000000 if f is positive (sign bit clear), 0 otherwise
int pos( float& f ) {
return ~(reinterpret_cast<int&>(f) & 0x80000000) & 0x80000000;
}
// returns -fabs( f*g ) if f>0 and g>0, fabs(f*g) otherwise.
float prod( float& f, float& g ) {
float p = f*g;
float& rp=p;
int& ri = reinterpret_cast<int&>(rp);
absf(p);
ri |= ( pos(f) & pos(g) & 0x80000000); // first bit = + & +
return p;
}
int main(){
struct T { float f, g, r;
void test() {
float p = prod(f,g);
float d = (p-r)/r;
assert( -1e-15 < d && d < 1e-15 );
}
};
T vals[] = { {1,1,-1},{1,-1,1},{-1,1,1},{-1,-1,1} };
for( T* val=vals; val != vals+4; ++val ) {
val->test();
}
}
And finally: your loop
for( ... ) {
mixedResult += inputLevel + prod(mixedResult,inputLevel);
}
Note: the dimensions of your accumulation don't match. The inputLevel is a dimensionless quantity, while mixedResult is your... result (e.g. in Pascal, in Volts, ...). You cannot add two quantities with different dimensions. Probably you want mixedResult += prod( mixedResult, inputLevel ) as your accumulator.
Some compilers (e.g. MSVC) would also require manual sign checking.
Source:
volatile float mixValue;
volatile float inputLevel;
float u = mixValue*inputLevel;
float v = -u;
float a[] = { v, u };
mixValue = (mixValue + inputLevel) + a[ (inputLevel<0.0) & (mixValue<0.0) ];
IntelC 11.1:
movss xmm1, DWORD PTR [12+esp]
mulss xmm1, DWORD PTR [16+esp]
movss xmm6, DWORD PTR [12+esp]
movss xmm2, DWORD PTR [16+esp]
movss xmm3, DWORD PTR [16+esp]
movss xmm5, DWORD PTR [12+esp]
xorps xmm4, xmm4
movaps xmm0, xmm4
subss xmm0, xmm1
movss DWORD PTR [esp], xmm0
movss DWORD PTR [4+esp], xmm1
addss xmm6, xmm2
xor eax, eax
cmpltss xmm3, xmm4
movd ecx, xmm3
neg ecx
cmpltss xmm5, xmm4
movd edx, xmm5
neg edx
and ecx, edx
addss xmm6, DWORD PTR [esp+ecx*4]
movss DWORD PTR [12+esp], xmm6
gcc 4.5:
flds 32(%esp)
flds 16(%esp)
fmulp %st, %st(1)
fld %st(0)
fchs
fstps (%esp)
fstps 4(%esp)
flds 32(%esp)
flds 16(%esp)
flds 16(%esp)
flds 32(%esp)
fxch %st(2)
faddp %st, %st(3)
fldz
fcomi %st(2), %st
fstp %st(2)
fxch %st(1)
seta %dl
xorl %eax, %eax
fcomip %st(1), %st
fstp %st(0)
seta %al
andl %edx, %eax
fadds (%esp,%eax,4)
xorl %eax, %eax
fstps 32(%esp)