Compile-time table generation for "ICSIlog" - C++

The following C code is used to generate a lookup table at runtime to help implement the "ICSI" log algorithm (referenced from https://github.com/mgbellemare/SkipCTS/blob/master/src/icsilog.cpp):
/*
This method fills a given array of floats with the information necessary to compute the icsi_log. This method has to be called before any call to icsi_log.

Parameters:
n is the number of bits used from the mantissa (0<=n<=23). Higher n means higher accuracy but slower execution. We found that a good value for n is 14.
lookup_table requires a float* pointing to a continuous (preallocated) memory array of 2^n*sizeof(float) bytes.

Return values: void
*/
void fill_icsi_log_table(const int n, float *lookup_table)
{
    float numlog;
    int incr, i, p;
    int *const exp_ptr = ((int*)&numlog);
    int x = *exp_ptr;       /* x is the float treated as an integer */
    x = 0x3F800000;         /* set the exponent to 0 so numlog = 1.0 */
    *exp_ptr = x;
    incr = 1 << (23 - n);   /* amount to increase the mantissa */
    p = 1 << n;
    for (i = 0; i < p; i++)
    {
        lookup_table[i] = (float) log2(numlog); /* save the log of the value */
        x += incr;
        *exp_ptr = x;       /* update the float value */
    }
}
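For context, this is roughly how the companion icsi_log lookup uses the table (paraphrased from the linked repository, so treat it as a sketch rather than a verbatim copy): the float's exponent supplies the integer part of log2, the top n mantissa bits index the table, and the sum is scaled by ln(2) to produce the natural log.
/* sketch of the lookup, following the structure of icsi_log in the repository */
float icsi_log(float val, const float *lookup_table, const int n)
{
    int x = *(int*)&val;                            /* reinterpret the float's bits         */
    const int log_2 = ((x >> 23) & 255) - 127;      /* unbiased exponent = floor(log2(val)) */
    x &= 0x7FFFFF;                                  /* keep only the mantissa bits          */
    x = x >> (23 - n);                              /* index with the top n mantissa bits   */
    return (lookup_table[x] + log_2) * 0.69314718f; /* (log2 fraction + exponent) * ln(2)   */
}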
/* ICSIlog V 2.0 */
void fill_icsi_log_table2(const unsigned precision, float* const pTable)
{
    /* step along table elements and x-axis positions
       (start with an extra half increment, so the steps intersect at their midpoints) */
    float oneToTwo = 1.0f + (1.0f / (float)( 1 << (precision + 1) ));
    int i;
    for (i = 0; i < (1 << precision); ++i)
    {
        // make the y-axis value for this table element
        pTable[i] = logf(oneToTwo) / 0.69314718055995f;
        oneToTwo += 1.0f / (float)( 1 << precision );
    }
}
Is there a way that either of these functions could be adapted to generate the lookup table at compile time, using templates and C++11-compatible single-return-statement constexpr functions similar to the following structure?
/** Range generation,
 * from http://stackoverflow.com/questions/13313980/populate-an-array-using-constexpr-at-compile-time **/
template<unsigned... Is> struct seq{};

template<unsigned N, unsigned... Is>
struct gen_seq : gen_seq<N-1, N-1, Is...>{};

template<unsigned... Is>
struct gen_seq<0, Is...> : seq<Is...>{};

/** A table consisting of indexes and values,
 * which will all be computed at compile-time **/
template<unsigned N>
struct Table
{
    unsigned indexes[N];
    double values[N];
    static constexpr unsigned length = N;
};

template<typename LambdaType, unsigned... Is>
constexpr Table<sizeof...(Is)> TableGenerator(seq<Is...>, LambdaType evalFunc)
{
    return {{ Is... }, { evalFunc(Is)... }};
}

template<unsigned N, typename LambdaType>
constexpr Table<N> TableGenerator(LambdaType evalFunc)
{
    return TableGenerator(gen_seq<N>(), evalFunc);
}

/** Function that computes a value for each index **/
constexpr double myFunc(unsigned idx)
{
    return sin(0.2 * idx) + cos(0.5 * idx);
}

Working from this example as a starting point, together with the "v2.0" variant of the table generation code shown above, I arrived at this recursive template structure:
#include <math.h>
#include <cstddef>   // for size_t

#define PRECISION (4)

constexpr float table_log(float oneToTwo)
{
    return logf(oneToTwo) / 0.69314718055995f;
}

template<size_t c, size_t precision, float* const* pTable>
struct ForLoop {
    template<template <size_t, size_t, float* const*> class Func>
    static void iterate(float oneToTwo) {
        ForLoop<c - 1, precision, pTable>::template
            iterate<Func>(Func<c - 1, precision, pTable>()(oneToTwo));
    }
};

template<size_t precision, float* const* pTable>
struct ForLoop<0, precision, pTable> {
    template<template <size_t, size_t, float* const*> class Func>
    static void iterate(float oneToTwo) {
        Func<0, precision, pTable>()(oneToTwo);
    }
};

template <size_t index, size_t precision, float* const *pTable>
struct LogTabe {
    float operator()(float oneToTwo) {
        float a = table_log(oneToTwo);
        (*pTable)[(1 << precision) - index] = a;
        return oneToTwo + 1.0f / (float)(1 << precision);
    }
};

static float *const table = new float[1 << PRECISION];
extern float *const table;

int main() {
    ForLoop<(1 << PRECISION) + 1, PRECISION, &table>::iterate<LogTabe>(
        1.0f + (1.0f / (float)(1 << (PRECISION + 1))));
}
Compiled with GCC 8.1 (x86-64) at -std=c++11 -O1, this generates an output table consistent with the original code. The asm output:
mov rax, QWORD PTR table[rip]
mov DWORD PTR [rax], 0x3d35d69b
mov DWORD PTR [rax+4], 0x3e0462c4
mov DWORD PTR [rax+8], 0x3e567af2
mov DWORD PTR [rax+12], 0x3e92203d
mov DWORD PTR [rax+16], 0x3eb7110e
mov DWORD PTR [rax+20], 0x3eda3f60
mov DWORD PTR [rax+24], 0x3efbd42b
mov DWORD PTR [rax+28], 0x3f0df989
mov DWORD PTR [rax+32], 0x3f1d5da0
mov DWORD PTR [rax+36], 0x3f2c2411
mov DWORD PTR [rax+40], 0x3f3a58fe
mov DWORD PTR [rax+44], 0x3f480731
mov DWORD PTR [rax+48], 0x3f553848
mov DWORD PTR [rax+52], 0x3f61f4e6
mov DWORD PTR [rax+56], 0x3f6e44cd
mov DWORD PTR [rax+60], 0x3f7a2f04
mov DWORD PTR [rax+64], 0x3f88759c
mov eax, 0
ret
_GLOBAL__sub_I_main:
sub rsp, 8
mov edi, 64
call operator new[](unsigned long)
mov QWORD PTR table[rip], rax
add rsp, 8
ret
This shows that the table values have been successfully pre-computed at compile time. However, recent versions of Clang refuse to compile the code, on the objection raised by max66 in the comments that the cmath / math.h library functions are not strictly constexpr. (Since the evaluation happens at compile time anyway, a Taylor-series expansion to arbitrary precision, itself implemented as a constexpr function, would likely work fine as a substitute.)
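For example, here is a minimal constexpr substitute for the logf-based table_log (my own sketch, not from the question): it sums the artanh series ln(x) = 2 * sum of z^(2k+1)/(2k+1) with z = (x-1)/(x+1), which converges quickly for x in [1, 2), and every function is a single-return constexpr that C++11 (and Clang) will accept.
// sketch: sum_{j>=k} z^(2j+1)/(2j+1); the caller passes z^(2k+1) and z^2 to avoid a pow()
constexpr double log_series(double z, double zsq, int k, int terms)
{
    return k >= terms ? 0.0
                      : z / (2 * k + 1) + log_series(z * zsq, zsq, k + 1, terms);
}

constexpr double constexpr_ln(double x)   // adequate accuracy for x in [1, 2)
{
    return 2.0 * log_series((x - 1) / (x + 1),
                            ((x - 1) / (x + 1)) * ((x - 1) / (x + 1)),
                            0, 16);
}

constexpr float table_log(float oneToTwo) // drop-in for the logf-based version
{
    return static_cast<float>(constexpr_ln(oneToTwo) / 0.69314718055995);
}
With z <= 1/3 on that interval, 16 terms are far more than enough for float precision.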

Related

_asm function, which prints "Pow (x) = x^2"

I created this piece of code in VS (C++):
#include <iostream>
using namespace std;

static short arr[10];

void powers() {
    _asm {
        mov ecx, 0;
        mov dx, 0;
    for:
        mov ax, cx;
        inc ax;
        mul ax;
        mov [arr + 2 * ecx], ax;
        inc ecx;
        cmp ecx, 10;
        jl for;
    }
}
Now I want to create another function which prints "Pow (x) = x^2". I'm stuck here:
void print_power(unsigned short x) {
    const char* f = "Pow(%d) = %d \n";
    _asm {
        call powers
        push f
        add esp, 4
    }
}
When I call my "powers" function from "print_power", my arr[10] gets filled with the squares of 1 to 10
(arr[0] = 1, arr[1] = 4, arr[2] = 9, arr[3] = 16, arr[4] = 25, ...) (I think).
I want, when I call print_power(x) from main(), for example print_power(8), to print the 8th element from my arr like this: "Pow (8) = 81".
Thank you in advance; I also want to apologize if I have made some mistakes.
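One possible way to finish print_power (my own sketch, not a canonical answer, assuming 32-bit MSVC inline asm; the CRT's printf is called through a function pointer because some CRT versions do not expose a plain printf symbol that inline asm can call directly):
#include <cstdio>

void print_power(unsigned short x) {
    const char* f = "Pow(%d) = %d \n";
    int (*print_fn)(const char*, ...) = printf;  ; indirect call target
    _asm {
        call powers                   ; fill arr with the squares first
        movzx eax, x                  ; zero-extend the 16-bit argument
        lea edx, [eax*2 - 2]          ; byte offset of arr[x-1] (arr is zero-based)
        movzx ecx, word ptr arr[edx]  ; arr[x-1] holds x*x
        push ecx                      ; 3rd printf argument: the square
        push eax                      ; 2nd printf argument: x
        push f                        ; 1st printf argument: the format string
        mov edx, print_fn
        call edx                      ; cdecl call through the pointer
        add esp, 12                   ; caller removes the three dword arguments
    }
}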

SIMD Program slow runtime

I'm starting out with SIMD programming but I don't know what to do at this point. I'm trying to reduce the runtime, but it is going the other way.
This is my basic code:
https://codepaste.net/a8ut89
void blurr2(double * u, double * r) {
    int i;
    double dos[2] = { 2.0, 2.0 };
    for (i = 0; i < SIZE - 1; i++) {
        r[i] = u[i] + u[i + 1];
    }
}
blurr2: 0.43s
int contarNegativos(double * u) {
    int i;
    int contador = 0;
    for (i = 0; i < SIZE; i++) {
        if (u[i] < 0) {
            contador++;
        }
    }
    return contador;
}
negativeCount: 1.38s
void ord(double * v, double * u, double * r) {
    int i;
    for (i = 0; i < SIZE; i += 2) {
        r[i] = *(__int64*)&(v[i]) | *(__int64*)&(u[i]);
    }
}
ord: 0.33
And this is my SIMD code:
https://codepaste.net/fbg1g5
void blurr2(double * u, double * r) {
    __m128d rp2;
    __m128d rdos;
    __m128d rr;
    int i;
    int sizeAux = SIZE % 2 == 1 ? SIZE : SIZE - 1;
    double dos[2] = { 2.0, 2.0 };
    rdos = *(__m128d*)dos;
    for (i = 0; i < sizeAux; i += 2) {
        rp2 = *(__m128d*)&u[i + 1];
        rr = _mm_add_pd(*(__m128d*)&u[i], rp2);
        *((__m128d*)&r[i]) = _mm_div_pd(rr, rdos);
    }
}
blurr2: 0.42s
int contarNegativos(double * u) {
    __m128d rcero;
    __m128d rr;
    int i;
    double cero[2] = { 0.0, 0.0 };
    int contador = 0;
    rcero = *(__m128d*)cero;
    for (i = 0; i < SIZE; i += 2) {
        rr = _mm_cmplt_pd(*(__m128d*)&u[i], rcero);
        if (((__int64 *)&rr)[0]) {
            contador++;
        }
        if (((__int64 *)&rr)[1]) {
            contador++;
        }
    }
    return contador;
}
negativeCount: 1.42s
void ord(double * v, double * u, double * r) {
    __m128d rr;
    int i;
    for (i = 0; i < SIZE; i += 2) {
        *((__m128d*)&r[i]) = _mm_or_pd(*(__m128d*)&v[i], *(__m128d*)&u[i]);
    }
}
ord: 0.35s
I have tried different solutions. Can you explain what I'm doing wrong? I'm a bit lost...
Use _mm_loadu_pd instead of pointer-casting and dereferencing a __m128d. Your code is guaranteed to segfault on gcc/clang where __m128d is assumed to be aligned.
blurr2: multiply by 0.5 instead of dividing by 2; it will be much faster. (I commented the same thing on a question with the exact same code in the last day or two; was that also you?) See the sketch at the end of this answer.
negativeCount: _mm_castpd_si128 the compare result to integer, and accumulate it with _mm_sub_epi64. (The bit pattern is all-zero or all-one, i.e. 2's complement 0 / -1).
#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>   // size_t

static const size_t SIZE = 1024;

uint64_t countNegative(double * u) {
    __m128i counts = _mm_setzero_si128();
    for (size_t i = 0; i < SIZE; i += 2) {
        __m128d cmp = _mm_cmplt_pd(_mm_loadu_pd(&u[i]), _mm_setzero_pd());
        counts = _mm_sub_epi64(counts, _mm_castpd_si128(cmp));
    }
    //return counts[0] + counts[1]; // GNU C only, and less efficient

    // horizontal sum
    __m128i hi64 = _mm_shuffle_epi32(counts, _MM_SHUFFLE(1, 0, 3, 2));
    counts = _mm_add_epi64(counts, hi64);
    uint64_t scalarcount = _mm_cvtsi128_si64(counts);
    return scalarcount;
}
To learn more about efficient vector horizontal sums, see Fastest way to do horizontal float vector sum on x86. But the first rule is to do it outside the loop.
(source + asm on the Godbolt compiler explorer)
From MSVC (which I'm guessing you're using, or you'd get segfaults from *(__m128d*)foo), the inner loop is:
$LL4#countNegat:
movups xmm0, XMMWORD PTR [rcx]
lea rcx, QWORD PTR [rcx+16]
cmpltpd xmm0, xmm2
psubq xmm1, xmm0
sub rax, 1
jne SHORT $LL4#countNegat
It could maybe go faster with unrolling (and maybe two vector accumulators), but this is fairly good and might go close to 1.25 clocks per 16 bytes on Sandybridge/Haswell. (Bottleneck on 5 fused-domain uops).
Your version was actually unpacking to integer inside the inner loop! And if you were using MSVC -Ox, it was actually branching instead of using a branchless compare + conditional add. I'm surprised it wasn't slower than the scalar version.
Also, (int64_t *)&rr violates strict aliasing. char* can alias anything, but it's not safe to cast other pointers onto SIMD vectors and expect it to work. If it does, you got lucky. Compilers usually generate similar code for that or intrinsics, and usually not worse for proper intrinsics.
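For completeness, here is a sketch of blurr2 along those lines (my version, untested against your timing harness, and assuming SIZE is the array length as in your code): unaligned loads/stores plus a multiply by 0.5.
#include <immintrin.h>
#include <stddef.h>

void blurr2(double * u, double * r) {
    const __m128d half = _mm_set1_pd(0.5);
    size_t i = 0;
    for (; i + 2 <= SIZE - 1; i += 2) {            // never read past u[SIZE-1]
        __m128d a = _mm_loadu_pd(&u[i]);           // u[i],   u[i+1]
        __m128d b = _mm_loadu_pd(&u[i + 1]);       // u[i+1], u[i+2]
        _mm_storeu_pd(&r[i], _mm_mul_pd(_mm_add_pd(a, b), half));
    }
    for (; i < SIZE - 1; ++i)                      // scalar cleanup for an odd remainder
        r[i] = (u[i] + u[i + 1]) * 0.5;
}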
Also note that your SIMD ord function is not equivalent (1:1) to the ord function that does not use SIMD instructions.
In the non-SIMD ord function, the result of the OR operation is only calculated for the even indexes:
r[0] = v[0] | u[0],
r[2] = v[2] | u[2],
r[4] = v[4] | u[4]
What about the odd indexes? If the OR were calculated for all indexes, it would probably take more time than it does now.

Run-Time Check Failure #2 - Stack around the variable 'result' was corrupted

I have compiled this code and got a "Run-Time Check Failure #2 - Stack around the variable 'result' was corrupted" error. But when I changed the result array size from 2 to 4, the exception disappeared. Can you explain why this happens?
Sorry if you find this question too basic.
#include "stdafx.h"
string get_cpu_name()
{
uint32_t data[4] = { 0 };
_asm
{
cpuid;
mov data[0], ebx;
mov data[4], edx;
mov data[8], ecx;
}
return string((const char *)data);
}
void assembler()
{
cout << "CPU is " << get_cpu_name() << endl;
float f1[] = { 1.f , 22.f};
float f2[] = { 5.f , 3.f };
float result[2] = { 0.f };
/*float f1[] = { 1.f , 22.f , 1.f , 22.f };
float f2[] = { 5.f , 3.f , 1.f , 22.f };
float result[4] = { 0.f };*/
_asm
{
movups xmm1, f1;
movups xmm2, f2;
mulps xmm1, xmm2;
movups result, xmm1;
}
/*for (size_t i = 0; i < 4; i++)*/
for (size_t i = 0; i < 2; i++)
{
cout << result[i] << "\t";
}
cout << endl;
}
int main()
{
assembler();
getchar();
return 0;
}
The movups instruction writes 128 bits (16 bytes) to memory. You are writing this to the location of an 8-byte array (2 * 4 bytes, or 64 bits), so the 8 bytes after the array also get overwritten.
You should either make sure there are at least 16 bytes of space to receive the result, or store fewer than 16 bytes there.
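For instance (my own sketch, not part of the original answer), the second option keeps the two-element result array and stores only the low 64 bits with movlps, so nothing past result[1] is touched:
float f1[] = { 1.f , 22.f };
float f2[] = { 5.f , 3.f };
float result[2] = { 0.f };
_asm
{
    movlps xmm1, qword ptr f1     ; load just the two floats of f1 into the low half
    movlps xmm2, qword ptr f2
    mulps xmm1, xmm2              ; the upper lanes hold don't-care values that are never stored
    movlps qword ptr result, xmm1 ; write only 8 bytes back
}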

Efficient float to int without overflow

#include <limits>

using int_type = int;

int_type min = std::numeric_limits<int_type>::min();
int_type max = std::numeric_limits<int_type>::max();

int_type convert(float f) {
    if(f < static_cast<float>(min)) return min;      // overflow
    else if(f > static_cast<float>(max)) return max; // overflow
    else return static_cast<int_type>(f);
}
Is there a more efficient way to convert float f to int_type, while clamping it to the minimal and maximal values of the integer type?
For example, without casting min and max to float for the comparisons.
Almost always (not just sometimes), trusting the compiler is the best thing to do.
This code:
template<class Integral>
__attribute__((noinline))
int convert(float f)
{
    using int_type = Integral;
    constexpr int_type min = std::numeric_limits<int_type>::min();
    constexpr int_type max = std::numeric_limits<int_type>::max();
    constexpr float fmin = static_cast<float>(min);
    constexpr float fmax = static_cast<float>(max);

    if(f < fmin) return min; // overflow
    if(f > fmax) return max; // overflow
    return static_cast<int_type>(f);
}
compiled with -O2 and -fomit-frame-pointer, yields:
__Z7convertIiEif: ## #_Z7convertIiEif
.cfi_startproc
movl $-2147483648, %eax ## imm = 0xFFFFFFFF80000000
movss LCPI1_0(%rip), %xmm1 ## xmm1 = mem[0],zero,zero,zero
ucomiss %xmm0, %xmm1
ja LBB1_3
movl $2147483647, %eax ## imm = 0x7FFFFFFF
ucomiss LCPI1_1(%rip), %xmm0
ja LBB1_3
cvttss2si %xmm0, %eax
LBB1_3:
retq
I'm not sure it could be any more efficient.
Note LCPI_x defined here:
.section __TEXT,__literal4,4byte_literals
.align 2
LCPI1_0:
.long 3472883712 ## float -2.14748365E+9
LCPI1_1:
.long 1325400064 ## float 2.14748365E+9
How about clamping using fmin(), fmax()... [thanks to njuffa for the question]
The code does become more efficient, because the conditional jumps are removed. However, it starts to behave incorrectly at the clamping limits.
Consider:
template<class Integral>
__attribute__((noinline))
int convert2(float f)
{
    using int_type = Integral;
    constexpr int_type min = std::numeric_limits<int_type>::min();
    constexpr int_type max = std::numeric_limits<int_type>::max();
    constexpr float fmin = static_cast<float>(min);
    constexpr float fmax = static_cast<float>(max);

    f = std::min(f, fmax);
    f = std::max(f, fmin);
    return static_cast<int_type>(f);
}
call with
auto i = convert2<int>(float(std::numeric_limits<int>::max()));
results in:
-2147483648
Clearly we need to reduce the limits by epsilon because of a float's inability to accurately represent the full range of an int, so...
template<class Integral>
__attribute__((noinline))
int convert2(float f)
{
    using int_type = Integral;
    constexpr int_type min = std::numeric_limits<int_type>::min();
    constexpr int_type max = std::numeric_limits<int_type>::max();
    constexpr float fmin = static_cast<float>(min) - (std::numeric_limits<float>::epsilon() * static_cast<float>(min));
    constexpr float fmax = static_cast<float>(max) - (std::numeric_limits<float>::epsilon() * static_cast<float>(max));

    f = std::min(f, fmax);
    f = std::max(f, fmin);
    return static_cast<int_type>(f);
}
Should be better...
except that now the same function call yields:
2147483392
Incidentally, working on this actually led me to a bug in the original code. Because of the same rounding issue, the > and < operators need to be replaced with >= and <=: static_cast<float>(INT_MAX) rounds up to 2147483648.0f, so an input exactly equal to that value slips past a strict > comparison and then overflows in the final cast.
like so:
template<class Integral>
__attribute__((noinline))
int convert(float f)
{
    using int_type = Integral;
    constexpr int_type min = std::numeric_limits<int_type>::min();
    constexpr int_type max = std::numeric_limits<int_type>::max();
    constexpr float fmin = static_cast<float>(min);
    constexpr float fmax = static_cast<float>(max);

    if(f <= fmin) return min; // overflow
    if(f >= fmax) return max; // overflow
    return static_cast<int_type>(f);
}
For 32-bit integers, you can let the CPU do some of the clamping work for you.
The cvtss2si instruction will actually return 0x80000000 in the case of an out of range floating point number. This lets you eliminate one test most of the time:
int convert(float value)
{
    int result = _mm_cvtss_si32(_mm_load_ss(&value));
    if (result == 0x80000000 && value > 0.0f)
        result = 0x7fffffff;
    return result;
}
If you have lots of them to convert, then _mm_cvtps_epi32 lets you process four at once (with the same behaviour on overflow). That should be much faster than processing them one at a time, but you'd need to structure the code differently to make use of it.
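As a sketch of the four-at-once idea (mine, not from the original answer): _mm_cvttps_epi32 truncates like the scalar static_cast and, like cvtss2si, turns any out-of-range or NaN lane into 0x80000000, so only lanes that overflowed on the positive side need a fix-up to INT_MAX.
#include <immintrin.h>

// Convert 4 floats to 4 int32s with the same clamping as the scalar version.
__m128i convert4(const float *p)
{
    __m128  v       = _mm_loadu_ps(p);
    __m128i raw     = _mm_cvttps_epi32(v);                          // out of range / NaN -> 0x80000000
    __m128  too_big = _mm_cmpge_ps(v, _mm_set1_ps(2147483648.0f));  // lanes that overflowed positively
    // XOR with the all-ones mask flips 0x80000000 to 0x7FFFFFFF in exactly those lanes.
    return _mm_xor_si128(raw, _mm_castps_si128(too_big));
}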
If you just want to truncate, you can take advantage of AVX2 instructions (and AVX-512, where available):
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <float.h>

void p256_hex_u32(__m256i in) {
    alignas(32) uint32_t v[8];
    _mm256_store_si256((__m256i*)v, in);
    printf("v8_u32: %d %d %d %d %d %d %d %d\n", v[0], v[1], v[2], v[3], v[4], v[5], v[6], v[7]);
}

int main() {
    __m256 a = { 5.423423f, -4.243423f, 423.4234234f, FLT_MAX, 79.4234876f, 19.7f, 8.5454f, 7675675.6f };
    __m256i b = _mm256_cvttps_epi32(a);
    p256_hex_u32(b);
}
Compile with:
g++ -std=c++17 -mavx2 a.cpp && ./a.out
and for AVX-512 (my CPU does not support it, so I will not provide a working test; feel free to edit):
_mm512_maskz_cvtt_roundpd_epi64(k, value, _MM_FROUND_NO_EXC);

SSE much slower than regular function

I am making a Julia set visualisation using SSE.
Here is my code.
The class and operators:
class vec4 {
public:
    inline vec4(void) {}
    inline vec4(__m128 val) : v(val) {}
    __m128 v;
    inline void operator=(float *a) { v = _mm_load_ps(a); }
    inline vec4(float *a) { (*this) = a; }
    inline vec4(float a) { (*this) = a; }
    inline void operator=(float a) { v = _mm_load1_ps(&a); }
};

inline vec4 operator+(const vec4 &a, const vec4 &b) { return _mm_add_ps(a.v, b.v); }
inline vec4 operator-(const vec4 &a, const vec4 &b) { return _mm_sub_ps(a.v, b.v); }
inline vec4 operator*(const vec4 &a, const vec4 &b) { return _mm_mul_ps(a.v, b.v); }
inline vec4 operator/(const vec4 &a, const vec4 &b) { return _mm_div_ps(a.v, b.v); }

inline vec4 operator++(const vec4 &a)
{
    __declspec(align(16)) float b[4] = { 1.0f, 1.0f, 1.0f, 1.0f };
    vec4 B(b);
    return _mm_add_ps(a.v, B.v);
}
The function itself:
vec4 TWO(2.0f);
vec4 FOUR(4.0f);
vec4 ZER(0.0f);
vec4 CR(cR);
vec4 CI(cI);

for (int i = 0; i < 320; i++) //H
{
    float *pr = (float*) _aligned_malloc(4 * sizeof(float), 16); //dynamic
    __declspec(align(16)) float pi = i*ratioY + startY;
    for (int j = 0; j < 420; j += 4) //W
    {
        pr[0] = j*ratioX + startX;
        for (int x = 1; x < 4; x++)
        {
            pr[x] = pr[x-1] + ratioX;
        }
        vec4 ZR(pr);
        vec4 ZI(pi);
        __declspec(align(16)) float color[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
        vec4 COLOR(color);
        vec4 COUNT(0.0f);
        __m128 MASK = ZER.v;
        int _count;
        enum { max_count = 100 };
        for (_count = 0; _count <= max_count; _count++)
        {
            vec4 tZR = ZR*ZR - ZI*ZI + CR;
            vec4 tZI = TWO*ZR*ZI + CI;
            vec4 LEN = tZR*tZR + tZI*tZI;
            __m128 MASKOLD = MASK;
            MASK = _mm_cmplt_ps(LEN.v, FOUR.v);
            ZR = _mm_or_ps(_mm_and_ps(MASK, tZR.v), _mm_andnot_ps(MASK, ZR.v));
            ZI = _mm_or_ps(_mm_and_ps(MASK, tZI.v), _mm_andnot_ps(MASK, ZI.v));
            __m128 CHECKNOTEQL = _mm_cmpneq_ps(MASK, MASKOLD);
            COLOR = _mm_or_ps(_mm_and_ps(CHECKNOTEQL, COUNT.v), _mm_andnot_ps(CHECKNOTEQL, COLOR.v));
            COUNT = COUNT++;
            operations += 17;
            if (_mm_movemask_ps((LEN - FOUR).v) == 0) break;
        }
        _mm_store_ps(color, COLOR.v);
The SSE version needs 553k operations (mul, add, if) and takes ~320 ms to finish the task;
on the other hand, the regular function takes 1428k operations but needs only ~90 ms to compute.
I used the VS2010 performance analyser and it seems that all the math operators are running really slowly.
What am I doing wrong?
The problem you are having is that the SSE intrinsics are doing far more memory operations than the non-SSE version. Using your vector class I wrote this:
int main (int argc, char *argv [])
{
    vec4 a (static_cast <float> (argc));
    cout << "argc = " << argc << endl;
    a = ++a;
    cout << "a = (" << a.v.m128_f32 [0] << ", " << a.v.m128_f32 [1] << ", "
         << a.v.m128_f32 [2] << ", " << a.v.m128_f32 [3] << ", " << ")" << endl;
}
which produced the following operations in a release build (I've edited this to show the SSE only):
fild dword ptr [ebp+8] // load argc into FPU
fstp dword ptr [esp+10h] // save argc as a float
movss xmm0,dword ptr [esp+10h] // load argc into SSE
shufps xmm0,xmm0,0 // copy argc to all values in SSE register
movaps xmmword ptr [esp+20h],xmm0 // save to stack frame
fld1 // load 1 into FPU
fst dword ptr [esp+20h]
fst dword ptr [esp+28h]
fst dword ptr [esp+30h]
fstp dword ptr [esp+38h] // create a (1,1,1,1) vector
movaps xmm0,xmmword ptr [esp+2Ch] // load above vector into SSE
addps xmm0,xmmword ptr [esp+1Ch] // add to vector a
movaps xmmword ptr [esp+38h],xmm0 // save back to a
Note: the addresses are relative to ESP and there are a few pushes which explains the weird changes of offset for the same value.
Now, compare the code to this version:
int main (int argc, char *argv [])
{
    float a[4];
    for (int i = 0 ; i < 4 ; ++i)
    {
        a [i] = static_cast <float> (argc + i);
    }
    cout << "argc = " << argc << endl;
    for (int i = 0 ; i < 4 ; ++i)
    {
        a [i] += 1.0f;
    }
    cout << "a = (" << a [0] << ", " << a [1] << ", " << a [2] << ", " << a [3] << ", " << ")" << endl;
}
The compiler created this code for the above (again, edited and with weird offsets)
fild dword ptr [argc] // converting argc to floating point values
fstp dword ptr [esp+8]
fild dword ptr [esp+4] // the argc+i is done in the integer unit
fstp dword ptr [esp+0Ch]
fild dword ptr [esp+8]
fstp dword ptr [esp+18h]
fild dword ptr [esp+10h]
fstp dword ptr [esp+24h] // array a now initialised
fld dword ptr [esp+8] // load a[0]
fld1 // load 1 into FPU
fadd st(1),st // increment a[0]
fxch st(1)
fstp dword ptr [esp+14h] // save a[0]
fld dword ptr [esp+1Ch] // load a[1]
fadd st,st(1) // increment a[1]
fstp dword ptr [esp+24h] // save a[1]
fld dword ptr [esp+28h] // load a[2]
fadd st,st(1) // increment a[2]
fstp dword ptr [esp+28h] // save a[2]
fadd dword ptr [esp+2Ch] // increment a[3]
fstp dword ptr [esp+2Ch] // save a[3]
In terms of memory access, the increment requires:

    SSE                   FPU
    4x float write        1x float read
    1x SSE read           1x float write
    1x SSE read + add     1x float read
    1x SSE write          1x float write
                          1x float read
                          1x float write
                          1x float read
                          1x float write

    Total:
    8 float reads         4 float reads
    8 float writes        4 float writes
This shows the SSE is using twice the memory bandwidth of the FPU version and memory bandwidth is a major bottleneck.
If you want to seriously maximise the SSE then you need to write the whole algorithm in a single SSE assembler function so that you can eliminate the memory reads/writes as much as possible. Using the intrinsics is not an ideal solution for optimisation.
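As an illustration (not part of the original answer), one way to avoid the stack round-trip the disassembly shows is to build constants with _mm_set1_ps instead of loading them from aligned stack arrays, e.g. for the increment operator:
// hypothetical rewrite of operator++ without the __declspec(align(16)) stack array
inline vec4 operator++(const vec4 &a)
{
    return _mm_add_ps(a.v, _mm_set1_ps(1.0f));  // the (1,1,1,1) vector becomes a single constant load or stays in a register
}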
Here is another example (Mandelbrot set) which is almost the same as my implementation of the Julia set algorithm:
http://pastebin.com/J90paPVC, based on http://www.iquilezles.org/www/articles/sse/sse.htm.
Same story, FPU > SSE, even though I skip some irrelevant operations.
Any ideas how to do it right?