SSE much slower than regular function

SSE much slower than regular function - c++

i am making Julia set visualisation using SSE.
here is my code
class and operators
// Thin wrapper around one __m128 register, i.e. four packed single-precision floats.
class vec4 {
public:
    // The wrapped SSE register; public so the free operators and raw intrinsics can reach it.
    __m128 v;

    // Default construction leaves the register contents unspecified.
    inline vec4(void) {}
    // Wraps an existing register value (implicit on purpose: the free operators return __m128).
    inline vec4(__m128 reg) : v(reg) {}
    // Loads four consecutive floats from src; _mm_load_ps requires src to be 16-byte aligned.
    inline vec4(float *src) : v(_mm_load_ps(src)) {}
    // Broadcasts one scalar into all four lanes.
    inline vec4(float scalar) : v(_mm_set1_ps(scalar)) {}

    // Re-loads the register from four 16-byte-aligned floats.
    inline void operator=(float *src) { v = _mm_load_ps(src); }
    // Re-fills every lane with the same scalar.
    inline void operator=(float scalar) { v = _mm_set1_ps(scalar); }
};
// Lane-wise arithmetic on two vec4 values. Each operator returns a fresh vec4
// and leaves both operands untouched.
inline vec4 operator+(const vec4 &lhs, const vec4 &rhs) { return vec4(_mm_add_ps(lhs.v, rhs.v)); }
inline vec4 operator-(const vec4 &lhs, const vec4 &rhs) { return vec4(_mm_sub_ps(lhs.v, rhs.v)); }
inline vec4 operator*(const vec4 &lhs, const vec4 &rhs) { return vec4(_mm_mul_ps(lhs.v, rhs.v)); }
inline vec4 operator/(const vec4 &lhs, const vec4 &rhs) { return vec4(_mm_div_ps(lhs.v, rhs.v)); }
// Returns a copy of `a` with 1.0f added to every lane.
// NOTE: despite the `++` spelling the operand is taken by const reference and is
// NOT modified; callers have to assign the result (e.g. `x = ++x;`).
// The vector of ones is produced with _mm_set1_ps instead of writing four floats
// to an aligned stack array and loading them back on every call.
inline vec4 operator++(const vec4 &a)
{
    return _mm_add_ps(a.v, _mm_set1_ps(1.0f));
}
function itself:
// Julia-set iteration for a 420x320 image, four horizontally adjacent pixels per vec4.
// Broadcast constants shared by every pixel group (cR, cI, ratioX/Y, startX/Y and
// `operations` are defined outside this fragment).
vec4 TWO(2.0f);
vec4 FOUR(4.0f);
vec4 ZER(0.0f);
vec4 CR(cR);
vec4 CI(cI);
for (int i=0; i<320; i++) //H
{
// A 4-float, 16-byte-aligned scratch buffer is allocated on every row iteration.
// NOTE(review): no matching _aligned_free is visible in this fragment — confirm it is released.
float *pr = (float*) _aligned_malloc(4 * sizeof(float), 16); //dynamic
// Imaginary start coordinate, identical for every pixel of this row.
__declspec(align(16)) float pi=i*ratioY + startY;
for (int j=0; j<420; j+=4) //W
{
// Real start coordinates of the four pixels j..j+3, each one ratioX further right.
pr[0]=j*ratioX + startX;
for(int x=1;x<4;x++)
{
pr[x]=pr[x-1]+ratioX;
}
// ZR is loaded from the aligned buffer; ZI broadcasts the row's imaginary coordinate.
vec4 ZR(pr);
vec4 ZI(pi);
__declspec(align(16)) float color[4]={0.0f,0.0f,0.0f,0.0f};
vec4 COLOR(color);
vec4 COUNT(0.0f);
__m128 MASK=ZER.v;
int _count;
enum {max_count=100};
// The bound is inclusive, so up to max_count+1 iterations run per pixel group.
for (_count=0;_count<=max_count;_count++)
{
// One step z = z*z + c, split into real (tZR) and imaginary (tZI) parts.
vec4 tZR=ZR*ZR-ZI*ZI+CR;
vec4 tZI=TWO*ZR*ZI+CI;
// Squared magnitude of the candidate value.
vec4 LEN=tZR*tZR+tZI*tZI;
__m128 MASKOLD=MASK;
// Per-lane mask: all bits set where the squared magnitude is still below 4.
MASK=_mm_cmplt_ps(LEN.v,FOUR.v);
// Branch-free select: lanes inside the mask take the new value, the others keep the old one.
ZR=_mm_or_ps(_mm_and_ps(MASK,tZR.v),_mm_andnot_ps(MASK,ZR.v));
ZI=_mm_or_ps(_mm_and_ps(MASK,tZI.v),_mm_andnot_ps(MASK,ZI.v));
// NOTE(review): the masks are compared as floats here; an all-ones lane is a NaN bit
// pattern, so confirm _mm_cmpneq_ps really yields "mask changed" for those lanes.
__m128 CHECKNOTEQL=_mm_cmpneq_ps(MASK,MASKOLD);
// Lanes flagged by CHECKNOTEQL record the current COUNT as their colour.
COLOR=_mm_or_ps(_mm_and_ps(CHECKNOTEQL,COUNT.v),_mm_andnot_ps(CHECKNOTEQL,COLOR.v));
// NOTE(review): only a prefix operator++ (returning a+1 without modifying its operand)
// is declared above; presumably the compiler falls back to it for this postfix use — confirm.
COUNT=COUNT++;
operations+=17;
// Leave the loop once no lane of LEN-FOUR has its sign bit set (no lane is below 4 any more).
if (_mm_movemask_ps((LEN-FOUR).v)==0) break;
}
// Copy the four per-pixel colour values back into the float array.
_mm_store_ps(color,COLOR.v);
The SSE version needs 553k operations (mul, add, if) and takes ~320 ms to finish the task.
On the other hand, the regular function needs 1428k operations but takes only ~90 ms to compute.
I used the VS2010 performance analyser, and it seems that all the maths operators are running really slowly.
What am I doing wrong?

The problem you are having is that the SSE intrinsics are doing far more memory operations than the non-SSE version. Using your vector class I wrote this:
// Demo for the vec4 class: broadcast argc into all four lanes, add one to each
// lane through operator++, then print the lanes.
int main (int argc, char *argv [])
{
vec4 a (static_cast <float> (argc));
cout << "argc = " << argc << endl;
// The value returned by ++a is assigned back to a.
a=++a;
// m128_f32 gives per-lane access to the __m128
// (NOTE(review): presumably the MSVC-specific union member — confirm for other compilers).
cout << "a = (" << a.v.m128_f32 [0] << ", " << a.v.m128_f32 [1] << ", " << a.v.m128_f32 [2] << ", " << a.v.m128_f32 [3] << ", " << ")" << endl;
}
which produced the following operations in a release build (I've edited this to show the SSE only):
fild dword ptr [ebp+8] // load argc into FPU
fstp dword ptr [esp+10h] // save argc as a float
movss xmm0,dword ptr [esp+10h] // load argc into SSE
shufps xmm0,xmm0,0 // copy argc to all values in SSE register
movaps xmmword ptr [esp+20h],xmm0 // save to stack frame
fld1 // load 1 into FPU
fst dword ptr [esp+20h]
fst dword ptr [esp+28h]
fst dword ptr [esp+30h]
fstp dword ptr [esp+38h] // create a (1,1,1,1) vector
movaps xmm0,xmmword ptr [esp+2Ch] // load above vector into SSE
addps xmm0,xmmword ptr [esp+1Ch] // add to vector a
movaps xmmword ptr [esp+38h],xmm0 // save back to a
Note: the addresses are relative to ESP and there are a few pushes which explains the weird changes of offset for the same value.
Now, compare the code to this version:
// Scalar counterpart of the vec4 demo: fills a four-element float array from argc,
// adds 1.0f to every element and prints the result.
int main (int argc, char *argv [])
{
float a[4];
// Element i starts as argc + i (the addition is done on ints, then converted to float).
for (int i = 0 ; i < 4 ; ++i)
{
a [i] = static_cast <float> (argc + i);
}
cout << "argc = " << argc << endl;
// The increment being compared against the SSE version.
for (int i = 0 ; i < 4 ; ++i)
{
a [i] += 1.0f;
}
cout << "a = (" << a [0] << ", " << a [1] << ", " << a [2] << ", " << a [3] << ", " << ")" << endl;
}
The compiler created this code for the above (again, edited and with weird offsets)
fild dword ptr [argc] // converting argc to floating point values
fstp dword ptr [esp+8]
fild dword ptr [esp+4] // the argc+i is done in the integer unit
fstp dword ptr [esp+0Ch]
fild dword ptr [esp+8]
fstp dword ptr [esp+18h]
fild dword ptr [esp+10h]
fstp dword ptr [esp+24h] // array a now initialised
fld dword ptr [esp+8] // load a[0]
fld1 // load 1 into FPU
fadd st(1),st // increment a[0]
fxch st(1)
fstp dword ptr [esp+14h] // save a[0]
fld dword ptr [esp+1Ch] // load a[1]
fadd st,st(1) // increment a[1]
fstp dword ptr [esp+24h] // save a[1]
fld dword ptr [esp+28h] // load a[2]
fadd st,st(1) // increment a[2]
fstp dword ptr [esp+28h] // save a[2]
fadd dword ptr [esp+2Ch] // increment a[3]
fstp dword ptr [esp+2Ch] // save a[3]
In terms of memory access, the increment requires:
SSE FPU
4xfloat write 1xfloat read
1xsse read 1xfloat write
1xsse read+add 1xfloat read
1xsse write 1xfloat write
1xfloat read
1xfloat write
1xfloat read
1xfloat write
total
8 float reads 4 float reads
8 float writes 4 float writes
This shows the SSE is using twice the memory bandwidth of the FPU version and memory bandwidth is a major bottleneck.
If you want to seriously maximise the SSE performance then you need to write the whole algorithm in a single SSE assembler function so that you can eliminate the memory reads/writes as much as possible. Using the intrinsics is not an ideal solution for optimisation.

Here is another example (Mandelbrot sets) which is almost the same as my implementation of the Julia set algorithm:
http://pastebin.com/J90paPVC based on http://www.iquilezles.org/www/articles/sse/sse.htm.
Same story: the FPU version beats the SSE version, even though I skip some irrelevant operations.
Any ideas how to do it right?

Related

Compile-time table generation for "ICSIlog"

The following C code is used to generate a lookup table at runtime to help implement the "ICSI" log algorithm (referenced from https://github.com/mgbellemare/SkipCTS/blob/master/src/icsilog.cpp):
/*
This method fills a given array of floats with the information necessary to compute the icsi_log. This method has to be called before any call to icsi_log.
Parameters:
n is the number of bits used from the mantissa (0<=n<=23). Higher n means higher accuracy but slower execution. We found that a good value for n is 14.
lookup_table requires a float* pointing to a continuous (preallocated) memory array of 2^n*sizeof(float) bytes.
Return values: void
*/
void fill_icsi_log_table(const int n, float *lookup_table)
{
float numlog;
int incr,i,p;
int *const exp_ptr = ((int*)&numlog);
int x = *exp_ptr; /*x is the float treated as an integer*/
x = 0x3F800000; /*set the exponent to 0 so numlog=1.0*/
*exp_ptr = x;
incr = 1 << (23-n); /*amount to increase the mantissa*/
p = 1 << n;
for(i=0;i<p;i++)
{
lookup_table[i] = (float) log2(numlog); /*save the log of the value*/
x += incr;
*exp_ptr = x; /*update the float value*/
}
}
/* ICSIlog V 2.0 */
void fill_icsi_log_table2(const unsigned precision, float* const pTable)
{
/* step along table elements and x-axis positions
(start with extra half increment, so the steps intersect at their midpoints.) */
float oneToTwo = 1.0f + (1.0f / (float)( 1 <<(precision + 1) ));
int i;
for(i = 0; i < (1 << precision); ++i )
{+
// make y-axis value for table element
pTable[i] = logf(oneToTwo) / 0.69314718055995f;
oneToTwo += 1.0f / (float)( 1 << precision );
}
}
Is there a way that either of these functions could be adapted to generate a lookup table at compile-time using templates and C++11-amenable single-line return constexpr functions similar to the following structure?
/** Range generation,
 * from http://stackoverflow.com/questions/13313980/populate-an-array-using-constexpr-at-compile-time **/
// seq<...> is an empty tag type that only carries a pack of unsigned indices.
template<unsigned... Indices>
struct seq {};
// gen_seq<N> inherits, one recursion step per index, from seq<0, 1, ..., N-1>:
// each step prepends Remaining-1 to the pack and counts Remaining down.
template<unsigned Remaining, unsigned... Indices>
struct gen_seq : gen_seq<Remaining - 1, Remaining - 1, Indices...> {};
// Recursion end: nothing left to prepend, so derive from the finished seq.
template<unsigned... Indices>
struct gen_seq<0, Indices...> : seq<Indices...> {};
/** Aggregate holding N indexes and the N values that belong to them;
 * intended to be filled in a constant expression **/
template<unsigned N>
struct Table
{
    // Number of entries in each of the two arrays.
    static constexpr unsigned length = N;
    unsigned indexes[N];
    double values[N];
};
// Builds the table from an explicit index pack: the indexes are the pack itself
// and each value is evalFunc applied to the corresponding index.
template< typename LambdaType, unsigned... Is>
constexpr Table< sizeof...(Is) > TableGenerator(seq<Is...>, LambdaType evalFunc)
{
    return Table< sizeof...(Is) >{{ Is... }, { evalFunc(Is)... }};
}
// Entry point: generates the index pack 0..N-1 and forwards to the overload above.
template<unsigned N, typename LambdaType>
constexpr Table<N> TableGenerator( LambdaType evalFunc )
{
    return TableGenerator(gen_seq<N>{}, evalFunc);
}
/** Sample generator: maps a table index to sin(0.2*idx) + cos(0.5*idx) **/
constexpr double myFunc( unsigned idx )
{
    return sin(idx * 0.2) + cos(idx * 0.5);
}

Working from this example as a starting point and the "v2.0" variant of the table generation code:
/* ICSIlog V 2.0 */
void fill_icsi_log_table2(const unsigned precision, float* const pTable)
{
/* step along table elements and x-axis positions
(start with extra half increment, so the steps intersect at their midpoints.) */
float oneToTwo = 1.0f + (1.0f / (float)( 1 <<(precision + 1) ));
int i;
for(i = 0; i < (1 << precision); ++i )
{
// make y-axis value for table element
pTable[i] = logf(oneToTwo) / 0.69314718055995f;
oneToTwo += 1.0f / (float)( 1 << precision );
}
}
This recursive template structure:
#include <math.h>
#define PRECISION (4)
// Base-2 logarithm of oneToTwo, computed as the natural log divided by ln 2.
constexpr float table_log(float oneToTwo)
{
    return logf(oneToTwo) / 0.69314718055995f;
}

// Compile-time unrolled loop. ForLoop<c, ...>::iterate applies Func<c-1, ...> to the
// running value and passes the functor's return value on to ForLoop<c-1, ...>.
template<size_t c, size_t precision, float* const* pTable>
struct ForLoop {
    template<template <size_t, size_t, float* const*> class Func>
    static void iterate(float oneToTwo) {
        ForLoop<c - 1, precision, pTable>::template
            iterate<Func>(Func<c - 1, precision, pTable>()(oneToTwo));
    }
};

// Recursion end: applies Func<0, ...> one final time.
template<size_t precision, float* const* pTable>
struct ForLoop<0, precision, pTable> {
    template<template <size_t, size_t, float* const*> class Func>
    static void iterate(float oneToTwo) {
        Func<0, precision, pTable>()(oneToTwo);
    }
};

// Loop body: stores log2(oneToTwo) in slot (1 << precision) - index of *pTable and
// returns the next x-axis position (oneToTwo advanced by one table step).
template <size_t index, size_t precision, float* const *pTable>
struct LogTabe {
    float operator()(float oneToTwo) {
        const size_t size = static_cast<size_t>(1) << precision;
        const size_t slot = size - index;
        // The table holds exactly `size` entries. For index == 0 the slot equals
        // `size` (one past the end) and for index > size the subtraction wraps
        // around, so only in-range slots are written.
        if (slot < size) {
            (*pTable)[slot] = table_log(oneToTwo);
        }
        return oneToTwo + 1.0f / (float)(1 << precision);
    }
};
// Destination table: 1 << PRECISION floats allocated during static initialisation.
static float *const table = new float[1 << PRECISION];
// Redeclaration of the same pointer
// (NOTE(review): presumably needed so &table is accepted as a template argument — confirm).
extern float *const table;
int main() {
// Starts the unrolled loop at 1 plus half a table step.
// NOTE(review): the loop is instantiated with (1 << PRECISION) + 1 steps while the table
// holds 1 << PRECISION floats — confirm that no step stores to index 1 << PRECISION.
ForLoop<(1 << PRECISION) + 1, PRECISION, &table>::iterate<LogTabe>(1.0f + (1.0f / (float)( 1 << (PRECISION + 1))));
}
Compiled with gcc x86-64 8.1, -std=c++11 -O1, generates an output table consistent with the original code and the asm output:
mov rax, QWORD PTR table[rip]
mov DWORD PTR [rax], 0x3d35d69b
mov DWORD PTR [rax+4], 0x3e0462c4
mov DWORD PTR [rax+8], 0x3e567af2
mov DWORD PTR [rax+12], 0x3e92203d
mov DWORD PTR [rax+16], 0x3eb7110e
mov DWORD PTR [rax+20], 0x3eda3f60
mov DWORD PTR [rax+24], 0x3efbd42b
mov DWORD PTR [rax+28], 0x3f0df989
mov DWORD PTR [rax+32], 0x3f1d5da0
mov DWORD PTR [rax+36], 0x3f2c2411
mov DWORD PTR [rax+40], 0x3f3a58fe
mov DWORD PTR [rax+44], 0x3f480731
mov DWORD PTR [rax+48], 0x3f553848
mov DWORD PTR [rax+52], 0x3f61f4e6
mov DWORD PTR [rax+56], 0x3f6e44cd
mov DWORD PTR [rax+60], 0x3f7a2f04
mov DWORD PTR [rax+64], 0x3f88759c
mov eax, 0
ret
_GLOBAL__sub_I_main:
sub rsp, 8
mov edi, 64
call operator new[](unsigned long)
mov QWORD PTR table[rip], rax
add rsp, 8
ret
Showing that the table values have been successfully pre-computed at compile-time. However recent versions of Clang refuse to compile the code on the objection given by max66 in the comments that the "cmath" and "math.h" library functions are not strictly constexpr (but since it's being evaluated at compile-time anyway, a Taylor series expansion to arbitrary precision itself implemented as a constexpr function would likely work fine as a substitute.)

Why AVX dot product slower than native C++ code

I have the following AVX and Native codes:
// AVX dot product of two 4-element double vectors.
// u and v are read with _mm256_load_pd, which requires 32-byte-aligned pointers.
__forceinline double dotProduct_2(const double* u, const double* v)
{
// Clears the upper 128 bits of all YMM registers before the AVX code runs.
_mm256_zeroupper();
// Lane-wise products u[k]*v[k] for k = 0..3.
__m256d xy = _mm256_mul_pd(_mm256_load_pd(u), _mm256_load_pd(v));
// Horizontal add inside each 128-bit half: {p0+p1, p0+p1, p2+p3, p2+p3}.
__m256d temp = _mm256_hadd_pd(xy, xy);
// Adding the low and the high half leaves the full sum in lane 0.
__m128d dotproduct = _mm_add_pd(_mm256_extractf128_pd(temp, 0), _mm256_extractf128_pd(temp, 1));
// NOTE(review): m128d_f64 is presumably the MSVC-specific lane accessor — confirm for other compilers.
return dotproduct.m128d_f64[0];
}
// Plain C++ reference: sum of the four element-wise products of a and b.
// D3 is declared outside this snippet; only its operator[] for indices 0..3 is used here.
__forceinline double dotProduct_1(const D3& a, const D3& b)
{
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}
And respective test scripts:
// Timing harness: each variant is called 2^30 times on neighbouring entries of aVx
// and the results are accumulated so the calls contribute to a printed value.
// res_*, zx_* and aVx are defined outside this fragment.
std::cout << res_1 << " " << res_2 << " " << res_3 << '\n';
{
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
// Indices wrap at 10000, so entry i is always paired with entry i+1 (modulo 10000).
for (int i = 0; i < (1 << 30); ++i)
{
zx_1 += dotProduct_1(aVx[i % 10000], aVx[(i + 1) % 10000]);
}
std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
// Elapsed wall-clock time in milliseconds.
std::cout << "NAIVE : " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << '\n';
}
{
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
// Same access pattern, but the AVX variant receives raw pointers to element 0.
for (int i = 0; i < (1 << 30); ++i)
{
zx_2 += dotProduct_2(&aVx[i % 10000][0], &aVx[(i + 1) % 10000][0]);
}
std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
std::cout << "AVX : " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << '\n';
}
// Printing both accumulators keeps the loops from being discarded as unused.
std::cout << math::min2(zx_1, zx_2) << " " << zx_1 << " " << zx_2;
Well, all of the data are aligned by 32. (D3 with __declspec... and aVx arr with _mm_malloc()..)
As far as I can see, the native variant is as fast as or faster than the AVX variant. Is this normal behaviour? I thought AVX was 'super FAST'... If not, how can I optimize it? I compile with MSVC 2015 (x64), with /arch:AVX. My hardware is an Intel i7 4750HQ (Haswell).

Simple profiling with basic loops isn't a great idea - it usually just means you are memory bandwidth limited, so the tests end up coming out at about the same speed (memory is typically slower than the CPU, and that's basically all you are testing here).
As others have said, your code example isn't great, because you are constantly going across the lanes (which I assume is just to find the fastest dot product, and not specifically because a sum of all the dot products is the desired result?). To be honest, if you really need a fast dot product (for AOS data as presented here), I think I would prefer to replace the VHADDPD with a VADDPD + VPERMILPD (trading an additional instruction for twice the throughput, and a lower latency)
// Dot product of two 4-element double vectors without horizontal-add instructions.
// u and v are read with _mm256_load_pd, which requires 32-byte-aligned pointers.
double dotProduct_3(const double* u, const double* v)
{
// Lane-wise products p0..p3.
__m256d dp = _mm256_mul_pd(_mm256_load_pd(u), _mm256_load_pd(v));
// Split into the low half {p0, p1} and the high half {p2, p3}.
__m128d a = _mm256_extractf128_pd(dp, 0);
__m128d b = _mm256_extractf128_pd(dp, 1);
// c = {p0+p2, p1+p3}
__m128d c = _mm_add_pd(a, b);
// Duplicate the upper lane of c so it can be added to the lower one.
__m128d yy = _mm_unpackhi_pd(c, c);
// Lane 0 now holds p0+p2+p1+p3.
__m128d dotproduct = _mm_add_pd(c, yy);
return _mm_cvtsd_f64(dotproduct);
}
asm:
dotProduct_3(double const*, double const*):
vmovapd ymm0,YMMWORD PTR [rsi]
vmulpd ymm0,ymm0,YMMWORD PTR [rdi]
vextractf128 xmm1,ymm0,0x1
vaddpd xmm0,xmm1,xmm0
vpermilpd xmm1,xmm0,0x3
vaddpd xmm0,xmm1,xmm0
vzeroupper
ret
Generally speaking, if you are using horizontal adds, you're doing it wrong! Whilst a 256bit register may seem ideal for a Vector4d, it's not actually a particularly great representation (especially if you consider that AVX512 is now available!). A very similar question to this came up recently: For C++ Vector3 utility class implementations, is array faster than struct and class?
If you want performance, then structure-of-arrays is the best way to go.
// Structure-of-arrays layout: one 256-bit register (four doubles) per component.
// NOTE(review): presumably lane k of x/y/z/w together form the k-th 4D vector — confirm with callers.
struct HybridVec4SOA
{
__m256d x;
__m256d y;
__m256d z;
__m256d w;
};
// Per-lane a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w, built from one multiply and
// three fused multiply-adds; each lane of the result is an independent dot product.
__m256d dot(const HybridVec4SOA& a, const HybridVec4SOA& b)
{
return _mm256_fmadd_pd(a.w, b.w,
_mm256_fmadd_pd(a.z, b.z,
_mm256_fmadd_pd(a.y, b.y,
_mm256_mul_pd(a.x, b.x))));
}
asm:
dot(HybridVec4SOA const&, HybridVec4SOA const&):
vmovapd ymm1,YMMWORD PTR [rdi+0x20]
vmovapd ymm2,YMMWORD PTR [rdi+0x40]
vmovapd ymm3,YMMWORD PTR [rdi+0x60]
vmovapd ymm0,YMMWORD PTR [rsi]
vmulpd ymm0,ymm0,YMMWORD PTR [rdi]
vfmadd231pd ymm0,ymm1,YMMWORD PTR [rsi+0x20]
vfmadd231pd ymm0,ymm2,YMMWORD PTR [rsi+0x40]
vfmadd231pd ymm0,ymm3,YMMWORD PTR [rsi+0x60]
ret
If you compare the latencies (and more importantly throughput) of load/mul/fmadd compared to hadd and extract, and then consider that the SOA version is computing 4 dot products at a time (instead of 1), you'll start to understand why it's the way to go...

You add too much overhead with vzeroupper and hadd instructions. Good way to write it, is to do all multiplies in a loop and aggregate the result just once at the end. Imagine you unroll original loop 4 times and use 4 accumulators:
// Sketch: the loop is unrolled four times with one independent accumulator per
// element offset; the partial sums are combined only once, after the loop.
for(i=0; i < (1<<30); i+=4) {
s0 += a[i+0] * b[i+0];
s1 += a[i+1] * b[i+1];
s2 += a[i+2] * b[i+2];
s3 += a[i+3] * b[i+3];
}
// Single final reduction of the four accumulators.
return s0+s1+s2+s3;
And now just replace unrolled loop with SIMD mul and add (or even FMA intrinsic if available)

Run-Time Check Failure #2 - Stack around the variable 'result' was corrupted

I have compiled this code and got Run-Time Check Failure #2 - Stack around the variable 'result' was corrupted exception. But when I changed result array size from 2 to 4 exception disappeared. Can you explain why this happens?
Sorry, if you found this question too basic.
#include "stdafx.h"
// Returns the 12-character vendor identification string reported by CPUID leaf 0
// (for example "GenuineIntel").
string get_cpu_name()
{
    // 16 zero-initialised bytes: 12 are filled below, the rest stay 0 and act
    // as the string terminator.
    uint32_t data[4] = { 0 };
    _asm
    {
        xor eax, eax;       // CPUID takes its leaf number in EAX; leaf 0 returns the vendor string
        cpuid;
        // Subscripts inside the asm block are byte offsets; the vendor string is
        // laid out in the order EBX, EDX, ECX.
        mov data[0], ebx;
        mov data[4], edx;
        mov data[8], ecx;
    }
    return string((const char *)data);
}
// Prints the CPU name, multiplies f1 and f2 element-wise with SSE inline assembly
// and prints the first two products.
void assembler()
{
cout << "CPU is " << get_cpu_name() << endl;
// Each array holds two floats (8 bytes).
float f1[] = { 1.f , 22.f};
float f2[] = { 5.f , 3.f };
float result[2] = { 0.f };
/*float f1[] = { 1.f , 22.f , 1.f , 22.f };
float f2[] = { 5.f , 3.f , 1.f , 22.f };
float result[4] = { 0.f };*/
_asm
{
// movups transfers 16 bytes (four floats) at a time, so with the 8-byte arrays
// above both loads read past f1/f2 and the store writes 8 bytes past `result`.
movups xmm1, f1;
movups xmm2, f2;
mulps xmm1, xmm2;
movups result, xmm1;
}
/*for (size_t i = 0; i < 4; i++)*/
for (size_t i = 0; i < 2; i++)
{
cout << result[i] << "\t";
}
cout << endl;
}
// Runs the SSE demo, then waits for a character on stdin before exiting.
int main()
{
assembler();
getchar();
return 0;
}

The movups instruction writes 128 bits (16 bytes) to memory. You are writing this to the location of an 8-byte array (2*4 bytes, or 64 bits). The 8 bytes after the array will also be written to.
You should make sure there are at least 16 bytes of space to write the result, or you should make sure to write less than 16 bytes there.

Incorrect results when using SSE instrinsics in Visual Studio 2010/2012 and Release mode

I'm computing the mean and variance of an array using SSE intrinsics. Basically, this is the summation of the values and its squares which can be illustrated in the following program:
// Sums the values 1..stSize and their squares with SSE intrinsics and prints the
// results next to the closed-form expected values.
int main( int argc, const char* argv[] )
{
// Union used to read the four float lanes of an __m128 individually.
union u
{
__m128 m;
float f[4];
} x;
// Allocate memory and initialize data: [1,2,3,...stSize+1]
const size_t stSize = 1024;
// NOTE(review): the result of _aligned_malloc is used without a null check.
float *pData = (float*) _aligned_malloc(stSize*sizeof(float), 32);
for ( size_t s = 0; s < stSize; ++s ) {
pData[s] = s+1;
}
// Sum and sum of squares
{
// Accumlation using SSE intrinsics
__m128 mEX = _mm_set_ps1(0.f);
__m128 mEXX = _mm_set_ps1(0.f);
// Four floats per step; stSize (1024) is a multiple of 4, so there is no remainder to handle.
for ( size_t s = 0; s < stSize; s+=4 )
{
__m128 m = _mm_load_ps(pData + s);
mEX = _mm_add_ps(mEX, m);
mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
}
// Final reduction
// Each accumulator is copied into the union and its four lanes are added up.
x.m = mEX;
double dEX = x.f[0] + x.f[1] + x.f[2] + x.f[3];
x.m = mEXX;
double dEXX = x.f[0] + x.f[1] + x.f[2] + x.f[3];
// Expected values come from the closed forms n(n+1)/2 and n(n+1)(2n+1)/6.
std::cout << "Sum expected: " << (stSize * stSize + stSize) / 2 << std::endl;
std::cout << "EX: " << dEX << std::endl;
std::cout << "Sum of squares expected: " << 1.0/6.0 * stSize * (stSize + 1) * (2 * stSize + 1) << std::endl;
std::cout << "EXX: " << dEXX << std::endl;
}
// Clean up
_aligned_free(pData);
}
Now when I compile and run the program in Debug mode I get the following (and correct) output:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.58438e+008
However, compiling and running the program in Release mode the following (and wrong) results are produced:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.49272e+012
Changing the order of accumulation, i.e. EXX is updated before EX, the results are OK:
Sum expected: 524800
EX: 524800
Sum of squares expected: 3.58438e+008
EXX: 3.58438e+008
Looks like a 'counterproductive' compiler optimization or why is the order of execution relevant? Is this a known bug?
EDIT:
I just looked at the assembler output. Here is what I get (only the relevant parts).
For the release build with /arch:AVX compiler flag we have:
; 69 : // Second test: sum and sum of squares
; 70 : {
; 71 : __m128 mEX = _mm_set_ps1(0.f);
vmovaps xmm1, XMMWORD PTR __xmm#0
mov ecx, 256 ; 00000100H
; 72 : __m128 mEXX = _mm_set_ps1(0.f);
vmovaps xmm2, xmm1
npad 12
$LL3#main:
; 73 : for ( size_t s = 0; s < stSize; s+=4 )
; 74 : {
; 75 : __m128 m = _mm_load_ps(pData + s);
vmovaps xmm0, xmm1
; 76 : mEX = _mm_add_ps(mEX, m);
vaddps xmm1, xmm1, XMMWORD PTR [rax]
add rax, 16
; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
vmulps xmm0, xmm0, xmm0
vaddps xmm2, xmm0, xmm2
dec rcx
jne SHORT $LL3#main
This is clearly wrong, as this (1) saves the accumulated EX result (xmm1) in xmm0, (2) accumulates EX with the current value (XMMWORD PTR [rax]), and (3) accumulates in EXX (xmm2) the square of the accumulated EX result previously saved in xmm0.
In contrast, the version without the /arch:AVX looks fine and as expected:
; 69 : // Second test: sum and sum of squares
; 70 : {
; 71 : __m128 mEX = _mm_set_ps1(0.f);
movaps xmm1, XMMWORD PTR __xmm#0
mov ecx, 256 ; 00000100H
; 72 : __m128 mEXX = _mm_set_ps1(0.f);
movaps xmm2, xmm1
npad 10
$LL3#main:
; 73 : for ( size_t s = 0; s < stSize; s+=4 )
; 74 : {
; 75 : __m128 m = _mm_load_ps(pData + s);
movaps xmm0, XMMWORD PTR [rax]
add rax, 16
dec rcx
; 76 : mEX = _mm_add_ps(mEX, m);
addps xmm1, xmm0
; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m));
mulps xmm0, xmm0
addps xmm2, xmm0
jne SHORT $LL3#main
This really looks like a bug. Can anyone confirm or refute this issue with a different compiler version? (I currently do not have permission to update the compiler.)

Instead of manually performing the horizontal addition, I'd recommend using the corresponding SSE instruction _mm_hadd_ps
// Final reduction
// Both accumulators are reduced together with two horizontal adds; the two sums
// are then widened to double and stored into a plain array.
__m128 sum1 = _mm_hadd_ps(mEX, mEXX);
// == {EX[0]+EX[1], EX[2]+EX[3], EXX[0]+EXX[1], EXX[2]+EXX[3]}
// final sum and conversion to double:
// (after the second hadd the two lowest lanes hold the EX and EXX totals)
__m128d sum2 = _mm_cvtps_pd(_mm_hadd_ps(sum1, sum1));
// result vector:
double dEX_EXX[2]; // (I don't know MSVC syntax for stack aligned arrays)
// store register to stack: (should be _mm_store_pd, if the array is aligned)
_mm_storeu_pd(dEX_EXX, sum2);
std::cout << "EX: " << dEX_EXX[0] << "\nEXX: " << dEX_EXX[1] << std::endl;

How to create a loop in ASM?

I am really struggling to get my head around this task which I've been given and I'm going out of my mind right now.
I have been given an example of the following loop:
#include <iostream> // for cout
using namespace std;
//declare an array of 'marks'.
// Number of entries in `marks`; also the divisor used for the average.
const int array_size = 5;
int marks [] = {45,56,99,19,21};
// Adds up all marks and prints their integer average.
int main()
{int index, average, total;
total = 0;
for ( index = 0; index < array_size; index++)
{
total = total + marks [index];
}
// Integer division: any fractional part of the average is discarded.
average = total/array_size;
cout << "\nAverage value = " << average << "\n\n";
return(0);
}
in assembly:
#include <iostream> // for cout
using namespace std;
// Number of entries in `marks` (the asm below hard-codes the same value, 5).
const int array_size = 5;
int marks [] = {45,56,99,19,21}; //declare an array of 'marks'.
// Same program as the C++ version, with the loop and the division written as
// inline assembly: sums `marks` into `total`, then divides by 5 into `average`.
int main()
{int index, average, total;
__asm {
mov total,0 ;total = 0;
; ************************** This part is the FOR loop *************************
mov index,0 ;for ( index = 0; index < array_size; index++)
jmp checkend
forloop1:
mov eax,index ;add 1 to index
add eax,1
mov index,eax
checkend:
cmp index,5 ;check if 5 ('array_size') loops have been done
jge endfor1 ;jump if greater than or equal to (remember?)
mov ecx,index
mov edx,total ; total =
add edx,[ecx*4+marks] ; total + marks [index];
mov total,edx
jmp forloop1
; ******************************************************************************
endfor1:
mov eax,total ;average = total/array_size;
cdq ;convert EAX to quadword (uses EDX register)
mov ecx,5 ;get array_size as divisor
idiv ecx ;divides EDX:EAX pair by ECX, EAX=answer
mov average,eax ;save answer in variable 'average'
} //end of assembly section
cout << "\nAverage value = " << average << "\n\n";
return(0);
}
However, my problem is this: I need to convert the following loop into assembly, what is the best method of doing this based on the example I've been given? I'm really struggling with assigning the temp_char variable to an array variable.
By following loop I mean the encrypt_chars function loop
#define MAXCHARS 6
#define dollarchar '$' // string terminator
char OChars[MAXCHARS],
EChars[MAXCHARS],
DChars[MAXCHARS] = "Soon!"; // Global Original, Encrypted, Decrypted character strings
//----------------------------- C++ Functions ----------------------------------------------------------
// Reads characters from cin into a_character until one is acceptable: either the
// terminator dollarchar or any character in the range '0'..'z'. Every rejected
// character triggers a re-prompt.
void get_char(char& a_character)
{
    for (cin >> a_character;
         (a_character != dollarchar) && !((a_character >= '0') && (a_character <= 'z'));
         cin >> a_character)
    {
        cout << "Alphanumeric characters only, please try again > ";
    }
}
//-------------------------------------------------------------------------------------------------------------
// Fills OChars with up to MAXCHARS validated characters read through get_char and
// reports how many were stored via `length`. Input stops at the terminator
// dollarchar, which is not stored.
void get_original_chars(int& length)
{
    char candidate;
    length = 0;
    // A character is fetched before every loop test, so one more character is
    // read after the MAXCHARS-th one has been stored.
    for (get_char(candidate);
         (length < MAXCHARS) && (candidate != dollarchar);
         get_char(candidate))
    {
        OChars[length] = candidate;
        ++length;
    }
}
// Encrypts the first `length` characters of OChars into EChars, one character at a
// time, by calling the inline-assembly routine encrypt4 with the address of EKey
// and the character. EKey is a by-value parameter and encrypt4 modifies it in
// place through that address, so the key byte changes from one character to the next.
void encrypt_chars(int length, char EKey)
{
char temp_char; // char temporary store
for (int i = 0; i < length; i++) // encrypt characters one at a time
{
temp_char = OChars[i]; // copy the i-th original character into a local the asm block can name
__asm
{
push eax // Save EAX and ECX; both are used for the call below
push ecx
movzx ecx, temp_char // ECX = character, zero-extended to 32 bits
push ecx // Push argument #2 (the character)
lea eax, EKey // EAX = address of EKey
push eax // Push argument #1 (address of the key)
call encrypt4
add esp, 8 // Remove the two 4-byte arguments from the stack
mov temp_char, al // Store the encrypted character returned in AL
pop ecx // Restore the saved registers
pop eax
}
EChars[i] = temp_char; // Store encrypted char in the encrypted chars array
}
// Returning here keeps normal control flow from falling into encrypt4 below;
// that code is only reached through the call instruction above.
return;
// encrypt4 stack arguments: [ebp+8] = 32-bit address of EKey,
// [ebp+12] = the character to be encrypted (in the low 8 bits).
// Output: register EAX = the encrypted value of the source character (in the low 8-bit field, AL).
__asm
{
encrypt4:
push ebp // Save the caller's base pointer
mov ebp, esp // Set up the base pointer
mov eax, [ebp + 8] // EAX = parameter 1 (address of EKey)
mov ecx, [ebp + 12] // ECX = parameter 2 (the character)
push edi // Save EDI; it is used as scratch for the key byte
push ecx // Park the character on the stack
not byte ptr[eax] // Bitwise NOT of the key byte, in place
add byte ptr[eax], 0x04 // Add 4 to the key byte, in place
movzx edi, byte ptr[eax] // EDI = updated key byte, zero-extended
pop eax // EAX = the character parked above
xor eax, edi // XOR the character with the key byte
pop edi // Restore EDI
rol al, 1 // Rotate the result left by 1 bit
rol al, 1 // Rotate left by 1 bit again (2 in total)
add al, 0x04 // Add 4 to the result
mov esp, ebp // Restore the stack pointer
pop ebp // Restore the base pointer
ret
}
//--- End of Assembly code
}
// Console driver: reads the key and the source characters, then prints the original,
// encrypted and decrypted strings, each followed by a hex dump of its characters.
// decrypt_chars is defined outside this snippet.
int main(void)
{
int char_count; // The number of actual characters entered (upto MAXCHARS limit).
char EKey; // Encryption key.
cout << "\nPlease enter your Encryption Key (EKey) letter: "; get_char(EKey);
cout << "\nNow enter upto " << MAXCHARS << " alphanumeric characters:\n";
get_original_chars(char_count);
cout << "\n\nOriginal source string = " << OChars << "\tHex = ";
// hex, setw and setfill are applied per character here; & 0xFF keeps only the low byte.
for (int i = 0; i<char_count; i++) cout << hex << setw(2) << setfill('0') << ((int(OChars[i])) & 0xFF) << " ";
encrypt_chars(char_count, EKey);
cout << "\n\nEncrypted string = " << EChars << "\tHex = ";
for (int i = 0; i<char_count; i++) cout << ((int(EChars[i])) & 0xFF) << " ";
decrypt_chars(char_count, EKey);
cout << "\n\nDecrypted string = " << DChars << "\tHex = ";
for (int i = 0; i<char_count; i++) cout << ((int(DChars[i])) & 0xFF) << " ";
cout << "\n\nPress a key to end...";
while (!_kbhit()); //hold the screen until a key is pressed
return (0);
}
offering a cookie to the best explained (step-by-step) answer/ solution!!
EDIT: Disassembly code generated from the loop etc:
http://pastebin.com/u5MgJ2SW

notice line 32 where it says mov cl,byte ptr [eax+0CC0320h] Unless you
can tell me how to convert that into 'normal asm code'
mov cl,byte ptr [eax+0CC0320h]: eax at this point contains the value of i, 0CC0320h is the address of OChars, i.e. its begining. So, the "normal" asm code is as follows:
// Inline-assembly equivalent of: temp_char = OChars[i];
__asm {
mov eax, i // EAX = loop index i
mov cl, byte ptr OChars[eax] // CL = the byte at OChars + i
mov temp_char, cl // store that byte in temp_char
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

SSE much slower than regular function - c++

here is an another example(Mandelbrot Sets) which is almost same to mine way of implementation of the Julia set algoritm http://pastebin.com/J90paPVC based on http://www.iquilezles.org/www/articles/sse/sse.htm. same story FPU>SSE I even skip some irrelevant operations. any ideas how to do it right?

Related

Compile-time table generation for "ICSIlog"

Why AVX dot product slower than native C++ code

Run-Time Check Failure #2 - Stack around the variable 'result' was corrupted

Incorrect results when using SSE instrinsics in Visual Studio 2010/2012 and Release mode

How to create a loop in ASM?

Categories

Resources