I have a code snippet. It just loads two arrays and calculates the dot product between them using SSE.
Code here:
using namespace std;
long long size = 3200000;
float* _random()
{
unsigned int seed = 123;
// float *t = malloc(size*sizeof(float));
float *t = new float[size];
int i;
float num = 0.0;
for(i=0; i < size; i++) {
num = rand()/(RAND_MAX+1.0);
t[i] = num;
}
return t;
}
float _dotProductVectorSSE(float *s1, float *s2)
{
float prod;
int i;
__m128 X, Y, Z;
for(i=0; i<size; i+=4)
{
X = _mm_load_ps(&s1[i]);
Y = _mm_load_ps(&s2[i]);
X = _mm_mul_ps(X, Y);
Z = _mm_add_ps(X, Z);
}
float *v = new float[4];
_mm_store_ps(v,Z);
for(i=0; i<4; i++)
{
// prod += Z[i];
std::cout << v[i] << endl;
}
return prod;
}
int main(int argc, char *argv[])
{
QCoreApplication a(argc, argv);
time_t start, stop;
double avg_time = 0;
double cur_time;
float* s1 = NULL;
float* s2 = NULL;
for(int i = 0; i < 100; i++)
{
s1 = _random();
s2 = _random();
start = clock();
float sse_product = _dotProductVectorSSE(s1, s2);
stop = clock();
cur_time = ((double) stop-start) / CLOCKS_PER_SEC;
avg_time += cur_time;
}
std::cout << "Averagely used " << avg_time/100 << " seconds." << endl;
return a.exec();
}
When I run it, I get a segmentation fault. Here is the backtrace:
(gdb) bt
0 0x0804965f in _mm_load_ps (__P=0xb6b56008) at /usr/lib/gcc/i586-suse-linux/4.6/include/xmmintrin.h:899
1 _dotProductVectorSSE (s1=0xb6b56008, s2=0xb5f20008) at ../simd/simd.cpp:37
2 0x0804987f in main (argc=1, argv=0xbfffee84) at ../simd/simd.cpp:80
Disassembly:
0x8049b30 push %ebp
0x8049b31 <+0x0001> push %edi
0x8049b32 <+0x0002> push %esi
0x8049b33 <+0x0003> push %ebx
0x8049b34 <+0x0004> sub $0x2c,%esp
0x8049b37 <+0x0007> mov 0x804c0a4,%esi
0x8049b3d <+0x000d> mov 0x40(%esp),%edx
0x8049b41 <+0x0011> mov 0x44(%esp),%ecx
0x8049b45 <+0x0015> mov 0x804c0a0,%ebx
0x8049b4b <+0x001b> cmp $0x0,%esi
0x8049b4e <+0x001e> jl 0x8049b7a <_Z20_dotProductVectorSSEPfS_+74>
0x8049b50 <+0x0020> jle 0x8049c10 <_Z20_dotProductVectorSSEPfS_+224>
0x8049b56 <+0x0026> add $0xffffffff,%ebx
0x8049b59 <+0x0029> adc $0xffffffff,%esi
0x8049b5c <+0x002c> xor %eax,%eax
0x8049b5e <+0x002e> shrd $0x2,%esi,%ebx
0x8049b62 <+0x0032> add $0x1,%ebx
0x8049b65 <+0x0035> shl $0x2,%ebx
0x8049b68 <+0x0038> movaps (%edx,%eax,4),%xmm0   <-- faulting instruction
0x8049b6c <+0x003c> mulps (%ecx,%eax,4),%xmm0
0x8049b70 <+0x0040> add $0x4,%eax
0x8049b73 <+0x0043> cmp %ebx,%eax
0x8049b75 <+0x0045> addps %xmm0,%xmm1
0x8049b78 <+0x0048> jne 0x8049b68 <_Z20_dotProductVectorSSEPfS_+56>
0x8049b7a <+0x004a> movaps %xmm1,0x10(%esp)
0x8049b7f <+0x004f> xor %ebx,%ebx
I am using Qt Creator and have defined this in the .pro file:
QMAKE_CXXFLAGS += -msse -msse2
DEFINES += __SSE__
DEFINES += __SSE2__
DEFINES += __MMX__
Please tell me how to fix this problem!
You are not ensuring that your data is 16 byte aligned (malloc/new are not sufficient in general) - you will either need to use _mm_loadu_ps instead of _mm_load_ps to deal with your potentially misaligned data, or preferably use a suitable method to allocate aligned memory (e.g. posix_memalign on Linux).
Note that you should use _mm_load_ps with 16 byte aligned memory if you possibly can; otherwise use _mm_loadu_ps, but note that this may reduce performance significantly on some (older) CPUs.
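For the aligned-allocation route, here is a minimal sketch of a replacement for _random() on Linux (the name _random_aligned is just for illustration; it uses the global size from the question and the buffer must later be released with free(), not delete[]):
#include <stdlib.h>   // posix_memalign, free, rand, RAND_MAX
float* _random_aligned()
{
    void* p = NULL;
    // _mm_load_ps requires 16-byte alignment
    if (posix_memalign(&p, 16, size * sizeof(float)) != 0)
        return NULL;
    float* t = static_cast<float*>(p);
    for (long long i = 0; i < size; i++)
        t[i] = rand() / (RAND_MAX + 1.0);
    return t;
}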
Try the link below.
http://flyeater.wordpress.com/2010/11/29/memory-allocation-and-data-alignment-custom-mallocfree/
You basically allocate a bit more memory than you need, then compute the first address within that block that is a multiple of 16, and load/store your data starting from that address.
Take care with the pointer arithmetic.
Most of the code here ideone.com/fXKQhR is taken from the above link, sample usage.
I think _mm_malloc may also be helpful to you.
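A minimal sketch of how that could look in the question's main() (assuming _mm_malloc/_mm_free as declared via <xmmintrin.h>, which the intrinsics already need):
#include <xmmintrin.h>
// allocate the input arrays 16-byte aligned so that _mm_load_ps is safe
float* s1 = static_cast<float*>(_mm_malloc(size * sizeof(float), 16));
float* s2 = static_cast<float*>(_mm_malloc(size * sizeof(float), 16));
// ... fill and use them exactly as before ...
_mm_free(s1);
_mm_free(s2);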
I know there is a similar question about this: constexpr performing worse at runtime.
But my case is a lot simpler than that one, and the answers were not enough for me.
I'm just learning about constexpr in C++11 and wrote some code to compare its efficiency, and for some reason, using constexpr makes my code run more than 4 times slower!
By the way, I'm using exactly the same example as on this site: https://www.embarcados.com.br/introducao-ao-cpp11/ (it's in Portuguese, but you can see the example code about constexpr). I have already tried other expressions and the results are similar.
#include <iostream>
#include <ctime>
using namespace std;
constexpr double divideC(double num){
return (2.0 * num + 10.0) / 0.8;
}
#define SIZE 1000
int main(int argc, char const *argv[])
{
// Get number of iterations from user
unsigned long long count;
cin >> count;
double values[SIZE];
// Testing normal expression
clock_t time1 = clock();
for (int i = 0; i < count; i++)
{
values[i%SIZE] = (2.0 * 3.0 + 10.0) / 0.8;
}
time1 = clock() - time1;
cout << "Time1: " << float(time1)/float(CLOCKS_PER_SEC) << " seconds" << endl;
// Testing constexpr
clock_t time2 = clock();
for (int i = 0; i < count; i++)
{
values[i%SIZE] = divideC( 3.0 );
}
time2 = clock() - time2;
cout << "Time2: " << float(time2)/float(CLOCKS_PER_SEC) << " seconds" << endl;
return 0;
}
Input given:
9999999999
Output:
> Time1: 5.768 seconds
> Time2: 27.259 seconds
Can someone tell me the reason for this? Since constexpr calculations should run at compile time, this code is supposed to run faster, not slower.
I'm using msbuild version 16.6.0.22303 to compile the Visual Studio project generated by the following CMake code:
cmake_minimum_required(VERSION 3.1.3)
project(C++11Tests)
add_executable(Cpp11Tests main.cpp)
set_property(TARGET Cpp11Tests PROPERTY CXX_STANDARD_REQUIRED ON)
set_property(TARGET Cpp11Tests PROPERTY CXX_STANDARD 11)
Without optimizations, the compiler will keep the divideC call, so that version is slower.
With optimizations on, any decent compiler knows that - for the given code - everything related to values can be optimized away without any side effects. So the code shown can never give any meaningful measurement of the difference between values[i%SIZE] = (2.0 * 3.0 + 10.0) / 0.8; and values[i%SIZE] = divideC( 3.0 );
With -O1 any decent compiler will create something like this:
for (int i = 0; i < count; i++)
{
values[i%SIZE] = (2.0 * 3.0 + 10.0) / 0.8;
}
results in:
mov rdx, QWORD PTR [rsp+8]
test rdx, rdx
je .L2
mov eax, 0
.L3:
add eax, 1
cmp edx, eax
jne .L3
.L2:
and
for (int i = 0; i < count; i++)
{
values[i%SIZE] = divideC( 3.0 );
}
results in:
mov rdx, QWORD PTR [rsp+8]
test rdx, rdx
je .L4
mov eax, 0
.L5:
add eax, 1
cmp edx, eax
jne .L5
.L4:
So both result in identical machine code, containing only the loop counter and nothing else. As soon as you turn on optimizations, you are only measuring the loop and nothing related to constexpr.
With -O2 even the loop is optimized away, and you would only measure:
clock_t time1 = clock();
time1 = clock() - time1;
cout << "Time1: " << float(time1)/float(CLOCKS_PER_SEC) << " seconds" << endl;
I'm starting with SIMD programming, but I don't know what to do at this point. I'm trying to reduce the runtime, but it's going the other way.
This is my basic code:
https://codepaste.net/a8ut89
void blurr2(double * u, double * r) {
int i;
double dos[2] = { 2.0, 2.0 };
for (i = 0; i < SIZE - 1; i++) {
r[i] = u[i] + u[i + 1];
}
}
blurr2: 0.43s
int contarNegativos(double * u) {
int i;
int contador = 0;
for (i = 0; i < SIZE; i++) {
if (u[i] < 0) {
contador++;
}
}
return contador;
}
negativeCount: 1.38s
void ord(double * v, double * u, double * r) {
int i;
for (i = 0; i < SIZE; i += 2) {
r[i] = *(__int64*)&(v[i]) | *(__int64*)&(u[i]);
}
}
ord: 0.33s
And this is my SIMD code:
https://codepaste.net/fbg1g5
void blurr2(double * u, double * r) {
__m128d rp2;
__m128d rdos;
__m128d rr;
int i;
int sizeAux = SIZE % 2 == 1 ? SIZE : SIZE - 1;
double dos[2] = { 2.0, 2.0 };
rdos = *(__m128d*)dos;
for (i = 0; i < sizeAux; i += 2) {
rp2 = *(__m128d*)&u[i + 1];
rr = _mm_add_pd(*(__m128d*)&u[i], rp2);
*((__m128d*)&r[i]) = _mm_div_pd(rr, rdos);
}
}
blurr2: 0.42s
int contarNegativos(double * u) {
__m128d rcero;
__m128d rr;
int i;
double cero[2] = { 0.0, 0.0 };
int contador = 0;
rcero = *(__m128d*)cero;
for (i = 0; i < SIZE; i += 2) {
rr = _mm_cmplt_pd(*(__m128d*)&u[i], rcero);
if (((__int64 *)&rr)[0]) {
contador++;
};
if (((__int64 *)&rr)[1]) {
contador++;
};
}
return contador;
}
negativeCount: 1.42s
void ord(double * v, double * u, double * r) {
__m128d rr;
int i;
for (i = 0; i < SIZE; i += 2) {
*((__m128d*)&r[i]) = _mm_or_pd(*(__m128d*)&v[i], *(__m128d*)&u[i]);
}
}
ord: 0.35s
Different solutions.
Can you explain what I'm doing wrong? I'm a bit lost...
Use _mm_loadu_pd instead of pointer-casting and dereferencing a __m128d. Your code is guaranteed to segfault on gcc/clang where __m128d is assumed to be aligned.
blurr2: multiply by 0.5 instead of dividing by 2. It will be much faster. (I commented the same thing on a question with the exact same code in the last day or two, was that also you?)
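For example, a sketch of the blurr2 loop using _mm_loadu_pd and a multiply by 0.5 (this mirrors the question's SIMD version, which halves the sum; SIZE is the same constant as in the question, and the name blurr2_sse is just for this sketch - the last pair is handled in scalar code so the vector loads never read past the end of u):
#include <emmintrin.h>
void blurr2_sse(double * u, double * r) {
    const __m128d half = _mm_set1_pd(0.5);
    int i = 0;
    for (; i + 2 < SIZE; i += 2) {
        __m128d a = _mm_loadu_pd(&u[i]);        // u[i],   u[i+1]
        __m128d b = _mm_loadu_pd(&u[i + 1]);    // u[i+1], u[i+2]
        _mm_storeu_pd(&r[i], _mm_mul_pd(_mm_add_pd(a, b), half));
    }
    for (; i < SIZE - 1; i++)                   // scalar tail
        r[i] = (u[i] + u[i + 1]) * 0.5;
}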
negativeCount: _mm_castpd_si128 the compare result to integer, and accumulate it with _mm_sub_epi64. (The bit pattern is all-zero or all-one, i.e. 2's complement 0 / -1).
#include <immintrin.h>
#include <stdint.h>
static const size_t SIZE = 1024;
uint64_t countNegative(double * u) {
__m128i counts = _mm_setzero_si128();
for (size_t i = 0; i < SIZE; i += 2) {
__m128d cmp = _mm_cmplt_pd(_mm_loadu_pd(&u[i]), _mm_setzero_pd());
counts = _mm_sub_epi64(counts, _mm_castpd_si128(cmp));
}
//return counts[0] + counts[1]; // GNU C only, and less efficient
// horizontal sum
__m128i hi64 = _mm_shuffle_epi32(counts, _MM_SHUFFLE(1, 0, 3, 2));
counts = _mm_add_epi64(counts, hi64);
uint64_t scalarcount = _mm_cvtsi128_si64(counts);
return scalarcount;
}
To learn more about efficient vector horizontal sums, see Fastest way to do horizontal float vector sum on x86. But the first rule is to do it outside the loop.
(source + asm on the Godbolt compiler explorer)
From MSVC (which I'm guessing you're using, or you'd get segfaults from *(__m128d*)foo), the inner loop is:
$LL4#countNegat:
movups xmm0, XMMWORD PTR [rcx]
lea rcx, QWORD PTR [rcx+16]
cmpltpd xmm0, xmm2
psubq xmm1, xmm0
sub rax, 1
jne SHORT $LL4#countNegat
It could maybe go faster with unrolling (and maybe two vector accumulators), but this is fairly good and might go close to 1.25 clocks per 16 bytes on Sandybridge/Haswell. (Bottleneck on 5 fused-domain uops).
Your version was actually unpacking to integer inside the inner loop! And if you were using MSVC -Ox, it was actually branching instead of using a branchless compare + conditional add. I'm surprised it wasn't slower than the scalar version.
Also, (int64_t *)&rr violates strict aliasing. char* can alias anything, but it's not safe to cast other pointers onto SIMD vectors and expect it to work. If it does, you got lucky. Compilers usually generate similar code for that or intrinsics, and usually not worse for proper intrinsics.
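If you ever do need per-lane compare results outside the vector registers, a sketch of one way that stays within the aliasing rules (still slower than keeping the counts in a vector as shown above) is _mm_movemask_pd; the function name here is just for illustration:
#include <emmintrin.h>
// Sketch: _mm_movemask_pd collects the sign bit of each lane into an integer,
// so there is no need to cast the vector to an int64_t pointer. Assumes n is even.
int countNegativesMask(const double * u, int n) {
    int contador = 0;
    const __m128d zero = _mm_setzero_pd();
    for (int i = 0; i < n; i += 2) {
        __m128d cmp = _mm_cmplt_pd(_mm_loadu_pd(&u[i]), zero);
        int mask = _mm_movemask_pd(cmp);      // bit k is set if u[i+k] < 0
        contador += (mask & 1) + (mask >> 1);
    }
    return contador;
}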
Do you know that the ord function with SIMD is not 1:1 with the ord function that does not use SIMD instructions?
In the ord function without SIMD, the result of the OR operation is calculated only for the even indexes:
r[0] = v[0] | u[0],
r[2] = v[2] | u[2],
r[4] = v[4] | u[4]
What about the odd indexes? If the OR operation were calculated for all indexes, it would probably take more time than it does now.
I have the following AVX and naive code:
__forceinline double dotProduct_2(const double* u, const double* v)
{
_mm256_zeroupper();
__m256d xy = _mm256_mul_pd(_mm256_load_pd(u), _mm256_load_pd(v));
__m256d temp = _mm256_hadd_pd(xy, xy);
__m128d dotproduct = _mm_add_pd(_mm256_extractf128_pd(temp, 0), _mm256_extractf128_pd(temp, 1));
return dotproduct.m128d_f64[0];
}
__forceinline double dotProduct_1(const D3& a, const D3& b)
{
return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}
And the respective test code:
std::cout << res_1 << " " << res_2 << " " << res_3 << '\n';
{
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < (1 << 30); ++i)
{
zx_1 += dotProduct_1(aVx[i % 10000], aVx[(i + 1) % 10000]);
}
std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
std::cout << "NAIVE : " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << '\n';
}
{
std::chrono::high_resolution_clock::time_point t1 = std::chrono::high_resolution_clock::now();
for (int i = 0; i < (1 << 30); ++i)
{
zx_2 += dotProduct_2(&aVx[i % 10000][0], &aVx[(i + 1) % 10000][0]);
}
std::chrono::high_resolution_clock::time_point t2 = std::chrono::high_resolution_clock::now();
std::cout << "AVX : " << std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count() << '\n';
}
std::cout << math::min2(zx_1, zx_2) << " " << zx_1 << " " << zx_2;
All of the data is aligned to 32 bytes (D3 with __declspec... and the aVx array with _mm_malloc()).
As I can see, the naive variant is as fast as or faster than the AVX variant. I can't understand whether this is normal behaviour, because I thought AVX was 'super fast'. If not, how can I optimize it? I compile with MSVC 2015 (x64), with /arch:AVX. Also, my hardware is an Intel i7-4750HQ (Haswell).
Simple profiling with basic loops isn't a great idea - it usually just means you are memory bandwidth limited, so the tests end up coming out at about the same speed (memory is typically slower than the CPU, and that's basically all you are testing here).
As others have said, your code example isn't great, because you are constantly going across the lanes (which I assume is just to find the fastest dot product, and not specifically because a sum of all the dot products is the desired result?). To be honest, if you really need a fast dot product (for AOS data as presented here), I think I would prefer to replace the VHADDPD with a VADDPD + VPERMILPD (trading an additional instruction for twice the throughput, and a lower latency):
double dotProduct_3(const double* u, const double* v)
{
__m256d dp = _mm256_mul_pd(_mm256_load_pd(u), _mm256_load_pd(v));
__m128d a = _mm256_extractf128_pd(dp, 0);
__m128d b = _mm256_extractf128_pd(dp, 1);
__m128d c = _mm_add_pd(a, b);
__m128d yy = _mm_unpackhi_pd(c, c);
__m128d dotproduct = _mm_add_pd(c, yy);
return _mm_cvtsd_f64(dotproduct);
}
asm:
dotProduct_3(double const*, double const*):
vmovapd ymm0,YMMWORD PTR [rsi]
vmulpd ymm0,ymm0,YMMWORD PTR [rdi]
vextractf128 xmm1,ymm0,0x1
vaddpd xmm0,xmm1,xmm0
vpermilpd xmm1,xmm0,0x3
vaddpd xmm0,xmm1,xmm0
vzeroupper
ret
Generally speaking, if you are using horizontal adds, you're doing it wrong! Whilst a 256bit register may seem ideal for a Vector4d, it's not actually a particularly great representation (especially if you consider that AVX512 is now available!). A very similar question to this came up recently: For C++ Vector3 utility class implementations, is array faster than struct and class?
If you want performance, then structure-of-arrays is the best way to go.
struct HybridVec4SOA
{
__m256d x;
__m256d y;
__m256d z;
__m256d w;
};
__m256d dot(const HybridVec4SOA& a, const HybridVec4SOA& b)
{
return _mm256_fmadd_pd(a.w, b.w,
_mm256_fmadd_pd(a.z, b.z,
_mm256_fmadd_pd(a.y, b.y,
_mm256_mul_pd(a.x, b.x))));
}
asm:
dot(HybridVec4SOA const&, HybridVec4SOA const&):
vmovapd ymm1,YMMWORD PTR [rdi+0x20]
vmovapd ymm2,YMMWORD PTR [rdi+0x40]
vmovapd ymm3,YMMWORD PTR [rdi+0x60]
vmovapd ymm0,YMMWORD PTR [rsi]
vmulpd ymm0,ymm0,YMMWORD PTR [rdi]
vfmadd231pd ymm0,ymm1,YMMWORD PTR [rsi+0x20]
vfmadd231pd ymm0,ymm2,YMMWORD PTR [rsi+0x40]
vfmadd231pd ymm0,ymm3,YMMWORD PTR [rsi+0x60]
ret
If you compare the latencies (and more importantly throughput) of load/mul/fmadd compared to hadd and extract, and then consider that the SOA version is computing 4 dot products at a time (instead of 1), you'll start to understand why it's the way to go...
You add too much overhead with the vzeroupper and hadd instructions. A good way to write it is to do all the multiplies in a loop and aggregate the result just once at the end. Imagine you unroll the original loop 4 times and use 4 accumulators:
for(i=0; i < (1<<30); i+=4) {
s0 += a[i+0] * b[i+0];
s1 += a[i+1] * b[i+1];
s2 += a[i+2] * b[i+2];
s3 += a[i+3] * b[i+3];
}
return s0+s1+s2+s3;
And now just replace the unrolled loop with SIMD mul and add (or even an FMA intrinsic, if available).
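A sketch of what that might look like with AVX and FMA intrinsics (assuming contiguous double data whose length n is a multiple of 4 and a target with FMA support, e.g. /arch:AVX2 on MSVC; using several vector accumulators, as in the scalar unroll above, would hide the FMA latency even better):
#include <immintrin.h>
#include <stddef.h>
// Sketch: multiply-accumulate inside the loop, one horizontal sum at the very end.
double dotProduct_fma(const double* a, const double* b, size_t n) {
    __m256d acc = _mm256_setzero_pd();
    for (size_t i = 0; i < n; i += 4)
        acc = _mm256_fmadd_pd(_mm256_loadu_pd(&a[i]), _mm256_loadu_pd(&b[i]), acc);
    __m128d lo = _mm256_castpd256_pd128(acc);       // lanes 0,1
    __m128d hi = _mm256_extractf128_pd(acc, 1);     // lanes 2,3
    __m128d sum2 = _mm_add_pd(lo, hi);
    __m128d sum1 = _mm_add_sd(sum2, _mm_unpackhi_pd(sum2, sum2));
    return _mm_cvtsd_f64(sum1);
}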
Actual refined question:
Why does this not print 0?
#include "stdafx.h"
#include <iostream>
#include <string>
int _tmain(int argc, _TCHAR* argv[])
{
unsigned char barray[] = {1,2,3,4,5,6,7,8,9};
unsigned long weirdValue = barray[3] << 32;
std::cout << weirdValue; // prints 4
std::string bla;
std::getline(std::cin, bla);
return 0;
}
The disassembly of the shift operation:
10: unsigned long weirdValue = barray[3] << 32;
00411424 movzx eax,byte ptr [ebp-1Dh]
00411428 shl eax,20h
0041142B mov dword ptr [ebp-2Ch],eax
Original question:
I found the following snippet in some old code we maintain. It converts a byte array to multiple float values and adds the floats to a list. Why does it work for byte arrays longer than 4 bytes?
unsigned long ulValue = 0;
for (USHORT usIndex = 0; usIndex < m_oData.usNumberOfBytes; usIndex++)
{
if (usIndex > 0 && (usIndex % 4) == 0)
{
float* pfValue = (float*)&ulValue;
oValues.push_back(*pfValue);
ulValue = 0;
}
ulValue += (m_oData.pabyDataBytes[usIndex] << (8*usIndex)); // Why does this work for usIndex > 3??
}
I would understand this working if << were a rotate operator rather than a shift operator. Or if it were
ulValue += (m_oData.pabyDataBytes[usIndex] << (8*(usIndex%4)))
But the code as I found it just confuses me.
The code is compiled using VS 2005.
If I try the original snippet in the immediate window, it doesn't work, though.
I know how to do this properly; I just want to know why the code, and especially the shift operation, works as it does.
Edit: The disassembly for the shift operation is:
13D61D0A shl ecx,3 // multiply uIndex by 8
13D61D0D shl eax,cl // shift to left, does nothing for multiples of 32
13D61D0F add eax,dword ptr [ulValue]
13D61D15 mov dword ptr [ulValue],eax
So the disassembly is fine.
The shift count is masked to 5 bits, which limits the range to 0-31.
A shift of 32 is therefore the same as a shift of zero. (At the C++ level, shifting a 32-bit value by 32 or more is undefined behaviour; the masking is simply what the x86 shl instruction happens to do.)
http://x86.renejeschke.de/html/file_module_x86_id_285.html
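In C terms, what the 32-bit shl instruction effectively computes is something like this (a sketch of the hardware behaviour only, not something to rely on in C++ source, where such shifts remain undefined):
#include <stdint.h>
// Effect of x86 shl on a 32-bit operand: the count is masked to 5 bits,
// so a count of 32 behaves like a count of 0.
uint32_t shl32(uint32_t value, unsigned count) {
    return value << (count & 31);
}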
I compiled this code and got a "Run-Time Check Failure #2 - Stack around the variable 'result' was corrupted" error. But when I changed the result array size from 2 to 4, the error disappeared. Can you explain why this happens?
Sorry if you find this question too basic.
#include "stdafx.h"
string get_cpu_name()
{
uint32_t data[4] = { 0 };
_asm
{
cpuid;
mov data[0], ebx;
mov data[4], edx;
mov data[8], ecx;
}
return string((const char *)data);
}
void assembler()
{
cout << "CPU is " << get_cpu_name() << endl;
float f1[] = { 1.f , 22.f};
float f2[] = { 5.f , 3.f };
float result[2] = { 0.f };
/*float f1[] = { 1.f , 22.f , 1.f , 22.f };
float f2[] = { 5.f , 3.f , 1.f , 22.f };
float result[4] = { 0.f };*/
_asm
{
movups xmm1, f1;
movups xmm2, f2;
mulps xmm1, xmm2;
movups result, xmm1;
}
/*for (size_t i = 0; i < 4; i++)*/
for (size_t i = 0; i < 2; i++)
{
cout << result[i] << "\t";
}
cout << endl;
}
int main()
{
assembler();
getchar();
return 0;
}
The movups instruction writes 128 bits (16 bytes) to memory. You are writing this to the location of an 8-byte array (2*4 bytes, or 64 bits). The 8 bytes after the array will also be written to.
You should either make sure there are at least 16 bytes of space for the result, or make sure to write fewer than 16 bytes there.
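For example, a sketch using intrinsics instead of inline asm (just to illustrate the two options; the variable names are mine): give the 16-byte store a 4-float destination, or store only the low two floats with _mm_storel_pi:
#include <xmmintrin.h>
#include <iostream>
int main() {
    float f1[4] = { 1.f, 22.f, 0.f, 0.f };
    float f2[4] = { 5.f,  3.f, 0.f, 0.f };
    __m128 prod = _mm_mul_ps(_mm_loadu_ps(f1), _mm_loadu_ps(f2));
    float result4[4];
    _mm_storeu_ps(result4, prod);              // full 16-byte store into a 16-byte buffer
    float result2[2];
    _mm_storel_pi((__m64*)result2, prod);      // writes only the low 8 bytes
    std::cout << result2[0] << "\t" << result2[1] << std::endl;
    return 0;
}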