GCC std::sin vectorization bug? - c++

The following code (compiled with -O3 -ffast-math):
#include <cmath>
float a[4];
void sin1() {
for(unsigned i = 0; i < 4; i++) a[i] = sinf(a[i]);
}
compiles to a call to the vectorized version of sinf (_ZGVbN4v_sinf):
sin1():
sub rsp, 8
movaps xmm0, XMMWORD PTR a[rip]
call _ZGVbN4v_sinf
movaps XMMWORD PTR a[rip], xmm0
add rsp, 8
ret
But when I use the C++ version of sinf (std::sin), no vectorization occurs:
void sin2() {
for(unsigned i = 0; i < 4; i++) a[i] = std::sin(a[i]);
}
sin2():
sub rsp, 8
movss xmm0, DWORD PTR a[rip]
call sinf
movss DWORD PTR a[rip], xmm0
movss xmm0, DWORD PTR a[rip+4]
call sinf
movss DWORD PTR a[rip+4], xmm0
movss xmm0, DWORD PTR a[rip+8]
call sinf
movss DWORD PTR a[rip+8], xmm0
movss xmm0, DWORD PTR a[rip+12]
call sinf
movss DWORD PTR a[rip+12], xmm0
add rsp, 8
ret
Compiler Explorer Code
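A possible workaround, sketched under the assumption (taken from the behavior shown above) that the plain C sinf call is the one GCC replaces with _ZGVbN4v_sinf: route the float case through sinf explicitly instead of std::sin. The veclib namespace and sin3 function are names invented here for illustration, and whether the loop actually vectorizes should be verified in Compiler Explorer for your GCC/glibc version.
#include <cmath>

namespace veclib {
    // float overload resolves to the C function the auto-vectorizer recognizes
    inline float sin(float x) { return sinf(x); }
    // double overload keeps the usual std::sin behavior
    inline double sin(double x) { return std::sin(x); }
}

float b[4];

void sin3() {
    for (unsigned i = 0; i < 4; i++)
        b[i] = veclib::sin(b[i]);   // inlines to sinf(b[i]), same shape as sin1 above
}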

Related

Loop unroll issue with Visual Studio compiler

I have a simple setup where I noticed that the VS compiler doesn't seem smart enough to unroll the loop, while other compilers like Clang or GCC do. Am I missing some optimization flag for VS?
#include <cstddef>
struct A
{
    double data[4];
    double *begin() { return data; }
    double *end() { return data + 4; }
    double const *begin() const { return data; }
    double const *end() const { return data + 4; }
};

double sum_index(A const &a) {
    double ret = 0;
    for(std::size_t i = 0; i < 4; ++i)
    {
        ret += a.data[i];
    }
    return ret;
}

double sum_iter(A const &a) {
    double ret = 0;
    for(auto const &v : a)
    {
        ret += v;
    }
    return ret;
}
I used the https://godbolt.org/ compiler explorer to generate the assembly code.
gcc 11.2 with -O3:
sum_index(A const&):
pxor xmm0, xmm0
addsd xmm0, QWORD PTR [rdi]
addsd xmm0, QWORD PTR [rdi+8]
addsd xmm0, QWORD PTR [rdi+16]
addsd xmm0, QWORD PTR [rdi+24]
ret
sum_iter(A const&):
movsd xmm1, QWORD PTR [rdi]
addsd xmm1, QWORD PTR .LC0[rip]
movsd xmm0, QWORD PTR [rdi+8]
addsd xmm1, xmm0
movupd xmm0, XMMWORD PTR [rdi+16]
addsd xmm1, xmm0
unpckhpd xmm0, xmm0
addsd xmm0, xmm1
ret
.LC0:
.long 0
.long 0
clang 13.0.1 with -O3:
sum_index(A const&): # #sum_index(A const&)
xorpd xmm0, xmm0
addsd xmm0, qword ptr [rdi]
addsd xmm0, qword ptr [rdi + 8]
addsd xmm0, qword ptr [rdi + 16]
addsd xmm0, qword ptr [rdi + 24]
ret
sum_iter(A const&): # #sum_iter(A const&)
xorpd xmm0, xmm0
addsd xmm0, qword ptr [rdi]
addsd xmm0, qword ptr [rdi + 8]
addsd xmm0, qword ptr [rdi + 16]
addsd xmm0, qword ptr [rdi + 24]
ret
MSVC 19.30 with /O2 (there is no /O3?):
this$ = 8
double const * A::begin(void)const PROC ; A::begin, COMDAT
mov rax, rcx
ret 0
double const * A::begin(void)const ENDP ; A::begin
this$ = 8
double const * A::end(void)const PROC ; A::end, COMDAT
lea rax, QWORD PTR [rcx+32]
ret 0
double const * A::end(void)const ENDP ; A::end
a$ = 8
double sum_index(A const &) PROC ; sum_index, COMDAT
movsd xmm0, QWORD PTR [rcx]
xorps xmm1, xmm1
addsd xmm0, xmm1
addsd xmm0, QWORD PTR [rcx+8]
addsd xmm0, QWORD PTR [rcx+16]
addsd xmm0, QWORD PTR [rcx+24]
ret 0
double sum_index(A const &) ENDP ; sum_index
a$ = 8
double sum_iter(A const &) PROC ; sum_iter, COMDAT
lea rax, QWORD PTR [rcx+32]
xorps xmm0, xmm0
cmp rcx, rax
je SHORT $LN12#sum_iter
npad 4
$LL8#sum_iter:
addsd xmm0, QWORD PTR [rcx]
add rcx, 8
cmp rcx, rax
jne SHORT $LL8#sum_iter
$LN12#sum_iter:
ret 0
double sum_iter(A const &) ENDP ; sum_iter
Obviously there is a problem with unrolling the loop for MSVC. Is there some additional optimization flag I have to set?
Thanks for the help!
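As an aside, one way to sidestep the issue (a sketch, not MSVC-specific advice) is to make the unrolling explicit: since the bounds are known at compile time, the sum can be written as a fold over std::index_sequence (C++17), so the four additions are produced by template expansion and there is no runtime loop left for the optimizer to unroll. sum_unrolled and sum_impl are names invented here; the struct mirrors the A from the question with the iterator functions omitted.
#include <cstddef>
#include <utility>

struct A {
    double data[4];   // begin()/end() omitted; not needed for this version
};

template <std::size_t... I>
double sum_impl(A const &a, std::index_sequence<I...>) {
    // left fold: ((data[0] + data[1]) + data[2]) + data[3]
    return (... + a.data[I]);
}

double sum_unrolled(A const &a) {
    return sum_impl(a, std::make_index_sequence<4>{});
}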

Differences in custom and std fetch_add on floats

This is an attempt at implementing fetch_add on floats without C++20.
void fetch_add(volatile float* x, float y)
{
    bool success = false;
    auto xi = (volatile std::int32_t*)x;
    while(!success)
    {
        union {
            std::int32_t sumint;
            float sum;
        };
        auto tmp = __atomic_load_n(xi, __ATOMIC_RELAXED);
        sumint = tmp;
        sum += y;
        success = __atomic_compare_exchange_n(xi, &tmp, sumint, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    }
}
To my great confusion, when I compare the assembly generated by gcc 10.1 with -O2 -std=c++2a for x86-64, the two versions differ.
fetch_add(float volatile*, float):
.L2:
mov eax, DWORD PTR [rdi]
movd xmm1, eax
addss xmm1, xmm0
movd edx, xmm1
lock cmpxchg DWORD PTR [rdi], edx
jne .L2
ret
fetch_add_std(std::atomic<float>&, float):
mov eax, DWORD PTR [rdi]
movaps xmm1, xmm0
movd xmm0, eax
mov DWORD PTR [rsp-4], eax
addss xmm0, xmm1
.L9:
mov eax, DWORD PTR [rsp-4]
movd edx, xmm0
lock cmpxchg DWORD PTR [rdi], edx
je .L6
mov DWORD PTR [rsp-4], eax
movss xmm0, DWORD PTR [rsp-4]
addss xmm0, xmm1
jmp .L9
.L6:
ret
My ability to read assembly is near non-existent, but the custom version looks correct to me. Since the two differ, either the custom version is incorrect or inefficient, or the standard library is somehow rather broken. I don't quite believe the third case, which leads me to ask: is the custom version incorrect or inefficient?
After some comments, a second version that does not reload after the cmpxchg was written. The two versions still differ.
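For reference, here is a sketch of what fetch_add_std presumably looks like (its source is not shown in the question, so this is an assumption): a plain CAS loop on std::atomic<float>, which needs no pointer casts or union type punning.
#include <atomic>

void fetch_add_std(std::atomic<float>& x, float y)
{
    float expected = x.load(std::memory_order_relaxed);
    // On failure, compare_exchange_weak reloads the current value into 'expected',
    // so each retry recomputes expected + y with fresh data.
    while (!x.compare_exchange_weak(expected, expected + y,
                                    std::memory_order_relaxed,
                                    std::memory_order_relaxed))
    {
        // retry
    }
}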

Bug in VC++ 14.0 (2015) compiler?

I've been running into some issues that occur only in Release x86 builds, not in Release x64 builds or any Debug configuration. I managed to reproduce the bug using the following code:
#include <stdio.h>
#include <iostream>
using namespace std;
struct WMatrix {
    float _11, _12, _13, _14;
    float _21, _22, _23, _24;
    float _31, _32, _33, _34;
    float _41, _42, _43, _44;

    WMatrix(float f11, float f12, float f13, float f14,
            float f21, float f22, float f23, float f24,
            float f31, float f32, float f33, float f34,
            float f41, float f42, float f43, float f44) :
        _11(f11), _12(f12), _13(f13), _14(f14),
        _21(f21), _22(f22), _23(f23), _24(f24),
        _31(f31), _32(f32), _33(f33), _34(f34),
        _41(f41), _42(f42), _43(f43), _44(f44) {
    }
};

void printmtx(WMatrix m1) {
    char str[256];
    sprintf_s(str, 256, "%.3f, %.3f, %.3f, %.3f", m1._11, m1._12, m1._13, m1._14);
    cout << str << "\n";
    sprintf_s(str, 256, "%.3f, %.3f, %.3f, %.3f", m1._21, m1._22, m1._23, m1._24);
    cout << str << "\n";
    sprintf_s(str, 256, "%.3f, %.3f, %.3f, %.3f", m1._31, m1._32, m1._33, m1._34);
    cout << str << "\n";
    sprintf_s(str, 256, "%.3f, %.3f, %.3f, %.3f", m1._41, m1._42, m1._43, m1._44);
    cout << str << "\n";
}

WMatrix mul1(WMatrix m, float f) {
    WMatrix out = m;
    for (unsigned int i = 0; i < 4; i++) {
        for (unsigned int j = 0; j < 4; j++) {
            unsigned int idx = i * 4 + j; // critical code
            *(&out._11 + idx) *= f;       // critical code
        }
    }
    return out;
}

WMatrix mul2(WMatrix m, float f) {
    WMatrix out = m;
    unsigned int idx2 = 0;
    for (unsigned int i = 0; i < 4; i++) {
        for (unsigned int j = 0; j < 4; j++) {
            unsigned int idx = i * 4 + j; // critical code
            bool b = idx == idx2;         // critical code
            *(&out._11 + idx) *= f;       // critical code
            idx2++;
        }
    }
    return out;
}

int main() {
    WMatrix m1(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    WMatrix m2 = mul1(m1, 0.5f);
    WMatrix m3 = mul2(m1, 0.5f);
    printmtx(m1);
    cout << "\n";
    printmtx(m2);
    cout << "\n";
    printmtx(m3);
    int x;
    cin >> x;
}
In the above code, mul2 works, but mul1 does not. Both mul1 and mul2 simply try to iterate over the floats in the WMatrix and multiply them by f, but the way mul1 indexes (i*4+j) somehow produces incorrect results. All mul2 does differently is compare the index against a second counter before using it, and then it works (there are many other ways of tinkering with the index to make it work). Notice that if you remove the line "bool b = idx == idx2;", then mul2 also breaks...
Here is the output:
1.000, 2.000, 3.000, 4.000
5.000, 6.000, 7.000, 8.000
9.000, 10.000, 11.000, 12.000
13.000, 14.000, 15.000, 16.000
0.500, 0.500, 0.375, 0.250
0.625, 1.500, 3.500, 8.000
9.000, 10.000, 11.000, 12.000
13.000, 14.000, 15.000, 16.000
0.500, 1.000, 1.500, 2.000
2.500, 3.000, 3.500, 4.000
4.500, 5.000, 5.500, 6.000
6.500, 7.000, 7.500, 8.000
Correct output should be...
1.000, 2.000, 3.000, 4.000
5.000, 6.000, 7.000, 8.000
9.000, 10.000, 11.000, 12.000
13.000, 14.000, 15.000, 16.000
0.500, 1.000, 1.500, 2.000
2.500, 3.000, 3.500, 4.000
4.500, 5.000, 5.500, 6.000
6.500, 7.000, 7.500, 8.000
0.500, 1.000, 1.500, 2.000
2.500, 3.000, 3.500, 4.000
4.500, 5.000, 5.500, 6.000
6.500, 7.000, 7.500, 8.000
Am I missing something? Or is it actually a bug in the compiler?
This afflicts only the 32-bit compiler; x86-64 builds are not affected, regardless of optimization settings. However, you see the problem manifest in 32-bit builds whether optimizing for speed (/O2) or size (/O1). As you mentioned, it works as expected in debugging builds with optimization disabled.
Wimmel's suggestion of changing the packing, accurate though it is, does not change the behavior. (The code below assumes the packing is correctly set to 1 for WMatrix.)
I can't reproduce it in VS 2010, but I can in VS 2013 and 2015. I don't have 2012 installed. That's good enough, though, to allow us to analyze the difference between the object code produced by the two compilers.
Here is the code for mul1 from VS 2010 (the "working" code):
(Actually, in many cases, the compiler inlined the code from this function at the call site. But the compiler will still output disassembly files containing the code it generated for the individual functions prior to inlining. That's what we're looking at here, because it is less cluttered. The behavior of the code is entirely equivalent whether it's been inlined or not.)
PUBLIC mul1
_TEXT SEGMENT
_m$ = 8 ; size = 64
_f$ = 72 ; size = 4
mul1 PROC
___$ReturnUdt$ = eax
push esi
push edi
; WMatrix out = m;
mov ecx, 16 ; 00000010H
lea esi, DWORD PTR _m$[esp+4]
mov edi, eax
rep movsd
; for (unsigned int i = 0; i < 4; i++)
; {
; for (unsigned int j = 0; j < 4; j++)
; {
; unsigned int idx = i * 4 + j; // critical code
; *(&out._11 + idx) *= f; // critical code
movss xmm0, DWORD PTR [eax]
cvtps2pd xmm1, xmm0
movss xmm0, DWORD PTR _f$[esp+4]
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax], xmm1
movss xmm1, DWORD PTR [eax+4]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+4], xmm1
movss xmm1, DWORD PTR [eax+8]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+8], xmm1
movss xmm1, DWORD PTR [eax+12]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+12], xmm1
movss xmm2, DWORD PTR [eax+16]
cvtps2pd xmm2, xmm2
cvtps2pd xmm1, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+16], xmm1
movss xmm1, DWORD PTR [eax+20]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+20], xmm1
movss xmm1, DWORD PTR [eax+24]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+24], xmm1
movss xmm1, DWORD PTR [eax+28]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+28], xmm1
movss xmm1, DWORD PTR [eax+32]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+32], xmm1
movss xmm1, DWORD PTR [eax+36]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+36], xmm1
movss xmm2, DWORD PTR [eax+40]
cvtps2pd xmm2, xmm2
cvtps2pd xmm1, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+40], xmm1
movss xmm1, DWORD PTR [eax+44]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+44], xmm1
movss xmm2, DWORD PTR [eax+48]
cvtps2pd xmm1, xmm0
cvtps2pd xmm2, xmm2
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+48], xmm1
movss xmm1, DWORD PTR [eax+52]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
movss DWORD PTR [eax+52], xmm1
movss xmm1, DWORD PTR [eax+56]
cvtps2pd xmm1, xmm1
cvtps2pd xmm2, xmm0
mulsd xmm1, xmm2
cvtpd2ps xmm1, xmm1
cvtps2pd xmm0, xmm0
movss DWORD PTR [eax+56], xmm1
movss xmm1, DWORD PTR [eax+60]
cvtps2pd xmm1, xmm1
mulsd xmm1, xmm0
pop edi
cvtpd2ps xmm0, xmm1
movss DWORD PTR [eax+60], xmm0
pop esi
; return out;
ret 0
mul1 ENDP
Compare that to the code for mul1 generated by VS 2015:
mul1 PROC
_m$ = 8 ; size = 64
; ___$ReturnUdt$ = ecx
; _f$ = xmm2s
; WMatrix out = m;
movups xmm0, XMMWORD PTR _m$[esp-4]
; for (unsigned int i = 0; i < 4; i++)
xor eax, eax
movaps xmm1, xmm2
movups XMMWORD PTR [ecx], xmm0
movups xmm0, XMMWORD PTR _m$[esp+12]
shufps xmm1, xmm1, 0
movups XMMWORD PTR [ecx+16], xmm0
movups xmm0, XMMWORD PTR _m$[esp+28]
movups XMMWORD PTR [ecx+32], xmm0
movups xmm0, XMMWORD PTR _m$[esp+44]
movups XMMWORD PTR [ecx+48], xmm0
npad 4
$LL4#mul1:
; for (unsigned int j = 0; j < 4; j++)
; {
; unsigned int idx = i * 4 + j; // critical code
; *(&out._11 + idx) *= f; // critical code
movups xmm0, XMMWORD PTR [ecx+eax*4]
mulps xmm0, xmm1
movups XMMWORD PTR [ecx+eax*4], xmm0
inc eax
cmp eax, 4
jb SHORT $LL4#mul1
; return out;
mov eax, ecx
ret 0
?mul1##YA?AUWMatrix##U1#M#Z ENDP ; mul1
_TEXT ENDS
It is immediately obvious how much shorter the code is. Apparently the optimizer got a lot smarter between VS 2010 and VS 2015. Unfortunately, sometimes the source of the optimizer's "smarts" is the exploitation of bugs in your code.
Looking at the code that matches up with the loops, you can see that VS 2010 is unrolling the loops. All of the computations are done inline so that there are no branches. This is kind of what you'd expect for loops with upper and lower bounds that are known at compile time and, as in this case, reasonably small.
What happened in VS 2015? Well, it didn't unroll anything. There are 5 lines of code, and then a conditional jump JB back to the top of the loop sequence. That alone doesn't tell you much. What does look highly suspicious is that it only loops 4 times (see the cmp eax, 4 statement that sets flags right before doing the jb, effectively continuing the loop as long as the counter is less than 4). Well, that might be okay if it had merged the two loops into one. Let's see what it's doing inside of the loop:
$LL4#mul1:
movups xmm0, XMMWORD PTR [ecx+eax*4] ; load a packed unaligned value into XMM0
mulps xmm0, xmm1 ; do a packed multiplication of XMM0 by XMM1,
; storing the result in XMM0
movups XMMWORD PTR [ecx+eax*4], xmm0 ; store the result of the previous multiplication
; back into the memory location that we
; initially loaded from
inc eax ; one iteration done, increment loop counter
cmp eax, 4 ; see how many loops we've done
jb $LL4#mul1 ; keep looping if < 4 iterations
The code reads a value from memory (an XMM-sized value from the location determined by ecx + eax * 4) into XMM0, multiplies it by a value in XMM1 (which was set outside the loop, based on the f parameter), and then stores the result back into the original memory location.
Compare that to the code for the corresponding loop in mul2:
$LL4#mul2:
lea eax, DWORD PTR [eax+16]
movups xmm0, XMMWORD PTR [eax-24]
mulps xmm0, xmm2
movups XMMWORD PTR [eax-24], xmm0
sub ecx, 1
jne $LL4#mul2
Aside from a different loop control sequence (this sets ECX to 4 outside of the loop, subtracts 1 each time through, and keeps looping as long as ECX != 0), the big difference here is the actual XMM values that it manipulates in memory. Instead of loading from [ecx+eax*4], it loads from [eax-24] (after having previously added 16 to EAX).
What's different about mul2? You had added code to track a separate index in idx2, incrementing it each time through the loop. Now, this alone would not be enough. If you comment out the assignment to the bool variable b, mul1 and mul2 result in identical object code. Clearly without the comparison of idx to idx2, the compiler is able to deduce that idx2 is completely unused, and therefore eliminate it, turning mul2 into mul1. But with that comparison, the compiler apparently becomes unable to eliminate idx2, and its presence ever so slightly changes what optimizations are deemed possible for the function, resulting in the output discrepancy.
Now the question turns to why is this happening. Is it an optimizer bug, as you first suspected? Well, no—and as some of the commenters have mentioned, it should never be your first instinct to blame the compiler/optimizer. Always assume that there are bugs in your code unless you can prove otherwise. That proof would always involve looking at the disassembly, and preferably referencing the relevant portions of the language standard if you really want to be taken seriously.
In this case, Mysticial has already nailed the problem. Your code exhibits undefined behavior when it does *(&out._11 + idx). This makes certain assumptions about the layout of the WMatrix struct in memory, which you cannot legally make, even after explicitly setting the packing.
This is why undefined behavior is evil—it results in code that seems to work sometimes, but other times it doesn't. It is very sensitive to compiler flags, especially optimizations, but also target platforms (as we saw at the top of this answer). mul2 only works by accident. Both mul1 and mul2 are wrong. Unfortunately, the bug is in your code. Worse, the compiler didn't issue a warning that might have alerted you to your use of undefined behavior.
If we look at the generated code, the problem is fairly clear. Ignoring a few bits and pieces that aren't related to the problem at hand, mul1 produces code like this:
movss xmm1, DWORD PTR _f$[esp-4] ; load xmm1 from _11 of source
; ...
shufps xmm1, xmm1, 0 ; duplicate _11 across floats of xmm1
; ...
for ecx = 0 to 3 {
movups xmm0, XMMWORD PTR [dest+ecx*4] ; load 4 floats from dest
mulps xmm0, xmm1 ; multiply each by _11
movups XMMWORD PTR [dest+ecx*4], xmm0 ; store result back to dest
}
So, instead of multiplying each element of one matrix by the corresponding element of the other matrix, it's multiplying each element of one matrix by _11 of the other matrix.
Although it's impossible to confirm exactly how it happened (without looking through the compiler's source code), this certainly fits with Mysticial's guess about how the problem arose.
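For completeness, a sketch of one way to avoid the undefined behavior described above: store the 16 floats in an actual array member, so that indexing with i * 4 + j is ordinary, well-defined array arithmetic. WMatrixA and mul1a are illustrative names, not code from the question.
struct WMatrixA {
    float m[16];   // m[i * 4 + j] holds element (i, j)
};

WMatrixA mul1a(WMatrixA mat, float f) {
    WMatrixA out = mat;
    for (unsigned int i = 0; i < 4; i++)
        for (unsigned int j = 0; j < 4; j++)
            out.m[i * 4 + j] *= f;   // plain array indexing, no pointer tricks
    return out;
}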

Optimizing assembly generated by Microsoft Visual Studio Compiler

I'm working on a project with matrix multiplication. I was able to write the C code and generate the assembly for it using the Microsoft Visual Studio 2012 compiler. The compiler-generated code is shown below. The compiler used the SSE registers, which is exactly what I wanted, but it is not the best code. I want to optimize this code and write it inline with the C code, but I don't understand the assembly code. Basically, the generated assembly is hard-coded for one matrix size; the code below only works for a 4x4 matrix. How can I make it work for an n*n matrix?
The C++ code is shown below:
#define MAX_NUM 10
#define MAX_DIM 4
int main () {
    float mat_a [] = {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0};
    float mat_b [] = {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0};
    float result [] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
    int num_row = 4;
    int num_col = 4;
    float sum;
    for (int i = 0; i < num_row; i++) {
        for (int j = 0; j < num_col; j++) {
            sum = 0.0;
            for (int k = 0; k < num_row; k++) {
                sum = sum + mat_a[i * num_col + k] * mat_b[k * num_col + j];
            }
            *(result + i * num_col + j) = sum;
        }
    }
    return 0;
}
The assembly code is shown below:
; Listing generated by Microsoft (R) Optimizing Compiler Version 17.00.50727.1
TITLE C:\Users\GS\Documents\Visual Studio 2012\Projects\Assembly_InLine\Assembly_InLine\Source.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB MSVCRTD
INCLUDELIB OLDNAMES
PUBLIC _main
PUBLIC __real#00000000
PUBLIC __real#3f800000
PUBLIC __real#40000000
PUBLIC __real#40400000
PUBLIC __real#40800000
EXTRN #_RTC_CheckStackVars#8:PROC
EXTRN #__security_check_cookie#4:PROC
EXTRN __RTC_InitBase:PROC
EXTRN __RTC_Shutdown:PROC
EXTRN ___security_cookie:DWORD
EXTRN __fltused:DWORD
; COMDAT __real#40800000
CONST SEGMENT
__real#40800000 DD 040800000r ; 4
CONST ENDS
; COMDAT __real#40400000
CONST SEGMENT
__real#40400000 DD 040400000r ; 3
CONST ENDS
; COMDAT __real#40000000
CONST SEGMENT
__real#40000000 DD 040000000r ; 2
CONST ENDS
; COMDAT __real#3f800000
CONST SEGMENT
__real#3f800000 DD 03f800000r ; 1
CONST ENDS
; COMDAT __real#00000000
CONST SEGMENT
__real#00000000 DD 000000000r ; 0
CONST ENDS
; COMDAT rtc$TMZ
rtc$TMZ SEGMENT
__RTC_Shutdown.rtc$TMZ DD FLAT:__RTC_Shutdown
rtc$TMZ ENDS
; COMDAT rtc$IMZ
rtc$IMZ SEGMENT
__RTC_InitBase.rtc$IMZ DD FLAT:__RTC_InitBase
rtc$IMZ ENDS
; Function compile flags: /Odtp /RTCsu /ZI
; COMDAT _main
_TEXT SEGMENT
_k$1 = -288 ; size = 4
_j$2 = -276 ; size = 4
_i$3 = -264 ; size = 4
_sum$ = -252 ; size = 4
_num_col$ = -240 ; size = 4
_num_row$ = -228 ; size = 4
_result$ = -216 ; size = 64
_mat_b$ = -144 ; size = 64
_mat_a$ = -72 ; size = 64
__$ArrayPad$ = -4 ; size = 4
_main PROC ; COMDAT
; File c:\users\gs\documents\visual studio 2012\projects\assembly_inline\assembly_inline\source.cpp
; Line 4
push ebp
mov ebp, esp
sub esp, 484 ; 000001e4H
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-484]
mov ecx, 121 ; 00000079H
mov eax, -858993460 ; ccccccccH
rep stosd
mov eax, DWORD PTR ___security_cookie
xor eax, ebp
mov DWORD PTR __$ArrayPad$[ebp], eax
; Line 5
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+4], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+8], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+12], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp+16], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+20], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+24], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+28], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp+32], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+36], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+40], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+44], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp+48], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+52], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+56], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+60], xmm0
; Line 6
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+4], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+8], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+12], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp+16], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+20], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+24], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+28], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp+32], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+36], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+40], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+44], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp+48], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+52], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+56], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+60], xmm0
; Line 7
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+4], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+8], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+12], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+16], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+20], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+24], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+28], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+32], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+36], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+40], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+44], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+48], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+52], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+56], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+60], xmm0
; Line 9
mov DWORD PTR _num_row$[ebp], 4
; Line 10
mov DWORD PTR _num_col$[ebp], 4
; Line 14
mov DWORD PTR _i$3[ebp], 0
jmp SHORT $LN9#main
$LN8#main:
mov eax, DWORD PTR _i$3[ebp]
add eax, 1
mov DWORD PTR _i$3[ebp], eax
$LN9#main:
mov eax, DWORD PTR _i$3[ebp]
cmp eax, DWORD PTR _num_row$[ebp]
jge $LN7#main
; Line 15
mov DWORD PTR _j$2[ebp], 0
jmp SHORT $LN6#main
$LN5#main:
mov eax, DWORD PTR _j$2[ebp]
add eax, 1
mov DWORD PTR _j$2[ebp], eax
$LN6#main:
mov eax, DWORD PTR _j$2[ebp]
cmp eax, DWORD PTR _num_col$[ebp]
jge $LN4#main
; Line 16
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _sum$[ebp], xmm0
; Line 17
mov DWORD PTR _k$1[ebp], 0
jmp SHORT $LN3#main
$LN2#main:
mov eax, DWORD PTR _k$1[ebp]
add eax, 1
mov DWORD PTR _k$1[ebp], eax
$LN3#main:
mov eax, DWORD PTR _k$1[ebp]
cmp eax, DWORD PTR _num_row$[ebp]
jge SHORT $LN1#main
; Line 18
mov eax, DWORD PTR _i$3[ebp]
imul eax, DWORD PTR _num_col$[ebp]
add eax, DWORD PTR _k$1[ebp]
mov ecx, DWORD PTR _k$1[ebp]
imul ecx, DWORD PTR _num_col$[ebp]
add ecx, DWORD PTR _j$2[ebp]
movss xmm0, DWORD PTR _mat_a$[ebp+eax*4]
mulss xmm0, DWORD PTR _mat_b$[ebp+ecx*4]
addss xmm0, DWORD PTR _sum$[ebp]
movss DWORD PTR _sum$[ebp], xmm0
; Line 19
jmp SHORT $LN2#main
$LN1#main:
; Line 20
mov eax, DWORD PTR _i$3[ebp]
imul eax, DWORD PTR _num_col$[ebp]
lea ecx, DWORD PTR _result$[ebp+eax*4]
mov edx, DWORD PTR _j$2[ebp]
movss xmm0, DWORD PTR _sum$[ebp]
movss DWORD PTR [ecx+edx*4], xmm0
; Line 21
jmp $LN5#main
$LN4#main:
; Line 22
jmp $LN8#main
$LN7#main:
; Line 24
xor eax, eax
; Line 25
push edx
mov ecx, ebp
push eax
lea edx, DWORD PTR $LN16#main
call #_RTC_CheckStackVars#8
pop eax
pop edx
pop edi
pop esi
pop ebx
mov ecx, DWORD PTR __$ArrayPad$[ebp]
xor ecx, ebp
call #__security_check_cookie#4
mov esp, ebp
pop ebp
ret 0
npad 1
$LN16#main:
DD 3
DD $LN15#main
$LN15#main:
DD -72 ; ffffffb8H
DD 64 ; 00000040H
DD $LN12#main
DD -144 ; ffffff70H
DD 64 ; 00000040H
DD $LN13#main
DD -216 ; ffffff28H
DD 64 ; 00000040H
DD $LN14#main
$LN14#main:
DB 114 ; 00000072H
DB 101 ; 00000065H
DB 115 ; 00000073H
DB 117 ; 00000075H
DB 108 ; 0000006cH
DB 116 ; 00000074H
DB 0
$LN13#main:
DB 109 ; 0000006dH
DB 97 ; 00000061H
DB 116 ; 00000074H
DB 95 ; 0000005fH
DB 98 ; 00000062H
DB 0
$LN12#main:
DB 109 ; 0000006dH
DB 97 ; 00000061H
DB 116 ; 00000074H
DB 95 ; 0000005fH
DB 97 ; 00000061H
DB 0
_main ENDP
_TEXT ENDS
END
Visual Studio and SSE are a red herring here (as is the C++ vs. C nonsense). Assuming you compile in Release mode, there are other reasons your code is inefficient, especially for large matrices. The main reason is that it's cache unfriendly. To make your code efficient for an arbitrary n*n matrix you need to optimize for both big and small sizes.
It's important to optimize for the cache BEFORE employing SIMD or threads. In the code below I use block multiplication to speed up your code for a 1024x1024 matrix by more than a factor of ten (7.1 s with the old code and 0.6 s with the new) using only a single thread, without using SSE/AVX. It's not going to do any good to use SIMD if your code is memory bound.
I have already described a first order improvement to matrix multiplication using the transpose here.
OpenMP C++ Matrix Multiplication run slower in parallel
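The linked answer is not reproduced here, but the idea can be sketched as follows: transpose b once so that the innermost k loop reads both operands contiguously in row-major order. gemm_transpose is an illustrative name and this is not the code from that answer, just the shape of the technique.
#include <stdlib.h>

void gemm_transpose(float *a, float *b, float *c, int n) {
    float *bt = (float*)malloc(sizeof(float)*n*n);
    for(int i=0; i<n; i++)            // bt = transpose(b)
        for(int j=0; j<n; j++)
            bt[j*n+i] = b[i*n+j];
    for(int i=0; i<n; i++) {
        for(int j=0; j<n; j++) {
            float sum = 0.0f;
            for(int k=0; k<n; k++)
                sum += a[i*n+k]*bt[j*n+k];   // both streams are now sequential
            c[i*n+j] = sum;
        }
    }
    free(bt);
}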
But let me describe an even more cache friendly method. Let's assume your hardware has two types of memory:
small and fast,
large and slow.
In reality, modern CPUs have several levels of this (L1 small and fast, L2 larger and slower, L3 even larger and slower, and main memory larger and slower still; some CPUs even have an L4), but this simple model with only two levels will still lead to a big improvement in performance.
Using this model with two types of memory you can show that you will get the best performance by dividing your matrix into square tiles which fit in the small and fast memory and doing block matrix multiplication. Next you want to rearrange the memory so that the elements of each tile are contiguous.
Below is some code showing how to do this. I used a block size of 64x64 on a 1024x1024 matrix. It took 7 s with your code and 0.65 s with mine. The matrix size has to be a multiple of the 64x64 block size, but it's easy to extend this to an arbitrary-size matrix. If you want to see an example of how to optimize the blocks, see Difference in performance between MSVC and GCC for highly optimized matrix multiplication code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
void reorder(float *a, float *b, int n, int bs) {
    int nb = n/bs;
    int cnt = 0;
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int i2=0; i2<bs; i2++) {
                for(int j2=0; j2<bs; j2++) {
                    b[cnt++] = a[bs*(i*n+j) + i2*n + j2];
                }
            }
        }
    }
}

void gemm_slow(float *a, float *b, float *c, int n) {
    for(int i=0; i<n; i++) {
        for(int j=0; j<n; j++) {
            float sum = c[i*n+j];
            for(int k=0; k<n; k++) {
                sum += a[i*n+k]*b[k*n+j];
            }
            c[i*n+j] += sum;
        }
    }
}

void gemm_block(float *a, float *b, float *c, int n, int n2) {
    for(int i=0; i<n2; i++) {
        for(int j=0; j<n2; j++) {
            float sum = c[i*n+j];
            for(int k=0; k<n2; k++) {
                sum += a[i*n+k]*b[k*n2+j];
            }
            c[i*n+j] = sum;
        }
    }
}

void gemm(float *a, float*b, float*c, int n, int bs) {
    int nb = n/bs;
    float *b2 = (float*)malloc(sizeof(float)*n*n);
    reorder(b,b2,n,bs);
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int k=0; k<nb; k++) {
                gemm_block(&a[bs*(i*n+k)],&b2[bs*bs*(k*nb+j)],&c[bs*(i*n+j)], n, bs);
            }
        }
    }
    free(b2);
}

int main() {
    const int bs = 64;
    const int n = 1024;
    float *a = new float[n*n];
    float *b = new float[n*n];
    float *c1 = new float[n*n]();
    float *c2 = new float[n*n]();
    for(int i=0; i<n*n; i++) {
        a[i] = 1.0*rand()/RAND_MAX;
        b[i] = 1.0*rand()/RAND_MAX;
    }

    double dtime;
    dtime = omp_get_wtime();
    gemm_slow(a,b,c1,n);
    dtime = omp_get_wtime() - dtime;
    printf("%f\n", dtime);

    dtime = omp_get_wtime();
    gemm(a,b,c2,n,64);
    dtime = omp_get_wtime() - dtime;
    printf("%f\n", dtime);

    printf("%d\n", memcmp(c1,c2, sizeof(float)*n*n));
}

SSE2 - 16-byte aligned dynamic allocation of memory

EDIT:
This is a follow-up to SSE2 Compiler Error.
This is the real bug I experienced before; I have reproduced it below by changing the _mm_malloc statement as Michael Burr suggested:
Unhandled exception at 0x00415116 in SO.exe: 0xC0000005: Access violation reading
location 0xffffffff.
At line label: movdqa xmm0, xmmword ptr [t1+eax]
I'm trying to dynamically allocate t1 and t2 and according to this tutorial, I've used _mm_malloc:
#include <emmintrin.h>
int main(int argc, char* argv[])
{
    int *t1, *t2;
    const int n = 100000;
    t1 = (int*)_mm_malloc(n*sizeof(int),16);
    t2 = (int*)_mm_malloc(n*sizeof(int),16);
    __m128i mul1, mul2;

    for (int j = 0; j < n; j++)
    {
        t1[j] = j;
        t2[j] = (j+1);
    } // set temporary variables to random values

    _asm
    {
        mov eax, 0
        label: movdqa xmm0, xmmword ptr [t1+eax]
        movdqa xmm1, xmmword ptr [t2+eax]
        pmuludq xmm0, xmm1
        movdqa mul1, xmm0
        movdqa xmm0, xmmword ptr [t1+eax]
        pshufd xmm0, xmm0, 05fh
        pshufd xmm1, xmm1, 05fh
        pmuludq xmm0, xmm1
        movdqa mul2, xmm0
        add eax, 16
        cmp eax, 100000
        jnge label
    }

    _mm_free(t1);
    _mm_free(t2);
    return 0;
}
I think the 2nd problem is that you're reading at an offset from the pointer variable (not an offset from what the pointer points to).
Change:
label: movdqa xmm0, xmmword ptr [t1+eax]
To something like:
mov ebx, [t1]
label: movdqa xmm0, xmmword ptr [ebx+eax]
And similarly for your accesses through the t2 pointer.
This might be even better (though I haven't had an opportunity to test it, so it might not even work):
_asm
{
mov eax, [t1]
mov ebx, [t2]
lea ecx, [eax + (100000*4)]
label: movdqa xmm0, xmmword ptr [eax]
movdqa xmm1, xmmword ptr [ebx]
pmuludq xmm0, xmm1
movdqa mul1, xmm0
movdqa xmm0, xmmword ptr [eax]
pshufd xmm0, xmm0, 05fh
pshufd xmm1, xmm1, 05fh
pmuludq xmm0, xmm1
movdqa mul2, xmm0
add eax, 16
add ebx, 16
cmp eax, ecx
jnge label
}
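As an aside, the same loop can be written with SSE2 intrinsics instead of inline assembly, which avoids the [t1+eax] addressing pitfall entirely because the compiler handles the pointer arithmetic. This is only a sketch mirroring the pmuludq/pshufd pattern above (multiply is an invented name, and as in the original snippet the products are only kept in locals):
#include <emmintrin.h>

void multiply(const int* t1, const int* t2, int n)
{
    __m128i mul1, mul2;                                          // mirrors the locals above
    for (int j = 0; j + 4 <= n; j += 4)
    {
        __m128i v1 = _mm_load_si128((const __m128i*)(t1 + j));  // 16-byte aligned loads
        __m128i v2 = _mm_load_si128((const __m128i*)(t2 + j));
        mul1 = _mm_mul_epu32(v1, v2);                            // products of elements 0 and 2
        mul2 = _mm_mul_epu32(_mm_shuffle_epi32(v1, 0x5f),
                             _mm_shuffle_epi32(v2, 0x5f));       // products of elements 1 and 3
    }
    (void)mul1; (void)mul2;
}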
You're not allocating enough memory (this refers to the allocation in the original version of the question, before the edit above):
t1 = (int*)_mm_malloc(n,16);
t2 = (int*)_mm_malloc(n,16);
Perhaps:
t1 = (int*)_mm_malloc(n*sizeof(int),16);