Microsoft VS 2015 compiler bug? - c++

I have the following function:
void MainLayer::onMouseScroll(cocos2d::EventMouse* event)
{
    const float scrollAmount = event->getScrollY();
    const float newMapScale = scrollAmount < 0.f ? std::min(_mapScale * k_deltaMapScaleMag, k_maxMapScale) : std::max(_mapScale * k_deltaMapScaleMin, k_minMapScale);
    const float invNewMapScale = 1.f / newMapScale;
    const Vec2 anchorScreenSpace = event->getLocationInView();
    _mapScale = newMapScale;
    _invMapScale = invNewMapScale;
    updateMapTransform();
}
The Microsoft VS 2015 compiler generates the following assembly:
push ebp
mov ebp, esp
push -1
push __ehhandler$?onMouseScroll#MainLayer##AAEXPAVEventMouse#cocos2d###Z
mov eax, DWORD PTR fs:0
push eax
sub esp, 20 ; 00000014H
mov eax, DWORD PTR ___security_cookie
xor eax, ebp
mov DWORD PTR __$ArrayPad$[ebp], eax
push esi
push eax
lea eax, DWORD PTR __$EHRec$[ebp]
mov DWORD PTR fs:0, eax
mov esi, ecx
mov ecx, DWORD PTR _event$[ebp]
; File c:\program files (x86)\microsoft visual studio 14.0\vc\include\algorithm
; 3649 : return (_DEBUG_LT(_Right, _Left) ? _Right : _Left);
lea edx, DWORD PTR $T1[ebp]
; File e:\projects\aliceandbob\classes\mainlayer.cpp
; 121 : const float newMapScale = scrollAmount < 0.f ? std::min(_mapScale * k_deltaMapScaleMag, k_maxMapScale) : std::max(_mapScale * k_deltaMapScaleMin, k_minMapScale);
movss xmm1, DWORD PTR __real#3f800000
xorps xmm0, xmm0
comiss xmm0, DWORD PTR [ecx+56]
movss xmm0, DWORD PTR [esi+712]
jbe SHORT $LN3#onMouseScr
mulss xmm0, DWORD PTR __real#3f99999a
; File c:\program files (x86)\microsoft visual studio 14.0\vc\include\algorithm
; 3649 : return (_DEBUG_LT(_Right, _Left) ? _Right : _Left);
mov eax, OFFSET ?k_maxMapScale#?A0x69cbde3b##3MB
comiss xmm0, DWORD PTR __real#41200000
; File e:\projects\aliceandbob\classes\mainlayer.cpp
; 121 : const float newMapScale = scrollAmount < 0.f ? std::min(_mapScale * k_deltaMapScaleMag, k_maxMapScale) : std::max(_mapScale * k_deltaMapScaleMin, k_minMapScale);
jmp SHORT $LN18#onMouseScr
$LN3#onMouseScr:
mulss xmm0, DWORD PTR __real#3f4ccccd
; File c:\program files (x86)\microsoft visual studio 14.0\vc\include\algorithm
; 3612 : return (_DEBUG_LT(_Left, _Right) ? _Right : _Left);
mov eax, OFFSET ?k_minMapScale#?A0x69cbde3b##3MB
comiss xmm1, xmm0
$LN18#onMouseScr:
cmovbe eax, edx
movss xmm0, DWORD PTR [eax]
; File e:\projects\aliceandbob\classes\mainlayer.cpp
; 124 : const Vec2 anchorScreenSpace = event->getLocationInView();
lea eax, DWORD PTR _anchorScreenSpace$[ebp]
divss xmm1, xmm0
push eax
movss DWORD PTR tv161[ebp], xmm0
movss DWORD PTR _invNewMapScale$1$[ebp], xmm1
call DWORD PTR __imp_?getLocationInView#EventMouse#cocos2d##QBE?AVVec2#2#XZ
; 125 : _mapScale = newMapScale;
movss xmm0, DWORD PTR tv161[ebp]
; 126 : _invMapScale = invNewMapScale;
; 127 :
; 128 : updateMapTransform();
mov ecx, esi
mov DWORD PTR __$EHRec$[ebp+8], 0
movss DWORD PTR [esi+712], xmm0
movss xmm0, DWORD PTR _invNewMapScale$1$[ebp]
movss DWORD PTR [esi+716], xmm0
call ?updateMapTransform#MainLayer##AAEXXZ ; MainLayer::updateMapTransform
; 129 : }
mov ecx, DWORD PTR __$EHRec$[ebp]
mov DWORD PTR fs:0, ecx
pop ecx
pop esi
mov ecx, DWORD PTR __$ArrayPad$[ebp]
xor ecx, ebp
call #__security_check_cookie#4
mov esp, ebp
pop ebp
ret 4
The problem is that the compiler generates code that never stores the result of the expression into the newMapScale variable. As you can see, it also does not emit minss and maxss instructions (for std::max() and std::min()). If I forbid the compiler from using SSE instructions, it works as intended. I tried to reproduce this issue in a test project and failed: almost exactly the same code had minss and maxss instructions and stored the result of the expression into the newMapScale variable. Do I have some kind of undefined behavior, or is it just a compiler bug?
I use VS 2015 Update 3 with the VS 2015 - Windows XP (v140_xp) platform toolset.
UPD: I have kept only the source code that reproduces the problem. Removing any line from that code fixes the compiler's behavior. Please pay attention to the following lines:
movss xmm1, DWORD PTR __real#3f800000
xorps xmm0, xmm0
comiss xmm0, DWORD PTR [ecx+56]
movss xmm0, DWORD PTR [esi+712]
jbe SHORT $LN3#onMouseScr
mulss xmm0, DWORD PTR __real#3f99999a
; File c:\program files (x86)\microsoft visual studio 14.0\vc\include\algorithm
; 3649 : return (_DEBUG_LT(_Right, _Left) ? _Right : _Left);
mov eax, OFFSET ?k_maxMapScale#?A0x69cbde3b##3MB
comiss xmm0, DWORD PTR __real#41200000
; File e:\projects\aliceandbob\classes\mainlayer.cpp
; 121 : const float newMapScale = scrollAmount < 0.f ? std::min(_mapScale * k_deltaMapScaleMag, k_maxMapScale) : std::max(_mapScale * k_deltaMapScaleMin, k_minMapScale);
jmp SHORT $LN18#onMouseScr
$LN3#onMouseScr:
mulss xmm0, DWORD PTR __real#3f4ccccd
; File c:\program files (x86)\microsoft visual studio 14.0\vc\include\algorithm
; 3612 : return (_DEBUG_LT(_Left, _Right) ? _Right : _Left);
mov eax, OFFSET ?k_minMapScale#?A0x69cbde3b##3MB
comiss xmm1, xmm0
$LN18#onMouseScr:
cmovbe eax, edx
movss xmm0, DWORD PTR [eax]
; File e:\projects\aliceandbob\classes\mainlayer.cpp
; 124 : const Vec2 anchorScreenSpace = event->getLocationInView();
lea eax, DWORD PTR _anchorScreenSpace$[ebp]
divss xmm1, xmm0
push eax
As Andrey Turkin said, xmm0 gets discarded even though it contains the calculation result; uninitialized memory is loaded into xmm0 instead.
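If anyone else hits this, one workaround worth trying (a sketch on my part, assuming the fused ternary/std::min/std::max pattern is what trips the optimizer) is to split the ternary into an explicit branch:

void MainLayer::onMouseScroll(cocos2d::EventMouse* event)
{
    const float scrollAmount = event->getScrollY();
    // Explicit if/else instead of the ternary over std::min/std::max, so the
    // optimizer no longer fuses the conditional select with min/max.
    float newMapScale;
    if (scrollAmount < 0.f)
        newMapScale = std::min(_mapScale * k_deltaMapScaleMag, k_maxMapScale);
    else
        newMapScale = std::max(_mapScale * k_deltaMapScaleMin, k_minMapScale);
    const float invNewMapScale = 1.f / newMapScale;
    const Vec2 anchorScreenSpace = event->getLocationInView();
    _mapScale = newMapScale;
    _invMapScale = invNewMapScale;
    updateMapTransform();
}

Compiling the translation unit with /arch:IA32, which forbids SSE code generation on x86, also avoids the bad code, as noted above.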

Related

Loop unroll issue with Visual Studio compiler

I have a simple setup where I noticed that the VS compiler does not seem smart enough to unroll a loop that other compilers like clang or gcc do unroll. Am I missing some optimization flag for VS?
#include <cstddef>
struct A
{
    double data[4];
    double *begin() { return data; }
    double *end() { return data + 4; }
    double const *begin() const { return data; }
    double const *end() const { return data + 4; }
};

double sum_index(A const &a) {
    double ret = 0;
    for(std::size_t i = 0; i < 4; ++i)
    {
        ret += a.data[i];
    }
    return ret;
}

double sum_iter(A const &a) {
    double ret = 0;
    for(auto const &v : a)
    {
        ret += v;
    }
    return ret;
}
I used the https://godbolt.org/ compiler explorer to generate the assembly code.
gcc 11.2 with -O3:
sum_index(A const&):
pxor xmm0, xmm0
addsd xmm0, QWORD PTR [rdi]
addsd xmm0, QWORD PTR [rdi+8]
addsd xmm0, QWORD PTR [rdi+16]
addsd xmm0, QWORD PTR [rdi+24]
ret
sum_iter(A const&):
movsd xmm1, QWORD PTR [rdi]
addsd xmm1, QWORD PTR .LC0[rip]
movsd xmm0, QWORD PTR [rdi+8]
addsd xmm1, xmm0
movupd xmm0, XMMWORD PTR [rdi+16]
addsd xmm1, xmm0
unpckhpd xmm0, xmm0
addsd xmm0, xmm1
ret
.LC0:
.long 0
.long 0
clang 13.0.1 with -O3:
sum_index(A const&): # #sum_index(A const&)
xorpd xmm0, xmm0
addsd xmm0, qword ptr [rdi]
addsd xmm0, qword ptr [rdi + 8]
addsd xmm0, qword ptr [rdi + 16]
addsd xmm0, qword ptr [rdi + 24]
ret
sum_iter(A const&): # #sum_iter(A const&)
xorpd xmm0, xmm0
addsd xmm0, qword ptr [rdi]
addsd xmm0, qword ptr [rdi + 8]
addsd xmm0, qword ptr [rdi + 16]
addsd xmm0, qword ptr [rdi + 24]
ret
MSVC 19.30 with /O2 (there is no /O3?):
this$ = 8
double const * A::begin(void)const PROC ; A::begin, COMDAT
mov rax, rcx
ret 0
double const * A::begin(void)const ENDP ; A::begin
this$ = 8
double const * A::end(void)const PROC ; A::end, COMDAT
lea rax, QWORD PTR [rcx+32]
ret 0
double const * A::end(void)const ENDP ; A::end
a$ = 8
double sum_index(A const &) PROC ; sum_index, COMDAT
movsd xmm0, QWORD PTR [rcx]
xorps xmm1, xmm1
addsd xmm0, xmm1
addsd xmm0, QWORD PTR [rcx+8]
addsd xmm0, QWORD PTR [rcx+16]
addsd xmm0, QWORD PTR [rcx+24]
ret 0
double sum_index(A const &) ENDP ; sum_index
a$ = 8
double sum_iter(A const &) PROC ; sum_iter, COMDAT
lea rax, QWORD PTR [rcx+32]
xorps xmm0, xmm0
cmp rcx, rax
je SHORT $LN12#sum_iter
npad 4
$LL8#sum_iter:
addsd xmm0, QWORD PTR [rcx]
add rcx, 8
cmp rcx, rax
jne SHORT $LL8#sum_iter
$LN12#sum_iter:
ret 0
double sum_iter(A const &) ENDP ; sum_iter
Obviously there is a problem with unrolling the loop in MSVC. Is there some additional optimization flag I have to set?
Thanks for the help!
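One workaround while staying close to the iterator interface is to unroll by hand (a sketch; sum_iter_unrolled is my own name, and it assumes the fixed length 4 of struct A):

double sum_iter_unrolled(A const &a) {
    // Manually unrolled version of sum_iter, preserving the same
    // left-to-right accumulation order as the original loop.
    double const *p = a.begin();
    double ret = 0;
    ret += p[0];
    ret += p[1];
    ret += p[2];
    ret += p[3];
    return ret;
}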

Differences in custom and std fetch_add on floats

This is an attempt at implementing fetch_add on floats without C++20.
#include <cstdint>

void fetch_add(volatile float* x, float y)
{
    bool success = false;
    auto xi = (volatile std::int32_t*)x;
    while(!success)
    {
        union {
            std::int32_t sumint;
            float sum;
        };
        auto tmp = __atomic_load_n(xi, __ATOMIC_RELAXED);
        sumint = tmp;
        sum += y;
        success = __atomic_compare_exchange_n(xi, &tmp, sumint, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    }
}
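For reference, the standard-library version being compared against is presumably something like this (a reconstruction; the original post does not show it):

#include <atomic>

// C++20 adds fetch_add overloads for std::atomic<float>.
void fetch_add_std(std::atomic<float>& x, float y)
{
    x.fetch_add(y, std::memory_order_relaxed);
}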
To my great confusion, when I compare the assembly from gcc 10.1 with -O2 -std=c++2a for x86-64, the two versions differ.
fetch_add(float volatile*, float):
.L2:
mov eax, DWORD PTR [rdi]
movd xmm1, eax
addss xmm1, xmm0
movd edx, xmm1
lock cmpxchg DWORD PTR [rdi], edx
jne .L2
ret
fetch_add_std(std::atomic<float>&, float):
mov eax, DWORD PTR [rdi]
movaps xmm1, xmm0
movd xmm0, eax
mov DWORD PTR [rsp-4], eax
addss xmm0, xmm1
.L9:
mov eax, DWORD PTR [rsp-4]
movd edx, xmm0
lock cmpxchg DWORD PTR [rdi], edx
je .L6
mov DWORD PTR [rsp-4], eax
movss xmm0, DWORD PTR [rsp-4]
addss xmm0, xmm1
jmp .L9
.L6:
ret
My ability to read assembly is near non-existent, but the custom version looks correct to me; since the two differ, either my version is incorrect or inefficient, or the standard library is somehow rather broken. I don't quite believe the third case, which leads me to ask: is the custom version incorrect or inefficient?
After some comments, I wrote a second version that does not reload after cmpxchg. The two still differ.

Optimizing assembly generated by Microsoft Visual Studio Compiler

I'm working on a project with matrix multiplication. I was able to write the C code and generate assembly for it using the Microsoft Visual Studio 2012 compiler. The compiler-generated code is shown below. The compiler used the SSE registers, which is exactly what I wanted, but it is not the best code. I would like to optimize this code and write it inline with the C code, but I don't understand the assembly. Basically, the generated code is good for only one matrix dimension; the code below works only for a 4 by 4 matrix. How can I make it work for an n*n matrix?
The C++ code is shown below:
#define MAX_NUM 10
#define MAX_DIM 4

int main () {
    float mat_a [] = {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0};
    float mat_b [] = {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0};
    float result [] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
    int num_row = 4;
    int num_col = 4;
    float sum;
    for (int i = 0; i < num_row; i++) {
        for (int j = 0; j < num_col; j++) {
            sum = 0.0;
            for (int k = 0; k < num_row; k++) {
                sum = sum + mat_a[i * num_col + k] * mat_b[k * num_col + j];
            }
            *(result + i * num_col + j) = sum;
        }
    }
    return 0;
}
The assembly code is shown below:
; Listing generated by Microsoft (R) Optimizing Compiler Version 17.00.50727.1
TITLE C:\Users\GS\Documents\Visual Studio 2012\Projects\Assembly_InLine\Assembly_InLine\Source.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB MSVCRTD
INCLUDELIB OLDNAMES
PUBLIC _main
PUBLIC __real#00000000
PUBLIC __real#3f800000
PUBLIC __real#40000000
PUBLIC __real#40400000
PUBLIC __real#40800000
EXTRN #_RTC_CheckStackVars#8:PROC
EXTRN #__security_check_cookie#4:PROC
EXTRN __RTC_InitBase:PROC
EXTRN __RTC_Shutdown:PROC
EXTRN ___security_cookie:DWORD
EXTRN __fltused:DWORD
; COMDAT __real#40800000
CONST SEGMENT
__real#40800000 DD 040800000r ; 4
CONST ENDS
; COMDAT __real#40400000
CONST SEGMENT
__real#40400000 DD 040400000r ; 3
CONST ENDS
; COMDAT __real#40000000
CONST SEGMENT
__real#40000000 DD 040000000r ; 2
CONST ENDS
; COMDAT __real#3f800000
CONST SEGMENT
__real#3f800000 DD 03f800000r ; 1
CONST ENDS
; COMDAT __real#00000000
CONST SEGMENT
__real#00000000 DD 000000000r ; 0
CONST ENDS
; COMDAT rtc$TMZ
rtc$TMZ SEGMENT
__RTC_Shutdown.rtc$TMZ DD FLAT:__RTC_Shutdown
rtc$TMZ ENDS
; COMDAT rtc$IMZ
rtc$IMZ SEGMENT
__RTC_InitBase.rtc$IMZ DD FLAT:__RTC_InitBase
rtc$IMZ ENDS
; Function compile flags: /Odtp /RTCsu /ZI
; COMDAT _main
_TEXT SEGMENT
_k$1 = -288 ; size = 4
_j$2 = -276 ; size = 4
_i$3 = -264 ; size = 4
_sum$ = -252 ; size = 4
_num_col$ = -240 ; size = 4
_num_row$ = -228 ; size = 4
_result$ = -216 ; size = 64
_mat_b$ = -144 ; size = 64
_mat_a$ = -72 ; size = 64
__$ArrayPad$ = -4 ; size = 4
_main PROC ; COMDAT
; File c:\users\gs\documents\visual studio 2012\projects\assembly_inline\assembly_inline\source.cpp
; Line 4
push ebp
mov ebp, esp
sub esp, 484 ; 000001e4H
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-484]
mov ecx, 121 ; 00000079H
mov eax, -858993460 ; ccccccccH
rep stosd
mov eax, DWORD PTR ___security_cookie
xor eax, ebp
mov DWORD PTR __$ArrayPad$[ebp], eax
; Line 5
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+4], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+8], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+12], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp+16], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+20], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+24], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+28], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp+32], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+36], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+40], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+44], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp+48], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+52], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+56], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+60], xmm0
; Line 6
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+4], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+8], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+12], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp+16], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+20], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+24], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+28], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp+32], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+36], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+40], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+44], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp+48], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+52], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+56], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+60], xmm0
; Line 7
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+4], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+8], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+12], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+16], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+20], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+24], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+28], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+32], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+36], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+40], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+44], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+48], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+52], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+56], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+60], xmm0
; Line 9
mov DWORD PTR _num_row$[ebp], 4
; Line 10
mov DWORD PTR _num_col$[ebp], 4
; Line 14
mov DWORD PTR _i$3[ebp], 0
jmp SHORT $LN9#main
$LN8#main:
mov eax, DWORD PTR _i$3[ebp]
add eax, 1
mov DWORD PTR _i$3[ebp], eax
$LN9#main:
mov eax, DWORD PTR _i$3[ebp]
cmp eax, DWORD PTR _num_row$[ebp]
jge $LN7#main
; Line 15
mov DWORD PTR _j$2[ebp], 0
jmp SHORT $LN6#main
$LN5#main:
mov eax, DWORD PTR _j$2[ebp]
add eax, 1
mov DWORD PTR _j$2[ebp], eax
$LN6#main:
mov eax, DWORD PTR _j$2[ebp]
cmp eax, DWORD PTR _num_col$[ebp]
jge $LN4#main
; Line 16
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _sum$[ebp], xmm0
; Line 17
mov DWORD PTR _k$1[ebp], 0
jmp SHORT $LN3#main
$LN2#main:
mov eax, DWORD PTR _k$1[ebp]
add eax, 1
mov DWORD PTR _k$1[ebp], eax
$LN3#main:
mov eax, DWORD PTR _k$1[ebp]
cmp eax, DWORD PTR _num_row$[ebp]
jge SHORT $LN1#main
; Line 18
mov eax, DWORD PTR _i$3[ebp]
imul eax, DWORD PTR _num_col$[ebp]
add eax, DWORD PTR _k$1[ebp]
mov ecx, DWORD PTR _k$1[ebp]
imul ecx, DWORD PTR _num_col$[ebp]
add ecx, DWORD PTR _j$2[ebp]
movss xmm0, DWORD PTR _mat_a$[ebp+eax*4]
mulss xmm0, DWORD PTR _mat_b$[ebp+ecx*4]
addss xmm0, DWORD PTR _sum$[ebp]
movss DWORD PTR _sum$[ebp], xmm0
; Line 19
jmp SHORT $LN2#main
$LN1#main:
; Line 20
mov eax, DWORD PTR _i$3[ebp]
imul eax, DWORD PTR _num_col$[ebp]
lea ecx, DWORD PTR _result$[ebp+eax*4]
mov edx, DWORD PTR _j$2[ebp]
movss xmm0, DWORD PTR _sum$[ebp]
movss DWORD PTR [ecx+edx*4], xmm0
; Line 21
jmp $LN5#main
$LN4#main:
; Line 22
jmp $LN8#main
$LN7#main:
; Line 24
xor eax, eax
; Line 25
push edx
mov ecx, ebp
push eax
lea edx, DWORD PTR $LN16#main
call #_RTC_CheckStackVars#8
pop eax
pop edx
pop edi
pop esi
pop ebx
mov ecx, DWORD PTR __$ArrayPad$[ebp]
xor ecx, ebp
call #__security_check_cookie#4
mov esp, ebp
pop ebp
ret 0
npad 1
$LN16#main:
DD 3
DD $LN15#main
$LN15#main:
DD -72 ; ffffffb8H
DD 64 ; 00000040H
DD $LN12#main
DD -144 ; ffffff70H
DD 64 ; 00000040H
DD $LN13#main
DD -216 ; ffffff28H
DD 64 ; 00000040H
DD $LN14#main
$LN14#main:
DB 114 ; 00000072H
DB 101 ; 00000065H
DB 115 ; 00000073H
DB 117 ; 00000075H
DB 108 ; 0000006cH
DB 116 ; 00000074H
DB 0
$LN13#main:
DB 109 ; 0000006dH
DB 97 ; 00000061H
DB 116 ; 00000074H
DB 95 ; 0000005fH
DB 98 ; 00000062H
DB 0
$LN12#main:
DB 109 ; 0000006dH
DB 97 ; 00000061H
DB 116 ; 00000074H
DB 95 ; 0000005fH
DB 97 ; 00000061H
DB 0
_main ENDP
_TEXT ENDS
END
Visual Studio and SSE are a red herring here (as is the C++ vs. C question). Assuming you compile in Release mode, there are other reasons your code is inefficient, especially for large matrices. The main reason is that it's cache unfriendly. To make your code efficient for an arbitrary n*n matrix, you need to optimize for both large and small sizes.
It's important to optimize for the cache BEFORE employing SIMD or threads. In the code below I use block multiplication to speed up your code for a 1024x1024 matrix by more than a factor of ten (7.1 s with the old code, 0.6 s with the new) using only a single thread and without using SSE/AVX. It's not going to do any good to use SIMD if your code is memory bound.
I have already described a first-order improvement to matrix multiplication, using the transpose, here:
OpenMP C++ Matrix Multiplication run slower in parallel
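In brief, the idea there is to transpose b so the innermost loop walks both matrices with unit stride (a sketch, not the exact code from that answer; gemm_transpose is my own name):

// Transpose trick: after transposing b into bt, the innermost loop reads
// both a and bt row-wise, which is much friendlier to the cache than
// striding down b's columns. Assumes <stdlib.h> for malloc/free.
void gemm_transpose(float *a, float *b, float *c, int n) {
    float *bt = (float*)malloc(sizeof(float)*n*n);
    for(int i=0; i<n; i++)
        for(int j=0; j<n; j++)
            bt[j*n+i] = b[i*n+j];
    for(int i=0; i<n; i++) {
        for(int j=0; j<n; j++) {
            float sum = 0;
            for(int k=0; k<n; k++) {
                sum += a[i*n+k]*bt[j*n+k];
            }
            c[i*n+j] = sum;
        }
    }
    free(bt);
}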
But let me describe an even more cache-friendly method. Let's assume your hardware has two types of memory:
small and fast,
large and slow.
In reality, modern CPUs have several levels of this (L1 small and fast, L2 larger and slower, L3 even larger and slower, main memory larger and slower still; some CPUs even have an L4), but this simple model with only two levels will still lead to a big improvement in performance.
Using this model with two types of memory you can show that you will get the best performance by dividing your matrix into square tiles which fit in the small and fast memory and doing block matrix multiplication. Next you want to rearrange the memory so that the elements of each tile are contiguous.
Below is some code showing how to do this. I used a block size of 64x64 on a 1024x1024 matrix. It took 7 s with your code and 0.65 s with mine. The matrix size has to be a multiple of 64x64, but it's easy to extend this to an arbitrary-size matrix. If you want to see an example of how to optimize the blocks, see Difference in performance between MSVC and GCC for highly optimized matrix multiplication code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
void reorder(float *a, float *b, int n, int bs) {
    int nb = n/bs;
    int cnt = 0;
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int i2=0; i2<bs; i2++) {
                for(int j2=0; j2<bs; j2++) {
                    b[cnt++] = a[bs*(i*n+j) + i2*n + j2];
                }
            }
        }
    }
}

void gemm_slow(float *a, float *b, float *c, int n) {
    for(int i=0; i<n; i++) {
        for(int j=0; j<n; j++) {
            float sum = c[i*n+j];
            for(int k=0; k<n; k++) {
                sum += a[i*n+k]*b[k*n+j];
            }
            c[i*n+j] = sum;
        }
    }
}

void gemm_block(float *a, float *b, float *c, int n, int n2) {
    for(int i=0; i<n2; i++) {
        for(int j=0; j<n2; j++) {
            float sum = c[i*n+j];
            for(int k=0; k<n2; k++) {
                sum += a[i*n+k]*b[k*n2+j];
            }
            c[i*n+j] = sum;
        }
    }
}

void gemm(float *a, float *b, float *c, int n, int bs) {
    int nb = n/bs;
    float *b2 = (float*)malloc(sizeof(float)*n*n);
    reorder(b,b2,n,bs);
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int k=0; k<nb; k++) {
                gemm_block(&a[bs*(i*n+k)],&b2[bs*bs*(k*nb+j)],&c[bs*(i*n+j)], n, bs);
            }
        }
    }
    free(b2);
}

int main() {
    const int bs = 64;
    const int n = 1024;
    float *a = new float[n*n];
    float *b = new float[n*n];
    float *c1 = new float[n*n]();
    float *c2 = new float[n*n]();
    for(int i=0; i<n*n; i++) {
        a[i] = 1.0*rand()/RAND_MAX;
        b[i] = 1.0*rand()/RAND_MAX;
    }
    double dtime;
    dtime = omp_get_wtime();
    gemm_slow(a,b,c1,n);
    dtime = omp_get_wtime() - dtime;
    printf("%f\n", dtime);
    dtime = omp_get_wtime();
    gemm(a,b,c2,n,64);
    dtime = omp_get_wtime() - dtime;
    printf("%f\n", dtime);
    printf("%d\n", memcmp(c1,c2, sizeof(float)*n*n));
}

Why is __fastcall assembler code larger than __stdcall code in MS C++?

I have disassembled two different variations of a Swap function (a simple value swap between two pointers).
1). __fastcall http://pastebin.com/ux5LMktz
2). __stdcall (a function without an explicit calling-convention modifier gets __stdcall by default, because of the MS C++ compiler for Windows) http://pastebin.com/eGR6VUjX
As I understand it, __fastcall is implemented differently depending on the compiler, but basically it puts the first two arguments (left to right) into the ECX and EDX registers; the stack is used only if the arguments don't fit in registers.
But in the listing for the first option, you can see that the value is pushed into the ECX register, and there is no real difference between the two variations of the swap function.
And the __fastcall variant does use:
00AA261F pop ecx
00AA2620 mov dword ptr [ebp-14h],edx
00AA2623 mov dword ptr [ebp-8],ecx
These instructions are not used in the __stdcall version.
So it doesn't look more optimized (as __fastcall should be, by its definition).
I'm a newbie in ASM and calling conventions, so I'm asking for a piece of advice. Maybe __fastcall is faster in exactly my sample, but I just don't see it, do I?
Thanks!
Try turning on optimization, then comparing the results. Your fastcall version has many redundant operations because it's not optimized.
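For reference, the function being compiled is presumably something like the following (a reconstruction; the pastebin sources are not shown, and the CallMe names come from the listings below). The je in the listings suggests the swap is guarded by an equality check:

// Presumed source of the swap under test (a reconstruction).
void __fastcall CallMe1(int* firstValue, int* secondValue)
{
    if (*firstValue != *secondValue)
    {
        int tmp = *firstValue;
        *firstValue = *secondValue;
        *secondValue = tmp;
    }
}

CallMe2 and CallMe3 would be identical apart from the __stdcall/__cdecl keyword.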
Here's output of VS 2010 with /Ox.
fastcall:
; _firstValue$ = ecx
; _secondValue$ = edx
?CallMe1##YIXPAH0#Z PROC ; CallMe1
mov eax, DWORD PTR [ecx]
push esi
mov esi, DWORD PTR [edx]
cmp eax, esi
je SHORT $LN1#CallMe1
mov DWORD PTR [ecx], esi
mov DWORD PTR [edx], eax
$LN1#CallMe1:
pop esi
ret 0
?CallMe1##YIXPAH0#Z ENDP ; CallMe1
stdcall:
_firstValue$ = 8 ; size = 4
_secondValue$ = 12 ; size = 4
?CallMe2##YGXPAH0#Z PROC ; CallMe2
mov edx, DWORD PTR _firstValue$[esp-4]
mov eax, DWORD PTR [edx]
push esi
mov esi, DWORD PTR _secondValue$[esp]
mov ecx, DWORD PTR [esi]
cmp eax, ecx
je SHORT $LN1#CallMe2
mov DWORD PTR [edx], ecx
mov DWORD PTR [esi], eax
$LN1#CallMe2:
pop esi
ret 8
?CallMe2##YGXPAH0#Z ENDP ; CallMe2
cdecl (what you mistakenly call stdcall in your example):
_firstValue$ = 8 ; size = 4
_secondValue$ = 12 ; size = 4
?CallMe3##YAXPAH0#Z PROC ; CallMe3
mov edx, DWORD PTR _firstValue$[esp-4]
mov eax, DWORD PTR [edx]
push esi
mov esi, DWORD PTR _secondValue$[esp]
mov ecx, DWORD PTR [esi]
cmp eax, ecx
je SHORT $LN1#CallMe3
mov DWORD PTR [edx], ecx
mov DWORD PTR [esi], eax
$LN1#CallMe3:
pop esi
ret 0
?CallMe3##YAXPAH0#Z ENDP ; CallMe3

Why does adding extra check in loop make big difference on some machines, and small difference on others?

I have been doing some testing to see how much of a difference additional bounds checking makes in loops. This is prompted by thinking about the cost of implicit bounds checking inserted by languages such as C#, Java etc, when you access arrays.
Update: I have tried the same executable program on several additional computers, which throws a lot more light on what is happening. I've listed the original computer first and my modern laptop second. On my modern laptop, adding additional checks in the loop adds only between 1 and 4% to the time taken, compared to between 3 and 30% on the original hardware.
Processor x86 Family 6 Model 30 Stepping 5 GenuineIntel ~2793 Mhz
Ratio 2 checks : 1 check = 1.0310
Ratio 3 checks : 1 check = 1.2769
Processor Intel(R) Core(TM) i7-3610QM CPU @ 2.30GHz, 2301 Mhz, 4 Core(s), 8 Logical Processor(s)
Ratio 2 checks : 1 check = 1.0090
Ratio 3 checks : 1 check = 1.0393
Processor Intel(R) Core(TM) i5-2500 CPU @ 3.30GHz, 4 Core(s)
Ratio 2 checks : 1 check = 1.0035
Ratio 3 checks : 1 check = 1.0639
Processor Intel(R) Core(TM)2 Duo CPU T9300 @ 2.50GHz, 2501 Mhz, 2 Core(s), 2 Logical Processor(s)
Ratio 2 checks : 1 check = 1.1195
Ratio 3 checks : 1 check = 1.3597
Processor x86 Family 15 Model 43 Stepping 1 AuthenticAMD ~2010 Mhz
Ratio 2 checks : 1 check = 1.0776
Ratio 3 checks : 1 check = 1.1451
In the test program below, the first function checks just one bound, the second checks two, and the third checks three (in the calling code, n1=n2=n3). I found that the ratio of two checks to one was about 1.03, and of three checks to one about 1.3. I was surprised that adding one more check made such a difference to performance. I got an interesting answer concerning the low cost of bounds checking on modern processors to my original question, which may throw some light on the differences observed here.
Note that it's important to compile the program without whole program optimization turned on; otherwise the compiler can simply remove the additional bounds checking.
// dotprod.cpp
#include "dotprod.h"
double SumProduct(const double* v1, const double* v2, int n)
{
    double sum=0;
    for(int i=0;
        i<n;
        ++i)
        sum += v1[i]*v2[i];
    return sum;
}

double SumProduct(const double* v1, const double* v2, int n1, int n2)
{
    double sum=0;
    for(int i=0;
        i<n1 && i <n2;
        ++i)
        sum += v1[i]*v2[i];
    return sum;
}

double SumProduct(const double* v1, const double* v2, int n1, int n2, int n3)
{
    double sum=0;
    for(int i=0;
        i<n1 && i <n2 && i <n3;
        ++i)
        sum += v1[i]*v2[i];
    return sum;
}
This code was originally built using Visual Studio 2010, Release, Win32 (I've added the 'C' tag because the reasoning behind the difference in speed is not likely to be C++ specific, and may not be Windows specific). Can anyone explain it?
Rest of the code below, for information. This has some C++ specific stuff in it.
Header file
// dotprod.h
double SumProduct(const double*, const double*, int n);
double SumProduct(const double*, const double*, int n1, int n2);
double SumProduct(const double*, const double*, int n1, int n2, int n3);
Test harness
// main.cpp
#include <stdio.h>
#include <math.h>
#include <numeric>
#include <vector>
#include <windows.h>
#include "../dotprod/dotprod.h" // separate lib
typedef __int64 timecount_t;
inline timecount_t GetTimeCount()
{
    LARGE_INTEGER li;
    if (!QueryPerformanceCounter(&li)) {
        exit(1);
    }
    return li.QuadPart;
}

int main()
{
    typedef std::vector<double> dvec;
    const int N = 100 * 1000;

    // Initialize
    dvec v1(N);
    dvec v2(N);
    dvec dp1(N);
    dvec dp2(N);
    dvec dp3(N);
    for(int i=0; i<N; ++i) {
        v1[i] = i;
        v2[i] = log(static_cast<double>(i+1));
    }

    const timecount_t t0 = GetTimeCount();
    // Check cost with one bound
    for(int n=0; n<N; ++n) {
        dp1[n] = SumProduct(&(v1[0]),&(v2[0]),n);
    }
    const timecount_t t1 = GetTimeCount();
    // Check cost with two bounds
    for(int n=0; n<N; ++n) {
        dp2[n] = SumProduct(&(v1[0]),&(v2[0]),n,n);
    }
    const timecount_t t2 = GetTimeCount();
    // Check cost with three bounds
    for(int n=0; n<N; ++n) {
        dp3[n] = SumProduct(&(v1[0]),&(v2[0]),n,n,n);
    }
    const timecount_t t3 = GetTimeCount();

    // Check results
    const double sumSumProducts1 = std::accumulate(dp1.begin(), dp1.end(), 0.0);
    const double sumSumProducts2 = std::accumulate(dp2.begin(), dp2.end(), 0.0);
    const double sumSumProducts3 = std::accumulate(dp3.begin(), dp3.end(), 0.0);
    printf("Sums of dot products: %.1f, %.1f, %.1f\n", sumSumProducts1, sumSumProducts2, sumSumProducts3);

    // Output timings
    const timecount_t elapsed1 = t1-t0;
    const timecount_t elapsed2 = t2-t1;
    const timecount_t elapsed3 = t3-t2;
    printf("Elapsed: %.0f, %.0f, %.0f\n",
        static_cast<double>(elapsed1),
        static_cast<double>(elapsed2),
        static_cast<double>(elapsed3));
    const double ratio2to1 = elapsed2 / static_cast<double>(elapsed1);
    const double ratio3to1 = elapsed3 / static_cast<double>(elapsed1);
    printf("Ratio 2:1=%.2f\n", ratio2to1);
    printf("Ratio 3:1=%.2f\n", ratio3to1);
    return 0;
}
In order to produce the assembly, I took the advice in this answer (case 2, turning off whole-program optimization), producing the following asm file.
; Listing generated by Microsoft (R) Optimizing Compiler Version 16.00.40219.01
TITLE C:\dev\TestSpeed\dotprod\dotprod.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB OLDNAMES
PUBLIC __real#0000000000000000
PUBLIC ?SumProduct##YANPBN0HHH#Z ; SumProduct
EXTRN __fltused:DWORD
; COMDAT __real#0000000000000000
; File c:\dev\testspeed\dotprod\dotprod.cpp
CONST SEGMENT
__real#0000000000000000 DQ 00000000000000000r ; 0
; Function compile flags: /Ogtp
CONST ENDS
; COMDAT ?SumProduct##YANPBN0HHH#Z
_TEXT SEGMENT
tv491 = -4 ; size = 4
_v1$ = 8 ; size = 4
_v2$ = 12 ; size = 4
_n1$ = 16 ; size = 4
_n2$ = 20 ; size = 4
_n3$ = 24 ; size = 4
?SumProduct##YANPBN0HHH#Z PROC ; SumProduct, COMDAT
; 25 : {
push ebp
mov ebp, esp
push ecx
; 26 : double sum=0;
fldz
push ebx
mov ebx, DWORD PTR _v2$[ebp]
push esi
push edi
mov edi, DWORD PTR _n1$[ebp]
; 27 : for(int i=0;
xor ecx, ecx
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edi, 4
jl $LC8#SumProduct
; 26 : double sum=0;
mov edi, DWORD PTR _v1$[ebp]
lea esi, DWORD PTR [edi+24]
; 30 : sum += v1[i]*v2[i];
sub edi, ebx
lea edx, DWORD PTR [ecx+2]
lea eax, DWORD PTR [ebx+8]
mov DWORD PTR tv491[ebp], edi
$LN15#SumProduct:
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
mov ebx, DWORD PTR _n2$[ebp]
cmp ecx, ebx
jge $LN9#SumProduct
cmp ecx, DWORD PTR _n3$[ebp]
jge $LN9#SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax-8]
lea edi, DWORD PTR [edx-1]
fmul QWORD PTR [esi-24]
faddp ST(1), ST(0)
cmp edi, ebx
jge SHORT $LN9#SumProduct
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edi, DWORD PTR _n3$[ebp]
jge SHORT $LN9#SumProduct
; 30 : sum += v1[i]*v2[i];
mov edi, DWORD PTR tv491[ebp]
fld QWORD PTR [edi+eax]
fmul QWORD PTR [eax]
faddp ST(1), ST(0)
cmp edx, ebx
jge SHORT $LN9#SumProduct
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edx, DWORD PTR _n3$[ebp]
jge SHORT $LN9#SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+8]
lea edi, DWORD PTR [edx+1]
fmul QWORD PTR [esi-8]
faddp ST(1), ST(0)
cmp edi, ebx
jge SHORT $LN9#SumProduct
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp edi, DWORD PTR _n3$[ebp]
jge SHORT $LN9#SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+16]
mov edi, DWORD PTR _n1$[ebp]
fmul QWORD PTR [esi]
add ecx, 4
lea ebx, DWORD PTR [edi-3]
add eax, 32 ; 00000020H
add esi, 32 ; 00000020H
faddp ST(1), ST(0)
add edx, 4
cmp ecx, ebx
jl SHORT $LN15#SumProduct
mov ebx, DWORD PTR _v2$[ebp]
$LC8#SumProduct:
; 28 : i<n1 && i <n2 && i <n3;
; 29 : ++i)
cmp ecx, edi
jge SHORT $LN9#SumProduct
mov edx, DWORD PTR _v1$[ebp]
lea eax, DWORD PTR [ebx+ecx*8]
sub edx, ebx
$LC3#SumProduct:
cmp ecx, DWORD PTR _n2$[ebp]
jge SHORT $LN9#SumProduct
cmp ecx, DWORD PTR _n3$[ebp]
jge SHORT $LN9#SumProduct
; 30 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+edx]
inc ecx
fmul QWORD PTR [eax]
add eax, 8
faddp ST(1), ST(0)
cmp ecx, edi
jl SHORT $LC3#SumProduct
$LN9#SumProduct:
; 31 : return sum;
; 32 : }
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
?SumProduct##YANPBN0HHH#Z ENDP ; SumProduct
_TEXT ENDS
PUBLIC ?SumProduct##YANPBN0HH#Z ; SumProduct
; Function compile flags: /Ogtp
; COMDAT ?SumProduct##YANPBN0HH#Z
_TEXT SEGMENT
tv448 = -4 ; size = 4
_v1$ = 8 ; size = 4
_v2$ = 12 ; size = 4
_n1$ = 16 ; size = 4
_n2$ = 20 ; size = 4
?SumProduct##YANPBN0HH#Z PROC ; SumProduct, COMDAT
; 15 : {
push ebp
mov ebp, esp
push ecx
; 16 : double sum=0;
fldz
push ebx
mov ebx, DWORD PTR _v2$[ebp]
push esi
push edi
mov edi, DWORD PTR _n1$[ebp]
; 17 : for(int i=0;
xor ecx, ecx
; 18 : i<n1 && i <n2;
; 19 : ++i)
cmp edi, 4
jl SHORT $LC8#SumProduct#2
; 16 : double sum=0;
mov edi, DWORD PTR _v1$[ebp]
lea edx, DWORD PTR [edi+24]
; 20 : sum += v1[i]*v2[i];
sub edi, ebx
lea esi, DWORD PTR [ecx+2]
lea eax, DWORD PTR [ebx+8]
mov DWORD PTR tv448[ebp], edi
$LN19#SumProduct#2:
mov edi, DWORD PTR _n2$[ebp]
cmp ecx, edi
jge SHORT $LN9#SumProduct#2
fld QWORD PTR [eax-8]
lea ebx, DWORD PTR [esi-1]
fmul QWORD PTR [edx-24]
faddp ST(1), ST(0)
cmp ebx, edi
jge SHORT $LN9#SumProduct#2
mov ebx, DWORD PTR tv448[ebp]
fld QWORD PTR [ebx+eax]
fmul QWORD PTR [eax]
faddp ST(1), ST(0)
cmp esi, edi
jge SHORT $LN9#SumProduct#2
fld QWORD PTR [eax+8]
lea ebx, DWORD PTR [esi+1]
fmul QWORD PTR [edx-8]
faddp ST(1), ST(0)
cmp ebx, edi
jge SHORT $LN9#SumProduct#2
fld QWORD PTR [eax+16]
mov edi, DWORD PTR _n1$[ebp]
fmul QWORD PTR [edx]
add ecx, 4
lea ebx, DWORD PTR [edi-3]
add eax, 32 ; 00000020H
add edx, 32 ; 00000020H
faddp ST(1), ST(0)
add esi, 4
cmp ecx, ebx
jl SHORT $LN19#SumProduct#2
mov ebx, DWORD PTR _v2$[ebp]
$LC8#SumProduct#2:
; 18 : i<n1 && i <n2;
; 19 : ++i)
cmp ecx, edi
jge SHORT $LN9#SumProduct#2
mov edx, DWORD PTR _v1$[ebp]
lea eax, DWORD PTR [ebx+ecx*8]
sub edx, ebx
$LC3#SumProduct#2:
cmp ecx, DWORD PTR _n2$[ebp]
jge SHORT $LN9#SumProduct#2
; 20 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+edx]
inc ecx
fmul QWORD PTR [eax]
add eax, 8
faddp ST(1), ST(0)
cmp ecx, edi
jl SHORT $LC3#SumProduct#2
$LN9#SumProduct#2:
; 21 : return sum;
; 22 : }
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 0
?SumProduct##YANPBN0HH#Z ENDP ; SumProduct
_TEXT ENDS
PUBLIC ?SumProduct##YANPBN0H#Z ; SumProduct
; Function compile flags: /Ogtp
; COMDAT ?SumProduct##YANPBN0H#Z
_TEXT SEGMENT
_v1$ = 8 ; size = 4
_v2$ = 12 ; size = 4
?SumProduct##YANPBN0H#Z PROC ; SumProduct, COMDAT
; _n$ = eax
; 5 : {
push ebp
mov ebp, esp
mov edx, DWORD PTR _v2$[ebp]
; 6 : double sum=0;
fldz
push ebx
push esi
mov esi, eax
; 7 : for(int i=0;
xor ebx, ebx
push edi
mov edi, DWORD PTR _v1$[ebp]
; 8 : i<n;
; 9 : ++i)
cmp esi, 4
jl SHORT $LC9#SumProduct#3
; 6 : double sum=0;
lea eax, DWORD PTR [edx+8]
lea ecx, DWORD PTR [edi+24]
; 10 : sum += v1[i]*v2[i];
sub edi, edx
lea edx, DWORD PTR [esi-4]
shr edx, 2
inc edx
lea ebx, DWORD PTR [edx*4]
$LN10#SumProduct#3:
fld QWORD PTR [eax-8]
add eax, 32 ; 00000020H
fmul QWORD PTR [ecx-24]
add ecx, 32 ; 00000020H
dec edx
faddp ST(1), ST(0)
fld QWORD PTR [edi+eax-32]
fmul QWORD PTR [eax-32]
faddp ST(1), ST(0)
fld QWORD PTR [eax-24]
fmul QWORD PTR [ecx-40]
faddp ST(1), ST(0)
fld QWORD PTR [eax-16]
fmul QWORD PTR [ecx-32]
faddp ST(1), ST(0)
jne SHORT $LN10#SumProduct#3
; 6 : double sum=0;
mov edx, DWORD PTR _v2$[ebp]
mov edi, DWORD PTR _v1$[ebp]
$LC9#SumProduct#3:
; 8 : i<n;
; 9 : ++i)
cmp ebx, esi
jge SHORT $LN8#SumProduct#3
sub edi, edx
lea eax, DWORD PTR [edx+ebx*8]
sub esi, ebx
$LC3#SumProduct#3:
; 10 : sum += v1[i]*v2[i];
fld QWORD PTR [eax+edi]
add eax, 8
dec esi
fmul QWORD PTR [eax-8]
faddp ST(1), ST(0)
jne SHORT $LC3#SumProduct#3
$LN8#SumProduct#3:
; 11 : return sum;
; 12 : }
pop edi
pop esi
pop ebx
pop ebp
ret 0
?SumProduct##YANPBN0H#Z ENDP ; SumProduct
_TEXT ENDS
END
One big difference between CPUs is pipeline optimization.
The CPU can execute several instructions in parallel until it reaches a conditional branch. From that point, instead of waiting until the condition is available and ready to be evaluated, the CPU can speculatively continue down one branch in parallel. If the guess was correct, we have a gain; otherwise the CPU discards that work and continues down the other branch.
So the tricky part for a CPU is to find the best assumptions and to execute as many instructions in parallel as possible.
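One way to see how much of this cost is avoidable (a sketch on my part, not from the original answer; SumProductHoisted is a made-up name): since i<n1 && i<n2 && i<n3 terminates at exactly i == min(n1, n2, n3), the extra checks can be hoisted out of the loop, leaving one well-predicted branch per iteration:

#include <algorithm>

// Equivalent to the three-bound SumProduct: the short-circuit && over
// i<n1, i<n2, i<n3 stops at exactly i == min(n1, min(n2, n3)).
double SumProductHoisted(const double* v1, const double* v2, int n1, int n2, int n3)
{
    const int n = std::min(n1, std::min(n2, n3));
    double sum = 0;
    for(int i = 0; i < n; ++i)
        sum += v1[i]*v2[i];
    return sum;
}

This is essentially what whole-program optimization does for the test harness (where n1=n2=n3), which is why it has to be turned off for the measurement.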