Optimizing assembly generated by Microsoft Visual Studio compiler - C++
I'm working on a project involving matrix multiplication. I wrote the C code and generated the assembly for it using the Microsoft Visual Studio 2012 compiler. The compiler-generated code is shown below. The compiler used the SSE registers, which is exactly what I wanted, but it is not the best code. I want to optimize this code and write it inline with the C code, but I don't understand the assembly. Basically the assembly is good for only one matrix dimension; the code below only works for a 4 by 4 matrix. How can I make it work for an n*n matrix?
The C++ code is shown below:
#define MAX_NUM 10
#define MAX_DIM 4

int main () {
    float mat_a [] = {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0};
    float mat_b [] = {1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0};
    float result [] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
    int num_row = 4;
    int num_col = 4;
    float sum;

    for (int i = 0; i < num_row; i++) {
        for (int j = 0; j < num_col; j++) {
            sum = 0.0;
            for (int k = 0; k < num_row; k++) {
                sum = sum + mat_a[i * num_col + k] * mat_b[k * num_col + j];
            }
            *(result + i * num_col + j) = sum;
        }
    }
    return 0;
}
The assembly code is shown below:
; Listing generated by Microsoft (R) Optimizing Compiler Version 17.00.50727.1
TITLE C:\Users\GS\Documents\Visual Studio 2012\Projects\Assembly_InLine\Assembly_InLine\Source.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB MSVCRTD
INCLUDELIB OLDNAMES
PUBLIC _main
PUBLIC __real#00000000
PUBLIC __real#3f800000
PUBLIC __real#40000000
PUBLIC __real#40400000
PUBLIC __real#40800000
EXTRN #_RTC_CheckStackVars#8:PROC
EXTRN #__security_check_cookie#4:PROC
EXTRN __RTC_InitBase:PROC
EXTRN __RTC_Shutdown:PROC
EXTRN ___security_cookie:DWORD
EXTRN __fltused:DWORD
; COMDAT __real#40800000
CONST SEGMENT
__real#40800000 DD 040800000r ; 4
CONST ENDS
; COMDAT __real#40400000
CONST SEGMENT
__real#40400000 DD 040400000r ; 3
CONST ENDS
; COMDAT __real#40000000
CONST SEGMENT
__real#40000000 DD 040000000r ; 2
CONST ENDS
; COMDAT __real#3f800000
CONST SEGMENT
__real#3f800000 DD 03f800000r ; 1
CONST ENDS
; COMDAT __real#00000000
CONST SEGMENT
__real#00000000 DD 000000000r ; 0
CONST ENDS
; COMDAT rtc$TMZ
rtc$TMZ SEGMENT
__RTC_Shutdown.rtc$TMZ DD FLAT:__RTC_Shutdown
rtc$TMZ ENDS
; COMDAT rtc$IMZ
rtc$IMZ SEGMENT
__RTC_InitBase.rtc$IMZ DD FLAT:__RTC_InitBase
rtc$IMZ ENDS
; Function compile flags: /Odtp /RTCsu /ZI
; COMDAT _main
_TEXT SEGMENT
_k$1 = -288 ; size = 4
_j$2 = -276 ; size = 4
_i$3 = -264 ; size = 4
_sum$ = -252 ; size = 4
_num_col$ = -240 ; size = 4
_num_row$ = -228 ; size = 4
_result$ = -216 ; size = 64
_mat_b$ = -144 ; size = 64
_mat_a$ = -72 ; size = 64
__$ArrayPad$ = -4 ; size = 4
_main PROC ; COMDAT
; File c:\users\gs\documents\visual studio 2012\projects\assembly_inline\assembly_inline\source.cpp
; Line 4
push ebp
mov ebp, esp
sub esp, 484 ; 000001e4H
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-484]
mov ecx, 121 ; 00000079H
mov eax, -858993460 ; ccccccccH
rep stosd
mov eax, DWORD PTR ___security_cookie
xor eax, ebp
mov DWORD PTR __$ArrayPad$[ebp], eax
; Line 5
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+4], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+8], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+12], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp+16], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+20], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+24], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+28], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp+32], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+36], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+40], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+44], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_a$[ebp+48], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_a$[ebp+52], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_a$[ebp+56], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_a$[ebp+60], xmm0
; Line 6
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+4], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+8], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+12], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp+16], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+20], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+24], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+28], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp+32], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+36], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+40], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+44], xmm0
movss xmm0, DWORD PTR __real#3f800000
movss DWORD PTR _mat_b$[ebp+48], xmm0
movss xmm0, DWORD PTR __real#40000000
movss DWORD PTR _mat_b$[ebp+52], xmm0
movss xmm0, DWORD PTR __real#40400000
movss DWORD PTR _mat_b$[ebp+56], xmm0
movss xmm0, DWORD PTR __real#40800000
movss DWORD PTR _mat_b$[ebp+60], xmm0
; Line 7
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+4], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+8], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+12], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+16], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+20], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+24], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+28], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+32], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+36], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+40], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+44], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+48], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+52], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+56], xmm0
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _result$[ebp+60], xmm0
; Line 9
mov DWORD PTR _num_row$[ebp], 4
; Line 10
mov DWORD PTR _num_col$[ebp], 4
; Line 14
mov DWORD PTR _i$3[ebp], 0
jmp SHORT $LN9#main
$LN8#main:
mov eax, DWORD PTR _i$3[ebp]
add eax, 1
mov DWORD PTR _i$3[ebp], eax
$LN9#main:
mov eax, DWORD PTR _i$3[ebp]
cmp eax, DWORD PTR _num_row$[ebp]
jge $LN7#main
; Line 15
mov DWORD PTR _j$2[ebp], 0
jmp SHORT $LN6#main
$LN5#main:
mov eax, DWORD PTR _j$2[ebp]
add eax, 1
mov DWORD PTR _j$2[ebp], eax
$LN6#main:
mov eax, DWORD PTR _j$2[ebp]
cmp eax, DWORD PTR _num_col$[ebp]
jge $LN4#main
; Line 16
movss xmm0, DWORD PTR __real#00000000
movss DWORD PTR _sum$[ebp], xmm0
; Line 17
mov DWORD PTR _k$1[ebp], 0
jmp SHORT $LN3#main
$LN2#main:
mov eax, DWORD PTR _k$1[ebp]
add eax, 1
mov DWORD PTR _k$1[ebp], eax
$LN3#main:
mov eax, DWORD PTR _k$1[ebp]
cmp eax, DWORD PTR _num_row$[ebp]
jge SHORT $LN1#main
; Line 18
mov eax, DWORD PTR _i$3[ebp]
imul eax, DWORD PTR _num_col$[ebp]
add eax, DWORD PTR _k$1[ebp]
mov ecx, DWORD PTR _k$1[ebp]
imul ecx, DWORD PTR _num_col$[ebp]
add ecx, DWORD PTR _j$2[ebp]
movss xmm0, DWORD PTR _mat_a$[ebp+eax*4]
mulss xmm0, DWORD PTR _mat_b$[ebp+ecx*4]
addss xmm0, DWORD PTR _sum$[ebp]
movss DWORD PTR _sum$[ebp], xmm0
; Line 19
jmp SHORT $LN2#main
$LN1#main:
; Line 20
mov eax, DWORD PTR _i$3[ebp]
imul eax, DWORD PTR _num_col$[ebp]
lea ecx, DWORD PTR _result$[ebp+eax*4]
mov edx, DWORD PTR _j$2[ebp]
movss xmm0, DWORD PTR _sum$[ebp]
movss DWORD PTR [ecx+edx*4], xmm0
; Line 21
jmp $LN5#main
$LN4#main:
; Line 22
jmp $LN8#main
$LN7#main:
; Line 24
xor eax, eax
; Line 25
push edx
mov ecx, ebp
push eax
lea edx, DWORD PTR $LN16#main
call #_RTC_CheckStackVars#8
pop eax
pop edx
pop edi
pop esi
pop ebx
mov ecx, DWORD PTR __$ArrayPad$[ebp]
xor ecx, ebp
call #__security_check_cookie#4
mov esp, ebp
pop ebp
ret 0
npad 1
$LN16#main:
DD 3
DD $LN15#main
$LN15#main:
DD -72 ; ffffffb8H
DD 64 ; 00000040H
DD $LN12#main
DD -144 ; ffffff70H
DD 64 ; 00000040H
DD $LN13#main
DD -216 ; ffffff28H
DD 64 ; 00000040H
DD $LN14#main
$LN14#main:
DB 114 ; 00000072H
DB 101 ; 00000065H
DB 115 ; 00000073H
DB 117 ; 00000075H
DB 108 ; 0000006cH
DB 116 ; 00000074H
DB 0
$LN13#main:
DB 109 ; 0000006dH
DB 97 ; 00000061H
DB 116 ; 00000074H
DB 95 ; 0000005fH
DB 98 ; 00000062H
DB 0
$LN12#main:
DB 109 ; 0000006dH
DB 97 ; 00000061H
DB 116 ; 00000074H
DB 95 ; 0000005fH
DB 97 ; 00000061H
DB 0
_main ENDP
_TEXT ENDS
END
Visual Studio and SSE are a red herring here (as is the C++ vs. C nonsense). Assuming you compile in Release mode, there are other reasons your code is inefficient, especially for large matrices. The main reason is that it's cache unfriendly. To make your code efficient for an arbitrary n*n matrix you need to optimize for both small and large sizes.
It's important to optimize for the cache BEFORE employing SIMD or threads. In the code below I use block multiplication to speed up your code for a 1024x1024 matrix by more than a factor of ten (7.1 s with the old code, 0.6 s with the new) using only a single thread and no SSE/AVX. It's not going to do any good to use SIMD if your code is memory bound.
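To make the memory-bound point concrete: the naive inner loop does only two floating-point operations (one multiply, one add) per iteration, but the read of b[k*n+j] strides through memory n floats at a time, so for a large matrix nearly every iteration pulls in a fresh 64-byte cache line. The arithmetic units are starved for data long before SIMD width becomes the limiting factor.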
I have already described a first-order improvement to matrix multiplication using the transpose here:
OpenMP C++ Matrix Multiplication run slower in parallel
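The core idea there is to transpose b once up front so that the innermost loop reads both operands sequentially. A minimal sketch of that approach (gemm_transpose and its details are my illustration, not code from the linked answer):

#include <stdlib.h>

void gemm_transpose(const float *a, const float *b, float *c, int n) {
    float *bt = (float*)malloc(sizeof(float)*n*n);   // bt = transpose of b
    for(int i=0; i<n; i++)
        for(int j=0; j<n; j++)
            bt[j*n+i] = b[i*n+j];
    for(int i=0; i<n; i++) {
        for(int j=0; j<n; j++) {
            float sum = 0.0f;
            for(int k=0; k<n; k++)
                sum += a[i*n+k]*bt[j*n+k];           // both streams are now unit-stride
            c[i*n+j] = sum;
        }
    }
    free(bt);
}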
But let me describe an even more cache-friendly method. Let's assume your hardware has two types of memory:
small and fast,
large and slow.
In reality, modern CPUs have several levels of this (L1 small and fast, L2 larger and slower, L3 even larger and slower, main memory larger and slower still; some CPUs even have an L4), but this simple two-level model will still lead to a big improvement in performance.
Using this model with two types of memory you can show that you will get the best performance by dividing your matrix into square tiles which fit in the small and fast memory and doing block matrix multiplication. Next you want to rearrange the memory so that the elements of each tile are contiguous.
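To make the tile size concrete: with 4-byte floats, a 64x64 tile is 64*64*4 = 16 KB, so the three tiles one block multiply touches at once (one each from a, b and c) come to 48 KB, which fits comfortably in a typical 256 KB L2 cache, though not in a 32 KB L1.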
Below is some code showing how to do this. I used a block size of 64x64 on a 1024x1024 matrix. It took 7 s with your code and 0.65 s with mine. The matrix size has to be a multiple of the block size, but it's easy to extend this to an arbitrary size matrix (a sketch of zero-padding to the next multiple of the block size follows the code below). If you want to see an example of how to optimize the blocks themselves, see Difference in performance between MSVC and GCC for highly optimized matrix multplication code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>
void reorder(float *a, float *b, int n, int bs) {
    int nb = n/bs;   // number of blocks per dimension
    int cnt = 0;
    // Copy a into b so that each bs x bs tile is contiguous in memory.
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int i2=0; i2<bs; i2++) {
                for(int j2=0; j2<bs; j2++) {
                    b[cnt++] = a[bs*(i*n+j) + i2*n + j2];
                }
            }
        }
    }
}
void gemm_slow(float *a, float *b, float *c, int n) {
    // Naive triple loop; b is read with stride n, which thrashes the cache.
    for(int i=0; i<n; i++) {
        for(int j=0; j<n; j++) {
            float sum = c[i*n+j];
            for(int k=0; k<n; k++) {
                sum += a[i*n+k]*b[k*n+j];
            }
            c[i*n+j] = sum;
        }
    }
}
void gemm_block(float *a, float *b, float *c, int n, int n2) {
    // Multiply one n2 x n2 tile: a and c use the full row stride n,
    // while b points into the reordered copy, so its stride is n2.
    for(int i=0; i<n2; i++) {
        for(int j=0; j<n2; j++) {
            float sum = c[i*n+j];
            for(int k=0; k<n2; k++) {
                sum += a[i*n+k]*b[k*n2+j];
            }
            c[i*n+j] = sum;
        }
    }
}
void gemm(float *a, float *b, float *c, int n, int bs) {
    int nb = n/bs;
    float *b2 = (float*)malloc(sizeof(float)*n*n);
    reorder(b, b2, n, bs);   // make each tile of b contiguous
    for(int i=0; i<nb; i++) {
        for(int j=0; j<nb; j++) {
            for(int k=0; k<nb; k++) {
                gemm_block(&a[bs*(i*n+k)], &b2[bs*bs*(k*nb+j)], &c[bs*(i*n+j)], n, bs);
            }
        }
    }
    free(b2);
}
int main() {
    const int bs = 64;
    const int n = 1024;
    float *a  = new float[n*n];
    float *b  = new float[n*n];
    float *c1 = new float[n*n]();   // value-initialized to zero
    float *c2 = new float[n*n]();
    for(int i=0; i<n*n; i++) {
        a[i] = 1.0*rand()/RAND_MAX;
        b[i] = 1.0*rand()/RAND_MAX;
    }

    double dtime = omp_get_wtime();
    gemm_slow(a, b, c1, n);
    dtime = omp_get_wtime() - dtime;
    printf("%f\n", dtime);

    dtime = omp_get_wtime();
    gemm(a, b, c2, n, bs);
    dtime = omp_get_wtime() - dtime;
    printf("%f\n", dtime);

    printf("%d\n", memcmp(c1, c2, sizeof(float)*n*n));
    delete[] a; delete[] b; delete[] c1; delete[] c2;
    return 0;
}
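Note that omp_get_wtime() requires building with OpenMP enabled (/openmp for MSVC, -fopenmp for GCC), even though this version runs on a single thread.

To handle a matrix size that is not a multiple of the block size, one option is to zero-pad each matrix up to the next multiple of bs and run the same blocked code on the padded copies; padding with zeros leaves the product of the original submatrix unchanged. A sketch of that approach (pad and gemm_any are my names, not part of the code above):

#include <stdlib.h>
#include <string.h>

float *pad(const float *a, int n, int np) {
    float *p = (float*)calloc(np*np, sizeof(float));   // zero-filled np x np copy
    for(int i=0; i<n; i++)
        memcpy(&p[i*np], &a[i*n], sizeof(float)*n);
    return p;
}

void gemm_any(const float *a, const float *b, float *c, int n, int bs) {
    int np = (n + bs - 1)/bs*bs;                       // round n up to a multiple of bs
    float *ap = pad(a, n, np);
    float *bp = pad(b, n, np);
    float *cp = (float*)calloc(np*np, sizeof(float));
    gemm(ap, bp, cp, np, bs);                          // blocked multiply on the padded copies
    for(int i=0; i<n; i++)                             // copy the valid n x n region back
        memcpy(&c[i*n], &cp[i*np], sizeof(float)*n);
    free(ap); free(bp); free(cp);
}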
Related
Loop unroll issue with Visual Studio compiler
Differences in custom and std fetch_add on floats
GCC std::sin vectorization bug?
Microsoft VS 2015 compiler bug?
Why does adding extra check in loop make big difference on some machines, and small difference on others?