Why does the OpenMP SIMD directive reduce performance? - fortran

I am learning how to use SIMD directives with OpenMP/Fortran. I wrote this simple code:
program loop
  implicit none
  integer :: i,j
  real*8 :: x
  x = 0.0
  do i=1,10000
    do j=1,10000000
      x = x + 1.0/(1.0*i)
    enddo
  enddo
  print*, x
end program loop
When I compile and run this code I get:
ifort -O3 -vec-report3 -xhost loop_simd.f90
loop_simd.f90(10): (col. 12) remark: LOOP WAS VECTORIZED
loop_simd.f90(9): (col. 7) remark: loop was not vectorized: not inner loop
time ./a.out
97876060.8355515
real 0m8.940s
user 0m8.937s
sys 0m0.005s
I did what the compiler suggested about the "not inner loop" remark and added a SIMD collapse(2) directive:
program loop
  implicit none
  integer :: i,j
  real*8 :: x
  x = 0.0
  !$omp simd collapse(2) reduction(+:x)
  do i=1,10000
    do j=1,10000000
      x = x + 1.0/(1.0*i)
    enddo
  enddo
  print*, x
end program loop
Then I compiled and ran the code again and got the following output:
ifort -O3 -vec-report3 -openmp -xhost loop_simd.f90
loop_simd.f90(8): (col. 7) remark: OpenMP SIMD LOOP WAS VECTORIZED
time ./a.out
97876054.9903757
real 0m26.535s
user 0m26.540s
sys 0m0.003s
What I don't understand is why the performance decreases with SIMD. And when will SIMD be better than standard Fortran code?
ASM output for the OpenMP code:
.section .text
.LNDBG_TX:
# mark_description "Intel(R) Fortran Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 14.0.2.144 Build 2";
# mark_description "0140120";
# mark_description "-O3 -vec-report3 -openmp -xhost -S";
.file "loop_simd.f90"
.text
..TXTST0:
L__routine_start_MAIN___0:
# -- Begin MAIN__
# mark_begin;
.align 16,0x90
.globl MAIN__
MAIN__:
..B1.1: # Preds ..B1.0
..___tag_value_MAIN__.1: #1.9
..LN0:
.file 1 "loop_simd.f90"
.loc 1 1 is_stmt 1
pushq %rbp #1.9
..___tag_value_MAIN__.3: #
..LN1:
movq %rsp, %rbp #1.9
..___tag_value_MAIN__.4: #
..LN2:
andq $-128, %rsp #1.9
..LN3:
subq $128, %rsp #1.9
..LN4:
movq $0x0000117fe, %rsi #1.9
..LN5:
movl $3, %edi #1.9
..LN6:
call __intel_new_feature_proc_init #1.9
..LN7:
# LOE rbx r12 r13 r14 r15
..B1.12: # Preds ..B1.1
..LN8:
vstmxcsr (%rsp) #1.9
..LN9:
movl $.2.3_2_kmpc_loc_struct_pack.1, %edi #1.9
..LN10:
xorl %esi, %esi #1.9
..LN11:
orl $32832, (%rsp) #1.9
..LN12:
xorl %eax, %eax #1.9
..LN13:
vldmxcsr (%rsp) #1.9
..___tag_value_MAIN__.6: #1.9
..LN14:
call __kmpc_begin #1.9
..___tag_value_MAIN__.7: #
..LN15:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.12
..LN16:
movl $__NLITPACK_0.0.1, %edi #1.9
..LN17:
call for_set_reentrancy #1.9
..LN18:
# LOE rbx r12 r13 r14 r15
..B1.3: # Preds ..B1.2
..LN19:
.loc 1 8 is_stmt 1
movl $4, %eax #8.7
..LN20:
.loc 1 6 is_stmt 1
vxorpd %ymm2, %ymm2, %ymm2 #6.7
..LN21:
.loc 1 8 is_stmt 1
vmovd %eax, %xmm0 #8.7
..LN22:
xorl %eax, %eax #8.7
..LN23:
vpshufd $0, %xmm0, %xmm1 #8.7
..LN24:
vmovdqu .L_2il0floatpacket.19(%rip), %xmm0 #8.7
..LN25:
# LOE rbx r12 r13 r14 r15 eax xmm0 xmm1 ymm2
..B1.4: # Preds ..B1.6 ..B1.3
..LN26:
.loc 1 11 is_stmt 1
vcvtdq2ps %xmm0, %xmm3 #11.34
..LN27:
vrcpps %xmm3, %xmm5 #11.28
..LN28:
vmulps %xmm3, %xmm5, %xmm4 #11.28
..LN29:
vaddps %xmm5, %xmm5, %xmm6 #11.28
..LN30:
vmulps %xmm5, %xmm4, %xmm7 #11.28
..LN31:
.loc 1 10 is_stmt 1
xorl %edx, %edx #10.12
..LN32:
.loc 1 11 is_stmt 1
vsubps %xmm7, %xmm6, %xmm8 #11.28
..LN33:
vcvtps2pd %xmm8, %ymm3 #11.28
..LN34:
# LOE rbx r12 r13 r14 r15 eax edx xmm0 xmm1 ymm2 ymm3
..B1.5: # Preds ..B1.5 ..B1.4
..LN35:
.loc 1 10 is_stmt 1
incl %edx #10.12
..LN36:
.loc 1 11 is_stmt 1
vaddpd %ymm3, %ymm2, %ymm2 #11.17
..LN37:
.loc 1 10 is_stmt 1
cmpl $10000000, %edx #10.12
..LN38:
jb ..B1.5 # Prob 99% #10.12
..LN39:
# LOE rbx r12 r13 r14 r15 eax edx xmm0 xmm1 ymm2 ymm3
..B1.6: # Preds ..B1.5
..LN40:
.loc 1 8 is_stmt 1
addl $4, %eax #8.7
..LN41:
.loc 1 10 is_stmt 1
vpaddd %xmm1, %xmm0, %xmm0 #10.12
..LN42:
.loc 1 8 is_stmt 1
cmpl $10000, %eax #8.7
..LN43:
jb ..B1.4 # Prob 66% #8.7
..LN44:
# LOE rbx r12 r13 r14 r15 eax xmm0 xmm1 ymm2
..B1.7: # Preds ..B1.6
..LN45:
.loc 1 6 is_stmt 1
..LN46:
.loc 1 15 is_stmt 1
lea (%rsp), %rdi #15.7
..LN47:
.loc 1 6 is_stmt 1
vextractf128 $1, %ymm2, %xmm0 #6.7
..LN48:
.loc 1 15 is_stmt 1
movl $-1, %esi #15.7
..LN49:
.loc 1 6 is_stmt 1
vaddpd %xmm0, %xmm2, %xmm1 #6.7
..LN50:
vunpckhpd %xmm1, %xmm1, %xmm3 #6.7
..LN51:
.loc 1 15 is_stmt 1
lea 64(%rsp), %r8 #15.7
..LN52:
movq $0x1208384ff00, %rdx #15.7
..LN53:
movl $__STRLITPACK_0.0.1, %ecx #15.7
..LN54:
xorl %eax, %eax #15.7
..LN55:
.loc 1 6 is_stmt 1
vaddsd %xmm3, %xmm1, %xmm4 #6.7
..LN56:
.loc 1 15 is_stmt 1
vmovsd %xmm4, 64(%rsp) #15.7
..LN57:
movq $0, (%rsp) #15.7
..LN58:
vzeroupper #15.7
..LN59:
call for_write_seq_lis #15.7
..LN60:
# LOE rbx r12 r13 r14 r15
..B1.8: # Preds ..B1.7
..LN61:
.loc 1 18 is_stmt 1
movl $.2.3_2_kmpc_loc_struct_pack.12, %edi #18.1
..LN62:
xorl %eax, %eax #18.1
..___tag_value_MAIN__.8: #18.1
..LN63:
call __kmpc_end #18.1
..___tag_value_MAIN__.9: #
..LN64:
# LOE rbx r12 r13 r14 r15
..B1.9: # Preds ..B1.8
..LN65:
movl $1, %eax #18.1
..LN66:
movq %rbp, %rsp #18.1
..LN67:
popq %rbp #18.1
..___tag_value_MAIN__.10: #
..LN68:
ret #18.1
.align 16,0x90
..___tag_value_MAIN__.12: #
..LN69:
# LOE
..LN70:
# mark_end;
.type MAIN__,#function
.size MAIN__,.-MAIN__
..LNMAIN__.71:
.LNMAIN__:
.data
.align 4
.align 4
.2.3_2_kmpc_loc_struct_pack.1:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.0
.align 4
.2.3_2__kmpc_loc_pack.0:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 77
.byte 65
.byte 73
.byte 78
.byte 95
.byte 95
.byte 59
.byte 49
.byte 59
.byte 49
.byte 59
.byte 59
.space 3, 0x00 # pad
.align 4
.2.3_2_kmpc_loc_struct_pack.12:
.long 0
.long 2
.long 0
.long 0
.quad .2.3_2__kmpc_loc_pack.11
.align 4
.2.3_2__kmpc_loc_pack.11:
.byte 59
.byte 117
.byte 110
.byte 107
.byte 110
.byte 111
.byte 119
.byte 110
.byte 59
.byte 77
.byte 65
.byte 73
.byte 78
.byte 95
.byte 95
.byte 59
.byte 49
.byte 56
.byte 59
.byte 49
.byte 56
.byte 59
.byte 59
.section .rodata, "a"
.align 16
.align 8
__NLITPACK_0.0.1:
.long 0x00000002,0x00000000
.align 4
__STRLITPACK_0.0.1:
.byte 48
.byte 1
.byte 1
.byte 0
.byte 0
.data
# -- End MAIN__
.section .rodata, "a"
.space 3, 0x00 # pad
.align 16
.L_2il0floatpacket.19:
.long 0x00000001,0x00000002,0x00000003,0x00000004
.type .L_2il0floatpacket.19,#object
.size .L_2il0floatpacket.19,16
.align 16
.L_2il0floatpacket.20:
.long 0x3f800000,0x3f800000,0x3f800000,0x3f800000
.type .L_2il0floatpacket.20,#object
.size .L_2il0floatpacket.20,16
.data
.section .note.GNU-stack, ""
# End
ASM output for the non-OpenMP code:
.section .text
.LNDBG_TX:
# mark_description "Intel(R) Fortran Intel(R) 64 Compiler XE for applications running on Intel(R) 64, Version 14.0.2.144 Build 2";
# mark_description "0140120";
# mark_description "-O3 -vec-report3 -xhost -S";
.file "loop_simd.f90"
.text
..TXTST0:
L__routine_start_MAIN___0:
# -- Begin MAIN__
# mark_begin;
.align 16,0x90
.globl MAIN__
MAIN__:
..B1.1: # Preds ..B1.0
..___tag_value_MAIN__.1: #1.9
..LN0:
.file 1 "loop_simd.f90"
.loc 1 1 is_stmt 1
pushq %rbp #1.9
..___tag_value_MAIN__.3: #
..LN1:
movq %rsp, %rbp #1.9
..___tag_value_MAIN__.4: #
..LN2:
andq $-128, %rsp #1.9
..LN3:
subq $128, %rsp #1.9
..LN4:
movq $0x0000117fe, %rsi #1.9
..LN5:
movl $3, %edi #1.9
..LN6:
call __intel_new_feature_proc_init #1.9
..LN7:
# LOE rbx r12 r13 r14 r15
..B1.10: # Preds ..B1.1
..LN8:
vstmxcsr (%rsp) #1.9
..LN9:
movl $__NLITPACK_0.0.1, %edi #1.9
..LN10:
orl $32832, (%rsp) #1.9
..LN11:
vldmxcsr (%rsp) #1.9
..LN12:
call for_set_reentrancy #1.9
..LN13:
# LOE rbx r12 r13 r14 r15
..B1.2: # Preds ..B1.10
..LN14:
.loc 1 6 is_stmt 1
..LN15:
.loc 1 11 is_stmt 1
vmovss .L_2il0floatpacket.0(%rip), %xmm6 #11.28
..LN16:
.loc 1 9 is_stmt 1
xorl %eax, %eax #9.7
..LN17:
.loc 1 6 is_stmt 1
vxorpd %ymm8, %ymm8, %ymm8 #6.7
..LN18:
vmovapd %ymm8, %ymm7 #6.7
..LN19:
vmovapd %ymm8, %ymm0 #6.7
..LN20:
vmovapd %ymm8, %ymm1 #6.7
..LN21:
vmovapd %ymm8, %ymm2 #6.7
..LN22:
vmovapd %ymm8, %ymm3 #6.7
..LN23:
vmovapd %ymm8, %ymm4 #6.7
..LN24:
vmovapd %ymm8, %ymm5 #6.7
..LN25:
# LOE rbx r12 r13 r14 r15 eax xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8
..B1.3: # Preds ..B1.5 ..B1.2
..LN26:
incl %eax #
..LN27:
.loc 1 11 is_stmt 1
vxorps %xmm9, %xmm9, %xmm9 #11.28
..LN28:
vcvtsi2ss %eax, %xmm9, %xmm9 #11.28
..LN29:
vdivss %xmm9, %xmm6, %xmm10 #11.28
..LN30:
vcvtss2sd %xmm10, %xmm10, %xmm10 #11.28
..LN31:
vmovddup %xmm10, %xmm11 #11.28
..LN32:
.loc 1 10 is_stmt 1
xorl %edx, %edx #10.12
..LN33:
.loc 1 11 is_stmt 1
vinsertf128 $1, %xmm11, %ymm11, %ymm9 #11.28
..LN34:
# LOE rbx r12 r13 r14 r15 eax edx xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8 ymm9
..B1.4: # Preds ..B1.4 ..B1.3
..LN35:
.loc 1 10 is_stmt 1
addl $32, %edx #10.12
..LN36:
.loc 1 11 is_stmt 1
vaddpd %ymm9, %ymm8, %ymm8 #11.17
..LN37:
vaddpd %ymm7, %ymm9, %ymm7 #11.17
..LN38:
vaddpd %ymm0, %ymm9, %ymm0 #11.17
..LN39:
vaddpd %ymm1, %ymm9, %ymm1 #11.17
..LN40:
vaddpd %ymm2, %ymm9, %ymm2 #11.17
..LN41:
vaddpd %ymm3, %ymm9, %ymm3 #11.17
..LN42:
vaddpd %ymm4, %ymm9, %ymm4 #11.17
..LN43:
vaddpd %ymm5, %ymm9, %ymm5 #11.17
..LN44:
.loc 1 10 is_stmt 1
cmpl $10000000, %edx #10.12
..LN45:
jb ..B1.4 # Prob 99% #10.12
..LN46:
# LOE rbx r12 r13 r14 r15 eax edx xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8 ymm9
..B1.5: # Preds ..B1.4
..LN47:
.loc 1 9 is_stmt 1
cmpl $10000, %eax #9.7
..LN48:
jb ..B1.3 # Prob 66% #9.7
..LN49:
# LOE rbx r12 r13 r14 r15 eax xmm6 ymm0 ymm1 ymm2 ymm3 ymm4 ymm5 ymm7 ymm8
..B1.6: # Preds ..B1.5
..LN50:
.loc 1 6 is_stmt 1
vaddpd %ymm7, %ymm8, %ymm6 #6.7
..LN51:
.loc 1 15 is_stmt 1
lea (%rsp), %rdi #15.7
..LN52:
.loc 1 6 is_stmt 1
vaddpd %ymm1, %ymm0, %ymm0 #6.7
..LN53:
vaddpd %ymm3, %ymm2, %ymm1 #6.7
..LN54:
vaddpd %ymm5, %ymm4, %ymm2 #6.7
..LN55:
vaddpd %ymm0, %ymm6, %ymm3 #6.7
..LN56:
vaddpd %ymm2, %ymm1, %ymm4 #6.7
..LN57:
vaddpd %ymm4, %ymm3, %ymm5 #6.7
..LN58:
.loc 1 15 is_stmt 1
movl $-1, %esi #15.7
..LN59:
movq $0x1208384ff00, %rdx #15.7
..LN60:
movl $__STRLITPACK_0.0.1, %ecx #15.7
..LN61:
xorl %eax, %eax #15.7
..LN62:
lea 64(%rsp), %r8 #15.7
..LN63:
movq $0, (%rsp) #15.7
..LN64:
.loc 1 6 is_stmt 1
vextractf128 $1, %ymm5, %xmm7 #6.7
..LN65:
vaddpd %xmm7, %xmm5, %xmm8 #6.7
..LN66:
vunpckhpd %xmm8, %xmm8, %xmm9 #6.7
..LN67:
vaddsd %xmm9, %xmm8, %xmm10 #6.7
..LN68:
.loc 1 15 is_stmt 1
vmovsd %xmm10, 64(%rsp) #15.7
..LN69:
vzeroupper #15.7
..LN70:
call for_write_seq_lis #15.7
..LN71:
# LOE rbx r12 r13 r14 r15
..B1.7: # Preds ..B1.6
..LN72:
.loc 1 18 is_stmt 1
movl $1, %eax #18.1
..LN73:
movq %rbp, %rsp #18.1
..LN74:
popq %rbp #18.1
..___tag_value_MAIN__.6: #
..LN75:
ret #18.1
.align 16,0x90
..___tag_value_MAIN__.8: #
..LN76:
# LOE
..LN77:
# mark_end;
.type MAIN__,#function
.size MAIN__,.-MAIN__
..LNMAIN__.78:
.LNMAIN__:
.section .rodata, "a"
.align 8
.align 8
__NLITPACK_0.0.1:
.long 0x00000000,0x00000000
.align 4
__STRLITPACK_0.0.1:
.byte 48
.byte 1
.byte 1
.byte 0
.byte 0
.data
# -- End MAIN__
.section .rodata, "a"
.space 3, 0x00 # pad
.align 4
.L_2il0floatpacket.0:
.long 0x3f800000
.type .L_2il0floatpacket.0,#object
.size .L_2il0floatpacket.0,4
.data
.section .note.GNU-stack, ""
# End

With OpenMP, Ifort is using SIMD to vectorize the outer loop (over i), so essentially all the time is spent doing
## set up ymm3 with 4 copies of 1.0/(1.0*i),
# and j = %edx = 0
..B1.5: do {
incl %edx # j++
vaddpd %ymm3, %ymm2, %ymm2 # ymm3 + ymm2 => ymm2
cmpl $10000000, %edx } while(j<10000000);
jb ..B1.5 # Prob 99%
10M iterations of vaddpd will completely dominate the cost of everything outside the loop, so all that matters is that this inner-loop is executed 10k / 4 times. (note the add $4, %eax / cmp $10000, %eax / jb, with a branch target back to before the inner loop.)
Since it's only using a single accumulator, throughput is limited by the loop-carried dependency (3 cycles).
Without OpenMP:
It's still doing the full amount of work, not optimizing away any of the loops.
It auto-vectorizes much like the OpenMP SIMD version, but using multiple accumulators for increased parallelism. Multiple add instructions can be in flight at once, instead of each one depending on the previous.
The setup for the inner loop is very similar, and then the inner loop is:
## set up ymm3 with 4 copies of 1.0/(1.0*i),
..B1.4:
addl $32, %edx #10.12
vaddpd %ymm9, %ymm8, %ymm8 # ymm8 + ymm9 => ymm8
vaddpd %ymm7, %ymm9, %ymm7 # ymm7 + ymm9 => ymm7
vaddpd %ymm0, %ymm9, %ymm0 # ymm0 + ymm9 => ymm0
vaddpd %ymm1, %ymm9, %ymm1 # ...
vaddpd %ymm2, %ymm9, %ymm2
vaddpd %ymm3, %ymm9, %ymm3
vaddpd %ymm4, %ymm9, %ymm4
vaddpd %ymm5, %ymm9, %ymm5
cmpl $10000000, %edx
jb ..B1.4 # Prob 99%
# then combine the 8 vector accumulators down to one, and horizontal sum that.
8 accumulators can keep 8 vaddpds in flight at once, which is more than enough to hide the 3-cycle latency on Intel SnB/IvB (see Agner Fog's insn tables). You didn't say what microarchitecture you're using, but one can infer Sandybridge/Ivybridge from the fact that -xhost uses AVX1 but not AVX2 (broadcast with vmovddup / vinsertf128, rather than AVX2 vbroadcastsd %xmm9, %ymm9).
This perfectly explains the 3x speed ratio: 26.535 / 8.940 = 2.97 ~= 3. (vaddpd has a throughput of one per clock on pre-Skylake Intel CPUs, latency=3. This version is limited by throughput rather than latency, because of the increased instruction-level parallelism).
Unrolling with this many accumulators will help for Skylake, where FP add has 4 cycle latency and two per cycle throughput. (SKL dropped the lower-latency dedicated vector FP add unit from port 1, and runs it in the improved 4c-latency FMA units on ports 0 and 1.)
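As a rough sanity check of that explanation (assuming a clock speed around 3 GHz, which the question does not state), the cycle counts work out like this:
OpenMP SIMD version, latency-bound: (10000/4 outer iterations) * 10^7 inner iterations * 3 cycles per vaddpd = 7.5 * 10^10 cycles, roughly 25 seconds at 3 GHz.
Auto-vectorized version, throughput-bound: 10000 outer iterations * (10^7/32 inner iterations) * 8 vaddpds per iteration at 1 cycle each = 2.5 * 10^10 cycles, roughly 8.3 seconds at 3 GHz.
That lines up with the measured 26.5 s and 8.9 s, and reproduces the ~3x ratio.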

You may be better off using SIMD for the inner loop only. Then you can use !$OMP PARALLEL DO on the outer loop.
As the i belongs to the outer loop, you could/should probably also swap the outer and inner loops.
If you allocated another variable for the 1.0/(1.0*i) term, then perhaps that could be vectorised. The reduction would then work on that new variable rather than on a value that is private in OMP.
These things usually take a bit to work out...
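A minimal sketch of that kind of restructuring, assuming you want threads over i and SIMD only on the inner loop, with the 1.0/(1.0*i) term hoisted into its own variable (untested, the names are illustrative):
program loop
  implicit none
  integer :: i, j
  real*8  :: x, inv_i
  x = 0.0
  !$omp parallel do private(inv_i) reduction(+:x)
  do i = 1, 10000
    inv_i = 1.0/(1.0*i)          ! hoisted: constant over the whole inner loop
    !$omp simd reduction(+:x)
    do j = 1, 10000000
      x = x + inv_i
    enddo
  enddo
  !$omp end parallel do
  print*, x
end program loop
Compile with -openmp (or -qopenmp on newer ifort) so the directives are honored; without that flag they are treated as comments.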

SIMD instructions are intended to improve the performance of code that operates on vectors or arrays. Your sample code only operates on a scalar variable, so it is unsurprising that forcing vectorization does not improve performance!
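For contrast, the kind of loop the directive is aimed at looks more like the purely illustrative example below (arrays and sizes made up, not taken from the question), where each iteration does independent element-wise work:
program axpy
  implicit none
  integer, parameter :: n = 1000000
  integer :: i
  real*8, allocatable :: a(:), b(:), c(:)
  allocate(a(n), b(n), c(n))
  a = 1.0d0
  b = 2.0d0
  !$omp simd
  do i = 1, n
    c(i) = a(i) + 2.5d0*b(i)   ! independent per-element updates map directly onto SIMD lanes
  enddo
  print*, c(1), c(n)
end program axpy
Here there is real data-level parallelism to exploit, so vectorization (with or without the directive) gives a genuine speedup instead of just changing how a scalar accumulator is updated.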

Related

Problems reverse engineering cpp / asm code (just for the sake of learning)

I am experimenting with disassembling C++ binaries. I wrote this utterly simple code:
#include <iostream>
int main() {
    int i = 0; int i2 = 0;
    for (int i = 0; i < 1000000; i++) { i2++; std::cout << "\n" << i2; }
    return 0;
}
I then compiled it with g++ using something like:
g++ .cpp -o .cpp.bin
I then ran:
objdump -d .cpp.bin
Here's what I extracted:
;1lim.cpp.bin: file format elf64-x86-64
;Disassembly of section .init:
_init:
endbr64
sub $0x8,%rsp
mov 0x2fd9(%rip),%rax
test %rax,%rax
je 1016 <_init+0x16>
call *%rax
add $0x8,%rsp
ret
;Disassembly of section .plt:
.plt:
push 0x2f7a(%rip)
bnd jmp *0x2f7b(%rip)
nopl (%rax)
endbr64
push $0x0
bnd jmp 1020 <_init+0x20>
nop
endbr64
push $0x1
bnd jmp 1020 <_init+0x20>
nop
endbr64
push $0x2
bnd jmp 1020 <_init+0x20>
nop
endbr64
push $0x3
bnd jmp 1020 <_init+0x20>
nop
;Disassembly of section .plt.got:
__cxa_finalize#plt:
endbr64
bnd jmp *0x2f55(%rip)
nopl 0x0(%rax,%rax,1)
;Disassembly of section .plt.sec:
__cxa_atexit#plt:
endbr64
bnd jmp *0x2f25(%rip)
nopl 0x0(%rax,%rax,1)
_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#plt:
endbr64
bnd jmp *0x2f1d(%rip)
nopl 0x0(%rax,%rax,1)
_ZNSt8ios_base4InitC1Ev#plt:
endbr64
bnd jmp *0x2f15(%rip)
nopl 0x0(%rax,%rax,1)
_ZNSolsEi#plt:
endbr64
bnd jmp *0x2f0d(%rip)
nopl 0x0(%rax,%rax,1)
;Disassembly of section .text:
_start:
endbr64
xor %ebp,%ebp
mov %rdx,%r9
pop %rsi
mov %rsp,%rdx
and $0xfffffffffffffff0,%rsp
push %rax
push %rsp
xor %r8d,%r8d
xor %ecx,%ecx
lea 0xca(%rip),%rdi
call *0x2ef3(%rip)
hlt
cs nopw 0x0(%rax,%rax,1)
deregister_tm_clones:
lea 0x2f19(%rip),%rdi
lea 0x2f12(%rip),%rax
cmp %rdi,%rax
je 1118 <deregister_tm_clones+0x28>
mov 0x2ed6(%rip),%rax
test %rax,%rax
je 1118 <deregister_tm_clones+0x28>
jmp *%rax
nopl 0x0(%rax)
ret
nopl 0x0(%rax)
register_tm_clones:
lea 0x2ee9(%rip),%rdi
lea 0x2ee2(%rip),%rsi
sub %rdi,%rsi
mov %rsi,%rax
shr $0x3f,%rsi
sar $0x3,%rax
add %rax,%rsi
sar %rsi
je 1158 <register_tm_clones+0x38>
mov 0x2ea5(%rip),%rax
test %rax,%rax
je 1158 <register_tm_clones+0x38>
jmp *%rax
nopw 0x0(%rax,%rax,1)
ret
nopl 0x0(%rax)
__do_global_dtors_aux:
endbr64
cmpb $0x0,0x2fe5(%rip)
jne 1198 <__do_global_dtors_aux+0x38>
push %rbp
cmpq $0x0,0x2e5a(%rip)
mov %rsp,%rbp
je 1187 <__do_global_dtors_aux+0x27>
mov 0x2e86(%rip),%rdi
call 1070 <__cxa_finalize#plt>
call 10f0 <deregister_tm_clones>
movb $0x1,0x2fbd(%rip)
pop %rbp
ret
nopl (%rax)
ret
nopl 0x0(%rax)
frame_dummy:
endbr64
jmp 1120 <register_tm_clones>
main:
endbr64
push %rbp
mov %rsp,%rbp
sub $0x10,%rsp
movl $0x0,-0x4(%rbp)
movl $0x0,-0xc(%rbp)
movl $0x0,-0x8(%rbp)
jmp 11fd <main+0x54>
addl $0x1,-0xc(%rbp)
lea 0xe2d(%rip),%rax
mov %rax,%rsi
lea 0x2e5f(%rip),%rax
mov %rax,%rdi
call 1090 <_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#plt>
mov %rax,%rdx
mov -0xc(%rbp),%eax
mov %eax,%esi
mov %rdx,%rdi
call 10b0 <_ZNSolsEi#plt>
addl $0x1,-0x8(%rbp)
cmpl $0xf423f,-0x8(%rbp)
jle 11cc <main+0x23>
mov $0x0,%eax
leave
ret
_Z41__static_initialization_and_destruction_0ii:
endbr64
push %rbp
mov %rsp,%rbp
sub $0x10,%rsp
mov %edi,-0x4(%rbp)
mov %esi,-0x8(%rbp)
cmpl $0x1,-0x4(%rbp)
jne 1260 <_Z41__static_initialization_and_destruction_0ii+0x53>
cmpl $0xffff,-0x8(%rbp)
jne 1260 <_Z41__static_initialization_and_destruction_0ii+0x53>
lea 0x2f1c(%rip),%rax
mov %rax,%rdi
call 10a0 <_ZNSt8ios_base4InitC1Ev#plt>
lea 0x2dc4(%rip),%rax
mov %rax,%rdx
lea 0x2f03(%rip),%rax
mov %rax,%rsi
mov 0x2da0(%rip),%rax
mov %rax,%rdi
call 1080 <__cxa_atexit#plt>
nop
leave
ret
_GLOBAL__sub_I_main:
endbr64
push %rbp
mov %rsp,%rbp
mov $0xffff,%esi
mov $0x1,%edi
call 120d <_Z41__static_initialization_and_destruction_0ii>
pop %rbp
ret
;Disassembly of section .fini:
_fini:
endbr64
sub $0x8,%rsp
add $0x8,%rsp
ret
I am now trying to assemble it using the following:
nasm -f elf64 .asm
How can I possibly fix the assembly code in order to compile it with NASM? (It's already a slightly modified version of what I got from objdump.)
The answer that I found to this question was the following: instead of running the preceding commands, use:
gcc -S .cpp
this will produce the following code:
.file "1lim.cpp"
.text
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata
.LC0:
.string "\n"
.text
.globl main
.type main, #function
main:
.LFB1731:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl $0, -4(%rbp)
movl $0, -12(%rbp)
movl $0, -8(%rbp)
jmp .L2
.L3:
addl $1, -12(%rbp)
leaq .LC0(%rip), %rax
movq %rax, %rsi
leaq _ZSt4cout(%rip), %rax
movq %rax, %rdi
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#PLT
movq %rax, %rdx
movl -12(%rbp), %eax
movl %eax, %esi
movq %rdx, %rdi
call _ZNSolsEi#PLT
addl $1, -8(%rbp)
.L2:
cmpl $999999, -8(%rbp)
jle .L3
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1731:
.size main, .-main
.type _Z41__static_initialization_and_destruction_0ii, #function
_Z41__static_initialization_and_destruction_0ii:
.LFB2229:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
cmpl $1, -4(%rbp)
jne .L7
cmpl $65535, -8(%rbp)
jne .L7
leaq _ZStL8__ioinit(%rip), %rax
movq %rax, %rdi
call _ZNSt8ios_base4InitC1Ev#PLT
leaq __dso_handle(%rip), %rax
movq %rax, %rdx
leaq _ZStL8__ioinit(%rip), %rax
movq %rax, %rsi
movq _ZNSt8ios_base4InitD1Ev#GOTPCREL(%rip), %rax
movq %rax, %rdi
call __cxa_atexit#PLT
.L7:
nop
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2229:
.size _Z41__static_initialization_and_destruction_0ii, .-_Z41__static_initialization_and_destruction_0ii
.type _GLOBAL__sub_I_main, #function
_GLOBAL__sub_I_main:
.LFB2230:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl $65535, %esi
movl $1, %edi
call _Z41__static_initialization_and_destruction_0ii
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2230:
.size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I_main
.hidden __dso_handle
.ident "GCC: (Ubuntu 11.2.0-19ubuntu1) 11.2.0"
.section .note.GNU-stack,"",#progbits
.section .note.gnu.property,"a"
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
0:
.string "GNU"
1:
.align 8
.long 0xc0000002
.long 3f - 2f
2:
.long 0x3
3:
.align 8
4:
To then compile it, it's just a matter of doing the following:
g++ .s -o .s.bin
and to then run it:
./.s.bin
Now my question is:
how can I do the same with executables or binaries?
Apparently, the answer lies in using a tool called objconv, which converts C++ binaries into assembly code in a correct way. Apparently, it can be installed on any operating system using the Anaconda package and environment managers.
Cheers

ja used for a signed number instead of jg [duplicate]

I am reading a book about assembly switch statements; the code has cases/branches for the inputs n = 100, 102, 103, 104, 106. It simplifies the jump table by subtracting 100 from n: if the result is above 6, it goes to the default case at .L2; otherwise it jumps to the branch that matches the value in %eax.
And my questions are:
If so, isn't line 7 supposed to be jmp *.L7(,%eax) if the index of the jump table is held in %eax?
And why did they change the number into unsigned in line 5 by doing ja .L2?
int x is at %ebp+8, int n is at %ebp+12
movl 8(%ebp), %edx
movl 12(%ebp), %eax
subl $100, %eax
cmpl $6, %eax
ja .L2
jmp *.L7(,%eax,4)
.L2:
movl $0, %eax
jmp .L8
.L5:
movl %edx, %eax
jmp .L9
.L3:
leal (%edx,%edx,2), %eax
leal (%edx,%eax,4), %eax
jmp .L8
.L4:
leal 10(%edx), %eax
.L9:
addl $11, %eax
jmp .L8
.L6:
movl %edx, %eax
imull %edx, %eax
.L8:
Jump table:
.section .rodata
.align 4
.L7:
.long .L3 //Case 100: loc_A
.long .L2 //Case 101: loc_def
.long .L4 //Case 102: loc_B
.long .L5 //Case 103: loc_C
.long .L6 //Case 104: loc_D
.long .L2 //Case 105: loc_def
.long .L6 //Case 106: loc_D
isn't line 7 supposed to be jmp *.L7(,%eax) if the index of the jump table is held in %eax?
Each entry in the jump table is a long, which is 4 bytes. Hence eax is scaled by 4.
And why did they change the number into unsigned in line 5 by doing ja .L2?
The point is to exclude any number that's less than 100 and greater than 106. I assume it's obvious how it excludes values greater than 106.
So let's say n was less than 100, e.g. 99. If we then subtract 100 from that we get -1, which when viewed as an unsigned 32-bit value is 4294967295, which is obviously "above" 6, and the jump to .L2 is taken like it should.
subl $100, %eax ; eax = 99-100 == -1
cmpl $6, %eax ; set flags based on -1 - 6 == -7 => ZF=0 and CF=0
ja .L2 ; jump if ZF=0 and CF=0

Optimization of naive matrix multiplication (ICC vs GCC)

The code below uses a very straightforward approach to calculate the matrix product a * b and store the result in c. The code was compiled with -O3 on both GCC 4.4.6 (with -mtune=native) and Intel Compiler 13.0.1 and the speed on GCC is significantly worse (over a factor of two for the sample data used).
I'm curious about the cause of these differences, but unfortunately I'm not familiar enough with the assembly output to understand what's going on here. From a glance it looks as though ICC is doing a better job at vectorizing the computation, but I can't decipher much more than that. (This is mainly for learning purposes since there's no way I would use this in production!)
void __attribute__ ((noinline)) mm( // Line 3
    int n,
    double*__restrict__ c,
    double*__restrict__ a,
    double*__restrict__ b
) {
    int i, j, k;
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            c[i + n * j] = 0; // Line 12
            for (k = 0; k < n; k++) {
                c[i + n * j] += a[i + n * k] * b[k + n * j]; // Line 14
            }
        }
    }
}
Here is the output from GCC:
_Z2mmiPdS_S_:
.LFB0:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
pushq %r14 #
.cfi_def_cfa_offset 16
.cfi_offset 14, -16
testl %edi, %edi # n
movq %rcx, %r14 # b, b
pushq %r13 #
.cfi_def_cfa_offset 24
.cfi_offset 13, -24
pushq %r12 #
.cfi_def_cfa_offset 32
.cfi_offset 12, -32
pushq %rbp #
.cfi_def_cfa_offset 40
.cfi_offset 6, -40
pushq %rbx #
.cfi_def_cfa_offset 48
.cfi_offset 3, -48
jle .L6 #,
leal -1(%rdi), %eax #, tmp96
movslq %edi, %r11 # n, n
movq %rdx, %rbx # a, ivtmp.54
xorl %r12d, %r12d # ivtmp.67
salq $3, %r11 #, D.2193
xorl %ebp, %ebp # prephitmp.37
leaq 8(,%rax,8), %r13 #, D.2208
.L3:
leaq (%rsi,%r12), %r10 #, ivtmp.61
movq %r14, %rcx # b, ivtmp.63
xorl %edx, %edx # j
.p2align 4,,10
.p2align 3
.L5:
movq $0, (%r10) #,* ivtmp.61
movq %rbp, -8(%rsp) # prephitmp.37,
movq %rcx, %r9 # ivtmp.63, ivtmp.70
movsd -8(%rsp), %xmm1 #, prephitmp.37
movq %rbx, %r8 # ivtmp.54, ivtmp.69
xorl %eax, %eax # k
.p2align 4,,10
.p2align 3
.L4:
movsd (%r8), %xmm0 #* ivtmp.69, tmp99
addl $1, %eax #, k
addq %r11, %r8 # D.2193, ivtmp.69
mulsd (%r9), %xmm0 #* ivtmp.70, tmp99
addq $8, %r9 #, ivtmp.70
cmpl %edi, %eax # n, k
addsd %xmm0, %xmm1 # tmp99, prephitmp.37
movsd %xmm1, (%r10) # prephitmp.37,* ivtmp.61
jne .L4 #,
addl $1, %edx #, j
addq %r11, %r10 # D.2193, ivtmp.61
addq %r11, %rcx # D.2193, ivtmp.63
cmpl %edi, %edx # n, j
jne .L5 #,
addq $8, %r12 #, ivtmp.67
addq $8, %rbx #, ivtmp.54
cmpq %r13, %r12 # D.2208, ivtmp.67
jne .L3 #,
.L6:
popq %rbx #
.cfi_def_cfa_offset 40
popq %rbp #
.cfi_def_cfa_offset 32
popq %r12 #
.cfi_def_cfa_offset 24
popq %r13 #
.cfi_def_cfa_offset 16
popq %r14 #
.cfi_def_cfa_offset 8
ret
.cfi_endproc
And here is the output from ICC:
# -- Begin _Z2mmiPdS_S_
# mark_begin;
.align 16,0x90
.globl _Z2mmiPdS_S_
_Z2mmiPdS_S_:
# parameter 1: %edi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %rcx
..B1.1: # Preds ..B1.0
..___tag_value__Z2mmiPdS_S_.1: #8.3
pushq %r12 #8.3
..___tag_value__Z2mmiPdS_S_.3: #
pushq %r13 #8.3
..___tag_value__Z2mmiPdS_S_.5: #
pushq %r14 #8.3
..___tag_value__Z2mmiPdS_S_.7: #
pushq %r15 #8.3
..___tag_value__Z2mmiPdS_S_.9: #
pushq %rbx #8.3
..___tag_value__Z2mmiPdS_S_.11: #
pushq %rbp #8.3
..___tag_value__Z2mmiPdS_S_.13: #
subq $72, %rsp #8.3
..___tag_value__Z2mmiPdS_S_.15: #
movq %rsi, %r9 #
movslq %edi, %rax #
xorl %r10d, %r10d #11.9
testl %edi, %edi #11.25
jle ..B1.7 # Prob 10% #11.25
# LOE rax rdx rcx rbx rbp rsi r9 r12 r13 r14 r15 edi r10d
..B1.2: # Preds ..B1.1
movl %edi, %r11d #10.5
lea (,%rax,8), %r8 #
andl $-4, %r11d #10.5
movq %rax, %r14 #12.28
movslq %r11d, %r11 #10.5
movl %edi, %r12d #12.28
movq %rsi, 8(%rsp) #12.28
movq %r8, %rbp #12.28
movq %rdx, 32(%rsp) #12.28
movq %r9, %r13 #12.28
movq %rcx, (%rsp) #12.28
movl %r10d, %r15d #12.28
pxor %xmm0, %xmm0 #12.28
movq %r11, %rbx #12.28
# LOE rbx rbp r13 r14 r12d r15d
..B1.3: # Preds ..B1.5 ..B1.48 ..B1.45 ..B1.2
cmpl $12, %r12d #10.5
jle ..B1.38 # Prob 0% #10.5
# LOE rbx rbp r13 r14 r12d r15d
..B1.4: # Preds ..B1.3
movq %r13, %rdi #12.13
xorl %esi, %esi #12.13
movq %rbp, %rdx #12.13
call _intel_fast_memset #12.13
# LOE rbx rbp r13 r14 r12d r15d
..B1.5: # Preds ..B1.4
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
# LOE rbx rbp r13 r14 r12d r15d
..B1.6: # Preds ..B1.48 ..B1.45 ..B1.5 # Infreq
movl %r12d, %edi #
movq %r14, %rax #
movq 8(%rsp), %rsi #
testl %edi, %edi #11.25
movq 32(%rsp), %rdx #
movq (%rsp), %rcx #
# LOE rax rdx rcx rbx rbp rsi r12 r13 r14 r15 edi
..B1.7: # Preds ..B1.1 ..B1.6 # Infreq
movl $0, %r9d #11.9
movl $0, %r8d #
jle ..B1.33 # Prob 10% #11.25
# LOE rax rdx rcx rbx rbp rsi r8 r12 r13 r14 r15 edi r9d
..B1.8: # Preds ..B1.7 # Infreq
movq %rdx, 32(%rsp) #
# LOE rax rcx rsi r8 edi r9d
..B1.9: # Preds ..B1.31 ..B1.8 # Infreq
xorl %r12d, %r12d #
lea (%rsi,%r8,8), %r13 #14.17
movq %r13, %r15 #10.5
xorl %ebx, %ebx #13.13
andq $15, %r15 #10.5
xorl %r10d, %r10d #
movl %r15d, %r14d #10.5
lea (%rcx,%r8,8), %rbp #14.48
andl $7, %r14d #10.5
xorl %r11d, %r11d #
movl %r14d, 48(%rsp) #
xorl %edx, %edx #
movl %r15d, 56(%rsp) #
movq %r13, 40(%rsp) #
movq %r8, 16(%rsp) #
movl %r9d, 24(%rsp) #
movq %rsi, 8(%rsp) #
movq %rcx, (%rsp) #
movq 32(%rsp), %r14 #
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.10: # Preds ..B1.30 ..B1.9 # Infreq
cmpq $8, %rax #10.5
jl ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.11: # Preds ..B1.10 # Infreq
movl 56(%rsp), %r9d #10.5
testl %r9d, %r9d #10.5
je ..B1.14 # Prob 50% #10.5
# LOE rax rdx rbp r9 r10 r12 r14 ebx edi r11d
..B1.12: # Preds ..B1.11 # Infreq
cmpl $0, 48(%rsp) #10.5
jne ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.13: # Preds ..B1.12 # Infreq
movl $1, %r9d #10.5
# LOE rax rdx rbp r9 r10 r12 r14 ebx edi r11d
..B1.14: # Preds ..B1.13 ..B1.11 # Infreq
movl %r9d, %r13d #10.5
lea 8(%r13), %rcx #10.5
cmpq %rcx, %rax #10.5
jl ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r9 r10 r12 r13 r14 ebx edi r11d
..B1.15: # Preds ..B1.14 # Infreq
movl %edi, %r15d #10.5
xorl %ecx, %ecx #10.5
subl %r9d, %r15d #10.5
movslq %r11d, %r8 #14.33
andl $7, %r15d #10.5
negl %r15d #10.5
addl %edi, %r15d #10.5
movslq %r15d, %r15 #10.5
testq %r13, %r13 #10.5
lea (%r14,%r8,8), %rsi #14.33
jbe ..B1.35 # Prob 0% #10.5
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d
..B1.16: # Preds ..B1.15 # Infreq
movsd (%r10,%rbp), %xmm0 #14.48
movq 40(%rsp), %r14 #14.48
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.17: # Preds ..B1.17 ..B1.16 # Infreq
movsd (%rsi,%rcx,8), %xmm1 #14.33
mulsd %xmm0, %xmm1 #14.48
addsd (%r14,%rcx,8), %xmm1 #14.17
movsd %xmm1, (%r14,%rcx,8) #14.17
incq %rcx #10.5
cmpq %r13, %rcx #10.5
jb ..B1.17 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.18: # Preds ..B1.17 # Infreq
movq 32(%rsp), %r14 #
# LOE rax rdx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.19: # Preds ..B1.18 ..B1.35 # Infreq
addq %r9, %r8 #14.33
lea (%r14,%r8,8), %rcx #14.33
testq $15, %rcx #10.5
je ..B1.23 # Prob 60% #10.5
# LOE rax rdx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.20: # Preds ..B1.19 # Infreq
movq 40(%rsp), %rcx #14.48
unpcklpd %xmm0, %xmm0 #14.48
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.21: # Preds ..B1.21 ..B1.20 # Infreq
movsd (%rsi,%r13,8), %xmm1 #14.33
movsd 16(%rsi,%r13,8), %xmm2 #14.33
movsd 32(%rsi,%r13,8), %xmm3 #14.33
movsd 48(%rsi,%r13,8), %xmm4 #14.33
movhpd 8(%rsi,%r13,8), %xmm1 #14.33
movhpd 24(%rsi,%r13,8), %xmm2 #14.33
movhpd 40(%rsi,%r13,8), %xmm3 #14.33
movhpd 56(%rsi,%r13,8), %xmm4 #14.33
mulpd %xmm0, %xmm1 #14.48
mulpd %xmm0, %xmm2 #14.48
mulpd %xmm0, %xmm3 #14.48
mulpd %xmm0, %xmm4 #14.48
addpd (%rcx,%r13,8), %xmm1 #14.17
addpd 16(%rcx,%r13,8), %xmm2 #14.17
addpd 32(%rcx,%r13,8), %xmm3 #14.17
addpd 48(%rcx,%r13,8), %xmm4 #14.17
movaps %xmm1, (%rcx,%r13,8) #14.17
movaps %xmm2, 16(%rcx,%r13,8) #14.17
movaps %xmm3, 32(%rcx,%r13,8) #14.17
movaps %xmm4, 48(%rcx,%r13,8) #14.17
addq $8, %r13 #10.5
cmpq %r15, %r13 #10.5
jb ..B1.21 # Prob 82% #10.5
jmp ..B1.26 # Prob 100% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.23: # Preds ..B1.19 # Infreq
movq 40(%rsp), %rcx #14.48
unpcklpd %xmm0, %xmm0 #14.48
.align 16,0x90
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.24: # Preds ..B1.24 ..B1.23 # Infreq
movaps (%rsi,%r13,8), %xmm1 #14.33
movaps 16(%rsi,%r13,8), %xmm2 #14.33
movaps 32(%rsi,%r13,8), %xmm3 #14.33
movaps 48(%rsi,%r13,8), %xmm4 #14.33
mulpd %xmm0, %xmm1 #14.48
mulpd %xmm0, %xmm2 #14.48
mulpd %xmm0, %xmm3 #14.48
mulpd %xmm0, %xmm4 #14.48
addpd (%rcx,%r13,8), %xmm1 #14.17
addpd 16(%rcx,%r13,8), %xmm2 #14.17
addpd 32(%rcx,%r13,8), %xmm3 #14.17
addpd 48(%rcx,%r13,8), %xmm4 #14.17
movaps %xmm1, (%rcx,%r13,8) #14.17
movaps %xmm2, 16(%rcx,%r13,8) #14.17
movaps %xmm3, 32(%rcx,%r13,8) #14.17
movaps %xmm4, 48(%rcx,%r13,8) #14.17
addq $8, %r13 #10.5
cmpq %r15, %r13 #10.5
jb ..B1.24 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.26: # Preds ..B1.24 ..B1.21 ..B1.34 # Infreq
cmpq %rax, %r15 #10.5
jae ..B1.30 # Prob 0% #10.5
# LOE rax rdx rbp r10 r12 r14 r15 ebx edi r11d
..B1.27: # Preds ..B1.26 # Infreq
movsd (%rbp,%r12,8), %xmm0 #14.48
lea (%r14,%rdx,8), %rcx #14.33
movq 40(%rsp), %rsi #14.48
# LOE rax rdx rcx rbp rsi r10 r12 r14 r15 ebx edi r11d xmm0
..B1.28: # Preds ..B1.28 ..B1.27 # Infreq
movsd (%rcx,%r15,8), %xmm1 #14.33
mulsd %xmm0, %xmm1 #14.48
addsd (%rsi,%r15,8), %xmm1 #14.17
movsd %xmm1, (%rsi,%r15,8) #14.17
incq %r15 #10.5
cmpq %rax, %r15 #10.5
jb ..B1.28 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r14 r15 ebx edi r11d xmm0
..B1.30: # Preds ..B1.28 ..B1.26 # Infreq
incl %ebx #13.13
addq %rax, %rdx #13.13
addl %edi, %r11d #13.13
addq $8, %r10 #13.13
incq %r12 #13.13
cmpl %edi, %ebx #13.13
jb ..B1.10 # Prob 82% #13.13
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.31: # Preds ..B1.30 # Infreq
movl 24(%rsp), %r9d #
incl %r9d #11.9
movq 16(%rsp), %r8 #
addq %rax, %r8 #11.9
movq 8(%rsp), %rsi #
cmpl %edi, %r9d #11.9
movq (%rsp), %rcx #
jb ..B1.9 # Prob 82% #11.9
# LOE rax rcx rsi r8 edi r9d
..B1.33: # Preds ..B1.31 ..B1.7 # Infreq
addq $72, %rsp #18.1
..___tag_value__Z2mmiPdS_S_.16: #
popq %rbp #18.1
..___tag_value__Z2mmiPdS_S_.18: #
popq %rbx #18.1
..___tag_value__Z2mmiPdS_S_.20: #
popq %r15 #18.1
..___tag_value__Z2mmiPdS_S_.22: #
popq %r14 #18.1
..___tag_value__Z2mmiPdS_S_.24: #
popq %r13 #18.1
..___tag_value__Z2mmiPdS_S_.26: #
popq %r12 #18.1
..___tag_value__Z2mmiPdS_S_.28: #
ret #18.1
..___tag_value__Z2mmiPdS_S_.29: #
# LOE
..B1.34: # Preds ..B1.10 ..B1.14 ..B1.12 # Infreq
xorl %r15d, %r15d #10.5
jmp ..B1.26 # Prob 100% #10.5
# LOE rax rdx rbp r10 r12 r14 r15 ebx edi r11d
..B1.35: # Preds ..B1.15 # Infreq
movsd (%rbp,%r12,8), %xmm0 #14.48
jmp ..B1.19 # Prob 100% #14.48
# LOE rax rdx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.38: # Preds ..B1.3 # Infreq
cmpq $4, %r14 #10.5
jl ..B1.47 # Prob 10% #10.5
# LOE rbx rbp r13 r14 r12d r15d
..B1.39: # Preds ..B1.38 # Infreq
xorl %esi, %esi #10.5
movq %rbx, %rdx #10.5
movq %r13, %rcx #
xorl %eax, %eax #
pxor %xmm0, %xmm0 #
# LOE rax rdx rcx rbx rbp rsi r13 r14 r12d r15d xmm0
..B1.40: # Preds ..B1.40 ..B1.39 # Infreq
addq $4, %rsi #10.5
movq %rax, (%rcx) #12.13
movhpd %xmm0, 8(%rcx) #12.13
movq %rax, 16(%rcx) #12.13
movhpd %xmm0, 24(%rcx) #12.13
addq $32, %rcx #10.5
cmpq %rbx, %rsi #10.5
jb ..B1.40 # Prob 82% #10.5
# LOE rax rdx rcx rbx rbp rsi r13 r14 r12d r15d xmm0
..B1.42: # Preds ..B1.40 ..B1.47 # Infreq
cmpq %r14, %rdx #10.5
jae ..B1.48 # Prob 0% #10.5
# LOE rdx rbx rbp r13 r14 r12d r15d
..B1.43: # Preds ..B1.42 # Infreq
xorl %ecx, %ecx #
# LOE rdx rcx rbx rbp r13 r14 r12d r15d
..B1.44: # Preds ..B1.44 ..B1.43 # Infreq
movq %rcx, (%r13,%rdx,8) #12.13
incq %rdx #10.5
cmpq %r14, %rdx #10.5
jb ..B1.44 # Prob 82% #10.5
# LOE rdx rcx rbx rbp r13 r14 r12d r15d
..B1.45: # Preds ..B1.44 # Infreq
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
jmp ..B1.6 # Prob 100% #11.9
# LOE rbx rbp r13 r14 r12d r15d
..B1.47: # Preds ..B1.38 # Infreq
xorl %edx, %edx #10.5
jmp ..B1.42 # Prob 100% #10.5
# LOE rdx rbx rbp r13 r14 r12d r15d
..B1.48: # Preds ..B1.42 # Infreq
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
jmp ..B1.6 # Prob 100% #11.9
.align 16,0x90
..___tag_value__Z2mmiPdS_S_.36: #
# LOE rbx rbp r13 r14 r12d r15d
# mark_end;
.type _Z2mmiPdS_S_,#function
.size _Z2mmiPdS_S_,.-_Z2mmiPdS_S_
.data
# -- End _Z2mmiPdS_S_
Edit: With the help of Olaf Dietsche, it looks like the code below can run much faster with GCC 4.8.2, though still a bit (~30%) slower than Intel. The main difference is that the initialization is done ahead of time (this by itself makes no difference) and the loop ordering has been interchanged (this makes a major difference for GCC).
memset(c, 0, n * n * sizeof(double)); // note: the size argument needs sizeof(double), not just n * n
for (j = 0; j < n; j++) {
    for (k = 0; k < n; k++) {
        for (i = 0; i < n; i++) {
            c[i + n * j] += a[i + n * k] * b[k + n * j]; // Line 14
        }
    }
}
Your code seems to be wrong or not suitable for vectorization.
When I modify your code according to this blog post Performance – GCC & auto-vectorization
int i, j, k;
for (i = 0; i < n; i++) {
    for (j = 0; j < n; j++) {
        for (k = 0; k < n; k++) {
            c[n * i + k] += a[n * i + j] * b[n * j + k]; // Line 14
        }
    }
}
and compile it with
gcc-4.8 -O3 -S a.c
it uses at least some SIMD instructions
.L8:
movsd (%rcx), %xmm1
addl $1, %r8d
movsd (%rdx,%rsi), %xmm2
unpcklpd %xmm1, %xmm1
movhpd 8(%rdx,%rsi), %xmm2
movsd (%rax,%rsi), %xmm0
mulpd %xmm2, %xmm1
movhpd 8(%rax,%rsi), %xmm0
addpd %xmm1, %xmm0
movlpd %xmm0, (%rax,%rsi)
movhpd %xmm0, 8(%rax,%rsi)
addq $16, %rsi
cmpl %r8d, %ebx
ja .L8
cmpl %edi, %r15d
je .L9
although not as much as ICC does.
Update:
Adding -funroll-loops enlarges the generated assembly code substantially to about the length of your posted ICC assembly.
It looks like the Intel compiler is using SIMD instructions (mulpd, addpd, movaps, etc.) -- it's able to perform more than one operation (i.e. both a = b*c and d = e*f) in a single clock cycle, whereas the GCC code would take two to do the same. I'm not sure if it's possible to enable these operations automatically in GCC, but you can hand-write them with some work.
It seems like the flags -msse, -msse2, -msse3 to GCC cause it to attempt to do some SIMD optimization of its own.
I'm not sure if ICC really produced faster code in this case because I didn't run any actual benchmarks. But you can tell GCC to unroll the loops with -funroll-loops. The output will be longer, will contain lots of xmm's and will look faster.
Neither icc nor gcc necessarily optimizes the degree of unrolling. To match them, you would use e.g.
gcc -funroll-loops --param max-unroll-times=4
icc -unroll4
as gcc tends to unroll more than optimum for CPUs of the last 8 years (if allowed to do so) while icc is more conservative.
Also glossed over above is that icc -O3 encourages the compiler to optimize loop nesting, and may even engage the special -opt-matmul facility.
The original form implies a dot product reduction inner loop, for which gcc might require both -ffast-math and a more modern choice for -march= in order to optimize. icc is much more aggressive about riffling a dot product (batching into multiple sums), if it can't avoid it by switching loop nest.
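Putting those suggestions together, a plausible starting point for this loop on a reasonably recent gcc would be something like the following (untested; -march=native stands in here for "a more modern choice for -march="):
gcc -O3 -march=native -ffast-math -funroll-loops --param max-unroll-times=4 -S a.c
-ffast-math is what permits gcc to reassociate the dot-product reduction into multiple partial sums, and the unroll limit keeps it from over-unrolling as described above.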

Performance of C++11 modern-style loops vs old-style loops

This is the first question I'm posting here, so I hope I won't do anything wrong.
My question concerns the performance of modern-style C++11 loops (std::for_each, range-based for) vs old-style C++ loops (for (...; ...; ...)). From what I understood, it seems to me that the motto of modern C++ is "expressivity with no compromise on performance". Modern C++ style leads to safe, clean, and fast code with little to no performance penalty and, possibly, with a performance gain over old-style C++.
Now I've made a little test to assess how big this gain is concerning loops. First I wrote the following three functions:
#include <algorithm>
#include <ctime>
#include <iostream>
#include <vector>

using namespace std;

void foo(vector<double>& v)
{
    for (size_t i = 0; i < v.size(); i++)
    {
        v[i] /= 42;
    }
}

void bar(vector<double>& v)
{
    for (auto& x : v)
    {
        x /= 42;
    }
}

void wee(vector<double>& v)
{
    for_each(begin(v), end(v), [] (double& x)
    {
        x /= 42;
    });
}
Then I compared their performance by calling them this way (properly commenting/uncommenting the three lines inside main()'s loop):
vector<double> make_vector()
{
    vector<double> v;
    for (int i = 0; i < 30000; i++) { v.push_back(i); }
    return v;
}

int main()
{
    time_t start = clock();
    auto v = make_vector();
    for (int i = 0; i <= 50000; i++)
    {
        // UNCOMMENT THE FUNCTION CALL TO BE TESTED, COMMENT THE OTHERS
        foo(v);
        // bar(v);
        // wee(v);
    }
    time_t end = clock();
    cout << (end - start) << endl;
    return 0;
}
Averaging over 10 executions of each version of the program obtained by commenting/uncommenting the lines in main()'s loop, and using the old-style loop as a baseline, the range-based for loop performs ~1.9x worse, and the loop based on std::for_each and lambdas performs ~2.3x worse.
I used Clang 3.2 to compile this, and I haven't tried MS VC11 (I'm working on WinXP).
Considering my expectation of getting comparable execution times, my questions are:
Did I do something obviously wrong?
If not, couldn't a 2x performance penalty be a good reason NOT to embrace modern-style loops?
I would like to remark, that I do believe that the clarity and safety of code written in modern C++ style pay off for a possible performance loss, but I quite disagree with the statement that there is no trade-off between clarity/safety on one side and performance on the other side.
Am I missing something?
It looks like the difference only shows up when you do not enable optimisations in your compiler.
With Clang you can enable optimisation with the -O[0-3] flag.
Mankarse is right - most likely you have not enabled optimizations.
Actually, on Clang they produce practically the same ASM code in the main loop, with only small differences in the pre/post code.
I have tested four versions: hand_loop_index, hand_loop_iterator, range_based_for, for_each_algorithm.
hand_loop_iterator, range_based_for and for_each_algorithm all produce exactly the same ASM for the full function body; the only difference is in the names of the labels.
I.e. a hand-written for loop with iterators results in exactly the same ASM code as range-based for and std::for_each.
There are some differences between the index-based and iterator-based versions.
The main loop in both cases is almost the same. The only minor difference is that the iterator version(s) use the rdx register instead of rsi.
Index version:
.LBB0_7: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rsi), %xmm1
movupd -32(%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rsi)
movupd %xmm2, -32(%rsi)
movupd -16(%rsi), %xmm1
movupd (%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rsi)
movupd %xmm2, (%rsi)
addq $64, %rsi
addq $-8, %rdi
jne .LBB0_7
Iterator version(s):
.LBB1_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB1_6
The pre/post code for the index vs iterator versions has many differences, but it should not greatly affect the total timing for large enough arrays.
LIVE DEMO on Coliru with ASM output
#include <algorithm>
#include <iterator>
#include <vector>

using namespace std;

void hand_loop_index(vector<double> &v)
{
    for (size_t i = 0; i < v.size(); ++i)
    {
        v[i] /= 42;
    }
}

void hand_loop_iterator(vector<double> &v)
{
    for (auto first = begin(v), last = end(v); first != last; ++first)
    {
        *first /= 42;
    }
}

void range_based_for(vector<double> &v)
{
    for (auto &x : v)
    {
        x /= 42;
    }
}

void for_each_algorithm(vector<double> &v)
{
    for_each(begin(v), end(v), [] (double &x)
    {
        x /= 42;
    });
}
Result ASM:
# clang++ -std=c++1z -O3 -Wall -pedantic -pthread main.cpp -S
.text
.file "main.cpp"
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI0_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI0_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z15hand_loop_indexRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z15hand_loop_indexRSt6vectorIdSaIdEE,#function
_Z15hand_loop_indexRSt6vectorIdSaIdEE: # #_Z15hand_loop_indexRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rax
movq 8(%rdi), %rcx
subq %rax, %rcx
je .LBB0_11
# BB#1: # %.lr.ph
sarq $3, %rcx
cmpq $1, %rcx
movl $1, %edx
cmovaq %rcx, %rdx
xorl %edi, %edi
testq %rdx, %rdx
je .LBB0_10
# BB#2: # %overflow.checked
xorl %edi, %edi
movq %rdx, %r8
andq $-4, %r8
je .LBB0_9
# BB#3: # %vector.body.preheader
cmpq $1, %rcx
movl $1, %edi
cmovaq %rcx, %rdi
addq $-4, %rdi
movq %rdi, %rsi
shrq $2, %rsi
xorl %r9d, %r9d
btq $2, %rdi
jb .LBB0_5
# BB#4: # %vector.body.prol
movupd (%rax), %xmm0
movupd 16(%rax), %xmm1
movapd .LCPI0_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rax)
movupd %xmm1, 16(%rax)
movl $4, %r9d
.LBB0_5: # %vector.body.preheader.split
testq %rsi, %rsi
je .LBB0_8
# BB#6: # %vector.body.preheader.split.split
cmpq $1, %rcx
movl $1, %edi
cmovaq %rcx, %rdi
andq $-4, %rdi
subq %r9, %rdi
leaq 48(%rax,%r9,8), %rsi
movapd .LCPI0_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB0_7: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rsi), %xmm1
movupd -32(%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rsi)
movupd %xmm2, -32(%rsi)
movupd -16(%rsi), %xmm1
movupd (%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rsi)
movupd %xmm2, (%rsi)
addq $64, %rsi
addq $-8, %rdi
jne .LBB0_7
.LBB0_8:
movq %r8, %rdi
.LBB0_9: # %middle.block
cmpq %rdi, %rdx
je .LBB0_11
.align 16, 0x90
.LBB0_10: # %scalar.ph
# =>This Inner Loop Header: Depth=1
movsd (%rax,%rdi,8), %xmm0 # xmm0 = mem[0],zero
divsd .LCPI0_1(%rip), %xmm0
movsd %xmm0, (%rax,%rdi,8)
incq %rdi
cmpq %rcx, %rdi
jb .LBB0_10
.LBB0_11: # %._crit_edge
retq
.Lfunc_end0:
.size _Z15hand_loop_indexRSt6vectorIdSaIdEE, .Lfunc_end0-_Z15hand_loop_indexRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI1_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI1_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z18hand_loop_iteratorRSt6vectorIdSaIdEE,#function
_Z18hand_loop_iteratorRSt6vectorIdSaIdEE: # #_Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB1_11
# BB#1: # %.lr.ph.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB1_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB1_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI1_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB1_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB1_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI1_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB1_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB1_6
.LBB1_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB1_8: # %middle.block
cmpq %rdi, %rcx
je .LBB1_11
# BB#9:
movsd .LCPI1_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB1_10: # %.lr.ph
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB1_10
.LBB1_11: # %._crit_edge
retq
.Lfunc_end1:
.size _Z18hand_loop_iteratorRSt6vectorIdSaIdEE, .Lfunc_end1-_Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI2_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI2_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z15range_based_forRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z15range_based_forRSt6vectorIdSaIdEE,#function
_Z15range_based_forRSt6vectorIdSaIdEE: # #_Z15range_based_forRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB2_11
# BB#1: # %.lr.ph.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB2_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB2_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI2_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB2_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB2_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI2_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB2_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB2_6
.LBB2_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB2_8: # %middle.block
cmpq %rdi, %rcx
je .LBB2_11
# BB#9:
movsd .LCPI2_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB2_10: # %.lr.ph
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB2_10
.LBB2_11: # %._crit_edge
retq
.Lfunc_end2:
.size _Z15range_based_forRSt6vectorIdSaIdEE, .Lfunc_end2-_Z15range_based_forRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI3_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI3_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z18for_each_algorithmRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z18for_each_algorithmRSt6vectorIdSaIdEE,#function
_Z18for_each_algorithmRSt6vectorIdSaIdEE: # #_Z18for_each_algorithmRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB3_11
# BB#1: # %.lr.ph.i.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB3_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB3_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI3_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB3_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB3_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI3_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB3_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB3_6
.LBB3_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB3_8: # %middle.block
cmpq %rdi, %rcx
je .LBB3_11
# BB#9:
movsd .LCPI3_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB3_10: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB3_10
.LBB3_11: # %_ZSt8for_eachIN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEEZ18for_each_algorithmR5_E3$_0ET0_T_SA_S9_.exit
retq
.Lfunc_end3:
.size _Z18for_each_algorithmRSt6vectorIdSaIdEE, .Lfunc_end3-_Z18for_each_algorithmRSt6vectorIdSaIdEE
.cfi_endproc
.ident "clang version 3.7.0 (tags/RELEASE_370/final 246979)"
.section ".note.GNU-stack","",#progbits

Why sin/cos are slower when optimizations are enabled?

After reading a question related to the performance of sin/cos (Why is std::sin() and std::cos() slower than sin() and cos()?), I made some tests with that code and found a weird thing: if I call sin/cos with a float value, it is much slower than with double when compiled with optimization.
#include <cmath>
#include <cstdio>

const int N = 4000;
float cosine[N][N];
float sine[N][N];

int main() {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float ang = i*j*2*M_PI/N;
            cosine[i][j] = cos(ang);
            sine[i][j] = sin(ang);
        }
    }
}
With the above code I get:
With -O0: 2.402s
With -O1: 9.004s
With -O2: 9.013s
With -O3: 9.001s
Now if I change
float ang = i*j*2*M_PI/N;
to
double ang = i*j*2*M_PI/N;
I get:
With -O0: 2.362s
With -O1: 1.188s
With -O2: 1.197s
With -O3: 1.197s
How can the first test be so much faster without optimizations?
I'm using g++ (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2, 64 bits.
EDIT: Changed the title to better describe the problem.
EDIT: Added assembly code
Assembly for first test with O0:
.file "main.cpp"
.globl cosine
.bss
.align 32
.type cosine, #object
.size cosine, 64000000
cosine:
.zero 64000000
.globl sine
.align 32
.type sine, #object
.size sine, 64000000
sine:
.zero 64000000
.text
.globl main
.type main, #function
main:
.LFB87:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
movq %rsp, %rbp
.cfi_offset 6, -16
.cfi_def_cfa_register 6
subq $16, %rsp
movl $0, -4(%rbp)
jmp .L2
.L5:
movl $0, -8(%rbp)
jmp .L3
.L4:
movl -4(%rbp), %eax
imull -8(%rbp), %eax
addl %eax, %eax
cvtsi2sd %eax, %xmm0
movsd .LC0(%rip), %xmm1
mulsd %xmm1, %xmm0
movsd .LC1(%rip), %xmm1
divsd %xmm1, %xmm0
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
movss %xmm0, -12(%rbp)
movss -12(%rbp), %xmm0
cvtps2pd %xmm0, %xmm0
call cos
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
movl -8(%rbp), %eax
cltq
movl -4(%rbp), %edx
movslq %edx, %rdx
imulq $4000, %rdx, %rdx
leaq (%rdx,%rax), %rax
movss %xmm0, cosine(,%rax,4)
movss -12(%rbp), %xmm0
cvtps2pd %xmm0, %xmm0
call sin
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
movl -8(%rbp), %eax
cltq
movl -4(%rbp), %edx
movslq %edx, %rdx
imulq $4000, %rdx, %rdx
leaq (%rdx,%rax), %rax
movss %xmm0, sine(,%rax,4)
addl $1, -8(%rbp)
.L3:
cmpl $3999, -8(%rbp)
setle %al
testb %al, %al
jne .L4
addl $1, -4(%rbp)
.L2:
cmpl $3999, -4(%rbp)
setle %al
testb %al, %al
jne .L5
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE87:
.size main, .-main
.section .rodata
.align 4
.type _ZL1N, #object
.size _ZL1N, 4
_ZL1N:
.long 4000
.align 8
.LC0:
.long 1413754136
.long 1074340347
.align 8
.LC1:
.long 0
.long 1085227008
.ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
.section .note.GNU-stack,"",#progbits
Assembly for first test with O3:
.file "main.cpp"
.text
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB121:
.cfi_startproc
pushq %r15
.cfi_def_cfa_offset 16
xorl %r15d, %r15d
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
movl $cosine+16000, %r14d
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
xorl %r13d, %r13d
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
pushq %rbp
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $24, %rsp
.cfi_def_cfa_offset 80
.p2align 4,,10
.p2align 3
.L2:
movslq %r15d, %rbp
.cfi_offset 3, -56
.cfi_offset 6, -48
.cfi_offset 12, -40
movl %r13d, %r12d
movl $0x3f800000, %edx
imulq $16000, %rbp, %rbp
xorl %eax, %eax
leaq cosine(%rbp), %rbx
addq $sine, %rbp
jmp .L5
.p2align 4,,10
.p2align 3
.L3:
movl %r12d, %eax
leaq 8(%rsp), %rsi
leaq 12(%rsp), %rdi
subl %r13d, %eax
cvtsi2sd %eax, %xmm0
mulsd .LC2(%rip), %xmm0
divsd .LC3(%rip), %xmm0
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
call sincosf
movl 8(%rsp), %edx
movl 12(%rsp), %eax
.L5:
movl %edx, (%rbx)
addq $4, %rbx
movl %eax, 0(%rbp)
addl %r13d, %r12d
addq $4, %rbp
cmpq %r14, %rbx
jne .L3
addl $1, %r15d
addl $2, %r13d
leaq 16000(%rbx), %r14
cmpl $4000, %r15d
jne .L2
addq $24, %rsp
.cfi_def_cfa_offset 56
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE121:
.size main, .-main
.globl cosine
.bss
.align 32
.type cosine, #object
.size cosine, 64000000
cosine:
.zero 64000000
.globl sine
.align 32
.type sine, #object
.size sine, 64000000
sine:
.zero 64000000
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC2:
.long 1413754136
.long 1074340347
.align 8
.LC3:
.long 0
.long 1085227008
.ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
.section .note.GNU-stack,"",#progbits
Here's a possibility:
In C, cos is double precision and cosf is single precision. In C++, std::cos has overloads for both double and single.
You aren't calling std::cos. If <cmath> doesn't also overload ::cos (as far as I know, it is not required to), then you are just calling the C double precision function. If this is the case, then you're suffering the cost of converting between float, double, and back.
Now, some standard libraries implement cos(float x) as (float)cos((double)x), so even if you are calling the float function it might still be doing conversions behind the scenes.
This shouldn't account for a 9x performance difference, though.
AFAIK it's because computers work at double precision natively. Using float requires conversions.