Slow std::string concatenation on windows - c++

I have a program that needs to concatenate lots of strings together (to be more precise, integers converted to strings). On my Ubuntu machine (running g++ 7.3.0) the code runs in 1.5 seconds. But the code needs to run on Windows as well (with g++ 6.3.0 using MinGW), where it takes 15 seconds to complete. Furthermore, the Ubuntu setup runs on a much slower laptop using an i7-4712MQ CPU @ 2.30GHz, whereas the Windows machine runs on an i7-7700K CPU @ 4.20GHz.
The code to reproduce the times is shown below. I compile the code with g++ tester.cpp -O2 -o tester (or tester.exe for windows)
#include <iostream>
#include <chrono>
// Benchmark driver: appends the decimal text of `a` followed by a single space
// to `str` n times, then prints the elapsed time in seconds.
// argc/argv are accepted but never read.
// NOTE(review): std::string and std::to_string are declared in <string>, which
// this snippet does not include directly; presumably it arrives transitively
// through <iostream> - confirm on each toolchain.
int main(int argc, char const *argv[]) {
// Start of the timed region.
auto started = std::chrono::high_resolution_clock::now();
std::string str = "";
const int n = 10000000;
// `a` is always 1, so every iteration adds exactly two characters ("1" and " ");
// reserving 2 * n up front covers the final length of `str`.
str.reserve(2 * n);
int a = 1;
for (int i = 0; i < n; ++i) {
// Builds a temporary string from to_string(a), appends " " to that temporary,
// and then appends the result to `str`. This statement is what is being timed.
str += std::to_string(a) + " ";
}
// End of the timed region.
auto done = std::chrono::high_resolution_clock::now();
// The duration is truncated to whole milliseconds first, then scaled to seconds.
double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000;
std::cout << "Done in " << secs << "\n";
return 0;
}
Any idea where the large performance gap might come from?
The disassemblies look like this:
Ubuntu:
.file "tester.cpp"
.text
.align 2
.p2align 4,,15
.type _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, #function
_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19:
.LFB2389:
.cfi_startproc
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
movq %rsi, %r12
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
movq %rdx, %rbx
movq %rdi, %rbp
subq %rsi, %rbx
subq $16, %rsp
.cfi_def_cfa_offset 48
movq %fs:40, %rax
movq %rax, 8(%rsp)
xorl %eax, %eax
cmpq $15, %rbx
movq %rbx, (%rsp)
ja .L12
movq (%rdi), %rdx
cmpq $1, %rbx
movq %rdx, %rax
jne .L4
movzbl (%rsi), %eax
movb %al, (%rdx)
movq (%rdi), %rdx
.L5:
movq (%rsp), %rax
movq %rax, 8(%rbp)
movb $0, (%rdx,%rax)
movq 8(%rsp), %rax
xorq %fs:40, %rax
jne .L13
addq $16, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 32
popq %rbx
.cfi_def_cfa_offset 24
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
ret
.L12:
.cfi_restore_state
xorl %edx, %edx
movq %rsp, %rsi
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm#PLT
movq (%rsp), %rdx
movq %rax, 0(%rbp)
movq %rdx, 16(%rbp)
.L3:
movq %rbx, %rdx
movq %r12, %rsi
movq %rax, %rdi
call memcpy#PLT
movq 0(%rbp), %rdx
jmp .L5
.L4:
testq %rbx, %rbx
je .L5
jmp .L3
.L13:
call __stack_chk_fail#PLT
.cfi_endproc
.LFE2389:
.size _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, .-_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.set _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23,_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.section .text._ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,"axG",#progbits,_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,comdat
.p2align 4,,15
.weak _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.type _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, #function
_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z:
.LFB1953:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsi, %r10
movq %rdx, %rsi
movq %rcx, %rdx
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %rdi, %r12
subq $208, %rsp
testb %al, %al
movq %r8, -160(%rbp)
movq %r9, -152(%rbp)
je .L15
movaps %xmm0, -144(%rbp)
movaps %xmm1, -128(%rbp)
movaps %xmm2, -112(%rbp)
movaps %xmm3, -96(%rbp)
movaps %xmm4, -80(%rbp)
movaps %xmm5, -64(%rbp)
movaps %xmm6, -48(%rbp)
movaps %xmm7, -32(%rbp)
.L15:
movq %fs:40, %rax
movq %rax, -200(%rbp)
xorl %eax, %eax
leaq 30(%rsi), %rax
leaq -224(%rbp), %rcx
andq $-16, %rax
movl $32, -224(%rbp)
movl $48, -220(%rbp)
subq %rax, %rsp
leaq 16(%rbp), %rax
leaq 15(%rsp), %rbx
movq %rax, -216(%rbp)
leaq -192(%rbp), %rax
andq $-16, %rbx
movq %rbx, %rdi
movq %rax, -208(%rbp)
call *%r10
leaq 16(%r12), %rdx
movq %r12, %rdi
movq %rbx, %rsi
movq %rdx, (%r12)
movslq %eax, %rdx
addq %rbx, %rdx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23
movq -200(%rbp), %rdi
xorq %fs:40, %rdi
movq %r12, %rax
jne .L18
leaq -16(%rbp), %rsp
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L18:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
.LFE1953:
.size _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, .-_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string ""
.LC1:
.string "%d"
.LC2:
.string "basic_string::append"
.LC3:
.string " "
.LC5:
.string "Done in "
.LC6:
.string "\n"
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB1871:
.cfi_startproc
.cfi_personality 0x9b,DW.ref.__gxx_personality_v0
.cfi_lsda 0x1b,.LLSDA1871
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
subq $136, %rsp
.cfi_def_cfa_offset 192
leaq 16(%rsp), %r13
movq %fs:40, %rax
movq %rax, 120(%rsp)
xorl %eax, %eax
call _ZNSt6chrono3_V212system_clock3nowEv#PLT
leaq .LC0(%rip), %rdx
movq %rax, (%rsp)
leaq 16(%r13), %rax
movq %r13, %rdi
movq %rdx, %rsi
movq %rax, 16(%rsp)
.LEHB0:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.LEHE0:
movl $20000000, %esi
movq %r13, %rdi
.LEHB1:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEm#PLT
.LEHE1:
leaq 48(%rsp), %rbp
leaq 80(%rsp), %rax
movl $10000000, %ebx
movabsq $9223372036854775807, %r14
leaq 96(%rsp), %r12
movq %rax, 8(%rsp)
leaq 16(%rbp), %r15
jmp .L25
.p2align 4,,10
.p2align 3
.L21:
movq %rcx, 80(%rsp)
movq 16(%rax), %rcx
movq %rcx, 96(%rsp)
.L22:
movq 8(%rax), %rcx
movb $0, 16(%rax)
movq %r13, %rdi
movq %rcx, 88(%rsp)
movq %rdx, (%rax)
movq $0, 8(%rax)
movq 80(%rsp), %rsi
movq 88(%rsp), %rdx
.LEHB2:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm#PLT
.LEHE2:
movq 80(%rsp), %rdi
cmpq %r12, %rdi
je .L23
call _ZdlPv#PLT
.L23:
movq 48(%rsp), %rdi
cmpq %r15, %rdi
je .L24
call _ZdlPv#PLT
.L24:
subl $1, %ebx
je .L40
.L25:
movq vsnprintf#GOTPCREL(%rip), %rsi
leaq .LC1(%rip), %rcx
movl $1, %r8d
movl $16, %edx
movq %rbp, %rdi
xorl %eax, %eax
.LEHB3:
call _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.LEHE3:
cmpq %r14, 56(%rsp)
je .L41
leaq .LC3(%rip), %rsi
movl $1, %edx
movq %rbp, %rdi
.LEHB4:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm#PLT
.LEHE4:
movq %r12, 80(%rsp)
movq (%rax), %rcx
leaq 16(%rax), %rdx
cmpq %rdx, %rcx
jne .L21
movdqu 16(%rax), %xmm0
movaps %xmm0, 96(%rsp)
jmp .L22
.p2align 4,,10
.p2align 3
.L40:
call _ZNSt6chrono3_V212system_clock3nowEv#PLT
subq (%rsp), %rax
movabsq $4835703278458516699, %rdx
leaq .LC5(%rip), %rsi
pxor %xmm0, %xmm0
leaq _ZSt4cout(%rip), %rdi
movq %rax, %rcx
imulq %rdx
sarq $63, %rcx
sarq $18, %rdx
subq %rcx, %rdx
cvtsi2sdq %rdx, %xmm0
movl $8, %edx
divsd .LC4(%rip), %xmm0
movsd %xmm0, (%rsp)
.LEHB5:
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l#PLT
movsd (%rsp), %xmm0
leaq _ZSt4cout(%rip), %rdi
call _ZNSo9_M_insertIdEERSoT_#PLT
leaq .LC6(%rip), %rsi
movq %rax, %rdi
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#PLT
.LEHE5:
movq 16(%rsp), %rdi
addq $16, %r13
cmpq %r13, %rdi
je .L26
call _ZdlPv#PLT
.L26:
xorl %eax, %eax
movq 120(%rsp), %rbx
xorq %fs:40, %rbx
jne .L42
addq $136, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.L41:
.cfi_restore_state
leaq .LC2(%rip), %rdi
.LEHB6:
call _ZSt20__throw_length_errorPKc#PLT
.LEHE6:
.L35:
movq %rax, %rbx
.L29:
movq 48(%rsp), %rdi
addq $16, %rbp
cmpq %rbp, %rdi
je .L31
call _ZdlPv#PLT
.L31:
movq 16(%rsp), %rdi
addq $16, %r13
cmpq %r13, %rdi
je .L32
call _ZdlPv#PLT
.L32:
movq %rbx, %rdi
.LEHB7:
call _Unwind_Resume#PLT
.LEHE7:
.L34:
movq %rax, %rbx
jmp .L31
.L36:
movq 8(%rsp), %rdx
movq 80(%rsp), %rdi
movq %rax, %rbx
addq $16, %rdx
cmpq %rdx, %rdi
je .L29
call _ZdlPv#PLT
jmp .L29
.L42:
call __stack_chk_fail#PLT
.cfi_endproc
.LFE1871:
.globl __gxx_personality_v0
.section .gcc_except_table,"a",#progbits
.LLSDA1871:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSE1871-.LLSDACSB1871
.LLSDACSB1871:
.uleb128 .LEHB0-.LFB1871
.uleb128 .LEHE0-.LEHB0
.uleb128 0
.uleb128 0
.uleb128 .LEHB1-.LFB1871
.uleb128 .LEHE1-.LEHB1
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB2-.LFB1871
.uleb128 .LEHE2-.LEHB2
.uleb128 .L36-.LFB1871
.uleb128 0
.uleb128 .LEHB3-.LFB1871
.uleb128 .LEHE3-.LEHB3
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB4-.LFB1871
.uleb128 .LEHE4-.LEHB4
.uleb128 .L35-.LFB1871
.uleb128 0
.uleb128 .LEHB5-.LFB1871
.uleb128 .LEHE5-.LEHB5
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB6-.LFB1871
.uleb128 .LEHE6-.LEHB6
.uleb128 .L35-.LFB1871
.uleb128 0
.uleb128 .LEHB7-.LFB1871
.uleb128 .LEHE7-.LEHB7
.uleb128 0
.uleb128 0
.LLSDACSE1871:
.section .text.startup
.size main, .-main
.p2align 4,,15
.type _GLOBAL__sub_I_main, #function
_GLOBAL__sub_I_main:
.LFB2369:
.cfi_startproc
leaq _ZStL8__ioinit(%rip), %rdi
subq $8, %rsp
.cfi_def_cfa_offset 16
call _ZNSt8ios_base4InitC1Ev#PLT
movq _ZNSt8ios_base4InitD1Ev#GOTPCREL(%rip), %rdi
leaq __dso_handle(%rip), %rdx
leaq _ZStL8__ioinit(%rip), %rsi
addq $8, %rsp
.cfi_def_cfa_offset 8
jmp __cxa_atexit#PLT
.cfi_endproc
.LFE2369:
.size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I_main
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC4:
.long 0
.long 1083129856
.hidden DW.ref.__gxx_personality_v0
.weak DW.ref.__gxx_personality_v0
.section .data.DW.ref.__gxx_personality_v0,"awG",#progbits,DW.ref.__gxx_personality_v0,comdat
.align 8
.type DW.ref.__gxx_personality_v0, #object
.size DW.ref.__gxx_personality_v0, 8
DW.ref.__gxx_personality_v0:
.quad __gxx_personality_v0
.hidden __dso_handle
.ident "GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0"
.section .note.GNU-stack,"",#progbits
Windows:
.file "tester.cpp"
.text
.p2align 4,,15
.def ___tcf_0; .scl 3; .type 32; .endef
___tcf_0:
LFB2556:
.cfi_startproc
movl $__ZStL8__ioinit, %ecx
jmp __ZNSt8ios_base4InitD1Ev
.cfi_endproc
LFE2556:
.section .rdata,"dr"
.align 4
LC0:
.ascii "basic_string::_M_construct null not valid\0"
.text
.align 2
.p2align 4,,15
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29; .scl 3; .type 32; .endef
__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29:
LFB2587:
.cfi_startproc
pushl %edi
.cfi_def_cfa_offset 8
.cfi_offset 7, -8
pushl %esi
.cfi_def_cfa_offset 12
.cfi_offset 6, -12
movl %ecx, %esi
pushl %ebx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
subl $32, %esp
.cfi_def_cfa_offset 48
movl 48(%esp), %edi
movl 52(%esp), %ebx
testl %edi, %edi
jne L5
testl %ebx, %ebx
je L5
movl $LC0, (%esp)
call __ZSt19__throw_logic_errorPKc
.p2align 4,,10
L5:
subl %edi, %ebx
cmpl $15, %ebx
movl %ebx, 28(%esp)
ja L22
movl (%esi), %edx
cmpl $1, %ebx
movl %edx, %eax
je L23
testl %ebx, %ebx
jne L6
L8:
movl 28(%esp), %eax
movl %eax, 4(%esi)
movb $0, (%edx,%eax)
addl $32, %esp
.cfi_remember_state
.cfi_def_cfa_offset 16
popl %ebx
.cfi_restore 3
.cfi_def_cfa_offset 12
popl %esi
.cfi_restore 6
.cfi_def_cfa_offset 8
popl %edi
.cfi_restore 7
.cfi_def_cfa_offset 4
ret $8
.p2align 4,,10
L22:
.cfi_restore_state
leal 28(%esp), %eax
movl $0, 4(%esp)
movl %esi, %ecx
movl %eax, (%esp)
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj
.cfi_def_cfa_offset 40
subl $8, %esp
.cfi_def_cfa_offset 48
movl %eax, (%esi)
movl 28(%esp), %edx
movl %edx, 8(%esi)
L6:
movl %ebx, 8(%esp)
movl %edi, 4(%esp)
movl %eax, (%esp)
call _memcpy
movl (%esi), %edx
jmp L8
.p2align 4,,10
L23:
movzbl (%edi), %eax
movb %al, (%edx)
movl (%esi), %edx
jmp L8
.cfi_endproc
LFE2587:
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21; .scl 3; .type 32; .endef
.set __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21,__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29
.section .text$_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z,"x"
.linkonce discard
.p2align 4,,15
.globl __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z
.def __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z; .scl 2; .type 32; .endef
__ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z:
LFB2177:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
pushl %esi
pushl %ebx
subl $16, %esp
.cfi_offset 6, -12
.cfi_offset 3, -16
movl 16(%ebp), %edx
movl 8(%ebp), %esi
leal 30(%edx), %eax
andl $-16, %eax
call ___chkstk_ms
subl %eax, %esp
leal 24(%ebp), %eax
leal 31(%esp), %ebx
movl %edx, 4(%esp)
movl %eax, 12(%esp)
movl 20(%ebp), %eax
andl $-16, %ebx
movl %ebx, (%esp)
movl %eax, 8(%esp)
call *12(%ebp)
leal 8(%esi), %edx
addl %ebx, %eax
movl %esi, %ecx
movl %edx, (%esi)
movl %eax, 4(%esp)
movl %ebx, (%esp)
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29
subl $8, %esp
leal -8(%ebp), %esp
movl %esi, %eax
popl %ebx
.cfi_restore 3
popl %esi
.cfi_restore 6
popl %ebp
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE2177:
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC1:
.ascii "\0"
LC2:
.ascii "%d\0"
LC3:
.ascii "basic_string::append\0"
LC4:
.ascii " \0"
.def ___divdi3; .scl 2; .type 32; .endef
LC6:
.ascii "Done in \0"
LC7:
.ascii "\12\0"
.section .text.startup,"x"
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
LFB2111:
.cfi_startproc
.cfi_personality 0,___gxx_personality_v0
.cfi_lsda 0,LLSDA2111
leal 4(%esp), %ecx
.cfi_def_cfa 1, 0
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
.cfi_escape 0x10,0x5,0x2,0x75,0
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
pushl %ecx
.cfi_escape 0xf,0x3,0x75,0x70,0x6
.cfi_escape 0x10,0x7,0x2,0x75,0x7c
.cfi_escape 0x10,0x6,0x2,0x75,0x78
.cfi_escape 0x10,0x3,0x2,0x75,0x74
subl $152, %esp
call ___main
call __ZNSt6chrono3_V212system_clock3nowEv
leal -96(%ebp), %ecx
movl %eax, -136(%ebp)
leal -88(%ebp), %eax
movl $LC1, 4(%esp)
movl $LC1, (%esp)
movl %edx, -132(%ebp)
movl %eax, -96(%ebp)
LEHB0:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21
LEHE0:
leal -96(%ebp), %ecx
subl $8, %esp
movl $20000000, (%esp)
LEHB1:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj
LEHE1:
subl $4, %esp
movl $10000000, %edi
leal -72(%ebp), %esi
leal -40(%ebp), %ebx
jmp L32
.p2align 4,,10
L28:
movl %ecx, -48(%ebp)
movl 8(%eax), %ecx
movl %ecx, -40(%ebp)
L29:
movl 4(%eax), %ecx
movb $0, 8(%eax)
movl %ecx, -44(%ebp)
movl %edx, (%eax)
leal -96(%ebp), %ecx
movl $0, 4(%eax)
movl -44(%ebp), %eax
movl %eax, 4(%esp)
movl -48(%ebp), %eax
movl %eax, (%esp)
LEHB2:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj
LEHE2:
movl -48(%ebp), %eax
subl $8, %esp
cmpl %ebx, %eax
je L30
movl %eax, (%esp)
call __ZdlPv
L30:
movl -72(%ebp), %eax
leal -64(%ebp), %edx
cmpl %edx, %eax
je L31
movl %eax, (%esp)
call __ZdlPv
L31:
subl $1, %edi
je L46
L32:
movl $1, 16(%esp)
movl $LC2, 12(%esp)
movl $16, 8(%esp)
movl $_vsnprintf, 4(%esp)
movl %esi, (%esp)
LEHB3:
call __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z
LEHE3:
cmpl $2147483647, -68(%ebp)
je L47
movl $1, 4(%esp)
movl $LC4, (%esp)
movl %esi, %ecx
LEHB4:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj
LEHE4:
movl %ebx, -48(%ebp)
movl (%eax), %ecx
leal 8(%eax), %edx
subl $8, %esp
cmpl %edx, %ecx
jne L28
movl 12(%eax), %ecx
movl %ecx, -120(%ebp)
movl 16(%eax), %ecx
movl %ecx, -124(%ebp)
movl 20(%eax), %ecx
movl %ecx, -128(%ebp)
movl 8(%eax), %ecx
movl %ecx, -40(%ebp)
movl -120(%ebp), %ecx
movl %ecx, -36(%ebp)
movl -124(%ebp), %ecx
movl %ecx, -32(%ebp)
movl -128(%ebp), %ecx
movl %ecx, -28(%ebp)
jmp L29
.p2align 4,,10
L46:
call __ZNSt6chrono3_V212system_clock3nowEv
subl -136(%ebp), %eax
movl $1000000, 8(%esp)
sbbl -132(%ebp), %edx
movl $0, 12(%esp)
movl %eax, (%esp)
movl %edx, 4(%esp)
call ___divdi3
movl %eax, -120(%ebp)
movl %edx, -116(%ebp)
fildq -120(%ebp)
movl $8, 8(%esp)
movl $LC6, 4(%esp)
movl $__ZSt4cout, (%esp)
fdivs LC5
fstpl -120(%ebp)
LEHB5:
call __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
fldl -120(%ebp)
movl $__ZSt4cout, %ecx
fstpl (%esp)
call __ZNSo9_M_insertIdEERSoT_
subl $8, %esp
movl $LC7, 4(%esp)
movl %eax, (%esp)
call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
LEHE5:
movl -96(%ebp), %eax
leal -88(%ebp), %edi
cmpl %edi, %eax
je L43
movl %eax, (%esp)
call __ZdlPv
L43:
leal -16(%ebp), %esp
xorl %eax, %eax
popl %ecx
.cfi_remember_state
.cfi_restore 1
.cfi_def_cfa 1, 0
popl %ebx
.cfi_restore 3
popl %esi
.cfi_restore 6
popl %edi
.cfi_restore 7
popl %ebp
.cfi_restore 5
leal -4(%ecx), %esp
.cfi_def_cfa 4, 4
ret
L47:
.cfi_restore_state
movl $LC3, (%esp)
LEHB6:
call __ZSt20__throw_length_errorPKc
LEHE6:
L41:
movl %eax, %ebx
L36:
movl -72(%ebp), %eax
leal -64(%ebp), %edx
cmpl %edx, %eax
je L38
movl %eax, (%esp)
call __ZdlPv
L38:
movl -96(%ebp), %eax
leal -88(%ebp), %edi
cmpl %edi, %eax
je L39
movl %eax, (%esp)
call __ZdlPv
L39:
movl %ebx, (%esp)
LEHB7:
call __Unwind_Resume
LEHE7:
L42:
movl %eax, %esi
movl -48(%ebp), %eax
cmpl %ebx, %eax
je L35
movl %eax, (%esp)
call __ZdlPv
L35:
movl %esi, %ebx
jmp L36
L40:
movl %eax, %ebx
jmp L38
.cfi_endproc
LFE2111:
.def ___gxx_personality_v0; .scl 2; .type 32; .endef
.section .gcc_except_table,"w"
LLSDA2111:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 LLSDACSE2111-LLSDACSB2111
LLSDACSB2111:
.uleb128 LEHB0-LFB2111
.uleb128 LEHE0-LEHB0
.uleb128 0
.uleb128 0
.uleb128 LEHB1-LFB2111
.uleb128 LEHE1-LEHB1
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB2-LFB2111
.uleb128 LEHE2-LEHB2
.uleb128 L42-LFB2111
.uleb128 0
.uleb128 LEHB3-LFB2111
.uleb128 LEHE3-LEHB3
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB4-LFB2111
.uleb128 LEHE4-LEHB4
.uleb128 L41-LFB2111
.uleb128 0
.uleb128 LEHB5-LFB2111
.uleb128 LEHE5-LEHB5
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB6-LFB2111
.uleb128 LEHE6-LEHB6
.uleb128 L41-LFB2111
.uleb128 0
.uleb128 LEHB7-LFB2111
.uleb128 LEHE7-LEHB7
.uleb128 0
.uleb128 0
LLSDACSE2111:
.section .text.startup,"x"
.p2align 4,,15
.def __GLOBAL__sub_I_main; .scl 3; .type 32; .endef
__GLOBAL__sub_I_main:
LFB2557:
.cfi_startproc
subl $28, %esp
.cfi_def_cfa_offset 32
movl $__ZStL8__ioinit, %ecx
call __ZNSt8ios_base4InitC1Ev
movl $___tcf_0, (%esp)
call _atexit
addl $28, %esp
.cfi_def_cfa_offset 4
ret
.cfi_endproc
LFE2557:
.section .ctors,"w"
.align 4
.long __GLOBAL__sub_I_main
.lcomm __ZStL8__ioinit,1,1
.section .rdata,"dr"
.align 4
LC5:
.long 1148846080
.ident "GCC: (MinGW.org GCC-6.3.0-1) 6.3.0"
.def __ZNSt8ios_base4InitD1Ev; .scl 2; .type 32; .endef
.def __ZSt19__throw_logic_errorPKc; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj; .scl 2; .type 32; .endef
.def _memcpy; .scl 2; .type 32; .endef
.def __ZNSt6chrono3_V212system_clock3nowEv; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj; .scl 2; .type 32; .endef
.def __ZdlPv; .scl 2; .type 32; .endef
.def _vsnprintf; .scl 2; .type 32; .endef
.def __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i; .scl 2; .type 32; .endef
.def __ZNSo9_M_insertIdEERSoT_; .scl 2; .type 32; .endef
.def __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc; .scl 2; .type 32; .endef
.def __ZSt20__throw_length_errorPKc; .scl 2; .type 32; .endef
.def __Unwind_Resume; .scl 2; .type 32; .endef
.def __ZNSt8ios_base4InitC1Ev; .scl 2; .type 32; .endef
.def _atexit; .scl 2; .type 32; .endef

A quick look at the disassembly shows that the Windows version uses movl (i.e. long word, a 32-bit move), while the Linux version uses movq (quad word, 64-bit) and the SSE xmm registers.
My bet is that on Linux you compile for x86-64, while on Windows you target 32-bit x86.
x86-64 includes the SSE2 extension, while x86 does not, so MinGW defaults to no-SSE mode.
If that's the case, building with a 64-bit toolchain on Windows should result in comparable performance. Alternatively, you might enable SSE for 32-bit builds (the -msse2 compiler flag, if I remember correctly).

The mingw.org implementation just seems to be much more inefficient than linux, Visual Studio or mingw-w64.org.
>g++ --version
g++ (MinGW.org GCC-6.3.0-1) 6.3.0
Done in 24.808
>g++ --version
g++ (i686-posix-dwarf-rev2, Built by MinGW-W64 project) 6.3.0
Done in 0.679

Tested with MSYS2 MinGW64:
g++ --version
g++.exe (Rev2, Built by MSYS2 project) 7.3.0
g++.exe -Wall -O3 -mtune=native -fno-exceptions -fno-rtti -c main.cpp -o main.o
g++.exe -o test.exe main.o -s
Done in 0.547
Env: Windows 10 x64
CPU: Intel Core i5-6300U, 2.4GHz
RAM: 16GB DDR4
In any case, MinGW uses msvcrt.dll (the one bundled with Windows, not the Universal CRT or a Visual Studio CRT) instead of GNU libc, so in my experience the speed gap may come from the C standard library.
P.S. with some changes (same compiler flags)
#include <iostream>
#include <chrono>
#ifdef _WIN32
#include <windows.h>
static std::size_t page_size() noexcept {
::SYSTEM_INFO si;
::GetSystemInfo(&si);
return si.dwPageSize;
}
#else
#include <sys/types.h>
#include <unistd.h>
// Returns the system page size in bytes as reported by sysconf(_SC_PAGESIZE).
//
// sysconf() signals failure by returning -1. Casting that directly to
// std::size_t would produce a huge bogus value, and callers that derive an
// alignment mask from the result would then compute garbage. In that case a
// conventional 4096-byte page is returned instead.
static std::size_t page_size() noexcept {
    const long reported = ::sysconf(_SC_PAGESIZE);
    if (reported <= 0) {
        return 4096;  // fallback when the query fails
    }
    return static_cast<std::size_t>(reported);
}
#endif // _WIN32
// Benchmark driver (reworked variant): converts the integer to text once,
// outside the loop, then appends that string plus a space to `str` n times and
// prints the elapsed time in seconds.
// argc/argv are accepted but never read.
// NOTE(review): std::string and std::to_string are declared in <string>, which
// this snippet does not include directly - confirm it is pulled in transitively.
int main(int argc, char const *argv[]) {
// Start of the timed region (the page-size query and reserve are inside it).
auto started = std::chrono::high_resolution_clock::now();
const std::size_t n = 10000000;
// Round the reservation (2 * n bytes) up to a multiple of the page size.
// The add-then-mask form assumes page_size() is a power of two - TODO confirm.
const std::size_t al = page_size() - 1;
const std::size_t buff_size = ( (n << 1) + al) & ~al;
std::string str;
str.reserve(buff_size);
// Built once so the loop body performs no integer-to-string conversion.
const std::string to_append( std::to_string(1) );
for (std::size_t i = 0; i < n; ++i) {
// Two in-place appends; no temporary string is created per iteration.
str.append( to_append );
str.push_back(' ');
}
// End of the timed region.
auto done = std::chrono::high_resolution_clock::now();
// The duration is truncated to whole milliseconds first, then scaled to seconds.
double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000;
std::cout << "Done in " << secs << "\n";
return 0;
}
Done in 0.046
Asm output for main function:
main:
pushq %r14
.seh_pushreg %r14
pushq %r13
.seh_pushreg %r13
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
subq $144, %rsp
.seh_stackalloc 144
.seh_endprologue
movl $10000000, %esi
call __main
leaq 96(%rsp), %r13
leaq 64(%rsp), %rbp
call _ZNSt6chrono3_V212system_clock3nowEv
movq %r13, %rcx
leaq 16(%rbp), %r12
movq %rax, %r14
call *__imp_GetSystemInfo(%rip)
movl 100(%rsp), %eax
movq %rbp, %rcx
movq %r12, 64(%rsp)
movq $0, 72(%rsp)
leaq 19999999(%rax), %rdx
negq %rax
movb $0, 80(%rsp)
andq %rax, %rdx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEy
movl $1, 32(%rsp)
movq %r13, %rcx
leaq .LC0(%rip), %r9
movl $16, %r8d
leaq _ZL9vsnprintfPcyPKcS_(%rip), %rdx
call _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_yPKS8_PcEySB_z
jmp .L14
.p2align 4,,10
.L16:
movb $32, (%rdx,%rbx)
.L26:
movq 64(%rsp), %rax
movq %rdi, 72(%rsp)
movb $0, 1(%rax,%rbx)
subq $1, %rsi
je .L27
.L14:
movq 96(%rsp), %rdx
movq 104(%rsp), %r8
movq %rbp, %rcx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcy
movq 72(%rsp), %rbx
movq 64(%rsp), %rdx
movl $15, %eax
leaq 1(%rbx), %rdi
cmpq %r12, %rdx
je .L15
movq 80(%rsp), %rax
.L15:
cmpq %rax, %rdi
jbe .L16
xorl %r9d, %r9d
xorl %r8d, %r8d
movq %rbx, %rdx
movq %rbp, %rcx
movq $1, 32(%rsp)
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_mutateEyyPKcy
movq 64(%rsp), %rax
movb $32, (%rax,%rbx)
jmp .L26
.p2align 4,,10
.L27:
call _ZNSt6chrono3_V212system_clock3nowEv
pxor %xmm1, %xmm1
movl $8, %r8d
movabsq $4835703278458516699, %rdx
subq %r14, %rax
addq $16, %r13
movq %rax, %rcx
imulq %rdx
sarq $63, %rcx
sarq $18, %rdx
subq %rcx, %rdx
movq .refptr._ZSt4cout(%rip), %rcx
cvtsi2sdq %rdx, %xmm1
leaq .LC2(%rip), %rdx
divsd .LC1(%rip), %xmm1
movsd %xmm1, 56(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_x
movsd 56(%rsp), %xmm1
movq .refptr._ZSt4cout(%rip), %rcx
call _ZNSo9_M_insertIdEERSoT_
leaq .LC3(%rip), %rdx
movq %rax, %rcx
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movq 96(%rsp), %rcx
cmpq %r13, %rcx
je .L19
call _ZdlPv
.L19:
movq 64(%rsp), %rcx
addq $16, %rbp
cmpq %rbp, %rcx
je .L20
call _ZdlPv
.L20:
xorl %eax, %eax
addq $144, %rsp
popq %rbx
popq %rsi
popq %rdi
popq %rbp
popq %r12
popq %r13
popq %r14
ret
.seh_endproc
.p2align 4,,15
.def _GLOBAL__sub_I_main; .scl 3; .type 32; .endef
.seh_proc _GLOBAL__sub_I_main

(Just for the proportions) Windows Release target vs. Debug target on Visual Studio C++: By default, the Debug target is compiled without optimization, while the Release target is compiled with /O2 optimization, /Oi ("Enable Intrinsic Functions"), and /GL ("Whole Program Optimization"). Your code, on my workstation, Debug x64 vs. Release x64:
Debug: 70 sec.
Release: 0.27 sec.
You build with MinGW (which I am not familiar with). But from a fast search, there is a talk about Debug/Release mode ...and MinGW has equivalent /O2 optimization, /Oi ("Enable Intrinsic Functions"), and /Og ("Enable Global Optimization") flags, it seems.
-
Compile with these 3 flags (x64 target), & compare with the VS Release x64 benchmark. Anyway, this is MS default compile optimization for a Release target.
-
Test Environment:
HP 8100, Windows 10 Pro 64 bit, CPU i7 870, 16 GB DDR3 RAM, Visual Studio 2017, Targets: Debug x64 / Release x64

I tried your code on my Windows machine with MinGW 4.8.0 and got ~20 seconds. When I changed the string concatenation to std::stringstream I got 0.5 seconds:
...
std::stringstream ss;
for (int i = 0; i < n; ++i) {
//str += std::to_string(a) + " ";
ss << a << " ";
}
str = ss.str();
...

Related

Why does march=native corrupt my program?

I'm compiling the program:
#include <iostream>
#include <vector>
#include <cstddef>
#include <algorithm>
// Plain aggregate holding the two integer parameters that init_dp_matrix reads.
// NOTE(review): the names suggest gap-open / gap-extend scores - confirm.
struct Model
{
    int open;    // base value written into the first filled cell
    int extend;  // increment applied once per further step
};
// One matrix entry: a plain aggregate of two independent integer values.
struct Cell
{
    int a;
    int b;
};
// Two-dimensional table of Cell values, stored as a vector of vectors.
using DPMatrix = std::vector<std::vector<Cell>>;
void print(const DPMatrix& matrix)
{
for (std::size_t i = 0; i < matrix.size(); ++i) {
for (std::size_t j = 0; j < matrix[i].size(); ++j) {
std::cout << '{' << matrix[i][j].a << ' ' << matrix[i][j].b << "} ";
}
std::cout << std::endl;
}
}
// Builds a matrix with num_cols outer entries, each holding num_rows
// value-initialized cells, then seeds its two borders from `model`:
//   result[i][0].b = open + (i - 1) * extend   for i in [1, num_cols)
//   result[0][j].a = open + (j - 1) * extend   for j in [1, num_rows)
// All other fields keep their initial value. Returns the matrix by value.
// NOTE(review): this function is quoted as a reproducer for wrong output under
// -march=native, so its exact statement form is left untouched.
DPMatrix init_dp_matrix(const std::size_t num_cols, const std::size_t num_rows, const Model& model)
{
// Outer dimension is num_cols; each inner vector has num_rows cells.
DPMatrix result(num_cols, DPMatrix::value_type(num_rows, Cell()));
// NOTE(review): `inf` is computed but never read in this function. The product
// mixes int with std::size_t before being stored back into an int.
const int inf = model.open * std::max(num_cols, num_rows);
// NOTE(review): `i` is a signed int compared against an unsigned std::size_t.
for (int i = 1; i < num_cols; ++i) {
// First cell of every outer entry from index 1 on: only field `b` is written.
result[i][0].b = model.open + (i - 1) * model.extend;
}
// NOTE(review): same signed/unsigned comparison for `j`.
for (int j = 1; j < num_rows; ++j) {
// Cells of outer entry 0 from index 1 on: only field `a` is written.
result[0][j].a = model.open + (j - 1) * model.extend;
}
return result;
}
// Reproducer entry point: builds a matrix of 10 outer entries with 2 cells
// each and prints it.
int main()
{
// Aggregate initialization in declaration order: open = -8, extend = -1.
const Model model = {-8, -1};
// Arguments are (num_cols, num_rows, model).
const DPMatrix matrix = init_dp_matrix(10, 2, model);
print(matrix);
}
With GCC 9.2.0:
$ g++-9 -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
with -march=native:
$ g++-9 -O3 -march=native -o bug bug.cpp
On an Ubuntu machine with Intel chips:
$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 18.04.3 LTS
Release: 18.04
Codename: bionic
$ grep model /proc/cpuinfo | head -2
model : 85
model name : Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz
Running the program I get bogus output:
$ ./bug
{0 0} {-8 0}
{-2048 255} {0 0}
{-2304 255} {0 0}
{-2560 255} {0 0}
{-2816 255} {0 0}
{-3072 255} {0 0}
{-3328 255} {0 0}
{-3584 255} {0 0}
{-3840 255} {0 0}
{0 -16} {0 0}
If I compile without -march=native I get the correct output:
$ g++-9 -O3 -o bug bug.cpp
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
The assembly for the -march=native version is:
$ g++-9 -O3 -march=native -S bug.cpp
$ cat bug.s
.file "bug.cpp"
.text
.section .text._ZNKSt5ctypeIcE8do_widenEc,"axG",#progbits,_ZNKSt5ctypeIcE8do_widenEc,comdat
.align 2
.p2align 4
.weak _ZNKSt5ctypeIcE8do_widenEc
.type _ZNKSt5ctypeIcE8do_widenEc, #function
_ZNKSt5ctypeIcE8do_widenEc:
.LFB1303:
.cfi_startproc
movl %esi, %eax
ret
.cfi_endproc
.LFE1303:
.size _ZNKSt5ctypeIcE8do_widenEc, .-_ZNKSt5ctypeIcE8do_widenEc
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string "} "
.text
.p2align 4
.globl _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.type _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, #function
_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB2359:
.cfi_startproc
movq (%rdi), %rdx
cmpq %rdx, 8(%rdi)
je .L23
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
movabsq $-6148914691236517205, %r13
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
xorl %r12d, %r12d
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
movq %rdi, %rbp
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
subq $24, %rsp
.cfi_def_cfa_offset 80
.p2align 4,,10
.p2align 3
.L4:
leaq (%r12,%r12,2), %rbx
salq $3, %rbx
addq %rbx, %rdx
movq 8(%rdx), %rax
xorl %r14d, %r14d
cmpq %rax, (%rdx)
je .L8
.p2align 4,,10
.p2align 3
.L5:
movl $1, %edx
leaq 15(%rsp), %rsi
movl $_ZSt4cout, %edi
movb $123, 15(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq %rax, %rdi
movq 0(%rbp), %rax
leaq 0(,%r14,8), %r15
movq (%rax,%rbx), %rax
movl (%rax,%r14,8), %esi
incq %r14
call _ZNSolsEi
movq %rax, %rdi
movl $1, %edx
leaq 15(%rsp), %rsi
movb $32, 15(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq %rax, %rdi
movq 0(%rbp), %rax
movq (%rax,%rbx), %rax
movl 4(%rax,%r15), %esi
call _ZNSolsEi
movq %rax, %rdi
movl $2, %edx
movl $.LC0, %esi
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq 0(%rbp), %rdx
addq %rbx, %rdx
movq 8(%rdx), %rax
subq (%rdx), %rax
sarq $3, %rax
cmpq %rax, %r14
jb .L5
.L8:
movq _ZSt4cout(%rip), %rax
movq -24(%rax), %rax
movq _ZSt4cout+240(%rax), %r14
testq %r14, %r14
je .L26
cmpb $0, 56(%r14)
je .L9
movsbl 67(%r14), %esi
.L10:
movl $_ZSt4cout, %edi
call _ZNSo3putEc
movq %rax, %rdi
call _ZNSo5flushEv
movq 0(%rbp), %rdx
movq 8(%rbp), %rax
incq %r12
subq %rdx, %rax
sarq $3, %rax
imulq %r13, %rax
cmpq %r12, %rax
ja .L4
addq $24, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L9:
.cfi_restore_state
movq %r14, %rdi
call _ZNKSt5ctypeIcE13_M_widen_initEv
movq (%r14), %rax
movl $10, %esi
movq 48(%rax), %rax
cmpq $_ZNKSt5ctypeIcE8do_widenEc, %rax
je .L10
movq %r14, %rdi
call *%rax
movsbl %al, %esi
jmp .L10
.L23:
.cfi_def_cfa_offset 8
.cfi_restore 3
.cfi_restore 6
.cfi_restore 12
.cfi_restore 13
.cfi_restore 14
.cfi_restore 15
ret
.L26:
.cfi_def_cfa_offset 80
.cfi_offset 3, -56
.cfi_offset 6, -48
.cfi_offset 12, -40
.cfi_offset 13, -32
.cfi_offset 14, -24
.cfi_offset 15, -16
call _ZSt16__throw_bad_castv
.cfi_endproc
.LFE2359:
.size _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.section .rodata.str1.8,"aMS",#progbits,1
.align 8
.LC2:
.string "cannot create std::vector larger than max_size()"
.section .text.unlikely,"ax",#progbits
.LCOLDB6:
.text
.LHOTB6:
.p2align 4
.globl _Z14init_dp_matrixmmRK5Model
.type _Z14init_dp_matrixmmRK5Model, #function
_Z14init_dp_matrixmmRK5Model:
.LFB2360:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA2360
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movabsq $1152921504606846975, %rax
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 15, -24
.cfi_offset 14, -32
.cfi_offset 13, -40
.cfi_offset 12, -48
.cfi_offset 3, -56
movq %rdi, 24(%rsp)
movq %rsi, 40(%rsp)
movq %rcx, 16(%rsp)
cmpq %rax, %rdx
ja .L103
movq %rdx, %r15
testq %rdx, %rdx
je .L71
leaq 0(,%rdx,8), %rbx
movq %rbx, %rdi
.LEHB0:
call _Znwm
.LEHE0:
movq %rax, %r13
leaq -1(%r15), %rax
cmpq $3, %rax
movq %r15, %rdx
movq %r13, %rax
jbe .L30
shrq $2, %rdx
salq $5, %rdx
addq %r13, %rdx
vpxor %xmm0, %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L32:
vmovdqu32 %ymm0, (%rax)
addq $32, %rax
cmpq %rdx, %rax
jne .L32
movq %r15, %rcx
andq $-4, %rcx
movq %r15, %rdx
andl $3, %edx
leaq 0(%r13,%rcx,8), %rax
cmpq %rcx, %r15
je .L33
.L30:
movq $0, (%rax)
cmpq $1, %rdx
je .L33
movq $0, 8(%rax)
cmpq $2, %rdx
je .L33
movq $0, 16(%rax)
cmpq $3, %rdx
je .L33
movq $0, 24(%rax)
.L33:
leaq 0(%r13,%rbx), %rax
movq %rax, 56(%rsp)
.L29:
movabsq $384307168202282325, %rax
cmpq %rax, 40(%rsp)
ja .L104
movq 40(%rsp), %rax
movq 24(%rsp), %r12
leaq (%rax,%rax,2), %rbx
movq $0, (%r12)
movq $0, 8(%r12)
movq $0, 16(%r12)
salq $3, %rbx
testq %rax, %rax
je .L35
movq %rbx, %rdi
vzeroupper
.LEHB1:
call _Znwm
.LEHE1:
addq %rax, %rbx
movq %rax, (%r12)
movq %rax, 8(%r12)
movq %rbx, 16(%r12)
movq 56(%rsp), %r12
movq %rax, %r14
subq %r13, %r12
movq %r12, %rax
sarq $3, %rax
je .L40
movabsq $1152921504606846975, %rdx
cmpq %rdx, %rax
ja .L41
movq 40(%rsp), %rax
movq %r14, %rbx
movq %rax, 48(%rsp)
.p2align 4,,10
.p2align 3
.L46:
movq $0, (%rbx)
movq $0, 8(%rbx)
movq $0, 16(%rbx)
movq %r12, %rdi
.LEHB2:
call _Znwm
.LEHE2:
leaq (%rax,%r12), %rcx
movq %rax, (%rbx)
movq %rcx, 16(%rbx)
movq %rax, %rdi
cmpq %r13, 56(%rsp)
je .L42
movq %r12, %rdx
movq %r13, %rsi
movq %rcx, 32(%rsp)
call memcpy
movq 32(%rsp), %rcx
addq $24, %rbx
movq %rcx, -16(%rbx)
decq 48(%rsp)
jne .L46
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
.L47:
movq %r13, %rdi
call _ZdlPv
.L48:
movq 16(%rsp), %rax
cmpq $1, 40(%rsp)
movl (%rax), %edx
jbe .L62
movl 4(%rax), %edi
movq 24(%rsp), %rax
movq (%rax), %rsi
movq 40(%rsp), %rax
leaq -2(%rax), %rcx
cmpq $7, %rcx
jbe .L73
movq %rcx, %r8
shrq $3, %r8
leaq (%r8,%r8,2), %r8
salq $6, %r8
vmovdqa64 .LC1(%rip), %ymm3
vmovdqa64 .LC3(%rip), %ymm4
vmovdqa64 .LC4(%rip), %ymm6
vmovdqa64 .LC5(%rip), %ymm5
vpbroadcastd %edi, %ymm10
vpbroadcastd %edx, %ymm9
leaq 24(%rsi), %rax
leaq 24(%rsi,%r8), %r8
vpcmpeqd %ymm8, %ymm8, %ymm8
kxnorb %k1, %k1, %k1
.p2align 4,,10
.p2align 3
.L61:
vmovdqa64 %ymm3, %ymm0
vpaddd %ymm8, %ymm0, %ymm0
vpmulld %ymm10, %ymm0, %ymm0
vmovdqu64 (%rax), %ymm2
vmovdqu64 96(%rax), %ymm1
vpermt2q 32(%rax), %ymm6, %ymm2
vpermt2q 128(%rax), %ymm6, %ymm1
vpermt2q 64(%rax), %ymm5, %ymm2
vpaddd %ymm9, %ymm0, %ymm0
vpermt2q 160(%rax), %ymm5, %ymm1
kmovb %k1, %k2
addq $192, %rax
vpscatterqd %xmm0, 4(,%ymm2,1){%k2}
vperm2i128 $17, %ymm0, %ymm0, %ymm0
kmovb %k1, %k3
vpaddd %ymm4, %ymm3, %ymm3
vpscatterqd %xmm0, 4(,%ymm1,1){%k3}
cmpq %r8, %rax
jne .L61
andq $-8, %rcx
leaq 1(%rcx), %r8
leal 1(%rcx), %eax
.L59:
leaq (%r8,%r8,2), %rcx
movq (%rsi,%rcx,8), %r8
leal -1(%rax), %ecx
imull %edi, %ecx
movq 40(%rsp), %rbx
addl %edx, %ecx
movl %ecx, 4(%r8)
leal 1(%rax), %ecx
movslq %ecx, %r8
cmpq %r8, %rbx
jbe .L62
leaq (%r8,%r8,2), %r8
movq (%rsi,%r8,8), %r9
movl %edi, %r8d
imull %eax, %r8d
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 2(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %ecx
movl %ecx, 4(%r9)
leal 3(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %r8d
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 4(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %ecx
movl %ecx, 4(%r9)
leal 5(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %r8d
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 6(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl $7, %eax
addl %edx, %ecx
cltq
movl %ecx, 4(%r9)
cmpq %rax, %rbx
jbe .L62
imull %r8d, %edi
leaq (%rax,%rax,2), %rax
movq (%rsi,%rax,8), %rax
leal (%rdi,%rdx), %r8d
movl %r8d, 4(%rax)
.L62:
cmpq $1, %r15
jbe .L27
movq 16(%rsp), %rax
leaq -1(%r15), %r8
movl 4(%rax), %edi
movq 24(%rsp), %rax
movq (%rax), %rax
movq (%rax), %rsi
leaq -2(%r15), %rax
cmpq $6, %rax
jbe .L74
movq %r8, %rcx
shrq $3, %rcx
salq $6, %rcx
vmovdqa64 .LC1(%rip), %ymm2
vmovdqa64 .LC3(%rip), %ymm4
vpbroadcastd %edi, %ymm6
vpbroadcastd %edx, %ymm5
movq %rsi, %rax
addq %rsi, %rcx
vpcmpeqd %ymm3, %ymm3, %ymm3
.p2align 4,,10
.p2align 3
.L66:
vmovdqa64 %ymm2, %ymm0
vpaddd %ymm3, %ymm0, %ymm0
vpmulld %ymm6, %ymm0, %ymm0
addq $64, %rax
vpaddd %ymm4, %ymm2, %ymm2
vpaddd %ymm5, %ymm0, %ymm0
vmovd %xmm0, -56(%rax)
vpextrd $1, %xmm0, -48(%rax)
vpextrd $2, %xmm0, -40(%rax)
vpextrd $3, %xmm0, -32(%rax)
vextracti128 $0x1, %ymm0, %xmm0
vmovd %xmm0, -24(%rax)
vpextrd $1, %xmm0, -16(%rax)
vpextrd $2, %xmm0, -8(%rax)
vpextrd $3, %xmm0, (%rax)
cmpq %rcx, %rax
jne .L66
movq %r8, %rcx
andq $-8, %rcx
leaq 1(%rcx), %r9
leal 1(%rcx), %eax
cmpq %r8, %rcx
je .L27
.L64:
leal -1(%rax), %ecx
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 1(%rax), %ecx
movslq %ecx, %r9
cmpq %r15, %r9
jnb .L27
movl %edi, %r8d
imull %eax, %r8d
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
leal 2(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 3(%rax), %ecx
movslq %ecx, %r9
cmpq %r15, %r9
jnb .L27
imull %edi, %r8d
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
leal 4(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 5(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %r8d
addl $6, %eax
cltq
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
cmpq %rax, %r15
jbe .L27
imull %ecx, %edi
addl %edi, %edx
movl %edx, (%rsi,%rax,8)
.L27:
movq 24(%rsp), %rax
vzeroupper
leaq -40(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.p2align 4,,10
.p2align 3
.L37:
.cfi_restore_state
movq %r12, 8(%r14)
addq $24, %r14
cmpq %r14, %rbx
je .L45
.L40:
movq $0, (%r14)
movq %r12, 16(%r14)
cmpq %r13, 56(%rsp)
je .L37
movq %r12, %rdx
movq %r13, %rsi
xorl %edi, %edi
call memcpy
addq $24, %r14
movq %r12, -16(%r14)
cmpq %r14, %rbx
jne .L40
.L45:
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
testq %r13, %r13
je .L48
.L105:
movq %r13, %rdi
call _ZdlPv
jmp .L48
.p2align 4,,10
.p2align 3
.L42:
movq %rcx, 8(%rbx)
addq $24, %rbx
decq 48(%rsp)
jne .L46
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
testq %r13, %r13
je .L48
jmp .L105
.p2align 4,,10
.p2align 3
.L71:
movq $0, 56(%rsp)
xorl %r13d, %r13d
jmp .L29
.p2align 4,,10
.p2align 3
.L35:
testq %r13, %r13
je .L106
vzeroupper
jmp .L47
.L73:
movl $1, %eax
movl $1, %r8d
jmp .L59
.L74:
movl $1, %eax
movl $1, %r9d
jmp .L64
.L106:
movq 16(%rsp), %rax
movl (%rax), %edx
jmp .L62
.L41:
movq $0, (%r14)
movq $0, 8(%r14)
movq $0, 16(%r14)
.LEHB3:
call _ZSt17__throw_bad_allocv
.LEHE3:
.L104:
movl $.LC2, %edi
vzeroupper
.LEHB4:
call _ZSt20__throw_length_errorPKc
.LEHE4:
.L103:
movl $.LC2, %edi
.LEHB5:
call _ZSt20__throw_length_errorPKc
.LEHE5:
.L78:
movq %rax, %rdi
jmp .L49
.L77:
movq %rax, %rdi
jmp .L50
.L75:
movq %rax, %r12
vzeroupper
jmp .L56
.globl __gxx_personality_v0
.section .gcc_except_table,"a",#progbits
.align 4
.LLSDA2360:
.byte 0xff
.byte 0x3
.uleb128 .LLSDATT2360-.LLSDATTD2360
.LLSDATTD2360:
.byte 0x1
.uleb128 .LLSDACSE2360-.LLSDACSB2360
.LLSDACSB2360:
.uleb128 .LEHB0-.LFB2360
.uleb128 .LEHE0-.LEHB0
.uleb128 0
.uleb128 0
.uleb128 .LEHB1-.LFB2360
.uleb128 .LEHE1-.LEHB1
.uleb128 .L75-.LFB2360
.uleb128 0
.uleb128 .LEHB2-.LFB2360
.uleb128 .LEHE2-.LEHB2
.uleb128 .L77-.LFB2360
.uleb128 0x1
.uleb128 .LEHB3-.LFB2360
.uleb128 .LEHE3-.LEHB3
.uleb128 .L78-.LFB2360
.uleb128 0x1
.uleb128 .LEHB4-.LFB2360
.uleb128 .LEHE4-.LEHB4
.uleb128 .L75-.LFB2360
.uleb128 0
.uleb128 .LEHB5-.LFB2360
.uleb128 .LEHE5-.LEHB5
.uleb128 0
.uleb128 0
.LLSDACSE2360:
.byte 0x1
.byte 0
.align 4
.long 0
.LLSDATT2360:
.text
.cfi_endproc
.section .text.unlikely
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDAC2360
.type _Z14init_dp_matrixmmRK5Model.cold, #function
_Z14init_dp_matrixmmRK5Model.cold:
.LFSB2360:
.L49:
.cfi_def_cfa 6, 16
.cfi_offset 3, -56
.cfi_offset 6, -16
.cfi_offset 12, -48
.cfi_offset 13, -40
.cfi_offset 14, -32
.cfi_offset 15, -24
movq %r14, %rbx
.L50:
vzeroupper
call __cxa_begin_catch
.L53:
cmpq %rbx, %r14
jne .L107
.LEHB6:
call __cxa_rethrow
.LEHE6:
.L76:
movq %rax, %r12
vzeroupper
call __cxa_end_catch
movq 24(%rsp), %rax
movq (%rax), %rdi
testq %rdi, %rdi
je .L56
call _ZdlPv
.L56:
testq %r13, %r13
je .L69
movq %r13, %rdi
call _ZdlPv
.L69:
movq %r12, %rdi
.LEHB7:
call _Unwind_Resume
.LEHE7:
.L107:
movq (%r14), %rdi
testq %rdi, %rdi
je .L52
call _ZdlPv
.L52:
addq $24, %r14
jmp .L53
.cfi_endproc
.LFE2360:
.section .gcc_except_table
.align 4
.LLSDAC2360:
.byte 0xff
.byte 0x3
.uleb128 .LLSDATTC2360-.LLSDATTDC2360
.LLSDATTDC2360:
.byte 0x1
.uleb128 .LLSDACSEC2360-.LLSDACSBC2360
.LLSDACSBC2360:
.uleb128 .LEHB6-.LCOLDB6
.uleb128 .LEHE6-.LEHB6
.uleb128 .L76-.LCOLDB6
.uleb128 0
.uleb128 .LEHB7-.LCOLDB6
.uleb128 .LEHE7-.LEHB7
.uleb128 0
.uleb128 0
.LLSDACSEC2360:
.byte 0x1
.byte 0
.align 4
.long 0
.LLSDATTC2360:
.section .text.unlikely
.text
.size _Z14init_dp_matrixmmRK5Model, .-_Z14init_dp_matrixmmRK5Model
.section .text.unlikely
.size _Z14init_dp_matrixmmRK5Model.cold, .-_Z14init_dp_matrixmmRK5Model.cold
.LCOLDE6:
.text
.LHOTE6:
.section .text._ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev,"axG",#progbits,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED5Ev,comdat
.align 2
.p2align 4
.weak _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.type _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, #function
_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev:
.LFB2637:
.cfi_startproc
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
movq %rdi, %r12
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
movq 8(%rdi), %rbx
movq (%rdi), %rbp
cmpq %rbp, %rbx
je .L109
.p2align 4,,10
.p2align 3
.L113:
movq 0(%rbp), %rdi
testq %rdi, %rdi
je .L110
addq $24, %rbp
call _ZdlPv
cmpq %rbp, %rbx
jne .L113
.L111:
movq (%r12), %rbp
.L109:
testq %rbp, %rbp
je .L115
popq %rbx
.cfi_remember_state
.cfi_def_cfa_offset 24
movq %rbp, %rdi
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
jmp _ZdlPv
.p2align 4,,10
.p2align 3
.L110:
.cfi_restore_state
addq $24, %rbp
cmpq %rbp, %rbx
jne .L113
jmp .L111
.p2align 4,,10
.p2align 3
.L115:
popq %rbx
.cfi_def_cfa_offset 24
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE2637:
.size _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, .-_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.weak _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
.set _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.section .text.unlikely
.LCOLDB7:
.section .text.startup,"ax",#progbits
.LHOTB7:
.p2align 4
.globl main
.type main, #function
main:
.LFB2371:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA2371
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl $2, %edx
movl $10, %esi
subq $48, %rsp
.cfi_def_cfa_offset 64
leaq 16(%rsp), %rdi
leaq 8(%rsp), %rcx
movq $-8, 8(%rsp)
.LEHB8:
call _Z14init_dp_matrixmmRK5Model
.LEHE8:
leaq 16(%rsp), %rdi
.LEHB9:
call _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.LEHE9:
leaq 16(%rsp), %rdi
call _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
addq $48, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 16
xorl %eax, %eax
popq %rbp
.cfi_def_cfa_offset 8
ret
.L119:
.cfi_restore_state
movq %rax, %rbp
jmp .L118
.section .gcc_except_table
.LLSDA2371:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSE2371-.LLSDACSB2371
.LLSDACSB2371:
.uleb128 .LEHB8-.LFB2371
.uleb128 .LEHE8-.LEHB8
.uleb128 0
.uleb128 0
.uleb128 .LEHB9-.LFB2371
.uleb128 .LEHE9-.LEHB9
.uleb128 .L119-.LFB2371
.uleb128 0
.LLSDACSE2371:
.section .text.startup
.cfi_endproc
.section .text.unlikely
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDAC2371
.type main.cold, #function
main.cold:
.LFSB2371:
.L118:
.cfi_def_cfa_offset 64
.cfi_offset 6, -16
leaq 16(%rsp), %rdi
vzeroupper
call _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
movq %rbp, %rdi
.LEHB10:
call _Unwind_Resume
.LEHE10:
.cfi_endproc
.LFE2371:
.section .gcc_except_table
.LLSDAC2371:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSEC2371-.LLSDACSBC2371
.LLSDACSBC2371:
.uleb128 .LEHB10-.LCOLDB7
.uleb128 .LEHE10-.LEHB10
.uleb128 0
.uleb128 0
.LLSDACSEC2371:
.section .text.unlikely
.section .text.startup
.size main, .-main
.section .text.unlikely
.size main.cold, .-main.cold
.LCOLDE7:
.section .text.startup
.LHOTE7:
.p2align 4
.type _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, #function
_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB3017:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $_ZStL8__ioinit, %edi
call _ZNSt8ios_base4InitC1Ev
movl $__dso_handle, %edx
movl $_ZStL8__ioinit, %esi
movl $_ZNSt8ios_base4InitD1Ev, %edi
addq $8, %rsp
.cfi_def_cfa_offset 8
jmp __cxa_atexit
.cfi_endproc
.LFE3017:
.size _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata.cst32,"aM",#progbits,32
.align 32
.LC1:
.long 1
.long 2
.long 3
.long 4
.long 5
.long 6
.long 7
.long 8
.align 32
.LC3:
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.align 32
.LC4:
.quad 0
.quad 3
.quad 6
.quad 0
.align 32
.LC5:
.quad 0
.quad 1
.quad 2
.quad 5
.hidden __dso_handle
.ident "GCC: (Homebrew GCC 9.2.0) 9.2.0"
.section .note.GNU-stack,"",#progbits
The assembly for the non -march=native version is available on godbolt.
What is going wrong, is this a compiler bug or is my program ill formed? How can I mitigate this issue if it is a compiler bug?
Additional info
Compiling with -v:
$ ~/tools/octopus/build/brew/bin/g++-9 -O3 -march=native -S bug.cpp -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-S' '-v' '-shared-libgcc'
/home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /home/dcooke/tools/octopus/build/brew/nonexistent -idirafter /home/dcooke/tools/octopus/build/brew/include -idirafter /usr/include/x86_64-linux-gnu -idirafter /usr/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=33792 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o bug.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP
Compiling with -O2 or less makes the problem go away:
$ g++-9 -O2 -march=native -o bug bug.cpp
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
I tried building on a different machine with Intel chips:
$ rpm -q centos-release
centos-release-7-3.1611.el7.centos.x86_64
$ grep model /proc/cpuinfo | head -2
model : 85
model name : Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
$ g++-9 -O3 -march=native -o bug bug.cpp -v
Reading specs from /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/well/gerton/dan/apps/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-o' 'bug' '-v' '-shared-libgcc'
/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /gpfs1/well/gerton/dan/apps/octopus/build/brew/nonexistent -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/include -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/opt/glibc/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mno-pku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o /tmp/cczPrvHP.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
The correct output...
-ftree-loop-vectorize is the culprit:
$ g++-9 -march=native -O2 -o bug bug.cpp -ftree-loop-vectorize
$ ./bug
{0 0} {-8 0}
{-2048 255} {0 0}
{-2304 255} {0 0}
{-2560 255} {0 0}
{-2816 255} {0 0}
{-3072 255} {0 0}
{-3328 255} {0 0}
{-3584 255} {0 0}
{-3840 255} {0 0}
{0 -16} {0 0}
None of the other O3 flags result in this behaviour.
This turned out to be due to a bug in binutils gas. The solution was to upgrade my binutils to 2.32.

Function doesn't work when running normally, but does while debugging

I have the following code:
#include <iostream>
#include <cmath>
// Primality lookup table for 0..20: primes[i] is 1 (true) when i is prime
// (set at indices 2, 3, 5, 7, 11, 13, 17 and 19) and 0 otherwise.
bool primes[21] = {0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0};
// Returns the largest exponent e such that p^e <= n, i.e. the exponent of the
// largest power of p that does not exceed n.
//
// Computed with exact integer arithmetic. The previous floating-point form,
// pow(n, 1.0 / p), yields the p-th root of n rather than this exponent and
// only happened to agree for small inputs (for example it gave 2 for
// n = 8, p = 2, where the answer is 3).
//
// Returns 0 when p < 2 or n < 1, where no meaningful largest power exists.
int find_last_power(int n, int p){
    if (p < 2 || n < 1){
        return 0;
    }
    int exponent = 0;
    // power is long long so that power * p cannot overflow for any int n, p.
    for (long long power = p; power <= n; power *= p){
        ++exponent;
    }
    return exponent;
}
// Raises base to a non-negative integer exponent using exact integer
// arithmetic (no floating-point rounding).
static long long integer_power(long long base, int exponent){
    long long result = 1;
    for (int k = 0; k < exponent; ++k){
        result *= base;
    }
    return result;
}

// Multiplies together, for every i in 2..n flagged in the primes table, the
// value i raised to find_last_power(n, i), printing each factor as
// "p<i> : <factor>" on its own line, and returns the product.
//
// The factors are computed with integer exponentiation instead of truncating
// the double returned by std::pow: a pow result that is slightly below the
// exact integer truncates to one less and corrupts the product.
//
// Precondition: n must not exceed the last index of the primes table.
long long solve(int n){
    long long solution = 1;
    for (int i=2; i<=n; i++){
        if (primes[i]){
            const long long factor = integer_power(i, find_last_power(n, i));
            std::cout << "p" << i << " : " << factor << std::endl;
            solution *= factor;
        }
    }
    return solution;
}
// Entry point: prints the result of solve(20) to standard output.
int main(){
    const long long answer = solve(20);
    std::cout << answer;
    return 0;
}
primes is an array of n+1 booleans whose value primes[i] is true if i is prime and false if i is composite.
find_last_power(n, p) returns the exponent (int) of the largest power of p that is less than or equal to n.
If you run the program it writes out:
p2 : 16
p3 : 9
p5 : 5
p7 : 7
p11 : 11
p13 : 13
p17 : 17
p19 : 19
214885440 // this is the return value of solve(20)
// it is supposed to be the product of the numbers on the right (16,9...)
But the returned number is not the expected output. The program, however, runs correctly in a debugger, which is why I find it very hard to identify the bug. The expected output should be 232792560.
Any help is appreciated.
As requested, here is the assembler source.
.file "PE_5.cxx"
.text
.section .rdata,"dr"
__ZStL19piecewise_construct:
.space 1
.lcomm __ZStL8__ioinit,1,1
.globl _primes
.data
.align 4
_primes:
.byte 0
.byte 0
.byte 1
.byte 1
.byte 0
.byte 1
.byte 0
.byte 1
.byte 0
.byte 0
.byte 0
.byte 1
.byte 0
.byte 1
.byte 0
.byte 0
.byte 0
.byte 1
.byte 0
.byte 1
.byte 0
.text
.globl __Z15find_last_powerii
.def __Z15find_last_powerii; .scl 2; .type 32; .endef
__Z15find_last_powerii:
LFB1717:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
subl $40, %esp
fildl 12(%ebp)
fld1
fdivp %st, %st(1)
fstpl 4(%esp)
movl 8(%ebp), %eax
movl %eax, (%esp)
call __ZSt3powIidEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_
fnstcw -10(%ebp)
movzwl -10(%ebp), %eax
orb $12, %ah
movw %ax, -12(%ebp)
fldcw -12(%ebp)
fistpl -16(%ebp)
fldcw -10(%ebp)
movl -16(%ebp), %eax
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE1717:
.section .rdata,"dr"
LC2:
.ascii "p\0"
LC3:
.ascii " : \0"
.text
.globl __Z5solvei
.def __Z5solvei; .scl 2; .type 32; .endef
__Z5solvei:
LFB1718:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
pushl %ebx
subl $52, %esp
.cfi_offset 3, -12
movl $1, -16(%ebp)
movl $0, -12(%ebp)
movl $2, -20(%ebp)
L6:
movl -20(%ebp), %eax
cmpl 8(%ebp), %eax
jg L4
movl -20(%ebp), %eax
addl $_primes, %eax
movzbl (%eax), %eax
testb %al, %al
je L5
movl $LC2, 4(%esp)
movl $__ZSt4cout, (%esp)
call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl %eax, %edx
movl -20(%ebp), %eax
movl %eax, (%esp)
movl %edx, %ecx
call __ZNSolsEi
subl $4, %esp
movl $LC3, 4(%esp)
movl %eax, (%esp)
call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movl %eax, %ebx
movl -20(%ebp), %eax
movl %eax, 4(%esp)
movl 8(%ebp), %eax
movl %eax, (%esp)
call __Z15find_last_powerii
movl %eax, 4(%esp)
movl -20(%ebp), %eax
movl %eax, (%esp)
call __ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_
fstpl (%esp)
movl %ebx, %ecx
call __ZNSolsEd
subl $8, %esp
movl $__ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, (%esp)
movl %eax, %ecx
call __ZNSolsEPFRSoS_E
subl $4, %esp
movl -20(%ebp), %eax
movl %eax, 4(%esp)
movl 8(%ebp), %eax
movl %eax, (%esp)
call __Z15find_last_powerii
movl %eax, 4(%esp)
movl -20(%ebp), %eax
movl %eax, (%esp)
call __ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_
fnstcw -26(%ebp)
movzwl -26(%ebp), %eax
orb $12, %ah
movw %ax, -28(%ebp)
fldcw -28(%ebp)
fistpq -40(%ebp)
fldcw -26(%ebp)
movl -40(%ebp), %eax
movl -36(%ebp), %edx
movl -12(%ebp), %ecx
movl %ecx, %ebx
imull %eax, %ebx
movl -16(%ebp), %ecx
imull %edx, %ecx
addl %ebx, %ecx
mull -16(%ebp)
addl %edx, %ecx
movl %ecx, %edx
movl %eax, -16(%ebp)
movl %edx, -12(%ebp)
movl %eax, -16(%ebp)
movl %edx, -12(%ebp)
L5:
addl $1, -20(%ebp)
jmp L6
L4:
movl -16(%ebp), %eax
movl -12(%ebp), %edx
movl -4(%ebp), %ebx
leave
.cfi_restore 5
.cfi_restore 3
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE1718:
.def ___main; .scl 2; .type 32; .endef
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
LFB1719:
.cfi_startproc
leal 4(%esp), %ecx
.cfi_def_cfa 1, 0
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
.cfi_escape 0x10,0x5,0x2,0x75,0
movl %esp, %ebp
pushl %ecx
.cfi_escape 0xf,0x3,0x75,0x7c,0x6
subl $20, %esp
call ___main
movl $20, (%esp)
call __Z5solvei
movl %eax, (%esp)
movl %edx, 4(%esp)
movl $__ZSt4cout, %ecx
call __ZNSolsEx
subl $8, %esp
movl $0, %eax
movl -4(%ebp), %ecx
.cfi_def_cfa 1, 0
leave
.cfi_restore 5
leal -4(%ecx), %esp
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE1719:
.section .text$_ZSt3powIidEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_,"x"
.linkonce discard
.globl __ZSt3powIidEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_
.def __ZSt3powIidEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_; .scl 2; .type 32; .endef
__ZSt3powIidEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_:
LFB1955:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
subl $40, %esp
movl 12(%ebp), %eax
movl %eax, -16(%ebp)
movl 16(%ebp), %eax
movl %eax, -12(%ebp)
fildl 8(%ebp)
fldl -16(%ebp)
fstpl 8(%esp)
fstpl (%esp)
call _pow
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE1955:
.section .text$_ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_,"x"
.linkonce discard
.globl __ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_
.def __ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_; .scl 2; .type 32; .endef
__ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_:
LFB1957:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
subl $24, %esp
fildl 12(%ebp)
fildl 8(%ebp)
fxch %st(1)
fstpl 8(%esp)
fstpl (%esp)
call _pow
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE1957:
.text
.def ___tcf_0; .scl 3; .type 32; .endef
___tcf_0:
LFB2201:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
subl $8, %esp
movl $__ZStL8__ioinit, %ecx
call __ZNSt8ios_base4InitD1Ev
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE2201:
.def __Z41__static_initialization_and_destruction_0ii; .scl 3; .type 32; .endef
__Z41__static_initialization_and_destruction_0ii:
LFB2200:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
subl $24, %esp
cmpl $1, 8(%ebp)
jne L17
cmpl $65535, 12(%ebp)
jne L17
movl $__ZStL8__ioinit, %ecx
call __ZNSt8ios_base4InitC1Ev
movl $___tcf_0, (%esp)
call _atexit
L17:
nop
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE2200:
.def __GLOBAL__sub_I_primes; .scl 3; .type 32; .endef
__GLOBAL__sub_I_primes:
LFB2202:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
subl $24, %esp
movl $65535, 4(%esp)
movl $1, (%esp)
call __Z41__static_initialization_and_destruction_0ii
leave
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE2202:
.section .ctors,"w"
.align 4
.long __GLOBAL__sub_I_primes
.ident "GCC: (MinGW.org GCC-8.2.0-3) 8.2.0"
.def __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc; .scl 2; .type 32; .endef
.def __ZNSolsEi; .scl 2; .type 32; .endef
.def __ZNSolsEd; .scl 2; .type 32; .endef
.def __ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_; .scl 2; .type 32; .endef
.def __ZNSolsEPFRSoS_E; .scl 2; .type 32; .endef
.def __ZNSolsEx; .scl 2; .type 32; .endef
.def _pow; .scl 2; .type 32; .endef
.def __ZNSt8ios_base4InitD1Ev; .scl 2; .type 32; .endef
.def __ZNSt8ios_base4InitC1Ev; .scl 2; .type 32; .endef
.def _atexit; .scl 2; .type 32; .endef
It was compiled with the following commands (on 64-bit Intel i5 4690k, Windows 10):
g++ -S -o asm.s PE_5.cxx
g++ -c asm.s -o output.o
g++ output.o -o out.exe
g++ --version
// g++ (MinGW.org GCC-8.2.0-3) 8.2.0
You have an int overflow. Change the return type
int solve(int n){
with
long long solve(int n){

Why does a simple use of ostringstream generate so much assembly code?

Consider the following simple example that formats a string and an integer using ostringstream and discards the output:
#include <sstream>
// Formats the literal "x = " followed by the integer 42 into a local
// std::ostringstream, then calls str() and discards the returned string.
// Takes no arguments, returns nothing, and produces no output.
void ostringstream_test() {
std::ostringstream ss;
// Two chained insertions: a C string, then an int.
ss << "x = " << 42;
// The std::string returned by str() is intentionally unused.
ss.str();
}
Compiling it with clang++ -S -O3 -DNDEBUG -std=c++14 test.cc generates a ton of assembly code (half a kilobyte of x86-64 instructions, compared to less than a hundred bytes for similar sprintf code) - see the output below. Why does it generate so much code? Is this inherent to the ostringstream API, or is this particular compiler/library doing something wrong?
.globl __Z18ostringstream_testv
.p2align 4, 0x90
__Z18ostringstream_testv: ## #_Z18ostringstream_testv
Lfunc_begin0:
.cfi_startproc
.cfi_personality 155, ___gxx_personality_v0
.cfi_lsda 16, Lexception0
## BB#0:
pushq %rbp
Lcfi0:
.cfi_def_cfa_offset 16
Lcfi1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi2:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $328, %rsp ## imm = 0x148
Lcfi3:
.cfi_offset %rbx, -56
Lcfi4:
.cfi_offset %r12, -48
Lcfi5:
.cfi_offset %r13, -40
Lcfi6:
.cfi_offset %r14, -32
Lcfi7:
.cfi_offset %r15, -24
leaq -256(%rbp), %r14
leaq -360(%rbp), %r12
movq __ZTCNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE0_NS_13basic_ostreamIcS2_EE#GOTPCREL(%rip), %rax
leaq 24(%rax), %rcx
movq %rcx, -368(%rbp)
addq $64, %rax
movq %rax, -256(%rbp)
Ltmp0:
movq %r14, %rdi
movq %r12, %rsi
callq __ZNSt3__18ios_base4initEPv
Ltmp1:
## BB#1:
movq $0, -120(%rbp)
movl $-1, -112(%rbp)
movq __ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rbx
leaq 24(%rbx), %r13
movq %r13, -368(%rbp)
addq $64, %rbx
movq %rbx, -256(%rbp)
Ltmp3:
movq %r12, %rdi
callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEEC2Ev
Ltmp4:
## BB#2:
movq __ZTVNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %r15
addq $16, %r15
movq %r15, -360(%rbp)
movq $0, -272(%rbp)
movq $0, -280(%rbp)
movq $0, -288(%rbp)
movq $0, -296(%rbp)
movl $16, -264(%rbp)
xorps %xmm0, %xmm0
movaps %xmm0, -80(%rbp)
movq $0, -64(%rbp)
Ltmp6:
leaq -80(%rbp), %rsi
movq %r12, %rdi
callq __ZNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strERKNS_12basic_stringIcS2_S4_EE
Ltmp7:
## BB#3:
testb $1, -80(%rbp)
je LBB0_5
## BB#4:
movq -64(%rbp), %rdi
callq __ZdlPv
LBB0_5:
Ltmp9:
leaq L_.str(%rip), %rsi
leaq -368(%rbp), %rdi
movl $4, %edx
callq __ZNSt3__124__put_character_sequenceIcNS_11char_traitsIcEEEERNS_13basic_ostreamIT_T0_EES7_PKS4_m
Ltmp10:
## BB#6:
Ltmp11:
movl $42, %esi
movq %rax, %rdi
callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEElsEi
Ltmp12:
## BB#7:
Ltmp13:
leaq -104(%rbp), %rdi
movq %r12, %rsi
callq __ZNKSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strEv
Ltmp14:
## BB#8:
testb $1, -104(%rbp)
je LBB0_10
## BB#9:
movq -88(%rbp), %rdi
callq __ZdlPv
LBB0_10:
movq %r13, -368(%rbp)
movq %rbx, -256(%rbp)
movq %r15, -360(%rbp)
testb $1, -296(%rbp)
je LBB0_12
## BB#11:
movq -280(%rbp), %rdi
callq __ZdlPv
LBB0_12:
movq %r12, %rdi
callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev
movq __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rsi
addq $8, %rsi
leaq -368(%rbp), %rdi
callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev
movq %r14, %rdi
callq __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev
addq $328, %rsp ## imm = 0x148
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
LBB0_13:
Ltmp8:
movq %rax, -48(%rbp) ## 8-byte Spill
testb $1, -80(%rbp)
je LBB0_18
## BB#14:
movq -64(%rbp), %rdi
callq __ZdlPv
testb $1, -296(%rbp)
jne LBB0_19
jmp LBB0_20
LBB0_16:
Ltmp5:
movq %rax, -48(%rbp) ## 8-byte Spill
jmp LBB0_21
LBB0_15:
Ltmp2:
movq %rax, -48(%rbp) ## 8-byte Spill
jmp LBB0_22
LBB0_17:
Ltmp15:
movq %rax, -48(%rbp) ## 8-byte Spill
movq %r13, -368(%rbp)
movq %rbx, -256(%rbp)
movq %r15, -360(%rbp)
LBB0_18:
testb $1, -296(%rbp)
je LBB0_20
LBB0_19:
movq -280(%rbp), %rdi
callq __ZdlPv
LBB0_20:
movq %r12, %rdi
callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev
LBB0_21:
movq __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rsi
addq $8, %rsi
leaq -368(%rbp), %rdi
callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev
LBB0_22:
movq %r14, %rdi
callq __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev
movq -48(%rbp), %rdi ## 8-byte Reload
callq __Unwind_Resume
Lfunc_end0:
.cfi_endproc
.section __TEXT,__gcc_except_tab
.p2align 2
GCC_except_table0:
Lexception0:
.byte 255 ## #LPStart Encoding = omit
.byte 155 ## #TType Encoding = indirect pcrel sdata4
.asciz "\303\200" ## #TType base offset
.byte 3 ## Call site Encoding = udata4
.byte 65 ## Call site table length
Lset0 = Ltmp0-Lfunc_begin0 ## >> Call Site 1 <<
.long Lset0
Lset1 = Ltmp1-Ltmp0 ## Call between Ltmp0 and Ltmp1
.long Lset1
Lset2 = Ltmp2-Lfunc_begin0 ## jumps to Ltmp2
.long Lset2
.byte 0 ## On action: cleanup
Lset3 = Ltmp3-Lfunc_begin0 ## >> Call Site 2 <<
.long Lset3
Lset4 = Ltmp4-Ltmp3 ## Call between Ltmp3 and Ltmp4
.long Lset4
Lset5 = Ltmp5-Lfunc_begin0 ## jumps to Ltmp5
.long Lset5
.byte 0 ## On action: cleanup
Lset6 = Ltmp6-Lfunc_begin0 ## >> Call Site 3 <<
.long Lset6
Lset7 = Ltmp7-Ltmp6 ## Call between Ltmp6 and Ltmp7
.long Lset7
Lset8 = Ltmp8-Lfunc_begin0 ## jumps to Ltmp8
.long Lset8
.byte 0 ## On action: cleanup
Lset9 = Ltmp9-Lfunc_begin0 ## >> Call Site 4 <<
.long Lset9
Lset10 = Ltmp14-Ltmp9 ## Call between Ltmp9 and Ltmp14
.long Lset10
Lset11 = Ltmp15-Lfunc_begin0 ## jumps to Ltmp15
.long Lset11
.byte 0 ## On action: cleanup
Lset12 = Ltmp14-Lfunc_begin0 ## >> Call Site 5 <<
.long Lset12
Lset13 = Lfunc_end0-Ltmp14 ## Call between Ltmp14 and Lfunc_end0
.long Lset13
.long 0 ## has no landing pad
.byte 0 ## On action: cleanup
.p2align 2
The most likely reason for the difference is that the IOStream implementation is expanded inline while the sprintf() use is just a function call. Nothing inherently prevents IOStreams from being implemented by a library. It does take a tiny bit of abstraction and planning, though: the definition in the standard uses templates, and these are normally just implemented inline. Declaring the typically used instantiations (for character types char and wchar_t) as extern templates and explicitly instantiating them is extra work, though. I showed a long time ago that it does pay off in terms of compile time and, at least, libstdc++ preinstantiates the IOStreams functions in a library. Based on your experiment it seems libc++ doesn’t.

Why is a segmentation fault caused by the order of class member variables?

I've created the following program:
// Accumulates characters into a fixed-size token buffer; a space or newline
// terminates the current token and rewinds the write position.
class CLexer
{
public:
// Sets the write cursor to 0. iCursorPos also carries a default member
// initializer of 0 below, so this assignment repeats that initialization.
CLexer( ) {
iCursorPos = 0;
}
// Feeds one character to the lexer.
//   character - any char other than ' ' or '\n' is stored at the cursor and
//               the cursor advances by one; ' ' or '\n' writes '\0' at the
//               cursor and resets the cursor to 0.
// NOTE(review): iCursorPos is never compared against the buffer size (1024),
// so a long enough run of non-separator characters indexes past the end of
// m_strToken.
void putCharacter(char character)
{
if(character != ' ' && character != '\n') {
m_strToken[iCursorPos] = character;
iCursorPos++;
}
else {
// Terminate the token in place and start the next one at index 0.
m_strToken[iCursorPos] = '\0';
iCursorPos = 0;
}
}
private:
// Token text under construction; the constructor does not initialize it.
char m_strToken[1024];
// Index into m_strToken at which the next character is written.
int iCursorPos = 0;
};
// Driver: constructs a CLexer with automatic storage, feeds it the single
// character 'm', and returns 0. argc and argv are not used.
int main(int argc, char * argv[]) {
CLexer lex;
lex.putCharacter('m');
return 0;
}
Assembler output produced by the compiler:
.file "main.cpp"
.section .text._ZN6CLexerC2Ev,"axG",#progbits,_ZN6CLexerC5Ev,comdat
.align 2
.weak _ZN6CLexerC2Ev
.type _ZN6CLexerC2Ev, #function
/* _ZN6CLexerC2Ev demangles to CLexer::CLexer() (base-object constructor).
   Receives the object pointer in %rdi and stores 0 into the 32-bit field at
   offset 1024 of the object, twice. Offset 1024 is where iCursorPos lands
   after the 1024-byte m_strToken array; the two stores presumably correspond
   to the default member initializer and the assignment in the constructor
   body. Nothing is written to the first 1024 bytes. */
_ZN6CLexerC2Ev:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
/* Spill the object pointer to the frame (unoptimized code reloads it for
   every access). */
movq %rdi, -8(%rbp)
/* First store of 0 to the field at offset 1024. */
movq -8(%rbp), %rax
movl $0, 1024(%rax)
/* Second, identical store of 0 to the same field. */
movq -8(%rbp), %rax
movl $0, 1024(%rax)
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size _ZN6CLexerC2Ev, .-_ZN6CLexerC2Ev
.weak _ZN6CLexerC1Ev
.set _ZN6CLexerC1Ev,_ZN6CLexerC2Ev
.section .text._ZN6CLexer12putCharacterEc,"axG",#progbits,_ZN6CLexer12putCharacterEc,comdat
.align 2
.weak _ZN6CLexer12putCharacterEc
.type _ZN6CLexer12putCharacterEc, #function
/* _ZN6CLexer12putCharacterEc demangles to CLexer::putCharacter(char).
   Inputs: object pointer in %rdi, character in the low byte of %esi.
   For a character other than 32 (' ') or 10 ('\n') it stores the character
   at object + index, where index is the 32-bit value at offset 1024, and
   then increments that value. For 32 or 10 it stores a zero byte at
   object + index and resets the value at offset 1024 to 0.
   The index is used as loaded; the code contains no comparison against the
   array size. */
_ZN6CLexer12putCharacterEc:
.LFB3:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
/* Spill the object pointer and the character argument to the frame. */
movq %rdi, -8(%rbp)
movl %esi, %eax
movb %al, -12(%rbp)
/* Separator test: 32 is ' ', 10 is '\n'; either one branches to .L3. */
cmpb $32, -12(%rbp)
je .L3
cmpb $10, -12(%rbp)
je .L3
/* Non-separator path: load the index from offset 1024 and sign-extend it
   to 64 bits (cltq). */
movq -8(%rbp), %rax
movl 1024(%rax), %eax
movq -8(%rbp), %rdx
cltq
/* Store the character byte at object + index (the array starts at
   offset 0 of the object). */
movzbl -12(%rbp), %ecx
movb %cl, (%rdx,%rax)
/* Increment the index field: load, add 1 via leal, store back. */
movq -8(%rbp), %rax
movl 1024(%rax), %eax
leal 1(%rax), %edx
movq -8(%rbp), %rax
movl %edx, 1024(%rax)
jmp .L4
.L3:
/* Separator path: write a zero byte at object + index ... */
movq -8(%rbp), %rax
movl 1024(%rax), %eax
movq -8(%rbp), %rdx
cltq
movb $0, (%rdx,%rax)
/* ... then reset the index field to 0. */
movq -8(%rbp), %rax
movl $0, 1024(%rax)
.L4:
nop
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE3:
.size _ZN6CLexer12putCharacterEc, .-_ZN6CLexer12putCharacterEc
.text
.globl main
.type main, #function
/* main: reserves 1056 bytes of stack, constructs a CLexer in the frame at
   -1040(%rbp), calls putCharacter on it with the value 109 ('m'), and
   returns 0. */
main:
.LFB4:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $1056, %rsp
/* Spill the two incoming arguments (%edi, %rsi) to the frame; they are not
   read again in this function. */
movl %edi, -1044(%rbp)
movq %rsi, -1056(%rbp)
/* Pass the address of the local object to the constructor
   (_ZN6CLexerC1Ev is set as an alias of _ZN6CLexerC2Ev earlier in this
   listing). */
leaq -1040(%rbp), %rax
movq %rax, %rdi
call _ZN6CLexerC1Ev
/* Call putCharacter(object, 109); 109 is the character code of 'm'. */
leaq -1040(%rbp), %rax
movl $109, %esi
movq %rax, %rdi
call _ZN6CLexer12putCharacterEc
/* Return value 0. */
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4:
.size main, .-main
.ident "GCC: (GNU) 6.1.1 20160501"
.section .note.GNU-stack,"",#progbits
After execution, the first call to the putCharacter method, with the character 'm' as its parameter, causes a segfault.
Attaching gdb gives the following output:
Program received signal SIGSEGV, Segmentation fault.
0x00000000004018e5 in CLexer::putCharacter (this=0x7fffffffe370,
character=109 'm') at src/main.cpp:60
60 m_strToken[iCursorPos] = character;
I've managed to fix this error by moving the iCursorPos variable above m_strToken in the class declaration, but I think it isn't the proper way to fix this issue.
I'm using g++ (GCC) 6.1.1 20160501 on the latest, fully updated version of Arch Linux x86_64.
if(character != ' ' && character != '\n') {
m_strToken[iCursorPos] = character;
iCursorPos++;
}
You don't check that iCursorPos < 1024 here, so you write past the end of the buffer, into iCursorPos itself.
The next access, m_strToken[iCursorPos] = character;, then probably writes way past the end of the buffer, and you get a segfault (luckily).
Your "fix" still isn't correct, since you corrupt other parts of your object's memory regardless.

Why are sin/cos slower when optimizations are enabled?

After reading a question related to the performance of sin/cos (Why is std::sin() and std::cos() slower than sin() and cos()?), I ran some tests with its code and found a weird thing: if I call sin/cos with a float value, it is much slower than with double when compiled with optimization.
#include <cmath>
#include <cstdio>
// Side length of the two square tables below.
const int N = 4000;
// N x N table of float that main() fills with cos(ang) for every (i, j).
float cosine[N][N];
// N x N table of float that main() fills with sin(ang) for every (i, j).
float sine[N][N];
// Benchmark body: fills cosine[i][j] and sine[i][j] for all 0 <= i, j < N
// with the cosine and sine of the angle i*j*2*M_PI/N.
int main() {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
// i*j*2 is evaluated in int; the multiplication by M_PI and the division
// by N are done in double; the result is then narrowed to float.
float ang = i*j*2*M_PI/N;
// cos and sin are called unqualified (not std::cos / std::sin) with a
// float argument. In the -O0 listing below this becomes calls to the
// double functions cos and sin; in the -O3 listing, one call to sincosf.
cosine[i][j] = cos(ang);
sine[i][j] = sin(ang);
}
}
}
With the above code I get:
With -O0: 2.402s
With -O1: 9.004s
With -O2: 9.013s
With -O3: 9.001s
Now if I change
float ang = i*j*2*M_PI/N;
To
double ang = i*j*2*M_PI/N;
I get:
With -O0: 2.362s
With -O1: 1.188s
With -O2: 1.197s
With -O3: 1.197s
How can the first test be that much faster without optimizations?
I'm using g++ (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2, 64 bits.
EDIT: Changed the title to better describe the problem.
EDIT: Added assembly code
Assembly for first test with O0:
.file "main.cpp"
.globl cosine
.bss
.align 32
.type cosine, #object
.size cosine, 64000000
cosine:
.zero 64000000
.globl sine
.align 32
.type sine, #object
.size sine, 64000000
sine:
.zero 64000000
.text
.globl main
.type main, #function
main:
.LFB87:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
movq %rsp, %rbp
.cfi_offset 6, -16
.cfi_def_cfa_register 6
subq $16, %rsp
movl $0, -4(%rbp)
jmp .L2
.L5:
movl $0, -8(%rbp)
jmp .L3
.L4:
movl -4(%rbp), %eax
imull -8(%rbp), %eax
addl %eax, %eax
cvtsi2sd %eax, %xmm0
movsd .LC0(%rip), %xmm1
mulsd %xmm1, %xmm0
movsd .LC1(%rip), %xmm1
divsd %xmm1, %xmm0
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
movss %xmm0, -12(%rbp)
movss -12(%rbp), %xmm0
cvtps2pd %xmm0, %xmm0
call cos
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
movl -8(%rbp), %eax
cltq
movl -4(%rbp), %edx
movslq %edx, %rdx
imulq $4000, %rdx, %rdx
leaq (%rdx,%rax), %rax
movss %xmm0, cosine(,%rax,4)
movss -12(%rbp), %xmm0
cvtps2pd %xmm0, %xmm0
call sin
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
movl -8(%rbp), %eax
cltq
movl -4(%rbp), %edx
movslq %edx, %rdx
imulq $4000, %rdx, %rdx
leaq (%rdx,%rax), %rax
movss %xmm0, sine(,%rax,4)
addl $1, -8(%rbp)
.L3:
cmpl $3999, -8(%rbp)
setle %al
testb %al, %al
jne .L4
addl $1, -4(%rbp)
.L2:
cmpl $3999, -4(%rbp)
setle %al
testb %al, %al
jne .L5
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE87:
.size main, .-main
.section .rodata
.align 4
.type _ZL1N, #object
.size _ZL1N, 4
_ZL1N:
.long 4000
.align 8
.LC0:
.long 1413754136
.long 1074340347
.align 8
.LC1:
.long 0
.long 1085227008
.ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
.section .note.GNU-stack,"",#progbits
Assembly for first test with O3:
.file "main.cpp"
.text
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB121:
.cfi_startproc
pushq %r15
.cfi_def_cfa_offset 16
xorl %r15d, %r15d
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
movl $cosine+16000, %r14d
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
xorl %r13d, %r13d
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
pushq %rbp
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $24, %rsp
.cfi_def_cfa_offset 80
.p2align 4,,10
.p2align 3
.L2:
movslq %r15d, %rbp
.cfi_offset 3, -56
.cfi_offset 6, -48
.cfi_offset 12, -40
movl %r13d, %r12d
movl $0x3f800000, %edx
imulq $16000, %rbp, %rbp
xorl %eax, %eax
leaq cosine(%rbp), %rbx
addq $sine, %rbp
jmp .L5
.p2align 4,,10
.p2align 3
.L3:
movl %r12d, %eax
leaq 8(%rsp), %rsi
leaq 12(%rsp), %rdi
subl %r13d, %eax
cvtsi2sd %eax, %xmm0
mulsd .LC2(%rip), %xmm0
divsd .LC3(%rip), %xmm0
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
call sincosf
movl 8(%rsp), %edx
movl 12(%rsp), %eax
.L5:
movl %edx, (%rbx)
addq $4, %rbx
movl %eax, 0(%rbp)
addl %r13d, %r12d
addq $4, %rbp
cmpq %r14, %rbx
jne .L3
addl $1, %r15d
addl $2, %r13d
leaq 16000(%rbx), %r14
cmpl $4000, %r15d
jne .L2
addq $24, %rsp
.cfi_def_cfa_offset 56
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE121:
.size main, .-main
.globl cosine
.bss
.align 32
.type cosine, #object
.size cosine, 64000000
cosine:
.zero 64000000
.globl sine
.align 32
.type sine, #object
.size sine, 64000000
sine:
.zero 64000000
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC2:
.long 1413754136
.long 1074340347
.align 8
.LC3:
.long 0
.long 1085227008
.ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
.section .note.GNU-stack,"",#progbits
Here's a possibility:
In C, cos is double precision and cosf is single precision. In C++, std::cos has overloads for both double and single.
You aren't calling std::cos. If <cmath> doesn't also overload ::cos (as far as I know, it is not required to), then you are just calling the C double precision function. If this is the case, then you're suffering the cost of converting between float, double, and back.
Now, some standard libraries implement cos(float x) as (float)cos((double)x), so even if you are calling the float function it might still be doing conversions behind the scenes.
This shouldn't account for a 9x performance difference, though.
AFAIK it's because computers work at double precision natively. Using float requires conversions.