Slow std::string concatenation on windows - c++
I have a program that needs to concatenate lots of strings together (to be more precise, integers converted to strings). On my Ubuntu machine (running g++ 7.3.0) the code runs in 1.5 seconds. But the code needs to run on Windows as well (running g++ 6.3.0 using MinGW), where it takes 15 seconds to complete. Furthermore, the Ubuntu setup runs on a much slower laptop using an i7-4712MQ CPU @ 2.30GHz, whereas the Windows machine runs on an i7-7700K CPU @ 4.20GHz.
The code to reproduce the times is shown below. I compile the code with g++ tester.cpp -O2 -o tester (or tester.exe for windows)
#include <iostream>
#include <chrono>
// Benchmark: appends the decimal text of `a` plus a single space to one
// std::string `n` times and prints the elapsed time in seconds.
// argc/argv are accepted but never read.
int main(int argc, char const *argv[]) {
// Start timestamp; string construction and reserve() below are inside the timed region.
auto started = std::chrono::high_resolution_clock::now();
std::string str = "";
const int n = 10000000;
// `a` stays 1, so every iteration appends exactly two characters ("1" and " ");
// reserving 2 * n up front means `str` itself does not need to grow in the loop.
str.reserve(2 * n);
int a = 1;
for (int i = 0; i < n; ++i) {
// Builds a temporary string from to_string(a) and " " on every iteration,
// then appends that temporary to `str`.
str += std::to_string(a) + " ";
}
auto done = std::chrono::high_resolution_clock::now();
// Elapsed time is truncated to whole milliseconds, then converted to seconds.
double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000;
std::cout << "Done in " << secs << "\n";
return 0;
}
Any idea where the large performance gap might come from?
The disassemblies look like this:
Ubuntu:
.file "tester.cpp"
.text
.align 2
.p2align 4,,15
.type _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, #function
_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19:
.LFB2389:
.cfi_startproc
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
movq %rsi, %r12
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
movq %rdx, %rbx
movq %rdi, %rbp
subq %rsi, %rbx
subq $16, %rsp
.cfi_def_cfa_offset 48
movq %fs:40, %rax
movq %rax, 8(%rsp)
xorl %eax, %eax
cmpq $15, %rbx
movq %rbx, (%rsp)
ja .L12
movq (%rdi), %rdx
cmpq $1, %rbx
movq %rdx, %rax
jne .L4
movzbl (%rsi), %eax
movb %al, (%rdx)
movq (%rdi), %rdx
.L5:
movq (%rsp), %rax
movq %rax, 8(%rbp)
movb $0, (%rdx,%rax)
movq 8(%rsp), %rax
xorq %fs:40, %rax
jne .L13
addq $16, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 32
popq %rbx
.cfi_def_cfa_offset 24
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
ret
.L12:
.cfi_restore_state
xorl %edx, %edx
movq %rsp, %rsi
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm#PLT
movq (%rsp), %rdx
movq %rax, 0(%rbp)
movq %rdx, 16(%rbp)
.L3:
movq %rbx, %rdx
movq %r12, %rsi
movq %rax, %rdi
call memcpy#PLT
movq 0(%rbp), %rdx
jmp .L5
.L4:
testq %rbx, %rbx
je .L5
jmp .L3
.L13:
call __stack_chk_fail#PLT
.cfi_endproc
.LFE2389:
.size _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, .-_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.set _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23,_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.section .text._ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,"axG",#progbits,_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,comdat
.p2align 4,,15
.weak _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.type _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, #function
_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z:
.LFB1953:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsi, %r10
movq %rdx, %rsi
movq %rcx, %rdx
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %rdi, %r12
subq $208, %rsp
testb %al, %al
movq %r8, -160(%rbp)
movq %r9, -152(%rbp)
je .L15
movaps %xmm0, -144(%rbp)
movaps %xmm1, -128(%rbp)
movaps %xmm2, -112(%rbp)
movaps %xmm3, -96(%rbp)
movaps %xmm4, -80(%rbp)
movaps %xmm5, -64(%rbp)
movaps %xmm6, -48(%rbp)
movaps %xmm7, -32(%rbp)
.L15:
movq %fs:40, %rax
movq %rax, -200(%rbp)
xorl %eax, %eax
leaq 30(%rsi), %rax
leaq -224(%rbp), %rcx
andq $-16, %rax
movl $32, -224(%rbp)
movl $48, -220(%rbp)
subq %rax, %rsp
leaq 16(%rbp), %rax
leaq 15(%rsp), %rbx
movq %rax, -216(%rbp)
leaq -192(%rbp), %rax
andq $-16, %rbx
movq %rbx, %rdi
movq %rax, -208(%rbp)
call *%r10
leaq 16(%r12), %rdx
movq %r12, %rdi
movq %rbx, %rsi
movq %rdx, (%r12)
movslq %eax, %rdx
addq %rbx, %rdx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23
movq -200(%rbp), %rdi
xorq %fs:40, %rdi
movq %r12, %rax
jne .L18
leaq -16(%rbp), %rsp
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L18:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
.LFE1953:
.size _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, .-_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string ""
.LC1:
.string "%d"
.LC2:
.string "basic_string::append"
.LC3:
.string " "
.LC5:
.string "Done in "
.LC6:
.string "\n"
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB1871:
.cfi_startproc
.cfi_personality 0x9b,DW.ref.__gxx_personality_v0
.cfi_lsda 0x1b,.LLSDA1871
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
subq $136, %rsp
.cfi_def_cfa_offset 192
leaq 16(%rsp), %r13
movq %fs:40, %rax
movq %rax, 120(%rsp)
xorl %eax, %eax
call _ZNSt6chrono3_V212system_clock3nowEv#PLT
leaq .LC0(%rip), %rdx
movq %rax, (%rsp)
leaq 16(%r13), %rax
movq %r13, %rdi
movq %rdx, %rsi
movq %rax, 16(%rsp)
.LEHB0:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.LEHE0:
movl $20000000, %esi
movq %r13, %rdi
.LEHB1:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEm#PLT
.LEHE1:
leaq 48(%rsp), %rbp
leaq 80(%rsp), %rax
movl $10000000, %ebx
movabsq $9223372036854775807, %r14
leaq 96(%rsp), %r12
movq %rax, 8(%rsp)
leaq 16(%rbp), %r15
jmp .L25
.p2align 4,,10
.p2align 3
.L21:
movq %rcx, 80(%rsp)
movq 16(%rax), %rcx
movq %rcx, 96(%rsp)
.L22:
movq 8(%rax), %rcx
movb $0, 16(%rax)
movq %r13, %rdi
movq %rcx, 88(%rsp)
movq %rdx, (%rax)
movq $0, 8(%rax)
movq 80(%rsp), %rsi
movq 88(%rsp), %rdx
.LEHB2:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm#PLT
.LEHE2:
movq 80(%rsp), %rdi
cmpq %r12, %rdi
je .L23
call _ZdlPv#PLT
.L23:
movq 48(%rsp), %rdi
cmpq %r15, %rdi
je .L24
call _ZdlPv#PLT
.L24:
subl $1, %ebx
je .L40
.L25:
movq vsnprintf#GOTPCREL(%rip), %rsi
leaq .LC1(%rip), %rcx
movl $1, %r8d
movl $16, %edx
movq %rbp, %rdi
xorl %eax, %eax
.LEHB3:
call _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.LEHE3:
cmpq %r14, 56(%rsp)
je .L41
leaq .LC3(%rip), %rsi
movl $1, %edx
movq %rbp, %rdi
.LEHB4:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm#PLT
.LEHE4:
movq %r12, 80(%rsp)
movq (%rax), %rcx
leaq 16(%rax), %rdx
cmpq %rdx, %rcx
jne .L21
movdqu 16(%rax), %xmm0
movaps %xmm0, 96(%rsp)
jmp .L22
.p2align 4,,10
.p2align 3
.L40:
call _ZNSt6chrono3_V212system_clock3nowEv#PLT
subq (%rsp), %rax
movabsq $4835703278458516699, %rdx
leaq .LC5(%rip), %rsi
pxor %xmm0, %xmm0
leaq _ZSt4cout(%rip), %rdi
movq %rax, %rcx
imulq %rdx
sarq $63, %rcx
sarq $18, %rdx
subq %rcx, %rdx
cvtsi2sdq %rdx, %xmm0
movl $8, %edx
divsd .LC4(%rip), %xmm0
movsd %xmm0, (%rsp)
.LEHB5:
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l#PLT
movsd (%rsp), %xmm0
leaq _ZSt4cout(%rip), %rdi
call _ZNSo9_M_insertIdEERSoT_#PLT
leaq .LC6(%rip), %rsi
movq %rax, %rdi
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#PLT
.LEHE5:
movq 16(%rsp), %rdi
addq $16, %r13
cmpq %r13, %rdi
je .L26
call _ZdlPv#PLT
.L26:
xorl %eax, %eax
movq 120(%rsp), %rbx
xorq %fs:40, %rbx
jne .L42
addq $136, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.L41:
.cfi_restore_state
leaq .LC2(%rip), %rdi
.LEHB6:
call _ZSt20__throw_length_errorPKc#PLT
.LEHE6:
.L35:
movq %rax, %rbx
.L29:
movq 48(%rsp), %rdi
addq $16, %rbp
cmpq %rbp, %rdi
je .L31
call _ZdlPv#PLT
.L31:
movq 16(%rsp), %rdi
addq $16, %r13
cmpq %r13, %rdi
je .L32
call _ZdlPv#PLT
.L32:
movq %rbx, %rdi
.LEHB7:
call _Unwind_Resume#PLT
.LEHE7:
.L34:
movq %rax, %rbx
jmp .L31
.L36:
movq 8(%rsp), %rdx
movq 80(%rsp), %rdi
movq %rax, %rbx
addq $16, %rdx
cmpq %rdx, %rdi
je .L29
call _ZdlPv#PLT
jmp .L29
.L42:
call __stack_chk_fail#PLT
.cfi_endproc
.LFE1871:
.globl __gxx_personality_v0
.section .gcc_except_table,"a",#progbits
.LLSDA1871:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSE1871-.LLSDACSB1871
.LLSDACSB1871:
.uleb128 .LEHB0-.LFB1871
.uleb128 .LEHE0-.LEHB0
.uleb128 0
.uleb128 0
.uleb128 .LEHB1-.LFB1871
.uleb128 .LEHE1-.LEHB1
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB2-.LFB1871
.uleb128 .LEHE2-.LEHB2
.uleb128 .L36-.LFB1871
.uleb128 0
.uleb128 .LEHB3-.LFB1871
.uleb128 .LEHE3-.LEHB3
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB4-.LFB1871
.uleb128 .LEHE4-.LEHB4
.uleb128 .L35-.LFB1871
.uleb128 0
.uleb128 .LEHB5-.LFB1871
.uleb128 .LEHE5-.LEHB5
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB6-.LFB1871
.uleb128 .LEHE6-.LEHB6
.uleb128 .L35-.LFB1871
.uleb128 0
.uleb128 .LEHB7-.LFB1871
.uleb128 .LEHE7-.LEHB7
.uleb128 0
.uleb128 0
.LLSDACSE1871:
.section .text.startup
.size main, .-main
.p2align 4,,15
.type _GLOBAL__sub_I_main, #function
_GLOBAL__sub_I_main:
.LFB2369:
.cfi_startproc
leaq _ZStL8__ioinit(%rip), %rdi
subq $8, %rsp
.cfi_def_cfa_offset 16
call _ZNSt8ios_base4InitC1Ev#PLT
movq _ZNSt8ios_base4InitD1Ev#GOTPCREL(%rip), %rdi
leaq __dso_handle(%rip), %rdx
leaq _ZStL8__ioinit(%rip), %rsi
addq $8, %rsp
.cfi_def_cfa_offset 8
jmp __cxa_atexit#PLT
.cfi_endproc
.LFE2369:
.size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I_main
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC4:
.long 0
.long 1083129856
.hidden DW.ref.__gxx_personality_v0
.weak DW.ref.__gxx_personality_v0
.section .data.DW.ref.__gxx_personality_v0,"awG",#progbits,DW.ref.__gxx_personality_v0,comdat
.align 8
.type DW.ref.__gxx_personality_v0, #object
.size DW.ref.__gxx_personality_v0, 8
DW.ref.__gxx_personality_v0:
.quad __gxx_personality_v0
.hidden __dso_handle
.ident "GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0"
.section .note.GNU-stack,"",#progbits
Windows:
.file "tester.cpp"
.text
.p2align 4,,15
.def ___tcf_0; .scl 3; .type 32; .endef
___tcf_0:
LFB2556:
.cfi_startproc
movl $__ZStL8__ioinit, %ecx
jmp __ZNSt8ios_base4InitD1Ev
.cfi_endproc
LFE2556:
.section .rdata,"dr"
.align 4
LC0:
.ascii "basic_string::_M_construct null not valid\0"
.text
.align 2
.p2align 4,,15
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29; .scl 3; .type 32; .endef
__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29:
LFB2587:
.cfi_startproc
pushl %edi
.cfi_def_cfa_offset 8
.cfi_offset 7, -8
pushl %esi
.cfi_def_cfa_offset 12
.cfi_offset 6, -12
movl %ecx, %esi
pushl %ebx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
subl $32, %esp
.cfi_def_cfa_offset 48
movl 48(%esp), %edi
movl 52(%esp), %ebx
testl %edi, %edi
jne L5
testl %ebx, %ebx
je L5
movl $LC0, (%esp)
call __ZSt19__throw_logic_errorPKc
.p2align 4,,10
L5:
subl %edi, %ebx
cmpl $15, %ebx
movl %ebx, 28(%esp)
ja L22
movl (%esi), %edx
cmpl $1, %ebx
movl %edx, %eax
je L23
testl %ebx, %ebx
jne L6
L8:
movl 28(%esp), %eax
movl %eax, 4(%esi)
movb $0, (%edx,%eax)
addl $32, %esp
.cfi_remember_state
.cfi_def_cfa_offset 16
popl %ebx
.cfi_restore 3
.cfi_def_cfa_offset 12
popl %esi
.cfi_restore 6
.cfi_def_cfa_offset 8
popl %edi
.cfi_restore 7
.cfi_def_cfa_offset 4
ret $8
.p2align 4,,10
L22:
.cfi_restore_state
leal 28(%esp), %eax
movl $0, 4(%esp)
movl %esi, %ecx
movl %eax, (%esp)
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj
.cfi_def_cfa_offset 40
subl $8, %esp
.cfi_def_cfa_offset 48
movl %eax, (%esi)
movl 28(%esp), %edx
movl %edx, 8(%esi)
L6:
movl %ebx, 8(%esp)
movl %edi, 4(%esp)
movl %eax, (%esp)
call _memcpy
movl (%esi), %edx
jmp L8
.p2align 4,,10
L23:
movzbl (%edi), %eax
movb %al, (%edx)
movl (%esi), %edx
jmp L8
.cfi_endproc
LFE2587:
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21; .scl 3; .type 32; .endef
.set __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21,__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29
.section .text$_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z,"x"
.linkonce discard
.p2align 4,,15
.globl __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z
.def __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z; .scl 2; .type 32; .endef
__ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z:
LFB2177:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
pushl %esi
pushl %ebx
subl $16, %esp
.cfi_offset 6, -12
.cfi_offset 3, -16
movl 16(%ebp), %edx
movl 8(%ebp), %esi
leal 30(%edx), %eax
andl $-16, %eax
call ___chkstk_ms
subl %eax, %esp
leal 24(%ebp), %eax
leal 31(%esp), %ebx
movl %edx, 4(%esp)
movl %eax, 12(%esp)
movl 20(%ebp), %eax
andl $-16, %ebx
movl %ebx, (%esp)
movl %eax, 8(%esp)
call *12(%ebp)
leal 8(%esi), %edx
addl %ebx, %eax
movl %esi, %ecx
movl %edx, (%esi)
movl %eax, 4(%esp)
movl %ebx, (%esp)
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29
subl $8, %esp
leal -8(%ebp), %esp
movl %esi, %eax
popl %ebx
.cfi_restore 3
popl %esi
.cfi_restore 6
popl %ebp
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE2177:
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC1:
.ascii "\0"
LC2:
.ascii "%d\0"
LC3:
.ascii "basic_string::append\0"
LC4:
.ascii " \0"
.def ___divdi3; .scl 2; .type 32; .endef
LC6:
.ascii "Done in \0"
LC7:
.ascii "\12\0"
.section .text.startup,"x"
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
LFB2111:
.cfi_startproc
.cfi_personality 0,___gxx_personality_v0
.cfi_lsda 0,LLSDA2111
leal 4(%esp), %ecx
.cfi_def_cfa 1, 0
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
.cfi_escape 0x10,0x5,0x2,0x75,0
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
pushl %ecx
.cfi_escape 0xf,0x3,0x75,0x70,0x6
.cfi_escape 0x10,0x7,0x2,0x75,0x7c
.cfi_escape 0x10,0x6,0x2,0x75,0x78
.cfi_escape 0x10,0x3,0x2,0x75,0x74
subl $152, %esp
call ___main
call __ZNSt6chrono3_V212system_clock3nowEv
leal -96(%ebp), %ecx
movl %eax, -136(%ebp)
leal -88(%ebp), %eax
movl $LC1, 4(%esp)
movl $LC1, (%esp)
movl %edx, -132(%ebp)
movl %eax, -96(%ebp)
LEHB0:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21
LEHE0:
leal -96(%ebp), %ecx
subl $8, %esp
movl $20000000, (%esp)
LEHB1:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj
LEHE1:
subl $4, %esp
movl $10000000, %edi
leal -72(%ebp), %esi
leal -40(%ebp), %ebx
jmp L32
.p2align 4,,10
L28:
movl %ecx, -48(%ebp)
movl 8(%eax), %ecx
movl %ecx, -40(%ebp)
L29:
movl 4(%eax), %ecx
movb $0, 8(%eax)
movl %ecx, -44(%ebp)
movl %edx, (%eax)
leal -96(%ebp), %ecx
movl $0, 4(%eax)
movl -44(%ebp), %eax
movl %eax, 4(%esp)
movl -48(%ebp), %eax
movl %eax, (%esp)
LEHB2:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj
LEHE2:
movl -48(%ebp), %eax
subl $8, %esp
cmpl %ebx, %eax
je L30
movl %eax, (%esp)
call __ZdlPv
L30:
movl -72(%ebp), %eax
leal -64(%ebp), %edx
cmpl %edx, %eax
je L31
movl %eax, (%esp)
call __ZdlPv
L31:
subl $1, %edi
je L46
L32:
movl $1, 16(%esp)
movl $LC2, 12(%esp)
movl $16, 8(%esp)
movl $_vsnprintf, 4(%esp)
movl %esi, (%esp)
LEHB3:
call __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z
LEHE3:
cmpl $2147483647, -68(%ebp)
je L47
movl $1, 4(%esp)
movl $LC4, (%esp)
movl %esi, %ecx
LEHB4:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj
LEHE4:
movl %ebx, -48(%ebp)
movl (%eax), %ecx
leal 8(%eax), %edx
subl $8, %esp
cmpl %edx, %ecx
jne L28
movl 12(%eax), %ecx
movl %ecx, -120(%ebp)
movl 16(%eax), %ecx
movl %ecx, -124(%ebp)
movl 20(%eax), %ecx
movl %ecx, -128(%ebp)
movl 8(%eax), %ecx
movl %ecx, -40(%ebp)
movl -120(%ebp), %ecx
movl %ecx, -36(%ebp)
movl -124(%ebp), %ecx
movl %ecx, -32(%ebp)
movl -128(%ebp), %ecx
movl %ecx, -28(%ebp)
jmp L29
.p2align 4,,10
L46:
call __ZNSt6chrono3_V212system_clock3nowEv
subl -136(%ebp), %eax
movl $1000000, 8(%esp)
sbbl -132(%ebp), %edx
movl $0, 12(%esp)
movl %eax, (%esp)
movl %edx, 4(%esp)
call ___divdi3
movl %eax, -120(%ebp)
movl %edx, -116(%ebp)
fildq -120(%ebp)
movl $8, 8(%esp)
movl $LC6, 4(%esp)
movl $__ZSt4cout, (%esp)
fdivs LC5
fstpl -120(%ebp)
LEHB5:
call __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
fldl -120(%ebp)
movl $__ZSt4cout, %ecx
fstpl (%esp)
call __ZNSo9_M_insertIdEERSoT_
subl $8, %esp
movl $LC7, 4(%esp)
movl %eax, (%esp)
call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
LEHE5:
movl -96(%ebp), %eax
leal -88(%ebp), %edi
cmpl %edi, %eax
je L43
movl %eax, (%esp)
call __ZdlPv
L43:
leal -16(%ebp), %esp
xorl %eax, %eax
popl %ecx
.cfi_remember_state
.cfi_restore 1
.cfi_def_cfa 1, 0
popl %ebx
.cfi_restore 3
popl %esi
.cfi_restore 6
popl %edi
.cfi_restore 7
popl %ebp
.cfi_restore 5
leal -4(%ecx), %esp
.cfi_def_cfa 4, 4
ret
L47:
.cfi_restore_state
movl $LC3, (%esp)
LEHB6:
call __ZSt20__throw_length_errorPKc
LEHE6:
L41:
movl %eax, %ebx
L36:
movl -72(%ebp), %eax
leal -64(%ebp), %edx
cmpl %edx, %eax
je L38
movl %eax, (%esp)
call __ZdlPv
L38:
movl -96(%ebp), %eax
leal -88(%ebp), %edi
cmpl %edi, %eax
je L39
movl %eax, (%esp)
call __ZdlPv
L39:
movl %ebx, (%esp)
LEHB7:
call __Unwind_Resume
LEHE7:
L42:
movl %eax, %esi
movl -48(%ebp), %eax
cmpl %ebx, %eax
je L35
movl %eax, (%esp)
call __ZdlPv
L35:
movl %esi, %ebx
jmp L36
L40:
movl %eax, %ebx
jmp L38
.cfi_endproc
LFE2111:
.def ___gxx_personality_v0; .scl 2; .type 32; .endef
.section .gcc_except_table,"w"
LLSDA2111:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 LLSDACSE2111-LLSDACSB2111
LLSDACSB2111:
.uleb128 LEHB0-LFB2111
.uleb128 LEHE0-LEHB0
.uleb128 0
.uleb128 0
.uleb128 LEHB1-LFB2111
.uleb128 LEHE1-LEHB1
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB2-LFB2111
.uleb128 LEHE2-LEHB2
.uleb128 L42-LFB2111
.uleb128 0
.uleb128 LEHB3-LFB2111
.uleb128 LEHE3-LEHB3
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB4-LFB2111
.uleb128 LEHE4-LEHB4
.uleb128 L41-LFB2111
.uleb128 0
.uleb128 LEHB5-LFB2111
.uleb128 LEHE5-LEHB5
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB6-LFB2111
.uleb128 LEHE6-LEHB6
.uleb128 L41-LFB2111
.uleb128 0
.uleb128 LEHB7-LFB2111
.uleb128 LEHE7-LEHB7
.uleb128 0
.uleb128 0
LLSDACSE2111:
.section .text.startup,"x"
.p2align 4,,15
.def __GLOBAL__sub_I_main; .scl 3; .type 32; .endef
__GLOBAL__sub_I_main:
LFB2557:
.cfi_startproc
subl $28, %esp
.cfi_def_cfa_offset 32
movl $__ZStL8__ioinit, %ecx
call __ZNSt8ios_base4InitC1Ev
movl $___tcf_0, (%esp)
call _atexit
addl $28, %esp
.cfi_def_cfa_offset 4
ret
.cfi_endproc
LFE2557:
.section .ctors,"w"
.align 4
.long __GLOBAL__sub_I_main
.lcomm __ZStL8__ioinit,1,1
.section .rdata,"dr"
.align 4
LC5:
.long 1148846080
.ident "GCC: (MinGW.org GCC-6.3.0-1) 6.3.0"
.def __ZNSt8ios_base4InitD1Ev; .scl 2; .type 32; .endef
.def __ZSt19__throw_logic_errorPKc; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj; .scl 2; .type 32; .endef
.def _memcpy; .scl 2; .type 32; .endef
.def __ZNSt6chrono3_V212system_clock3nowEv; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj; .scl 2; .type 32; .endef
.def __ZdlPv; .scl 2; .type 32; .endef
.def _vsnprintf; .scl 2; .type 32; .endef
.def __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i; .scl 2; .type 32; .endef
.def __ZNSo9_M_insertIdEERSoT_; .scl 2; .type 32; .endef
.def __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc; .scl 2; .type 32; .endef
.def __ZSt20__throw_length_errorPKc; .scl 2; .type 32; .endef
.def __Unwind_Resume; .scl 2; .type 32; .endef
.def __ZNSt8ios_base4InitC1Ev; .scl 2; .type 32; .endef
.def _atexit; .scl 2; .type 32; .endef
A quick look at the disassembly shows that the Windows version uses movl (i.e. long word, a 32-bit move), while the Linux version uses movq (quad word, 64-bit) and the SSE xmm registers.
My bet is that on Linux you compile for x86-64, while on Windows you target 32-bit x86.
x86-64 always includes the SSE2 extension, while baseline 32-bit x86 does not, so MinGW defaults to no-SSE mode.
If that's the case, building with a 64-bit toolchain on Windows should result in comparable performance. Alternatively, you might enable SSE for 32-bit builds (the -msse2 compiler flag, if I remember correctly).
The mingw.org implementation just seems to be much less efficient than Linux, Visual Studio or mingw-w64.org.
>g++ --version
g++ (MinGW.org GCC-6.3.0-1) 6.3.0
Done in 24.808
>g++ --version
g++ (i686-posix-dwarf-rev2, Built by MinGW-W64 project) 6.3.0
Done in 0.679
Tested with MSYS2 MinGW64:
g++ --version
g++.exe (Rev2, Built by MSYS2 project) 7.3.0
g++.exe -Wall -O3 -mtune=native -fno-exceptions -fno-rtti -c main.cpp -o main.o
g++.exe -o test.exe main.o -s
Done in 0.547
Env: Windows 10 x64
CPU: Intel Core i5-6300U, 2.4GHz
RAM: 16GB DDR4
In any case, MinGW uses msvcrt.dll (the C runtime bundled with Windows, not the Universal CRT or a Visual Studio CRT) instead of GNU libc, so in my experience the speed gap may come from the C standard library.
P.S. with some changes (same compiler flags)
#include <iostream>
#include <chrono>
#ifdef _WIN32
#include <windows.h>
static std::size_t page_size() noexcept {
::SYSTEM_INFO si;
::GetSystemInfo(&si);
return si.dwPageSize;
}
#else
#include <sys/types.h>
#include <unistd.h>
// POSIX variant: returns the memory page size in bytes as reported by
// sysconf(_SC_PAGESIZE).
//
// sysconf() signals failure by returning -1. Casting that straight to
// std::size_t would yield an enormous value and corrupt any alignment mask
// derived from it, so a non-positive result falls back to 4096 bytes.
static std::size_t page_size() noexcept {
    constexpr std::size_t fallback_page_size = 4096;
    const long reported = ::sysconf(_SC_PAGESIZE);
    if (reported <= 0) {
        return fallback_page_size;
    }
    return static_cast<std::size_t>(reported);
}
#endif // _WIN32
// Variant benchmark: converts the integer to a string once, outside the loop,
// then appends that string and a space character `n` times, and prints the
// elapsed time in seconds. argc/argv are accepted but never read.
int main(int argc, char const *argv[]) {
auto started = std::chrono::high_resolution_clock::now();
const std::size_t n = 10000000;
// Round 2 * n (two characters per iteration) up to a multiple of the page
// size. The mask arithmetic below assumes page_size() is a power of two.
const std::size_t al = page_size() - 1;
const std::size_t buff_size = ( (n << 1) + al) & ~al;
std::string str;
str.reserve(buff_size);
// Unlike a per-iteration std::to_string call, the conversion happens only once here.
const std::string to_append( std::to_string(1) );
for (std::size_t i = 0; i < n; ++i) {
// Two in-place appends; no temporary string is built per iteration.
str.append( to_append );
str.push_back(' ');
}
auto done = std::chrono::high_resolution_clock::now();
// Elapsed time is truncated to whole milliseconds, then converted to seconds.
double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000;
std::cout << "Done in " << secs << "\n";
return 0;
}
Done in 0.046
Asm output for main function:
main:
pushq %r14
.seh_pushreg %r14
pushq %r13
.seh_pushreg %r13
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
subq $144, %rsp
.seh_stackalloc 144
.seh_endprologue
movl $10000000, %esi
call __main
leaq 96(%rsp), %r13
leaq 64(%rsp), %rbp
call _ZNSt6chrono3_V212system_clock3nowEv
movq %r13, %rcx
leaq 16(%rbp), %r12
movq %rax, %r14
call *__imp_GetSystemInfo(%rip)
movl 100(%rsp), %eax
movq %rbp, %rcx
movq %r12, 64(%rsp)
movq $0, 72(%rsp)
leaq 19999999(%rax), %rdx
negq %rax
movb $0, 80(%rsp)
andq %rax, %rdx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEy
movl $1, 32(%rsp)
movq %r13, %rcx
leaq .LC0(%rip), %r9
movl $16, %r8d
leaq _ZL9vsnprintfPcyPKcS_(%rip), %rdx
call _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_yPKS8_PcEySB_z
jmp .L14
.p2align 4,,10
.L16:
movb $32, (%rdx,%rbx)
.L26:
movq 64(%rsp), %rax
movq %rdi, 72(%rsp)
movb $0, 1(%rax,%rbx)
subq $1, %rsi
je .L27
.L14:
movq 96(%rsp), %rdx
movq 104(%rsp), %r8
movq %rbp, %rcx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcy
movq 72(%rsp), %rbx
movq 64(%rsp), %rdx
movl $15, %eax
leaq 1(%rbx), %rdi
cmpq %r12, %rdx
je .L15
movq 80(%rsp), %rax
.L15:
cmpq %rax, %rdi
jbe .L16
xorl %r9d, %r9d
xorl %r8d, %r8d
movq %rbx, %rdx
movq %rbp, %rcx
movq $1, 32(%rsp)
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_mutateEyyPKcy
movq 64(%rsp), %rax
movb $32, (%rax,%rbx)
jmp .L26
.p2align 4,,10
.L27:
call _ZNSt6chrono3_V212system_clock3nowEv
pxor %xmm1, %xmm1
movl $8, %r8d
movabsq $4835703278458516699, %rdx
subq %r14, %rax
addq $16, %r13
movq %rax, %rcx
imulq %rdx
sarq $63, %rcx
sarq $18, %rdx
subq %rcx, %rdx
movq .refptr._ZSt4cout(%rip), %rcx
cvtsi2sdq %rdx, %xmm1
leaq .LC2(%rip), %rdx
divsd .LC1(%rip), %xmm1
movsd %xmm1, 56(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_x
movsd 56(%rsp), %xmm1
movq .refptr._ZSt4cout(%rip), %rcx
call _ZNSo9_M_insertIdEERSoT_
leaq .LC3(%rip), %rdx
movq %rax, %rcx
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movq 96(%rsp), %rcx
cmpq %r13, %rcx
je .L19
call _ZdlPv
.L19:
movq 64(%rsp), %rcx
addq $16, %rbp
cmpq %rbp, %rcx
je .L20
call _ZdlPv
.L20:
xorl %eax, %eax
addq $144, %rsp
popq %rbx
popq %rsi
popq %rdi
popq %rbp
popq %r12
popq %r13
popq %r14
ret
.seh_endproc
.p2align 4,,15
.def _GLOBAL__sub_I_main; .scl 3; .type 32; .endef
.seh_proc _GLOBAL__sub_I_main
(Just for a sense of proportion) Windows Release target vs. Debug target in Visual Studio C++: by default, the Debug target is compiled without optimization, while the Release target is compiled with /O2 optimization, with /Oi ("Enable Intrinsic Functions"), and with /GL ("Whole Program Optimization"). Your code, on my workstation, Debug x64 vs. Release x64:
Debug: 70 sec.
Release: 0.27 sec.
You build with MinGW (which I am not familiar with). But from a quick search, there is talk about Debug/Release modes, and it seems MinGW has equivalents of the /O2 optimization, /Oi ("Enable Intrinsic Functions"), and /Og ("Enable Global Optimization") flags.
-
Compile with these 3 flags (x64 target), & compare with the VS Release x64 benchmark. Anyway, this is MS default compile optimization for a Release target.
-
Test Environment:
HP 8100, Windows 10 Pro 64 bit, CPU i7 870, 16 GB DDR3 RAM, Visual Studio 2017, Targets: Debug x64 / Release x64
I tried your code on my Windows machine with MinGW 4.8.0 and got ~20 seconds. When I changed the string concatenation to std::stringstream, I got 0.5 seconds:
...
std::stringstream ss;
for (int i = 0; i < n; ++i) {
//str += std::to_string(a) + " ";
ss << a << " ";
}
str = ss.str();
...
Related
Why does march=native corrupt my program?
I'm compiling the program: #include <iostream> #include <vector> #include <cstddef> #include <algorithm> struct Model { int open, extend; }; struct Cell { int a, b; }; typedef std::vector<std::vector<Cell>> DPMatrix; void print(const DPMatrix& matrix) { for (std::size_t i = 0; i < matrix.size(); ++i) { for (std::size_t j = 0; j < matrix[i].size(); ++j) { std::cout << '{' << matrix[i][j].a << ' ' << matrix[i][j].b << "} "; } std::cout << std::endl; } } DPMatrix init_dp_matrix(const std::size_t num_cols, const std::size_t num_rows, const Model& model) { DPMatrix result(num_cols, DPMatrix::value_type(num_rows, Cell())); const int inf = model.open * std::max(num_cols, num_rows); for (int i = 1; i < num_cols; ++i) { result[i][0].b = model.open + (i - 1) * model.extend; } for (int j = 1; j < num_rows; ++j) { result[0][j].a = model.open + (j - 1) * model.extend; } return result; } int main() { const Model model = {-8, -1}; const DPMatrix matrix = init_dp_matrix(10, 2, model); print(matrix); } With GCC 9.2.0: $ g++-9 -v Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9 COLLECT_LTO_WRAPPER=/home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper Target: x86_64-pc-linux-gnu Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0' Thread model: posix gcc version 9.2.0 (Homebrew GCC 
9.2.0) with -march=native: $ g++-9 -O3 -march=native -o bug bug.cpp On an Ubuntu machine with Intel chips: $ lsb_release -a No LSB modules are available. Distributor ID: Ubuntu Description: Ubuntu 18.04.3 LTS Release: 18.04 Codename: bioni $ grep model /proc/cpuinfo | head -2 model : 85 model name : Intel(R) Xeon(R) Platinum 8175M CPU # 2.50GHz Running the program I get bogus output: $ ./bug {0 0} {-8 0} {-2048 255} {0 0} {-2304 255} {0 0} {-2560 255} {0 0} {-2816 255} {0 0} {-3072 255} {0 0} {-3328 255} {0 0} {-3584 255} {0 0} {-3840 255} {0 0} {0 -16} {0 0} If I compile without -march=native I get the correct output: $ g++-9 -O3 -o bug bug.cpp $ ./bug {0 0} {-8 0} {0 -8} {0 0} {0 -9} {0 0} {0 -10} {0 0} {0 -11} {0 0} {0 -12} {0 0} {0 -13} {0 0} {0 -14} {0 0} {0 -15} {0 0} {0 -16} {0 0 The assembly for the -match=native version is: $ g++-9 -O3 -march=native -S bug.cpp $ cat bug.s .file "bug.cpp" .text .section .text._ZNKSt5ctypeIcE8do_widenEc,"axG",#progbits,_ZNKSt5ctypeIcE8do_widenEc,comdat .align 2 .p2align 4 .weak _ZNKSt5ctypeIcE8do_widenEc .type _ZNKSt5ctypeIcE8do_widenEc, #function _ZNKSt5ctypeIcE8do_widenEc: .LFB1303: .cfi_startproc movl %esi, %eax ret .cfi_endproc .LFE1303: .size _ZNKSt5ctypeIcE8do_widenEc, .-_ZNKSt5ctypeIcE8do_widenEc .section .rodata.str1.1,"aMS",#progbits,1 .LC0: .string "} " .text .p2align 4 .globl _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE .type _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, #function _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE: .LFB2359: .cfi_startproc movq (%rdi), %rdx cmpq %rdx, 8(%rdi) je .L23 pushq %r15 .cfi_def_cfa_offset 16 .cfi_offset 15, -16 pushq %r14 .cfi_def_cfa_offset 24 .cfi_offset 14, -24 pushq %r13 .cfi_def_cfa_offset 32 .cfi_offset 13, -32 movabsq $-6148914691236517205, %r13 pushq %r12 .cfi_def_cfa_offset 40 .cfi_offset 12, -40 xorl %r12d, %r12d pushq %rbp .cfi_def_cfa_offset 48 .cfi_offset 6, -48 movq %rdi, %rbp pushq %rbx .cfi_def_cfa_offset 56 .cfi_offset 3, -56 subq $24, %rsp .cfi_def_cfa_offset 
80 .p2align 4,,10 .p2align 3 .L4: leaq (%r12,%r12,2), %rbx salq $3, %rbx addq %rbx, %rdx movq 8(%rdx), %rax xorl %r14d, %r14d cmpq %rax, (%rdx) je .L8 .p2align 4,,10 .p2align 3 .L5: movl $1, %edx leaq 15(%rsp), %rsi movl $_ZSt4cout, %edi movb $123, 15(%rsp) call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l movq %rax, %rdi movq 0(%rbp), %rax leaq 0(,%r14,8), %r15 movq (%rax,%rbx), %rax movl (%rax,%r14,8), %esi incq %r14 call _ZNSolsEi movq %rax, %rdi movl $1, %edx leaq 15(%rsp), %rsi movb $32, 15(%rsp) call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l movq %rax, %rdi movq 0(%rbp), %rax movq (%rax,%rbx), %rax movl 4(%rax,%r15), %esi call _ZNSolsEi movq %rax, %rdi movl $2, %edx movl $.LC0, %esi call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l movq 0(%rbp), %rdx addq %rbx, %rdx movq 8(%rdx), %rax subq (%rdx), %rax sarq $3, %rax cmpq %rax, %r14 jb .L5 .L8: movq _ZSt4cout(%rip), %rax movq -24(%rax), %rax movq _ZSt4cout+240(%rax), %r14 testq %r14, %r14 je .L26 cmpb $0, 56(%r14) je .L9 movsbl 67(%r14), %esi .L10: movl $_ZSt4cout, %edi call _ZNSo3putEc movq %rax, %rdi call _ZNSo5flushEv movq 0(%rbp), %rdx movq 8(%rbp), %rax incq %r12 subq %rdx, %rax sarq $3, %rax imulq %r13, %rax cmpq %r12, %rax ja .L4 addq $24, %rsp .cfi_remember_state .cfi_def_cfa_offset 56 popq %rbx .cfi_def_cfa_offset 48 popq %rbp .cfi_def_cfa_offset 40 popq %r12 .cfi_def_cfa_offset 32 popq %r13 .cfi_def_cfa_offset 24 popq %r14 .cfi_def_cfa_offset 16 popq %r15 .cfi_def_cfa_offset 8 ret .p2align 4,,10 .p2align 3 .L9: .cfi_restore_state movq %r14, %rdi call _ZNKSt5ctypeIcE13_M_widen_initEv movq (%r14), %rax movl $10, %esi movq 48(%rax), %rax cmpq $_ZNKSt5ctypeIcE8do_widenEc, %rax je .L10 movq %r14, %rdi call *%rax movsbl %al, %esi jmp .L10 .L23: .cfi_def_cfa_offset 8 .cfi_restore 3 .cfi_restore 6 .cfi_restore 12 .cfi_restore 13 .cfi_restore 14 .cfi_restore 15 ret .L26: .cfi_def_cfa_offset 80 .cfi_offset 3, 
-56 .cfi_offset 6, -48 .cfi_offset 12, -40 .cfi_offset 13, -32 .cfi_offset 14, -24 .cfi_offset 15, -16 call _ZSt16__throw_bad_castv .cfi_endproc .LFE2359: .size _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE .section .rodata.str1.8,"aMS",#progbits,1 .align 8 .LC2: .string "cannot create std::vector larger than max_size()" .section .text.unlikely,"ax",#progbits .LCOLDB6: .text .LHOTB6: .p2align 4 .globl _Z14init_dp_matrixmmRK5Model .type _Z14init_dp_matrixmmRK5Model, #function _Z14init_dp_matrixmmRK5Model: .LFB2360: .cfi_startproc .cfi_personality 0x3,__gxx_personality_v0 .cfi_lsda 0x3,.LLSDA2360 pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movabsq $1152921504606846975, %rax movq %rsp, %rbp .cfi_def_cfa_register 6 pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx andq $-32, %rsp subq $64, %rsp .cfi_offset 15, -24 .cfi_offset 14, -32 .cfi_offset 13, -40 .cfi_offset 12, -48 .cfi_offset 3, -56 movq %rdi, 24(%rsp) movq %rsi, 40(%rsp) movq %rcx, 16(%rsp) cmpq %rax, %rdx ja .L103 movq %rdx, %r15 testq %rdx, %rdx je .L71 leaq 0(,%rdx,8), %rbx movq %rbx, %rdi .LEHB0: call _Znwm .LEHE0: movq %rax, %r13 leaq -1(%r15), %rax cmpq $3, %rax movq %r15, %rdx movq %r13, %rax jbe .L30 shrq $2, %rdx salq $5, %rdx addq %r13, %rdx vpxor %xmm0, %xmm0, %xmm0 .p2align 4,,10 .p2align 3 .L32: vmovdqu32 %ymm0, (%rax) addq $32, %rax cmpq %rdx, %rax jne .L32 movq %r15, %rcx andq $-4, %rcx movq %r15, %rdx andl $3, %edx leaq 0(%r13,%rcx,8), %rax cmpq %rcx, %r15 je .L33 .L30: movq $0, (%rax) cmpq $1, %rdx je .L33 movq $0, 8(%rax) cmpq $2, %rdx je .L33 movq $0, 16(%rax) cmpq $3, %rdx je .L33 movq $0, 24(%rax) .L33: leaq 0(%r13,%rbx), %rax movq %rax, 56(%rsp) .L29: movabsq $384307168202282325, %rax cmpq %rax, 40(%rsp) ja .L104 movq 40(%rsp), %rax movq 24(%rsp), %r12 leaq (%rax,%rax,2), %rbx movq $0, (%r12) movq $0, 8(%r12) movq $0, 16(%r12) salq $3, %rbx testq %rax, %rax je .L35 movq %rbx, %rdi vzeroupper .LEHB1: call _Znwm .LEHE1: addq 
%rax, %rbx movq %rax, (%r12) movq %rax, 8(%r12) movq %rbx, 16(%r12) movq 56(%rsp), %r12 movq %rax, %r14 subq %r13, %r12 movq %r12, %rax sarq $3, %rax je .L40 movabsq $1152921504606846975, %rdx cmpq %rdx, %rax ja .L41 movq 40(%rsp), %rax movq %r14, %rbx movq %rax, 48(%rsp) .p2align 4,,10 .p2align 3 .L46: movq $0, (%rbx) movq $0, 8(%rbx) movq $0, 16(%rbx) movq %r12, %rdi .LEHB2: call _Znwm .LEHE2: leaq (%rax,%r12), %rcx movq %rax, (%rbx) movq %rcx, 16(%rbx) movq %rax, %rdi cmpq %r13, 56(%rsp) je .L42 movq %r12, %rdx movq %r13, %rsi movq %rcx, 32(%rsp) call memcpy movq 32(%rsp), %rcx addq $24, %rbx movq %rcx, -16(%rbx) decq 48(%rsp) jne .L46 movq 24(%rsp), %rax movq %rbx, 8(%rax) .L47: movq %r13, %rdi call _ZdlPv .L48: movq 16(%rsp), %rax cmpq $1, 40(%rsp) movl (%rax), %edx jbe .L62 movl 4(%rax), %edi movq 24(%rsp), %rax movq (%rax), %rsi movq 40(%rsp), %rax leaq -2(%rax), %rcx cmpq $7, %rcx jbe .L73 movq %rcx, %r8 shrq $3, %r8 leaq (%r8,%r8,2), %r8 salq $6, %r8 vmovdqa64 .LC1(%rip), %ymm3 vmovdqa64 .LC3(%rip), %ymm4 vmovdqa64 .LC4(%rip), %ymm6 vmovdqa64 .LC5(%rip), %ymm5 vpbroadcastd %edi, %ymm10 vpbroadcastd %edx, %ymm9 leaq 24(%rsi), %rax leaq 24(%rsi,%r8), %r8 vpcmpeqd %ymm8, %ymm8, %ymm8 kxnorb %k1, %k1, %k1 .p2align 4,,10 .p2align 3 .L61: vmovdqa64 %ymm3, %ymm0 vpaddd %ymm8, %ymm0, %ymm0 vpmulld %ymm10, %ymm0, %ymm0 vmovdqu64 (%rax), %ymm2 vmovdqu64 96(%rax), %ymm1 vpermt2q 32(%rax), %ymm6, %ymm2 vpermt2q 128(%rax), %ymm6, %ymm1 vpermt2q 64(%rax), %ymm5, %ymm2 vpaddd %ymm9, %ymm0, %ymm0 vpermt2q 160(%rax), %ymm5, %ymm1 kmovb %k1, %k2 addq $192, %rax vpscatterqd %xmm0, 4(,%ymm2,1){%k2} vperm2i128 $17, %ymm0, %ymm0, %ymm0 kmovb %k1, %k3 vpaddd %ymm4, %ymm3, %ymm3 vpscatterqd %xmm0, 4(,%ymm1,1){%k3} cmpq %r8, %rax jne .L61 andq $-8, %rcx leaq 1(%rcx), %r8 leal 1(%rcx), %eax .L59: leaq (%r8,%r8,2), %rcx movq (%rsi,%rcx,8), %r8 leal -1(%rax), %ecx imull %edi, %ecx movq 40(%rsp), %rbx addl %edx, %ecx movl %ecx, 4(%r8) leal 1(%rax), %ecx movslq %ecx, %r8 cmpq %r8, %rbx 
jbe .L62 leaq (%r8,%r8,2), %r8 movq (%rsi,%r8,8), %r9 movl %edi, %r8d imull %eax, %r8d addl %edx, %r8d movl %r8d, 4(%r9) leal 2(%rax), %r8d movslq %r8d, %r9 cmpq %r9, %rbx jbe .L62 imull %edi, %ecx leaq (%r9,%r9,2), %r9 movq (%rsi,%r9,8), %r9 addl %edx, %ecx movl %ecx, 4(%r9) leal 3(%rax), %ecx movslq %ecx, %r9 cmpq %r9, %rbx jbe .L62 imull %edi, %r8d leaq (%r9,%r9,2), %r9 movq (%rsi,%r9,8), %r9 addl %edx, %r8d movl %r8d, 4(%r9) leal 4(%rax), %r8d movslq %r8d, %r9 cmpq %r9, %rbx jbe .L62 imull %edi, %ecx leaq (%r9,%r9,2), %r9 movq (%rsi,%r9,8), %r9 addl %edx, %ecx movl %ecx, 4(%r9) leal 5(%rax), %ecx movslq %ecx, %r9 cmpq %r9, %rbx jbe .L62 imull %edi, %r8d leaq (%r9,%r9,2), %r9 movq (%rsi,%r9,8), %r9 addl %edx, %r8d movl %r8d, 4(%r9) leal 6(%rax), %r8d movslq %r8d, %r9 cmpq %r9, %rbx jbe .L62 imull %edi, %ecx leaq (%r9,%r9,2), %r9 movq (%rsi,%r9,8), %r9 addl $7, %eax addl %edx, %ecx cltq movl %ecx, 4(%r9) cmpq %rax, %rbx jbe .L62 imull %r8d, %edi leaq (%rax,%rax,2), %rax movq (%rsi,%rax,8), %rax leal (%rdi,%rdx), %r8d movl %r8d, 4(%rax) .L62: cmpq $1, %r15 jbe .L27 movq 16(%rsp), %rax leaq -1(%r15), %r8 movl 4(%rax), %edi movq 24(%rsp), %rax movq (%rax), %rax movq (%rax), %rsi leaq -2(%r15), %rax cmpq $6, %rax jbe .L74 movq %r8, %rcx shrq $3, %rcx salq $6, %rcx vmovdqa64 .LC1(%rip), %ymm2 vmovdqa64 .LC3(%rip), %ymm4 vpbroadcastd %edi, %ymm6 vpbroadcastd %edx, %ymm5 movq %rsi, %rax addq %rsi, %rcx vpcmpeqd %ymm3, %ymm3, %ymm3 .p2align 4,,10 .p2align 3 .L66: vmovdqa64 %ymm2, %ymm0 vpaddd %ymm3, %ymm0, %ymm0 vpmulld %ymm6, %ymm0, %ymm0 addq $64, %rax vpaddd %ymm4, %ymm2, %ymm2 vpaddd %ymm5, %ymm0, %ymm0 vmovd %xmm0, -56(%rax) vpextrd $1, %xmm0, -48(%rax) vpextrd $2, %xmm0, -40(%rax) vpextrd $3, %xmm0, -32(%rax) vextracti128 $0x1, %ymm0, %xmm0 vmovd %xmm0, -24(%rax) vpextrd $1, %xmm0, -16(%rax) vpextrd $2, %xmm0, -8(%rax) vpextrd $3, %xmm0, (%rax) cmpq %rcx, %rax jne .L66 movq %r8, %rcx andq $-8, %rcx leaq 1(%rcx), %r9 leal 1(%rcx), %eax cmpq %r8, %rcx je .L27 .L64: 
leal -1(%rax), %ecx imull %edi, %ecx addl %edx, %ecx movl %ecx, (%rsi,%r9,8) leal 1(%rax), %ecx movslq %ecx, %r9 cmpq %r15, %r9 jnb .L27 movl %edi, %r8d imull %eax, %r8d addl %edx, %r8d movl %r8d, (%rsi,%r9,8) leal 2(%rax), %r8d movslq %r8d, %r9 cmpq %r9, %r15 jbe .L27 imull %edi, %ecx addl %edx, %ecx movl %ecx, (%rsi,%r9,8) leal 3(%rax), %ecx movslq %ecx, %r9 cmpq %r15, %r9 jnb .L27 imull %edi, %r8d addl %edx, %r8d movl %r8d, (%rsi,%r9,8) leal 4(%rax), %r8d movslq %r8d, %r9 cmpq %r9, %r15 jbe .L27 imull %edi, %ecx addl %edx, %ecx movl %ecx, (%rsi,%r9,8) leal 5(%rax), %ecx movslq %ecx, %r9 cmpq %r9, %r15 jbe .L27 imull %edi, %r8d addl $6, %eax cltq addl %edx, %r8d movl %r8d, (%rsi,%r9,8) cmpq %rax, %r15 jbe .L27 imull %ecx, %edi addl %edi, %edx movl %edx, (%rsi,%rax,8) .L27: movq 24(%rsp), %rax vzeroupper leaq -40(%rbp), %rsp popq %rbx popq %r12 popq %r13 popq %r14 popq %r15 popq %rbp .cfi_remember_state .cfi_def_cfa 7, 8 ret .p2align 4,,10 .p2align 3 .L37: .cfi_restore_state movq %r12, 8(%r14) addq $24, %r14 cmpq %r14, %rbx je .L45 .L40: movq $0, (%r14) movq %r12, 16(%r14) cmpq %r13, 56(%rsp) je .L37 movq %r12, %rdx movq %r13, %rsi xorl %edi, %edi call memcpy addq $24, %r14 movq %r12, -16(%r14) cmpq %r14, %rbx jne .L40 .L45: movq 24(%rsp), %rax movq %rbx, 8(%rax) testq %r13, %r13 je .L48 .L105: movq %r13, %rdi call _ZdlPv jmp .L48 .p2align 4,,10 .p2align 3 .L42: movq %rcx, 8(%rbx) addq $24, %rbx decq 48(%rsp) jne .L46 movq 24(%rsp), %rax movq %rbx, 8(%rax) testq %r13, %r13 je .L48 jmp .L105 .p2align 4,,10 .p2align 3 .L71: movq $0, 56(%rsp) xorl %r13d, %r13d jmp .L29 .p2align 4,,10 .p2align 3 .L35: testq %r13, %r13 je .L106 vzeroupper jmp .L47 .L73: movl $1, %eax movl $1, %r8d jmp .L59 .L74: movl $1, %eax movl $1, %r9d jmp .L64 .L106: movq 16(%rsp), %rax movl (%rax), %edx jmp .L62 .L41: movq $0, (%r14) movq $0, 8(%r14) movq $0, 16(%r14) .LEHB3: call _ZSt17__throw_bad_allocv .LEHE3: .L104: movl $.LC2, %edi vzeroupper .LEHB4: call _ZSt20__throw_length_errorPKc 
.LEHE4: .L103: movl $.LC2, %edi .LEHB5: call _ZSt20__throw_length_errorPKc .LEHE5: .L78: movq %rax, %rdi jmp .L49 .L77: movq %rax, %rdi jmp .L50 .L75: movq %rax, %r12 vzeroupper jmp .L56 .globl __gxx_personality_v0 .section .gcc_except_table,"a",#progbits .align 4 .LLSDA2360: .byte 0xff .byte 0x3 .uleb128 .LLSDATT2360-.LLSDATTD2360 .LLSDATTD2360: .byte 0x1 .uleb128 .LLSDACSE2360-.LLSDACSB2360 .LLSDACSB2360: .uleb128 .LEHB0-.LFB2360 .uleb128 .LEHE0-.LEHB0 .uleb128 0 .uleb128 0 .uleb128 .LEHB1-.LFB2360 .uleb128 .LEHE1-.LEHB1 .uleb128 .L75-.LFB2360 .uleb128 0 .uleb128 .LEHB2-.LFB2360 .uleb128 .LEHE2-.LEHB2 .uleb128 .L77-.LFB2360 .uleb128 0x1 .uleb128 .LEHB3-.LFB2360 .uleb128 .LEHE3-.LEHB3 .uleb128 .L78-.LFB2360 .uleb128 0x1 .uleb128 .LEHB4-.LFB2360 .uleb128 .LEHE4-.LEHB4 .uleb128 .L75-.LFB2360 .uleb128 0 .uleb128 .LEHB5-.LFB2360 .uleb128 .LEHE5-.LEHB5 .uleb128 0 .uleb128 0 .LLSDACSE2360: .byte 0x1 .byte 0 .align 4 .long 0 .LLSDATT2360: .text .cfi_endproc .section .text.unlikely .cfi_startproc .cfi_personality 0x3,__gxx_personality_v0 .cfi_lsda 0x3,.LLSDAC2360 .type _Z14init_dp_matrixmmRK5Model.cold, #function _Z14init_dp_matrixmmRK5Model.cold: .LFSB2360: .L49: .cfi_def_cfa 6, 16 .cfi_offset 3, -56 .cfi_offset 6, -16 .cfi_offset 12, -48 .cfi_offset 13, -40 .cfi_offset 14, -32 .cfi_offset 15, -24 movq %r14, %rbx .L50: vzeroupper call __cxa_begin_catch .L53: cmpq %rbx, %r14 jne .L107 .LEHB6: call __cxa_rethrow .LEHE6: .L76: movq %rax, %r12 vzeroupper call __cxa_end_catch movq 24(%rsp), %rax movq (%rax), %rdi testq %rdi, %rdi je .L56 call _ZdlPv .L56: testq %r13, %r13 je .L69 movq %r13, %rdi call _ZdlPv .L69: movq %r12, %rdi .LEHB7: call _Unwind_Resume .LEHE7: .L107: movq (%r14), %rdi testq %rdi, %rdi je .L52 call _ZdlPv .L52: addq $24, %r14 jmp .L53 .cfi_endproc .LFE2360: .section .gcc_except_table .align 4 .LLSDAC2360: .byte 0xff .byte 0x3 .uleb128 .LLSDATTC2360-.LLSDATTDC2360 .LLSDATTDC2360: .byte 0x1 .uleb128 .LLSDACSEC2360-.LLSDACSBC2360 .LLSDACSBC2360: .uleb128 
.LEHB6-.LCOLDB6 .uleb128 .LEHE6-.LEHB6 .uleb128 .L76-.LCOLDB6 .uleb128 0 .uleb128 .LEHB7-.LCOLDB6 .uleb128 .LEHE7-.LEHB7 .uleb128 0 .uleb128 0 .LLSDACSEC2360: .byte 0x1 .byte 0 .align 4 .long 0 .LLSDATTC2360: .section .text.unlikely .text .size _Z14init_dp_matrixmmRK5Model, .-_Z14init_dp_matrixmmRK5Model .section .text.unlikely .size _Z14init_dp_matrixmmRK5Model.cold, .-_Z14init_dp_matrixmmRK5Model.cold .LCOLDE6: .text .LHOTE6: .section .text._ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev,"axG",#progbits,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED5Ev,comdat .align 2 .p2align 4 .weak _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev .type _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, #function _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev: .LFB2637: .cfi_startproc pushq %r12 .cfi_def_cfa_offset 16 .cfi_offset 12, -16 movq %rdi, %r12 pushq %rbp .cfi_def_cfa_offset 24 .cfi_offset 6, -24 pushq %rbx .cfi_def_cfa_offset 32 .cfi_offset 3, -32 movq 8(%rdi), %rbx movq (%rdi), %rbp cmpq %rbp, %rbx je .L109 .p2align 4,,10 .p2align 3 .L113: movq 0(%rbp), %rdi testq %rdi, %rdi je .L110 addq $24, %rbp call _ZdlPv cmpq %rbp, %rbx jne .L113 .L111: movq (%r12), %rbp .L109: testq %rbp, %rbp je .L115 popq %rbx .cfi_remember_state .cfi_def_cfa_offset 24 movq %rbp, %rdi popq %rbp .cfi_def_cfa_offset 16 popq %r12 .cfi_def_cfa_offset 8 jmp _ZdlPv .p2align 4,,10 .p2align 3 .L110: .cfi_restore_state addq $24, %rbp cmpq %rbp, %rbx jne .L113 jmp .L111 .p2align 4,,10 .p2align 3 .L115: popq %rbx .cfi_def_cfa_offset 24 popq %rbp .cfi_def_cfa_offset 16 popq %r12 .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE2637: .size _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, .-_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev .weak _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev .set _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev .section .text.unlikely .LCOLDB7: .section .text.startup,"ax",#progbits .LHOTB7: .p2align 4 .globl main .type main, #function main: .LFB2371: .cfi_startproc .cfi_personality 
0x3,__gxx_personality_v0 .cfi_lsda 0x3,.LLSDA2371 pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movl $2, %edx movl $10, %esi subq $48, %rsp .cfi_def_cfa_offset 64 leaq 16(%rsp), %rdi leaq 8(%rsp), %rcx movq $-8, 8(%rsp) .LEHB8: call _Z14init_dp_matrixmmRK5Model .LEHE8: leaq 16(%rsp), %rdi .LEHB9: call _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE .LEHE9: leaq 16(%rsp), %rdi call _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev addq $48, %rsp .cfi_remember_state .cfi_def_cfa_offset 16 xorl %eax, %eax popq %rbp .cfi_def_cfa_offset 8 ret .L119: .cfi_restore_state movq %rax, %rbp jmp .L118 .section .gcc_except_table .LLSDA2371: .byte 0xff .byte 0xff .byte 0x1 .uleb128 .LLSDACSE2371-.LLSDACSB2371 .LLSDACSB2371: .uleb128 .LEHB8-.LFB2371 .uleb128 .LEHE8-.LEHB8 .uleb128 0 .uleb128 0 .uleb128 .LEHB9-.LFB2371 .uleb128 .LEHE9-.LEHB9 .uleb128 .L119-.LFB2371 .uleb128 0 .LLSDACSE2371: .section .text.startup .cfi_endproc .section .text.unlikely .cfi_startproc .cfi_personality 0x3,__gxx_personality_v0 .cfi_lsda 0x3,.LLSDAC2371 .type main.cold, #function main.cold: .LFSB2371: .L118: .cfi_def_cfa_offset 64 .cfi_offset 6, -16 leaq 16(%rsp), %rdi vzeroupper call _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev movq %rbp, %rdi .LEHB10: call _Unwind_Resume .LEHE10: .cfi_endproc .LFE2371: .section .gcc_except_table .LLSDAC2371: .byte 0xff .byte 0xff .byte 0x1 .uleb128 .LLSDACSEC2371-.LLSDACSBC2371 .LLSDACSBC2371: .uleb128 .LEHB10-.LCOLDB7 .uleb128 .LEHE10-.LEHB10 .uleb128 0 .uleb128 0 .LLSDACSEC2371: .section .text.unlikely .section .text.startup .size main, .-main .section .text.unlikely .size main.cold, .-main.cold .LCOLDE7: .section .text.startup .LHOTE7: .p2align 4 .type _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, #function _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE: .LFB3017: .cfi_startproc subq $8, %rsp .cfi_def_cfa_offset 16 movl $_ZStL8__ioinit, %edi call _ZNSt8ios_base4InitC1Ev movl $__dso_handle, %edx movl $_ZStL8__ioinit, %esi movl 
$_ZNSt8ios_base4InitD1Ev, %edi addq $8, %rsp .cfi_def_cfa_offset 8 jmp __cxa_atexit .cfi_endproc .LFE3017: .size _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE .section .init_array,"aw" .align 8 .quad _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE .local _ZStL8__ioinit .comm _ZStL8__ioinit,1,1 .section .rodata.cst32,"aM",#progbits,32 .align 32 .LC1: .long 1 .long 2 .long 3 .long 4 .long 5 .long 6 .long 7 .long 8 .align 32 .LC3: .long 8 .long 8 .long 8 .long 8 .long 8 .long 8 .long 8 .long 8 .align 32 .LC4: .quad 0 .quad 3 .quad 6 .quad 0 .align 32 .LC5: .quad 0 .quad 1 .quad 2 .quad 5 .hidden __dso_handle .ident "GCC: (Homebrew GCC 9.2.0) 9.2.0" .section .note.GNU-stack,"",#progbits The assembly for the non -march=native version is available on godbolt. What is going wrong, is this a compiler bug or is my program ill formed? How can I mitigate this issue if it is a compiler bug? Additional info Compiling with -v: $ ~/tools/octopus/build/brew/bin/g++-9 -O3 -march=native -S bug.cpp -v Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9 Target: x86_64-pc-linux-gnu Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0' Thread model: posix gcc version 9.2.0 (Homebrew GCC 9.2.0) COLLECT_GCC_OPTIONS='-O3' '-march=native' '-S' '-v' '-shared-libgcc' 
/home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /home/dcooke/tools/octopus/build/brew/nonexistent -idirafter /home/dcooke/tools/octopus/build/brew/include -idirafter /usr/include/x86_64-linux-gnu -idirafter /usr/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=33792 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o bug.s GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu) compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP Compiling with -O2 or less makes the problem go away: $ g++-9 -O2 -march=native -o bug bug.cpp $ ./bug {0 0} {-8 0} {0 -8} {0 0} {0 -9} {0 0} {0 -10} {0 0} {0 -11} {0 0} {0 -12} {0 0} {0 -13} {0 0} {0 -14} {0 0} {0 -15} {0 0} {0 -16} {0 0} I tried building on a different machine with Intel chips: $ rpm -q centos-release centos-release-7-3.1611.el7.centos.x86_64 $ grep model /proc/cpuinfo | head -2 model : 85 
model name : Intel(R) Xeon(R) Gold 6148 CPU # 2.40GHz $ g++-9 -O3 -march=native -o bug bug.cpp -v Reading specs from /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs COLLECT_GCC=/well/gerton/dan/apps/octopus/build/brew/bin/g++-9 COLLECT_LTO_WRAPPER=/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper Target: x86_64-pc-linux-gnu Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0' Thread model: posix gcc version 9.2.0 (Homebrew GCC 9.2.0) COLLECT_GCC_OPTIONS='-O3' '-march=native' '-o' 'bug' '-v' '-shared-libgcc' /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /gpfs1/well/gerton/dan/apps/octopus/build/brew/nonexistent -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/include -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/opt/glibc/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase 
-mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mno-pku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o /tmp/cczPrvHP.s GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu) compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP $ ./bug {0 0} {-8 0} {0 -8} {0 0} {0 -9} {0 0} {0 -10} {0 0} {0 -11} {0 0} {0 -12} {0 0} {0 -13} {0 0} {0 -14} {0 0} {0 -15} {0 0} {0 -16} {0 0} The correct output... -ftree-loop-vectorize is the culprit: $ g++-9 -march=native -O2 -o bug bug.cpp -ftree-loop-vectorize $ ./bug {0 0} {-8 0} {-2048 255} {0 0} {-2304 255} {0 0} {-2560 255} {0 0} {-2816 255} {0 0} {-3072 255} {0 0} {-3328 255} {0 0} {-3584 255} {0 0} {-3840 255} {0 0} {0 -16} {0 0} None of the other O3 flags result in this behaviour.
This turned out to be due to a bug in the binutils assembler (gas). The solution was to upgrade my binutils to 2.32.
Function doesn't work when running normally, but does while debugging
I have the following code: #include <iostream> #include <cmath> bool primes[21] = {0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0}; int find_last_power(int n, int p){ return (int) std::pow(n, (double) 1/p); } long long solve(int n){ long long solution = 1; for (int i=2; i<=n; i++){ if (primes[i]){ std::cout << "p" << i << " : " << std::pow(i, find_last_power(n, i)) << std::endl; solution *= static_cast<long long>(std::pow(i, find_last_power(n, i))); } } return solution; } int main(){ std::cout << solve(20); return 0; } primes is an array of n+1 booleans whose value primes[i] is true if i is prime and false if i is composite. find_last_power(n, p) returns the exponent (int) of the largest power of p that is less than or equal to n. If you run the program it writes out: p2 : 16 p3 : 9 p5 : 5 p7 : 7 p11 : 11 p13 : 13 p17 : 17 p19 : 19 214885440 // this is the return value of solve(20) // it is supposed to be the product of the numbers on the right (16,9...) But the returned number is not the expected output. The program, however, runs correctly in a debugger, which is why I find it very hard to identify the bug. The expected output should be 232792560. Any help is appreciated. As requested, here is the assembler source. 
.file "PE_5.cxx" .text .section .rdata,"dr" __ZStL19piecewise_construct: .space 1 .lcomm __ZStL8__ioinit,1,1 .globl _primes .data .align 4 _primes: .byte 0 .byte 0 .byte 1 .byte 1 .byte 0 .byte 1 .byte 0 .byte 1 .byte 0 .byte 0 .byte 0 .byte 1 .byte 0 .byte 1 .byte 0 .byte 0 .byte 0 .byte 1 .byte 0 .byte 1 .byte 0 .text .globl __Z15find_last_powerii .def __Z15find_last_powerii; .scl 2; .type 32; .endef __Z15find_last_powerii: LFB1717: .cfi_startproc pushl %ebp .cfi_def_cfa_offset 8 .cfi_offset 5, -8 movl %esp, %ebp .cfi_def_cfa_register 5 subl $40, %esp fildl 12(%ebp) fld1 fdivp %st, %st(1) fstpl 4(%esp) movl 8(%ebp), %eax movl %eax, (%esp) call __ZSt3powIidEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_ fnstcw -10(%ebp) movzwl -10(%ebp), %eax orb $12, %ah movw %ax, -12(%ebp) fldcw -12(%ebp) fistpl -16(%ebp) fldcw -10(%ebp) movl -16(%ebp), %eax leave .cfi_restore 5 .cfi_def_cfa 4, 4 ret .cfi_endproc LFE1717: .section .rdata,"dr" LC2: .ascii "p\0" LC3: .ascii " : \0" .text .globl __Z5solvei .def __Z5solvei; .scl 2; .type 32; .endef __Z5solvei: LFB1718: .cfi_startproc pushl %ebp .cfi_def_cfa_offset 8 .cfi_offset 5, -8 movl %esp, %ebp .cfi_def_cfa_register 5 pushl %ebx subl $52, %esp .cfi_offset 3, -12 movl $1, -16(%ebp) movl $0, -12(%ebp) movl $2, -20(%ebp) L6: movl -20(%ebp), %eax cmpl 8(%ebp), %eax jg L4 movl -20(%ebp), %eax addl $_primes, %eax movzbl (%eax), %eax testb %al, %al je L5 movl $LC2, 4(%esp) movl $__ZSt4cout, (%esp) call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movl %eax, %edx movl -20(%ebp), %eax movl %eax, (%esp) movl %edx, %ecx call __ZNSolsEi subl $4, %esp movl $LC3, 4(%esp) movl %eax, (%esp) call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movl %eax, %ebx movl -20(%ebp), %eax movl %eax, 4(%esp) movl 8(%ebp), %eax movl %eax, (%esp) call __Z15find_last_powerii movl %eax, 4(%esp) movl -20(%ebp), %eax movl %eax, (%esp) call 
__ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_ fstpl (%esp) movl %ebx, %ecx call __ZNSolsEd subl $8, %esp movl $__ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_, (%esp) movl %eax, %ecx call __ZNSolsEPFRSoS_E subl $4, %esp movl -20(%ebp), %eax movl %eax, 4(%esp) movl 8(%ebp), %eax movl %eax, (%esp) call __Z15find_last_powerii movl %eax, 4(%esp) movl -20(%ebp), %eax movl %eax, (%esp) call __ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_ fnstcw -26(%ebp) movzwl -26(%ebp), %eax orb $12, %ah movw %ax, -28(%ebp) fldcw -28(%ebp) fistpq -40(%ebp) fldcw -26(%ebp) movl -40(%ebp), %eax movl -36(%ebp), %edx movl -12(%ebp), %ecx movl %ecx, %ebx imull %eax, %ebx movl -16(%ebp), %ecx imull %edx, %ecx addl %ebx, %ecx mull -16(%ebp) addl %edx, %ecx movl %ecx, %edx movl %eax, -16(%ebp) movl %edx, -12(%ebp) movl %eax, -16(%ebp) movl %edx, -12(%ebp) L5: addl $1, -20(%ebp) jmp L6 L4: movl -16(%ebp), %eax movl -12(%ebp), %edx movl -4(%ebp), %ebx leave .cfi_restore 5 .cfi_restore 3 .cfi_def_cfa 4, 4 ret .cfi_endproc LFE1718: .def ___main; .scl 2; .type 32; .endef .globl _main .def _main; .scl 2; .type 32; .endef _main: LFB1719: .cfi_startproc leal 4(%esp), %ecx .cfi_def_cfa 1, 0 andl $-16, %esp pushl -4(%ecx) pushl %ebp .cfi_escape 0x10,0x5,0x2,0x75,0 movl %esp, %ebp pushl %ecx .cfi_escape 0xf,0x3,0x75,0x7c,0x6 subl $20, %esp call ___main movl $20, (%esp) call __Z5solvei movl %eax, (%esp) movl %edx, 4(%esp) movl $__ZSt4cout, %ecx call __ZNSolsEx subl $8, %esp movl $0, %eax movl -4(%ebp), %ecx .cfi_def_cfa 1, 0 leave .cfi_restore 5 leal -4(%ecx), %esp .cfi_def_cfa 4, 4 ret .cfi_endproc LFE1719: .section .text$_ZSt3powIidEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_,"x" 
.linkonce discard .globl __ZSt3powIidEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_ .def __ZSt3powIidEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_; .scl 2; .type 32; .endef __ZSt3powIidEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_: LFB1955: .cfi_startproc pushl %ebp .cfi_def_cfa_offset 8 .cfi_offset 5, -8 movl %esp, %ebp .cfi_def_cfa_register 5 subl $40, %esp movl 12(%ebp), %eax movl %eax, -16(%ebp) movl 16(%ebp), %eax movl %eax, -12(%ebp) fildl 8(%ebp) fldl -16(%ebp) fstpl 8(%esp) fstpl (%esp) call _pow leave .cfi_restore 5 .cfi_def_cfa 4, 4 ret .cfi_endproc LFE1955: .section .text$_ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_,"x" .linkonce discard .globl __ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_ .def __ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_; .scl 2; .type 32; .endef __ZSt3powIiiEN9__gnu_cxx11__promote_2IT_T0_NS0_9__promoteIS2_XsrSt12__is_integerIS2_E7__valueEE6__typeENS4_IS3_XsrS5_IS3_E7__valueEE6__typeEE6__typeES2_S3_: LFB1957: .cfi_startproc pushl %ebp .cfi_def_cfa_offset 8 .cfi_offset 5, -8 movl %esp, %ebp .cfi_def_cfa_register 5 subl $24, %esp fildl 12(%ebp) fildl 8(%ebp) fxch %st(1) fstpl 8(%esp) fstpl (%esp) call _pow leave .cfi_restore 5 .cfi_def_cfa 4, 4 ret .cfi_endproc LFE1957: .text .def ___tcf_0; .scl 3; .type 32; .endef ___tcf_0: LFB2201: .cfi_startproc pushl %ebp .cfi_def_cfa_offset 8 .cfi_offset 5, -8 movl %esp, %ebp 
.cfi_def_cfa_register 5 subl $8, %esp movl $__ZStL8__ioinit, %ecx call __ZNSt8ios_base4InitD1Ev leave .cfi_restore 5 .cfi_def_cfa 4, 4 ret .cfi_endproc LFE2201: .def __Z41__static_initialization_and_destruction_0ii; .scl 3; .type 32; .endef __Z41__static_initialization_and_destruction_0ii: LFB2200: .cfi_startproc pushl %ebp .cfi_def_cfa_offset 8 .cfi_offset 5, -8 movl %esp, %ebp .cfi_def_cfa_register 5 subl $24, %esp cmpl $1, 8(%ebp) jne L17 cmpl $65535, 12(%ebp) jne L17 movl $__ZStL8__ioinit, %ecx call __ZNSt8ios_base4InitC1Ev movl $___tcf_0, (%esp) call _atexit L17: nop leave .cfi_restore 5 .cfi_def_cfa 4, 4 ret .cfi_endproc LFE2200: .def __GLOBAL__sub_I_primes; .scl 3; .type 32; .endef __GLOBAL__sub_I_primes: LFB2202: .cfi_startproc pushl %ebp .cfi_def_cfa_offset 8 .cfi_offset 5, -8 movl %esp, %ebp .cfi_def_cfa_register 5 subl $24, %esp movl $65535, 4(%esp) movl $1, (%esp) call __Z41__static_initialization_and_destruction_0ii leave .cfi_restore 5 .cfi_def_cfa 4, 4 ret .cfi_endproc LFE2202: .section .ctors,"w" .align 4 .long __GLOBAL__sub_I_primes .ident "GCC: (MinGW.org GCC-8.2.0-3) 8.2.0" .def __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc; .scl 2; .type 32; .endef .def __ZNSolsEi; .scl 2; .type 32; .endef .def __ZNSolsEd; .scl 2; .type 32; .endef .def __ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_; .scl 2; .type 32; .endef .def __ZNSolsEPFRSoS_E; .scl 2; .type 32; .endef .def __ZNSolsEx; .scl 2; .type 32; .endef .def _pow; .scl 2; .type 32; .endef .def __ZNSt8ios_base4InitD1Ev; .scl 2; .type 32; .endef .def __ZNSt8ios_base4InitC1Ev; .scl 2; .type 32; .endef .def _atexit; .scl 2; .type 32; .endef It was compiled with the following commands (on 64-bit Intel i5 4690k, Windows 10): g++ -S -o asm.s PE_5.cxx g++ -c asm.s -o outtput.o g++ output.o -o out.exe g++ --version // g++ (MinGW.org GCC-8.2.0-3) 8.2.0
You have an int overflow. Change the return type from int solve(int n){ to long long solve(int n){
Why does a simple use of ostringstream generate so much assembly code?
Consider the following simple example that formats a string and an integer using ostringstream and discards the output: #include <sstream> void ostringstream_test() { std::ostringstream ss; ss << "x = " << 42; ss.str(); } Compiling it with clang++ -S -O3 -DNDEBUG -std=c++14 test.cc generates a ton of assembly code (half a kilobyte in x86-64 instructions compared to less than a hundred bytes for a similar sprintf code) - see below the output. Why does it generates so much code, is it inherent to the ostringstream API or this particular compiler/library does something wrong? .globl __Z18ostringstream_testv .p2align 4, 0x90 __Z18ostringstream_testv: ## #_Z18ostringstream_testv Lfunc_begin0: .cfi_startproc .cfi_personality 155, ___gxx_personality_v0 .cfi_lsda 16, Lexception0 ## BB#0: pushq %rbp Lcfi0: .cfi_def_cfa_offset 16 Lcfi1: .cfi_offset %rbp, -16 movq %rsp, %rbp Lcfi2: .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx subq $328, %rsp ## imm = 0x148 Lcfi3: .cfi_offset %rbx, -56 Lcfi4: .cfi_offset %r12, -48 Lcfi5: .cfi_offset %r13, -40 Lcfi6: .cfi_offset %r14, -32 Lcfi7: .cfi_offset %r15, -24 leaq -256(%rbp), %r14 leaq -360(%rbp), %r12 movq __ZTCNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE0_NS_13basic_ostreamIcS2_EE#GOTPCREL(%rip), %rax leaq 24(%rax), %rcx movq %rcx, -368(%rbp) addq $64, %rax movq %rax, -256(%rbp) Ltmp0: movq %r14, %rdi movq %r12, %rsi callq __ZNSt3__18ios_base4initEPv Ltmp1: ## BB#1: movq $0, -120(%rbp) movl $-1, -112(%rbp) movq __ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rbx leaq 24(%rbx), %r13 movq %r13, -368(%rbp) addq $64, %rbx movq %rbx, -256(%rbp) Ltmp3: movq %r12, %rdi callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEEC2Ev Ltmp4: ## BB#2: movq __ZTVNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %r15 addq $16, %r15 movq %r15, -360(%rbp) movq $0, -272(%rbp) movq $0, -280(%rbp) movq $0, -288(%rbp) 
movq $0, -296(%rbp) movl $16, -264(%rbp) xorps %xmm0, %xmm0 movaps %xmm0, -80(%rbp) movq $0, -64(%rbp) Ltmp6: leaq -80(%rbp), %rsi movq %r12, %rdi callq __ZNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strERKNS_12basic_stringIcS2_S4_EE Ltmp7: ## BB#3: testb $1, -80(%rbp) je LBB0_5 ## BB#4: movq -64(%rbp), %rdi callq __ZdlPv LBB0_5: Ltmp9: leaq L_.str(%rip), %rsi leaq -368(%rbp), %rdi movl $4, %edx callq __ZNSt3__124__put_character_sequenceIcNS_11char_traitsIcEEEERNS_13basic_ostreamIT_T0_EES7_PKS4_m Ltmp10: ## BB#6: Ltmp11: movl $42, %esi movq %rax, %rdi callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEElsEi Ltmp12: ## BB#7: Ltmp13: leaq -104(%rbp), %rdi movq %r12, %rsi callq __ZNKSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strEv Ltmp14: ## BB#8: testb $1, -104(%rbp) je LBB0_10 ## BB#9: movq -88(%rbp), %rdi callq __ZdlPv LBB0_10: movq %r13, -368(%rbp) movq %rbx, -256(%rbp) movq %r15, -360(%rbp) testb $1, -296(%rbp) je LBB0_12 ## BB#11: movq -280(%rbp), %rdi callq __ZdlPv LBB0_12: movq %r12, %rdi callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev movq __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rsi addq $8, %rsi leaq -368(%rbp), %rdi callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev movq %r14, %rdi callq __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev addq $328, %rsp ## imm = 0x148 popq %rbx popq %r12 popq %r13 popq %r14 popq %r15 popq %rbp retq LBB0_13: Ltmp8: movq %rax, -48(%rbp) ## 8-byte Spill testb $1, -80(%rbp) je LBB0_18 ## BB#14: movq -64(%rbp), %rdi callq __ZdlPv testb $1, -296(%rbp) jne LBB0_19 jmp LBB0_20 LBB0_16: Ltmp5: movq %rax, -48(%rbp) ## 8-byte Spill jmp LBB0_21 LBB0_15: Ltmp2: movq %rax, -48(%rbp) ## 8-byte Spill jmp LBB0_22 LBB0_17: Ltmp15: movq %rax, -48(%rbp) ## 8-byte Spill movq %r13, -368(%rbp) movq %rbx, -256(%rbp) movq %r15, -360(%rbp) LBB0_18: testb $1, -296(%rbp) je LBB0_20 LBB0_19: movq -280(%rbp), %rdi callq __ZdlPv LBB0_20: movq 
%r12, %rdi callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev LBB0_21: movq __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rsi addq $8, %rsi leaq -368(%rbp), %rdi callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev LBB0_22: movq %r14, %rdi callq __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev movq -48(%rbp), %rdi ## 8-byte Reload callq __Unwind_Resume Lfunc_end0: .cfi_endproc .section __TEXT,__gcc_except_tab .p2align 2 GCC_except_table0: Lexception0: .byte 255 ## #LPStart Encoding = omit .byte 155 ## #TType Encoding = indirect pcrel sdata4 .asciz "\303\200" ## #TType base offset .byte 3 ## Call site Encoding = udata4 .byte 65 ## Call site table length Lset0 = Ltmp0-Lfunc_begin0 ## >> Call Site 1 << .long Lset0 Lset1 = Ltmp1-Ltmp0 ## Call between Ltmp0 and Ltmp1 .long Lset1 Lset2 = Ltmp2-Lfunc_begin0 ## jumps to Ltmp2 .long Lset2 .byte 0 ## On action: cleanup Lset3 = Ltmp3-Lfunc_begin0 ## >> Call Site 2 << .long Lset3 Lset4 = Ltmp4-Ltmp3 ## Call between Ltmp3 and Ltmp4 .long Lset4 Lset5 = Ltmp5-Lfunc_begin0 ## jumps to Ltmp5 .long Lset5 .byte 0 ## On action: cleanup Lset6 = Ltmp6-Lfunc_begin0 ## >> Call Site 3 << .long Lset6 Lset7 = Ltmp7-Ltmp6 ## Call between Ltmp6 and Ltmp7 .long Lset7 Lset8 = Ltmp8-Lfunc_begin0 ## jumps to Ltmp8 .long Lset8 .byte 0 ## On action: cleanup Lset9 = Ltmp9-Lfunc_begin0 ## >> Call Site 4 << .long Lset9 Lset10 = Ltmp14-Ltmp9 ## Call between Ltmp9 and Ltmp14 .long Lset10 Lset11 = Ltmp15-Lfunc_begin0 ## jumps to Ltmp15 .long Lset11 .byte 0 ## On action: cleanup Lset12 = Ltmp14-Lfunc_begin0 ## >> Call Site 5 << .long Lset12 Lset13 = Lfunc_end0-Ltmp14 ## Call between Ltmp14 and Lfunc_end0 .long Lset13 .long 0 ## has no landing pad .byte 0 ## On action: cleanup .p2align 2
The most likely reason for the difference is that the IOStream implementation is expanded inline, while the sprintf() use is just a function call. Nothing inherently prevents IOStreams from being implemented in a library. It does take a tiny bit of abstraction and planning, though: the definition in the standard uses templates, and these are normally just implemented inline. Declaring the typically used instantiations (for the character types char and wchar_t) as extern templates and explicitly instantiating them is extra work. I showed a long time ago that it does pay off in terms of compile time and, at least, libstdc++ preinstantiates the IOStreams functions in a library. Based on your experiment, it seems libc++ doesn’t.
Why is a segmentation fault caused by the order of class variables?
I've created following program : class CLexer { public: CLexer( ) { iCursorPos = 0; } void putCharacter(char character) { if(character != ' ' && character != '\n') { m_strToken[iCursorPos] = character; iCursorPos++; } else { m_strToken[iCursorPos] = '\0'; iCursorPos = 0; } } private: char m_strToken[1024]; int iCursorPos = 0; }; int main(int argc, char * argv[]) { CLexer lex; lex.putCharacter('m'); return 0; } Assembler output produced by compiler : .file "main.cpp" .section .text._ZN6CLexerC2Ev,"axG",#progbits,_ZN6CLexerC5Ev,comdat .align 2 .weak _ZN6CLexerC2Ev .type _ZN6CLexerC2Ev, #function _ZN6CLexerC2Ev: .LFB1: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movq %rdi, -8(%rbp) movq -8(%rbp), %rax movl $0, 1024(%rax) movq -8(%rbp), %rax movl $0, 1024(%rax) nop popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE1: .size _ZN6CLexerC2Ev, .-_ZN6CLexerC2Ev .weak _ZN6CLexerC1Ev .set _ZN6CLexerC1Ev,_ZN6CLexerC2Ev .section .text._ZN6CLexer12putCharacterEc,"axG",#progbits,_ZN6CLexer12putCharacterEc,comdat .align 2 .weak _ZN6CLexer12putCharacterEc .type _ZN6CLexer12putCharacterEc, #function _ZN6CLexer12putCharacterEc: .LFB3: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movq %rdi, -8(%rbp) movl %esi, %eax movb %al, -12(%rbp) cmpb $32, -12(%rbp) je .L3 cmpb $10, -12(%rbp) je .L3 movq -8(%rbp), %rax movl 1024(%rax), %eax movq -8(%rbp), %rdx cltq movzbl -12(%rbp), %ecx movb %cl, (%rdx,%rax) movq -8(%rbp), %rax movl 1024(%rax), %eax leal 1(%rax), %edx movq -8(%rbp), %rax movl %edx, 1024(%rax) jmp .L4 .L3: movq -8(%rbp), %rax movl 1024(%rax), %eax movq -8(%rbp), %rdx cltq movb $0, (%rdx,%rax) movq -8(%rbp), %rax movl $0, 1024(%rax) .L4: nop popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE3: .size _ZN6CLexer12putCharacterEc, .-_ZN6CLexer12putCharacterEc .text .globl main .type main, #function main: .LFB4: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 
.cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 subq $1056, %rsp movl %edi, -1044(%rbp) movq %rsi, -1056(%rbp) leaq -1040(%rbp), %rax movq %rax, %rdi call _ZN6CLexerC1Ev leaq -1040(%rbp), %rax movl $109, %esi movq %rax, %rdi call _ZN6CLexer12putCharacterEc movl $0, %eax leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE4: .size main, .-main .ident "GCC: (GNU) 6.1.1 20160501" .section .note.GNU-stack,"",#progbits And after execution, first call to putCharacter method with 'm' character as parameter is throwing segfault. Attached gdb is giving following output : Program received signal SIGSEGV, Segmentation fault. 0x00000000004018e5 in CLexer::putCharacter (this=0x7fffffffe370, character=109 'm') at src/main.cpp:60 60 m_strToken[iCursorPos] = character; I've managed to fix this error by moving iCursorPos variable above m_strToken in class declaration but i think it isn't proper way to fix this issue. I'm using g++ (GCC) 6.1.1 20160501 on the lastest and updated version of ArchLinux x86_64.
if(character != ' ' && character != '\n') { m_strToken[iCursorPos] = character; iCursorPos++; } You don't check that iCursorPos < 1024 here, so you can write past the end of the buffer, into iCursorPos itself. The next access m_strToken[iCursorPos] = character; then probably writes way past the end of the buffer, and you get a segfault (luckily). Your "fix" still isn't correct, since you corrupt other parts of your object's memory regardless.
Why are sin/cos slower when optimizations are enabled?
After reading a question related with the performance of sin/cos (Why is std::sin() and std::cos() slower than sin() and cos()?), I made some tests with his code and found a weird thing: If i call sin/cos with a float value, it is much slower than with double when compiled with optimization. #include <cmath> #include <cstdio> const int N = 4000; float cosine[N][N]; float sine[N][N]; int main() { for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { float ang = i*j*2*M_PI/N; cosine[i][j] = cos(ang); sine[i][j] = sin(ang); } } } With the above code I get: With -O0: 2.402s With -O1: 9.004s With -O2: 9.013s With -O3: 9.001s Now if I change float ang = i*j*2*M_PI/N; To double ang = i*j*2*M_PI/N; I get: With -O0: 2.362s With -O1: 1.188s With -O2: 1.197s With -O3: 1.197s How can the first test be that faster without optimizations? I'm using g++ (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2, 64 bits. EDIT: Changed the title to better describe the problem. EDIT: Added assembly code Assembly for first test with O0: .file "main.cpp" .globl cosine .bss .align 32 .type cosine, #object .size cosine, 64000000 cosine: .zero 64000000 .globl sine .align 32 .type sine, #object .size sine, 64000000 sine: .zero 64000000 .text .globl main .type main, #function main: .LFB87: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 movq %rsp, %rbp .cfi_offset 6, -16 .cfi_def_cfa_register 6 subq $16, %rsp movl $0, -4(%rbp) jmp .L2 .L5: movl $0, -8(%rbp) jmp .L3 .L4: movl -4(%rbp), %eax imull -8(%rbp), %eax addl %eax, %eax cvtsi2sd %eax, %xmm0 movsd .LC0(%rip), %xmm1 mulsd %xmm1, %xmm0 movsd .LC1(%rip), %xmm1 divsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 cvtpd2ps %xmm0, %xmm0 movss %xmm0, -12(%rbp) movss -12(%rbp), %xmm0 cvtps2pd %xmm0, %xmm0 call cos unpcklpd %xmm0, %xmm0 cvtpd2ps %xmm0, %xmm0 movl -8(%rbp), %eax cltq movl -4(%rbp), %edx movslq %edx, %rdx imulq $4000, %rdx, %rdx leaq (%rdx,%rax), %rax movss %xmm0, cosine(,%rax,4) movss -12(%rbp), %xmm0 cvtps2pd %xmm0, %xmm0 call sin unpcklpd %xmm0, %xmm0 
cvtpd2ps %xmm0, %xmm0 movl -8(%rbp), %eax cltq movl -4(%rbp), %edx movslq %edx, %rdx imulq $4000, %rdx, %rdx leaq (%rdx,%rax), %rax movss %xmm0, sine(,%rax,4) addl $1, -8(%rbp) .L3: cmpl $3999, -8(%rbp) setle %al testb %al, %al jne .L4 addl $1, -4(%rbp) .L2: cmpl $3999, -4(%rbp) setle %al testb %al, %al jne .L5 movl $0, %eax leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE87: .size main, .-main .section .rodata .align 4 .type _ZL1N, #object .size _ZL1N, 4 _ZL1N: .long 4000 .align 8 .LC0: .long 1413754136 .long 1074340347 .align 8 .LC1: .long 0 .long 1085227008 .ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2" .section .note.GNU-stack,"",#progbits Assembly for first test with O3: .file "main.cpp" .text .p2align 4,,15 .globl main .type main, #function main: .LFB121: .cfi_startproc pushq %r15 .cfi_def_cfa_offset 16 xorl %r15d, %r15d .cfi_offset 15, -16 pushq %r14 .cfi_def_cfa_offset 24 movl $cosine+16000, %r14d .cfi_offset 14, -24 pushq %r13 .cfi_def_cfa_offset 32 xorl %r13d, %r13d .cfi_offset 13, -32 pushq %r12 .cfi_def_cfa_offset 40 pushq %rbp .cfi_def_cfa_offset 48 pushq %rbx .cfi_def_cfa_offset 56 subq $24, %rsp .cfi_def_cfa_offset 80 .p2align 4,,10 .p2align 3 .L2: movslq %r15d, %rbp .cfi_offset 3, -56 .cfi_offset 6, -48 .cfi_offset 12, -40 movl %r13d, %r12d movl $0x3f800000, %edx imulq $16000, %rbp, %rbp xorl %eax, %eax leaq cosine(%rbp), %rbx addq $sine, %rbp jmp .L5 .p2align 4,,10 .p2align 3 .L3: movl %r12d, %eax leaq 8(%rsp), %rsi leaq 12(%rsp), %rdi subl %r13d, %eax cvtsi2sd %eax, %xmm0 mulsd .LC2(%rip), %xmm0 divsd .LC3(%rip), %xmm0 unpcklpd %xmm0, %xmm0 cvtpd2ps %xmm0, %xmm0 call sincosf movl 8(%rsp), %edx movl 12(%rsp), %eax .L5: movl %edx, (%rbx) addq $4, %rbx movl %eax, 0(%rbp) addl %r13d, %r12d addq $4, %rbp cmpq %r14, %rbx jne .L3 addl $1, %r15d addl $2, %r13d leaq 16000(%rbx), %r14 cmpl $4000, %r15d jne .L2 addq $24, %rsp .cfi_def_cfa_offset 56 xorl %eax, %eax popq %rbx .cfi_def_cfa_offset 48 popq %rbp .cfi_def_cfa_offset 40 popq %r12 
.cfi_def_cfa_offset 32 popq %r13 .cfi_def_cfa_offset 24 popq %r14 .cfi_def_cfa_offset 16 popq %r15 .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE121: .size main, .-main .globl cosine .bss .align 32 .type cosine, #object .size cosine, 64000000 cosine: .zero 64000000 .globl sine .align 32 .type sine, #object .size sine, 64000000 sine: .zero 64000000 .section .rodata.cst8,"aM",#progbits,8 .align 8 .LC2: .long 1413754136 .long 1074340347 .align 8 .LC3: .long 0 .long 1085227008 .ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2" .section .note.GNU-stack,"",#progbits
Here's a possibility: In C, cos is double precision and cosf is single precision. In C++, std::cos has overloads for both double and single. You aren't calling std::cos. If <cmath> doesn't also overload ::cos (as far as I know, it is not required to), then you are just calling the C double precision function. If this is the case, then you're suffering the cost of converting between float, double, and back. Now, some standard libraries implement cos(float x) as (float)cos((double)x), so even if you are calling the float function it might still be doing conversions behind the scenes. This shouldn't account for a 9x performance difference, though.
AFAIK it's because computers work at double precision natively. Using float requires conversions.