Why does -march=native corrupt my program? - c++
I'm compiling the program:
#include <iostream>
#include <vector>
#include <cstddef>
#include <algorithm>
// Pair of integer parameters consumed by init_dp_matrix, which fills border
// cells with `open + (k - 1) * extend`.
// NOTE(review): presumably gap-open / gap-extend penalties of an alignment
// scoring model -- confirm with the author.
struct Model {
    int open;    // value used for the first border cell (k == 1)
    int extend;  // increment added for each further border cell
};
// A single matrix entry holding two independent integer values.
struct Cell {
    int a;
    int b;
};

// The matrix is stored as a vector of vectors of Cell.
using DPMatrix = std::vector<std::vector<Cell>>;
void print(const DPMatrix& matrix)
{
for (std::size_t i = 0; i < matrix.size(); ++i) {
for (std::size_t j = 0; j < matrix[i].size(); ++j) {
std::cout << '{' << matrix[i][j].a << ' ' << matrix[i][j].b << "} ";
}
std::cout << std::endl;
}
}
// Builds a matrix with num_cols inner vectors of num_rows value-initialised
// (all-zero) cells each, then fills two borders:
//   result[i][0].b = model.open + (i - 1) * model.extend   for 1 <= i < num_cols
//   result[0][j].a = model.open + (j - 1) * model.extend   for 1 <= j < num_rows
// Returns the matrix by value.
// NOTE(review): this function is the reproducer whose generated assembly is
// quoted further down, so the code is deliberately left exactly as posted.
DPMatrix init_dp_matrix(const std::size_t num_cols, const std::size_t num_rows, const Model& model)
{
// The outer vector has num_cols elements; every inner vector has num_rows.
DPMatrix result(num_cols, DPMatrix::value_type(num_rows, Cell()));
// NOTE(review): `inf` is never read in this function. The multiplication is
// also carried out in std::size_t (model.open is converted to unsigned)
// before the result is narrowed back to int.
const int inf = model.open * std::max(num_cols, num_rows);
// NOTE(review): `i` is int while num_cols is std::size_t, so this comparison
// mixes signed and unsigned operands. Indexing [0] of each inner vector
// assumes num_rows >= 1.
for (int i = 1; i < num_cols; ++i) {
result[i][0].b = model.open + (i - 1) * model.extend;
}
// NOTE(review): same signed/unsigned mix as above. result[0] assumes
// num_cols >= 1 whenever num_rows > 1.
for (int j = 1; j < num_rows; ++j) {
result[0][j].a = model.open + (j - 1) * model.extend;
}
return result;
}
int main()
{
const Model model = {-8, -1};
const DPMatrix matrix = init_dp_matrix(10, 2, model);
print(matrix);
}
With GCC 9.2.0:
$ g++-9 -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
with -march=native:
$ g++-9 -O3 -march=native -o bug bug.cpp
On an Ubuntu machine with Intel chips:
$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 18.04.3 LTS
Release: 18.04
Codename: bionic
$ grep model /proc/cpuinfo | head -2
model : 85
model name : Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz
Running the program I get bogus output:
$ ./bug
{0 0} {-8 0}
{-2048 255} {0 0}
{-2304 255} {0 0}
{-2560 255} {0 0}
{-2816 255} {0 0}
{-3072 255} {0 0}
{-3328 255} {0 0}
{-3584 255} {0 0}
{-3840 255} {0 0}
{0 -16} {0 0}
If I compile without -march=native I get the correct output:
$ g++-9 -O3 -o bug bug.cpp
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
The assembly for the -march=native version is:
$ g++-9 -O3 -march=native -S bug.cpp
$ cat bug.s
.file "bug.cpp"
.text
.section .text._ZNKSt5ctypeIcE8do_widenEc,"axG",#progbits,_ZNKSt5ctypeIcE8do_widenEc,comdat
.align 2
.p2align 4
.weak _ZNKSt5ctypeIcE8do_widenEc
.type _ZNKSt5ctypeIcE8do_widenEc, #function
_ZNKSt5ctypeIcE8do_widenEc:
.LFB1303:
.cfi_startproc
movl %esi, %eax
ret
.cfi_endproc
.LFE1303:
.size _ZNKSt5ctypeIcE8do_widenEc, .-_ZNKSt5ctypeIcE8do_widenEc
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string "} "
.text
.p2align 4
.globl _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.type _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, #function
_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB2359:
.cfi_startproc
movq (%rdi), %rdx
cmpq %rdx, 8(%rdi)
je .L23
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
movabsq $-6148914691236517205, %r13
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
xorl %r12d, %r12d
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
movq %rdi, %rbp
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
subq $24, %rsp
.cfi_def_cfa_offset 80
.p2align 4,,10
.p2align 3
.L4:
leaq (%r12,%r12,2), %rbx
salq $3, %rbx
addq %rbx, %rdx
movq 8(%rdx), %rax
xorl %r14d, %r14d
cmpq %rax, (%rdx)
je .L8
.p2align 4,,10
.p2align 3
.L5:
movl $1, %edx
leaq 15(%rsp), %rsi
movl $_ZSt4cout, %edi
movb $123, 15(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq %rax, %rdi
movq 0(%rbp), %rax
leaq 0(,%r14,8), %r15
movq (%rax,%rbx), %rax
movl (%rax,%r14,8), %esi
incq %r14
call _ZNSolsEi
movq %rax, %rdi
movl $1, %edx
leaq 15(%rsp), %rsi
movb $32, 15(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq %rax, %rdi
movq 0(%rbp), %rax
movq (%rax,%rbx), %rax
movl 4(%rax,%r15), %esi
call _ZNSolsEi
movq %rax, %rdi
movl $2, %edx
movl $.LC0, %esi
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq 0(%rbp), %rdx
addq %rbx, %rdx
movq 8(%rdx), %rax
subq (%rdx), %rax
sarq $3, %rax
cmpq %rax, %r14
jb .L5
.L8:
movq _ZSt4cout(%rip), %rax
movq -24(%rax), %rax
movq _ZSt4cout+240(%rax), %r14
testq %r14, %r14
je .L26
cmpb $0, 56(%r14)
je .L9
movsbl 67(%r14), %esi
.L10:
movl $_ZSt4cout, %edi
call _ZNSo3putEc
movq %rax, %rdi
call _ZNSo5flushEv
movq 0(%rbp), %rdx
movq 8(%rbp), %rax
incq %r12
subq %rdx, %rax
sarq $3, %rax
imulq %r13, %rax
cmpq %r12, %rax
ja .L4
addq $24, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L9:
.cfi_restore_state
movq %r14, %rdi
call _ZNKSt5ctypeIcE13_M_widen_initEv
movq (%r14), %rax
movl $10, %esi
movq 48(%rax), %rax
cmpq $_ZNKSt5ctypeIcE8do_widenEc, %rax
je .L10
movq %r14, %rdi
call *%rax
movsbl %al, %esi
jmp .L10
.L23:
.cfi_def_cfa_offset 8
.cfi_restore 3
.cfi_restore 6
.cfi_restore 12
.cfi_restore 13
.cfi_restore 14
.cfi_restore 15
ret
.L26:
.cfi_def_cfa_offset 80
.cfi_offset 3, -56
.cfi_offset 6, -48
.cfi_offset 12, -40
.cfi_offset 13, -32
.cfi_offset 14, -24
.cfi_offset 15, -16
call _ZSt16__throw_bad_castv
.cfi_endproc
.LFE2359:
.size _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.section .rodata.str1.8,"aMS",#progbits,1
.align 8
.LC2:
.string "cannot create std::vector larger than max_size()"
.section .text.unlikely,"ax",#progbits
.LCOLDB6:
.text
.LHOTB6:
.p2align 4
.globl _Z14init_dp_matrixmmRK5Model
.type _Z14init_dp_matrixmmRK5Model, #function
_Z14init_dp_matrixmmRK5Model:
.LFB2360:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA2360
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movabsq $1152921504606846975, %rax
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 15, -24
.cfi_offset 14, -32
.cfi_offset 13, -40
.cfi_offset 12, -48
.cfi_offset 3, -56
movq %rdi, 24(%rsp)
movq %rsi, 40(%rsp)
movq %rcx, 16(%rsp)
cmpq %rax, %rdx
ja .L103
movq %rdx, %r15
testq %rdx, %rdx
je .L71
leaq 0(,%rdx,8), %rbx
movq %rbx, %rdi
.LEHB0:
call _Znwm
.LEHE0:
movq %rax, %r13
leaq -1(%r15), %rax
cmpq $3, %rax
movq %r15, %rdx
movq %r13, %rax
jbe .L30
shrq $2, %rdx
salq $5, %rdx
addq %r13, %rdx
vpxor %xmm0, %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L32:
vmovdqu32 %ymm0, (%rax)
addq $32, %rax
cmpq %rdx, %rax
jne .L32
movq %r15, %rcx
andq $-4, %rcx
movq %r15, %rdx
andl $3, %edx
leaq 0(%r13,%rcx,8), %rax
cmpq %rcx, %r15
je .L33
.L30:
movq $0, (%rax)
cmpq $1, %rdx
je .L33
movq $0, 8(%rax)
cmpq $2, %rdx
je .L33
movq $0, 16(%rax)
cmpq $3, %rdx
je .L33
movq $0, 24(%rax)
.L33:
leaq 0(%r13,%rbx), %rax
movq %rax, 56(%rsp)
.L29:
movabsq $384307168202282325, %rax
cmpq %rax, 40(%rsp)
ja .L104
movq 40(%rsp), %rax
movq 24(%rsp), %r12
leaq (%rax,%rax,2), %rbx
movq $0, (%r12)
movq $0, 8(%r12)
movq $0, 16(%r12)
salq $3, %rbx
testq %rax, %rax
je .L35
movq %rbx, %rdi
vzeroupper
.LEHB1:
call _Znwm
.LEHE1:
addq %rax, %rbx
movq %rax, (%r12)
movq %rax, 8(%r12)
movq %rbx, 16(%r12)
movq 56(%rsp), %r12
movq %rax, %r14
subq %r13, %r12
movq %r12, %rax
sarq $3, %rax
je .L40
movabsq $1152921504606846975, %rdx
cmpq %rdx, %rax
ja .L41
movq 40(%rsp), %rax
movq %r14, %rbx
movq %rax, 48(%rsp)
.p2align 4,,10
.p2align 3
.L46:
movq $0, (%rbx)
movq $0, 8(%rbx)
movq $0, 16(%rbx)
movq %r12, %rdi
.LEHB2:
call _Znwm
.LEHE2:
leaq (%rax,%r12), %rcx
movq %rax, (%rbx)
movq %rcx, 16(%rbx)
movq %rax, %rdi
cmpq %r13, 56(%rsp)
je .L42
movq %r12, %rdx
movq %r13, %rsi
movq %rcx, 32(%rsp)
call memcpy
movq 32(%rsp), %rcx
addq $24, %rbx
movq %rcx, -16(%rbx)
decq 48(%rsp)
jne .L46
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
.L47:
movq %r13, %rdi
call _ZdlPv
.L48:
movq 16(%rsp), %rax
cmpq $1, 40(%rsp)
movl (%rax), %edx
jbe .L62
movl 4(%rax), %edi
movq 24(%rsp), %rax
movq (%rax), %rsi
movq 40(%rsp), %rax
leaq -2(%rax), %rcx
cmpq $7, %rcx
jbe .L73
movq %rcx, %r8
shrq $3, %r8
leaq (%r8,%r8,2), %r8
salq $6, %r8
vmovdqa64 .LC1(%rip), %ymm3
vmovdqa64 .LC3(%rip), %ymm4
vmovdqa64 .LC4(%rip), %ymm6
vmovdqa64 .LC5(%rip), %ymm5
vpbroadcastd %edi, %ymm10
vpbroadcastd %edx, %ymm9
leaq 24(%rsi), %rax
leaq 24(%rsi,%r8), %r8
vpcmpeqd %ymm8, %ymm8, %ymm8
kxnorb %k1, %k1, %k1
.p2align 4,,10
.p2align 3
.L61:
vmovdqa64 %ymm3, %ymm0
vpaddd %ymm8, %ymm0, %ymm0
vpmulld %ymm10, %ymm0, %ymm0
vmovdqu64 (%rax), %ymm2
vmovdqu64 96(%rax), %ymm1
vpermt2q 32(%rax), %ymm6, %ymm2
vpermt2q 128(%rax), %ymm6, %ymm1
vpermt2q 64(%rax), %ymm5, %ymm2
vpaddd %ymm9, %ymm0, %ymm0
vpermt2q 160(%rax), %ymm5, %ymm1
kmovb %k1, %k2
addq $192, %rax
vpscatterqd %xmm0, 4(,%ymm2,1){%k2}
vperm2i128 $17, %ymm0, %ymm0, %ymm0
kmovb %k1, %k3
vpaddd %ymm4, %ymm3, %ymm3
vpscatterqd %xmm0, 4(,%ymm1,1){%k3}
cmpq %r8, %rax
jne .L61
andq $-8, %rcx
leaq 1(%rcx), %r8
leal 1(%rcx), %eax
.L59:
leaq (%r8,%r8,2), %rcx
movq (%rsi,%rcx,8), %r8
leal -1(%rax), %ecx
imull %edi, %ecx
movq 40(%rsp), %rbx
addl %edx, %ecx
movl %ecx, 4(%r8)
leal 1(%rax), %ecx
movslq %ecx, %r8
cmpq %r8, %rbx
jbe .L62
leaq (%r8,%r8,2), %r8
movq (%rsi,%r8,8), %r9
movl %edi, %r8d
imull %eax, %r8d
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 2(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %ecx
movl %ecx, 4(%r9)
leal 3(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %r8d
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 4(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %ecx
movl %ecx, 4(%r9)
leal 5(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %r8d
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 6(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl $7, %eax
addl %edx, %ecx
cltq
movl %ecx, 4(%r9)
cmpq %rax, %rbx
jbe .L62
imull %r8d, %edi
leaq (%rax,%rax,2), %rax
movq (%rsi,%rax,8), %rax
leal (%rdi,%rdx), %r8d
movl %r8d, 4(%rax)
.L62:
cmpq $1, %r15
jbe .L27
movq 16(%rsp), %rax
leaq -1(%r15), %r8
movl 4(%rax), %edi
movq 24(%rsp), %rax
movq (%rax), %rax
movq (%rax), %rsi
leaq -2(%r15), %rax
cmpq $6, %rax
jbe .L74
movq %r8, %rcx
shrq $3, %rcx
salq $6, %rcx
vmovdqa64 .LC1(%rip), %ymm2
vmovdqa64 .LC3(%rip), %ymm4
vpbroadcastd %edi, %ymm6
vpbroadcastd %edx, %ymm5
movq %rsi, %rax
addq %rsi, %rcx
vpcmpeqd %ymm3, %ymm3, %ymm3
.p2align 4,,10
.p2align 3
.L66:
vmovdqa64 %ymm2, %ymm0
vpaddd %ymm3, %ymm0, %ymm0
vpmulld %ymm6, %ymm0, %ymm0
addq $64, %rax
vpaddd %ymm4, %ymm2, %ymm2
vpaddd %ymm5, %ymm0, %ymm0
vmovd %xmm0, -56(%rax)
vpextrd $1, %xmm0, -48(%rax)
vpextrd $2, %xmm0, -40(%rax)
vpextrd $3, %xmm0, -32(%rax)
vextracti128 $0x1, %ymm0, %xmm0
vmovd %xmm0, -24(%rax)
vpextrd $1, %xmm0, -16(%rax)
vpextrd $2, %xmm0, -8(%rax)
vpextrd $3, %xmm0, (%rax)
cmpq %rcx, %rax
jne .L66
movq %r8, %rcx
andq $-8, %rcx
leaq 1(%rcx), %r9
leal 1(%rcx), %eax
cmpq %r8, %rcx
je .L27
.L64:
leal -1(%rax), %ecx
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 1(%rax), %ecx
movslq %ecx, %r9
cmpq %r15, %r9
jnb .L27
movl %edi, %r8d
imull %eax, %r8d
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
leal 2(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 3(%rax), %ecx
movslq %ecx, %r9
cmpq %r15, %r9
jnb .L27
imull %edi, %r8d
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
leal 4(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 5(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %r8d
addl $6, %eax
cltq
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
cmpq %rax, %r15
jbe .L27
imull %ecx, %edi
addl %edi, %edx
movl %edx, (%rsi,%rax,8)
.L27:
movq 24(%rsp), %rax
vzeroupper
leaq -40(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.p2align 4,,10
.p2align 3
.L37:
.cfi_restore_state
movq %r12, 8(%r14)
addq $24, %r14
cmpq %r14, %rbx
je .L45
.L40:
movq $0, (%r14)
movq %r12, 16(%r14)
cmpq %r13, 56(%rsp)
je .L37
movq %r12, %rdx
movq %r13, %rsi
xorl %edi, %edi
call memcpy
addq $24, %r14
movq %r12, -16(%r14)
cmpq %r14, %rbx
jne .L40
.L45:
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
testq %r13, %r13
je .L48
.L105:
movq %r13, %rdi
call _ZdlPv
jmp .L48
.p2align 4,,10
.p2align 3
.L42:
movq %rcx, 8(%rbx)
addq $24, %rbx
decq 48(%rsp)
jne .L46
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
testq %r13, %r13
je .L48
jmp .L105
.p2align 4,,10
.p2align 3
.L71:
movq $0, 56(%rsp)
xorl %r13d, %r13d
jmp .L29
.p2align 4,,10
.p2align 3
.L35:
testq %r13, %r13
je .L106
vzeroupper
jmp .L47
.L73:
movl $1, %eax
movl $1, %r8d
jmp .L59
.L74:
movl $1, %eax
movl $1, %r9d
jmp .L64
.L106:
movq 16(%rsp), %rax
movl (%rax), %edx
jmp .L62
.L41:
movq $0, (%r14)
movq $0, 8(%r14)
movq $0, 16(%r14)
.LEHB3:
call _ZSt17__throw_bad_allocv
.LEHE3:
.L104:
movl $.LC2, %edi
vzeroupper
.LEHB4:
call _ZSt20__throw_length_errorPKc
.LEHE4:
.L103:
movl $.LC2, %edi
.LEHB5:
call _ZSt20__throw_length_errorPKc
.LEHE5:
.L78:
movq %rax, %rdi
jmp .L49
.L77:
movq %rax, %rdi
jmp .L50
.L75:
movq %rax, %r12
vzeroupper
jmp .L56
.globl __gxx_personality_v0
.section .gcc_except_table,"a",#progbits
.align 4
.LLSDA2360:
.byte 0xff
.byte 0x3
.uleb128 .LLSDATT2360-.LLSDATTD2360
.LLSDATTD2360:
.byte 0x1
.uleb128 .LLSDACSE2360-.LLSDACSB2360
.LLSDACSB2360:
.uleb128 .LEHB0-.LFB2360
.uleb128 .LEHE0-.LEHB0
.uleb128 0
.uleb128 0
.uleb128 .LEHB1-.LFB2360
.uleb128 .LEHE1-.LEHB1
.uleb128 .L75-.LFB2360
.uleb128 0
.uleb128 .LEHB2-.LFB2360
.uleb128 .LEHE2-.LEHB2
.uleb128 .L77-.LFB2360
.uleb128 0x1
.uleb128 .LEHB3-.LFB2360
.uleb128 .LEHE3-.LEHB3
.uleb128 .L78-.LFB2360
.uleb128 0x1
.uleb128 .LEHB4-.LFB2360
.uleb128 .LEHE4-.LEHB4
.uleb128 .L75-.LFB2360
.uleb128 0
.uleb128 .LEHB5-.LFB2360
.uleb128 .LEHE5-.LEHB5
.uleb128 0
.uleb128 0
.LLSDACSE2360:
.byte 0x1
.byte 0
.align 4
.long 0
.LLSDATT2360:
.text
.cfi_endproc
.section .text.unlikely
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDAC2360
.type _Z14init_dp_matrixmmRK5Model.cold, #function
_Z14init_dp_matrixmmRK5Model.cold:
.LFSB2360:
.L49:
.cfi_def_cfa 6, 16
.cfi_offset 3, -56
.cfi_offset 6, -16
.cfi_offset 12, -48
.cfi_offset 13, -40
.cfi_offset 14, -32
.cfi_offset 15, -24
movq %r14, %rbx
.L50:
vzeroupper
call __cxa_begin_catch
.L53:
cmpq %rbx, %r14
jne .L107
.LEHB6:
call __cxa_rethrow
.LEHE6:
.L76:
movq %rax, %r12
vzeroupper
call __cxa_end_catch
movq 24(%rsp), %rax
movq (%rax), %rdi
testq %rdi, %rdi
je .L56
call _ZdlPv
.L56:
testq %r13, %r13
je .L69
movq %r13, %rdi
call _ZdlPv
.L69:
movq %r12, %rdi
.LEHB7:
call _Unwind_Resume
.LEHE7:
.L107:
movq (%r14), %rdi
testq %rdi, %rdi
je .L52
call _ZdlPv
.L52:
addq $24, %r14
jmp .L53
.cfi_endproc
.LFE2360:
.section .gcc_except_table
.align 4
.LLSDAC2360:
.byte 0xff
.byte 0x3
.uleb128 .LLSDATTC2360-.LLSDATTDC2360
.LLSDATTDC2360:
.byte 0x1
.uleb128 .LLSDACSEC2360-.LLSDACSBC2360
.LLSDACSBC2360:
.uleb128 .LEHB6-.LCOLDB6
.uleb128 .LEHE6-.LEHB6
.uleb128 .L76-.LCOLDB6
.uleb128 0
.uleb128 .LEHB7-.LCOLDB6
.uleb128 .LEHE7-.LEHB7
.uleb128 0
.uleb128 0
.LLSDACSEC2360:
.byte 0x1
.byte 0
.align 4
.long 0
.LLSDATTC2360:
.section .text.unlikely
.text
.size _Z14init_dp_matrixmmRK5Model, .-_Z14init_dp_matrixmmRK5Model
.section .text.unlikely
.size _Z14init_dp_matrixmmRK5Model.cold, .-_Z14init_dp_matrixmmRK5Model.cold
.LCOLDE6:
.text
.LHOTE6:
.section .text._ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev,"axG",#progbits,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED5Ev,comdat
.align 2
.p2align 4
.weak _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.type _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, #function
_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev:
.LFB2637:
.cfi_startproc
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
movq %rdi, %r12
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
movq 8(%rdi), %rbx
movq (%rdi), %rbp
cmpq %rbp, %rbx
je .L109
.p2align 4,,10
.p2align 3
.L113:
movq 0(%rbp), %rdi
testq %rdi, %rdi
je .L110
addq $24, %rbp
call _ZdlPv
cmpq %rbp, %rbx
jne .L113
.L111:
movq (%r12), %rbp
.L109:
testq %rbp, %rbp
je .L115
popq %rbx
.cfi_remember_state
.cfi_def_cfa_offset 24
movq %rbp, %rdi
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
jmp _ZdlPv
.p2align 4,,10
.p2align 3
.L110:
.cfi_restore_state
addq $24, %rbp
cmpq %rbp, %rbx
jne .L113
jmp .L111
.p2align 4,,10
.p2align 3
.L115:
popq %rbx
.cfi_def_cfa_offset 24
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE2637:
.size _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, .-_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.weak _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
.set _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.section .text.unlikely
.LCOLDB7:
.section .text.startup,"ax",#progbits
.LHOTB7:
.p2align 4
.globl main
.type main, #function
main:
.LFB2371:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA2371
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl $2, %edx
movl $10, %esi
subq $48, %rsp
.cfi_def_cfa_offset 64
leaq 16(%rsp), %rdi
leaq 8(%rsp), %rcx
movq $-8, 8(%rsp)
.LEHB8:
call _Z14init_dp_matrixmmRK5Model
.LEHE8:
leaq 16(%rsp), %rdi
.LEHB9:
call _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.LEHE9:
leaq 16(%rsp), %rdi
call _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
addq $48, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 16
xorl %eax, %eax
popq %rbp
.cfi_def_cfa_offset 8
ret
.L119:
.cfi_restore_state
movq %rax, %rbp
jmp .L118
.section .gcc_except_table
.LLSDA2371:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSE2371-.LLSDACSB2371
.LLSDACSB2371:
.uleb128 .LEHB8-.LFB2371
.uleb128 .LEHE8-.LEHB8
.uleb128 0
.uleb128 0
.uleb128 .LEHB9-.LFB2371
.uleb128 .LEHE9-.LEHB9
.uleb128 .L119-.LFB2371
.uleb128 0
.LLSDACSE2371:
.section .text.startup
.cfi_endproc
.section .text.unlikely
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDAC2371
.type main.cold, #function
main.cold:
.LFSB2371:
.L118:
.cfi_def_cfa_offset 64
.cfi_offset 6, -16
leaq 16(%rsp), %rdi
vzeroupper
call _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
movq %rbp, %rdi
.LEHB10:
call _Unwind_Resume
.LEHE10:
.cfi_endproc
.LFE2371:
.section .gcc_except_table
.LLSDAC2371:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSEC2371-.LLSDACSBC2371
.LLSDACSBC2371:
.uleb128 .LEHB10-.LCOLDB7
.uleb128 .LEHE10-.LEHB10
.uleb128 0
.uleb128 0
.LLSDACSEC2371:
.section .text.unlikely
.section .text.startup
.size main, .-main
.section .text.unlikely
.size main.cold, .-main.cold
.LCOLDE7:
.section .text.startup
.LHOTE7:
.p2align 4
.type _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, #function
_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB3017:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $_ZStL8__ioinit, %edi
call _ZNSt8ios_base4InitC1Ev
movl $__dso_handle, %edx
movl $_ZStL8__ioinit, %esi
movl $_ZNSt8ios_base4InitD1Ev, %edi
addq $8, %rsp
.cfi_def_cfa_offset 8
jmp __cxa_atexit
.cfi_endproc
.LFE3017:
.size _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata.cst32,"aM",#progbits,32
.align 32
.LC1:
.long 1
.long 2
.long 3
.long 4
.long 5
.long 6
.long 7
.long 8
.align 32
.LC3:
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.align 32
.LC4:
.quad 0
.quad 3
.quad 6
.quad 0
.align 32
.LC5:
.quad 0
.quad 1
.quad 2
.quad 5
.hidden __dso_handle
.ident "GCC: (Homebrew GCC 9.2.0) 9.2.0"
.section .note.GNU-stack,"",#progbits
The assembly for the non -march=native version is available on godbolt.
What is going wrong? Is this a compiler bug, or is my program ill-formed? If it is a compiler bug, how can I mitigate it?
Additional info
Compiling with -v:
$ ~/tools/octopus/build/brew/bin/g++-9 -O3 -march=native -S bug.cpp -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-S' '-v' '-shared-libgcc'
/home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /home/dcooke/tools/octopus/build/brew/nonexistent -idirafter /home/dcooke/tools/octopus/build/brew/include -idirafter /usr/include/x86_64-linux-gnu -idirafter /usr/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=33792 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o bug.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP
Compiling with -O2 or less makes the problem go away:
$ g++-9 -O2 -march=native -o bug bug.cpp
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
I tried building on a different machine with Intel chips:
$ rpm -q centos-release
centos-release-7-3.1611.el7.centos.x86_64
$ grep model /proc/cpuinfo | head -2
model : 85
model name : Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
$ g++-9 -O3 -march=native -o bug bug.cpp -v
Reading specs from /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/well/gerton/dan/apps/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-o' 'bug' '-v' '-shared-libgcc'
/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /gpfs1/well/gerton/dan/apps/octopus/build/brew/nonexistent -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/include -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/opt/glibc/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mno-pku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o /tmp/cczPrvHP.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
The correct output...
-ftree-loop-vectorize is the culprit:
$ g++-9 -march=native -O2 -o bug bug.cpp -ftree-loop-vectorize
$ ./bug
{0 0} {-8 0}
{-2048 255} {0 0}
{-2304 255} {0 0}
{-2560 255} {0 0}
{-2816 255} {0 0}
{-3072 255} {0 0}
{-3328 255} {0 0}
{-3584 255} {0 0}
{-3840 255} {0 0}
{0 -16} {0 0}
None of the other O3 flags result in this behaviour.
This turned out to be due to a bug in the binutils assembler (gas). The solution was to upgrade my binutils to 2.32.
Related
C++ return reference push instruction not showing up in assembly
I'm trying to learn more about how return references are compiled, and I'm currently stuck on how they show up in assembly. The code I'm running is this: struct Obj { char buf[100]; int i; long b; } B, B2; Obj foo(Obj b) { b.i = 100; // Do something to the argument return b; } int main() { B2 = foo(B); } and the assembly code looks like this: .file "return_function_assembly.cpp" .text .globl B .bss .align 32 .type B, #object .size B, 112 B: .zero 112 .globl B2 .align 32 .type B2, #object .size B2, 112 B2: .zero 112 .text .globl _Z3foo3Obj .type _Z3foo3Obj, #function _Z3foo3Obj: .LFB0: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movq %rdi, -8(%rbp) movl $100, 116(%rbp) movq -8(%rbp), %rax movq 16(%rbp), %rdx movq 24(%rbp), %rcx movq %rdx, (%rax) movq %rcx, 8(%rax) movq 32(%rbp), %rdx movq 40(%rbp), %rcx movq %rdx, 16(%rax) movq %rcx, 24(%rax) movq 48(%rbp), %rdx movq 56(%rbp), %rcx movq %rdx, 32(%rax) movq %rcx, 40(%rax) movq 64(%rbp), %rdx movq 72(%rbp), %rcx movq %rdx, 48(%rax) movq %rcx, 56(%rax) movq 80(%rbp), %rdx movq 88(%rbp), %rcx movq %rdx, 64(%rax) movq %rcx, 72(%rax) movq 96(%rbp), %rdx movq 104(%rbp), %rcx movq %rdx, 80(%rax) movq %rcx, 88(%rax) movq 112(%rbp), %rdx movq 120(%rbp), %rcx movq %rdx, 96(%rax) movq %rcx, 104(%rax) movq -8(%rbp), %rax popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE0: .size _Z3foo3Obj, .-_Z3foo3Obj .globl main .type main, #function main: .LFB1: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 addq $-128, %rsp movq %fs:40, %rax movq %rax, -8(%rbp) xorl %eax, %eax leaq -128(%rbp), %rax pushq 104+B(%rip) pushq 96+B(%rip) pushq 88+B(%rip) pushq 80+B(%rip) pushq 72+B(%rip) pushq 64+B(%rip) pushq 56+B(%rip) pushq 48+B(%rip) pushq 40+B(%rip) pushq 32+B(%rip) pushq 24+B(%rip) pushq 16+B(%rip) pushq 8+B(%rip) pushq B(%rip) movq %rax, %rdi call _Z3foo3Obj addq $112, %rsp movq -128(%rbp), %rax movq -120(%rbp), %rdx 
movq %rax, B2(%rip) movq %rdx, 8+B2(%rip) movq -112(%rbp), %rax movq -104(%rbp), %rdx movq %rax, 16+B2(%rip) movq %rdx, 24+B2(%rip) movq -96(%rbp), %rax movq -88(%rbp), %rdx movq %rax, 32+B2(%rip) movq %rdx, 40+B2(%rip) movq -80(%rbp), %rax movq -72(%rbp), %rdx movq %rax, 48+B2(%rip) movq %rdx, 56+B2(%rip) movq -64(%rbp), %rax movq -56(%rbp), %rdx movq %rax, 64+B2(%rip) movq %rdx, 72+B2(%rip) movq -48(%rbp), %rax movq -40(%rbp), %rdx movq %rax, 80+B2(%rip) movq %rdx, 88+B2(%rip) movq -32(%rbp), %rax movq -24(%rbp), %rdx movq %rax, 96+B2(%rip) movq %rdx, 104+B2(%rip) movl $0, %eax movq -8(%rbp), %rdx xorq %fs:40, %rdx je .L5 call __stack_chk_fail#PLT .L5: leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE1: .size main, .-main .ident "GCC: (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0" .section .note.GNU-stack,"",#progbits System is Linux, compiled with g++, and from my understanding of function frames, I should be seeing an additional "push" instruction that pushes the address of B2 onto the stack prior the call instruction in the function frame. However, none of these push instructions seem to correspond to what I'm looking for. I see a leaq instruction, and a pushq (%rbp) at the beginning of main, but nothing that seems to be what I'm supposed to be seeing. Can anyone please advise?
Slow std::string concatenation on windows
I have a program that needs to concatenate lots of strings together (to be more precise integers converted to strings). On my Ubuntu machine (running g++ 7.3.0) the code runs in 1.5 seconds. But the code needs to be run on Windows as well (running g++ 6.3.0 using MinGW), where it takes 15 seconds to complete. Furthermore, the Ubuntu setup runs on a much slower Laptop using an i7-4712MQ CPU # 2.30GHz, whereas the Windows machine runs on an i7-7700K CPU # 4.20GHz. The code to reproduce the times is shown below. I compile the code with g++ tester.cpp -O2 -o tester (or tester.exe for windows) #include <iostream> #include <chrono> int main(int argc, char const *argv[]) { auto started = std::chrono::high_resolution_clock::now(); std::string str = ""; const int n = 10000000; str.reserve(2 * n); int a = 1; for (int i = 0; i < n; ++i) { str += std::to_string(a) + " "; } auto done = std::chrono::high_resolution_clock::now(); double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000; std::cout << "Done in " << secs << "\n"; return 0; } Any idea where the large performance gap might come from? 
The disassemblies look like this: Ubuntu: .file "tester.cpp" .text .align 2 .p2align 4,,15 .type _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, #function _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19: .LFB2389: .cfi_startproc pushq %r12 .cfi_def_cfa_offset 16 .cfi_offset 12, -16 pushq %rbp .cfi_def_cfa_offset 24 .cfi_offset 6, -24 movq %rsi, %r12 pushq %rbx .cfi_def_cfa_offset 32 .cfi_offset 3, -32 movq %rdx, %rbx movq %rdi, %rbp subq %rsi, %rbx subq $16, %rsp .cfi_def_cfa_offset 48 movq %fs:40, %rax movq %rax, 8(%rsp) xorl %eax, %eax cmpq $15, %rbx movq %rbx, (%rsp) ja .L12 movq (%rdi), %rdx cmpq $1, %rbx movq %rdx, %rax jne .L4 movzbl (%rsi), %eax movb %al, (%rdx) movq (%rdi), %rdx .L5: movq (%rsp), %rax movq %rax, 8(%rbp) movb $0, (%rdx,%rax) movq 8(%rsp), %rax xorq %fs:40, %rax jne .L13 addq $16, %rsp .cfi_remember_state .cfi_def_cfa_offset 32 popq %rbx .cfi_def_cfa_offset 24 popq %rbp .cfi_def_cfa_offset 16 popq %r12 .cfi_def_cfa_offset 8 ret .L12: .cfi_restore_state xorl %edx, %edx movq %rsp, %rsi call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm#PLT movq (%rsp), %rdx movq %rax, 0(%rbp) movq %rdx, 16(%rbp) .L3: movq %rbx, %rdx movq %r12, %rsi movq %rax, %rdi call memcpy#PLT movq 0(%rbp), %rdx jmp .L5 .L4: testq %rbx, %rbx je .L5 jmp .L3 .L13: call __stack_chk_fail#PLT .cfi_endproc .LFE2389: .size _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, .-_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19 .set _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23,_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19 .section 
.text._ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,"axG",#progbits,_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,comdat .p2align 4,,15 .weak _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z .type _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, #function _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z: .LFB1953: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsi, %r10 movq %rdx, %rsi movq %rcx, %rdx movq %rsp, %rbp .cfi_def_cfa_register 6 pushq %r12 pushq %rbx .cfi_offset 12, -24 .cfi_offset 3, -32 movq %rdi, %r12 subq $208, %rsp testb %al, %al movq %r8, -160(%rbp) movq %r9, -152(%rbp) je .L15 movaps %xmm0, -144(%rbp) movaps %xmm1, -128(%rbp) movaps %xmm2, -112(%rbp) movaps %xmm3, -96(%rbp) movaps %xmm4, -80(%rbp) movaps %xmm5, -64(%rbp) movaps %xmm6, -48(%rbp) movaps %xmm7, -32(%rbp) .L15: movq %fs:40, %rax movq %rax, -200(%rbp) xorl %eax, %eax leaq 30(%rsi), %rax leaq -224(%rbp), %rcx andq $-16, %rax movl $32, -224(%rbp) movl $48, -220(%rbp) subq %rax, %rsp leaq 16(%rbp), %rax leaq 15(%rsp), %rbx movq %rax, -216(%rbp) leaq -192(%rbp), %rax andq $-16, %rbx movq %rbx, %rdi movq %rax, -208(%rbp) call *%r10 leaq 16(%r12), %rdx movq %r12, %rdi movq %rbx, %rsi movq %rdx, (%r12) movslq %eax, %rdx addq %rbx, %rdx call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23 movq -200(%rbp), %rdi xorq %fs:40, %rdi movq %r12, %rax jne .L18 leaq -16(%rbp), %rsp popq %rbx popq %r12 popq %rbp .cfi_remember_state .cfi_def_cfa 7, 8 ret .L18: .cfi_restore_state call __stack_chk_fail#PLT .cfi_endproc .LFE1953: .size 
_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, .-_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z .section .rodata.str1.1,"aMS",#progbits,1 .LC0: .string "" .LC1: .string "%d" .LC2: .string "basic_string::append" .LC3: .string " " .LC5: .string "Done in " .LC6: .string "\n" .section .text.startup,"ax",#progbits .p2align 4,,15 .globl main .type main, #function main: .LFB1871: .cfi_startproc .cfi_personality 0x9b,DW.ref.__gxx_personality_v0 .cfi_lsda 0x1b,.LLSDA1871 pushq %r15 .cfi_def_cfa_offset 16 .cfi_offset 15, -16 pushq %r14 .cfi_def_cfa_offset 24 .cfi_offset 14, -24 pushq %r13 .cfi_def_cfa_offset 32 .cfi_offset 13, -32 pushq %r12 .cfi_def_cfa_offset 40 .cfi_offset 12, -40 pushq %rbp .cfi_def_cfa_offset 48 .cfi_offset 6, -48 pushq %rbx .cfi_def_cfa_offset 56 .cfi_offset 3, -56 subq $136, %rsp .cfi_def_cfa_offset 192 leaq 16(%rsp), %r13 movq %fs:40, %rax movq %rax, 120(%rsp) xorl %eax, %eax call _ZNSt6chrono3_V212system_clock3nowEv#PLT leaq .LC0(%rip), %rdx movq %rax, (%rsp) leaq 16(%r13), %rax movq %r13, %rdi movq %rdx, %rsi movq %rax, 16(%rsp) .LEHB0: call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19 .LEHE0: movl $20000000, %esi movq %r13, %rdi .LEHB1: call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEm#PLT .LEHE1: leaq 48(%rsp), %rbp leaq 80(%rsp), %rax movl $10000000, %ebx movabsq $9223372036854775807, %r14 leaq 96(%rsp), %r12 movq %rax, 8(%rsp) leaq 16(%rbp), %r15 jmp .L25 .p2align 4,,10 .p2align 3 .L21: movq %rcx, 80(%rsp) movq 16(%rax), %rcx movq %rcx, 96(%rsp) .L22: movq 8(%rax), %rcx movb $0, 16(%rax) movq %r13, %rdi movq %rcx, 88(%rsp) movq %rdx, (%rax) movq $0, 8(%rax) movq 80(%rsp), %rsi movq 88(%rsp), %rdx .LEHB2: call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm#PLT .LEHE2: movq 80(%rsp), %rdi cmpq 
%r12, %rdi je .L23 call _ZdlPv#PLT .L23: movq 48(%rsp), %rdi cmpq %r15, %rdi je .L24 call _ZdlPv#PLT .L24: subl $1, %ebx je .L40 .L25: movq vsnprintf#GOTPCREL(%rip), %rsi leaq .LC1(%rip), %rcx movl $1, %r8d movl $16, %edx movq %rbp, %rdi xorl %eax, %eax .LEHB3: call _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z .LEHE3: cmpq %r14, 56(%rsp) je .L41 leaq .LC3(%rip), %rsi movl $1, %edx movq %rbp, %rdi .LEHB4: call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm#PLT .LEHE4: movq %r12, 80(%rsp) movq (%rax), %rcx leaq 16(%rax), %rdx cmpq %rdx, %rcx jne .L21 movdqu 16(%rax), %xmm0 movaps %xmm0, 96(%rsp) jmp .L22 .p2align 4,,10 .p2align 3 .L40: call _ZNSt6chrono3_V212system_clock3nowEv#PLT subq (%rsp), %rax movabsq $4835703278458516699, %rdx leaq .LC5(%rip), %rsi pxor %xmm0, %xmm0 leaq _ZSt4cout(%rip), %rdi movq %rax, %rcx imulq %rdx sarq $63, %rcx sarq $18, %rdx subq %rcx, %rdx cvtsi2sdq %rdx, %xmm0 movl $8, %edx divsd .LC4(%rip), %xmm0 movsd %xmm0, (%rsp) .LEHB5: call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l#PLT movsd (%rsp), %xmm0 leaq _ZSt4cout(%rip), %rdi call _ZNSo9_M_insertIdEERSoT_#PLT leaq .LC6(%rip), %rsi movq %rax, %rdi call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#PLT .LEHE5: movq 16(%rsp), %rdi addq $16, %r13 cmpq %r13, %rdi je .L26 call _ZdlPv#PLT .L26: xorl %eax, %eax movq 120(%rsp), %rbx xorq %fs:40, %rbx jne .L42 addq $136, %rsp .cfi_remember_state .cfi_def_cfa_offset 56 popq %rbx .cfi_def_cfa_offset 48 popq %rbp .cfi_def_cfa_offset 40 popq %r12 .cfi_def_cfa_offset 32 popq %r13 .cfi_def_cfa_offset 24 popq %r14 .cfi_def_cfa_offset 16 popq %r15 .cfi_def_cfa_offset 8 ret .L41: .cfi_restore_state leaq .LC2(%rip), %rdi .LEHB6: call _ZSt20__throw_length_errorPKc#PLT .LEHE6: .L35: movq %rax, %rbx .L29: movq 48(%rsp), %rdi addq $16, %rbp cmpq %rbp, %rdi je .L31 call _ZdlPv#PLT .L31: movq 16(%rsp), %rdi addq $16, %r13 
cmpq %r13, %rdi je .L32 call _ZdlPv#PLT .L32: movq %rbx, %rdi .LEHB7: call _Unwind_Resume#PLT .LEHE7: .L34: movq %rax, %rbx jmp .L31 .L36: movq 8(%rsp), %rdx movq 80(%rsp), %rdi movq %rax, %rbx addq $16, %rdx cmpq %rdx, %rdi je .L29 call _ZdlPv#PLT jmp .L29 .L42: call __stack_chk_fail#PLT .cfi_endproc .LFE1871: .globl __gxx_personality_v0 .section .gcc_except_table,"a",#progbits .LLSDA1871: .byte 0xff .byte 0xff .byte 0x1 .uleb128 .LLSDACSE1871-.LLSDACSB1871 .LLSDACSB1871: .uleb128 .LEHB0-.LFB1871 .uleb128 .LEHE0-.LEHB0 .uleb128 0 .uleb128 0 .uleb128 .LEHB1-.LFB1871 .uleb128 .LEHE1-.LEHB1 .uleb128 .L34-.LFB1871 .uleb128 0 .uleb128 .LEHB2-.LFB1871 .uleb128 .LEHE2-.LEHB2 .uleb128 .L36-.LFB1871 .uleb128 0 .uleb128 .LEHB3-.LFB1871 .uleb128 .LEHE3-.LEHB3 .uleb128 .L34-.LFB1871 .uleb128 0 .uleb128 .LEHB4-.LFB1871 .uleb128 .LEHE4-.LEHB4 .uleb128 .L35-.LFB1871 .uleb128 0 .uleb128 .LEHB5-.LFB1871 .uleb128 .LEHE5-.LEHB5 .uleb128 .L34-.LFB1871 .uleb128 0 .uleb128 .LEHB6-.LFB1871 .uleb128 .LEHE6-.LEHB6 .uleb128 .L35-.LFB1871 .uleb128 0 .uleb128 .LEHB7-.LFB1871 .uleb128 .LEHE7-.LEHB7 .uleb128 0 .uleb128 0 .LLSDACSE1871: .section .text.startup .size main, .-main .p2align 4,,15 .type _GLOBAL__sub_I_main, #function _GLOBAL__sub_I_main: .LFB2369: .cfi_startproc leaq _ZStL8__ioinit(%rip), %rdi subq $8, %rsp .cfi_def_cfa_offset 16 call _ZNSt8ios_base4InitC1Ev#PLT movq _ZNSt8ios_base4InitD1Ev#GOTPCREL(%rip), %rdi leaq __dso_handle(%rip), %rdx leaq _ZStL8__ioinit(%rip), %rsi addq $8, %rsp .cfi_def_cfa_offset 8 jmp __cxa_atexit#PLT .cfi_endproc .LFE2369: .size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main .section .init_array,"aw" .align 8 .quad _GLOBAL__sub_I_main .local _ZStL8__ioinit .comm _ZStL8__ioinit,1,1 .section .rodata.cst8,"aM",#progbits,8 .align 8 .LC4: .long 0 .long 1083129856 .hidden DW.ref.__gxx_personality_v0 .weak DW.ref.__gxx_personality_v0 .section .data.DW.ref.__gxx_personality_v0,"awG",#progbits,DW.ref.__gxx_personality_v0,comdat .align 8 .type 
DW.ref.__gxx_personality_v0, #object .size DW.ref.__gxx_personality_v0, 8 DW.ref.__gxx_personality_v0: .quad __gxx_personality_v0 .hidden __dso_handle .ident "GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0" .section .note.GNU-stack,"",#progbits Windows: .file "tester.cpp" .text .p2align 4,,15 .def ___tcf_0; .scl 3; .type 32; .endef ___tcf_0: LFB2556: .cfi_startproc movl $__ZStL8__ioinit, %ecx jmp __ZNSt8ios_base4InitD1Ev .cfi_endproc LFE2556: .section .rdata,"dr" .align 4 LC0: .ascii "basic_string::_M_construct null not valid\0" .text .align 2 .p2align 4,,15 .def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29; .scl 3; .type 32; .endef __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29: LFB2587: .cfi_startproc pushl %edi .cfi_def_cfa_offset 8 .cfi_offset 7, -8 pushl %esi .cfi_def_cfa_offset 12 .cfi_offset 6, -12 movl %ecx, %esi pushl %ebx .cfi_def_cfa_offset 16 .cfi_offset 3, -16 subl $32, %esp .cfi_def_cfa_offset 48 movl 48(%esp), %edi movl 52(%esp), %ebx testl %edi, %edi jne L5 testl %ebx, %ebx je L5 movl $LC0, (%esp) call __ZSt19__throw_logic_errorPKc .p2align 4,,10 L5: subl %edi, %ebx cmpl $15, %ebx movl %ebx, 28(%esp) ja L22 movl (%esi), %edx cmpl $1, %ebx movl %edx, %eax je L23 testl %ebx, %ebx jne L6 L8: movl 28(%esp), %eax movl %eax, 4(%esi) movb $0, (%edx,%eax) addl $32, %esp .cfi_remember_state .cfi_def_cfa_offset 16 popl %ebx .cfi_restore 3 .cfi_def_cfa_offset 12 popl %esi .cfi_restore 6 .cfi_def_cfa_offset 8 popl %edi .cfi_restore 7 .cfi_def_cfa_offset 4 ret $8 .p2align 4,,10 L22: .cfi_restore_state leal 28(%esp), %eax movl $0, 4(%esp) movl %esi, %ecx movl %eax, (%esp) call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj .cfi_def_cfa_offset 40 subl $8, %esp .cfi_def_cfa_offset 48 movl %eax, (%esi) movl 28(%esp), %edx movl %edx, 8(%esi) L6: movl %ebx, 8(%esp) movl %edi, 4(%esp) movl %eax, (%esp) call _memcpy movl (%esi), 
%edx jmp L8 .p2align 4,,10 L23: movzbl (%edi), %eax movb %al, (%edx) movl (%esi), %edx jmp L8 .cfi_endproc LFE2587: .def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21; .scl 3; .type 32; .endef .set __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21,__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29 .section .text$_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z,"x" .linkonce discard .p2align 4,,15 .globl __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z .def __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z; .scl 2; .type 32; .endef __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z: LFB2177: .cfi_startproc pushl %ebp .cfi_def_cfa_offset 8 .cfi_offset 5, -8 movl %esp, %ebp .cfi_def_cfa_register 5 pushl %esi pushl %ebx subl $16, %esp .cfi_offset 6, -12 .cfi_offset 3, -16 movl 16(%ebp), %edx movl 8(%ebp), %esi leal 30(%edx), %eax andl $-16, %eax call ___chkstk_ms subl %eax, %esp leal 24(%ebp), %eax leal 31(%esp), %ebx movl %edx, 4(%esp) movl %eax, 12(%esp) movl 20(%ebp), %eax andl $-16, %ebx movl %ebx, (%esp) movl %eax, 8(%esp) call *12(%ebp) leal 8(%esi), %edx addl %ebx, %eax movl %esi, %ecx movl %edx, (%esi) movl %eax, 4(%esp) movl %ebx, (%esp) call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29 subl $8, %esp leal -8(%ebp), %esp movl %esi, %eax popl %ebx .cfi_restore 3 popl %esi .cfi_restore 6 popl %ebp .cfi_restore 5 .cfi_def_cfa 4, 4 ret .cfi_endproc LFE2177: .def ___main; .scl 2; .type 32; .endef .section .rdata,"dr" LC1: .ascii "\0" LC2: .ascii "%d\0" LC3: .ascii 
"basic_string::append\0" LC4: .ascii " \0" .def ___divdi3; .scl 2; .type 32; .endef LC6: .ascii "Done in \0" LC7: .ascii "\12\0" .section .text.startup,"x" .p2align 4,,15 .globl _main .def _main; .scl 2; .type 32; .endef _main: LFB2111: .cfi_startproc .cfi_personality 0,___gxx_personality_v0 .cfi_lsda 0,LLSDA2111 leal 4(%esp), %ecx .cfi_def_cfa 1, 0 andl $-16, %esp pushl -4(%ecx) pushl %ebp .cfi_escape 0x10,0x5,0x2,0x75,0 movl %esp, %ebp pushl %edi pushl %esi pushl %ebx pushl %ecx .cfi_escape 0xf,0x3,0x75,0x70,0x6 .cfi_escape 0x10,0x7,0x2,0x75,0x7c .cfi_escape 0x10,0x6,0x2,0x75,0x78 .cfi_escape 0x10,0x3,0x2,0x75,0x74 subl $152, %esp call ___main call __ZNSt6chrono3_V212system_clock3nowEv leal -96(%ebp), %ecx movl %eax, -136(%ebp) leal -88(%ebp), %eax movl $LC1, 4(%esp) movl $LC1, (%esp) movl %edx, -132(%ebp) movl %eax, -96(%ebp) LEHB0: call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21 LEHE0: leal -96(%ebp), %ecx subl $8, %esp movl $20000000, (%esp) LEHB1: call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj LEHE1: subl $4, %esp movl $10000000, %edi leal -72(%ebp), %esi leal -40(%ebp), %ebx jmp L32 .p2align 4,,10 L28: movl %ecx, -48(%ebp) movl 8(%eax), %ecx movl %ecx, -40(%ebp) L29: movl 4(%eax), %ecx movb $0, 8(%eax) movl %ecx, -44(%ebp) movl %edx, (%eax) leal -96(%ebp), %ecx movl $0, 4(%eax) movl -44(%ebp), %eax movl %eax, 4(%esp) movl -48(%ebp), %eax movl %eax, (%esp) LEHB2: call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj LEHE2: movl -48(%ebp), %eax subl $8, %esp cmpl %ebx, %eax je L30 movl %eax, (%esp) call __ZdlPv L30: movl -72(%ebp), %eax leal -64(%ebp), %edx cmpl %edx, %eax je L31 movl %eax, (%esp) call __ZdlPv L31: subl $1, %edi je L46 L32: movl $1, 16(%esp) movl $LC2, 12(%esp) movl $16, 8(%esp) movl $_vsnprintf, 4(%esp) movl %esi, (%esp) LEHB3: call 
__ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z LEHE3: cmpl $2147483647, -68(%ebp) je L47 movl $1, 4(%esp) movl $LC4, (%esp) movl %esi, %ecx LEHB4: call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj LEHE4: movl %ebx, -48(%ebp) movl (%eax), %ecx leal 8(%eax), %edx subl $8, %esp cmpl %edx, %ecx jne L28 movl 12(%eax), %ecx movl %ecx, -120(%ebp) movl 16(%eax), %ecx movl %ecx, -124(%ebp) movl 20(%eax), %ecx movl %ecx, -128(%ebp) movl 8(%eax), %ecx movl %ecx, -40(%ebp) movl -120(%ebp), %ecx movl %ecx, -36(%ebp) movl -124(%ebp), %ecx movl %ecx, -32(%ebp) movl -128(%ebp), %ecx movl %ecx, -28(%ebp) jmp L29 .p2align 4,,10 L46: call __ZNSt6chrono3_V212system_clock3nowEv subl -136(%ebp), %eax movl $1000000, 8(%esp) sbbl -132(%ebp), %edx movl $0, 12(%esp) movl %eax, (%esp) movl %edx, 4(%esp) call ___divdi3 movl %eax, -120(%ebp) movl %edx, -116(%ebp) fildq -120(%ebp) movl $8, 8(%esp) movl $LC6, 4(%esp) movl $__ZSt4cout, (%esp) fdivs LC5 fstpl -120(%ebp) LEHB5: call __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i fldl -120(%ebp) movl $__ZSt4cout, %ecx fstpl (%esp) call __ZNSo9_M_insertIdEERSoT_ subl $8, %esp movl $LC7, 4(%esp) movl %eax, (%esp) call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc LEHE5: movl -96(%ebp), %eax leal -88(%ebp), %edi cmpl %edi, %eax je L43 movl %eax, (%esp) call __ZdlPv L43: leal -16(%ebp), %esp xorl %eax, %eax popl %ecx .cfi_remember_state .cfi_restore 1 .cfi_def_cfa 1, 0 popl %ebx .cfi_restore 3 popl %esi .cfi_restore 6 popl %edi .cfi_restore 7 popl %ebp .cfi_restore 5 leal -4(%ecx), %esp .cfi_def_cfa 4, 4 ret L47: .cfi_restore_state movl $LC3, (%esp) LEHB6: call __ZSt20__throw_length_errorPKc LEHE6: L41: movl %eax, %ebx L36: movl -72(%ebp), %eax leal -64(%ebp), %edx cmpl %edx, %eax je L38 movl %eax, (%esp) call __ZdlPv L38: movl -96(%ebp), %eax leal -88(%ebp), %edi cmpl %edi, %eax je L39 movl %eax, (%esp) call __ZdlPv L39: movl 
%ebx, (%esp) LEHB7: call __Unwind_Resume LEHE7: L42: movl %eax, %esi movl -48(%ebp), %eax cmpl %ebx, %eax je L35 movl %eax, (%esp) call __ZdlPv L35: movl %esi, %ebx jmp L36 L40: movl %eax, %ebx jmp L38 .cfi_endproc LFE2111: .def ___gxx_personality_v0; .scl 2; .type 32; .endef .section .gcc_except_table,"w" LLSDA2111: .byte 0xff .byte 0xff .byte 0x1 .uleb128 LLSDACSE2111-LLSDACSB2111 LLSDACSB2111: .uleb128 LEHB0-LFB2111 .uleb128 LEHE0-LEHB0 .uleb128 0 .uleb128 0 .uleb128 LEHB1-LFB2111 .uleb128 LEHE1-LEHB1 .uleb128 L40-LFB2111 .uleb128 0 .uleb128 LEHB2-LFB2111 .uleb128 LEHE2-LEHB2 .uleb128 L42-LFB2111 .uleb128 0 .uleb128 LEHB3-LFB2111 .uleb128 LEHE3-LEHB3 .uleb128 L40-LFB2111 .uleb128 0 .uleb128 LEHB4-LFB2111 .uleb128 LEHE4-LEHB4 .uleb128 L41-LFB2111 .uleb128 0 .uleb128 LEHB5-LFB2111 .uleb128 LEHE5-LEHB5 .uleb128 L40-LFB2111 .uleb128 0 .uleb128 LEHB6-LFB2111 .uleb128 LEHE6-LEHB6 .uleb128 L41-LFB2111 .uleb128 0 .uleb128 LEHB7-LFB2111 .uleb128 LEHE7-LEHB7 .uleb128 0 .uleb128 0 LLSDACSE2111: .section .text.startup,"x" .p2align 4,,15 .def __GLOBAL__sub_I_main; .scl 3; .type 32; .endef __GLOBAL__sub_I_main: LFB2557: .cfi_startproc subl $28, %esp .cfi_def_cfa_offset 32 movl $__ZStL8__ioinit, %ecx call __ZNSt8ios_base4InitC1Ev movl $___tcf_0, (%esp) call _atexit addl $28, %esp .cfi_def_cfa_offset 4 ret .cfi_endproc LFE2557: .section .ctors,"w" .align 4 .long __GLOBAL__sub_I_main .lcomm __ZStL8__ioinit,1,1 .section .rdata,"dr" .align 4 LC5: .long 1148846080 .ident "GCC: (MinGW.org GCC-6.3.0-1) 6.3.0" .def __ZNSt8ios_base4InitD1Ev; .scl 2; .type 32; .endef .def __ZSt19__throw_logic_errorPKc; .scl 2; .type 32; .endef .def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj; .scl 2; .type 32; .endef .def _memcpy; .scl 2; .type 32; .endef .def __ZNSt6chrono3_V212system_clock3nowEv; .scl 2; .type 32; .endef .def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj; .scl 2; .type 32; .endef .def 
__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj; .scl 2; .type 32; .endef .def __ZdlPv; .scl 2; .type 32; .endef .def _vsnprintf; .scl 2; .type 32; .endef .def __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i; .scl 2; .type 32; .endef .def __ZNSo9_M_insertIdEERSoT_; .scl 2; .type 32; .endef .def __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc; .scl 2; .type 32; .endef .def __ZSt20__throw_length_errorPKc; .scl 2; .type 32; .endef .def __Unwind_Resume; .scl 2; .type 32; .endef .def __ZNSt8ios_base4InitC1Ev; .scl 2; .type 32; .endef .def _atexit; .scl 2; .type 32; .endef
A quick look at the disassembly shows that the Windows version uses movl (i.e. long word, 32-bit move), while the Linux version uses movq (quad word, 64-bit) and the SSE xmm registers. My bet is that on Linux you compile for x86-64, while on Windows you target 32-bit x86. x86-64 includes the SSE2 extension as part of its baseline, while 32-bit x86 does not, so MinGW defaults to no-SSE mode. If that's the case, building with a 64-bit toolchain on Windows should result in comparable performance. Alternatively, you might enable SSE for 32-bit builds (the -msse2 compiler flag, if I remember correctly).
The mingw.org implementation just seems to be much less efficient than Linux, Visual Studio or mingw-w64.org. >g++ --version g++ (MinGW.org GCC-6.3.0-1) 6.3.0 Done in 24.808 >g++ --version g++ (i686-posix-dwarf-rev2, Built by MinGW-W64 project) 6.3.0 Done in 0.679
Tested with MSYS2 MinGW64: g++ --version g++.exe (Rev2, Built by MSYS2 project) 7.3.0 g++.exe -Wall -O3 -mtune=native -fno-exceptions -fno-rtti -c main.cpp -o main.o g++.exe -o test.exe main.o -s Done in 0.547 Env: Windows 10 x64 CPU: Intel Core i5-6300U, 2.4 GHz RAM: 16GB DDR4 In any case, MinGW uses msvcrt.dll instead of GNU libc (the CRT bundled with Windows, not the Universal CRT or a Visual Studio CRT), so in my experience the speed gap may come from the C standard library. P.S. with some changes (same compiler flags) #include <iostream> #include <chrono> #ifdef _WIN32 #include <windows.h> static std::size_t page_size() noexcept { ::SYSTEM_INFO si; ::GetSystemInfo(&si); return si.dwPageSize; } #else #include <sys/types.h> #include <unistd.h> static std::size_t page_size() noexcept { return static_cast<std::size_t>( ::sysconf(_SC_PAGESIZE) ); } #endif // _WIN32 int main(int argc, char const *argv[]) { auto started = std::chrono::high_resolution_clock::now(); const std::size_t n = 10000000; // align size to page boundary const std::size_t al = page_size() - 1; const std::size_t buff_size = ( (n << 1) + al) & ~al; std::string str; str.reserve(buff_size); const std::string to_append( std::to_string(1) ); for (std::size_t i = 0; i < n; ++i) { str.append( to_append ); str.push_back(' '); } auto done = std::chrono::high_resolution_clock::now(); double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000; std::cout << "Done in " << secs << "\n"; return 0; } Done in 0.046 Asm output for main function: main: pushq %r14 .seh_pushreg %r14 pushq %r13 .seh_pushreg %r13 pushq %r12 .seh_pushreg %r12 pushq %rbp .seh_pushreg %rbp pushq %rdi .seh_pushreg %rdi pushq %rsi .seh_pushreg %rsi pushq %rbx .seh_pushreg %rbx subq $144, %rsp .seh_stackalloc 144 .seh_endprologue movl $10000000, %esi call __main leaq 96(%rsp), %r13 leaq 64(%rsp), %rbp call _ZNSt6chrono3_V212system_clock3nowEv movq %r13, %rcx leaq 16(%rbp), %r12 movq %rax, %r14 call 
*__imp_GetSystemInfo(%rip) movl 100(%rsp), %eax movq %rbp, %rcx movq %r12, 64(%rsp) movq $0, 72(%rsp) leaq 19999999(%rax), %rdx negq %rax movb $0, 80(%rsp) andq %rax, %rdx call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEy movl $1, 32(%rsp) movq %r13, %rcx leaq .LC0(%rip), %r9 movl $16, %r8d leaq _ZL9vsnprintfPcyPKcS_(%rip), %rdx call _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_yPKS8_PcEySB_z jmp .L14 .p2align 4,,10 .L16: movb $32, (%rdx,%rbx) .L26: movq 64(%rsp), %rax movq %rdi, 72(%rsp) movb $0, 1(%rax,%rbx) subq $1, %rsi je .L27 .L14: movq 96(%rsp), %rdx movq 104(%rsp), %r8 movq %rbp, %rcx call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcy movq 72(%rsp), %rbx movq 64(%rsp), %rdx movl $15, %eax leaq 1(%rbx), %rdi cmpq %r12, %rdx je .L15 movq 80(%rsp), %rax .L15: cmpq %rax, %rdi jbe .L16 xorl %r9d, %r9d xorl %r8d, %r8d movq %rbx, %rdx movq %rbp, %rcx movq $1, 32(%rsp) call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_mutateEyyPKcy movq 64(%rsp), %rax movb $32, (%rax,%rbx) jmp .L26 .p2align 4,,10 .L27: call _ZNSt6chrono3_V212system_clock3nowEv pxor %xmm1, %xmm1 movl $8, %r8d movabsq $4835703278458516699, %rdx subq %r14, %rax addq $16, %r13 movq %rax, %rcx imulq %rdx sarq $63, %rcx sarq $18, %rdx subq %rcx, %rdx movq .refptr._ZSt4cout(%rip), %rcx cvtsi2sdq %rdx, %xmm1 leaq .LC2(%rip), %rdx divsd .LC1(%rip), %xmm1 movsd %xmm1, 56(%rsp) call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_x movsd 56(%rsp), %xmm1 movq .refptr._ZSt4cout(%rip), %rcx call _ZNSo9_M_insertIdEERSoT_ leaq .LC3(%rip), %rdx movq %rax, %rcx call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc movq 96(%rsp), %rcx cmpq %r13, %rcx je .L19 call _ZdlPv .L19: movq 64(%rsp), %rcx addq $16, %rbp cmpq %rbp, %rcx je .L20 call _ZdlPv .L20: xorl %eax, %eax addq $144, %rsp popq %rbx popq %rsi popq %rdi popq %rbp popq %r12 popq %r13 popq %r14 ret .seh_endproc .p2align 4,,15 
.def _GLOBAL__sub_I_main; .scl 3; .type 32; .endef .seh_proc _GLOBAL__sub_I_main
(Just for the proportions) Windows Release target vs. Debug target in Visual Studio C++: By default, the Debug target is compiled without optimization, while the Release target is compiled with /O2 optimization, with /Oi ("Enable Intrinsic Functions"), and with /GL ("Whole Program Optimization"). Your code, on my workstation, Debug x64 vs. Release x64: Debug: 70 sec. Release: 0.27 sec. You build with MinGW (which I am not familiar with). But from a quick search, there is talk of a Debug/Release mode, and it seems MinGW has equivalents of the /O2 optimization, /Oi ("Enable Intrinsic Functions"), and /Og ("Enable Global Optimization") flags. - Compile with these 3 flags (x64 target), and compare with the VS Release x64 benchmark. Anyway, this is the MS default compile optimization for a Release target. - Test Environment: HP 8100, Windows 10 Pro 64 bit, CPU i7 870, 16 GB DDR3 RAM, Visual Studio 2017, Targets: Debug x64 / Release x64
I tried your code on my Windows machine with MinGW 4.8.0 and got ~20 seconds. When I changed the string concatenation to std::stringstream I got 0.5 seconds: ... std::stringstream ss; for (int i = 0; i < n; ++i) { //str += std::to_string(a) + " "; ss << a << " "; } str = ss.str(); ...
Why does a simple use of ostringstream generate so much assembly code?
Consider the following simple example that formats a string and an integer using ostringstream and discards the output: #include <sstream> void ostringstream_test() { std::ostringstream ss; ss << "x = " << 42; ss.str(); } Compiling it with clang++ -S -O3 -DNDEBUG -std=c++14 test.cc generates a ton of assembly code (half a kilobyte in x86-64 instructions compared to less than a hundred bytes for a similar sprintf code) - see below the output. Why does it generates so much code, is it inherent to the ostringstream API or this particular compiler/library does something wrong? .globl __Z18ostringstream_testv .p2align 4, 0x90 __Z18ostringstream_testv: ## #_Z18ostringstream_testv Lfunc_begin0: .cfi_startproc .cfi_personality 155, ___gxx_personality_v0 .cfi_lsda 16, Lexception0 ## BB#0: pushq %rbp Lcfi0: .cfi_def_cfa_offset 16 Lcfi1: .cfi_offset %rbp, -16 movq %rsp, %rbp Lcfi2: .cfi_def_cfa_register %rbp pushq %r15 pushq %r14 pushq %r13 pushq %r12 pushq %rbx subq $328, %rsp ## imm = 0x148 Lcfi3: .cfi_offset %rbx, -56 Lcfi4: .cfi_offset %r12, -48 Lcfi5: .cfi_offset %r13, -40 Lcfi6: .cfi_offset %r14, -32 Lcfi7: .cfi_offset %r15, -24 leaq -256(%rbp), %r14 leaq -360(%rbp), %r12 movq __ZTCNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE0_NS_13basic_ostreamIcS2_EE#GOTPCREL(%rip), %rax leaq 24(%rax), %rcx movq %rcx, -368(%rbp) addq $64, %rax movq %rax, -256(%rbp) Ltmp0: movq %r14, %rdi movq %r12, %rsi callq __ZNSt3__18ios_base4initEPv Ltmp1: ## BB#1: movq $0, -120(%rbp) movl $-1, -112(%rbp) movq __ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rbx leaq 24(%rbx), %r13 movq %r13, -368(%rbp) addq $64, %rbx movq %rbx, -256(%rbp) Ltmp3: movq %r12, %rdi callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEEC2Ev Ltmp4: ## BB#2: movq __ZTVNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %r15 addq $16, %r15 movq %r15, -360(%rbp) movq $0, -272(%rbp) movq $0, -280(%rbp) movq $0, -288(%rbp) 
movq $0, -296(%rbp) movl $16, -264(%rbp) xorps %xmm0, %xmm0 movaps %xmm0, -80(%rbp) movq $0, -64(%rbp) Ltmp6: leaq -80(%rbp), %rsi movq %r12, %rdi callq __ZNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strERKNS_12basic_stringIcS2_S4_EE Ltmp7: ## BB#3: testb $1, -80(%rbp) je LBB0_5 ## BB#4: movq -64(%rbp), %rdi callq __ZdlPv LBB0_5: Ltmp9: leaq L_.str(%rip), %rsi leaq -368(%rbp), %rdi movl $4, %edx callq __ZNSt3__124__put_character_sequenceIcNS_11char_traitsIcEEEERNS_13basic_ostreamIT_T0_EES7_PKS4_m Ltmp10: ## BB#6: Ltmp11: movl $42, %esi movq %rax, %rdi callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEElsEi Ltmp12: ## BB#7: Ltmp13: leaq -104(%rbp), %rdi movq %r12, %rsi callq __ZNKSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strEv Ltmp14: ## BB#8: testb $1, -104(%rbp) je LBB0_10 ## BB#9: movq -88(%rbp), %rdi callq __ZdlPv LBB0_10: movq %r13, -368(%rbp) movq %rbx, -256(%rbp) movq %r15, -360(%rbp) testb $1, -296(%rbp) je LBB0_12 ## BB#11: movq -280(%rbp), %rdi callq __ZdlPv LBB0_12: movq %r12, %rdi callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev movq __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rsi addq $8, %rsi leaq -368(%rbp), %rdi callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev movq %r14, %rdi callq __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev addq $328, %rsp ## imm = 0x148 popq %rbx popq %r12 popq %r13 popq %r14 popq %r15 popq %rbp retq LBB0_13: Ltmp8: movq %rax, -48(%rbp) ## 8-byte Spill testb $1, -80(%rbp) je LBB0_18 ## BB#14: movq -64(%rbp), %rdi callq __ZdlPv testb $1, -296(%rbp) jne LBB0_19 jmp LBB0_20 LBB0_16: Ltmp5: movq %rax, -48(%rbp) ## 8-byte Spill jmp LBB0_21 LBB0_15: Ltmp2: movq %rax, -48(%rbp) ## 8-byte Spill jmp LBB0_22 LBB0_17: Ltmp15: movq %rax, -48(%rbp) ## 8-byte Spill movq %r13, -368(%rbp) movq %rbx, -256(%rbp) movq %r15, -360(%rbp) LBB0_18: testb $1, -296(%rbp) je LBB0_20 LBB0_19: movq -280(%rbp), %rdi callq __ZdlPv LBB0_20: movq 
%r12, %rdi callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev LBB0_21: movq __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rsi addq $8, %rsi leaq -368(%rbp), %rdi callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev LBB0_22: movq %r14, %rdi callq __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev movq -48(%rbp), %rdi ## 8-byte Reload callq __Unwind_Resume Lfunc_end0: .cfi_endproc .section __TEXT,__gcc_except_tab .p2align 2 GCC_except_table0: Lexception0: .byte 255 ## #LPStart Encoding = omit .byte 155 ## #TType Encoding = indirect pcrel sdata4 .asciz "\303\200" ## #TType base offset .byte 3 ## Call site Encoding = udata4 .byte 65 ## Call site table length Lset0 = Ltmp0-Lfunc_begin0 ## >> Call Site 1 << .long Lset0 Lset1 = Ltmp1-Ltmp0 ## Call between Ltmp0 and Ltmp1 .long Lset1 Lset2 = Ltmp2-Lfunc_begin0 ## jumps to Ltmp2 .long Lset2 .byte 0 ## On action: cleanup Lset3 = Ltmp3-Lfunc_begin0 ## >> Call Site 2 << .long Lset3 Lset4 = Ltmp4-Ltmp3 ## Call between Ltmp3 and Ltmp4 .long Lset4 Lset5 = Ltmp5-Lfunc_begin0 ## jumps to Ltmp5 .long Lset5 .byte 0 ## On action: cleanup Lset6 = Ltmp6-Lfunc_begin0 ## >> Call Site 3 << .long Lset6 Lset7 = Ltmp7-Ltmp6 ## Call between Ltmp6 and Ltmp7 .long Lset7 Lset8 = Ltmp8-Lfunc_begin0 ## jumps to Ltmp8 .long Lset8 .byte 0 ## On action: cleanup Lset9 = Ltmp9-Lfunc_begin0 ## >> Call Site 4 << .long Lset9 Lset10 = Ltmp14-Ltmp9 ## Call between Ltmp9 and Ltmp14 .long Lset10 Lset11 = Ltmp15-Lfunc_begin0 ## jumps to Ltmp15 .long Lset11 .byte 0 ## On action: cleanup Lset12 = Ltmp14-Lfunc_begin0 ## >> Call Site 5 << .long Lset12 Lset13 = Lfunc_end0-Ltmp14 ## Call between Ltmp14 and Lfunc_end0 .long Lset13 .long 0 ## has no landing pad .byte 0 ## On action: cleanup .p2align 2
The most likely reason for the difference is that the IOStream implementation is expanded inline while the sprintf() use is just a function call. Nothing inherently prevents IOStreams from being implemented in a library. It does take a tiny bit of abstraction and planning, though: the definition in the standard uses templates. These are normally just implemented inline. Declaring the typically used instantiations (for character types char and wchar_t) as extern templates and explicitly instantiating them is extra work, though. I showed a long time ago that it does pay off in terms of compile time and, at least, libstdc++ preinstantiates the IOStreams functions in a library. Based on your experiment it seems libc++ doesn't.
Why is a segmentation fault caused by the order of class member variables?
I've created following program : class CLexer { public: CLexer( ) { iCursorPos = 0; } void putCharacter(char character) { if(character != ' ' && character != '\n') { m_strToken[iCursorPos] = character; iCursorPos++; } else { m_strToken[iCursorPos] = '\0'; iCursorPos = 0; } } private: char m_strToken[1024]; int iCursorPos = 0; }; int main(int argc, char * argv[]) { CLexer lex; lex.putCharacter('m'); return 0; } Assembler output produced by compiler : .file "main.cpp" .section .text._ZN6CLexerC2Ev,"axG",#progbits,_ZN6CLexerC5Ev,comdat .align 2 .weak _ZN6CLexerC2Ev .type _ZN6CLexerC2Ev, #function _ZN6CLexerC2Ev: .LFB1: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movq %rdi, -8(%rbp) movq -8(%rbp), %rax movl $0, 1024(%rax) movq -8(%rbp), %rax movl $0, 1024(%rax) nop popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE1: .size _ZN6CLexerC2Ev, .-_ZN6CLexerC2Ev .weak _ZN6CLexerC1Ev .set _ZN6CLexerC1Ev,_ZN6CLexerC2Ev .section .text._ZN6CLexer12putCharacterEc,"axG",#progbits,_ZN6CLexer12putCharacterEc,comdat .align 2 .weak _ZN6CLexer12putCharacterEc .type _ZN6CLexer12putCharacterEc, #function _ZN6CLexer12putCharacterEc: .LFB3: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 .cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 movq %rdi, -8(%rbp) movl %esi, %eax movb %al, -12(%rbp) cmpb $32, -12(%rbp) je .L3 cmpb $10, -12(%rbp) je .L3 movq -8(%rbp), %rax movl 1024(%rax), %eax movq -8(%rbp), %rdx cltq movzbl -12(%rbp), %ecx movb %cl, (%rdx,%rax) movq -8(%rbp), %rax movl 1024(%rax), %eax leal 1(%rax), %edx movq -8(%rbp), %rax movl %edx, 1024(%rax) jmp .L4 .L3: movq -8(%rbp), %rax movl 1024(%rax), %eax movq -8(%rbp), %rdx cltq movb $0, (%rdx,%rax) movq -8(%rbp), %rax movl $0, 1024(%rax) .L4: nop popq %rbp .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE3: .size _ZN6CLexer12putCharacterEc, .-_ZN6CLexer12putCharacterEc .text .globl main .type main, #function main: .LFB4: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 
.cfi_offset 6, -16 movq %rsp, %rbp .cfi_def_cfa_register 6 subq $1056, %rsp movl %edi, -1044(%rbp) movq %rsi, -1056(%rbp) leaq -1040(%rbp), %rax movq %rax, %rdi call _ZN6CLexerC1Ev leaq -1040(%rbp), %rax movl $109, %esi movq %rax, %rdi call _ZN6CLexer12putCharacterEc movl $0, %eax leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE4: .size main, .-main .ident "GCC: (GNU) 6.1.1 20160501" .section .note.GNU-stack,"",#progbits After execution, the first call to the putCharacter method with the 'm' character as its parameter throws a segfault. The attached gdb gives the following output: Program received signal SIGSEGV, Segmentation fault. 0x00000000004018e5 in CLexer::putCharacter (this=0x7fffffffe370, character=109 'm') at src/main.cpp:60 60 m_strToken[iCursorPos] = character; I've managed to fix this error by moving the iCursorPos variable above m_strToken in the class declaration, but I think this isn't the proper way to fix the issue. I'm using g++ (GCC) 6.1.1 20160501 on the latest, fully updated version of Arch Linux x86_64.
if(character != ' ' && character != '\n') { m_strToken[iCursorPos] = character; iCursorPos++; } You don't check that iCursorPos < 1024 here, so once the buffer is full you write past its end, into iCursorPos itself. The next access, m_strToken[iCursorPos] = character;, then probably writes far beyond the end of the buffer, and you get a segfault (luckily). Your "fix" still isn't correct, since you corrupt other parts of your object's memory regardless.
Why are sin/cos slower when optimizations are enabled?
After reading a question related with the performance of sin/cos (Why is std::sin() and std::cos() slower than sin() and cos()?), I made some tests with his code and found a weird thing: If i call sin/cos with a float value, it is much slower than with double when compiled with optimization. #include <cmath> #include <cstdio> const int N = 4000; float cosine[N][N]; float sine[N][N]; int main() { for (int i = 0; i < N; i++) { for (int j = 0; j < N; j++) { float ang = i*j*2*M_PI/N; cosine[i][j] = cos(ang); sine[i][j] = sin(ang); } } } With the above code I get: With -O0: 2.402s With -O1: 9.004s With -O2: 9.013s With -O3: 9.001s Now if I change float ang = i*j*2*M_PI/N; To double ang = i*j*2*M_PI/N; I get: With -O0: 2.362s With -O1: 1.188s With -O2: 1.197s With -O3: 1.197s How can the first test be that faster without optimizations? I'm using g++ (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2, 64 bits. EDIT: Changed the title to better describe the problem. EDIT: Added assembly code Assembly for first test with O0: .file "main.cpp" .globl cosine .bss .align 32 .type cosine, #object .size cosine, 64000000 cosine: .zero 64000000 .globl sine .align 32 .type sine, #object .size sine, 64000000 sine: .zero 64000000 .text .globl main .type main, #function main: .LFB87: .cfi_startproc pushq %rbp .cfi_def_cfa_offset 16 movq %rsp, %rbp .cfi_offset 6, -16 .cfi_def_cfa_register 6 subq $16, %rsp movl $0, -4(%rbp) jmp .L2 .L5: movl $0, -8(%rbp) jmp .L3 .L4: movl -4(%rbp), %eax imull -8(%rbp), %eax addl %eax, %eax cvtsi2sd %eax, %xmm0 movsd .LC0(%rip), %xmm1 mulsd %xmm1, %xmm0 movsd .LC1(%rip), %xmm1 divsd %xmm1, %xmm0 unpcklpd %xmm0, %xmm0 cvtpd2ps %xmm0, %xmm0 movss %xmm0, -12(%rbp) movss -12(%rbp), %xmm0 cvtps2pd %xmm0, %xmm0 call cos unpcklpd %xmm0, %xmm0 cvtpd2ps %xmm0, %xmm0 movl -8(%rbp), %eax cltq movl -4(%rbp), %edx movslq %edx, %rdx imulq $4000, %rdx, %rdx leaq (%rdx,%rax), %rax movss %xmm0, cosine(,%rax,4) movss -12(%rbp), %xmm0 cvtps2pd %xmm0, %xmm0 call sin unpcklpd %xmm0, %xmm0 
cvtpd2ps %xmm0, %xmm0 movl -8(%rbp), %eax cltq movl -4(%rbp), %edx movslq %edx, %rdx imulq $4000, %rdx, %rdx leaq (%rdx,%rax), %rax movss %xmm0, sine(,%rax,4) addl $1, -8(%rbp) .L3: cmpl $3999, -8(%rbp) setle %al testb %al, %al jne .L4 addl $1, -4(%rbp) .L2: cmpl $3999, -4(%rbp) setle %al testb %al, %al jne .L5 movl $0, %eax leave .cfi_def_cfa 7, 8 ret .cfi_endproc .LFE87: .size main, .-main .section .rodata .align 4 .type _ZL1N, #object .size _ZL1N, 4 _ZL1N: .long 4000 .align 8 .LC0: .long 1413754136 .long 1074340347 .align 8 .LC1: .long 0 .long 1085227008 .ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2" .section .note.GNU-stack,"",#progbits Assembly for first test with O3: .file "main.cpp" .text .p2align 4,,15 .globl main .type main, #function main: .LFB121: .cfi_startproc pushq %r15 .cfi_def_cfa_offset 16 xorl %r15d, %r15d .cfi_offset 15, -16 pushq %r14 .cfi_def_cfa_offset 24 movl $cosine+16000, %r14d .cfi_offset 14, -24 pushq %r13 .cfi_def_cfa_offset 32 xorl %r13d, %r13d .cfi_offset 13, -32 pushq %r12 .cfi_def_cfa_offset 40 pushq %rbp .cfi_def_cfa_offset 48 pushq %rbx .cfi_def_cfa_offset 56 subq $24, %rsp .cfi_def_cfa_offset 80 .p2align 4,,10 .p2align 3 .L2: movslq %r15d, %rbp .cfi_offset 3, -56 .cfi_offset 6, -48 .cfi_offset 12, -40 movl %r13d, %r12d movl $0x3f800000, %edx imulq $16000, %rbp, %rbp xorl %eax, %eax leaq cosine(%rbp), %rbx addq $sine, %rbp jmp .L5 .p2align 4,,10 .p2align 3 .L3: movl %r12d, %eax leaq 8(%rsp), %rsi leaq 12(%rsp), %rdi subl %r13d, %eax cvtsi2sd %eax, %xmm0 mulsd .LC2(%rip), %xmm0 divsd .LC3(%rip), %xmm0 unpcklpd %xmm0, %xmm0 cvtpd2ps %xmm0, %xmm0 call sincosf movl 8(%rsp), %edx movl 12(%rsp), %eax .L5: movl %edx, (%rbx) addq $4, %rbx movl %eax, 0(%rbp) addl %r13d, %r12d addq $4, %rbp cmpq %r14, %rbx jne .L3 addl $1, %r15d addl $2, %r13d leaq 16000(%rbx), %r14 cmpl $4000, %r15d jne .L2 addq $24, %rsp .cfi_def_cfa_offset 56 xorl %eax, %eax popq %rbx .cfi_def_cfa_offset 48 popq %rbp .cfi_def_cfa_offset 40 popq %r12 
.cfi_def_cfa_offset 32 popq %r13 .cfi_def_cfa_offset 24 popq %r14 .cfi_def_cfa_offset 16 popq %r15 .cfi_def_cfa_offset 8 ret .cfi_endproc .LFE121: .size main, .-main .globl cosine .bss .align 32 .type cosine, #object .size cosine, 64000000 cosine: .zero 64000000 .globl sine .align 32 .type sine, #object .size sine, 64000000 sine: .zero 64000000 .section .rodata.cst8,"aM",#progbits,8 .align 8 .LC2: .long 1413754136 .long 1074340347 .align 8 .LC3: .long 0 .long 1085227008 .ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2" .section .note.GNU-stack,"",#progbits
Here's a possibility: In C, cos is double precision and cosf is single precision. In C++, std::cos has overloads for both double and float. You aren't calling std::cos. If <cmath> doesn't also overload ::cos (as far as I know, it is not required to), then you are just calling the C double-precision function. If this is the case, then you're paying the cost of converting from float to double and back. Now, some standard libraries implement cos(float x) as (float)cos((double)x), so even if you are calling the float function it might still be doing conversions behind the scenes. This shouldn't account for a 9x performance difference, though.
AFAIK it's because computers work at double precision natively. Using float requires conversions.