Halide: X86 assembly code generation

Halide: X86 assembly code generation - c++

I'm new to Halide. I am trying to compile the camera_pipe application from the source code (https://github.com/halide/Halide/tree/master/apps/camera_pipe). I have successfully compiled camera_pipe.cpp. It generates "curved.s" assembly code.
# Lfunc_begin0:
.loc 3 12 0
#/data/nfs_home/akafi/Halide_CoreIR/src/runtime/posix_allocator.cpp:12:0
.cfi_startproc
#BB#0:
pushq %rbp
.Ltmp0:
.cfi_def_cfa_offset 16
.Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp2:
.cfi_def_cfa_register %rbp
#DEBUG_VALUE: default_malloc:user_context <- %RDI
#DEBUG_VALUE: default_malloc:x <- %RSI
.Ltmp3:
#DEBUG_VALUE: default_malloc:alignment <- 128
.loc 3 15 27 prologue_end
#/data/nfs_home/akafi/Halide_CoreIR/src/runtime/posix_allocator.cpp:15:27
subq $-128, %rsi
.Ltmp4:
.loc 3 15 18 is_stmt 0
# /data/nfs_home/akafi/Halide_CoreIR/src/runtime/posix_allocator.cpp:15:18
movq %rsi, %rdi
.Ltmp5:
callq malloc#PLT
movq %rax, %rcx
.Ltmp6:
#DEBUG_VALUE: default_malloc:orig <- %RCX
xorl %eax, %eax
.loc 3 16 14 is_stmt 1
# /data/nfs_home/akafi/Halide_CoreIR/src/runtime/posix_allocator.cpp:16:14
.Ltmp7:
testq %rcx, %rcx
je .LBB0_2
.Ltmp8:
# BB#1:
#DEBUG_VALUE: default_malloc:orig <- %RCX
.loc 3 21 68
# data/nfs_home/akafi/Halide_CoreIR/src/runtime/posix_allocator.cpp:21:68
movq %rcx, %rax
addq $135, %rax
......
......
I have tried to debug the source code. I found that the "camera_pipe.cpp" called the "/Halide_CoreIR/src/CodeGen_X86.cpp".
The generated assembly doesn't look like X86 assembly. Then what is the function of "CodeGen_X86.cpp"?

It sounds like you may be building using a very old Halide tree—for quite a while there has not been any file named camera_pipe.cpp, the generated output is not called curved.*, etc.
That said, the x86 backend in CodeGen_X86.cpp does generate x86 code. The curved.s you posted is x86_64 assembly.

Related

C++ return reference push instruction not showing up in assembly

I'm trying to learn more about how return references are compiled, and I'm currently stuck on how they show up in assembly. The code I'm running is this:
// Plain aggregate used to observe how a struct is passed to and returned
// from a function by value. B and B2 are global instances of it.
struct Obj {
char buf[100];
int i;
long b;
} B, B2;
// Receives an Obj by value, sets its `i` member to 100, and returns the
// modified copy by value.
Obj foo(Obj b) {
b.i = 100; // Do something to the argument
return b;
}
// Passes the global B to foo by value and assigns the returned Obj to the
// global B2.
int main() {
B2 = foo(B);
}
and the assembly code looks like this:
.file "return_function_assembly.cpp"
.text
.globl B
.bss
.align 32
.type B, #object
.size B, 112
B:
.zero 112
.globl B2
.align 32
.type B2, #object
.size B2, 112
B2:
.zero 112
.text
.globl _Z3foo3Obj
.type _Z3foo3Obj, #function
_Z3foo3Obj:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -8(%rbp)
movl $100, 116(%rbp)
movq -8(%rbp), %rax
movq 16(%rbp), %rdx
movq 24(%rbp), %rcx
movq %rdx, (%rax)
movq %rcx, 8(%rax)
movq 32(%rbp), %rdx
movq 40(%rbp), %rcx
movq %rdx, 16(%rax)
movq %rcx, 24(%rax)
movq 48(%rbp), %rdx
movq 56(%rbp), %rcx
movq %rdx, 32(%rax)
movq %rcx, 40(%rax)
movq 64(%rbp), %rdx
movq 72(%rbp), %rcx
movq %rdx, 48(%rax)
movq %rcx, 56(%rax)
movq 80(%rbp), %rdx
movq 88(%rbp), %rcx
movq %rdx, 64(%rax)
movq %rcx, 72(%rax)
movq 96(%rbp), %rdx
movq 104(%rbp), %rcx
movq %rdx, 80(%rax)
movq %rcx, 88(%rax)
movq 112(%rbp), %rdx
movq 120(%rbp), %rcx
movq %rdx, 96(%rax)
movq %rcx, 104(%rax)
movq -8(%rbp), %rax
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size _Z3foo3Obj, .-_Z3foo3Obj
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
addq $-128, %rsp
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
leaq -128(%rbp), %rax
pushq 104+B(%rip)
pushq 96+B(%rip)
pushq 88+B(%rip)
pushq 80+B(%rip)
pushq 72+B(%rip)
pushq 64+B(%rip)
pushq 56+B(%rip)
pushq 48+B(%rip)
pushq 40+B(%rip)
pushq 32+B(%rip)
pushq 24+B(%rip)
pushq 16+B(%rip)
pushq 8+B(%rip)
pushq B(%rip)
movq %rax, %rdi
call _Z3foo3Obj
addq $112, %rsp
movq -128(%rbp), %rax
movq -120(%rbp), %rdx
movq %rax, B2(%rip)
movq %rdx, 8+B2(%rip)
movq -112(%rbp), %rax
movq -104(%rbp), %rdx
movq %rax, 16+B2(%rip)
movq %rdx, 24+B2(%rip)
movq -96(%rbp), %rax
movq -88(%rbp), %rdx
movq %rax, 32+B2(%rip)
movq %rdx, 40+B2(%rip)
movq -80(%rbp), %rax
movq -72(%rbp), %rdx
movq %rax, 48+B2(%rip)
movq %rdx, 56+B2(%rip)
movq -64(%rbp), %rax
movq -56(%rbp), %rdx
movq %rax, 64+B2(%rip)
movq %rdx, 72+B2(%rip)
movq -48(%rbp), %rax
movq -40(%rbp), %rdx
movq %rax, 80+B2(%rip)
movq %rdx, 88+B2(%rip)
movq -32(%rbp), %rax
movq -24(%rbp), %rdx
movq %rax, 96+B2(%rip)
movq %rdx, 104+B2(%rip)
movl $0, %eax
movq -8(%rbp), %rdx
xorq %fs:40, %rdx
je .L5
call __stack_chk_fail#PLT
.L5:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.ident "GCC: (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0"
.section .note.GNU-stack,"",#progbits
System is Linux, compiled with g++, and from my understanding of function frames, I should be seeing an additional "push" instruction that pushes the address of B2 onto the stack prior to the call instruction in the function frame.
However, none of these push instructions seem to correspond to what I'm looking for. I see a leaq instruction, and a pushq (%rbp) at the beginning of main, but nothing that seems to be what I'm supposed to be seeing. Can anyone please advise?

Why does march=native corrupt my program?

I'm compiling the program:
#include <iostream>
#include <vector>
#include <cstddef>
#include <algorithm>
// Pair of integer parameters, `open` and `extend`, read by init_dp_matrix
// when it fills the matrix borders.
struct Model
{
int open, extend;
};
// One matrix entry holding two integer values, `a` and `b`.
struct Cell
{
int a, b;
};
// Two-level matrix: an outer vector whose elements are vectors of Cell.
typedef std::vector<std::vector<Cell>> DPMatrix;
// Writes every Cell of `matrix` to std::cout in the form "{a b} ", emitting
// one output line per inner vector.
void print(const DPMatrix& matrix)
{
for (std::size_t i = 0; i < matrix.size(); ++i) {
for (std::size_t j = 0; j < matrix[i].size(); ++j) {
std::cout << '{' << matrix[i][j].a << ' ' << matrix[i][j].b << "} ";
}
// End the current row's line (std::endl also flushes the stream).
std::cout << std::endl;
}
}
// Builds a matrix of `num_cols` inner vectors, each holding `num_rows`
// value-initialized Cells, then fills two borders:
//   - result[i][0].b for i in [1, num_cols)
//   - result[0][j].a for j in [1, num_rows)
// Each border value is model.open + (index - 1) * model.extend.
// Returns the populated matrix by value.
DPMatrix init_dp_matrix(const std::size_t num_cols, const std::size_t num_rows, const Model& model)
{
DPMatrix result(num_cols, DPMatrix::value_type(num_rows, Cell()));
// NOTE(review): `inf` is computed here but never read in this function.
const int inf = model.open * std::max(num_cols, num_rows);
// NOTE(review): `i` is int while num_cols is std::size_t, so the loop
// condition compares signed with unsigned.
for (int i = 1; i < num_cols; ++i) {
result[i][0].b = model.open + (i - 1) * model.extend;
}
// NOTE(review): same signed/unsigned comparison as above, for `j`.
for (int j = 1; j < num_rows; ++j) {
result[0][j].a = model.open + (j - 1) * model.extend;
}
return result;
}
// Builds the matrix with init_dp_matrix(10, 2, model), where model has
// open = -8 and extend = -1, and prints it.
int main()
{
const Model model = {-8, -1};
const DPMatrix matrix = init_dp_matrix(10, 2, model);
print(matrix);
}
With GCC 9.2.0:
$ g++-9 -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
with -march=native:
$ g++-9 -O3 -march=native -o bug bug.cpp
On an Ubuntu machine with Intel chips:
$ lsb_release -a
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 18.04.3 LTS
Release: 18.04
Codename: bionic
$ grep model /proc/cpuinfo | head -2
model : 85
model name : Intel(R) Xeon(R) Platinum 8175M CPU @ 2.50GHz
Running the program I get bogus output:
$ ./bug
{0 0} {-8 0}
{-2048 255} {0 0}
{-2304 255} {0 0}
{-2560 255} {0 0}
{-2816 255} {0 0}
{-3072 255} {0 0}
{-3328 255} {0 0}
{-3584 255} {0 0}
{-3840 255} {0 0}
{0 -16} {0 0}
If I compile without -march=native I get the correct output:
$ g++-9 -O3 -o bug bug.cpp
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
The assembly for the -march=native version is:
$ g++-9 -O3 -march=native -S bug.cpp
$ cat bug.s
.file "bug.cpp"
.text
.section .text._ZNKSt5ctypeIcE8do_widenEc,"axG",#progbits,_ZNKSt5ctypeIcE8do_widenEc,comdat
.align 2
.p2align 4
.weak _ZNKSt5ctypeIcE8do_widenEc
.type _ZNKSt5ctypeIcE8do_widenEc, #function
_ZNKSt5ctypeIcE8do_widenEc:
.LFB1303:
.cfi_startproc
movl %esi, %eax
ret
.cfi_endproc
.LFE1303:
.size _ZNKSt5ctypeIcE8do_widenEc, .-_ZNKSt5ctypeIcE8do_widenEc
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string "} "
.text
.p2align 4
.globl _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.type _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, #function
_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB2359:
.cfi_startproc
movq (%rdi), %rdx
cmpq %rdx, 8(%rdi)
je .L23
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
movabsq $-6148914691236517205, %r13
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
xorl %r12d, %r12d
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
movq %rdi, %rbp
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
subq $24, %rsp
.cfi_def_cfa_offset 80
.p2align 4,,10
.p2align 3
.L4:
leaq (%r12,%r12,2), %rbx
salq $3, %rbx
addq %rbx, %rdx
movq 8(%rdx), %rax
xorl %r14d, %r14d
cmpq %rax, (%rdx)
je .L8
.p2align 4,,10
.p2align 3
.L5:
movl $1, %edx
leaq 15(%rsp), %rsi
movl $_ZSt4cout, %edi
movb $123, 15(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq %rax, %rdi
movq 0(%rbp), %rax
leaq 0(,%r14,8), %r15
movq (%rax,%rbx), %rax
movl (%rax,%r14,8), %esi
incq %r14
call _ZNSolsEi
movq %rax, %rdi
movl $1, %edx
leaq 15(%rsp), %rsi
movb $32, 15(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq %rax, %rdi
movq 0(%rbp), %rax
movq (%rax,%rbx), %rax
movl 4(%rax,%r15), %esi
call _ZNSolsEi
movq %rax, %rdi
movl $2, %edx
movl $.LC0, %esi
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l
movq 0(%rbp), %rdx
addq %rbx, %rdx
movq 8(%rdx), %rax
subq (%rdx), %rax
sarq $3, %rax
cmpq %rax, %r14
jb .L5
.L8:
movq _ZSt4cout(%rip), %rax
movq -24(%rax), %rax
movq _ZSt4cout+240(%rax), %r14
testq %r14, %r14
je .L26
cmpb $0, 56(%r14)
je .L9
movsbl 67(%r14), %esi
.L10:
movl $_ZSt4cout, %edi
call _ZNSo3putEc
movq %rax, %rdi
call _ZNSo5flushEv
movq 0(%rbp), %rdx
movq 8(%rbp), %rax
incq %r12
subq %rdx, %rax
sarq $3, %rax
imulq %r13, %rax
cmpq %r12, %rax
ja .L4
addq $24, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.p2align 4,,10
.p2align 3
.L9:
.cfi_restore_state
movq %r14, %rdi
call _ZNKSt5ctypeIcE13_M_widen_initEv
movq (%r14), %rax
movl $10, %esi
movq 48(%rax), %rax
cmpq $_ZNKSt5ctypeIcE8do_widenEc, %rax
je .L10
movq %r14, %rdi
call *%rax
movsbl %al, %esi
jmp .L10
.L23:
.cfi_def_cfa_offset 8
.cfi_restore 3
.cfi_restore 6
.cfi_restore 12
.cfi_restore 13
.cfi_restore 14
.cfi_restore 15
ret
.L26:
.cfi_def_cfa_offset 80
.cfi_offset 3, -56
.cfi_offset 6, -48
.cfi_offset 12, -40
.cfi_offset 13, -32
.cfi_offset 14, -24
.cfi_offset 15, -16
call _ZSt16__throw_bad_castv
.cfi_endproc
.LFE2359:
.size _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.section .rodata.str1.8,"aMS",#progbits,1
.align 8
.LC2:
.string "cannot create std::vector larger than max_size()"
.section .text.unlikely,"ax",#progbits
.LCOLDB6:
.text
.LHOTB6:
.p2align 4
.globl _Z14init_dp_matrixmmRK5Model
.type _Z14init_dp_matrixmmRK5Model, #function
_Z14init_dp_matrixmmRK5Model:
.LFB2360:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA2360
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movabsq $1152921504606846975, %rax
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
andq $-32, %rsp
subq $64, %rsp
.cfi_offset 15, -24
.cfi_offset 14, -32
.cfi_offset 13, -40
.cfi_offset 12, -48
.cfi_offset 3, -56
movq %rdi, 24(%rsp)
movq %rsi, 40(%rsp)
movq %rcx, 16(%rsp)
cmpq %rax, %rdx
ja .L103
movq %rdx, %r15
testq %rdx, %rdx
je .L71
leaq 0(,%rdx,8), %rbx
movq %rbx, %rdi
.LEHB0:
call _Znwm
.LEHE0:
movq %rax, %r13
leaq -1(%r15), %rax
cmpq $3, %rax
movq %r15, %rdx
movq %r13, %rax
jbe .L30
shrq $2, %rdx
salq $5, %rdx
addq %r13, %rdx
vpxor %xmm0, %xmm0, %xmm0
.p2align 4,,10
.p2align 3
.L32:
vmovdqu32 %ymm0, (%rax)
addq $32, %rax
cmpq %rdx, %rax
jne .L32
movq %r15, %rcx
andq $-4, %rcx
movq %r15, %rdx
andl $3, %edx
leaq 0(%r13,%rcx,8), %rax
cmpq %rcx, %r15
je .L33
.L30:
movq $0, (%rax)
cmpq $1, %rdx
je .L33
movq $0, 8(%rax)
cmpq $2, %rdx
je .L33
movq $0, 16(%rax)
cmpq $3, %rdx
je .L33
movq $0, 24(%rax)
.L33:
leaq 0(%r13,%rbx), %rax
movq %rax, 56(%rsp)
.L29:
movabsq $384307168202282325, %rax
cmpq %rax, 40(%rsp)
ja .L104
movq 40(%rsp), %rax
movq 24(%rsp), %r12
leaq (%rax,%rax,2), %rbx
movq $0, (%r12)
movq $0, 8(%r12)
movq $0, 16(%r12)
salq $3, %rbx
testq %rax, %rax
je .L35
movq %rbx, %rdi
vzeroupper
.LEHB1:
call _Znwm
.LEHE1:
addq %rax, %rbx
movq %rax, (%r12)
movq %rax, 8(%r12)
movq %rbx, 16(%r12)
movq 56(%rsp), %r12
movq %rax, %r14
subq %r13, %r12
movq %r12, %rax
sarq $3, %rax
je .L40
movabsq $1152921504606846975, %rdx
cmpq %rdx, %rax
ja .L41
movq 40(%rsp), %rax
movq %r14, %rbx
movq %rax, 48(%rsp)
.p2align 4,,10
.p2align 3
.L46:
movq $0, (%rbx)
movq $0, 8(%rbx)
movq $0, 16(%rbx)
movq %r12, %rdi
.LEHB2:
call _Znwm
.LEHE2:
leaq (%rax,%r12), %rcx
movq %rax, (%rbx)
movq %rcx, 16(%rbx)
movq %rax, %rdi
cmpq %r13, 56(%rsp)
je .L42
movq %r12, %rdx
movq %r13, %rsi
movq %rcx, 32(%rsp)
call memcpy
movq 32(%rsp), %rcx
addq $24, %rbx
movq %rcx, -16(%rbx)
decq 48(%rsp)
jne .L46
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
.L47:
movq %r13, %rdi
call _ZdlPv
.L48:
movq 16(%rsp), %rax
cmpq $1, 40(%rsp)
movl (%rax), %edx
jbe .L62
movl 4(%rax), %edi
movq 24(%rsp), %rax
movq (%rax), %rsi
movq 40(%rsp), %rax
leaq -2(%rax), %rcx
cmpq $7, %rcx
jbe .L73
movq %rcx, %r8
shrq $3, %r8
leaq (%r8,%r8,2), %r8
salq $6, %r8
vmovdqa64 .LC1(%rip), %ymm3
vmovdqa64 .LC3(%rip), %ymm4
vmovdqa64 .LC4(%rip), %ymm6
vmovdqa64 .LC5(%rip), %ymm5
vpbroadcastd %edi, %ymm10
vpbroadcastd %edx, %ymm9
leaq 24(%rsi), %rax
leaq 24(%rsi,%r8), %r8
vpcmpeqd %ymm8, %ymm8, %ymm8
kxnorb %k1, %k1, %k1
.p2align 4,,10
.p2align 3
.L61:
vmovdqa64 %ymm3, %ymm0
vpaddd %ymm8, %ymm0, %ymm0
vpmulld %ymm10, %ymm0, %ymm0
vmovdqu64 (%rax), %ymm2
vmovdqu64 96(%rax), %ymm1
vpermt2q 32(%rax), %ymm6, %ymm2
vpermt2q 128(%rax), %ymm6, %ymm1
vpermt2q 64(%rax), %ymm5, %ymm2
vpaddd %ymm9, %ymm0, %ymm0
vpermt2q 160(%rax), %ymm5, %ymm1
kmovb %k1, %k2
addq $192, %rax
vpscatterqd %xmm0, 4(,%ymm2,1){%k2}
vperm2i128 $17, %ymm0, %ymm0, %ymm0
kmovb %k1, %k3
vpaddd %ymm4, %ymm3, %ymm3
vpscatterqd %xmm0, 4(,%ymm1,1){%k3}
cmpq %r8, %rax
jne .L61
andq $-8, %rcx
leaq 1(%rcx), %r8
leal 1(%rcx), %eax
.L59:
leaq (%r8,%r8,2), %rcx
movq (%rsi,%rcx,8), %r8
leal -1(%rax), %ecx
imull %edi, %ecx
movq 40(%rsp), %rbx
addl %edx, %ecx
movl %ecx, 4(%r8)
leal 1(%rax), %ecx
movslq %ecx, %r8
cmpq %r8, %rbx
jbe .L62
leaq (%r8,%r8,2), %r8
movq (%rsi,%r8,8), %r9
movl %edi, %r8d
imull %eax, %r8d
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 2(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %ecx
movl %ecx, 4(%r9)
leal 3(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %r8d
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 4(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %ecx
movl %ecx, 4(%r9)
leal 5(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %r8d
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl %edx, %r8d
movl %r8d, 4(%r9)
leal 6(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %rbx
jbe .L62
imull %edi, %ecx
leaq (%r9,%r9,2), %r9
movq (%rsi,%r9,8), %r9
addl $7, %eax
addl %edx, %ecx
cltq
movl %ecx, 4(%r9)
cmpq %rax, %rbx
jbe .L62
imull %r8d, %edi
leaq (%rax,%rax,2), %rax
movq (%rsi,%rax,8), %rax
leal (%rdi,%rdx), %r8d
movl %r8d, 4(%rax)
.L62:
cmpq $1, %r15
jbe .L27
movq 16(%rsp), %rax
leaq -1(%r15), %r8
movl 4(%rax), %edi
movq 24(%rsp), %rax
movq (%rax), %rax
movq (%rax), %rsi
leaq -2(%r15), %rax
cmpq $6, %rax
jbe .L74
movq %r8, %rcx
shrq $3, %rcx
salq $6, %rcx
vmovdqa64 .LC1(%rip), %ymm2
vmovdqa64 .LC3(%rip), %ymm4
vpbroadcastd %edi, %ymm6
vpbroadcastd %edx, %ymm5
movq %rsi, %rax
addq %rsi, %rcx
vpcmpeqd %ymm3, %ymm3, %ymm3
.p2align 4,,10
.p2align 3
.L66:
vmovdqa64 %ymm2, %ymm0
vpaddd %ymm3, %ymm0, %ymm0
vpmulld %ymm6, %ymm0, %ymm0
addq $64, %rax
vpaddd %ymm4, %ymm2, %ymm2
vpaddd %ymm5, %ymm0, %ymm0
vmovd %xmm0, -56(%rax)
vpextrd $1, %xmm0, -48(%rax)
vpextrd $2, %xmm0, -40(%rax)
vpextrd $3, %xmm0, -32(%rax)
vextracti128 $0x1, %ymm0, %xmm0
vmovd %xmm0, -24(%rax)
vpextrd $1, %xmm0, -16(%rax)
vpextrd $2, %xmm0, -8(%rax)
vpextrd $3, %xmm0, (%rax)
cmpq %rcx, %rax
jne .L66
movq %r8, %rcx
andq $-8, %rcx
leaq 1(%rcx), %r9
leal 1(%rcx), %eax
cmpq %r8, %rcx
je .L27
.L64:
leal -1(%rax), %ecx
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 1(%rax), %ecx
movslq %ecx, %r9
cmpq %r15, %r9
jnb .L27
movl %edi, %r8d
imull %eax, %r8d
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
leal 2(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 3(%rax), %ecx
movslq %ecx, %r9
cmpq %r15, %r9
jnb .L27
imull %edi, %r8d
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
leal 4(%rax), %r8d
movslq %r8d, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %ecx
addl %edx, %ecx
movl %ecx, (%rsi,%r9,8)
leal 5(%rax), %ecx
movslq %ecx, %r9
cmpq %r9, %r15
jbe .L27
imull %edi, %r8d
addl $6, %eax
cltq
addl %edx, %r8d
movl %r8d, (%rsi,%r9,8)
cmpq %rax, %r15
jbe .L27
imull %ecx, %edi
addl %edi, %edx
movl %edx, (%rsi,%rax,8)
.L27:
movq 24(%rsp), %rax
vzeroupper
leaq -40(%rbp), %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.p2align 4,,10
.p2align 3
.L37:
.cfi_restore_state
movq %r12, 8(%r14)
addq $24, %r14
cmpq %r14, %rbx
je .L45
.L40:
movq $0, (%r14)
movq %r12, 16(%r14)
cmpq %r13, 56(%rsp)
je .L37
movq %r12, %rdx
movq %r13, %rsi
xorl %edi, %edi
call memcpy
addq $24, %r14
movq %r12, -16(%r14)
cmpq %r14, %rbx
jne .L40
.L45:
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
testq %r13, %r13
je .L48
.L105:
movq %r13, %rdi
call _ZdlPv
jmp .L48
.p2align 4,,10
.p2align 3
.L42:
movq %rcx, 8(%rbx)
addq $24, %rbx
decq 48(%rsp)
jne .L46
movq 24(%rsp), %rax
movq %rbx, 8(%rax)
testq %r13, %r13
je .L48
jmp .L105
.p2align 4,,10
.p2align 3
.L71:
movq $0, 56(%rsp)
xorl %r13d, %r13d
jmp .L29
.p2align 4,,10
.p2align 3
.L35:
testq %r13, %r13
je .L106
vzeroupper
jmp .L47
.L73:
movl $1, %eax
movl $1, %r8d
jmp .L59
.L74:
movl $1, %eax
movl $1, %r9d
jmp .L64
.L106:
movq 16(%rsp), %rax
movl (%rax), %edx
jmp .L62
.L41:
movq $0, (%r14)
movq $0, 8(%r14)
movq $0, 16(%r14)
.LEHB3:
call _ZSt17__throw_bad_allocv
.LEHE3:
.L104:
movl $.LC2, %edi
vzeroupper
.LEHB4:
call _ZSt20__throw_length_errorPKc
.LEHE4:
.L103:
movl $.LC2, %edi
.LEHB5:
call _ZSt20__throw_length_errorPKc
.LEHE5:
.L78:
movq %rax, %rdi
jmp .L49
.L77:
movq %rax, %rdi
jmp .L50
.L75:
movq %rax, %r12
vzeroupper
jmp .L56
.globl __gxx_personality_v0
.section .gcc_except_table,"a",#progbits
.align 4
.LLSDA2360:
.byte 0xff
.byte 0x3
.uleb128 .LLSDATT2360-.LLSDATTD2360
.LLSDATTD2360:
.byte 0x1
.uleb128 .LLSDACSE2360-.LLSDACSB2360
.LLSDACSB2360:
.uleb128 .LEHB0-.LFB2360
.uleb128 .LEHE0-.LEHB0
.uleb128 0
.uleb128 0
.uleb128 .LEHB1-.LFB2360
.uleb128 .LEHE1-.LEHB1
.uleb128 .L75-.LFB2360
.uleb128 0
.uleb128 .LEHB2-.LFB2360
.uleb128 .LEHE2-.LEHB2
.uleb128 .L77-.LFB2360
.uleb128 0x1
.uleb128 .LEHB3-.LFB2360
.uleb128 .LEHE3-.LEHB3
.uleb128 .L78-.LFB2360
.uleb128 0x1
.uleb128 .LEHB4-.LFB2360
.uleb128 .LEHE4-.LEHB4
.uleb128 .L75-.LFB2360
.uleb128 0
.uleb128 .LEHB5-.LFB2360
.uleb128 .LEHE5-.LEHB5
.uleb128 0
.uleb128 0
.LLSDACSE2360:
.byte 0x1
.byte 0
.align 4
.long 0
.LLSDATT2360:
.text
.cfi_endproc
.section .text.unlikely
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDAC2360
.type _Z14init_dp_matrixmmRK5Model.cold, #function
_Z14init_dp_matrixmmRK5Model.cold:
.LFSB2360:
.L49:
.cfi_def_cfa 6, 16
.cfi_offset 3, -56
.cfi_offset 6, -16
.cfi_offset 12, -48
.cfi_offset 13, -40
.cfi_offset 14, -32
.cfi_offset 15, -24
movq %r14, %rbx
.L50:
vzeroupper
call __cxa_begin_catch
.L53:
cmpq %rbx, %r14
jne .L107
.LEHB6:
call __cxa_rethrow
.LEHE6:
.L76:
movq %rax, %r12
vzeroupper
call __cxa_end_catch
movq 24(%rsp), %rax
movq (%rax), %rdi
testq %rdi, %rdi
je .L56
call _ZdlPv
.L56:
testq %r13, %r13
je .L69
movq %r13, %rdi
call _ZdlPv
.L69:
movq %r12, %rdi
.LEHB7:
call _Unwind_Resume
.LEHE7:
.L107:
movq (%r14), %rdi
testq %rdi, %rdi
je .L52
call _ZdlPv
.L52:
addq $24, %r14
jmp .L53
.cfi_endproc
.LFE2360:
.section .gcc_except_table
.align 4
.LLSDAC2360:
.byte 0xff
.byte 0x3
.uleb128 .LLSDATTC2360-.LLSDATTDC2360
.LLSDATTDC2360:
.byte 0x1
.uleb128 .LLSDACSEC2360-.LLSDACSBC2360
.LLSDACSBC2360:
.uleb128 .LEHB6-.LCOLDB6
.uleb128 .LEHE6-.LEHB6
.uleb128 .L76-.LCOLDB6
.uleb128 0
.uleb128 .LEHB7-.LCOLDB6
.uleb128 .LEHE7-.LEHB7
.uleb128 0
.uleb128 0
.LLSDACSEC2360:
.byte 0x1
.byte 0
.align 4
.long 0
.LLSDATTC2360:
.section .text.unlikely
.text
.size _Z14init_dp_matrixmmRK5Model, .-_Z14init_dp_matrixmmRK5Model
.section .text.unlikely
.size _Z14init_dp_matrixmmRK5Model.cold, .-_Z14init_dp_matrixmmRK5Model.cold
.LCOLDE6:
.text
.LHOTE6:
.section .text._ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev,"axG",#progbits,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED5Ev,comdat
.align 2
.p2align 4
.weak _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.type _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, #function
_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev:
.LFB2637:
.cfi_startproc
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
movq %rdi, %r12
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
movq 8(%rdi), %rbx
movq (%rdi), %rbp
cmpq %rbp, %rbx
je .L109
.p2align 4,,10
.p2align 3
.L113:
movq 0(%rbp), %rdi
testq %rdi, %rdi
je .L110
addq $24, %rbp
call _ZdlPv
cmpq %rbp, %rbx
jne .L113
.L111:
movq (%r12), %rbp
.L109:
testq %rbp, %rbp
je .L115
popq %rbx
.cfi_remember_state
.cfi_def_cfa_offset 24
movq %rbp, %rdi
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
jmp _ZdlPv
.p2align 4,,10
.p2align 3
.L110:
.cfi_restore_state
addq $24, %rbp
cmpq %rbp, %rbx
jne .L113
jmp .L111
.p2align 4,,10
.p2align 3
.L115:
popq %rbx
.cfi_def_cfa_offset 24
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE2637:
.size _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev, .-_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.weak _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
.set _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev,_ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED2Ev
.section .text.unlikely
.LCOLDB7:
.section .text.startup,"ax",#progbits
.LHOTB7:
.p2align 4
.globl main
.type main, #function
main:
.LFB2371:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA2371
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movl $2, %edx
movl $10, %esi
subq $48, %rsp
.cfi_def_cfa_offset 64
leaq 16(%rsp), %rdi
leaq 8(%rsp), %rcx
movq $-8, 8(%rsp)
.LEHB8:
call _Z14init_dp_matrixmmRK5Model
.LEHE8:
leaq 16(%rsp), %rdi
.LEHB9:
call _Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.LEHE9:
leaq 16(%rsp), %rdi
call _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
addq $48, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 16
xorl %eax, %eax
popq %rbp
.cfi_def_cfa_offset 8
ret
.L119:
.cfi_restore_state
movq %rax, %rbp
jmp .L118
.section .gcc_except_table
.LLSDA2371:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSE2371-.LLSDACSB2371
.LLSDACSB2371:
.uleb128 .LEHB8-.LFB2371
.uleb128 .LEHE8-.LEHB8
.uleb128 0
.uleb128 0
.uleb128 .LEHB9-.LFB2371
.uleb128 .LEHE9-.LEHB9
.uleb128 .L119-.LFB2371
.uleb128 0
.LLSDACSE2371:
.section .text.startup
.cfi_endproc
.section .text.unlikely
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDAC2371
.type main.cold, #function
main.cold:
.LFSB2371:
.L118:
.cfi_def_cfa_offset 64
.cfi_offset 6, -16
leaq 16(%rsp), %rdi
vzeroupper
call _ZNSt6vectorIS_I4CellSaIS0_EESaIS2_EED1Ev
movq %rbp, %rdi
.LEHB10:
call _Unwind_Resume
.LEHE10:
.cfi_endproc
.LFE2371:
.section .gcc_except_table
.LLSDAC2371:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSEC2371-.LLSDACSBC2371
.LLSDACSBC2371:
.uleb128 .LEHB10-.LCOLDB7
.uleb128 .LEHE10-.LEHB10
.uleb128 0
.uleb128 0
.LLSDACSEC2371:
.section .text.unlikely
.section .text.startup
.size main, .-main
.section .text.unlikely
.size main.cold, .-main.cold
.LCOLDE7:
.section .text.startup
.LHOTE7:
.p2align 4
.type _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, #function
_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE:
.LFB3017:
.cfi_startproc
subq $8, %rsp
.cfi_def_cfa_offset 16
movl $_ZStL8__ioinit, %edi
call _ZNSt8ios_base4InitC1Ev
movl $__dso_handle, %edx
movl $_ZStL8__ioinit, %esi
movl $_ZNSt8ios_base4InitD1Ev, %edi
addq $8, %rsp
.cfi_def_cfa_offset 8
jmp __cxa_atexit
.cfi_endproc
.LFE3017:
.size _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE, .-_GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I__Z5printRKSt6vectorIS_I4CellSaIS0_EESaIS2_EE
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata.cst32,"aM",#progbits,32
.align 32
.LC1:
.long 1
.long 2
.long 3
.long 4
.long 5
.long 6
.long 7
.long 8
.align 32
.LC3:
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.long 8
.align 32
.LC4:
.quad 0
.quad 3
.quad 6
.quad 0
.align 32
.LC5:
.quad 0
.quad 1
.quad 2
.quad 5
.hidden __dso_handle
.ident "GCC: (Homebrew GCC 9.2.0) 9.2.0"
.section .note.GNU-stack,"",#progbits
The assembly for the non -march=native version is available on godbolt.
What is going wrong, is this a compiler bug or is my program ill formed? How can I mitigate this issue if it is a compiler bug?
Additional info
Compiling with -v:
$ ~/tools/octopus/build/brew/bin/g++-9 -O3 -march=native -S bug.cpp -v
Reading specs from /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/home/dcooke/tools/octopus/build/brew/bin/g++-9
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-S' '-v' '-shared-libgcc'
/home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /home/dcooke/tools/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /home/dcooke/tools/octopus/build/brew/nonexistent -idirafter /home/dcooke/tools/octopus/build/brew/include -idirafter /usr/include/x86_64-linux-gnu -idirafter /usr/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mpku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=33792 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o bug.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP
Compiling with -O2 or less makes the problem go away:
$ g++-9 -O2 -march=native -o bug bug.cpp
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
I tried building on a different machine with Intel chips:
$ rpm -q centos-release
centos-release-7-3.1611.el7.centos.x86_64
$ grep model /proc/cpuinfo | head -2
model : 85
model name : Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
$ g++-9 -O3 -march=native -o bug bug.cpp -v
Reading specs from /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/specs
COLLECT_GCC=/well/gerton/dan/apps/octopus/build/brew/bin/g++-9
COLLECT_LTO_WRAPPER=/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/lto-wrapper
Target: x86_64-pc-linux-gnu
Configured with: ../configure --with-bugurl=https://github.com/Linuxbrew/homebrew-core/issues --disable-multilib --prefix=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0 --libdir=/home/linuxbrew/.linuxbrew/Cellar/gcc#9/9.2.0/lib/gcc/9 --enable-languages=c,c++,objc,obj-c++,fortran --disable-nls --enable-checking=release --program-suffix=-9 --with-gmp=/home/linuxbrew/.linuxbrew/opt/gmp --with-mpfr=/home/linuxbrew/.linuxbrew/opt/mpfr --with-mpc=/home/linuxbrew/.linuxbrew/opt/libmpc --with-isl=/home/linuxbrew/.linuxbrew/opt/isl --with-pkgversion='Homebrew GCC 9.2.0'
Thread model: posix
gcc version 9.2.0 (Homebrew GCC 9.2.0)
COLLECT_GCC_OPTIONS='-O3' '-march=native' '-o' 'bug' '-v' '-shared-libgcc'
/gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../libexec/gcc/x86_64-pc-linux-gnu/9.2.0/cc1plus -quiet -v -imultiarch x86_64-linux-gnu -iprefix /gpfs1/well/gerton/dan/apps/octopus/build/brew/Cellar/gcc#9/9.2.0/bin/../lib/gcc/9/gcc/x86_64-pc-linux-gnu/9.2.0/ -D_GNU_SOURCE bug.cpp -isysroot /gpfs1/well/gerton/dan/apps/octopus/build/brew/nonexistent -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/include -idirafter /gpfs1/well/gerton/dan/apps/octopus/build/brew/opt/glibc/include -march=skylake-avx512 -mmmx -mno-3dnow -msse -msse2 -msse3 -mssse3 -mno-sse4a -mcx16 -msahf -mmovbe -maes -mno-sha -mpclmul -mpopcnt -mabm -mno-lwp -mfma -mno-fma4 -mno-xop -mbmi -mno-sgx -mbmi2 -mno-pconfig -mno-wbnoinvd -mno-tbm -mavx -mavx2 -msse4.2 -msse4.1 -mlzcnt -mrtm -mhle -mrdrnd -mf16c -mfsgsbase -mrdseed -mprfchw -madx -mfxsr -mxsave -mxsaveopt -mavx512f -mno-avx512er -mavx512cd -mno-avx512pf -mno-prefetchwt1 -mclflushopt -mxsavec -mxsaves -mavx512dq -mavx512bw -mavx512vl -mno-avx512ifma -mno-avx512vbmi -mno-avx5124fmaps -mno-avx5124vnniw -mclwb -mno-mwaitx -mno-clzero -mno-pku -mno-rdpid -mno-gfni -mno-shstk -mno-avx512vbmi2 -mno-avx512vnni -mno-vaes -mno-vpclmulqdq -mno-avx512bitalg -mno-movdiri -mno-movdir64b -mno-waitpkg -mno-cldemote -mno-ptwrite --param l1-cache-size=32 --param l1-cache-line-size=64 --param l2-cache-size=28160 -mtune=skylake-avx512 -quiet -dumpbase bug.cpp -auxbase bug -O3 -version -o /tmp/cczPrvHP.s
GNU C++14 (Homebrew GCC 9.2.0) version 9.2.0 (x86_64-pc-linux-gnu)
compiled by GNU C version 9.2.0, GMP version 6.1.2, MPFR version 4.0.2, MPC version 1.1.0, isl version isl-0.21-GMP
$ ./bug
{0 0} {-8 0}
{0 -8} {0 0}
{0 -9} {0 0}
{0 -10} {0 0}
{0 -11} {0 0}
{0 -12} {0 0}
{0 -13} {0 0}
{0 -14} {0 0}
{0 -15} {0 0}
{0 -16} {0 0}
The correct output...
-ftree-loop-vectorize is the culprit:
$ g++-9 -march=native -O2 -o bug bug.cpp -ftree-loop-vectorize
$ ./bug
{0 0} {-8 0}
{-2048 255} {0 0}
{-2304 255} {0 0}
{-2560 255} {0 0}
{-2816 255} {0 0}
{-3072 255} {0 0}
{-3328 255} {0 0}
{-3584 255} {0 0}
{-3840 255} {0 0}
{0 -16} {0 0}
None of the other O3 flags result in this behaviour.

This turned out to be due to a bug in binutils gas. The solution was to upgrade my binutils to 2.32.

Why clang does not optimize global const like a #define?

I have this test program, using a #define constant:
#include <stdio.h>
#define FOO 1
/* Prints the integer value of FOO followed by a newline, then exits with status 0. */
int main()
{
/* In this version FOO is the macro defined above, so the preprocessor
   replaces it with its value before the call is compiled. */
printf("%d\n", FOO);
return 0;
}
When compiled with “Apple LLVM version 10.0.0 (clang-1000.11.45.5)”, I get an executable of 8432 bytes. Here is the assembly listing:
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 10, 14
.globl _main ## -- Begin function main
.p2align 4, 0x90
_main: ## #main
.cfi_startproc
## %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
subq $16, %rsp
leaq L_.str(%rip), %rdi
movl $1, %esi
movl $0, -4(%rbp)
movb $0, %al
callq _printf
xorl %esi, %esi
movl %eax, -8(%rbp) ## 4-byte Spill
movl %esi, %eax
addq $16, %rsp
popq %rbp
retq
.cfi_endproc
## -- End function
.section __TEXT,__cstring,cstring_literals
L_.str: ## #.str
.asciz "%d\n"
.subsections_via_symbols
Now I replace #define FOO 1 with const int FOO = 1;. The executable is now 8464 bytes and the assembly listing looks like this:
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 10, 14
.globl _main ## -- Begin function main
.p2align 4, 0x90
_main: ## #main
.cfi_startproc
## %bb.0:
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset %rbp, -16
movq %rsp, %rbp
.cfi_def_cfa_register %rbp
subq $16, %rsp
leaq L_.str(%rip), %rdi
movl $1, %esi
movl $0, -4(%rbp)
movb $0, %al
callq _printf
xorl %esi, %esi
movl %eax, -8(%rbp) ## 4-byte Spill
movl %esi, %eax
addq $16, %rsp
popq %rbp
retq
.cfi_endproc
## -- End function
.section __TEXT,__const
.globl _FOO ## #FOO
.p2align 2
_FOO:
.long 1 ## 0x1
.section __TEXT,__cstring,cstring_literals
L_.str: ## #.str
.asciz "%d\n"
.subsections_via_symbols
So it actually declared a FOO variable, making the executable 32 bytes bigger.
I get the same result with -O3 optimization level.
Why is that? Normally, the compiler should be intelligent enough to optimize and add the constant to the symbol table instead of taking up storage for it.

This is another case where the difference between C and C++ matters.
In C, const int FOO has external linkage and must thus be included in the binary.
Compiling with g++ or clang++ instead gives you the desired optimization as FOO has internal linkage in C++.
You can achieve the optimization in C mode by explicitly requesting internal linkage for FOO via
static const int FOO = 1;
Both clang and gcc with link-time optimization enabled (-flto) also manage to strip away the unused symbol, even when linkage is external. (Live with and without LTO.)

The fact that you use the variable FOO in your second program means that it has to live somewhere, so the compiler needs to allocate it somewhere.
In the #define case, there is no variable - the pre-processor substituted the text "FOO" with the text "1", and so the call to printf() was passed a constant value, not a variable.

Slow std::string concatenation on windows

I have a program that needs to concatenate lots of strings together (to be more precise, integers converted to strings). On my Ubuntu machine (running g++ 7.3.0) the code runs in 1.5 seconds. But the code needs to be run on Windows as well (running g++ 6.3.0 using MinGW), where it takes 15 seconds to complete. Furthermore, the Ubuntu setup runs on a much slower laptop using an i7-4712MQ CPU @ 2.30GHz, whereas the Windows machine runs on an i7-7700K CPU @ 4.20GHz.
The code to reproduce the times is shown below. I compile the code with g++ tester.cpp -O2 -o tester (or tester.exe for windows)
#include <iostream>
#include <chrono>
// Benchmark: appends the decimal text of `a` followed by a space to `str`
// n times, then prints the elapsed time in seconds as measured by
// std::chrono::high_resolution_clock. argc and argv are not used.
int main(int argc, char const *argv[]) {
auto started = std::chrono::high_resolution_clock::now();
std::string str = "";
const int n = 10000000;
// `a` stays 1, so every iteration appends "1 " (2 characters);
// 2 * n therefore covers the final length of str.
str.reserve(2 * n);
int a = 1;
for (int i = 0; i < n; ++i) {
// Builds a temporary std::string (to_string(a) + " ") and appends it to str.
str += std::to_string(a) + " ";
}
auto done = std::chrono::high_resolution_clock::now();
// count() is in whole milliseconds; dividing by 1000 converts to seconds.
double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000;
std::cout << "Done in " << secs << "\n";
return 0;
}
Any idea where the large performance gap might come from?
The disassemblies look like this:
Ubuntu:
.file "tester.cpp"
.text
.align 2
.p2align 4,,15
.type _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, #function
_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19:
.LFB2389:
.cfi_startproc
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
movq %rsi, %r12
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
movq %rdx, %rbx
movq %rdi, %rbp
subq %rsi, %rbx
subq $16, %rsp
.cfi_def_cfa_offset 48
movq %fs:40, %rax
movq %rax, 8(%rsp)
xorl %eax, %eax
cmpq $15, %rbx
movq %rbx, (%rsp)
ja .L12
movq (%rdi), %rdx
cmpq $1, %rbx
movq %rdx, %rax
jne .L4
movzbl (%rsi), %eax
movb %al, (%rdx)
movq (%rdi), %rdx
.L5:
movq (%rsp), %rax
movq %rax, 8(%rbp)
movb $0, (%rdx,%rax)
movq 8(%rsp), %rax
xorq %fs:40, %rax
jne .L13
addq $16, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 32
popq %rbx
.cfi_def_cfa_offset 24
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
ret
.L12:
.cfi_restore_state
xorl %edx, %edx
movq %rsp, %rsi
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm#PLT
movq (%rsp), %rdx
movq %rax, 0(%rbp)
movq %rdx, 16(%rbp)
.L3:
movq %rbx, %rdx
movq %r12, %rsi
movq %rax, %rdi
call memcpy#PLT
movq 0(%rbp), %rdx
jmp .L5
.L4:
testq %rbx, %rbx
je .L5
jmp .L3
.L13:
call __stack_chk_fail#PLT
.cfi_endproc
.LFE2389:
.size _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, .-_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.set _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23,_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.section .text._ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,"axG",#progbits,_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,comdat
.p2align 4,,15
.weak _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.type _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, #function
_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z:
.LFB1953:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsi, %r10
movq %rdx, %rsi
movq %rcx, %rdx
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %rdi, %r12
subq $208, %rsp
testb %al, %al
movq %r8, -160(%rbp)
movq %r9, -152(%rbp)
je .L15
movaps %xmm0, -144(%rbp)
movaps %xmm1, -128(%rbp)
movaps %xmm2, -112(%rbp)
movaps %xmm3, -96(%rbp)
movaps %xmm4, -80(%rbp)
movaps %xmm5, -64(%rbp)
movaps %xmm6, -48(%rbp)
movaps %xmm7, -32(%rbp)
.L15:
movq %fs:40, %rax
movq %rax, -200(%rbp)
xorl %eax, %eax
leaq 30(%rsi), %rax
leaq -224(%rbp), %rcx
andq $-16, %rax
movl $32, -224(%rbp)
movl $48, -220(%rbp)
subq %rax, %rsp
leaq 16(%rbp), %rax
leaq 15(%rsp), %rbx
movq %rax, -216(%rbp)
leaq -192(%rbp), %rax
andq $-16, %rbx
movq %rbx, %rdi
movq %rax, -208(%rbp)
call *%r10
leaq 16(%r12), %rdx
movq %r12, %rdi
movq %rbx, %rsi
movq %rdx, (%r12)
movslq %eax, %rdx
addq %rbx, %rdx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23
movq -200(%rbp), %rdi
xorq %fs:40, %rdi
movq %r12, %rax
jne .L18
leaq -16(%rbp), %rsp
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L18:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
.LFE1953:
.size _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, .-_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string ""
.LC1:
.string "%d"
.LC2:
.string "basic_string::append"
.LC3:
.string " "
.LC5:
.string "Done in "
.LC6:
.string "\n"
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB1871:
.cfi_startproc
.cfi_personality 0x9b,DW.ref.__gxx_personality_v0
.cfi_lsda 0x1b,.LLSDA1871
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
subq $136, %rsp
.cfi_def_cfa_offset 192
leaq 16(%rsp), %r13
movq %fs:40, %rax
movq %rax, 120(%rsp)
xorl %eax, %eax
call _ZNSt6chrono3_V212system_clock3nowEv#PLT
leaq .LC0(%rip), %rdx
movq %rax, (%rsp)
leaq 16(%r13), %rax
movq %r13, %rdi
movq %rdx, %rsi
movq %rax, 16(%rsp)
.LEHB0:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.LEHE0:
movl $20000000, %esi
movq %r13, %rdi
.LEHB1:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEm#PLT
.LEHE1:
leaq 48(%rsp), %rbp
leaq 80(%rsp), %rax
movl $10000000, %ebx
movabsq $9223372036854775807, %r14
leaq 96(%rsp), %r12
movq %rax, 8(%rsp)
leaq 16(%rbp), %r15
jmp .L25
.p2align 4,,10
.p2align 3
.L21:
movq %rcx, 80(%rsp)
movq 16(%rax), %rcx
movq %rcx, 96(%rsp)
.L22:
movq 8(%rax), %rcx
movb $0, 16(%rax)
movq %r13, %rdi
movq %rcx, 88(%rsp)
movq %rdx, (%rax)
movq $0, 8(%rax)
movq 80(%rsp), %rsi
movq 88(%rsp), %rdx
.LEHB2:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm#PLT
.LEHE2:
movq 80(%rsp), %rdi
cmpq %r12, %rdi
je .L23
call _ZdlPv#PLT
.L23:
movq 48(%rsp), %rdi
cmpq %r15, %rdi
je .L24
call _ZdlPv#PLT
.L24:
subl $1, %ebx
je .L40
.L25:
movq vsnprintf#GOTPCREL(%rip), %rsi
leaq .LC1(%rip), %rcx
movl $1, %r8d
movl $16, %edx
movq %rbp, %rdi
xorl %eax, %eax
.LEHB3:
call _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.LEHE3:
cmpq %r14, 56(%rsp)
je .L41
leaq .LC3(%rip), %rsi
movl $1, %edx
movq %rbp, %rdi
.LEHB4:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm#PLT
.LEHE4:
movq %r12, 80(%rsp)
movq (%rax), %rcx
leaq 16(%rax), %rdx
cmpq %rdx, %rcx
jne .L21
movdqu 16(%rax), %xmm0
movaps %xmm0, 96(%rsp)
jmp .L22
.p2align 4,,10
.p2align 3
.L40:
call _ZNSt6chrono3_V212system_clock3nowEv#PLT
subq (%rsp), %rax
movabsq $4835703278458516699, %rdx
leaq .LC5(%rip), %rsi
pxor %xmm0, %xmm0
leaq _ZSt4cout(%rip), %rdi
movq %rax, %rcx
imulq %rdx
sarq $63, %rcx
sarq $18, %rdx
subq %rcx, %rdx
cvtsi2sdq %rdx, %xmm0
movl $8, %edx
divsd .LC4(%rip), %xmm0
movsd %xmm0, (%rsp)
.LEHB5:
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l#PLT
movsd (%rsp), %xmm0
leaq _ZSt4cout(%rip), %rdi
call _ZNSo9_M_insertIdEERSoT_#PLT
leaq .LC6(%rip), %rsi
movq %rax, %rdi
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#PLT
.LEHE5:
movq 16(%rsp), %rdi
addq $16, %r13
cmpq %r13, %rdi
je .L26
call _ZdlPv#PLT
.L26:
xorl %eax, %eax
movq 120(%rsp), %rbx
xorq %fs:40, %rbx
jne .L42
addq $136, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.L41:
.cfi_restore_state
leaq .LC2(%rip), %rdi
.LEHB6:
call _ZSt20__throw_length_errorPKc#PLT
.LEHE6:
.L35:
movq %rax, %rbx
.L29:
movq 48(%rsp), %rdi
addq $16, %rbp
cmpq %rbp, %rdi
je .L31
call _ZdlPv#PLT
.L31:
movq 16(%rsp), %rdi
addq $16, %r13
cmpq %r13, %rdi
je .L32
call _ZdlPv#PLT
.L32:
movq %rbx, %rdi
.LEHB7:
call _Unwind_Resume#PLT
.LEHE7:
.L34:
movq %rax, %rbx
jmp .L31
.L36:
movq 8(%rsp), %rdx
movq 80(%rsp), %rdi
movq %rax, %rbx
addq $16, %rdx
cmpq %rdx, %rdi
je .L29
call _ZdlPv#PLT
jmp .L29
.L42:
call __stack_chk_fail#PLT
.cfi_endproc
.LFE1871:
.globl __gxx_personality_v0
.section .gcc_except_table,"a",#progbits
.LLSDA1871:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSE1871-.LLSDACSB1871
.LLSDACSB1871:
.uleb128 .LEHB0-.LFB1871
.uleb128 .LEHE0-.LEHB0
.uleb128 0
.uleb128 0
.uleb128 .LEHB1-.LFB1871
.uleb128 .LEHE1-.LEHB1
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB2-.LFB1871
.uleb128 .LEHE2-.LEHB2
.uleb128 .L36-.LFB1871
.uleb128 0
.uleb128 .LEHB3-.LFB1871
.uleb128 .LEHE3-.LEHB3
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB4-.LFB1871
.uleb128 .LEHE4-.LEHB4
.uleb128 .L35-.LFB1871
.uleb128 0
.uleb128 .LEHB5-.LFB1871
.uleb128 .LEHE5-.LEHB5
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB6-.LFB1871
.uleb128 .LEHE6-.LEHB6
.uleb128 .L35-.LFB1871
.uleb128 0
.uleb128 .LEHB7-.LFB1871
.uleb128 .LEHE7-.LEHB7
.uleb128 0
.uleb128 0
.LLSDACSE1871:
.section .text.startup
.size main, .-main
.p2align 4,,15
.type _GLOBAL__sub_I_main, #function
_GLOBAL__sub_I_main:
.LFB2369:
.cfi_startproc
leaq _ZStL8__ioinit(%rip), %rdi
subq $8, %rsp
.cfi_def_cfa_offset 16
call _ZNSt8ios_base4InitC1Ev#PLT
movq _ZNSt8ios_base4InitD1Ev#GOTPCREL(%rip), %rdi
leaq __dso_handle(%rip), %rdx
leaq _ZStL8__ioinit(%rip), %rsi
addq $8, %rsp
.cfi_def_cfa_offset 8
jmp __cxa_atexit#PLT
.cfi_endproc
.LFE2369:
.size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I_main
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC4:
.long 0
.long 1083129856
.hidden DW.ref.__gxx_personality_v0
.weak DW.ref.__gxx_personality_v0
.section .data.DW.ref.__gxx_personality_v0,"awG",#progbits,DW.ref.__gxx_personality_v0,comdat
.align 8
.type DW.ref.__gxx_personality_v0, #object
.size DW.ref.__gxx_personality_v0, 8
DW.ref.__gxx_personality_v0:
.quad __gxx_personality_v0
.hidden __dso_handle
.ident "GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0"
.section .note.GNU-stack,"",#progbits
Windows:
.file "tester.cpp"
.text
.p2align 4,,15
.def ___tcf_0; .scl 3; .type 32; .endef
___tcf_0:
LFB2556:
.cfi_startproc
movl $__ZStL8__ioinit, %ecx
jmp __ZNSt8ios_base4InitD1Ev
.cfi_endproc
LFE2556:
.section .rdata,"dr"
.align 4
LC0:
.ascii "basic_string::_M_construct null not valid\0"
.text
.align 2
.p2align 4,,15
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29; .scl 3; .type 32; .endef
__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29:
LFB2587:
.cfi_startproc
pushl %edi
.cfi_def_cfa_offset 8
.cfi_offset 7, -8
pushl %esi
.cfi_def_cfa_offset 12
.cfi_offset 6, -12
movl %ecx, %esi
pushl %ebx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
subl $32, %esp
.cfi_def_cfa_offset 48
movl 48(%esp), %edi
movl 52(%esp), %ebx
testl %edi, %edi
jne L5
testl %ebx, %ebx
je L5
movl $LC0, (%esp)
call __ZSt19__throw_logic_errorPKc
.p2align 4,,10
L5:
subl %edi, %ebx
cmpl $15, %ebx
movl %ebx, 28(%esp)
ja L22
movl (%esi), %edx
cmpl $1, %ebx
movl %edx, %eax
je L23
testl %ebx, %ebx
jne L6
L8:
movl 28(%esp), %eax
movl %eax, 4(%esi)
movb $0, (%edx,%eax)
addl $32, %esp
.cfi_remember_state
.cfi_def_cfa_offset 16
popl %ebx
.cfi_restore 3
.cfi_def_cfa_offset 12
popl %esi
.cfi_restore 6
.cfi_def_cfa_offset 8
popl %edi
.cfi_restore 7
.cfi_def_cfa_offset 4
ret $8
.p2align 4,,10
L22:
.cfi_restore_state
leal 28(%esp), %eax
movl $0, 4(%esp)
movl %esi, %ecx
movl %eax, (%esp)
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj
.cfi_def_cfa_offset 40
subl $8, %esp
.cfi_def_cfa_offset 48
movl %eax, (%esi)
movl 28(%esp), %edx
movl %edx, 8(%esi)
L6:
movl %ebx, 8(%esp)
movl %edi, 4(%esp)
movl %eax, (%esp)
call _memcpy
movl (%esi), %edx
jmp L8
.p2align 4,,10
L23:
movzbl (%edi), %eax
movb %al, (%edx)
movl (%esi), %edx
jmp L8
.cfi_endproc
LFE2587:
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21; .scl 3; .type 32; .endef
.set __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21,__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29
.section .text$_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z,"x"
.linkonce discard
.p2align 4,,15
.globl __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z
.def __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z; .scl 2; .type 32; .endef
__ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z:
LFB2177:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
pushl %esi
pushl %ebx
subl $16, %esp
.cfi_offset 6, -12
.cfi_offset 3, -16
movl 16(%ebp), %edx
movl 8(%ebp), %esi
leal 30(%edx), %eax
andl $-16, %eax
call ___chkstk_ms
subl %eax, %esp
leal 24(%ebp), %eax
leal 31(%esp), %ebx
movl %edx, 4(%esp)
movl %eax, 12(%esp)
movl 20(%ebp), %eax
andl $-16, %ebx
movl %ebx, (%esp)
movl %eax, 8(%esp)
call *12(%ebp)
leal 8(%esi), %edx
addl %ebx, %eax
movl %esi, %ecx
movl %edx, (%esi)
movl %eax, 4(%esp)
movl %ebx, (%esp)
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29
subl $8, %esp
leal -8(%ebp), %esp
movl %esi, %eax
popl %ebx
.cfi_restore 3
popl %esi
.cfi_restore 6
popl %ebp
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE2177:
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC1:
.ascii "\0"
LC2:
.ascii "%d\0"
LC3:
.ascii "basic_string::append\0"
LC4:
.ascii " \0"
.def ___divdi3; .scl 2; .type 32; .endef
LC6:
.ascii "Done in \0"
LC7:
.ascii "\12\0"
.section .text.startup,"x"
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
LFB2111:
.cfi_startproc
.cfi_personality 0,___gxx_personality_v0
.cfi_lsda 0,LLSDA2111
leal 4(%esp), %ecx
.cfi_def_cfa 1, 0
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
.cfi_escape 0x10,0x5,0x2,0x75,0
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
pushl %ecx
.cfi_escape 0xf,0x3,0x75,0x70,0x6
.cfi_escape 0x10,0x7,0x2,0x75,0x7c
.cfi_escape 0x10,0x6,0x2,0x75,0x78
.cfi_escape 0x10,0x3,0x2,0x75,0x74
subl $152, %esp
call ___main
call __ZNSt6chrono3_V212system_clock3nowEv
leal -96(%ebp), %ecx
movl %eax, -136(%ebp)
leal -88(%ebp), %eax
movl $LC1, 4(%esp)
movl $LC1, (%esp)
movl %edx, -132(%ebp)
movl %eax, -96(%ebp)
LEHB0:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21
LEHE0:
leal -96(%ebp), %ecx
subl $8, %esp
movl $20000000, (%esp)
LEHB1:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj
LEHE1:
subl $4, %esp
movl $10000000, %edi
leal -72(%ebp), %esi
leal -40(%ebp), %ebx
jmp L32
.p2align 4,,10
L28:
movl %ecx, -48(%ebp)
movl 8(%eax), %ecx
movl %ecx, -40(%ebp)
L29:
movl 4(%eax), %ecx
movb $0, 8(%eax)
movl %ecx, -44(%ebp)
movl %edx, (%eax)
leal -96(%ebp), %ecx
movl $0, 4(%eax)
movl -44(%ebp), %eax
movl %eax, 4(%esp)
movl -48(%ebp), %eax
movl %eax, (%esp)
LEHB2:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj
LEHE2:
movl -48(%ebp), %eax
subl $8, %esp
cmpl %ebx, %eax
je L30
movl %eax, (%esp)
call __ZdlPv
L30:
movl -72(%ebp), %eax
leal -64(%ebp), %edx
cmpl %edx, %eax
je L31
movl %eax, (%esp)
call __ZdlPv
L31:
subl $1, %edi
je L46
L32:
movl $1, 16(%esp)
movl $LC2, 12(%esp)
movl $16, 8(%esp)
movl $_vsnprintf, 4(%esp)
movl %esi, (%esp)
LEHB3:
call __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z
LEHE3:
cmpl $2147483647, -68(%ebp)
je L47
movl $1, 4(%esp)
movl $LC4, (%esp)
movl %esi, %ecx
LEHB4:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj
LEHE4:
movl %ebx, -48(%ebp)
movl (%eax), %ecx
leal 8(%eax), %edx
subl $8, %esp
cmpl %edx, %ecx
jne L28
movl 12(%eax), %ecx
movl %ecx, -120(%ebp)
movl 16(%eax), %ecx
movl %ecx, -124(%ebp)
movl 20(%eax), %ecx
movl %ecx, -128(%ebp)
movl 8(%eax), %ecx
movl %ecx, -40(%ebp)
movl -120(%ebp), %ecx
movl %ecx, -36(%ebp)
movl -124(%ebp), %ecx
movl %ecx, -32(%ebp)
movl -128(%ebp), %ecx
movl %ecx, -28(%ebp)
jmp L29
.p2align 4,,10
L46:
call __ZNSt6chrono3_V212system_clock3nowEv
subl -136(%ebp), %eax
movl $1000000, 8(%esp)
sbbl -132(%ebp), %edx
movl $0, 12(%esp)
movl %eax, (%esp)
movl %edx, 4(%esp)
call ___divdi3
movl %eax, -120(%ebp)
movl %edx, -116(%ebp)
fildq -120(%ebp)
movl $8, 8(%esp)
movl $LC6, 4(%esp)
movl $__ZSt4cout, (%esp)
fdivs LC5
fstpl -120(%ebp)
LEHB5:
call __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
fldl -120(%ebp)
movl $__ZSt4cout, %ecx
fstpl (%esp)
call __ZNSo9_M_insertIdEERSoT_
subl $8, %esp
movl $LC7, 4(%esp)
movl %eax, (%esp)
call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
LEHE5:
movl -96(%ebp), %eax
leal -88(%ebp), %edi
cmpl %edi, %eax
je L43
movl %eax, (%esp)
call __ZdlPv
L43:
leal -16(%ebp), %esp
xorl %eax, %eax
popl %ecx
.cfi_remember_state
.cfi_restore 1
.cfi_def_cfa 1, 0
popl %ebx
.cfi_restore 3
popl %esi
.cfi_restore 6
popl %edi
.cfi_restore 7
popl %ebp
.cfi_restore 5
leal -4(%ecx), %esp
.cfi_def_cfa 4, 4
ret
L47:
.cfi_restore_state
movl $LC3, (%esp)
LEHB6:
call __ZSt20__throw_length_errorPKc
LEHE6:
L41:
movl %eax, %ebx
L36:
movl -72(%ebp), %eax
leal -64(%ebp), %edx
cmpl %edx, %eax
je L38
movl %eax, (%esp)
call __ZdlPv
L38:
movl -96(%ebp), %eax
leal -88(%ebp), %edi
cmpl %edi, %eax
je L39
movl %eax, (%esp)
call __ZdlPv
L39:
movl %ebx, (%esp)
LEHB7:
call __Unwind_Resume
LEHE7:
L42:
movl %eax, %esi
movl -48(%ebp), %eax
cmpl %ebx, %eax
je L35
movl %eax, (%esp)
call __ZdlPv
L35:
movl %esi, %ebx
jmp L36
L40:
movl %eax, %ebx
jmp L38
.cfi_endproc
LFE2111:
.def ___gxx_personality_v0; .scl 2; .type 32; .endef
.section .gcc_except_table,"w"
LLSDA2111:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 LLSDACSE2111-LLSDACSB2111
LLSDACSB2111:
.uleb128 LEHB0-LFB2111
.uleb128 LEHE0-LEHB0
.uleb128 0
.uleb128 0
.uleb128 LEHB1-LFB2111
.uleb128 LEHE1-LEHB1
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB2-LFB2111
.uleb128 LEHE2-LEHB2
.uleb128 L42-LFB2111
.uleb128 0
.uleb128 LEHB3-LFB2111
.uleb128 LEHE3-LEHB3
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB4-LFB2111
.uleb128 LEHE4-LEHB4
.uleb128 L41-LFB2111
.uleb128 0
.uleb128 LEHB5-LFB2111
.uleb128 LEHE5-LEHB5
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB6-LFB2111
.uleb128 LEHE6-LEHB6
.uleb128 L41-LFB2111
.uleb128 0
.uleb128 LEHB7-LFB2111
.uleb128 LEHE7-LEHB7
.uleb128 0
.uleb128 0
LLSDACSE2111:
.section .text.startup,"x"
.p2align 4,,15
.def __GLOBAL__sub_I_main; .scl 3; .type 32; .endef
__GLOBAL__sub_I_main:
LFB2557:
.cfi_startproc
subl $28, %esp
.cfi_def_cfa_offset 32
movl $__ZStL8__ioinit, %ecx
call __ZNSt8ios_base4InitC1Ev
movl $___tcf_0, (%esp)
call _atexit
addl $28, %esp
.cfi_def_cfa_offset 4
ret
.cfi_endproc
LFE2557:
.section .ctors,"w"
.align 4
.long __GLOBAL__sub_I_main
.lcomm __ZStL8__ioinit,1,1
.section .rdata,"dr"
.align 4
LC5:
.long 1148846080
.ident "GCC: (MinGW.org GCC-6.3.0-1) 6.3.0"
.def __ZNSt8ios_base4InitD1Ev; .scl 2; .type 32; .endef
.def __ZSt19__throw_logic_errorPKc; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj; .scl 2; .type 32; .endef
.def _memcpy; .scl 2; .type 32; .endef
.def __ZNSt6chrono3_V212system_clock3nowEv; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj; .scl 2; .type 32; .endef
.def __ZdlPv; .scl 2; .type 32; .endef
.def _vsnprintf; .scl 2; .type 32; .endef
.def __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i; .scl 2; .type 32; .endef
.def __ZNSo9_M_insertIdEERSoT_; .scl 2; .type 32; .endef
.def __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc; .scl 2; .type 32; .endef
.def __ZSt20__throw_length_errorPKc; .scl 2; .type 32; .endef
.def __Unwind_Resume; .scl 2; .type 32; .endef
.def __ZNSt8ios_base4InitC1Ev; .scl 2; .type 32; .endef
.def _atexit; .scl 2; .type 32; .endef

A quick look at the disassembly shows that the Windows version uses movl (i.e. long word, 32-bit move), while the Linux version uses movq (quad word, 64-bit) and the SSE xmm registers.
My bet is that on Linux, you compile for x86-64, while on Windows you target 32 bit x86.
x86-64 includes SSE2 extension, while x86 does not, so MinGW defaults to no-SSE mode.
If that's the case, building with 64 bit toolchain on Windows should result in comparable performance. Alternatively, you might enable SSE for 32 bit builds (-msse2 compiler flag, if I remember correctly).

The mingw.org implementation just seems to be much less efficient than Linux, Visual Studio or mingw-w64.org.
>g++ --version
g++ (MinGW.org GCC-6.3.0-1) 6.3.0
Done in 24.808
>g++ --version
g++ (i686-posix-dwarf-rev2, Built by MinGW-W64 project) 6.3.0
Done in 0.679

Tested with MSYS2 MinGW64:
g++ --version
g++.exe (Rev2, Built by MSYS2 project) 7.3.0
g++.exe -Wall -O3 -mtune=native -fno-exceptions -fno-rtti -c main.cpp -o main.o
g++.exe -o test.exe main.o -s
Done in 0.547
Env: Windows 10 x64
CPU: Intel Core i5-6300U, 2.4GH
RAM: 16GB DDR4
In any case, MinGW uses msvcrt.dll (the one bundled with Windows, not the Universal CRT, the Visual Studio CRT, etc.) instead of GNU libc, so in my experience the speed gap may come from the C standard library.
P.S. with some changes (same compiler flags)
#include <iostream>
#include <chrono>
#ifdef _WIN32
#include <windows.h>
static std::size_t page_size() noexcept {
::SYSTEM_INFO si;
::GetSystemInfo(&si);
return si.dwPageSize;
}
#else
#include <sys/types.h>
#include <unistd.h>
// Non-Windows variant: returns the value of sysconf(_SC_PAGESIZE) converted to std::size_t.
static std::size_t page_size() noexcept {
    const long reported = ::sysconf(_SC_PAGESIZE);
    return static_cast<std::size_t>(reported);
}
#endif // _WIN32
// Benchmark variant: reserves a page-aligned buffer, converts the number to
// text once before the loop, then appends that text plus a space n times and
// prints the elapsed time in seconds. argc and argv are not used.
int main(int argc, char const *argv[]) {
auto started = std::chrono::high_resolution_clock::now();
const std::size_t n = 10000000;
// align size to page boundary
const std::size_t al = page_size() - 1;
// Rounds 2 * n up to a multiple of the page size; the mask arithmetic
// assumes page_size() is a power of two.
const std::size_t buff_size = ( (n << 1) + al) & ~al;
std::string str;
str.reserve(buff_size);
// Converted once here so the loop body performs no integer-to-string conversion.
const std::string to_append( std::to_string(1) );
for (std::size_t i = 0; i < n; ++i) {
str.append( to_append );
str.push_back(' ');
}
auto done = std::chrono::high_resolution_clock::now();
// count() is in whole milliseconds; dividing by 1000 converts to seconds.
double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000;
std::cout << "Done in " << secs << "\n";
return 0;
}
Done in 0.046
Asm output for main function:
main:
pushq %r14
.seh_pushreg %r14
pushq %r13
.seh_pushreg %r13
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
subq $144, %rsp
.seh_stackalloc 144
.seh_endprologue
movl $10000000, %esi
call __main
leaq 96(%rsp), %r13
leaq 64(%rsp), %rbp
call _ZNSt6chrono3_V212system_clock3nowEv
movq %r13, %rcx
leaq 16(%rbp), %r12
movq %rax, %r14
call *__imp_GetSystemInfo(%rip)
movl 100(%rsp), %eax
movq %rbp, %rcx
movq %r12, 64(%rsp)
movq $0, 72(%rsp)
leaq 19999999(%rax), %rdx
negq %rax
movb $0, 80(%rsp)
andq %rax, %rdx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEy
movl $1, 32(%rsp)
movq %r13, %rcx
leaq .LC0(%rip), %r9
movl $16, %r8d
leaq _ZL9vsnprintfPcyPKcS_(%rip), %rdx
call _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_yPKS8_PcEySB_z
jmp .L14
.p2align 4,,10
.L16:
movb $32, (%rdx,%rbx)
.L26:
movq 64(%rsp), %rax
movq %rdi, 72(%rsp)
movb $0, 1(%rax,%rbx)
subq $1, %rsi
je .L27
.L14:
movq 96(%rsp), %rdx
movq 104(%rsp), %r8
movq %rbp, %rcx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcy
movq 72(%rsp), %rbx
movq 64(%rsp), %rdx
movl $15, %eax
leaq 1(%rbx), %rdi
cmpq %r12, %rdx
je .L15
movq 80(%rsp), %rax
.L15:
cmpq %rax, %rdi
jbe .L16
xorl %r9d, %r9d
xorl %r8d, %r8d
movq %rbx, %rdx
movq %rbp, %rcx
movq $1, 32(%rsp)
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_mutateEyyPKcy
movq 64(%rsp), %rax
movb $32, (%rax,%rbx)
jmp .L26
.p2align 4,,10
.L27:
call _ZNSt6chrono3_V212system_clock3nowEv
pxor %xmm1, %xmm1
movl $8, %r8d
movabsq $4835703278458516699, %rdx
subq %r14, %rax
addq $16, %r13
movq %rax, %rcx
imulq %rdx
sarq $63, %rcx
sarq $18, %rdx
subq %rcx, %rdx
movq .refptr._ZSt4cout(%rip), %rcx
cvtsi2sdq %rdx, %xmm1
leaq .LC2(%rip), %rdx
divsd .LC1(%rip), %xmm1
movsd %xmm1, 56(%rsp)
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_x
movsd 56(%rsp), %xmm1
movq .refptr._ZSt4cout(%rip), %rcx
call _ZNSo9_M_insertIdEERSoT_
leaq .LC3(%rip), %rdx
movq %rax, %rcx
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
movq 96(%rsp), %rcx
cmpq %r13, %rcx
je .L19
call _ZdlPv
.L19:
movq 64(%rsp), %rcx
addq $16, %rbp
cmpq %rbp, %rcx
je .L20
call _ZdlPv
.L20:
xorl %eax, %eax
addq $144, %rsp
popq %rbx
popq %rsi
popq %rdi
popq %rbp
popq %r12
popq %r13
popq %r14
ret
.seh_endproc
.p2align 4,,15
.def _GLOBAL__sub_I_main; .scl 3; .type 32; .endef
.seh_proc _GLOBAL__sub_I_main

(Just for the proportions) Windows Release target vs. Debug target in Visual Studio C++: by default, the Debug target's compile line has no optimization, while the Release target's compile line uses /O2 optimization, /Oi ("Enable Intrinsic Functions"), and /GL ("Whole Program Optimization"). Your code, on my workstation, Debug x64 vs. Release x64:
Debug: 70 sec.
Release: 0.27 sec.
You build with MinGW (which I am not familiar with). But from a quick search, there is talk of Debug/Release modes, and MinGW seems to have equivalents of the /O2 optimization, /Oi ("Enable Intrinsic Functions"), and /Og ("Enable Global Optimization") flags.
-
Compile with these 3 flags (x64 target), & compare with the VS Release x64 benchmark. Anyway, this is MS default compile optimization for a Release target.
-
Test Environment:
HP 8100, Windows 10 Pro 64 bit, CPU i7 870, 16 GB DDR3 RAM, Visual Studio 2017, Targets: Debug x64 / Release x64

I tried your code on my Windows machine with MinGW 4.8.0 and got ~20 seconds. When I changed the string concatenation to std::stringstream I got 0.5 seconds:
...
// Streams every value into one std::stringstream, then copies the
// accumulated text into str once after the loop.
std::stringstream ss;
for (int i = 0; i < n; ++i) {
//str += std::to_string(a) + " ";
// Stream insertion takes the place of the concatenation commented out above.
ss << a << " ";
}
str = ss.str();
...

Why does a simple use of ostringstream generate so much assembly code?

Consider the following simple example that formats a string and an integer using ostringstream and discards the output:
#include <sstream>
// Minimal reproducer for the code-size question: formats a string literal
// and an int into a std::ostringstream and discards the result.
void ostringstream_test() {
std::ostringstream ss;
ss << "x = " << 42;
// The std::string returned by str() is intentionally unused; the example
// is only about how much code the compiler generates for these calls.
ss.str();
}
Compiling it with clang++ -S -O3 -DNDEBUG -std=c++14 test.cc generates a ton of assembly code (half a kilobyte of x86-64 instructions compared to less than a hundred bytes for similar sprintf code) - see the output below. Why does it generate so much code? Is it inherent to the ostringstream API, or is this particular compiler/library doing something wrong?
## void ostringstream_test(), as emitted by clang++ -O3 for x86-64 Mach-O
## against libc++ (the std::__1 names in the call targets).
## The ostringstream constructor and destructor are expanded inline here; the
## two insertions and str() are out-of-line calls. Everything from LBB0_13 on
## is exception-cleanup code: it is entered only at the Ltmp2/Ltmp5/Ltmp8/
## Ltmp15 landing pads named in the call-site table at Lexception0.
##
## Locations, as used below:
##   -368(%rbp)         the stream object; passed as `this` to the ostream calls
##   %r12 = -360(%rbp)  passed as `this` to the streambuf/stringbuf calls
##   %r14 = -256(%rbp)  passed as `this` to ios_base::init and ~basic_ios
##   %r13, %rbx, %r15   vtable-derived pointers, re-stored before destruction
##
## NOTE: the `@GOTPCREL` relocation specifiers must be spelled with `@`;
## `#` starts a comment in this assembler syntax.
.globl __Z18ostringstream_testv
.p2align 4, 0x90
__Z18ostringstream_testv: ## @_Z18ostringstream_testv
Lfunc_begin0:
.cfi_startproc
.cfi_personality 155, ___gxx_personality_v0
.cfi_lsda 16, Lexception0
## BB#0:
## Prologue: set up the frame, save callee-saved registers, reserve locals.
pushq %rbp
Lcfi0:
.cfi_def_cfa_offset 16
Lcfi1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi2:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $328, %rsp ## imm = 0x148
Lcfi3:
.cfi_offset %rbx, -56
Lcfi4:
.cfi_offset %r12, -48
Lcfi5:
.cfi_offset %r13, -40
Lcfi6:
.cfi_offset %r14, -32
Lcfi7:
.cfi_offset %r15, -24
## Inlined constructor, step 1: store pointers taken from the construction
## vtable (_ZTC...) into the object, then call std::ios_base::init(void*)
## with %r14 as `this` and %r12 as its argument.
leaq -256(%rbp), %r14
leaq -360(%rbp), %r12
movq __ZTCNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE0_NS_13basic_ostreamIcS2_EE@GOTPCREL(%rip), %rax
leaq 24(%rax), %rcx
movq %rcx, -368(%rbp)
addq $64, %rax
movq %rax, -256(%rbp)
Ltmp0:
movq %r14, %rdi
movq %r12, %rsi
callq __ZNSt3__18ios_base4initEPv
Ltmp1:
## BB#1:
## Step 2: initialise two fields (NOTE(review): presumably basic_ios' tie
## pointer and fill character -- confirm against libc++), switch to the final
## basic_ostringstream vtable pointers (kept in %r13/%rbx for later), and run
## the basic_streambuf base constructor on %r12.
movq $0, -120(%rbp)
movl $-1, -112(%rbp)
movq __ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %rbx
leaq 24(%rbx), %r13
movq %r13, -368(%rbp)
addq $64, %rbx
movq %rbx, -256(%rbp)
Ltmp3:
movq %r12, %rdi
callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEEC2Ev
Ltmp4:
## BB#2:
## Step 3: install the basic_stringbuf vtable pointer (kept in %r15), zero
## four quadwords of the buffer object, store the constant 16 (NOTE(review):
## presumably the ios_base::out open mode -- confirm), then zero a 24-byte
## temporary at -80(%rbp) and pass it to basic_stringbuf::str(const string&).
movq __ZTVNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %r15
addq $16, %r15
movq %r15, -360(%rbp)
movq $0, -272(%rbp)
movq $0, -280(%rbp)
movq $0, -288(%rbp)
movq $0, -296(%rbp)
movl $16, -264(%rbp)
xorps %xmm0, %xmm0
movaps %xmm0, -80(%rbp)
movq $0, -64(%rbp)
Ltmp6:
leaq -80(%rbp), %rsi
movq %r12, %rdi
callq __ZNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strERKNS_12basic_stringIcS2_S4_EE
Ltmp7:
## BB#3:
## Destroy the temporary: if bit 0 of its first byte is set, pass the pointer
## stored at -64(%rbp) to operator delete.
## NOTE(review): presumably libc++'s "long string" flag -- confirm.
testb $1, -80(%rbp)
je LBB0_5
## BB#4:
movq -64(%rbp), %rdi
callq __ZdlPv
LBB0_5:
## ss << "x = ": __put_character_sequence(stream, L_.str, 4).
## L_.str is defined outside this listing; presumably the "x = " literal.
Ltmp9:
leaq L_.str(%rip), %rsi
leaq -368(%rbp), %rdi
movl $4, %edx
callq __ZNSt3__124__put_character_sequenceIcNS_11char_traitsIcEEEERNS_13basic_ostreamIT_T0_EES7_PKS4_m
Ltmp10:
## BB#6:
## ... << 42: basic_ostream::operator<<(int) on the stream returned in %rax.
Ltmp11:
movl $42, %esi
movq %rax, %rdi
callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEElsEi
Ltmp12:
## BB#7:
## ss.str(): basic_stringbuf::str() const, with -104(%rbp) passed in %rdi as
## the location for the returned string and %r12 as `this`.
Ltmp13:
leaq -104(%rbp), %rdi
movq %r12, %rsi
callq __ZNKSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strEv
Ltmp14:
## BB#8:
## Discard the returned string (same flag test / operator delete pattern).
testb $1, -104(%rbp)
je LBB0_10
## BB#9:
movq -88(%rbp), %rdi
callq __ZdlPv
LBB0_10:
## Inlined destructor, normal path: re-store the vtable pointers, release the
## buffer's string storage if the flag at -296(%rbp) is set, then run the
## base destructors: ~basic_streambuf, ~basic_ostream (VTT+8 in %rsi) and
## ~basic_ios.
movq %r13, -368(%rbp)
movq %rbx, -256(%rbp)
movq %r15, -360(%rbp)
testb $1, -296(%rbp)
je LBB0_12
## BB#11:
movq -280(%rbp), %rdi
callq __ZdlPv
LBB0_12:
movq %r12, %rdi
callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev
movq __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %rsi
addq $8, %rsi
leaq -368(%rbp), %rdi
callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev
movq %r14, %rdi
callq __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev
## Epilogue: release locals, restore callee-saved registers, return.
addq $328, %rsp ## imm = 0x148
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
## ---- Exception landing pads -------------------------------------------
## Each pad spills %rax to -48(%rbp) (reloaded as the _Unwind_Resume argument
## at the end) and joins the shared cleanup chain LBB0_18..LBB0_22 at the
## point matching how much of the object had been set up.
LBB0_13:
## Pad for the str(const string&) call (Ltmp6..Ltmp7): free the temporary
## string if needed, then fall into the buffer-string / streambuf cleanup.
Ltmp8:
movq %rax, -48(%rbp) ## 8-byte Spill
testb $1, -80(%rbp)
je LBB0_18
## BB#14:
movq -64(%rbp), %rdi
callq __ZdlPv
testb $1, -296(%rbp)
jne LBB0_19
jmp LBB0_20
LBB0_16:
## Pad for the basic_streambuf constructor call (Ltmp3..Ltmp4): skips the
## streambuf cleanup and runs ~basic_ostream and ~basic_ios.
Ltmp5:
movq %rax, -48(%rbp) ## 8-byte Spill
jmp LBB0_21
LBB0_15:
## Pad for the ios_base::init call (Ltmp0..Ltmp1): runs ~basic_ios only.
Ltmp2:
movq %rax, -48(%rbp) ## 8-byte Spill
jmp LBB0_22
LBB0_17:
## Pad for the insertions and str() (Ltmp9..Ltmp14): re-store the vtable
## pointers, then run the full cleanup chain.
Ltmp15:
movq %rax, -48(%rbp) ## 8-byte Spill
movq %r13, -368(%rbp)
movq %rbx, -256(%rbp)
movq %r15, -360(%rbp)
LBB0_18:
testb $1, -296(%rbp)
je LBB0_20
LBB0_19:
movq -280(%rbp), %rdi
callq __ZdlPv
LBB0_20:
movq %r12, %rdi
callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev
LBB0_21:
movq __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE@GOTPCREL(%rip), %rsi
addq $8, %rsi
leaq -368(%rbp), %rdi
callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev
LBB0_22:
movq %r14, %rdi
callq __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev
## Hand the saved value back to the unwinder to continue propagation.
movq -48(%rbp), %rdi ## 8-byte Reload
callq __Unwind_Resume
Lfunc_end0:
.cfi_endproc
## Exception table for ostringstream_test, referenced from the function via
## `.cfi_lsda 16, Lexception0`.
## After the header, each call-site record is three udata4 values (region
## start relative to Lfunc_begin0, region length, landing-pad offset relative
## to Lfunc_begin0) followed by one action byte: 13 bytes per record, and
## 5 records give the table length of 65 stated below.
.section __TEXT,__gcc_except_tab
.p2align 2
GCC_except_table0:
Lexception0:
.byte 255 ## @LPStart Encoding = omit
.byte 155 ## @TType Encoding = indirect pcrel sdata4
.asciz "\303\200" ## @TType base offset
.byte 3 ## Call site Encoding = udata4
.byte 65 ## Call site table length
## Site 1: the ios_base::init call; cleanup lands at Ltmp2.
Lset0 = Ltmp0-Lfunc_begin0 ## >> Call Site 1 <<
.long Lset0
Lset1 = Ltmp1-Ltmp0 ## Call between Ltmp0 and Ltmp1
.long Lset1
Lset2 = Ltmp2-Lfunc_begin0 ## jumps to Ltmp2
.long Lset2
.byte 0 ## On action: cleanup
## Site 2: the basic_streambuf constructor call; cleanup lands at Ltmp5.
Lset3 = Ltmp3-Lfunc_begin0 ## >> Call Site 2 <<
.long Lset3
Lset4 = Ltmp4-Ltmp3 ## Call between Ltmp3 and Ltmp4
.long Lset4
Lset5 = Ltmp5-Lfunc_begin0 ## jumps to Ltmp5
.long Lset5
.byte 0 ## On action: cleanup
## Site 3: the basic_stringbuf::str(const string&) call; lands at Ltmp8.
Lset6 = Ltmp6-Lfunc_begin0 ## >> Call Site 3 <<
.long Lset6
Lset7 = Ltmp7-Ltmp6 ## Call between Ltmp6 and Ltmp7
.long Lset7
Lset8 = Ltmp8-Lfunc_begin0 ## jumps to Ltmp8
.long Lset8
.byte 0 ## On action: cleanup
## Site 4: one region spanning both insertions and str(); lands at Ltmp15.
Lset9 = Ltmp9-Lfunc_begin0 ## >> Call Site 4 <<
.long Lset9
Lset10 = Ltmp14-Ltmp9 ## Call between Ltmp9 and Ltmp14
.long Lset10
Lset11 = Ltmp15-Lfunc_begin0 ## jumps to Ltmp15
.long Lset11
.byte 0 ## On action: cleanup
## Site 5: the rest of the function (destructor calls and cleanup code);
## a landing-pad value of 0 means there is none for this region.
Lset12 = Ltmp14-Lfunc_begin0 ## >> Call Site 5 <<
.long Lset12
Lset13 = Lfunc_end0-Ltmp14 ## Call between Ltmp14 and Lfunc_end0
.long Lset13
.long 0 ## has no landing pad
.byte 0 ## On action: cleanup
.p2align 2

The most likely reason for the difference is that the IOStream implementation is expanded inline while the sprintf() use is just a function call. Nothing inherently prevents IOStreams from being implemented by a library. It does take a tiny bit of abstraction and planning, though: the definition in the standard uses templates. These are normally just implemented inline. Declaring the typically used instantiations (for character types char and wchar_t) as extern templates and explicitly instantiating them is extra work, though. I showed a long time ago that it does pay off in terms of compile time and, at least, libstdc++ preinstantiates the IOStreams functions in a library. Based on your experiment it seems libc++ doesn’t.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Halide: X86 assembly code generation - c++

It sounds like you may be building using a very old Halide tree—for quite a while there has not been any file camera_pipe.cpp, the generated output is not called curved.*, etc. That said, the x86 backend in CodeGen_X86.cpp does generate x86 code. The curved.s you posted is x86_64 assembly.

Related

C++ return reference push instruction not showing up in assembly

Why does march=native corrupt my program?

Why clang does not optimize global const like a #define?

Slow std::string concatenation on windows

Why does a simple use of ostringstream generate so much assembly code?

Categories

Resources