Debugging an assembly program with GDB

Debugging an assembly program with GDB - gdb

Here is my question:
I wrote a piece of assembly code. It reads a file, converts the content to uppercase, and writes the output to a new file.
I assemble and link the code with:
as -gstabs read-files.s -o read-files.o
ld read-files.o -o read-files
And a test like "./read-files input-file output-file" works well.
But what if I want to debug this code with gdb? I tried, but
when I set a breakpoint and the program arguments in gdb with:
(gdb) b *_start+1
(gdb) run test-file TEST-FILE
the program ends with a segmentation fault immediately.
Can I really debug this code the way I described above? Thanks.
The assembly code is here:
# Assemble-time constants and storage for the file-uppercasing program
# (GAS, AT&T syntax, 32-bit Linux `int $0x80` system calls).
.section .data
# System call numbers; loaded into %eax before `int $LINUX_SYSCALL`.
.equ SYS_OPEN, 5
.equ SYS_WRITE, 4
.equ SYS_READ, 3
.equ SYS_CLOSE, 6
.equ SYS_EXIT, 1
# Flag words passed in %ecx to SYS_OPEN.
.equ O_RDONLY, 0
# NOTE(review): octal 03101 sets bit 02000 on top of 01101; on Linux/i386
# that bit is presumably O_APPEND - confirm against <fcntl.h> that it is wanted.
.equ O_CREAT_WRONLY_TRUNC, 03101
# Standard file descriptor numbers.
.equ STDIN, 0
.equ STDOUT, 1
.equ STDERR, 2
# Software interrupt vector used for every system call in this program.
.equ LINUX_SYSCALL, 0x80
# SYS_READ return value that the read loop compares against to stop.
.equ END_OF_FILE, 0
# Number of command-line arguments the program expects (input path, output path).
.equ NUMBER_ARGUMENTS, 2
.section .bss
# Single transfer buffer: BUFFER_SIZE bytes reserved at BUFFER_DATA.
.equ BUFFER_SIZE, 500
.lcomm BUFFER_DATA, BUFFER_SIZE
.section .text
# Stack layout relative to %ebp, which _start sets to the entry %esp.
# Negative offsets are the locals reserved by ST_SIZE_RESERVE.
.equ ST_SIZE_RESERVE, 8
.equ ST_FD_IN, -4
.equ ST_FD_OUT, -8
# Non-negative offsets address the argument block found on the stack at entry.
.equ ST_ARGC, 0
.equ ST_ARGV_0, 4
.equ ST_ARGV_1, 8
.equ ST_ARGV_2, 12
.globl _start
#-----------------------------------------------------------------------
# _start - program entry point (Linux i386, `int $LINUX_SYSCALL` ABI)
#
# Copies the file named by argv[1] into the file named by argv[2],
# upper-casing ASCII 'a'..'z' on the way, BUFFER_SIZE bytes per pass.
#
# Stack at entry (there is no return address; %ebp := entry %esp):
#   ST_ARGC(%ebp)    argc
#   ST_ARGV_1(%ebp)  input path      ST_ARGV_2(%ebp)  output path
#   ST_FD_IN(%ebp)   local: input descriptor
#   ST_FD_OUT(%ebp)  local: output descriptor
#
# Exit status: 0 on success; 1 when fewer than NUMBER_ARGUMENTS
# arguments were supplied or either file could not be opened.
#-----------------------------------------------------------------------
_start:
    movl    %esp, %ebp
    subl    $ST_SIZE_RESERVE, %esp

check_args:
    # With too few arguments the ST_ARGV_1/ST_ARGV_2 slots hold the argv
    # NULL terminator or an environment string rather than file names, and
    # the output open below would create/truncate whatever that names.
    cmpl    $NUMBER_ARGUMENTS+1, ST_ARGC(%ebp)   # argc counts argv[0] too
    jl      exit_failure

open_files:
open_fd_in:
    movl    $SYS_OPEN, %eax
    movl    ST_ARGV_1(%ebp), %ebx
    movl    $O_RDONLY, %ecx
    movl    $0666, %edx
    int     $LINUX_SYSCALL
    testl   %eax, %eax
    js      exit_failure                # negative return = -errno; do not touch the output file

store_fd_in:
    movl    %eax, ST_FD_IN(%ebp)

open_fd_out:
    movl    $SYS_OPEN, %eax
    movl    ST_ARGV_2(%ebp), %ebx
    movl    $O_CREAT_WRONLY_TRUNC, %ecx
    movl    $0666, %edx
    int     $LINUX_SYSCALL
    testl   %eax, %eax
    js      close_in_and_fail           # input is already open: release it first

store_fd_out:
    movl    %eax, ST_FD_OUT(%ebp)

read_loop_begin:
    movl    $SYS_READ, %eax
    movl    ST_FD_IN(%ebp), %ebx
    movl    $BUFFER_DATA, %ecx
    movl    $BUFFER_SIZE, %edx
    int     $LINUX_SYSCALL
    cmpl    $END_OF_FILE, %eax
    jle     end_loop                    # 0 = end of file, negative = read error: stop either way

continue_read_loop:
    # convert_to_upper(length, buffer): arguments are pushed right to left.
    pushl   $BUFFER_DATA
    pushl   %eax
    call    convert_to_upper
    popl    %eax                        # recover the byte count pushed above
    addl    $4, %esp                    # discard the buffer argument
    movl    %eax, %edx                  # write exactly as many bytes as were read
    movl    $SYS_WRITE, %eax
    movl    ST_FD_OUT(%ebp), %ebx
    movl    $BUFFER_DATA, %ecx
    int     $LINUX_SYSCALL
    jmp     read_loop_begin

end_loop:
    movl    $SYS_CLOSE, %eax
    movl    ST_FD_OUT(%ebp), %ebx
    int     $LINUX_SYSCALL
    movl    $SYS_CLOSE, %eax
    movl    ST_FD_IN(%ebp), %ebx
    int     $LINUX_SYSCALL
    movl    $SYS_EXIT, %eax
    xorl    %ebx, %ebx                  # exit status 0
    int     $LINUX_SYSCALL

close_in_and_fail:
    movl    $SYS_CLOSE, %eax
    movl    ST_FD_IN(%ebp), %ebx
    int     $LINUX_SYSCALL
exit_failure:
    movl    $SYS_EXIT, %eax
    movl    $1, %ebx                    # exit status 1
    int     $LINUX_SYSCALL
#-----------------------------------------------------------------------
# convert_to_upper - upper-case ASCII 'a'..'z' in a buffer, in place
#
# C equivalent: char *convert_to_upper(unsigned length, char *buffer)
# ABI:   i386 cdecl (arguments on the stack, caller removes them)
# In:    ST_BUFFER_LEN(%ebp) = number of bytes, ST_BUFFER(%ebp) = buffer
# Out:   %eax = buffer address (unchanged from the original routine)
# Saves: %ebx, %edi (callee-saved under cdecl; previously clobbered)
# Clobb: %ecx, flags
#-----------------------------------------------------------------------
.equ LOWERCASE_A, 'a'
.equ LOWERCASE_Z, 'z'
.equ UPPER_CONVERSION, 'A' - 'a'
# Argument offsets from %ebp after the prologue (saved %ebp, return address).
.equ ST_BUFFER_LEN, 8
.equ ST_BUFFER, 12
convert_to_upper:
    pushl   %ebp
    movl    %esp, %ebp
    pushl   %ebx                        # pushed below %ebp, so the
    pushl   %edi                        # argument offsets are unaffected

    movl    ST_BUFFER(%ebp), %eax       # %eax = buffer base
    movl    ST_BUFFER_LEN(%ebp), %ebx   # %ebx = byte count
    xorl    %edi, %edi                  # %edi = index of the current byte
    testl   %ebx, %ebx
    je      end_convert_loop            # empty buffer: nothing to convert

convert_loop:
    movb    (%eax, %edi, 1), %cl
    # Signed compares: bytes >= 0x80 read as negative and are skipped,
    # so only 'a'..'z' reach the conversion.
    cmpb    $LOWERCASE_A, %cl
    jl      next_byte
    cmpb    $LOWERCASE_Z, %cl
    jg      next_byte
    addb    $UPPER_CONVERSION, %cl
    movb    %cl, (%eax, %edi, 1)
next_byte:
    incl    %edi
    cmpl    %edi, %ebx
    jne     convert_loop                # invariant: %edi < %ebx inside the loop

end_convert_loop:
    popl    %edi
    popl    %ebx
    movl    %ebp, %esp
    popl    %ebp
    ret

Related

Slow std::string concatenation on windows

I have a program that needs to concatenate lots of strings together (to be more precise, integers converted to strings). On my Ubuntu machine (running g++ 7.3.0) the code runs in 1.5 seconds. But the code needs to run on Windows as well (running g++ 6.3.0 using MinGW), where it takes 15 seconds to complete. Furthermore, the Ubuntu setup runs on a much slower laptop using an i7-4712MQ CPU @ 2.30GHz, whereas the Windows machine runs on an i7-7700K CPU @ 4.20GHz.
The code to reproduce the times is shown below. I compile the code with g++ tester.cpp -O2 -o tester (or tester.exe for windows)
#include <iostream>
#include <chrono>
// Benchmark: appends the decimal text of `a` plus a space to one string
// n times and prints the elapsed wall-clock time in seconds.
// NOTE(review): std::string / std::to_string are used but <string> is not
// among the includes shown; presumably it arrives transitively - confirm.
int main(int argc, char const *argv[]) {
auto started = std::chrono::high_resolution_clock::now();
std::string str = "";
const int n = 10000000;
// Reserve two characters per iteration (one digit plus the space).
str.reserve(2 * n);
int a = 1;
for (int i = 0; i < n; ++i) {
// Builds a temporary from to_string, a second one from `+ " "`, then
// appends the result to str; this expression is what is being timed.
str += std::to_string(a) + " ";
}
auto done = std::chrono::high_resolution_clock::now();
// Millisecond count divided by 1000 gives seconds with a fractional part.
double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000;
std::cout << "Done in " << secs << "\n";
return 0;
}
Any idea where the large performance gap might come from?
The disassemblies look like this:
Ubuntu:
.file "tester.cpp"
.text
.align 2
.p2align 4,,15
.type _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, #function
_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19:
.LFB2389:
.cfi_startproc
pushq %r12
.cfi_def_cfa_offset 16
.cfi_offset 12, -16
pushq %rbp
.cfi_def_cfa_offset 24
.cfi_offset 6, -24
movq %rsi, %r12
pushq %rbx
.cfi_def_cfa_offset 32
.cfi_offset 3, -32
movq %rdx, %rbx
movq %rdi, %rbp
subq %rsi, %rbx
subq $16, %rsp
.cfi_def_cfa_offset 48
movq %fs:40, %rax
movq %rax, 8(%rsp)
xorl %eax, %eax
cmpq $15, %rbx
movq %rbx, (%rsp)
ja .L12
movq (%rdi), %rdx
cmpq $1, %rbx
movq %rdx, %rax
jne .L4
movzbl (%rsi), %eax
movb %al, (%rdx)
movq (%rdi), %rdx
.L5:
movq (%rsp), %rax
movq %rax, 8(%rbp)
movb $0, (%rdx,%rax)
movq 8(%rsp), %rax
xorq %fs:40, %rax
jne .L13
addq $16, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 32
popq %rbx
.cfi_def_cfa_offset 24
popq %rbp
.cfi_def_cfa_offset 16
popq %r12
.cfi_def_cfa_offset 8
ret
.L12:
.cfi_restore_state
xorl %edx, %edx
movq %rsp, %rsi
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERmm#PLT
movq (%rsp), %rdx
movq %rax, 0(%rbp)
movq %rdx, 16(%rbp)
.L3:
movq %rbx, %rdx
movq %r12, %rsi
movq %rax, %rdi
call memcpy#PLT
movq 0(%rbp), %rdx
jmp .L5
.L4:
testq %rbx, %rbx
je .L5
jmp .L3
.L13:
call __stack_chk_fail#PLT
.cfi_endproc
.LFE2389:
.size _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19, .-_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.set _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23,_ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.section .text._ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,"axG",#progbits,_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z,comdat
.p2align 4,,15
.weak _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.type _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, #function
_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z:
.LFB1953:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsi, %r10
movq %rdx, %rsi
movq %rcx, %rdx
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %r12
pushq %rbx
.cfi_offset 12, -24
.cfi_offset 3, -32
movq %rdi, %r12
subq $208, %rsp
testb %al, %al
movq %r8, -160(%rbp)
movq %r9, -152(%rbp)
je .L15
movaps %xmm0, -144(%rbp)
movaps %xmm1, -128(%rbp)
movaps %xmm2, -112(%rbp)
movaps %xmm3, -96(%rbp)
movaps %xmm4, -80(%rbp)
movaps %xmm5, -64(%rbp)
movaps %xmm6, -48(%rbp)
movaps %xmm7, -32(%rbp)
.L15:
movq %fs:40, %rax
movq %rax, -200(%rbp)
xorl %eax, %eax
leaq 30(%rsi), %rax
leaq -224(%rbp), %rcx
andq $-16, %rax
movl $32, -224(%rbp)
movl $48, -220(%rbp)
subq %rax, %rsp
leaq 16(%rbp), %rax
leaq 15(%rsp), %rbx
movq %rax, -216(%rbp)
leaq -192(%rbp), %rax
andq $-16, %rbx
movq %rbx, %rdi
movq %rax, -208(%rbp)
call *%r10
leaq 16(%r12), %rdx
movq %r12, %rdi
movq %rbx, %rsi
movq %rdx, (%r12)
movslq %eax, %rdx
addq %rbx, %rdx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.23
movq -200(%rbp), %rdi
xorq %fs:40, %rdi
movq %r12, %rax
jne .L18
leaq -16(%rbp), %rsp
popq %rbx
popq %r12
popq %rbp
.cfi_remember_state
.cfi_def_cfa 7, 8
ret
.L18:
.cfi_restore_state
call __stack_chk_fail#PLT
.cfi_endproc
.LFE1953:
.size _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z, .-_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string ""
.LC1:
.string "%d"
.LC2:
.string "basic_string::append"
.LC3:
.string " "
.LC5:
.string "Done in "
.LC6:
.string "\n"
.section .text.startup,"ax",#progbits
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB1871:
.cfi_startproc
.cfi_personality 0x9b,DW.ref.__gxx_personality_v0
.cfi_lsda 0x1b,.LLSDA1871
pushq %r15
.cfi_def_cfa_offset 16
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
.cfi_offset 12, -40
pushq %rbp
.cfi_def_cfa_offset 48
.cfi_offset 6, -48
pushq %rbx
.cfi_def_cfa_offset 56
.cfi_offset 3, -56
subq $136, %rsp
.cfi_def_cfa_offset 192
leaq 16(%rsp), %r13
movq %fs:40, %rax
movq %rax, 120(%rsp)
xorl %eax, %eax
call _ZNSt6chrono3_V212system_clock3nowEv#PLT
leaq .LC0(%rip), %rdx
movq %rax, (%rsp)
leaq 16(%r13), %rax
movq %r13, %rdi
movq %rdx, %rsi
movq %rax, 16(%rsp)
.LEHB0:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.19
.LEHE0:
movl $20000000, %esi
movq %r13, %rdi
.LEHB1:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEm#PLT
.LEHE1:
leaq 48(%rsp), %rbp
leaq 80(%rsp), %rax
movl $10000000, %ebx
movabsq $9223372036854775807, %r14
leaq 96(%rsp), %r12
movq %rax, 8(%rsp)
leaq 16(%rbp), %r15
jmp .L25
.p2align 4,,10
.p2align 3
.L21:
movq %rcx, 80(%rsp)
movq 16(%rax), %rcx
movq %rcx, 96(%rsp)
.L22:
movq 8(%rax), %rcx
movb $0, 16(%rax)
movq %r13, %rdi
movq %rcx, 88(%rsp)
movq %rdx, (%rax)
movq $0, 8(%rax)
movq 80(%rsp), %rsi
movq 88(%rsp), %rdx
.LEHB2:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm#PLT
.LEHE2:
movq 80(%rsp), %rdi
cmpq %r12, %rdi
je .L23
call _ZdlPv#PLT
.L23:
movq 48(%rsp), %rdi
cmpq %r15, %rdi
je .L24
call _ZdlPv#PLT
.L24:
subl $1, %ebx
je .L40
.L25:
movq vsnprintf#GOTPCREL(%rip), %rsi
leaq .LC1(%rip), %rcx
movl $1, %r8d
movl $16, %edx
movq %rbp, %rdi
xorl %eax, %eax
.LEHB3:
call _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_mPKS8_P13__va_list_tagEmSB_z
.LEHE3:
cmpq %r14, 56(%rsp)
je .L41
leaq .LC3(%rip), %rsi
movl $1, %edx
movq %rbp, %rdi
.LEHB4:
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcm#PLT
.LEHE4:
movq %r12, 80(%rsp)
movq (%rax), %rcx
leaq 16(%rax), %rdx
cmpq %rdx, %rcx
jne .L21
movdqu 16(%rax), %xmm0
movaps %xmm0, 96(%rsp)
jmp .L22
.p2align 4,,10
.p2align 3
.L40:
call _ZNSt6chrono3_V212system_clock3nowEv#PLT
subq (%rsp), %rax
movabsq $4835703278458516699, %rdx
leaq .LC5(%rip), %rsi
pxor %xmm0, %xmm0
leaq _ZSt4cout(%rip), %rdi
movq %rax, %rcx
imulq %rdx
sarq $63, %rcx
sarq $18, %rdx
subq %rcx, %rdx
cvtsi2sdq %rdx, %xmm0
movl $8, %edx
divsd .LC4(%rip), %xmm0
movsd %xmm0, (%rsp)
.LEHB5:
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l#PLT
movsd (%rsp), %xmm0
leaq _ZSt4cout(%rip), %rdi
call _ZNSo9_M_insertIdEERSoT_#PLT
leaq .LC6(%rip), %rsi
movq %rax, %rdi
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#PLT
.LEHE5:
movq 16(%rsp), %rdi
addq $16, %r13
cmpq %r13, %rdi
je .L26
call _ZdlPv#PLT
.L26:
xorl %eax, %eax
movq 120(%rsp), %rbx
xorq %fs:40, %rbx
jne .L42
addq $136, %rsp
.cfi_remember_state
.cfi_def_cfa_offset 56
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.L41:
.cfi_restore_state
leaq .LC2(%rip), %rdi
.LEHB6:
call _ZSt20__throw_length_errorPKc#PLT
.LEHE6:
.L35:
movq %rax, %rbx
.L29:
movq 48(%rsp), %rdi
addq $16, %rbp
cmpq %rbp, %rdi
je .L31
call _ZdlPv#PLT
.L31:
movq 16(%rsp), %rdi
addq $16, %r13
cmpq %r13, %rdi
je .L32
call _ZdlPv#PLT
.L32:
movq %rbx, %rdi
.LEHB7:
call _Unwind_Resume#PLT
.LEHE7:
.L34:
movq %rax, %rbx
jmp .L31
.L36:
movq 8(%rsp), %rdx
movq 80(%rsp), %rdi
movq %rax, %rbx
addq $16, %rdx
cmpq %rdx, %rdi
je .L29
call _ZdlPv#PLT
jmp .L29
.L42:
call __stack_chk_fail#PLT
.cfi_endproc
.LFE1871:
.globl __gxx_personality_v0
.section .gcc_except_table,"a",#progbits
.LLSDA1871:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSE1871-.LLSDACSB1871
.LLSDACSB1871:
.uleb128 .LEHB0-.LFB1871
.uleb128 .LEHE0-.LEHB0
.uleb128 0
.uleb128 0
.uleb128 .LEHB1-.LFB1871
.uleb128 .LEHE1-.LEHB1
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB2-.LFB1871
.uleb128 .LEHE2-.LEHB2
.uleb128 .L36-.LFB1871
.uleb128 0
.uleb128 .LEHB3-.LFB1871
.uleb128 .LEHE3-.LEHB3
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB4-.LFB1871
.uleb128 .LEHE4-.LEHB4
.uleb128 .L35-.LFB1871
.uleb128 0
.uleb128 .LEHB5-.LFB1871
.uleb128 .LEHE5-.LEHB5
.uleb128 .L34-.LFB1871
.uleb128 0
.uleb128 .LEHB6-.LFB1871
.uleb128 .LEHE6-.LEHB6
.uleb128 .L35-.LFB1871
.uleb128 0
.uleb128 .LEHB7-.LFB1871
.uleb128 .LEHE7-.LEHB7
.uleb128 0
.uleb128 0
.LLSDACSE1871:
.section .text.startup
.size main, .-main
.p2align 4,,15
.type _GLOBAL__sub_I_main, #function
_GLOBAL__sub_I_main:
.LFB2369:
.cfi_startproc
leaq _ZStL8__ioinit(%rip), %rdi
subq $8, %rsp
.cfi_def_cfa_offset 16
call _ZNSt8ios_base4InitC1Ev#PLT
movq _ZNSt8ios_base4InitD1Ev#GOTPCREL(%rip), %rdi
leaq __dso_handle(%rip), %rdx
leaq _ZStL8__ioinit(%rip), %rsi
addq $8, %rsp
.cfi_def_cfa_offset 8
jmp __cxa_atexit#PLT
.cfi_endproc
.LFE2369:
.size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I_main
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC4:
.long 0
.long 1083129856
.hidden DW.ref.__gxx_personality_v0
.weak DW.ref.__gxx_personality_v0
.section .data.DW.ref.__gxx_personality_v0,"awG",#progbits,DW.ref.__gxx_personality_v0,comdat
.align 8
.type DW.ref.__gxx_personality_v0, #object
.size DW.ref.__gxx_personality_v0, 8
DW.ref.__gxx_personality_v0:
.quad __gxx_personality_v0
.hidden __dso_handle
.ident "GCC: (Ubuntu 7.3.0-16ubuntu3) 7.3.0"
.section .note.GNU-stack,"",#progbits
Windows:
.file "tester.cpp"
.text
.p2align 4,,15
.def ___tcf_0; .scl 3; .type 32; .endef
___tcf_0:
LFB2556:
.cfi_startproc
movl $__ZStL8__ioinit, %ecx
jmp __ZNSt8ios_base4InitD1Ev
.cfi_endproc
LFE2556:
.section .rdata,"dr"
.align 4
LC0:
.ascii "basic_string::_M_construct null not valid\0"
.text
.align 2
.p2align 4,,15
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29; .scl 3; .type 32; .endef
__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29:
LFB2587:
.cfi_startproc
pushl %edi
.cfi_def_cfa_offset 8
.cfi_offset 7, -8
pushl %esi
.cfi_def_cfa_offset 12
.cfi_offset 6, -12
movl %ecx, %esi
pushl %ebx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
subl $32, %esp
.cfi_def_cfa_offset 48
movl 48(%esp), %edi
movl 52(%esp), %ebx
testl %edi, %edi
jne L5
testl %ebx, %ebx
je L5
movl $LC0, (%esp)
call __ZSt19__throw_logic_errorPKc
.p2align 4,,10
L5:
subl %edi, %ebx
cmpl $15, %ebx
movl %ebx, 28(%esp)
ja L22
movl (%esi), %edx
cmpl $1, %ebx
movl %edx, %eax
je L23
testl %ebx, %ebx
jne L6
L8:
movl 28(%esp), %eax
movl %eax, 4(%esi)
movb $0, (%edx,%eax)
addl $32, %esp
.cfi_remember_state
.cfi_def_cfa_offset 16
popl %ebx
.cfi_restore 3
.cfi_def_cfa_offset 12
popl %esi
.cfi_restore 6
.cfi_def_cfa_offset 8
popl %edi
.cfi_restore 7
.cfi_def_cfa_offset 4
ret $8
.p2align 4,,10
L22:
.cfi_restore_state
leal 28(%esp), %eax
movl $0, 4(%esp)
movl %esi, %ecx
movl %eax, (%esp)
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj
.cfi_def_cfa_offset 40
subl $8, %esp
.cfi_def_cfa_offset 48
movl %eax, (%esi)
movl 28(%esp), %edx
movl %edx, 8(%esi)
L6:
movl %ebx, 8(%esp)
movl %edi, 4(%esp)
movl %eax, (%esp)
call _memcpy
movl (%esi), %edx
jmp L8
.p2align 4,,10
L23:
movzbl (%edi), %eax
movb %al, (%edx)
movl (%esi), %edx
jmp L8
.cfi_endproc
LFE2587:
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21; .scl 3; .type 32; .endef
.set __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21,__ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29
.section .text$_ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z,"x"
.linkonce discard
.p2align 4,,15
.globl __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z
.def __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z; .scl 2; .type 32; .endef
__ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z:
LFB2177:
.cfi_startproc
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
pushl %esi
pushl %ebx
subl $16, %esp
.cfi_offset 6, -12
.cfi_offset 3, -16
movl 16(%ebp), %edx
movl 8(%ebp), %esi
leal 30(%edx), %eax
andl $-16, %eax
call ___chkstk_ms
subl %eax, %esp
leal 24(%ebp), %eax
leal 31(%esp), %ebx
movl %edx, 4(%esp)
movl %eax, 12(%esp)
movl 20(%ebp), %eax
andl $-16, %ebx
movl %ebx, (%esp)
movl %eax, 8(%esp)
call *12(%ebp)
leal 8(%esi), %edx
addl %ebx, %eax
movl %esi, %ecx
movl %edx, (%esi)
movl %eax, 4(%esp)
movl %ebx, (%esp)
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPcEEvT_S7_St20forward_iterator_tag.isra.29
subl $8, %esp
leal -8(%ebp), %esp
movl %esi, %eax
popl %ebx
.cfi_restore 3
popl %esi
.cfi_restore 6
popl %ebp
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
LFE2177:
.def ___main; .scl 2; .type 32; .endef
.section .rdata,"dr"
LC1:
.ascii "\0"
LC2:
.ascii "%d\0"
LC3:
.ascii "basic_string::append\0"
LC4:
.ascii " \0"
.def ___divdi3; .scl 2; .type 32; .endef
LC6:
.ascii "Done in \0"
LC7:
.ascii "\12\0"
.section .text.startup,"x"
.p2align 4,,15
.globl _main
.def _main; .scl 2; .type 32; .endef
_main:
LFB2111:
.cfi_startproc
.cfi_personality 0,___gxx_personality_v0
.cfi_lsda 0,LLSDA2111
leal 4(%esp), %ecx
.cfi_def_cfa 1, 0
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
.cfi_escape 0x10,0x5,0x2,0x75,0
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
pushl %ecx
.cfi_escape 0xf,0x3,0x75,0x70,0x6
.cfi_escape 0x10,0x7,0x2,0x75,0x7c
.cfi_escape 0x10,0x6,0x2,0x75,0x78
.cfi_escape 0x10,0x3,0x2,0x75,0x74
subl $152, %esp
call ___main
call __ZNSt6chrono3_V212system_clock3nowEv
leal -96(%ebp), %ecx
movl %eax, -136(%ebp)
leal -88(%ebp), %eax
movl $LC1, 4(%esp)
movl $LC1, (%esp)
movl %edx, -132(%ebp)
movl %eax, -96(%ebp)
LEHB0:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE12_M_constructIPKcEEvT_S8_St20forward_iterator_tag.isra.21
LEHE0:
leal -96(%ebp), %ecx
subl $8, %esp
movl $20000000, (%esp)
LEHB1:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj
LEHE1:
subl $4, %esp
movl $10000000, %edi
leal -72(%ebp), %esi
leal -40(%ebp), %ebx
jmp L32
.p2align 4,,10
L28:
movl %ecx, -48(%ebp)
movl 8(%eax), %ecx
movl %ecx, -40(%ebp)
L29:
movl 4(%eax), %ecx
movb $0, 8(%eax)
movl %ecx, -44(%ebp)
movl %edx, (%eax)
leal -96(%ebp), %ecx
movl $0, 4(%eax)
movl -44(%ebp), %eax
movl %eax, 4(%esp)
movl -48(%ebp), %eax
movl %eax, (%esp)
LEHB2:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj
LEHE2:
movl -48(%ebp), %eax
subl $8, %esp
cmpl %ebx, %eax
je L30
movl %eax, (%esp)
call __ZdlPv
L30:
movl -72(%ebp), %eax
leal -64(%ebp), %edx
cmpl %edx, %eax
je L31
movl %eax, (%esp)
call __ZdlPv
L31:
subl $1, %edi
je L46
L32:
movl $1, 16(%esp)
movl $LC2, 12(%esp)
movl $16, 8(%esp)
movl $_vsnprintf, 4(%esp)
movl %esi, (%esp)
LEHB3:
call __ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_jPKS8_PcEjSB_z
LEHE3:
cmpl $2147483647, -68(%ebp)
je L47
movl $1, 4(%esp)
movl $LC4, (%esp)
movl %esi, %ecx
LEHB4:
call __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj
LEHE4:
movl %ebx, -48(%ebp)
movl (%eax), %ecx
leal 8(%eax), %edx
subl $8, %esp
cmpl %edx, %ecx
jne L28
movl 12(%eax), %ecx
movl %ecx, -120(%ebp)
movl 16(%eax), %ecx
movl %ecx, -124(%ebp)
movl 20(%eax), %ecx
movl %ecx, -128(%ebp)
movl 8(%eax), %ecx
movl %ecx, -40(%ebp)
movl -120(%ebp), %ecx
movl %ecx, -36(%ebp)
movl -124(%ebp), %ecx
movl %ecx, -32(%ebp)
movl -128(%ebp), %ecx
movl %ecx, -28(%ebp)
jmp L29
.p2align 4,,10
L46:
call __ZNSt6chrono3_V212system_clock3nowEv
subl -136(%ebp), %eax
movl $1000000, 8(%esp)
sbbl -132(%ebp), %edx
movl $0, 12(%esp)
movl %eax, (%esp)
movl %edx, 4(%esp)
call ___divdi3
movl %eax, -120(%ebp)
movl %edx, -116(%ebp)
fildq -120(%ebp)
movl $8, 8(%esp)
movl $LC6, 4(%esp)
movl $__ZSt4cout, (%esp)
fdivs LC5
fstpl -120(%ebp)
LEHB5:
call __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i
fldl -120(%ebp)
movl $__ZSt4cout, %ecx
fstpl (%esp)
call __ZNSo9_M_insertIdEERSoT_
subl $8, %esp
movl $LC7, 4(%esp)
movl %eax, (%esp)
call __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
LEHE5:
movl -96(%ebp), %eax
leal -88(%ebp), %edi
cmpl %edi, %eax
je L43
movl %eax, (%esp)
call __ZdlPv
L43:
leal -16(%ebp), %esp
xorl %eax, %eax
popl %ecx
.cfi_remember_state
.cfi_restore 1
.cfi_def_cfa 1, 0
popl %ebx
.cfi_restore 3
popl %esi
.cfi_restore 6
popl %edi
.cfi_restore 7
popl %ebp
.cfi_restore 5
leal -4(%ecx), %esp
.cfi_def_cfa 4, 4
ret
L47:
.cfi_restore_state
movl $LC3, (%esp)
LEHB6:
call __ZSt20__throw_length_errorPKc
LEHE6:
L41:
movl %eax, %ebx
L36:
movl -72(%ebp), %eax
leal -64(%ebp), %edx
cmpl %edx, %eax
je L38
movl %eax, (%esp)
call __ZdlPv
L38:
movl -96(%ebp), %eax
leal -88(%ebp), %edi
cmpl %edi, %eax
je L39
movl %eax, (%esp)
call __ZdlPv
L39:
movl %ebx, (%esp)
LEHB7:
call __Unwind_Resume
LEHE7:
L42:
movl %eax, %esi
movl -48(%ebp), %eax
cmpl %ebx, %eax
je L35
movl %eax, (%esp)
call __ZdlPv
L35:
movl %esi, %ebx
jmp L36
L40:
movl %eax, %ebx
jmp L38
.cfi_endproc
LFE2111:
.def ___gxx_personality_v0; .scl 2; .type 32; .endef
.section .gcc_except_table,"w"
LLSDA2111:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 LLSDACSE2111-LLSDACSB2111
LLSDACSB2111:
.uleb128 LEHB0-LFB2111
.uleb128 LEHE0-LEHB0
.uleb128 0
.uleb128 0
.uleb128 LEHB1-LFB2111
.uleb128 LEHE1-LEHB1
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB2-LFB2111
.uleb128 LEHE2-LEHB2
.uleb128 L42-LFB2111
.uleb128 0
.uleb128 LEHB3-LFB2111
.uleb128 LEHE3-LEHB3
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB4-LFB2111
.uleb128 LEHE4-LEHB4
.uleb128 L41-LFB2111
.uleb128 0
.uleb128 LEHB5-LFB2111
.uleb128 LEHE5-LEHB5
.uleb128 L40-LFB2111
.uleb128 0
.uleb128 LEHB6-LFB2111
.uleb128 LEHE6-LEHB6
.uleb128 L41-LFB2111
.uleb128 0
.uleb128 LEHB7-LFB2111
.uleb128 LEHE7-LEHB7
.uleb128 0
.uleb128 0
LLSDACSE2111:
.section .text.startup,"x"
.p2align 4,,15
.def __GLOBAL__sub_I_main; .scl 3; .type 32; .endef
__GLOBAL__sub_I_main:
LFB2557:
.cfi_startproc
subl $28, %esp
.cfi_def_cfa_offset 32
movl $__ZStL8__ioinit, %ecx
call __ZNSt8ios_base4InitC1Ev
movl $___tcf_0, (%esp)
call _atexit
addl $28, %esp
.cfi_def_cfa_offset 4
ret
.cfi_endproc
LFE2557:
.section .ctors,"w"
.align 4
.long __GLOBAL__sub_I_main
.lcomm __ZStL8__ioinit,1,1
.section .rdata,"dr"
.align 4
LC5:
.long 1148846080
.ident "GCC: (MinGW.org GCC-6.3.0-1) 6.3.0"
.def __ZNSt8ios_base4InitD1Ev; .scl 2; .type 32; .endef
.def __ZSt19__throw_logic_errorPKc; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_createERjj; .scl 2; .type 32; .endef
.def _memcpy; .scl 2; .type 32; .endef
.def __ZNSt6chrono3_V212system_clock3nowEv; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEj; .scl 2; .type 32; .endef
.def __ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcj; .scl 2; .type 32; .endef
.def __ZdlPv; .scl 2; .type 32; .endef
.def _vsnprintf; .scl 2; .type 32; .endef
.def __ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_i; .scl 2; .type 32; .endef
.def __ZNSo9_M_insertIdEERSoT_; .scl 2; .type 32; .endef
.def __ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc; .scl 2; .type 32; .endef
.def __ZSt20__throw_length_errorPKc; .scl 2; .type 32; .endef
.def __Unwind_Resume; .scl 2; .type 32; .endef
.def __ZNSt8ios_base4InitC1Ev; .scl 2; .type 32; .endef
.def _atexit; .scl 2; .type 32; .endef

Quick look at disassembly shows that Windows version uses movl (i. e. long word, 32 bit move) and Linux version uses movq (quad word, 64 bit) and SSE registers xmm.
My bet is that on Linux, you compile for x86-64, while on Windows you target 32 bit x86.
x86-64 includes SSE2 extension, while x86 does not, so MinGW defaults to no-SSE mode.
If that's the case, building with 64 bit toolchain on Windows should result in comparable performance. Alternatively, you might enable SSE for 32 bit builds (-msse2 compiler flag, if I remember correctly).

The mingw.org implementation just seems to be much more inefficient than linux, Visual Studio or mingw-w64.org.
>g++ --version
g++ (MinGW.org GCC-6.3.0-1) 6.3.0
Done in 24.808
>g++ --version
g++ (i686-posix-dwarf-rev2, Built by MinGW-W64 project) 6.3.0
Done in 0.679

Tested with MSYS2 MinGW64:
g++ --version
g++.exe (Rev2, Built by MSYS2 project) 7.3.0
g++.exe -Wall -O3 -mtune=native -fno-exceptions -fno-rtti -c main.cpp -o main.o
g++.exe -o test.exe main.o -s
Done in 0.547
Env: Windows 10 x64
CPU: Intel Core i5-6300U, 2.4GHz
RAM: 16GB DDR4
In any case, MinGW uses msvcrt.dll (the one bundled with Windows, not the Universal CRT or a Visual Studio CRT) instead of GNU libc, so in my experience the speed gap may come from the C standard library.
P.S. with some changes (same compiler flags)
#include <iostream>
#include <chrono>
#ifdef _WIN32
#include <windows.h>
static std::size_t page_size() noexcept {
::SYSTEM_INFO si;
::GetSystemInfo(&si);
return si.dwPageSize;
}
#else
#include <sys/types.h>
#include <unistd.h>
static std::size_t page_size() noexcept {
    // sysconf reports the page size as a long; hand it back as size_t.
    const long raw = ::sysconf(_SC_PAGESIZE);
    return static_cast<std::size_t>(raw);
}
#endif // _WIN32
// Revised benchmark: converts the number to text once, then appends that
// string and a space n times into a buffer reserved up front.
int main(int argc, char const *argv[]) {
auto started = std::chrono::high_resolution_clock::now();
const std::size_t n = 10000000;
// align size to page boundary
// The mask arithmetic below assumes page_size() is a power of two.
const std::size_t al = page_size() - 1;
// Round 2*n up to the next multiple of the page size.
const std::size_t buff_size = ( (n << 1) + al) & ~al;
std::string str;
str.reserve(buff_size);
// Conversion is hoisted out of the loop, so no temporaries are built per iteration.
const std::string to_append( std::to_string(1) );
for (std::size_t i = 0; i < n; ++i) {
str.append( to_append );
str.push_back(' ');
}
auto done = std::chrono::high_resolution_clock::now();
// Millisecond count divided by 1000 gives seconds with a fractional part.
double secs = (double) std::chrono::duration_cast<std::chrono::milliseconds>(done-started).count() / 1000;
std::cout << "Done in " << secs << "\n";
return 0;
}
Done in 0.046
Asm output for main function:
# Compiler listing for main() of the revised benchmark (x86-64, register
# usage follows the Windows x64 convention: arguments in rcx, rdx, r8, r9).
# Register roles visible below:
#   rsi = remaining loop iterations      r14 = start timestamp
#   rbp = &str object at 64(%rsp)        r12 = rbp+16, str's in-object buffer
#   r13 = &96(%rsp): first the GetSystemInfo buffer, then the to_append string
main:
pushq %r14
.seh_pushreg %r14
pushq %r13
.seh_pushreg %r13
pushq %r12
.seh_pushreg %r12
pushq %rbp
.seh_pushreg %rbp
pushq %rdi
.seh_pushreg %rdi
pushq %rsi
.seh_pushreg %rsi
pushq %rbx
.seh_pushreg %rbx
subq $144, %rsp
.seh_stackalloc 144
.seh_endprologue
# Loop counter n = 10000000.
movl $10000000, %esi
call __main
leaq 96(%rsp), %r13
leaq 64(%rsp), %rbp
call _ZNSt6chrono3_V212system_clock3nowEv
movq %r13, %rcx
leaq 16(%rbp), %r12
# Keep the start time across the calls that follow.
movq %rax, %r14
call *__imp_GetSystemInfo(%rip)
# Presumably si.dwPageSize (offset 4 in the buffer at 96(%rsp)) - confirm
# against the SYSTEM_INFO layout.
movl 100(%rsp), %eax
movq %rbp, %rcx
# Construct the empty str in place: data -> in-object buffer, length 0, NUL.
movq %r12, 64(%rsp)
movq $0, 72(%rsp)
# rdx = (20000000 + page - 1) & -page, i.e. 2*n rounded up to a page multiple.
leaq 19999999(%rax), %rdx
negq %rax
movb $0, 80(%rsp)
andq %rax, %rdx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE7reserveEy
# Build to_append at r13 via __to_xstring(vsnprintf, 16, .LC0, 1);
# .LC0 is not shown in this excerpt (presumably the "%d" format).
movl $1, 32(%rsp)
movq %r13, %rcx
leaq .LC0(%rip), %r9
movl $16, %r8d
leaq _ZL9vsnprintfPcyPKcS_(%rip), %rdx
call _ZN9__gnu_cxx12__to_xstringINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEcEET_PFiPT0_yPKS8_PcEySB_z
jmp .L14
.p2align 4,,10
# Fast path of the inlined push_back: capacity suffices, store the space.
.L16:
movb $32, (%rdx,%rbx)
# Commit the new length (rdi = old length + 1) and re-terminate with NUL.
.L26:
movq 64(%rsp), %rax
movq %rdi, 72(%rsp)
movb $0, 1(%rax,%rbx)
subq $1, %rsi
je .L27
# Loop body: str.append(to_append) with data at 96(%rsp), length at 104(%rsp).
.L14:
movq 96(%rsp), %rdx
movq 104(%rsp), %r8
movq %rbp, %rcx
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_appendEPKcy
# push_back(' '): rbx = current length, rdi = length + 1.
movq 72(%rsp), %rbx
movq 64(%rsp), %rdx
# Capacity is 15 while str still uses its in-object buffer, else 80(%rsp).
movl $15, %eax
leaq 1(%rbx), %rdi
cmpq %r12, %rdx
je .L15
movq 80(%rsp), %rax
.L15:
cmpq %rax, %rdi
jbe .L16
# Slow path: let _M_mutate grow the storage, then store the space.
xorl %r9d, %r9d
xorl %r8d, %r8d
movq %rbx, %rdx
movq %rbp, %rcx
movq $1, 32(%rsp)
call _ZNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEE9_M_mutateEyyPKcy
movq 64(%rsp), %rax
movb $32, (%rax,%rbx)
jmp .L26
.p2align 4,,10
# Loop finished: take the end time and convert the difference.
.L27:
call _ZNSt6chrono3_V212system_clock3nowEv
pxor %xmm1, %xmm1
movl $8, %r8d
# Signed division by 1000000 done as multiply-high by the magic constant
# (about 2^82 / 10^6), shift by 18 and sign correction: clock ticks to ms.
movabsq $4835703278458516699, %rdx
subq %r14, %rax
addq $16, %r13
movq %rax, %rcx
imulq %rdx
sarq $63, %rcx
sarq $18, %rdx
subq %rcx, %rdx
movq .refptr._ZSt4cout(%rip), %rcx
cvtsi2sdq %rdx, %xmm1
leaq .LC2(%rip), %rdx
# .LC1 is not shown in this excerpt; presumably the double 1000.0 - confirm.
divsd .LC1(%rip), %xmm1
movsd %xmm1, 56(%rsp)
# Stream insertion: an 8-character string at .LC2, the double, then .LC3.
call _ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_x
movsd 56(%rsp), %xmm1
movq .refptr._ZSt4cout(%rip), %rcx
call _ZNSo9_M_insertIdEERSoT_
leaq .LC3(%rip), %rdx
movq %rax, %rcx
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
# Destroy to_append: free its data unless it points at the in-object buffer.
movq 96(%rsp), %rcx
cmpq %r13, %rcx
je .L19
call _ZdlPv
# Destroy str the same way.
.L19:
movq 64(%rsp), %rcx
addq $16, %rbp
cmpq %rbp, %rcx
je .L20
call _ZdlPv
# return 0, restoring the saved registers in reverse push order.
.L20:
xorl %eax, %eax
addq $144, %rsp
popq %rbx
popq %rsi
popq %rdi
popq %rbp
popq %r12
popq %r13
popq %r14
ret
.seh_endproc
.p2align 4,,15
.def _GLOBAL__sub_I_main; .scl 3; .type 32; .endef
.seh_proc _GLOBAL__sub_I_main

(Just for a sense of proportion) Windows Release target vs. Debug target in Visual Studio C++: by default, the Debug target is compiled without optimization, while the Release target is compiled with /O2 optimization, with /Oi ("Enable Intrinsic Functions"), and with /GL ("Whole Program Optimization"). Your code, on my workstation, Debug x64 vs. Release x64:
Debug: 70 sec.
Release: 0.27 sec.
You build with MinGW (which I am not familiar with). But from a fast search, there is a talk about Debug/Release mode ...and MinGW has equivalent /O2 optimization, /Oi ("Enable Intrinsic Functions"), and /Og ("Enable Global Optimization") flags, it seems.
-
Compile with these 3 flags (x64 target), & compare with the VS Release x64 benchmark. Anyway, this is MS default compile optimization for a Release target.
-
Test Environment:
HP 8100, Windows 10 Pro 64 bit, CPU i7 870, 16 GB DDR3 RAM, Visual Studio 2017, Targets: Debug x64 / Release x64

I tried your code on my Windows machine with MinGW 4.8.0 and got ~20 seconds. When I changed the string concatenation to std::stringstream I got 0.5 seconds:
...
std::stringstream ss;
for (int i = 0; i < n; ++i) {
//str += std::to_string(a) + " ";
ss << a << " ";
}
str = ss.str();
...

C++ Tail recursion using 64-bit variables

I have written a simple Fibonacci function as an exercise in C++ (using Visual Studio) to test Tail Recursion and to see how it works.
this is the code:
// Fibonacci in accumulator form: `res` and `next` carry two consecutive
// values and `n` counts the remaining steps. With the call
// fib_tail(n, 0, 1) used below, the result is the n-th Fibonacci number.
int fib_tail(int n, int res, int next) {
if (n == 0) {
return res;
}
// The recursive call is the last action, so it is in tail position.
return fib_tail(n - 1, next, res + next);
}
// Driver for the int version; the result of the call is discarded.
int main()
{
fib_tail(10,0,1); //Tail Recursion works
}
When I compiled in Release mode I saw that the optimized assembly uses a JMP instruction instead of a call. So my conclusion was: tail recursion works. See the image below:
I wanted to do some performance tests by increasing the input variable n in my Fibonacci function. I then opted to change the variable type, used in the function, from int to unsigned long long. Then I passed a big number like: 10e+08
This is now the new function:
// 64-bit unsigned alias used for every parameter and the return value.
typedef unsigned long long ULONG64;
// Same accumulator-style Fibonacci as the int version, widened to 64 bits;
// the additions wrap modulo 2^64 because the type is unsigned.
ULONG64 fib_tail(ULONG64 n, ULONG64 res, ULONG64 next) {
if (n == 0) {
return res;
}
// The recursive call is still the last action (tail position).
return fib_tail(n - 1, next, res + next);
}
// Driver for the ULONG64 version.
// 10e+9 is a floating-point literal (value 1e10); it is converted to
// ULONG64 when passed as n. The returned value is not used.
int main()
{
fib_tail(10e+9,0,1); //Tail recursion does not work
}
When I ran the code above I got a stack overflow exception, which made me think that tail recursion was not working. I looked at the assembly and in fact I found this:
As you can see, there is now a call instruction, whereas I was expecting only a simple JMP. I don't understand why using an 8-byte variable disables tail recursion. Why doesn't the compiler perform the optimization in this case?

This is one of those questions that you'd have to ask the guys that do compiler optimisation for MS - there is really no technical reason why ANY return type should prevent tail-recursion from being a jump as such - there may be OTHER reasons such as "the code is too complex to understand" or some such.
clang 3.7 as of a couple of weeks back clearly figures it out:
# fib_tail(unsigned long long, unsigned long long, unsigned long long),
# 32-bit x86, AT&T syntax. Each 64-bit value is handled as two 32-bit halves.
# The recursion has been turned into the loop at .LBB0_3; there is no call.
# Register roles inside the loop:
#   %edi:%ebx = n      %ecx:%esi = res
#   next: high half in %ebp at loop entry, low half in the spill slot (%esp)
# Result is returned in %edx:%eax.
_Z8fib_tailyyy: # #_Z8fib_tailyyy
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
# This push makes room for the 4-byte spill slot at (%esp).
pushl %eax
# Five pushes (20 bytes) plus the return address put the first argument
# at 24(%esp).
# %ecx = res (high half)
movl 36(%esp), %ecx
# %esi = res (low half)
movl 32(%esp), %esi
# %edi = n (high half)
movl 28(%esp), %edi
# %ebx = n (low half)
movl 24(%esp), %ebx
# OR the two halves of n together: the result is zero only when n == 0.
movl %ebx, %eax
orl %edi, %eax
# n == 0 on entry: return res immediately.
je .LBB0_1
# %ebp = next (high half)
movl 44(%esp), %ebp
# %eax = next (low half), kept in the spill slot across iterations.
movl 40(%esp), %eax
movl %eax, (%esp) # 4-byte Spill
.LBB0_3: # %if.end
# %edx:%eax = current next
movl %ebp, %edx
movl (%esp), %eax # 4-byte Reload
# n = n - 1 (64-bit add of -1 with carry into the high half)
addl $-1, %ebx
adcl $-1, %edi
# %ecx:%esi = res + next (64-bit add with carry)
addl %eax, %esi
adcl %edx, %ecx
# Set ZF from (n == 0); the movl instructions that follow do not change
# flags, so the jne below still sees this result.
movl %ebx, %ebp
orl %edi, %ebp
# New next = res + next: low half to the spill slot, high half to %ebp.
movl %esi, (%esp) # 4-byte Spill
movl %ecx, %ebp
# New res = old next.
movl %eax, %esi
movl %edx, %ecx
# Loop while n != 0.
jne .LBB0_3
# On exit %edx:%eax already holds the value just copied into res.
jmp .LBB0_4
.LBB0_1:
# Early-exit path: return value = res.
movl %esi, %eax
movl %ecx, %edx
.LBB0_4: # %return
# Drop the spill slot and restore the saved registers.
addl $4, %esp
popl %esi
popl %edi
popl %ebx
popl %ebp
retl
# Builds the three 64-bit arguments on the stack and calls fib_tail.
main: # #main
subl $28, %esp
# next = 1 (high half 0 at 20(%esp), low half 1 at 16(%esp))
movl $0, 20(%esp)
movl $1, 16(%esp)
# res = 0
movl $0, 12(%esp)
movl $0, 8(%esp)
# n = 0x2540BE400 = 10,000,000,000 (high half 2, low half 0x540BE400)
movl $2, 4(%esp)
movl $1410065408, (%esp) # imm = 0x540BE400
calll _Z8fib_tailyyy
# Store the 64-bit result (%edx:%eax) to the symbol f.
# NOTE(review): f is not defined in this listing; presumably a global added
# so the call's result is used -- confirm against the code that was compiled.
movl %edx, f+4
movl %eax, f
# return 0
xorl %eax, %eax
addl $28, %esp
retl
Same applies to gcc 4.9.2 if you give it -O2 (but not in -O1 which was all clang needed)
(And of course also in 64-bit mode)

Nested if statements and "&&" operator

if(a() && b() && c() && d())
doSomething();
if(a())
if(b())
if(c())
if(d())
doSomething();
Is there "any" performance difference between these two?
For example, in a situation where a() returns 0, will it keep running b(), c() and d() in the first if statement? Or will it work the same as the second, nested if statement?

They're exactly identical.
To test this yourself, run gcc -S test.c (presuming that this is where you've put your source) and observe the contents of test.s.
Here's how the nested-if approach compiles in gcc 4.8.1 with default options (annotated with comments):
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl $0, %eax
call A # try to call A
testl %eax, %eax # look at its return value
je .L3 # short-circuit if it returned 0
movl $0, %eax # ...repeat for B, et al.
call B
testl %eax, %eax
je .L3
movl $0, %eax
call C
testl %eax, %eax
je .L3
movl $0, %eax
call D
testl %eax, %eax
je .L3
movl $0, %eax
call doSomething
.L3:
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
Here's how the && approach compiles:
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl $0, %eax
call A # try to call A
testl %eax, %eax # look at its return value
je .L3 # short-circuit if it returned 0
movl $0, %eax # ...repeat for B, et al.
call B
testl %eax, %eax
je .L3
movl $0, %eax
call C
testl %eax, %eax
je .L3
movl $0, %eax
call D
testl %eax, %eax
je .L3
movl $0, %eax
call doSomething
.L3:
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc

assembly x86 'decompiling' [closed]

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
This question appears to be off-topic because it lacks sufficient information to diagnose the problem. Describe your problem in more detail or include a minimal example in the question itself.
Closed 8 years ago.
Improve this question
I'm having trouble understanding this assembly x86 code (AT&T notation). I need to be able to understand it (write C++ function that is compiled to that code) and solve similar exercises on the exam.
Can you explain to me which part does what and what is the convention?
f:
pushl %ebp ; 1
movl %esp, %ebp; 2
pushl %ebx ; 3
subl $36, %esp; 4
movl 8(%ebp), %edx ; 5
movl 12(%ebp), %eax ; 6
movl (%eax), %eax ; 7
movl %edx, 8(%esp) ; 8
leal 16(%ebp), %edx ; 9
movl %edx, 4(%esp) ; 10
movl %eax, (%esp) ; 11
call f; 12
movl %eax, -12(%ebp) ; 13
movl 16(%ebp), %edx ; 14
movl 12(%ebp), %eax ; 15
movl %edx, (%eax) ; 16
movl 12(%ebp), %eax ; 17
movl (%eax), %edx ; 18
movl -12(%ebp), %eax ; 19
movl %edx, 8(%esp) ; 20
leal 8(%ebp), %edx ; 21
movl %edx, 4(%esp) ; 22
movl %eax, (%esp) ; 23
call f; 24
movl %eax, %ebx; 25
movl 16(%ebp), %edx ; 26
movl -12(%ebp), %eax ; 27
movl %edx, 8(%esp) ; 28
movl 12(%ebp), %edx ; 29
movl %edx, 4(%esp) ; 30
movl %eax, (%esp) ; 31
call f; 32
movl %eax, %edx; 33
movl 16(%ebp), %eax ; 34
movl %edx, 8(%esp) ; 35
leal 8(%ebp), %edx ; 36
movl %edx, 4(%esp) ; 37
movl %eax, (%esp) ; 38
call f; 39
movl %ebx, 8(%esp) ; 40
leal -12(%ebp), %edx ; 41
movl %edx, 4(%esp) ; 42
movl %eax, (%esp) ; 43
call f; 44
addl $36, %esp; 45
popl %ebx ; 46
popl %ebp ; 47
ret; 48
There are no jumps, but a few of 'call f', does it mean that there is an infinite loop?

Below is a little bit to help you get going.
Step 1. Divide the code up into logical chunks. Key things to look for to identify logical chunks are the stack prologue and epilogue code, function calls, branch statements and addresses identified by the branch statements.
Step 2. Make notes about what each chunk is doing.
For example ...
; f -- annotated walk-through of the listing from the question.
; Takes three stack parameters and calls f five times with different
; argument combinations; only the first chunk is annotated in detail.
f:
pushl %ebp
movl %esp, %ebp ; Create the stack frame
pushl %ebx ; and save non-volatile register EBX
subl $36, %esp ; Carve space for 9 32-bit words on the stack
; Notes: 8(%ebp) is the address for the 1st parameter
; 12(%ebp) is the address for the 2nd parameter
; 16(%ebp) is the address for the 3rd parameter
;
; Anything addressed as -#(%ebp) will be a stack variable
; local to this function.
;
; Anything addressed as #(%esp) will be used to pass parameters
; to the sub-function. The advantage of doing it this way is that
; parameters passed to the sub-function do not have to be popped
; after every call to a sub-function.
movl 8(%ebp), %edx ; EDX = 1st parameter
movl 12(%ebp), %eax ; EAX = 2nd parameter
movl (%eax), %eax ; The 2nd parameter is a pointer!
movl %edx, 8(%esp) ; Pass EDX as 3rd parameter to sub-function
leal 16(%ebp), %edx ; EDX = address of 3rd parameter to this function
movl %edx, 4(%esp) ; Passing it as 2nd parameter to sub-function
movl %eax, (%esp) ; Pass EAX as 1st parameter to sub-function
call f ; Call sub-function
movl %eax, -12(%ebp) ; Save return value to local stack variable
; More Notes:
; I am guessing that this bit of decompiled code was an object file.
; Experience has shown me that when the address sub-functions used by
; CALL are all the same (and match the address of the calling function)
; this is often due to decompiling an object file as opposed to an
; executable. If however, the sub-function address truly is '0xf', then
; this will be a recursive routine that will blow the stack as there is
; no exit condition.
movl 16(%ebp), %edx ; EDX: 3rd parameter passed to function
; likely modified by previous CALL
movl 12(%ebp), %eax ; EAX: 2nd parameter passed to function
movl %edx, (%eax) ; Save EDX to the location pointed to by the 2nd parameter
movl 12(%ebp), %eax ; EAX: 2nd parameter passed to function (recall it's a ptr)
movl (%eax), %edx ; ... and so on ...
movl -12(%ebp), %eax
movl %edx, 8(%esp)
leal 8(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call f
movl %eax, %ebx
movl 16(%ebp), %edx
movl -12(%ebp), %eax
movl %edx, 8(%esp)
movl 12(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call f
movl %eax, %edx
movl 16(%ebp), %eax
movl %edx, 8(%esp)
leal 8(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call f
movl %ebx, 8(%esp)
leal -12(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call f
addl $36, %esp ; Reclaim that carved stack space
popl %ebx ; Restore the non-volatile register EBX
popl %ebp ; Restore to the caller's stack frame
ret ; Return
I am leaving the rest for you. I hope this helps you along.

This function f is a recursive function without termination of the recursion. Something like
// Sketch of the shape being described: the first statement calls f again
// unconditionally, so there is no path that returns without recursing.
void f(int a, int b, int c)
{
f(a,b,c); // unconditional self-call: no base case
//....
}
Stop evaluating the disassembly; it isn't worth the effort to reconstruct such bad code in any high-level language.

I came to the solution:
// Proposed C++ reconstruction of the listing.
// i and k are passed by value, j is a pointer; the addresses of i, k and
// the local n are handed to the nested calls.
// Every path begins with a call to f, so there is no base case.
int f (int i, int* j, int k) {
int n = f(*j, &k, i); // first call; result kept in the local n
*j = k; // store k through the pointer parameter
// Four more calls: three nested ones supply the arguments of the outer call.
f( f(n, &i, *j), &n, f(k, &i, f(n, j, k)) );
return 0;
}
when compiling my code
g++ -m32 -S a.cpp
I get the following assembly code:
# g++ -m32 output for f(int i, int* j, int k).
# Frame layout as used below:
#   8(%ebp) = i, 12(%ebp) = j, 16(%ebp) = k, -12(%ebp) = local n
# Outgoing arguments are written to (%esp), 4(%esp), 8(%esp) before each call.
_Z1fiPii:
.LFB971:
.cfi_startproc
.cfi_personality 0,__gxx_personality_v0
.cfi_lsda 0,.LLSDA971
pushl %ebp
.cfi_def_cfa_offset 8
.cfi_offset 5, -8
movl %esp, %ebp
.cfi_def_cfa_register 5
pushl %ebx
subl $36, %esp
.cfi_offset 3, -12
# Call 1: f(*j, &k, i)
movl 8(%ebp), %edx
movl 12(%ebp), %eax
movl (%eax), %eax
movl %edx, 8(%esp)
leal 16(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
.LEHB0:
call _Z1fiPii
# n = result of call 1
movl %eax, -12(%ebp)
# *j = k
movl 16(%ebp), %edx
movl 12(%ebp), %eax
movl %edx, (%eax)
# Call 2: f(n, j, k)
movl 16(%ebp), %edx
movl -12(%ebp), %eax
movl %edx, 8(%esp)
movl 12(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call _Z1fiPii
# Call 3: f(k, &i, <result of call 2>)
movl 16(%ebp), %edx
movl %eax, 8(%esp)
leal 8(%ebp), %eax
movl %eax, 4(%esp)
movl %edx, (%esp)
call _Z1fiPii
# Keep the result of call 3 in %ebx across the next call.
movl %eax, %ebx
# Call 4: f(n, &i, *j)
movl 12(%ebp), %eax
movl (%eax), %edx
movl -12(%ebp), %eax
movl %edx, 8(%esp)
leal 8(%ebp), %ecx
movl %ecx, 4(%esp)
movl %eax, (%esp)
call _Z1fiPii
# Call 5: f(<result of call 4>, &n, <result of call 3>)
movl %ebx, 8(%esp)
leal -12(%ebp), %edx
movl %edx, 4(%esp)
movl %eax, (%esp)
call _Z1fiPii
.LEHE0:
# return 0
movl $0, %eax
jmp .L5
# NOTE(review): .L4 is not jumped to from this listing; presumably it is the
# exception landing pad referenced from the LSDA table -- confirm.
.L4:
movl %eax, (%esp)
.LEHB1:
call _Unwind_Resume
.LEHE1:
.L5:
# Epilogue: release the 36 bytes and restore %ebx / %ebp.
addl $36, %esp
popl %ebx
.cfi_restore 3
popl %ebp
.cfi_restore 5
.cfi_def_cfa 4, 4
ret
.cfi_endproc
Is this one equivalent to the one pasted before?

while loop in assembly language

There is such code in C++:
#include <iostream>
// Prints 4, 3, 2, 1, one number per line.
int main(){
int a = 4;
// Post-decrement: the condition tests the value a had before the decrement,
// so inside the body a is already one lower and a + 1 is the tested value.
while(a--){
std::cout << (a + 1) << '\n';
}
return 0;
}
and corresponding code of main function in assembly code produced by g++:
# g++ output for main(); the local a lives at 28(%esp).
.globl main
.type main, #function
main:
.LFB957:
.cfi_startproc
.cfi_personality 0x0,__gxx_personality_v0
pushl %ebp
.cfi_def_cfa_offset 8
movl %esp, %ebp
.cfi_offset 5, -8
.cfi_def_cfa_register 5
# Align the stack to 16 bytes, then reserve 32 bytes for a and outgoing args.
andl $-16, %esp
subl $32, %esp
movl $4, 28(%esp) # int a = 4;
# Enter the loop at its condition check.
jmp .L2
.L3:
movl 28(%esp), %eax # std::cout << (a + 1) << '\n';
addl $1, %eax
movl %eax, 4(%esp)
movl $_ZSt4cout, (%esp)
call _ZNSolsEi
# 10 is the character code of '\n'; the stream returned by the previous
# call (in %eax) is the first argument.
movl $10, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
.L2:
# Evaluate a-- != 0: compare the old value of a first...
cmpl $0, 28(%esp)
# ...save the outcome (%al = 1 if a != 0) because subl overwrites the flags...
setne %al
subl $1, 28(%esp) # a = a - 1
# ...then re-create the condition from the saved byte and branch on it.
testb %al, %al
jne .L3
# return 0
movl $0, %eax
leave
ret
.cfi_endproc
.LFE957:
.size main, .-main
What are the instructions setne and testb used for in the following fragment?
.L2:
cmpl $0, 28(%esp)
setne %al
subl $1, 28(%esp) # a = a - 1
testb %al, %al
jne .L3
Couldn't the loop simply check whether a is non-zero and jump?

The while condition is formally the equivalent of:
while ( a -- != 0 )
(Omitting the comparison is a legal obfuscation.)
The compiler is generating code to compare a with 0, save the
results in register al, then decrement a, and then test the saved
results.

Because a-- means
tmpval=a;
a=a-1;
return tmpval;
so compiler needs to save the previous value of a.
In this program, the body of the while loop still executes on the iteration where a becomes 0 after a-- (the tested value was 1), so it prints 1.

It's been a long while since I did assembler, but I would assume it's some optimisation to keep the pipelines busy / optimise register use.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Debugging with an assemble program with GDB - gdb

Related

Slow std::string concatenation on windows

C++ Tail recursion using 64-bit variables

Nested if statements and "&&" operator

assembly x86 'decompiling' [closed]

while loop in assembly language

Categories

Resources