Difference at assembly level of printf from cstdio and iostream - c++

So this question is just out of curiosity.
I have some tiny program:
#include <some_header>
void print(){ printf("abc"); } // don't care about main, I'm not gonna run it
Then I compiled it to assembly twice, once with some_header=>iostream and once with some_header=>cstdio, using gcc.godbolt.org (GCC 6.1 for x86_64) with -O3 -pedantic -std=c++14. Look at this:
.LC0:
.string "abc"
print(): (iostream) or (both included)
movl $.LC0, %edi
xorl %eax, %eax
jmp printf
subq $8, %rsp
movl std::__ioinit, %edi
call std::ios_base::Init::Init()
movl $__dso_handle, %edx
movl std::__ioinit, %esi
movl std::ios_base::Init::~Init(), %edi
addq $8, %rsp
jmp __cxa_atexit
print(): (cstdio)
movl $.LC0, %edi
xorl %eax, %eax
jmp printf
There's a significant difference between them, although the first three lines are identical. Why does iostream need that much extra code to clean up, or what are those lines actually doing? Or is godbolt simply unreliable for this task?
Also, it seems that the standard doesn't guarantee that printf is accessible through iostream; should this be relied upon?

Your print function compiles to pretty much the same assembly code in both cases.
The additional lines you see are to initialise and de-initialise iostream library. You may see that clearly if you remove the optimisation flag -O3.
Here is a complete listing with iostream included and optimisation switched off.
std::piecewise_construct:
.zero 1
.LC0:
.string "abc"
print():
pushq %rbp
movq %rsp, %rbp
movl $.LC0, %edi
movl $0, %eax
call printf
nop
popq %rbp
ret
__static_initialization_and_destruction_0(int, int):
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
cmpl $1, -4(%rbp)
jne .L4
cmpl $65535, -8(%rbp)
jne .L4
movl std::__ioinit, %edi
call std::ios_base::Init::Init()
movl $__dso_handle, %edx
movl std::__ioinit, %esi
movl std::ios_base::Init::~Init(), %edi
call __cxa_atexit
.L4:
nop
leave
ret
pushq %rbp
movq %rsp, %rbp
movl $65535, %esi
movl $1, %edi
call __static_initialization_and_destruction_0(int, int)
popq %rbp
ret

Related

What do these instructions in the disassembly phase indicate?

Hello. When I run C++ code in the CLion IDE debugger, after main() returns the debugger steps into a file called disassembly, which contains what looks like assembly code. What are those instructions? What do they do? Should I care? As I'm new to C++, I'm familiarizing myself with the language, the IDE and anything else of relevance.
start:
nop
movl %eax, %edi
callq 0x2e82e ; symbol stub for: exit
hlt
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
exit:
jmpq *0x268c241c(%rip)
exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movl %edi, %ebx
cmpl $0xad, %edi
jne 0x5a404 ; <+41>
leaq 0x2683a31e(%rip), %rcx
movq (%rcx), %rax
testq %rax, %rax
je 0x5a404 ; <+41>
xorl %eax, %eax
xchgq %rax, (%rcx)
testq %rax, %rax
jne 0x5a427 ; <+76>
xorl %eax, %eax
callq 0x8017c ; symbol stub for: _tlv_exit
xorl %edi, %edi
callq 0x5a196 ; __cxa_finalize
movq 0x268354f7(%rip), %rax
testq %rax, %rax
je 0x5a420 ; <+69>
callq *%rax
movl %ebx, %edi
callq 0x8000e ; symbol stub for: __exit
callq *%rax
ud2
There is also this
_tlv_exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movq 0x268db5e9(%rip), %rdi
callq 0x2e92a ; symbol stub for: pthread_getspecific
testq %rax, %rax
je 0x18e20 ; <+54>
movq %rax, %rbx
movq 0x268db5d5(%rip), %rdi
xorl %esi, %esi
callq 0x2e942 ; symbol stub for: pthread_setspecific
movq %rbx, %rdi
addq $0x8, %rsp
popq %rbx
popq %rbp
jmp 0x1983e ; tlv_finalize_list
addq $0x8, %rsp
popq %rbx
popq %rbp
retq
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
start:
nop
movl %eax, %edi
callq 0x2e82e ; symbol stub for: exit
hlt
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
exit:
jmpq *0x268c241c(%rip)
pthread_getspecific:
jmpq *0x268c2470(%rip)
__cxa_finalize_ranges:
pushq %rbp
movq %rsp, %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $0x18, %rsp
movl %esi, -0x2c(%rbp)
movq %rdi, -0x38(%rbp)
leaq 0x26834d24(%rip), %rdi
callq 0x804d6 ; symbol stub for: pthread_mutex_lock
movq 0x26834ca0(%rip), %r13
testq %r13, %r13
je 0x5a17c ; <+383>
movl -0x2c(%rbp), %ebx
addq $0x8, -0x38(%rbp)
movslq 0x8(%r13), %r15
testq %r15, %r15
jle 0x5a16f ; <+370>
decq %r15
movq %r15, %r14
shlq $0x5, %r14
movl 0x10(%r13,%r14), %r12d
testl %r12d, %r12d
je 0x5a03d ; <+64>
cmpl $0x0, -0x2c(%rbp)
je 0x5a102 ; <+261>
cmpl $0x1, %r12d
je 0x5a0a4 ; <+167>
cmpl $0x3, %r12d
je 0x5a0d1 ; <+212>
cmpl $0x2, %r12d
jne 0x5a102 ; <+261>
movq 0x28(%r13,%r14), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a096 ; <+153>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a085 ; <+136>
jmp 0x5a03d ; <+64>
movq 0x18(%r13,%r14), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a0c0 ; <+195>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a0af ; <+178>
jmp 0x5a03d ; <+64>
movq 0x18(%r13,%r14), %rax
movq 0x10(%rax), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a0f1 ; <+244>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a0e0 ; <+227>
jmp 0x5a03d ; <+64>
leaq 0x10(%r13,%r14), %rax
movl $0x0, (%rax)
movb $0x0, 0x26834b94(%rip)
leaq 0x26834c25(%rip), %rdi
callq 0x804e2 ; symbol stub for: pthread_mutex_unlock
cmpl $0x1, %r12d
je 0x5a13e ; <+321>
cmpl $0x3, %r12d
je 0x5a145 ; <+328>
cmpl $0x2, %r12d
jne 0x5a14d ; <+336>
movq 0x20(%r13,%r14), %rdi
callq *0x18(%r13,%r14)
jmp 0x5a14d ; <+336>
callq *0x18(%r13,%r14)
jmp 0x5a14d ; <+336>
movq 0x18(%r13,%r14), %rdi
callq *0x10(%rdi)
leaq 0x26834bec(%rip), %rdi
callq 0x804d6 ; symbol stub for: pthread_mutex_lock
cmpb $0x0, 0x26834b48(%rip)
je 0x5a03d ; <+64>
movq 0x26834b5b(%rip), %r13
jmp 0x5a173 ; <+374>
movq (%r13), %r13
testq %r13, %r13
jne 0x5a039 ; <+60>
leaq 0x26834bbd(%rip), %rdi
addq $0x18, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
jmp 0x804e2 ; symbol stub for: pthread_mutex_unlock
__cxa_finalize:
testq %rdi, %rdi
je 0x5a1c5 ; <+47>
pushq %rbp
movq %rsp, %rbp
subq $0x10, %rsp
leaq -0x10(%rbp), %rax
movq %rdi, (%rax)
movq $0x1, 0x8(%rax)
movq %rax, %rdi
movl $0x1, %esi
callq 0x59ffd ; __cxa_finalize_ranges
addq $0x10, %rsp
popq %rbp
retq
xorl %edi, %edi
xorl %esi, %esi
jmp 0x59ffd ; __cxa_finalize_ranges
exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movl %edi, %ebx
cmpl $0xad, %edi
jne 0x5a404 ; <+41>
leaq 0x2683a31e(%rip), %rcx
movq (%rcx), %rax
testq %rax, %rax
je 0x5a404 ; <+41>
xorl %eax, %eax
xchgq %rax, (%rcx)
testq %rax, %rax
jne 0x5a427 ; <+76>
xorl %eax, %eax
callq 0x8017c ; symbol stub for: _tlv_exit
xorl %edi, %edi
callq 0x5a196 ; __cxa_finalize
movq 0x268354f7(%rip), %rax
testq %rax, %rax
je 0x5a420 ; <+69>
callq *%rax
movl %ebx, %edi
callq 0x8000e ; symbol stub for: __exit
callq *%rax
ud2
_tlv_exit:
jmpq *0x2680cbd6(%rip)
pthread_getspecific:
movq %gs:(,%rdi,8), %rax
retq
Assembly output is just a dump of the executable code the compiler generated, but in a human-readable form [1]. This is not actually used by the compiler, it's just an artifact of the compilation process to be used for reference.
Remember, the compiled executable can be converted into assembly code at any time, tools like IDA Pro and Ghidra excel at doing this on any executable, but the compiler can add in contextual information that's lost in the final compilation phase in the form of comments or useful labels for things.
The compiler often emits debug hints for your compiled executable so it can turn a stack-trace into something that maps back to your original source code. These artifacts are much more useful as they allow you to step through C++ code instead of assembly code. If you ever have to debug in a library you don't have the source for you'll be stuck stepping through an assembly view of the executable code.
[1] Presuming you can read assembly code.
The code you posted is support code from your libc runtime. The runtime is responsible for, among others:
implementing atexit hooks;
setting up your IO streams (cin, cout);
running constructors of any global static variables.
This answer has a more complete overview. You can search for articles about __libc_start_main and related functions to learn more.

Why use leal instead of incq?

I was fooling around and found that the following
#include <stdio.h>
void f(int& x){
x+=1;
}
int main(){
int a = 12;
f(a);
printf("%d\n",a);
}
when translated by g++ (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4 with g++ main.cpp -S produces this assembly (showing only the relevant parts)
_Z1fRi:
pushq %rbp
movq %rsp, %rbp
movq %rdi, -8(%rbp)
movq -8(%rbp), %rax
movl (%rax), %eax
leal 1(%rax), %edx
movq -8(%rbp), %rax
movl %edx, (%rax)
popq %rbp
ret
main:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl $12, -4(%rbp)
leaq -4(%rbp), %rax
movq %rax, %rdi
call _Z1fRi
movl -4(%rbp), %eax
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movl $0, %eax
leave
ret
Question: Why would the compiler choose to use leal instead of incq? Or am I missing something?
You compiled without optimization. GCC does not make any effort to select particularly well-fitting instructions when building in "debug" mode; it just focuses on generating the code as quickly as possible (and with an eye to making debugging easier—e.g., the ability to set breakpoints on source code lines).
When I enable optimizations by passing the -O2 switch, I get:
_Z1fRi:
addl $1, (%rdi)
ret
With generic tuning, addl is preferred because on some Intel processors (specifically Pentium 4, but also possibly Knights Landing) incl has a false dependency on the flags, since it only partially updates them.
With -march=k8, incl is used instead.
There is sometimes a use-case for leal in optimized code, though, and that is when you want to increment a register's value and store the result in a different register. Using leal in this way would allow you to preserve the register's original value, without needing an additional movl instruction. Another advantage of leal over incl/addl is that leal doesn't affect the flags, which can be useful in instruction scheduling.

g++ dumped assembly output doesn't work

I have following C++ code in main.cpp file.
int add(int a,int b)
{
int c = a + b;
return c;
}
int main()
{
int a = 2;
int b = 4;
int d = add(2,4);
}
when I ran g++ -S main.cpp I got the following assembly code.(after removing all the debug symbols). Also I have changed the code to print the sum of the 2 numbers using sys_write system call.
.text
.globl _Z3addii
_Z3addii:
pushq %rbp
movq %rsp, %rbp
movl %edi, -20(%rbp)
movl %esi, -24(%rbp)
movl -24(%rbp), %eax
movl -20(%rbp), %edx
addl %edx, %eax
movl %eax, -4(%rbp)
movl -4(%rbp), %eax
popq %rbp
ret
.globl main
main:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl $2, -12(%rbp)
movl $4, -8(%rbp)
movl $4, %esi
movl $2, %edi
call _Z3addii
movl %eax, -4(%rbp)
movl $4, %edx #message length
movl -4(%rbp), %esi #message to write
movl $1, %edi #file descriptor (stdout)
movl $1, %eax #system call number (sys_write)
syscall #call kernel
movl $60, %eax # Invoke the Linux 'exit' syscall
movl $0, %edi # With a return value of 0
syscall # call kernel
ret
My problem is when I run the above assembly it gives nothing as output. I can't understand what I am missing here? Can someone please tell me what I am missing? Thanks.
commands used:
g++ -o main main.s and ./main -->no output
OS: Ubuntu 12.04 64bit and g++ version: 4.8.2
There are two things you're doing wrong:
Firstly, you're using the 64-bit syscall instruction, but initialize only the %e part of the registers. Secondly, this:
movl -4(%rbp), %esi
loads the value that is at -4(%rbp) (the 6 you just calculated) into %esi, when sys_write expects the memory address of that value there (by which I mean in %rsi). It works with this:
movq $1, %rax #system call number (sys_write)
movq $1, %rdi #file descriptor (stdout)
leaq -4(%rbp), %rsi #message to write
movq $4, %rdx #message length
syscall #call kernel
Of course, you're not going to get formatted output this way. To see that the 6 is printed, you will have to pipe the output through hexdump or something similar.
Addendum: That you only initialize the %e part of the registers is actually only really critical here in the case of %rsi. %rbp holds, at the time of reading, a value with set high bits, and these are lost if only -4(%ebp) is written to %esi. Technically this also works:
movl $1, %eax #system call number (sys_write)
movl $1, %edi #file descriptor (stdout)
leaq -4(%rbp), %rsi #message to write
movl $4, %edx #message length
syscall #call kernel
...but I feel that it is rather poor style.

Nested if statements and "&&" operator

if(a() && b() && c() && d())
doSomething();
if(a())
if(b())
if(c())
if(d())
doSomething();
Is there "any" performance difference between these two?
For example, in a situation that a() turns 0, will it keep running b(), c() and d() in the first if statement? Or will it work same as the second nested if statement?
They're exactly identical.
To test this yourself, run gcc -S test.c (presuming that this is where you've put your source) and observe the contents of test.s.
Here's how the nested-if approach compiles in gcc 4.8.1 with default options (annotated with comments):
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl $0, %eax
call A # try to call A
testl %eax, %eax # look at its return value
je .L3 # short-circuit if it returned 0
movl $0, %eax # ...repeat for B, et al.
call B
testl %eax, %eax
je .L3
movl $0, %eax
call C
testl %eax, %eax
je .L3
movl $0, %eax
call D
testl %eax, %eax
je .L3
movl $0, %eax
call doSomething
.L3:
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
Here's how the && approach compiles:
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl $0, %eax
call A # try to call A
testl %eax, %eax # look at its return value
je .L3 # short-circuit if it returned 0
movl $0, %eax # ...repeat for B, et al.
call B
testl %eax, %eax
je .L3
movl $0, %eax
call C
testl %eax, %eax
je .L3
movl $0, %eax
call D
testl %eax, %eax
je .L3
movl $0, %eax
call doSomething
.L3:
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc

Force GCC to pass arguments in registers

I'm starting to try to mess around with inlining ASM in C++, so I wrote up this little snippet:
#include <iostream>
int foo(int, int, int);
int main(void)
{
return foo(1,2,3);
}
int foo(int a, int b, int c)
{
asm volatile("add %1, %0\n\t"
"add %2, %0\n\t"
"add $0x01, %0":"+r"(a):"r"(b), "r"(c):"cc");
}
Which outputs the following assembly code:
main:
.LFB969:
subq $40, %rsp
.seh_stackalloc 40
.seh_endprologue
call __main
movl $3, %r8d
movl $2, %edx
movl $1, %ecx
call _Z3fooiii
... stuff not shown...
_Z3fooiii:
.LFB970:
.seh_endprologue
movl %ecx, 8(%rsp)
movl %edx, 16(%rsp)
movl %r8d, 24(%rsp)
movl 16(%rsp), %edx
movl 24(%rsp), %ecx
movl 8(%rsp), %eax
/APP
# 15 "K:\inline_asm_practice_1.cpp" 1
add %edx, %eax
add %ecx, %eax
add $0x01, %eax
# 0 "" 2
/NO_APP
movl %eax, 8(%rsp)
ret
So I can see where it inputs my code, but what's with the stack manipulations above it? Is there any way I can get rid of them; they seem unnecessary. I should just be able to have
(in main)
movl $3, %r8d
movl $2, %edx
movl $1, %ecx
call _Z3fooiii
(in foo)
add %edx, %ecx
add %r8d, %eax
add $0x01, %eax
ret
How do I make gcc understand that it doesn't need to shove things on the stack and bring them back in a different order? I've tried fastcall and regparam already, and I can't find anything about this.
You probably need to enable optimizations via something like -O2 in order to get the compiler to try and write better/faster code, instead of simpler/easier to debug/understand code.