I was fooling around and found that the following
#include <stdio.h>
// Adds 1 to the int that x refers to; the caller's variable is modified in place.
void f(int& x){
x+=1;
}
// Initialises a local to 12, increments it through f, and prints it with printf.
int main(){
int a = 12;
f(a); // a is bound to f's reference parameter, so f updates a directly
printf("%d\n",a); // prints 13
}
when translated by g++ (Ubuntu 4.8.4-2ubuntu1~14.04.3) 4.8.4 with g++ main.cpp -S produces this assembly (showing only the relevant parts)
# void f(int& x), compiled without optimization.
# In:  %rdi = address of the int that the reference x is bound to.
_Z1fRi:
pushq %rbp
movq %rsp, %rbp
movq %rdi, -8(%rbp) # spill the reference (an address) to the stack
movq -8(%rbp), %rax # reload the address
movl (%rax), %eax # eax = current value of x
leal 1(%rax), %edx # edx = x + 1, written to a different register than the source
movq -8(%rbp), %rax # reload the address again
movl %edx, (%rax) # store the incremented value back through the reference
popq %rbp
ret
# int main(), compiled without optimization. The local a lives at -4(%rbp).
main:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp # reserve stack space for the local
movl $12, -4(%rbp) # a = 12
leaq -4(%rbp), %rax # rax = &a
movq %rax, %rdi # first argument: address of a
call _Z1fRi # f(a)
movl -4(%rbp), %eax # reload a after the call
movl %eax, %esi # second printf argument: a
movl $.LC0, %edi # first printf argument: .LC0 (presumably the "%d\n" string; its definition is not shown)
movl $0, %eax # al = 0: no vector registers are used by this variadic call
call printf
movl $0, %eax # return value 0
leave
ret
Question: Why would the compiler choose to use leal instead of incq? Or am I missing something?
You compiled without optimization. GCC does not make any effort to select particularly well-fitting instructions when building in "debug" mode; it just focuses on generating the code as quickly as possible (and with an eye to making debugging easier—e.g., the ability to set breakpoints on source code lines).
When I enable optimizations by passing the -O2 switch, I get:
# void f(int& x) at -O2: a single read-modify-write on the int that %rdi points to.
_Z1fRi:
addl $1, (%rdi) # x += 1, performed directly in memory
ret
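With generic tuning, addl is preferred over incl because on some Intel processors (specifically Pentium 4, but also possibly Knight's Landing) incl has a false dependency on the flags: it leaves the carry flag unmodified, so it has to wait for the previous flags value.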
With -march=k8, incl is used instead.
There is sometimes a use-case for leal in optimized code, though, and that is when you want to increment a register's value and store the result in a different register. Using leal in this way would allow you to preserve the register's original value, without needing an additional movl instruction. Another advantage of leal over incl/addl is that leal doesn't affect the flags, which can be useful in instruction scheduling.
Related
I read the cppreference page on the experimental transactional memory feature and tried it out.
I wrote some simple code with synchronized, which, as cppreference says, is not a transaction but only guarantees that the operations in the block are executed in a single total order. Then I wrote the same code with atomic_noexcept and atomic_commit, but not with atomic_cancel, which seems not to be implemented yet.
My doubt is about the difference between atomic_noexcept, atomic_commit and synchronized: apparently they work in the same way, except for the compilation error if a non-transaction-safe function is called in an atomic block.
So I analyzed the assembly code for the 3 variants, and it turned out to be the same, as reported below:
cpp atomic_noexcept:
int a;
void thread_func() {
atomic_noexcept
{
++a;
}
}
assembly atomic_noexcept:
# thread_func() for the atomic_noexcept variant: ++a implemented with _ITM_* runtime calls.
thread_func():
subq $8, %rsp
movl $43, %edi # first argument to _ITM_beginTransaction (flag bits; their meaning is not shown here)
xorl %eax, %eax # eax = 0 (presumably the vector-register count for a variadic call; confirm)
call _ITM_beginTransaction
testb $2, %al # test bit 1 of the value returned by _ITM_beginTransaction
jne .L2 # bit set: take the path that updates a directly
movl $a, %edi # argument: address of a
call _ITM_RfWU4 # presumably a 4-byte read of a through the runtime; the result is used from %rax below
movl $a, %edi # argument: address of a
leal 1(%rax), %esi # esi = value read + 1
call _ITM_WaWU4 # presumably a 4-byte write of %esi to a through the runtime
call _ITM_commitTransaction
addq $8, %rsp
ret
.L2:
addl $1, a(%rip) # plain in-memory increment, without the runtime read/write calls
addq $8, %rsp
jmp _ITM_commitTransaction # tail call: its ret returns to thread_func's caller
a:
.zero 4 # 4 zero bytes of storage for the global int a
cpp atomic_commit:
int a;
void thread_func() {
atomic_commit
{
++a;
}
}
assembly atomic_commit:
# thread_func() for the atomic_commit variant: ++a implemented with _ITM_* runtime calls.
thread_func():
subq $8, %rsp
movl $43, %edi # first argument to _ITM_beginTransaction (flag bits; their meaning is not shown here)
xorl %eax, %eax # eax = 0 (presumably the vector-register count for a variadic call; confirm)
call _ITM_beginTransaction
testb $2, %al # test bit 1 of the value returned by _ITM_beginTransaction
jne .L2 # bit set: take the path that updates a directly
movl $a, %edi # argument: address of a
call _ITM_RfWU4 # presumably a 4-byte read of a through the runtime; the result is used from %rax below
movl $a, %edi # argument: address of a
leal 1(%rax), %esi # esi = value read + 1
call _ITM_WaWU4 # presumably a 4-byte write of %esi to a through the runtime
call _ITM_commitTransaction
addq $8, %rsp
ret
.L2:
addl $1, a(%rip) # plain in-memory increment, without the runtime read/write calls
addq $8, %rsp
jmp _ITM_commitTransaction # tail call: its ret returns to thread_func's caller
a:
.zero 4 # 4 zero bytes of storage for the global int a
cpp synchronized:
int a;
void thread_func() {
synchronized
{
++a;
}
}
assembly synchronized:
# thread_func() for the synchronized variant: ++a implemented with _ITM_* runtime calls.
thread_func():
subq $8, %rsp
movl $43, %edi # first argument to _ITM_beginTransaction (flag bits; their meaning is not shown here)
xorl %eax, %eax # eax = 0 (presumably the vector-register count for a variadic call; confirm)
call _ITM_beginTransaction
testb $2, %al # test bit 1 of the value returned by _ITM_beginTransaction
jne .L2 # bit set: take the path that updates a directly
movl $a, %edi # argument: address of a
call _ITM_RfWU4 # presumably a 4-byte read of a through the runtime; the result is used from %rax below
movl $a, %edi # argument: address of a
leal 1(%rax), %esi # esi = value read + 1
call _ITM_WaWU4 # presumably a 4-byte write of %esi to a through the runtime
call _ITM_commitTransaction
addq $8, %rsp
ret
.L2:
addl $1, a(%rip) # plain in-memory increment, without the runtime read/write calls
addq $8, %rsp
jmp _ITM_commitTransaction # tail call: its ret returns to thread_func's caller
a:
.zero 4 # 4 zero bytes of storage for the global int a
How can they work differently? For example, here is cppreference's explanation of the different atomic blocks:
atomic_noexcept : If an exception is thrown, std::abort is called
atomic_cancel : If an exception is thrown, std::abort is called,
unless the exception is one of the exceptions used for transaction
cancellation (see below) in which case the transaction is cancelled:
the values of all memory locations in the program that were modified
by side effects of the operations of the atomic block are restored to
the values they had at the time the start of the atomic block was
executed, and the exception continues stack unwinding as usual.
atomic_commit : If an exception is thrown, the transaction is
committed normally.
How can atomic_noexcept work differently from atomic_commit if it has the same assembly code?
How can a synchronized block work differently from an atomic block if it has the same assembly code?
EDIT:
All these tests and assembly listings were produced with the latest version of GCC (10.2).
EDIT2:
After some tests and research I still haven't found a logical explanation for the supposedly different behaviour.
I'm trying to reproduce the following example code from cppreference in order to understand the as-if rule of C++ better:
int& preinc(int& n) { return ++n; }
int add(int n, int m) { return n+m; }
// volatile input to prevent constant folding
volatile int input = 7;
// volatile output to make the result a visible side-effect
volatile int result;
int main()
{
int n = input;
// using built-in operators would invoke undefined behavior
// int m = ++n + ++n;
// but using functions makes sure the code executes as-if
// the functions were not overlapped
int m = add(preinc(n), preinc(n));
result = m;
}
I use g++ -s main.cpp to get the assembler output from the source; the main() function of the output file main.s is shown below:
# main() of the as-if example, compiled without optimization.
# Frame layout used below: n at -32(%rbp), m at -28(%rbp), stack-guard copy at -24(%rbp).
main:
.LFB2:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
pushq %rbx # %rbx is callee-saved; it is used below to keep a value across a call
subq $24, %rsp
.cfi_offset 3, -24
movq %fs:40, %rax # load the stack-protector guard value
movq %rax, -24(%rbp) # keep a copy of it in this frame
xorl %eax, %eax
movl input(%rip), %eax # read the volatile input
movl %eax, -32(%rbp) # n = input
leaq -32(%rbp), %rax
movq %rax, %rdi # argument: &n
call _Z6preincRi # first preinc(n) call performed
movl (%rax), %ebx # ebx = value read through the returned reference
leaq -32(%rbp), %rax
movq %rax, %rdi # argument: &n
call _Z6preincRi # second preinc(n) call performed
movl (%rax), %eax # eax = value read through the returned reference
movl %ebx, %esi # second add() argument: result of the first call performed
movl %eax, %edi # first add() argument: result of the second call performed
call _Z3addii
movl %eax, -28(%rbp) # m = add(...)
movl -28(%rbp), %eax
movl %eax, result(%rip) # result = m
movl $0, %eax # return value 0
movq -24(%rbp), %rdx
xorq %fs:40, %rdx # compare the saved guard copy with the current guard value
je .L7 # unchanged: continue at .L7 (not shown in this excerpt)
call __stack_chk_fail # the guard copy was overwritten
According to the output file, I think the g++ compiler just compiles the source code statement by statement without optimization, even if I add the -O3 compile option.
The output is supposed to look like this:
# full code of the main() function as produced by the GCC compiler
# x86 (Intel) platform:
movl input(%rip), %eax # eax = input
leal 3(%rax,%rax), %eax # eax = 3 + eax + eax
movl %eax, result(%rip) # result = eax
xorl %eax, %eax # eax = 0 (the return value of main())
ret
I want to know how to get the assembler output shown above.
Something went wrong when I tested the example code. Here is an answer summarizing my own thoughts and the comments from others above.
Compilers will not optimize the code unless the "-O3" or "-O2" compilation option is added, just as #Balázs Kovacsics and #molbdnilo said in the comments. Using the command g++ -S main.cpp produces the assembler output statement by statement, as shown in the question.
Once the "-O3" or "-O2" compilation option is added, the programmer allows the compiler to perform any code transformations that do not change the observable behavior of the program. The main() function of the output file main.s, produced with g++ -S -O3 main.cpp, is then as shown below:
# main() of the as-if example at -O3: no calls remain; the whole computation is three instructions.
main:
.LFB2:
.cfi_startproc
movl input(%rip), %eax # eax = input (one read of the volatile)
leal 3(%rax,%rax), %eax # eax = 2*input + 3
movl %eax, result(%rip) # result = eax (one write of the volatile)
xorl %eax, %eax # return value 0
ret
.cfi_endproc
Be careful: these compiler options must be written in upper case (for example -S, not -s).
Here is the Compiler Explorer website that #JulianH suggested, which is really convenient for viewing assembler output across different platforms and compilers.
I think getting the assembler output helped me understand the as-if rule better. I hope what I wrote helps someone else who is also confused by the abstract description on cppreference.
So this question is just out of curiosity.
I have some tiny program:
#include <some_header>
void print(){ printf("abc"); } // don't care about main, I'm not gonna run it
Then I compiled it to assembly, once with some_header=>iostream and once with some_header=>cstdio, using gcc.godbolt.org (6.1 for x86_64) with -O3 -pedantic -std=c++14. Look at this:
.LC0:
.string "abc"
print(): (iostream) or (both included)
movl $.LC0, %edi # first printf argument: address of the "abc" string
xorl %eax, %eax # al = 0: no vector registers are used by this variadic call
jmp printf # tail call: printf returns directly to print()'s caller
# NOTE(review): print() ends at the jmp above. The instructions below belong to a
# separate routine whose label is missing from this listing; presumably the
# compiler-generated static-initialisation function for this translation unit.
subq $8, %rsp
movl std::__ioinit, %edi # NOTE(review): presumably the address of std::__ioinit; a '$' prefix may have been lost here, confirm
call std::ios_base::Init::Init()
movl $__dso_handle, %edx # third __cxa_atexit argument
movl std::__ioinit, %esi # second __cxa_atexit argument (same '$' caveat as above)
movl std::ios_base::Init::~Init(), %edi # first __cxa_atexit argument (same '$' caveat as above)
addq $8, %rsp
jmp __cxa_atexit # tail call
print(): (cstdio)
movl $.LC0, %edi # first printf argument: address of the "abc" string
xorl %eax, %eax # al = 0: no vector registers are used by this variadic call
jmp printf # tail call
There's a significant difference between them, although the first three lines are identical. Why does iostream need that much extra code for cleanup, or what are those lines actually doing? Or is godbolt simply unreliable for this task?
Also, it seems that the standard doesn't guarantee that printf is accessible through iostream; should this be relied upon?
Your print function compiles to pretty much the same assembly code in both cases.
The additional lines you see are there to initialise and de-initialise the iostream library. You can see that clearly if you remove the optimisation flag -O3.
Here is a complete listing with iostream included and optimisation switched off.
std::piecewise_construct:
.zero 1
.LC0:
.string "abc"
# print(), compiled without optimization: calls printf on .LC0 with a regular call/ret.
print():
pushq %rbp
movq %rsp, %rbp
movl $.LC0, %edi # first printf argument: address of the "abc" string
movl $0, %eax # al = 0: no vector registers are used by this variadic call
call printf
nop
popq %rbp
ret
# Runs the std::ios_base::Init construction and registers its destructor with
# __cxa_atexit, but only when called with the arguments (1, 65535); otherwise
# it does nothing.
__static_initialization_and_destruction_0(int, int):
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl %edi, -4(%rbp) # spill the first argument
movl %esi, -8(%rbp) # spill the second argument
cmpl $1, -4(%rbp)
jne .L4 # first argument != 1: skip the initialisation
cmpl $65535, -8(%rbp)
jne .L4 # second argument != 65535: skip the initialisation
movl std::__ioinit, %edi # NOTE(review): presumably the address of std::__ioinit; a '$' prefix may have been lost here, confirm
call std::ios_base::Init::Init()
movl $__dso_handle, %edx # third __cxa_atexit argument
movl std::__ioinit, %esi # second __cxa_atexit argument (same '$' caveat as above)
movl std::ios_base::Init::~Init(), %edi # first __cxa_atexit argument (same '$' caveat as above)
call __cxa_atexit
.L4:
nop
leave
ret
# NOTE(review): the label of this routine is missing from the listing. It calls
# __static_initialization_and_destruction_0(1, 65535), so it is presumably the
# compiler-generated static-initialisation entry point for this translation
# unit; confirm against the original compiler output.
pushq %rbp
movq %rsp, %rbp
movl $65535, %esi # second argument
movl $1, %edi # first argument
call __static_initialization_and_destruction_0(int, int)
popq %rbp
ret
I have following C++ code in main.cpp file.
int add(int a,int b)
{
int c = a + b;
return c;
}
int main()
{
int a = 2;
int b = 4;
int d = add(2,4);
}
when I ran g++ -S main.cpp I got the following assembly code.(after removing all the debug symbols). Also I have changed the code to print the sum of the 2 numbers using sys_write system call.
.text
.globl _Z3addii
# int add(int a, int b), compiled without optimization.
# In:  %edi = a, %esi = b.  Out: %eax = a + b.
_Z3addii:
pushq %rbp
movq %rsp, %rbp
movl %edi, -20(%rbp) # spill a
movl %esi, -24(%rbp) # spill b
movl -24(%rbp), %eax # eax = b
movl -20(%rbp), %edx # edx = a
addl %edx, %eax # eax = a + b
movl %eax, -4(%rbp) # c = a + b
movl -4(%rbp), %eax # return value: c
popq %rbp
ret
.globl main
# main: computes add(2, 4) into the local at -4(%rbp), then issues two raw
# Linux system calls (number 1, then number 60) with the syscall instruction.
main:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
movl $2, -12(%rbp) # a = 2
movl $4, -8(%rbp) # b = 4
movl $4, %esi # second add() argument
movl $2, %edi # first add() argument
call _Z3addii
movl %eax, -4(%rbp) # d = add(2, 4)
movl $4, %edx #message length
movl -4(%rbp), %esi #loads the VALUE stored at -4(%rbp), not the address of that slot
movl $1, %edi #file descriptor (stdout)
movl $1, %eax #system call number (sys_write)
syscall #call kernel
movl $60, %eax # Invoke the Linux 'exit' syscall
movl $0, %edi # With a return value of 0
syscall # call kernel
ret # only reached if the exit system call returns
My problem is when I run the above assembly it gives nothing as output. I can't understand what I am missing here? Can someone please tell me what I am missing? Thanks.
commands used:
g++ -o main main.s and ./main -->no output
OS: Ubuntu 12.04 64bit and g++ version: 4.8.2
There are two things you're doing wrong:
Firstly, you're using the 64-bit syscall instruction, but initialize only the %e part of the registers. Secondly, this:
movl -4(%rbp), %esi
loads the value that is at -4(%rbp) (the 6 you just calculated) into %esi, when sys_write expects the memory address of that value there (by which I mean in %rsi). It works with this:
movq $1, %rax #system call number (sys_write)
movq $1, %rdi #file descriptor (stdout)
leaq -4(%rbp), %rsi #message to write
movq $4, %rdx #message length
syscall #call kernel
Of course, you're not going to get formatted output this way. To see that the 6 is printed, you will have to pipe the output through hexdump or something similar.
Addendum: That you only initialize the %e part of the registers is actually only really critical here in the case of %rsi. %rbp holds, at the time of reading, a value with set high bits, and these are lost if only -4(%ebp) is written to %esi. Technically this also works:
movl $1, %eax #system call number (sys_write)
movl $1, %edi #file descriptor (stdout)
leaq -4(%rbp), %rsi #message to write
movl $4, %edx #message length
syscall #call kernel
...but I feel that it is rather poor style.
I'm starting to try to mess around with inlining ASM in C++, so I wrote up this little snippet:
#include <iostream>
int foo(int, int, int);
int main(void)
{
return foo(1,2,3);
}
// Returns a + b + c + 1, computed with x86 inline assembly (AT&T operand order).
//
// Operand %0 is `a` (read and written by the asm), %1 is `b`, %2 is `c`.
// `a` is marked early-clobber ("+&r") because it is written by the first add
// while %2 is still to be read; without '&' the compiler is allowed to place
// an input in the same register as %0 when it knows the values are equal.
// The "cc" clobber tells the compiler that the adds modify the flags.
int foo(int a, int b, int c)
{
    asm volatile("add %1, %0\n\t"
                 "add %2, %0\n\t"
                 "add $0x01, %0":"+&r"(a):"r"(b), "r"(c):"cc");
    // The original fell off the end of a non-void function, which is undefined
    // behaviour; return the computed value explicitly.
    return a;
}
Which outputs the following assembly code:
# main (Windows x64 output): calls foo(1, 2, 3) with the arguments in %ecx, %edx, %r8d.
main:
.LFB969:
subq $40, %rsp # presumably 32 bytes of shadow space for callees plus 8 bytes of alignment padding; confirm
.seh_stackalloc 40
.seh_endprologue
call __main
movl $3, %r8d # third argument: c = 3
movl $2, %edx # second argument: b = 2
movl $1, %ecx # first argument: a = 1
call _Z3fooiii
... stuff not shown...
# foo(int a, int b, int c), compiled without optimization (Windows x64 output).
# In:  %ecx = a, %edx = b, %r8d = c.
_Z3fooiii:
.LFB970:
.seh_endprologue
movl %ecx, 8(%rsp) # spill a to its stack slot
movl %edx, 16(%rsp) # spill b to its stack slot
movl %r8d, 24(%rsp) # spill c to its stack slot
movl 16(%rsp), %edx # reload b; used as asm operand %1
movl 24(%rsp), %ecx # reload c; used as asm operand %2
movl 8(%rsp), %eax # reload a; used as asm operand %0
/APP
# 15 "K:\inline_asm_practice_1.cpp" 1
add %edx, %eax
add %ecx, %eax
add $0x01, %eax
# 0 "" 2
/NO_APP
movl %eax, 8(%rsp) # store the updated a back to its stack slot
ret # %eax still holds a + b + c + 1 at this point
So I can see where it inputs my code, but what's with the stack manipulations above it? Is there any way I can get rid of them; they seem unnecessary. I should just be able to have
(in main)
movl $3, %r8d
movl $2, %edx
movl $1, %ecx
call _Z3fooiii
(in foo)
# NOTE(review): as written, the first add leaves its sum in %ecx and %eax is
# never initialised before it is added to. The intended sequence is presumably
# one that first copies %ecx into %eax and then accumulates into %eax; confirm.
add %edx, %ecx
add %r8d, %eax
add $0x01, %eax
ret
How do I make gcc understand that it doesn't need to shove things on the stack and bring them back in a different order? I've tried fastcall and regparm already, and I can't find anything about this.
You probably need to enable optimizations via something like -O2 in order to get the compiler to try to write better/faster code, instead of simpler/easier to debug/understand code.