Behaviour of assert - c++

I'm trying to understand how assert behaves when its condition is false. According to the documentation, it calls __assert_fail, which in turn calls std::abort(). So I want to know what is going on in the assembly code of __assert_fail:
endbr64
pushq %r13
movl %edx, %r13d
movl $0x5, %edx
pushq %r12
movq %rsi, %r12
leaq 0x18467e(%rip), %rsi
pushq %rbp
movq %rdi, %rbp
leaq 0x1804f0(%rip), %rdi ; _libc_intl_domainname
pushq %rbx
movq %rcx, %rbx
subq $0x8, %rsp
callq 0x37980 ; __dcgettext
movq %rbx, %r8
movl %r13d, %ecx
movq %r12, %rdx
movq %rax, %rdi
movq %rbp, %rsi
callq 0x36d70
What do %r13 and %r12 stand for in this code? Where is the call to std::abort(), and what happens before and after that call?

Related

Move semantics appear to add additional overhead in the simplest possible* example

* I believe this is the simplest possible example, but if I'm incorrect please let me know.
https://godbolt.org/z/neaTse
I'm attempting to learn and understand move semantics and some of their intricacies, but I've hit a bit of a snag. When attempting to compare the following 2 code snippets, the code using move semantics ends up with 8 additional lines of assembly and 2 additional moves (15 for move, 13 for no-move).
Move:
#include <utility>
// Exchanges the values of `a` and `b` through one temporary. Each source is
// cast to an rvalue with std::move, so a type that provides move operations
// has them selected instead of its copy operations.
template<typename T>
void swap(T& a, T& b)
{
    T displaced(std::move(a));   // park a's original value
    a = std::move(b);            // b's value goes into a
    b = std::move(displaced);    // the parked value ends up in b
}
// Minimal driver: calls swap on two ints so that swap<int> is instantiated
// and shows up in the generated assembly.
// NOTE(review): a and b are declared without initializers, so the values
// being swapped are indeterminate.
int main(){
int a, b;
swap(a, b);
}
No-Move:
// Exchanges the values of `a` and `b` using plain copies only: one copy
// construction into a temporary followed by two copy assignments.
template<typename T>
void swap(T& a, T& b)
{
    T saved(a);   // keep a's original value
    a = b;        // overwrite a with b
    b = saved;    // hand a's original value to b
}
// Minimal driver: calls swap on two ints so that swap<int> is instantiated
// and shows up in the generated assembly.
// NOTE(review): a and b are declared without initializers, so the values
// being swapped are indeterminate.
int main(){
int a, b;
swap(a, b);
}
Move generated assembly:
main:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
leaq -8(%rbp), %rdx
leaq -4(%rbp), %rax
movq %rdx, %rsi
movq %rax, %rdi
call void swap<int>(int&, int&)
movl $0, %eax
leave
ret
void swap<int>(int&, int&):
pushq %rbp
movq %rsp, %rbp
subq $32, %rsp
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq -24(%rbp), %rax
movq %rax, %rdi
call std::remove_reference<int&>::type&& std::move<int&>(int&)
movl (%rax), %eax
movl %eax, -4(%rbp)
movq -32(%rbp), %rax
movq %rax, %rdi
call std::remove_reference<int&>::type&& std::move<int&>(int&)
movl (%rax), %edx
movq -24(%rbp), %rax
movl %edx, (%rax)
leaq -4(%rbp), %rax
movq %rax, %rdi
call std::remove_reference<int&>::type&& std::move<int&>(int&)
movl (%rax), %edx
movq -32(%rbp), %rax
movl %edx, (%rax)
nop
leave
ret
No-move generated assembly:
main:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
leaq -8(%rbp), %rdx
leaq -4(%rbp), %rax
movq %rdx, %rsi
movq %rax, %rdi
call void swap<int>(int&, int&)
movl $0, %eax
leave
ret
void swap<int>(int&, int&):
pushq %rbp
movq %rsp, %rbp
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq -24(%rbp), %rax
movl (%rax), %eax
movl %eax, -4(%rbp)
movq -32(%rbp), %rax
movl (%rax), %edx
movq -24(%rbp), %rax
movl %edx, (%rax)
movq -32(%rbp), %rax
movl -4(%rbp), %edx
movl %edx, (%rax)
nop
popq %rbp
ret
I think the way I've internalized or abstracted move semantics for myself is that they "enable 'newly available' optimization through the removal of costly temporary copies".
Have I just internalized this incorrectly?
-Or-
Is this just failing because I'm using a primitive type?
-Or-
Have I just missed the mark entirely?
OK, the problem is inlining and effective optimization; to get around it I've annotated the functions with __attribute__((noinline)).
I've made a class to handle the move:
// Demonstration type that owns a single heap-allocated int through a raw
// pointer, so that copies (which allocate) and moves (which only take over
// the pointer) are easy to tell apart in the generated assembly. Every
// special member is marked noinline so it stays visible as a separate call.
class mover {
public :
    int *ptr { nullptr };   // owned allocation; nullptr once moved from

    // Allocates a fresh int holding 42.
    __attribute__((noinline)) mover() : ptr(new int(42)) { }

    // Deep copy: allocates a new int initialised from other's value.
    // Precondition: other.ptr is non-null (other has not been moved from).
    __attribute__((noinline)) mover(mover & other) {
        // ptr is already nullptr here (default member initializer), so there
        // is nothing to release before allocating.
        ptr = new int(*other.ptr);
    }

    // Move assignment: releases the current allocation and takes over
    // other's pointer, leaving other empty. Assigning an object to itself
    // is a no-op instead of destroying the owned value.
    __attribute__((noinline)) mover& operator= ( mover && other) noexcept {
        if (this != &other) {
            delete ptr;
            ptr = other.ptr;
            other.ptr = nullptr;
        }
        return *this;
    }

    // Copy assignment: replaces the current allocation with a new int
    // holding other's value.
    // Precondition: other.ptr is non-null (other has not been moved from).
    __attribute__((noinline)) mover& operator= ( mover & other) {
        if (this != &other) {
            // Allocate before releasing: the source is read while it is still
            // alive, and *this is left untouched if the allocation throws.
            int *fresh = new int(*other.ptr);
            delete ptr;
            ptr = fresh;
        }
        return *this;
    }

    // Move construction: takes over other's pointer and leaves other empty.
    __attribute__((noinline)) mover(mover && other) noexcept {
        ptr = other.ptr;
        other.ptr = nullptr;
    }

    // Releases the owned int; deleting nullptr (moved-from state) is a no-op.
    __attribute__((noinline)) ~mover() {
        delete ptr;
    }
};
It deliberately doesn't use smart pointers, so that you can see what goes on.
The move swap now looks like this, calling the correct constructors and operators:
void swap<mover>(mover&, mover&):
pushq %r12
movq %rdi, %r12
pushq %rbp
movq %rsi, %rbp
movq %rdi, %rsi
subq $24, %rsp
leaq 8(%rsp), %rdi
call mover::mover(mover&&)
movq %rbp, %rsi
movq %r12, %rdi
call mover::operator=(mover&&) [clone .isra.0]
leaq 8(%rsp), %rsi
movq %rbp, %rdi
call mover::operator=(mover&&) [clone .isra.0]
leaq 8(%rsp), %rdi
call mover::~mover() [complete object destructor]
addq $24, %rsp
popq %rbp
popq %r12
ret
And the copy swap looks like this, calling the copy constructor and copy assignment:
void swap<mover>(mover&, mover&):
pushq %r12
movq %rdi, %r12
pushq %rbp
movq %rsi, %rbp
movq %rdi, %rsi
subq $24, %rsp
leaq 8(%rsp), %rdi
call mover::mover(mover&)
movq %rbp, %rsi
movq %r12, %rdi
call mover::operator=(mover&) [clone .isra.0]
leaq 8(%rsp), %rsi
movq %rbp, %rdi
call mover::operator=(mover&) [clone .isra.0]
leaq 8(%rsp), %rdi
call mover::~mover() [complete object destructor]
addq $24, %rsp
popq %rbp
popq %r12
ret
The biggest difference is between the move constructor
mover::mover(mover&&):
movq (%rsi), %rax
movq $0, (%rsi)
movq %rax, (%rdi)
ret
and copy constructor
mover::mover(mover&):
pushq %rbp
movq %rsi, %rbp
pushq %rbx
movq %rdi, %rbx
subq $8, %rsp
movq $0, (%rdi)
movl $4, %edi
call operator new(unsigned long) // <---- new
movq 0(%rbp), %rdx
movq %rax, (%rbx)
movl (%rdx), %edx
movl %edx, (%rax)
addq $8, %rsp
popq %rbx
popq %rbp
ret
With the new call in the latter.

What do these instructions in the disassembly phase indicate?

Hello. When I run C++ code in the CLion IDE debugger, after main() returns the debugger steps into a file called disassembly, which contains what looks like assembly code. What are those instructions? What do they do? Should I care? As I'm new to C++, I'm familiarizing myself with the language, the IDE and anything else of relevance.
start:
nop
movl %eax, %edi
callq 0x2e82e ; symbol stub for: exit
hlt
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
exit:
jmpq *0x268c241c(%rip)
exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movl %edi, %ebx
cmpl $0xad, %edi
jne 0x5a404 ; <+41>
leaq 0x2683a31e(%rip), %rcx
movq (%rcx), %rax
testq %rax, %rax
je 0x5a404 ; <+41>
xorl %eax, %eax
xchgq %rax, (%rcx)
testq %rax, %rax
jne 0x5a427 ; <+76>
xorl %eax, %eax
callq 0x8017c ; symbol stub for: _tlv_exit
xorl %edi, %edi
callq 0x5a196 ; __cxa_finalize
movq 0x268354f7(%rip), %rax
testq %rax, %rax
je 0x5a420 ; <+69>
callq *%rax
movl %ebx, %edi
callq 0x8000e ; symbol stub for: __exit
callq *%rax
ud2
There is also this
_tlv_exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movq 0x268db5e9(%rip), %rdi
callq 0x2e92a ; symbol stub for: pthread_getspecific
testq %rax, %rax
je 0x18e20 ; <+54>
movq %rax, %rbx
movq 0x268db5d5(%rip), %rdi
xorl %esi, %esi
callq 0x2e942 ; symbol stub for: pthread_setspecific
movq %rbx, %rdi
addq $0x8, %rsp
popq %rbx
popq %rbp
jmp 0x1983e ; tlv_finalize_list
addq $0x8, %rsp
popq %rbx
popq %rbp
retq
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
start:
nop
movl %eax, %edi
callq 0x2e82e ; symbol stub for: exit
hlt
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
exit:
jmpq *0x268c241c(%rip)
pthread_getspecific:
jmpq *0x268c2470(%rip)
__cxa_finalize_ranges:
pushq %rbp
movq %rsp, %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $0x18, %rsp
movl %esi, -0x2c(%rbp)
movq %rdi, -0x38(%rbp)
leaq 0x26834d24(%rip), %rdi
callq 0x804d6 ; symbol stub for: pthread_mutex_lock
movq 0x26834ca0(%rip), %r13
testq %r13, %r13
je 0x5a17c ; <+383>
movl -0x2c(%rbp), %ebx
addq $0x8, -0x38(%rbp)
movslq 0x8(%r13), %r15
testq %r15, %r15
jle 0x5a16f ; <+370>
decq %r15
movq %r15, %r14
shlq $0x5, %r14
movl 0x10(%r13,%r14), %r12d
testl %r12d, %r12d
je 0x5a03d ; <+64>
cmpl $0x0, -0x2c(%rbp)
je 0x5a102 ; <+261>
cmpl $0x1, %r12d
je 0x5a0a4 ; <+167>
cmpl $0x3, %r12d
je 0x5a0d1 ; <+212>
cmpl $0x2, %r12d
jne 0x5a102 ; <+261>
movq 0x28(%r13,%r14), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a096 ; <+153>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a085 ; <+136>
jmp 0x5a03d ; <+64>
movq 0x18(%r13,%r14), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a0c0 ; <+195>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a0af ; <+178>
jmp 0x5a03d ; <+64>
movq 0x18(%r13,%r14), %rax
movq 0x10(%rax), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a0f1 ; <+244>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a0e0 ; <+227>
jmp 0x5a03d ; <+64>
leaq 0x10(%r13,%r14), %rax
movl $0x0, (%rax)
movb $0x0, 0x26834b94(%rip)
leaq 0x26834c25(%rip), %rdi
callq 0x804e2 ; symbol stub for: pthread_mutex_unlock
cmpl $0x1, %r12d
je 0x5a13e ; <+321>
cmpl $0x3, %r12d
je 0x5a145 ; <+328>
cmpl $0x2, %r12d
jne 0x5a14d ; <+336>
movq 0x20(%r13,%r14), %rdi
callq *0x18(%r13,%r14)
jmp 0x5a14d ; <+336>
callq *0x18(%r13,%r14)
jmp 0x5a14d ; <+336>
movq 0x18(%r13,%r14), %rdi
callq *0x10(%rdi)
leaq 0x26834bec(%rip), %rdi
callq 0x804d6 ; symbol stub for: pthread_mutex_lock
cmpb $0x0, 0x26834b48(%rip)
je 0x5a03d ; <+64>
movq 0x26834b5b(%rip), %r13
jmp 0x5a173 ; <+374>
movq (%r13), %r13
testq %r13, %r13
jne 0x5a039 ; <+60>
leaq 0x26834bbd(%rip), %rdi
addq $0x18, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
jmp 0x804e2 ; symbol stub for: pthread_mutex_unlock
__cxa_finalize:
testq %rdi, %rdi
je 0x5a1c5 ; <+47>
pushq %rbp
movq %rsp, %rbp
subq $0x10, %rsp
leaq -0x10(%rbp), %rax
movq %rdi, (%rax)
movq $0x1, 0x8(%rax)
movq %rax, %rdi
movl $0x1, %esi
callq 0x59ffd ; __cxa_finalize_ranges
addq $0x10, %rsp
popq %rbp
retq
xorl %edi, %edi
xorl %esi, %esi
jmp 0x59ffd ; __cxa_finalize_ranges
exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movl %edi, %ebx
cmpl $0xad, %edi
jne 0x5a404 ; <+41>
leaq 0x2683a31e(%rip), %rcx
movq (%rcx), %rax
testq %rax, %rax
je 0x5a404 ; <+41>
xorl %eax, %eax
xchgq %rax, (%rcx)
testq %rax, %rax
jne 0x5a427 ; <+76>
xorl %eax, %eax
callq 0x8017c ; symbol stub for: _tlv_exit
xorl %edi, %edi
callq 0x5a196 ; __cxa_finalize
movq 0x268354f7(%rip), %rax
testq %rax, %rax
je 0x5a420 ; <+69>
callq *%rax
movl %ebx, %edi
callq 0x8000e ; symbol stub for: __exit
callq *%rax
ud2
_tlv_exit:
jmpq *0x2680cbd6(%rip)
pthread_getspecific:
movq %gs:(,%rdi,8), %rax
retq
Assembly output is just a dump of the executable code the compiler generated, but in a human-readable form [1]. This is not actually used by the compiler; it's just an artifact of the compilation process to be used for reference.
Remember, the compiled executable can be converted into assembly code at any time, tools like IDA Pro and Ghidra excel at doing this on any executable, but the compiler can add in contextual information that's lost in the final compilation phase in the form of comments or useful labels for things.
The compiler often emits debug hints for your compiled executable so it can turn a stack-trace into something that maps back to your original source code. These artifacts are much more useful as they allow you to step through C++ code instead of assembly code. If you ever have to debug in a library you don't have the source for you'll be stuck stepping through an assembly view of the executable code.
1 Presuming you can read assembly code.
The code you posted is support code from your libc runtime. The runtime is responsible for, among others:
implementing atexit hooks;
setting up your IO streams (cin, cout);
running constructors of any global static variables.
This answer has a more complete overview. You can search for articles about __libc_start_main and related functions to learn more.

C++ return reference push instruction not showing up in assembly

I'm trying to learn more about how return references are compiled, and I'm currently stuck on how they show up in assembly. The code I'm running is this:
// Plain aggregate: a 100-char buffer followed by an int and a long.
// B and B2 are two namespace-scope instances of it.
struct Obj {
char buf[100];
int i;
long b;
} B, B2;
// Receives an Obj by value, sets the i member of that copy to 100, and
// returns the modified copy by value.
Obj foo(Obj b) {
b.i = 100; // Do something to the argument
return b;
}
// Passes the global B to foo by value and assigns the returned Obj to the
// global B2.
int main() {
B2 = foo(B);
}
and the assembly code looks like this:
.file "return_function_assembly.cpp"
.text
.globl B
.bss
.align 32
.type B, #object
.size B, 112
B:
.zero 112
.globl B2
.align 32
.type B2, #object
.size B2, 112
B2:
.zero 112
.text
.globl _Z3foo3Obj
.type _Z3foo3Obj, #function
_Z3foo3Obj:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movq %rdi, -8(%rbp)
movl $100, 116(%rbp)
movq -8(%rbp), %rax
movq 16(%rbp), %rdx
movq 24(%rbp), %rcx
movq %rdx, (%rax)
movq %rcx, 8(%rax)
movq 32(%rbp), %rdx
movq 40(%rbp), %rcx
movq %rdx, 16(%rax)
movq %rcx, 24(%rax)
movq 48(%rbp), %rdx
movq 56(%rbp), %rcx
movq %rdx, 32(%rax)
movq %rcx, 40(%rax)
movq 64(%rbp), %rdx
movq 72(%rbp), %rcx
movq %rdx, 48(%rax)
movq %rcx, 56(%rax)
movq 80(%rbp), %rdx
movq 88(%rbp), %rcx
movq %rdx, 64(%rax)
movq %rcx, 72(%rax)
movq 96(%rbp), %rdx
movq 104(%rbp), %rcx
movq %rdx, 80(%rax)
movq %rcx, 88(%rax)
movq 112(%rbp), %rdx
movq 120(%rbp), %rcx
movq %rdx, 96(%rax)
movq %rcx, 104(%rax)
movq -8(%rbp), %rax
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size _Z3foo3Obj, .-_Z3foo3Obj
.globl main
.type main, #function
main:
.LFB1:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
addq $-128, %rsp
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
leaq -128(%rbp), %rax
pushq 104+B(%rip)
pushq 96+B(%rip)
pushq 88+B(%rip)
pushq 80+B(%rip)
pushq 72+B(%rip)
pushq 64+B(%rip)
pushq 56+B(%rip)
pushq 48+B(%rip)
pushq 40+B(%rip)
pushq 32+B(%rip)
pushq 24+B(%rip)
pushq 16+B(%rip)
pushq 8+B(%rip)
pushq B(%rip)
movq %rax, %rdi
call _Z3foo3Obj
addq $112, %rsp
movq -128(%rbp), %rax
movq -120(%rbp), %rdx
movq %rax, B2(%rip)
movq %rdx, 8+B2(%rip)
movq -112(%rbp), %rax
movq -104(%rbp), %rdx
movq %rax, 16+B2(%rip)
movq %rdx, 24+B2(%rip)
movq -96(%rbp), %rax
movq -88(%rbp), %rdx
movq %rax, 32+B2(%rip)
movq %rdx, 40+B2(%rip)
movq -80(%rbp), %rax
movq -72(%rbp), %rdx
movq %rax, 48+B2(%rip)
movq %rdx, 56+B2(%rip)
movq -64(%rbp), %rax
movq -56(%rbp), %rdx
movq %rax, 64+B2(%rip)
movq %rdx, 72+B2(%rip)
movq -48(%rbp), %rax
movq -40(%rbp), %rdx
movq %rax, 80+B2(%rip)
movq %rdx, 88+B2(%rip)
movq -32(%rbp), %rax
movq -24(%rbp), %rdx
movq %rax, 96+B2(%rip)
movq %rdx, 104+B2(%rip)
movl $0, %eax
movq -8(%rbp), %rdx
xorq %fs:40, %rdx
je .L5
call __stack_chk_fail#PLT
.L5:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1:
.size main, .-main
.ident "GCC: (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0"
.section .note.GNU-stack,"",#progbits
System is Linux, compiled with g++, and from my understanding of function frames, I should be seeing an additional "push" instruction that pushes the address of B2 onto the stack prior to the call instruction in the function frame.
However, none of these push instructions seem to correspond to what I'm looking for. I see a leaq instruction, and a pushq (%rbp) at the beginning of main, but nothing that seems to be what I'm supposed to be seeing. Can anyone please advise?

Why does a simple use of ostringstream generate so much assembly code?

Consider the following simple example that formats a string and an integer using ostringstream and discards the output:
#include <sstream>
// Formats the literal "x = " followed by the integer 42 into an in-memory
// stream, then builds the resulting string and discards it.
void ostringstream_test() {
    std::ostringstream out;
    out << "x = ";
    out << 42;
    out.str();   // result intentionally unused
}
Compiling it with clang++ -S -O3 -DNDEBUG -std=c++14 test.cc generates a ton of assembly code (half a kilobyte in x86-64 instructions compared to less than a hundred bytes for similar sprintf code) - see the output below. Why does it generate so much code? Is it inherent to the ostringstream API, or is this particular compiler/library doing something wrong?
.globl __Z18ostringstream_testv
.p2align 4, 0x90
__Z18ostringstream_testv: ## #_Z18ostringstream_testv
Lfunc_begin0:
.cfi_startproc
.cfi_personality 155, ___gxx_personality_v0
.cfi_lsda 16, Lexception0
## BB#0:
pushq %rbp
Lcfi0:
.cfi_def_cfa_offset 16
Lcfi1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Lcfi2:
.cfi_def_cfa_register %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $328, %rsp ## imm = 0x148
Lcfi3:
.cfi_offset %rbx, -56
Lcfi4:
.cfi_offset %r12, -48
Lcfi5:
.cfi_offset %r13, -40
Lcfi6:
.cfi_offset %r14, -32
Lcfi7:
.cfi_offset %r15, -24
leaq -256(%rbp), %r14
leaq -360(%rbp), %r12
movq __ZTCNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE0_NS_13basic_ostreamIcS2_EE#GOTPCREL(%rip), %rax
leaq 24(%rax), %rcx
movq %rcx, -368(%rbp)
addq $64, %rax
movq %rax, -256(%rbp)
Ltmp0:
movq %r14, %rdi
movq %r12, %rsi
callq __ZNSt3__18ios_base4initEPv
Ltmp1:
## BB#1:
movq $0, -120(%rbp)
movl $-1, -112(%rbp)
movq __ZTVNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rbx
leaq 24(%rbx), %r13
movq %r13, -368(%rbp)
addq $64, %rbx
movq %rbx, -256(%rbp)
Ltmp3:
movq %r12, %rdi
callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEEC2Ev
Ltmp4:
## BB#2:
movq __ZTVNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %r15
addq $16, %r15
movq %r15, -360(%rbp)
movq $0, -272(%rbp)
movq $0, -280(%rbp)
movq $0, -288(%rbp)
movq $0, -296(%rbp)
movl $16, -264(%rbp)
xorps %xmm0, %xmm0
movaps %xmm0, -80(%rbp)
movq $0, -64(%rbp)
Ltmp6:
leaq -80(%rbp), %rsi
movq %r12, %rdi
callq __ZNSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strERKNS_12basic_stringIcS2_S4_EE
Ltmp7:
## BB#3:
testb $1, -80(%rbp)
je LBB0_5
## BB#4:
movq -64(%rbp), %rdi
callq __ZdlPv
LBB0_5:
Ltmp9:
leaq L_.str(%rip), %rsi
leaq -368(%rbp), %rdi
movl $4, %edx
callq __ZNSt3__124__put_character_sequenceIcNS_11char_traitsIcEEEERNS_13basic_ostreamIT_T0_EES7_PKS4_m
Ltmp10:
## BB#6:
Ltmp11:
movl $42, %esi
movq %rax, %rdi
callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEElsEi
Ltmp12:
## BB#7:
Ltmp13:
leaq -104(%rbp), %rdi
movq %r12, %rsi
callq __ZNKSt3__115basic_stringbufIcNS_11char_traitsIcEENS_9allocatorIcEEE3strEv
Ltmp14:
## BB#8:
testb $1, -104(%rbp)
je LBB0_10
## BB#9:
movq -88(%rbp), %rdi
callq __ZdlPv
LBB0_10:
movq %r13, -368(%rbp)
movq %rbx, -256(%rbp)
movq %r15, -360(%rbp)
testb $1, -296(%rbp)
je LBB0_12
## BB#11:
movq -280(%rbp), %rdi
callq __ZdlPv
LBB0_12:
movq %r12, %rdi
callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev
movq __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rsi
addq $8, %rsi
leaq -368(%rbp), %rdi
callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev
movq %r14, %rdi
callq __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev
addq $328, %rsp ## imm = 0x148
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
retq
LBB0_13:
Ltmp8:
movq %rax, -48(%rbp) ## 8-byte Spill
testb $1, -80(%rbp)
je LBB0_18
## BB#14:
movq -64(%rbp), %rdi
callq __ZdlPv
testb $1, -296(%rbp)
jne LBB0_19
jmp LBB0_20
LBB0_16:
Ltmp5:
movq %rax, -48(%rbp) ## 8-byte Spill
jmp LBB0_21
LBB0_15:
Ltmp2:
movq %rax, -48(%rbp) ## 8-byte Spill
jmp LBB0_22
LBB0_17:
Ltmp15:
movq %rax, -48(%rbp) ## 8-byte Spill
movq %r13, -368(%rbp)
movq %rbx, -256(%rbp)
movq %r15, -360(%rbp)
LBB0_18:
testb $1, -296(%rbp)
je LBB0_20
LBB0_19:
movq -280(%rbp), %rdi
callq __ZdlPv
LBB0_20:
movq %r12, %rdi
callq __ZNSt3__115basic_streambufIcNS_11char_traitsIcEEED2Ev
LBB0_21:
movq __ZTTNSt3__119basic_ostringstreamIcNS_11char_traitsIcEENS_9allocatorIcEEEE#GOTPCREL(%rip), %rsi
addq $8, %rsi
leaq -368(%rbp), %rdi
callq __ZNSt3__113basic_ostreamIcNS_11char_traitsIcEEED2Ev
LBB0_22:
movq %r14, %rdi
callq __ZNSt3__19basic_iosIcNS_11char_traitsIcEEED2Ev
movq -48(%rbp), %rdi ## 8-byte Reload
callq __Unwind_Resume
Lfunc_end0:
.cfi_endproc
.section __TEXT,__gcc_except_tab
.p2align 2
GCC_except_table0:
Lexception0:
.byte 255 ## #LPStart Encoding = omit
.byte 155 ## #TType Encoding = indirect pcrel sdata4
.asciz "\303\200" ## #TType base offset
.byte 3 ## Call site Encoding = udata4
.byte 65 ## Call site table length
Lset0 = Ltmp0-Lfunc_begin0 ## >> Call Site 1 <<
.long Lset0
Lset1 = Ltmp1-Ltmp0 ## Call between Ltmp0 and Ltmp1
.long Lset1
Lset2 = Ltmp2-Lfunc_begin0 ## jumps to Ltmp2
.long Lset2
.byte 0 ## On action: cleanup
Lset3 = Ltmp3-Lfunc_begin0 ## >> Call Site 2 <<
.long Lset3
Lset4 = Ltmp4-Ltmp3 ## Call between Ltmp3 and Ltmp4
.long Lset4
Lset5 = Ltmp5-Lfunc_begin0 ## jumps to Ltmp5
.long Lset5
.byte 0 ## On action: cleanup
Lset6 = Ltmp6-Lfunc_begin0 ## >> Call Site 3 <<
.long Lset6
Lset7 = Ltmp7-Ltmp6 ## Call between Ltmp6 and Ltmp7
.long Lset7
Lset8 = Ltmp8-Lfunc_begin0 ## jumps to Ltmp8
.long Lset8
.byte 0 ## On action: cleanup
Lset9 = Ltmp9-Lfunc_begin0 ## >> Call Site 4 <<
.long Lset9
Lset10 = Ltmp14-Ltmp9 ## Call between Ltmp9 and Ltmp14
.long Lset10
Lset11 = Ltmp15-Lfunc_begin0 ## jumps to Ltmp15
.long Lset11
.byte 0 ## On action: cleanup
Lset12 = Ltmp14-Lfunc_begin0 ## >> Call Site 5 <<
.long Lset12
Lset13 = Lfunc_end0-Ltmp14 ## Call between Ltmp14 and Lfunc_end0
.long Lset13
.long 0 ## has no landing pad
.byte 0 ## On action: cleanup
.p2align 2
The most likely reason for the difference is that the IOStream implementation is expanded inline while the sprintf() use is just a function call. Nothing inherently prevents IOStreams from being implemented by a library. It does take a tiny bit of abstraction and planning, though: the definition in the standard uses templates. These are normally just implemented inline. Declaring the typically used instantiations (for character types char and wchar_t) as extern templates and explicitly instantiating them is extra work, though. I showed a long time ago that it does pay off in terms of compile time and, at least, libstdc++ preinstantiates the IOStreams functions in a library. Based on your experiment it seems libc++ doesn't.

Vectorization of sin and cos

I was playing around with Compiler Explorer and ran into an anomaly (I think). If I want to make the compiler vectorize a sin calculation using libmvec, I would write:
#include <cmath>
// Shared constant: the alignment requested for AT below and, via NN-1, the
// mask that func applies to its length argument.
#define NN 512
// Element type used by the example.
typedef float T;
// T carrying an aligned(NN) attribute; func takes its arrays as AT*.
typedef T __attribute__((aligned(NN))) AT;
// Per-element kernel of the example: returns sinf(x).
inline T s(const T x)
{
    const T sine = sinf(x);
    return sine;
}
// Stores s(x[i]) into y[i] for every i in [0, length).
// The __builtin_unreachable() branch tells the compiler that length is
// always a multiple of NN; __restrict promises that x and y do not alias.
void func(AT* __restrict x, AT* __restrict y, int length)
{
    if ((length & (NN - 1)) != 0) {
        __builtin_unreachable();
    }
    for (int idx = 0; idx < length; ++idx) {
        y[idx] = s(x[idx]);
    }
}
compile with gcc 6.2 and -O3 -march=native -ffast-math and get
func(float*, float*, int):
testl %edx, %edx
jle .L10
leaq 8(%rsp), %r10
andq $-32, %rsp
pushq -8(%r10)
pushq %rbp
movq %rsp, %rbp
pushq %r14
xorl %r14d, %r14d
pushq %r13
leal -8(%rdx), %r13d
pushq %r12
shrl $3, %r13d
movq %rsi, %r12
pushq %r10
addl $1, %r13d
pushq %rbx
movq %rdi, %rbx
subq $8, %rsp
.L4:
vmovaps (%rbx), %ymm0
addl $1, %r14d
addq $32, %r12
addq $32, %rbx
call _ZGVcN8v_sinf // YAY! Vectorized trig!
vmovaps %ymm0, -32(%r12)
cmpl %r13d, %r14d
jb .L4
vzeroupper
addq $8, %rsp
popq %rbx
popq %r10
popq %r12
popq %r13
popq %r14
popq %rbp
leaq -8(%r10), %rsp
.L10:
ret
But when I add a cosine to the function, there is no vectorization:
#include <cmath>
// Shared constant: the alignment requested for AT below and, via NN-1, the
// mask that func applies to its length argument.
#define NN 512
// Element type used by the example.
typedef float T;
// T carrying an aligned(NN) attribute; func takes its arrays as AT*.
typedef T __attribute__((aligned(NN))) AT;
// Per-element kernel of the example: returns cosf(x) + sinf(x).
inline T f(const T x)
{
    const T cosine = cosf(x);
    const T sine = sinf(x);
    return cosine + sine;
}
// Stores f(x[i]) into y[i] for every i in [0, length).
// The __builtin_unreachable() branch tells the compiler that length is
// always a multiple of NN; __restrict promises that x and y do not alias.
void func(AT* __restrict x, AT* __restrict y, int length)
{
    if ((length & (NN - 1)) != 0) {
        __builtin_unreachable();
    }
    for (int idx = 0; idx < length; ++idx) {
        y[idx] = f(x[idx]);
    }
}
which gives:
func(float*, float*, int):
testl %edx, %edx
jle .L10
pushq %r12
leal -1(%rdx), %eax
pushq %rbp
leaq 4(%rdi,%rax,4), %r12
movq %rsi, %rbp
pushq %rbx
movq %rdi, %rbx
subq $16, %rsp
.L4:
vmovss (%rbx), %xmm0
leaq 8(%rsp), %rsi
addq $4, %rbx
addq $4, %rbp
leaq 12(%rsp), %rdi
call sincosf // No vectorization
vmovss 12(%rsp), %xmm0
vaddss 8(%rsp), %xmm0, %xmm0
vmovss %xmm0, -4(%rbp)
cmpq %rbx, %r12
jne .L4
addq $16, %rsp
popq %rbx
popq %rbp
popq %r12
.L10:
ret
I see two good alternatives. Either call a vectorized version of sincosf or call the vectorized sin and cos sequentially. I tried adding -fno-builtin-sincos to no avail. -fopt-info-vec-missed complains about complex float, of which there are none.
Is this a known issue with gcc? Either way, is there a way I can convince gcc to vectorize the latter example?
(As an aside, is there any way to get gcc < 6 to vectorize trigonometric functions automatically?)