"call" instruction that seemingly jumps into itself - c++

I have some C++ code
#include <cstdio>
#include <boost/bind.hpp>
#include <boost/function.hpp>
class A {
public:
void do_it() { std::printf("aaa"); }
};
void
call_it(const boost::function<void()> &f)
{
f();
}
void
func()
{
A *a = new A;
call_it(boost::bind(&A::do_it, a));
}
which gcc 4 compiles into the following assembly (from objdump):
00000030 <func()>:
30: 55 push %ebp
31: 89 e5 mov %esp,%ebp
33: 56 push %esi
34: 31 f6 xor %esi,%esi
36: 53 push %ebx
37: bb 00 00 00 00 mov $0x0,%ebx
3c: 83 ec 40 sub $0x40,%esp
3f: c7 04 24 01 00 00 00 movl $0x1,(%esp)
46: e8 fc ff ff ff call 47 <func()+0x17>
4b: 8d 55 ec lea 0xffffffec(%ebp),%edx
4e: 89 14 24 mov %edx,(%esp)
51: 89 5c 24 04 mov %ebx,0x4(%esp)
55: 89 74 24 08 mov %esi,0x8(%esp)
59: 89 44 24 0c mov %eax,0xc(%esp)
; the rest of the function is omitted
I can't understand the operand of call instruction here, why does it call into itself, but with one byte off?

The call is probably to an external function, and the address you see (FFFFFFFC) is just a placeholder for the real address, which the linker and/or loader will take care of later.

Related

How are non-static, non-virtual methods implemented in C++?

I wanted to know how methods are implemented in C++. I wanted to know how methods are implemented "under the hood".
So, I have made a simple C++ program which has a class with 1 non static field and 1 non static, non virtual method.
Then I instantiated the class in the main function and called the method. I have used objdump -d option in order to see the CPU instructions of this program. I have a x86-64 processor.
Here's the code:
#include<stdio.h>
class TestClass {
public:
int x;
int xPlus2(){
return x + 2;
}
};
int main(){
TestClass tc1 = {5};
int variable = tc1.xPlus2();
printf("%d \n", variable);
return 0;
}
Here are instructions for the method xPlus2:
0000000000402c30 <_ZN9TestClass6xPlus2Ev>:
402c30: 55 push %rbp
402c31: 48 89 e5 mov %rsp,%rbp
402c34: 48 89 4d 10 mov %rcx,0x10(%rbp)
402c38: 48 8b 45 10 mov 0x10(%rbp),%rax
402c3c: 8b 00 mov (%rax),%eax
402c3e: 83 c0 02 add $0x2,%eax
402c41: 5d pop %rbp
402c42: c3 retq
402c43: 90 nop
402c44: 90 nop
402c45: 90 nop
402c46: 90 nop
402c47: 90 nop
402c48: 90 nop
402c49: 90 nop
402c4a: 90 nop
402c4b: 90 nop
402c4c: 90 nop
402c4d: 90 nop
402c4e: 90 nop
402c4f: 90 nop
If I understand it correctly, these instructions can be replaced by just 3 instructions, because I believe that I don't need to use the stack, I think the compiler used it redundantly:
mov (%rcx), eax
add $2, eax
retq
and then maybe I still need lots of nop instructions for synchronization purposes or whatnot. If you look at the CPU instructions, it looks like the value that x field has is stored at the location in memory which rcx register holds. You will see the rest of the CPU instructions in a moment. It is a little bit hard for me to track what has happened here (especially what is going on with the call of _main function), I don't even know what parts of assembly are important to look at. Compiler produces main function (as I expected), but then it also produced _main function which is called from the main, there are some weird functions in between those two as well.
Here are other parts of the assembly that I think may be interesting:
0000000000401550 <main>:
401550: 55 push %rbp
401551: 48 89 e5 mov %rsp,%rbp
401554: 48 83 ec 30 sub $0x30,%rsp
401558: e8 e3 00 00 00 callq 401640 <__main>
40155d: c7 45 f8 05 00 00 00 movl $0x5,-0x8(%rbp)
401564: 48 8d 45 f8 lea -0x8(%rbp),%rax
401568: 48 89 c1 mov %rax,%rcx
40156b: e8 c0 16 00 00 callq 402c30 <_ZN9TestClass6xPlus2Ev>
401570: 89 45 fc mov %eax,-0x4(%rbp)
401573: 8b 45 fc mov -0x4(%rbp),%eax
401576: 89 c2 mov %eax,%edx
401578: 48 8d 0d 81 2a 00 00 lea 0x2a81(%rip),%rcx # 404000 <.rdata>
40157f: e8 ec 14 00 00 callq 402a70 <printf>
401584: b8 00 00 00 00 mov $0x0,%eax
401589: 48 83 c4 30 add $0x30,%rsp
40158d: 5d pop %rbp
40158e: c3 retq
40158f: 90 nop
0000000000401590 <__do_global_dtors>:
401590: 48 83 ec 28 sub $0x28,%rsp
401594: 48 8b 05 75 1a 00 00 mov 0x1a75(%rip),%rax # 403010 <p.93846>
40159b: 48 8b 00 mov (%rax),%rax
40159e: 48 85 c0 test %rax,%rax
4015a1: 74 1d je 4015c0 <__do_global_dtors+0x30>
4015a3: ff d0 callq *%rax
4015a5: 48 8b 05 64 1a 00 00 mov 0x1a64(%rip),%rax # 403010 <p.93846>
4015ac: 48 8d 50 08 lea 0x8(%rax),%rdx
4015b0: 48 8b 40 08 mov 0x8(%rax),%rax
4015b4: 48 89 15 55 1a 00 00 mov %rdx,0x1a55(%rip) # 403010 <p.93846>
4015bb: 48 85 c0 test %rax,%rax
4015be: 75 e3 jne 4015a3 <__do_global_dtors+0x13>
4015c0: 48 83 c4 28 add $0x28,%rsp
4015c4: c3 retq
4015c5: 90 nop
4015c6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
4015cd: 00 00 00
00000000004015d0 <__do_global_ctors>:
4015d0: 56 push %rsi
4015d1: 53 push %rbx
4015d2: 48 83 ec 28 sub $0x28,%rsp
4015d6: 48 8b 0d 23 2d 00 00 mov 0x2d23(%rip),%rcx # 404300 <.refptr.__CTOR_LIST__>
4015dd: 48 8b 11 mov (%rcx),%rdx
4015e0: 83 fa ff cmp $0xffffffff,%edx
4015e3: 89 d0 mov %edx,%eax
4015e5: 74 39 je 401620 <__do_global_ctors+0x50>
4015e7: 85 c0 test %eax,%eax
4015e9: 74 20 je 40160b <__do_global_ctors+0x3b>
4015eb: 89 c2 mov %eax,%edx
4015ed: 83 e8 01 sub $0x1,%eax
4015f0: 48 8d 1c d1 lea (%rcx,%rdx,8),%rbx
4015f4: 48 29 c2 sub %rax,%rdx
4015f7: 48 8d 74 d1 f8 lea -0x8(%rcx,%rdx,8),%rsi
4015fc: 0f 1f 40 00 nopl 0x0(%rax)
401600: ff 13 callq *(%rbx)
401602: 48 83 eb 08 sub $0x8,%rbx
401606: 48 39 f3 cmp %rsi,%rbx
401609: 75 f5 jne 401600 <__do_global_ctors+0x30>
40160b: 48 8d 0d 7e ff ff ff lea -0x82(%rip),%rcx # 401590 <__do_global_dtors>
401612: 48 83 c4 28 add $0x28,%rsp
401616: 5b pop %rbx
401617: 5e pop %rsi
401618: e9 f3 fe ff ff jmpq 401510 <atexit>
40161d: 0f 1f 00 nopl (%rax)
401620: 31 c0 xor %eax,%eax
401622: eb 02 jmp 401626 <__do_global_ctors+0x56>
401624: 89 d0 mov %edx,%eax
401626: 44 8d 40 01 lea 0x1(%rax),%r8d
40162a: 4a 83 3c c1 00 cmpq $0x0,(%rcx,%r8,8)
40162f: 4c 89 c2 mov %r8,%rdx
401632: 75 f0 jne 401624 <__do_global_ctors+0x54>
401634: eb b1 jmp 4015e7 <__do_global_ctors+0x17>
401636: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
40163d: 00 00 00
0000000000401640 <__main>:
401640: 8b 05 ea 59 00 00 mov 0x59ea(%rip),%eax # 407030 <initialized>
401646: 85 c0 test %eax,%eax
401648: 74 06 je 401650 <__main+0x10>
40164a: c3 retq
40164b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
401650: c7 05 d6 59 00 00 01 movl $0x1,0x59d6(%rip) # 407030 <initialized>
401657: 00 00 00
40165a: e9 71 ff ff ff jmpq 4015d0 <__do_global_ctors>
40165f: 90 nop
I think what you are looking for are these instructions:
40155d: c7 45 f8 05 00 00 00 movl $0x5,-0x8(%rbp)
401564: 48 8d 45 f8 lea -0x8(%rbp),%rax
401568: 48 89 c1 mov %rax,%rcx
40156b: e8 c0 16 00 00 callq 402c30 <_ZN9TestClass6xPlus2Ev>
401570: 89 45 fc mov %eax,-0x4(%rbp)
These match with the code from main:
TestClass tc1 = {5};
int variable = tc1.xPlus2();
At address 40155d the field tc1.x is initialized with the value 5.
At address 401564 the pointer to tc1 is loaded into the register %rax
At address 401568 the pointer to tc1 is copied into the register %rcx
At address 40156b is the call of the method tc1.xPlus2()
At address 401570 the result is store in variable
Your observations are mostly correct. rcx holds the this pointer to the object on which the method was called. x is stored in the first area of memory that the this pointer points to, so that is why rcx was dereferenced and the result added to. It is the responsibility of the caller to make sure that rcx is the address of the object before invoking the function. We can see main prepare rcx by setting it to an address in its stack frame. You are correct that the compiler produced inefficient code here and did not need to use the stack. Compiling with higher optimization levels -O1, -O2, or -O3 will likely fix that. These higher optimizations will probably get rid of the nops too, since they are used for function alignment. You can mostly ignore __main. It's used for libc initialization.

finstrument-functions-exclude-function-list appears to not handle commas properly

Attempting to compile with finstrument-functions and exclude a template function with multiple template parameters, using the \ method to escape commas (as described for exclude-file-list here) fails to properly disable instrumenting the function passed.
GCC command used:
gcc -finstrument-functions -finstrument-functions-exclude-function-list='test<float\, int>' main.cpp -o a.out -O0
Above creates a binary file with the "test" function instrumented. Assembly snippet and main.cpp file included below
gcc -dumpversion returns "6.2.0", above command run on red hat enterprise linux, version 7.4
Contents of main.cpp:
template<class T, class U>
T test(int a, T b){
int res = 0;
for(int i = 0; i < 1000; i++){
res += i;
}
return(res);
}
int main(int argc, char** argv){
float a = test<float, int>(argc, 1.0);
return(0);
}
objdumped output for "test" function:
000000000040059f <float test<float, int>(int, float)>:
40059f: 55 push %rbp
4005a0: 48 89 e5 mov %rsp,%rbp
4005a3: 48 83 ec 20 sub $0x20,%rsp
4005a7: 89 7d ec mov %edi,-0x14(%rbp)
4005aa: f3 0f 11 45 e8 movss %xmm0,-0x18(%rbp)
4005af: 48 8b 45 08 mov 0x8(%rbp),%rax
4005b3: 48 89 c6 mov %rax,%rsi
4005b6: bf 9f 05 40 00 mov $0x40059f,%edi
4005bb: e8 70 fe ff ff callq 400430 <__cyg_profile_func_enter#plt>
4005c0: c7 45 f8 00 00 00 00 movl $0x0,-0x8(%rbp)
4005c7: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp)
4005ce: 81 7d fc e7 03 00 00 cmpl $0x3e7,-0x4(%rbp)
4005d5: 7f 11 jg 4005e8 <float test<float, int>(int, float)+0x49>
4005d7: 8b 55 f8 mov -0x8(%rbp),%edx
4005da: 8b 45 fc mov -0x4(%rbp),%eax
4005dd: 01 d0 add %edx,%eax
4005df: 89 45 f8 mov %eax,-0x8(%rbp)
4005e2: 83 45 fc 01 addl $0x1,-0x4(%rbp)
4005e6: eb e6 jmp 4005ce <float test<float, int>(int, float)+0x2f>
4005e8: 8b 45 f8 mov -0x8(%rbp),%eax
4005eb: 66 0f ef c9 pxor %xmm1,%xmm1
4005ef: f3 0f 2a c8 cvtsi2ss %eax,%xmm1
4005f3: f3 0f 11 4d e4 movss %xmm1,-0x1c(%rbp)
4005f8: 48 8b 45 08 mov 0x8(%rbp),%rax
4005fc: 48 89 c6 mov %rax,%rsi
4005ff: bf 9f 05 40 00 mov $0x40059f,%edi
400604: e8 17 fe ff ff callq 400420 <__cyg_profile_func_exit#plt>
400609: f3 0f 10 45 e4 movss -0x1c(%rbp),%xmm0
40060e: c9 leaveq
40060f: c3 retq
I expected the test function to not be instrumented, but it is. Does anyone know why this is?
Compiler explorer example
Just in case anyone comes by this way, both this and finstrument-functions-exclude-function-list not respecting namespace parts of a function are bugs, and I have filed against both. Hopefully a fix will be implemented soon (working on one currently).
Namespace / class mishandling
Comma mishandling

Member initializer list, pointer initialization without argument

In a large framework which used to use many smart pointers and now uses raw pointers, I come across situations like this quite often:
class A {
public:
int* m;
A() : m() {}
};
The reason is because int* m used to be a smart pointer and so the initializer list called a default constructor. Now that int* m is a raw pointer I am not certain if this is equivalent to:
class A {
public:
int* m;
A() : m(nullptr) {}
};
Without the explicit nullptr is A::m still initialized to zero? A look at no optimization objdump -d makes it appear to be yes but I am not certain. The reason I feel that the answer is yes is due to this line in the objdump -d (I posted more of the objdump -d below):
400644: 48 c7 00 00 00 00 00 movq $0x0,(%rax)
Little program that tries to find undefined behavior:
class A {
public:
int* m;
A() : m(nullptr) {}
};
int main() {
A buf[1000000];
unsigned int count = 0;
for (unsigned int i = 0; i < 1000000; ++i) {
count += buf[i].m ? 1 : 0;
}
return count;
}
Compilation, execution, and return value:
g++ -std=c++14 -O0 foo.cpp
./a.out; echo $?
0
Relevant assembly sections from objdump -d:
00000000004005b8 <main>:
4005b8: 55 push %rbp
4005b9: 48 89 e5 mov %rsp,%rbp
4005bc: 41 54 push %r12
4005be: 53 push %rbx
4005bf: 48 81 ec 10 12 7a 00 sub $0x7a1210,%rsp
4005c6: 48 8d 85 e0 ed 85 ff lea -0x7a1220(%rbp),%rax
4005cd: bb 3f 42 0f 00 mov $0xf423f,%ebx
4005d2: 49 89 c4 mov %rax,%r12
4005d5: eb 10 jmp 4005e7 <main+0x2f>
4005d7: 4c 89 e7 mov %r12,%rdi
4005da: e8 59 00 00 00 callq 400638 <_ZN1AC1Ev>
4005df: 49 83 c4 08 add $0x8,%r12
4005e3: 48 83 eb 01 sub $0x1,%rbx
4005e7: 48 83 fb ff cmp $0xffffffffffffffff,%rbx
4005eb: 75 ea jne 4005d7 <main+0x1f>
4005ed: c7 45 ec 00 00 00 00 movl $0x0,-0x14(%rbp)
4005f4: c7 45 e8 00 00 00 00 movl $0x0,-0x18(%rbp)
4005fb: eb 23 jmp 400620 <main+0x68>
4005fd: 8b 45 e8 mov -0x18(%rbp),%eax
400600: 48 8b 84 c5 e0 ed 85 mov -0x7a1220(%rbp,%rax,8),%rax
400607: ff
400608: 48 85 c0 test %rax,%rax
40060b: 74 07 je 400614 <main+0x5c>
40060d: b8 01 00 00 00 mov $0x1,%eax
400612: eb 05 jmp 400619 <main+0x61>
400614: b8 00 00 00 00 mov $0x0,%eax
400619: 01 45 ec add %eax,-0x14(%rbp)
40061c: 83 45 e8 01 addl $0x1,-0x18(%rbp)
400620: 81 7d e8 3f 42 0f 00 cmpl $0xf423f,-0x18(%rbp)
400627: 76 d4 jbe 4005fd <main+0x45>
400629: 8b 45 ec mov -0x14(%rbp),%eax
40062c: 48 81 c4 10 12 7a 00 add $0x7a1210,%rsp
400633: 5b pop %rbx
400634: 41 5c pop %r12
400636: 5d pop %rbp
400637: c3 retq
0000000000400638 <_ZN1AC1Ev>:
400638: 55 push %rbp
400639: 48 89 e5 mov %rsp,%rbp
40063c: 48 89 7d f8 mov %rdi,-0x8(%rbp)
400640: 48 8b 45 f8 mov -0x8(%rbp),%rax
400644: 48 c7 00 00 00 00 00 movq $0x0,(%rax)
40064b: 5d pop %rbp
40064c: c3 retq
40064d: 0f 1f 00 nopl (%rax)
Empty () initializer stands for default-initialization in C++98 and for value-initialization in C++03 and later. For scalar types (including pointers) value-initialization/default-initialization leads to zero-initialization.
Which means that in your case m() and m(nullptr) will have exactly the same effect: in both cases m is initialized as a null pointer. In C++ it was like that since the beginning of standardized times.

SSE load/store memory transactions

There are two ways for memory-register interactions in use SSE intrinsics:
Intermediate pointers:
void f_sse(float *input, float *output, unsigned int n)
{
_m128 *input_sse = reinterpret_cast<__m128*>(input);//Input intermediate pointer
_m128 *output_sse = reinterpret_cast<__m128*>(output);//Output intermediate pointer
_m128 s = _mm_set1_ps(0.1f);
auto loop_size = n/4;
for(auto i=0; i<loop_size; ++i)
output_sse[i] = _mm_add_ps(input_sse[i], s);
}
Explicit fetch/store:
void f_sse(float *input, float *output, unsigned int n)
{
_m128 input_sse, output_sse, result;
_m128 s = _mm_set1_ps(0.1f);
for(auto i=0; i<n; i+=4)
{
input_sse = _mm_load_ps(input+i);
result = _mm_add_ps(input_sse, s);
_mm_store_ps(output+i, result);
}
}
What's the difference between mentioned approaches and which method is better in terms of perfomance? input and output pointers are aligned by _mm_malloc().
Compiled with g++ at optimization level O3 the assembly code of the inner loop (using objdump -d) are
20: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
24: 0f 58 c1 addps %xmm1,%xmm0
27: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
2b: 48 83 c0 10 add $0x10,%rax
2f: 48 39 d0 cmp %rdx,%rax
32: 75 ec jne 20 <_Z5f_ssePfS_j+0x20>
and
10: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
14: 83 c1 04 add $0x4,%ecx
17: 0f 58 c1 addps %xmm1,%xmm0
1a: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
1e: 48 83 c0 10 add $0x10,%rax
22: 39 ca cmp %ecx,%edx
24: 77 ea ja 10 <_Z5f_ssePfS_j+0x10>
They are pretty similar. In the first g++ manage to use only one counter (only one add instruction). So I guess its better.
I compiled both of your samples with g++ -O2, and the main difference I found was that the value in edx (n) is used differently, which leads to slightly different code.
First function:
0000000000000000 <_Z6f_sse2PfS_j>:
0: c1 ea 02 shr $0x2,%edx # loop_size = n / 4.
3: 85 d2 test %edx,%edx
5: 74 2d je 34 <_Z6f_sse2PfS_j+0x34>
7: 83 ea 01 sub $0x1,%edx
a: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 11 <_Z6f_sse2PfS_j+0x11>
11: 48 83 c2 01 add $0x1,%rdx
15: 31 c0 xor %eax,%eax
17: 48 c1 e2 04 shl $0x4,%rdx // Adjust for loop size vs. index.
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
20: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
24: 0f 58 c1 addps %xmm1,%xmm0
27: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
2b: 48 83 c0 10 add $0x10,%rax
2f: 48 39 d0 cmp %rdx,%rax
32: 75 ec jne 20 <_Z6f_sse2PfS_j+0x20>
34: f3 c3 repz retq
Second function:
0000000000000000 <_Z5f_ssePfS_j>:
0: 85 d2 test %edx,%edx
2: 74 22 je 26 <_Z5f_ssePfS_j+0x26>
4: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # b <_Z5f_ssePfS_j+0xb>
b: 31 c0 xor %eax,%eax
d: 31 c9 xor %ecx,%ecx
f: 90 nop
10: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
14: 83 c1 04 add $0x4,%ecx
17: 0f 58 c1 addps %xmm1,%xmm0
1a: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
1e: 48 83 c0 10 add $0x10,%rax
22: 39 ca cmp %ecx,%edx
24: 77 ea ja 10 <_Z5f_ssePfS_j+0x10>
26: f3 c3 repz retq
I also looked at the code generated, and came up with this:
void f_sse2(float *input, float *output, unsigned int n)
{
__m128 *end = reinterpret_cast<__m128*>(&input[n]);
__m128 *input_sse = reinterpret_cast<__m128*>(input);//Input intermediate pointer
__m128 *output_sse = reinterpret_cast<__m128*>(output);//Output intermediate pointer
__m128 s = _mm_set1_ps(0.1f);
while(input_sse < end)
*output_sse++ = _mm_add_ps(*input_sse++, s);
}
which generates this code:
0000000000000000 <_Z6f_sse2PfS_j>:
0: 89 d2 mov %edx,%edx
2: 48 8d 04 97 lea (%rdi,%rdx,4),%rax
6: 48 39 c7 cmp %rax,%rdi
9: 73 23 jae 2e <_Z6f_sse2PfS_j+0x2e>
b: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 12 <_Z6f_sse2PfS_j+0x12>
12: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
18: 0f 28 07 movaps (%rdi),%xmm0
1b: 48 83 c7 10 add $0x10,%rdi
1f: 0f 58 c1 addps %xmm1,%xmm0
22: 0f 29 06 movaps %xmm0,(%rsi)
25: 48 83 c6 10 add $0x10,%rsi
29: 48 39 f8 cmp %rdi,%rax
2c: 77 ea ja 18 <_Z6f_sse2PfS_j+0x18>
2e: f3 c3 repz retq
Which I think may be a tiny bit more efficient, but probably not worth changing it for. But it gave me something to do for 15 minutes.

G++ 4.6 -std=gnu++0x: Static Local Variable Constructor Call Timing and Thread Safety

void a() { ... }
void b() { ... }
struct X
{
X() { b(); }
};
void f()
{
a();
static X x;
...
}
Assume f is called multiple times from various threads (potentially contended) after the entry of main. (and of course that the only calls to a and b are those seen above)
When the above code is compiled with gcc g++ 4.6 in -std=gnu++0x mode:
Q1. Is it guaranteed that a() will be called at least once and return before b() is called? That is to ask, on the first call to f(), is the constructor of x called at the same time an automatic duration local variable (non-static) would be (and not at global static initialization time for example)?
Q2. Is it guaranteed that b() will be called exactly once? Even if two threads execute f for the first time at the same time on different cores? If yes, by which specific mechanism does the GCC generated code provide synchronization? Edit: Additionally could one of the threads calling f() obtain access to x before the constructor of X returns?
Update: I am trying to compile an example and decompile to investigate mechanism...
test.cpp:
struct X;
void ext1(int x);
void ext2(X& x);
void a() { ext1(1); }
void b() { ext1(2); }
struct X
{
X() { b(); }
};
void f()
{
a();
static X x;
ext2(x);
}
Then:
$ g++ -std=gnu++0x -c -o test.o ./test.cpp
$ objdump -d test.o -M intel > test.dump
test.dump:
test.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z1av>:
0: 55 push rbp
1: 48 89 e5 mov rbp,rsp
4: bf 01 00 00 00 mov edi,0x1
9: e8 00 00 00 00 call e <_Z1av+0xe>
e: 5d pop rbp
f: c3 ret
0000000000000010 <_Z1bv>:
10: 55 push rbp
11: 48 89 e5 mov rbp,rsp
14: bf 02 00 00 00 mov edi,0x2
19: e8 00 00 00 00 call 1e <_Z1bv+0xe>
1e: 5d pop rbp
1f: c3 ret
0000000000000020 <_Z1fv>:
20: 55 push rbp
21: 48 89 e5 mov rbp,rsp
24: 41 54 push r12
26: 53 push rbx
27: e8 00 00 00 00 call 2c <_Z1fv+0xc>
2c: b8 00 00 00 00 mov eax,0x0
31: 0f b6 00 movzx eax,BYTE PTR [rax]
34: 84 c0 test al,al
36: 75 2d jne 65 <_Z1fv+0x45>
38: bf 00 00 00 00 mov edi,0x0
3d: e8 00 00 00 00 call 42 <_Z1fv+0x22>
42: 85 c0 test eax,eax
44: 0f 95 c0 setne al
47: 84 c0 test al,al
49: 74 1a je 65 <_Z1fv+0x45>
4b: 41 bc 00 00 00 00 mov r12d,0x0
51: bf 00 00 00 00 mov edi,0x0
56: e8 00 00 00 00 call 5b <_Z1fv+0x3b>
5b: bf 00 00 00 00 mov edi,0x0
60: e8 00 00 00 00 call 65 <_Z1fv+0x45>
65: bf 00 00 00 00 mov edi,0x0
6a: e8 00 00 00 00 call 6f <_Z1fv+0x4f>
6f: 5b pop rbx
70: 41 5c pop r12
72: 5d pop rbp
73: c3 ret
74: 48 89 c3 mov rbx,rax
77: 45 84 e4 test r12b,r12b
7a: 75 0a jne 86 <_Z1fv+0x66>
7c: bf 00 00 00 00 mov edi,0x0
81: e8 00 00 00 00 call 86 <_Z1fv+0x66>
86: 48 89 d8 mov rax,rbx
89: 48 89 c7 mov rdi,rax
8c: e8 00 00 00 00 call 91 <_Z1fv+0x71>
Disassembly of section .text._ZN1XC2Ev:
0000000000000000 <_ZN1XC1Ev>:
0: 55 push rbp
1: 48 89 e5 mov rbp,rsp
4: 48 83 ec 10 sub rsp,0x10
8: 48 89 7d f8 mov QWORD PTR [rbp-0x8],rdi
c: e8 00 00 00 00 call 11 <_ZN1XC1Ev+0x11>
11: c9 leave
12: c3 ret
I don't see the synchronization mechanism? Or is it added at linktime?
Update2: Ok when I link it I can see it...
400973: 84 c0 test %al,%al
400975: 75 2d jne 4009a4 <_Z1fv+0x45>
400977: bf 98 20 40 00 mov $0x402098,%edi
40097c: e8 1f fe ff ff callq 4007a0 <__cxa_guard_acquire#plt>
400981: 85 c0 test %eax,%eax
400983: 0f 95 c0 setne %al
400986: 84 c0 test %al,%al
400988: 74 1a je 4009a4 <_Z1fv+0x45>
40098a: 41 bc 00 00 00 00 mov $0x0,%r12d
400990: bf a0 20 40 00 mov $0x4020a0,%edi
400995: e8 a6 00 00 00 callq 400a40 <_ZN1XC1Ev>
40099a: bf 98 20 40 00 mov $0x402098,%edi
40099f: e8 0c fe ff ff callq 4007b0 <__cxa_guard_release#plt>
4009a4: bf a0 20 40 00 mov $0x4020a0,%edi
4009a9: e8 72 ff ff ff callq 400920 <_Z4ext2R1X>
4009ae: 5b pop %rbx
4009af: 41 5c pop %r12
4009b1: 5d pop %rbp
It surrounds it with __cxa_guard_acquire and __cxa_guard_release, whatever they do.
Q1. Yes. According to C++11, 6.7/4:
such a variable is initialized the first time control passes through its declaration
so it will be initialised after the first call to a().
Q2. Under GCC, and any compiler that supports the C++11 thread model: yes, initialisation of local static variables is thread safe. Other compilers might not give that guarantee. The exact mechanism is an implementation detail. I believe GCC uses an atomic flag to indicate whether it's initialised, and a mutex to protect initialisation when the flag is not set, but I could be wrong. Certainly, this thread implies that it was originally implemented like that.
UPDATE: your code does indeed contain the initialisation code. You can see it more clearly if you link it, and then disassemble the program, so that you can see which functions are being called. I also used objdump -SC to interleave the source and demangle C++ names. It uses internal locking functions __cxa_guard_acquire and __cxa_guard_release, to make sure only one thread executes the initialisation code.
#void f()
#{
400724: push rbp
400725: mov rbp,rsp
400728: push r13
40072a: push r12
40072c: push rbx
40072d: sub rsp,0x8
# a();
400731: call 400704 <a()>
# static X x;
# if (!guard) {
400736: mov eax,0x601050
40073b: movzx eax,BYTE PTR [rax]
40073e: test al,al
400740: jne 400792 <f()+0x6e>
# if (__cxa_guard_acquire(&guard)) {
400742: mov edi,0x601050
400747: call 4005c0 <__cxa_guard_acquire#plt>
40074c: test eax,eax
40074e: setne al
400751: test al,al
400753: je 400792 <f()+0x6e>
# // initialise x
400755: mov ebx,0x0
40075a: mov edi,0x601058
40075f: call 4007b2 <X::X()>
# __cxa_guard_release(&guard);
400764: mov edi,0x601050
400769: call 4005e0 <__cxa_guard_release#plt>
# } else {
40076e: jmp 400792 <f()+0x6e>
# // already initialised
400770: mov r12d,edx
400773: mov r13,rax
400776: test bl,bl
400778: jne 400784 <f()+0x60>
40077a: mov edi,0x601050
40077f: call 4005f0 <__cxa_guard_abort#plt>
400784: mov rax,r13
400787: movsxd rdx,r12d
40078a: mov rdi,rax
40078d: 400610 <_Unwind_Resume#plt>
# }
# }
# ext2(x);
400792: mov edi,0x601058
400797: call 4007d1 <_Z4ext2R1X>
#}
As far as I know it is guaranteed that b is only called once. However, it is not guaranteed that the initialisation is performed thread safe, which means another thread could potentially work with a half/not initialized x. (That's kind of funny because static mutexes are basicly useless this way.)