In my server module, the log4cxx library sometimes causes a crash.
It's because ...
// Returns the TRACE level object, creating it the first time this function runs.
// The object lives in a function-local static, so the one-time initialization
// is only as thread-safe as the compiler's handling of local statics.
LevelPtr Level::getTrace() {
// Initialized once, when control first passes through this declaration.
static LevelPtr level(new Level(Level::TRACE_INT, LOG4CXX_STR("TRACE"), 7));
return level;
}
the static LevelPtr is sometimes returned as a null pointer.
I tested following code.
// Start signal: thread_proc() spins while this is 0 and main() sets it to 1.
// NOTE(review): a plain int shared between threads with no synchronization.
int start_flag = 0;
// Minimal probe type: get_p() hands out one heap-allocated instance that is
// created lazily through a function-local static pointer.
class test_dummy {
public:
    int mi;

    // Sets mi to 1 and announces the construction on stdout.
    test_dummy()
        : mi(1)
    {
        std::cout << "hey!\n";
    }

    // Returns the shared instance. The allocation happens when the local
    // static is initialized; the object is never freed.
    static test_dummy* get_p()
    {
        static test_dummy* shared_instance = new test_dummy();
        return shared_instance;
    }
};
// Thread body: busy-waits for the start signal, then reports what
// test_dummy::get_p() hands back.
void thread_proc()
{
    // Spin until start_flag becomes non-zero; the counter only gives the
    // loop something to do.
    for (int spins = 0; start_flag == 0; ++spins)
    {
    }

    if (test_dummy::get_p() != 0)
    {
        // get_p() is called again here rather than reusing the first result.
        std::cout << "mi:" << test_dummy::get_p()->mi << "\n";
        return;
    }
    std::cout << "error!!!\n";
}
// Launches the worker threads, releases them all at once through start_flag,
// waits for every worker to finish, then blocks on stdin so the console
// output stays visible.
int main()  // standard C++ requires main to return int
{
    const int kThreadCount = 5;
    boost::thread *pth_array[kThreadCount] = {0,};
    for (int i = 0; i < kThreadCount; i++)
    {
        pth_array[i] = new boost::thread(thread_proc);
    }
    // All workers are spinning on start_flag; this lets them run together.
    start_flag = 1;
    for (int i = 0; i < kThreadCount; i++)
    {
        pth_array[i]->join();
        // Release the heap-allocated thread object once it has been joined.
        delete pth_array[i];
        pth_array[i] = 0;
    }
    std::cin.ignore();
    return 0;
}
The code is clearly not thread-safe, but I'm curious why get_p() returns a null pointer rather than a second, separately allocated address.
Is it because the pointer is still 0 while another thread is in the middle of the new operation?
You have a race condition in this code that the compiler provides:
// Conceptual expansion of the function-local static, with no locking.
if (!level_initialized)
{
// The flag is raised before the pointer is stored, so a second thread that
// arrives now skips this block and returns the still-null level.
level_initialized = 1;
level = new Level(...);
}
return level;
(It doesn't look EXACTLY like that - it's more complex, but I think you get the general idea)
In clang++ 3.5, it seems like there are locks to prevent this sort of race, but without actually looking at the code generated by your compiler, it's impossible to say exactly what is going on. But I suspect that this is what happens.
Here's what clang++ 3.5 generates (minus some clutter)
_Z8getTracev: # #_Z8getTracev
pushq %rbp
movq %rsp, %rbp
subq $32, %rsp
cmpb $0, _ZGVZ8getTracevE5level # Guard variable
jne .LBB0_4
leaq _ZGVZ8getTracevE5level, %rdi
callq __cxa_guard_acquire
cmpl $0, %eax
je .LBB0_4
.Ltmp0:
movl $4, %eax
movl %eax, %edi
callq _Znwm # new
.Ltmp1:
movq %rax, -24(%rbp) # 8-byte Spill
jmp .LBB0_3
.LBB0_3: # %invoke.cont
leaq _ZGVZ8getTracevE5level, %rdi
movq -24(%rbp), %rax # 8-byte Reload
movq -24(%rbp), %rcx # 8-byte Reload
movl $0, (%rcx)
movq %rax, _ZZ8getTracevE5level
callq __cxa_guard_release
.LBB0_4: # %init.end
movq _ZZ8getTracevE5level, %rax
addq $32, %rsp
popq %rbp
retq
I modified the code to use Level as an int, etc, so it's simpler than the code you'd get from exactly the code you posted.
It's hard to say much, since the code clearly has undefined
behavior, but the standard does require level to be
initialized to a null pointer before get_p is called. And in
order to ensure that the local static is initialized exactly
once, the compiler more or less has to add an extra flag;
something like:
// Zero-initialized before any call; this is the null a racing thread can observe.
static test_dummy* _p = nullptr;
// Extra "already initialized" flag of the kind a compiler has to add.
static bool isInitialized = false;
if ( !isInitialized ) {
// Without synchronization, another thread may observe these two writes in
// either order.
_p = new test_dummy();
isInitialized = true;
}
(In fact, of course, the initializations shown above are the
zero initialization , which occurs before anything else. And
a clever compiler could realize that the explicit initialization
which occurs the first time through cannot result in _p being
a null pointer, and use _p as the control variable.)
The above isn't thread safe; in order to make it thread safe,
the entire sequence must be protected. (There are also more or
less complicated tricks to avoid the need for a full mutex, but
in all cases, all accesses to isInitialized must be atomic.)
If the sequence isn't protected, the order another thread sees
the writes isn't defined. So some thread is seeing
isInitialized as true, but still seeing the null pointer in
_p.
Related
I need to inject a dynamic library into a process exclusively targeting the x86_64 instruction set. My host architecture is aarch64.
I attempted injection using the following C++ code...
// Returns x from the enclosing function when the status in `kr` is not
// KERN_SUCCESS, after printing that status to stdout. Relies on a variable
// named `kr` being in scope where the macro is expanded.
// NOTE(review): expands to a bare `if` block plus a stray `;`, so it is not
// safe as the body of an unbraced if/else.
#define CHKERR(x) if (kr != KERN_SUCCESS) {std::cout << kr << std::endl; return x;};
// Size in bytes of the stack region allocated in the target task.
#define STACK_SIZE 0x1000
// Byte offsets into asm_instructions of the 8-byte immediates that are patched
// with addresses at run time (one per `mov reg, imm64` placeholder).
#define asm_pthread_offset 6
#define asm_dylib_offset 19
#define asm_dlopen_offset 39
#define asm_mach_thread_self_offset 51
#define asm_thread_suspend_offset 66
// Loads the dynamic library at dylib_path into the process identified by pid.
//
// Steps: obtain the target's task port, copy the library path into the target,
// allocate a stack and a code region there, patch a small x86_64 stub with the
// addresses it must call, write the stub, and start a new thread in the target
// that executes it.
//
// Returns INJECT_SUCCESS when the remote thread was created, otherwise the
// INJECT_ERROR_* code of the first step that failed.
//
// NOTE(review): every CHKERR exit after task_for_pid succeeds returns without
// mach_port_deallocate(remoteTask); only the success path releases the port.
inject_result inject_dylib(int pid, const char *dylib_path) {
task_t remoteTask;
struct stat buf;
// check if the dynamic library exists...
int check = stat(dylib_path, &buf);
if (check != 0)
return INJECT_ERROR_NOT_FOUND;
mach_error_t kr = 0;
// request the task port of the target process...
kr = task_for_pid(mach_task_self(), pid, &remoteTask);
CHKERR(INJECT_ERROR_MACH_TASK);
// allocate space for library path in the task
mach_vm_address_t dylib_address;
kr = mach_vm_allocate(remoteTask, &dylib_address, strlen(dylib_path) + 1, 1);
CHKERR(INJECT_ERROR_GENERIC)
// write library path into the task
kr = mach_vm_write(remoteTask, dylib_address, (vm_offset_t)dylib_path, strlen(dylib_path)+1);
CHKERR(INJECT_ERROR_GENERIC)
// allocate the region the remote thread will use as its stack
mach_vm_address_t stack_address;
kr = mach_vm_allocate(remoteTask, &stack_address, STACK_SIZE, 1);
CHKERR(INJECT_ERROR_STACK_ALLOC)
// x86_64 stub executed by the remote thread. The zeroed 8-byte immediates are
// placeholders that are overwritten below at the asm_*_offset positions.
unsigned char asm_instructions[ 100 ] =
"\x55" // push %rbp
"\x48\x89\xe5" // mov %rbp, %rsp
"\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00" // mov %rax, _pthread_set_self
"\xff\xd0" // call %rax
"\x5d" // pop %rbp
"\x48\xbf\x00\x00\x00\x00\x00\x00\x00\x00" // mov %rdi, dylib_address
"\x48\xbe\x02\x00\x00\x00\x00\x00\x00\x00" // mov %rsi, 2
"\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00" // mov %rax, dlopen
"\xff\xd0" // call %rax
"\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00" // mov %rax, mach_thread_self
"\xff\xd0" // call %rax
"\x48\x89\xc7" // mov %rdi, %rax
"\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00" // mov %rax, thread_suspend
"\xff\xd0" // call %rax
;
// allocate space for assembly instructions...
mach_vm_address_t code_address;
kr = mach_vm_allocate(remoteTask, &code_address, sizeof(asm_instructions), 1);
CHKERR(INJECT_ERROR_CODE_ALLOC)
// set some values in our assembly instructions...
// NOTE(review): these are addresses as seen by THIS process; the stub assumes
// the same functions sit at the same addresses in the target - confirm.
// NOTE(review): the dlsym result is not checked for NULL before it is patched in.
mach_vm_address_t pthread_set_self_address = (mach_vm_address_t) dlsym(RTLD_DEFAULT, "_pthread_set_self");
mach_vm_address_t mach_thread_self_address = (mach_vm_address_t) mach_thread_self;
mach_vm_address_t thread_suspend_address = (mach_vm_address_t) thread_suspend;
mach_vm_address_t dlopen_address = (mach_vm_address_t) dlopen;
memcpy(&asm_instructions[asm_pthread_offset], &pthread_set_self_address, sizeof(mach_vm_address_t));
memcpy(&asm_instructions[asm_dylib_offset], &dylib_address, sizeof(mach_vm_address_t));
memcpy(&asm_instructions[asm_dlopen_offset], &dlopen_address, sizeof(mach_vm_address_t));
memcpy(&asm_instructions[asm_mach_thread_self_offset], &mach_thread_self_address, sizeof(mach_vm_address_t));
memcpy(&asm_instructions[asm_thread_suspend_offset], &thread_suspend_address, sizeof(mach_vm_address_t));
// copy the patched stub into the target, then make the region read + execute
kr = mach_vm_write(remoteTask, code_address, (vm_offset_t)asm_instructions, sizeof(asm_instructions));
CHKERR(INJECT_ERROR_GENERIC)
kr = mach_vm_protect(remoteTask, code_address, sizeof(asm_instructions), 0, VM_PROT_EXECUTE | VM_PROT_READ);
CHKERR(INJECT_ERROR_GENERIC)
// create thread, set registers, and start
thread_t thread = {0};
x86_thread_state64_t thread_state = {0};
thread_state.__rip = code_address;
thread_state.__rdi = stack_address;
// NOTE(review): rsp/rbp are set to the lowest address of the stack region; if
// the stack grows downwards, the stub's first push lands below the
// allocation - confirm whether an offset into the region was intended.
thread_state.__rsp = stack_address;
thread_state.__rbp = stack_address;
kr = thread_create_running(remoteTask, x86_THREAD_STATE64, (thread_state_t) &thread_state, x86_THREAD_STATE64_COUNT, &thread);
CHKERR(INJECT_ERROR_CREATE_THREAD)
mach_port_deallocate(mach_task_self(), remoteTask);
return INJECT_SUCCESS;
}
A problem occurs at the call to thread_create_running, which consistently returns error 4 (KERN_INVALID_ARGUMENT). This is because the arm64 version of the XNU kernel does not support x86_THREAD_STATE64 as a thread state flavor.
I confirmed this as the issue by digging through the kernel source. Where you can see that x86_THREAD_STATE64 is not included in any switch case, and it defaults to KERN_INVALID_ARGUMENT.
Are there any compatible alternatives to this function or method of injection?
I have a C++ function which has many return statements at various places. How to set a breakpoint at the return statement where the function actually returns ?
And what does "break" command without argument means?
Contrary to answers so far, most compilers will create a single return assembly instruction, regardless of how many return statements are in the function (it is convenient for the compiler to do that, so there is only a single place to perform all the stack frame cleanup).
If you wanted to stop on that instruction, all you have to do is disas and look for retq (or whatever the return instruction for your processor is), and set a breakpoint on it. For example:
// Maps 1 -> 2 and 2 -> 3; every other input yields 42.
// Deliberately written with several return statements.
int foo(int x)
{
    if (x == 1) {
        return 2;
    }
    if (x == 2) {
        return 3;
    }
    return 42;
}
// Uses foo's result for the input 0 as the process exit status.
int main()
{
    const int exit_status = foo(0);
    return exit_status;
}
(gdb) disas foo
Dump of assembler code for function foo:
0x0000000000400448 <+0>: push %rbp
0x0000000000400449 <+1>: mov %rsp,%rbp
0x000000000040044c <+4>: mov %edi,-0x4(%rbp)
0x000000000040044f <+7>: mov -0x4(%rbp),%eax
0x0000000000400452 <+10>: mov %eax,-0xc(%rbp)
0x0000000000400455 <+13>: cmpl $0x1,-0xc(%rbp)
0x0000000000400459 <+17>: je 0x400463 <foo+27>
0x000000000040045b <+19>: cmpl $0x2,-0xc(%rbp)
0x000000000040045f <+23>: je 0x40046c <foo+36>
0x0000000000400461 <+25>: jmp 0x400475 <foo+45>
0x0000000000400463 <+27>: movl $0x2,-0x8(%rbp)
0x000000000040046a <+34>: jmp 0x40047c <foo+52>
0x000000000040046c <+36>: movl $0x3,-0x8(%rbp)
0x0000000000400473 <+43>: jmp 0x40047c <foo+52>
0x0000000000400475 <+45>: movl $0x2a,-0x8(%rbp)
0x000000000040047c <+52>: mov -0x8(%rbp),%eax
0x000000000040047f <+55>: leaveq
0x0000000000400480 <+56>: retq
End of assembler dump.
(gdb) b *0x0000000000400480
Breakpoint 1 at 0x400480
(gdb) r
Breakpoint 1, 0x0000000000400480 in foo ()
(gdb) p $rax
$1 = 42
You can use reverse debugging to find out where the function actually returns. Finish executing the current frame, then do a reverse-step; you should stop at the return statement that was just executed.
(gdb) record
(gdb) fin
(gdb) reverse-step
Break on all retq of current function
This Python command puts a breakpoint on every retq instruction of the current function:
class BreakReturn(gdb.Command):
    """GDB command ``break-return``.

    Sets a breakpoint on every return instruction of the function that
    contains the currently selected frame. Requires debugging information,
    because the function's address range is taken from its symbol block.
    """

    # Instruction prefixes that may precede a return (e.g. "repz retq").
    _RETURN_PREFIXES = ('rep', 'repz', 'repe', 'bnd')
    # Return mnemonics: older disassemblers print "retq", newer ones "ret".
    _RETURN_MNEMONICS = ('ret', 'retq')

    def __init__(self):
        super().__init__(
            'break-return',
            gdb.COMMAND_RUNNING,
            gdb.COMPLETE_NONE,
            False
        )

    @classmethod
    def _is_return(cls, asm):
        """Return True when the disassembly text ``asm`` is a return instruction."""
        tokens = asm.split()
        while tokens and tokens[0] in cls._RETURN_PREFIXES:
            tokens = tokens[1:]
        return bool(tokens) and tokens[0] in cls._RETURN_MNEMONICS

    def invoke(self, arg, from_tty):
        frame = gdb.selected_frame()
        # TODO make this work if there is no debugging information, where .block() fails.
        block = frame.block()
        # Find the function block in case we are in an inner block.
        while block:
            if block.function:
                break
            block = block.superblock
        if block is None:
            # Walked past the outermost block without finding a function.
            raise gdb.GdbError('break-return: no function block for the selected frame')
        start = block.start
        end = block.end
        arch = frame.architecture()
        # block.end is one past the last byte; disassemble() takes an inclusive end.
        instructions = arch.disassemble(start, end - 1)
        for instruction in instructions:
            if self._is_return(instruction['asm']):
                gdb.Breakpoint('*{}'.format(instruction['addr']))


BreakReturn()
Source it with:
source gdb.py
and use the command as:
break-return
continue
You should now be at retq.
Step until retq
Just for fun, another implementation that single-steps and stops when a retq is found (less efficient because there is no hardware support):
class ContinueReturn(gdb.Command):
    """GDB command ``continue-return``.

    Single-steps the inferior one instruction at a time (``ni``) and stops as
    soon as the instruction at the program counter is a return.
    """

    # Instruction prefixes that may precede a return (e.g. "repz retq").
    _RETURN_PREFIXES = ('rep', 'repz', 'repe', 'bnd')
    # Return mnemonics: older disassemblers print "retq", newer ones "ret".
    _RETURN_MNEMONICS = ('ret', 'retq')

    def __init__(self):
        super().__init__(
            'continue-return',
            gdb.COMMAND_RUNNING,
            gdb.COMPLETE_NONE,
            False
        )

    @classmethod
    def _is_return(cls, asm):
        """Return True when the disassembly text ``asm`` is a return instruction."""
        tokens = asm.split()
        while tokens and tokens[0] in cls._RETURN_PREFIXES:
            tokens = tokens[1:]
        return bool(tokens) and tokens[0] in cls._RETURN_MNEMONICS

    def invoke(self, arg, from_tty):
        # Only the first thread of the first inferior is watched.
        thread = gdb.inferiors()[0].threads()[0]
        while thread.is_valid():
            # to_string=True keeps the per-step output off the console.
            gdb.execute('ni', to_string=True)
            frame = gdb.selected_frame()
            arch = frame.architecture()
            pc = frame.pc()
            instruction = arch.disassemble(pc)[0]['asm']
            if self._is_return(instruction):
                break


ContinueReturn()
This will ignore your other breakpoints. TODO: can be avoided?
Not sure if it is faster or slower than reverse-step.
A version that stops at a given opcode can be found at: https://stackoverflow.com/a/31249378/895245
break without arguments stops execution at the next instruction in the currently selected stack frame. You select stack frames via the frame, up, and down commands. If you want to debug the point where you are actually leaving the current function, select the next outer frame and break there.
rr reverse debugging
Similar to GDB record mentioned at https://stackoverflow.com/a/3649698/895245 , but much more functional as of GDB 7.11 vs rr 4.1.0 in Ubuntu 16.04.
Notably, it deals with AVX correctly:
gdb reverse debugging fails with "Process record does not support instruction 0xf0d at address"
"target record-full" in gdb makes "n" command fail on printf with "Process record does not support instruction 0xc5 at address 0x7ffff7dee6e7"?
which prevents it from working with the default standard library calls.
Install on Ubuntu 16.04:
sudo apt-get install rr linux-tools-common linux-tools-generic linux-cloud-tools-generic
sudo cpupower frequency-set -g performance
But also consider compiling from source to get the latest updates, it was not hard.
Test program:
int where_return(int i) { /* Returns 1 when i is non-zero, otherwise 0; each outcome has its own return statement. */
if (i)
return 1; /* taken when i != 0 */
else
return 0; /* taken when i == 0 */
}
int main(void) {
where_return(0);
where_return(1);
}
compile and run:
gcc -O0 -ggdb3 -o reverse.out -std=c89 -Wextra reverse.c
rr record ./reverse.out
rr replay
Now you are left inside a GDB session, and you can properly reverse debug:
(rr) break main
Breakpoint 1 at 0x56057c458619: file a.c, line 9.
(rr) continue
Continuing.
Breakpoint 1, main () at a.c:9
9 where_return(0);
(rr) step
where_return (i=0) at a.c:2
2 if (i)
(rr) finish
Run till exit from #0 where_return (i=0) at a.c:2
main () at a.c:10
10 where_return(1);
Value returned is $1 = 0
(rr) reverse-step
where_return (i=0) at a.c:6
6 }
(rr) reverse-step
5 return 0;
We are now on the correct return line.
If you can change the source code, you might use some dirty trick with the preprocessor:
/* Empty hook: exists only so that a breakpoint can be placed on it. */
void on_return() {
}
/* Use exactly ONE of the two definitions below, chosen by the return type of
   the function being wrapped; together they would redefine the same macro. */
#define return return on_return(), /* If the function has a return value != void */
#define return return on_return() /* If the function has a return value == void */
/* <<<-- Insert your function here -->>> */
/* Restore the normal meaning of `return` for the rest of the file. */
#undef return
Then set a breakpoint to on_return and go one frame up.
Attention: This will not work if a function does not return via a return statement, so ensure that its last line is a return.
Example (shamelessly copied from C code, but will work also in C++):
#include <stdio.h>
/* Dummy function to place the breakpoint on. It is intentionally empty; the
   redefined `return` invokes it just before a wrapped function returns. */
void on_return(void) {
}
#define return return on_return()
void myfun1(int a) {
if (a > 10) return;
printf("<10\n");
return;
}
#undef return
/* While this macro is active, `return expr;` expands to
   `return on_return(), expr;` - the comma operator calls on_return() first
   and then yields expr as the returned value. */
#define return return on_return(),
/* Returns -1 for negative a, 1 for positive a, and 0 when a is zero. */
int myfun2(int a) {
if (a < 0) return -1;
if (a > 0) return 1;
return 0;
}
#undef return
/* Calls each wrapped function once. `return` is not redefined at this point,
   so main itself does not go through on_return(). */
int main(void)
{
myfun1(1);
myfun2(2);
}
The first macro will change
return;
to
return on_return();
Which is valid, since on_return also returns void.
The second macro will change
return -1;
to
return on_return(), -1;
Which will call on_return() and then return -1 (thanks to the ,-operator).
This is a very dirty trick, but unlike backwards-stepping, it also works in multi-threaded environments and with inlined functions.
Break without argument sets a breakpoint at the current line.
There is no way for a single breakpoint to catch all return paths. Either set a breakpoint at the caller immediately after it returns, or break at all return statements.
Since this is C++, I suppose you could create a local sentry object, and break on its destructor, though.
We know that local static variable initialization is thread-safe in C++11, and modern compilers fully support this. (Is local static variable initialization thread-safe in C++11?)
What is the cost of making it thread-safe? I understand that this could very well be compiler implementation dependent.
Context: I have a multi-threaded application (10 threads) accessing a singleton object pool instance via the following function at very high rates, and I'm concerned about its performance implications.
// Returns the address of the single ObjectPool<T> shared by all callers.
// The pool is a function-local static, so it is constructed the first time
// control reaches its declaration.
template <class T>
ObjectPool<T>* ObjectPool<T>::GetInst()
{
    static ObjectPool<T> pool;
    ObjectPool<T>* const handle = &pool;
    return handle;
}
A look at the generated assembler code helps.
Source
#include <vector>
// Returns a reference to a vector held in a function-local static, so the
// vector is constructed on the first call to get() rather than up front.
std::vector<int> &get(){
static std::vector<int> v;
return v;
}
// Uses the vector's element count as the process exit status.
int main(){
return get().size();
}
Assembler
std::vector<int, std::allocator<int> >::~vector():
movq (%rdi), %rdi
testq %rdi, %rdi
je .L1
jmp operator delete(void*)
.L1:
rep ret
get():
movzbl guard variable for get()::v(%rip), %eax
testb %al, %al
je .L15
movl get()::v, %eax
ret
.L15:
subq $8, %rsp
movl guard variable for get()::v, %edi
call __cxa_guard_acquire
testl %eax, %eax
je .L6
movl guard variable for get()::v, %edi
movq $0, get()::v(%rip)
movq $0, get()::v+8(%rip)
movq $0, get()::v+16(%rip)
call __cxa_guard_release
movl $__dso_handle, %edx
movl get()::v, %esi
movl std::vector<int, std::allocator<int> >::~vector(), %edi
call __cxa_atexit
.L6:
movl get()::v, %eax
addq $8, %rsp
ret
main:
subq $8, %rsp
call get()
movq 8(%rax), %rdx
subq (%rax), %rdx
addq $8, %rsp
movq %rdx, %rax
sarq $2, %rax
ret
Compared to
Source
#include <vector>
// Namespace-scope vector with internal linkage; it is not created lazily
// inside get(), so get() contains no first-call check.
static std::vector<int> v;
// Returns a reference to the namespace-scope vector.
std::vector<int> &get(){
return v;
}
// Uses the vector's element count as the process exit status.
int main(){
return get().size();
}
Assembler
std::vector<int, std::allocator<int> >::~vector():
movq (%rdi), %rdi
testq %rdi, %rdi
je .L1
jmp operator delete(void*)
.L1:
rep ret
get():
movl v, %eax
ret
main:
movq v+8(%rip), %rax
subq v(%rip), %rax
sarq $2, %rax
ret
movl $__dso_handle, %edx
movl v, %esi
movl std::vector<int, std::allocator<int> >::~vector(), %edi
movq $0, v(%rip)
movq $0, v+8(%rip)
movq $0, v+16(%rip)
jmp __cxa_atexit
I'm not that great with assembler, but I can see that in the first version v has a lock around it and get is not inlined whereas in the second version get is essentially gone.
You can play around with various compilers and optimization flags, but it seems no compiler is able to inline or optimize out the locks, even though the program is obviously single threaded.
You can add static to get which makes gcc inline get while preserving the lock.
To know how much these locks and additional instructions cost for your compiler, flags, platform and surrounding code you would need to make a proper benchmark.
I would expect the locks to have some overhead and be significantly slower than the inlined code, which becomes insignificant when you actually do work with the vector, but you can never be sure without measuring.
From my experience, this is exactly as costly as a regular mutex (critical section). If the code is called very frequently, consider using a normal global variable instead.
Explained extensively here https://www.youtube.com/watch?v=B3WWsKFePiM by Jason Turner.
I put a sample code to illustrate the video. Since thread-safety is the main issue, I tried to call the method from multiple threads to see its effects.
You can think of it as the compiler implementing double-checked locking for you, although it is free to do whatever it wants to ensure thread-safety. At the very least it will add a branch to detect first-time initialization, unless the optimizer performs the initialization eagerly at global scope.
https://en.wikipedia.org/wiki/Double-checked_locking#Usage_in_C++11
#include <cstdio>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
// Compares two ways of reaching the same constant string: through a
// function-local static on every call, or through a reference cached once
// per object.
struct Temp
{
    // Init-at-first-use accessor. Each call has to establish whether `name`
    // has already been constructed, which costs at least an atomic load and
    // possibly a lock acquisition.
    static const std::string& name() {
        static const std::string name("name");
        return name;
    }

    // Bound once, when the Temp object is created, to the string from name().
    const std::string& ref_name = name();

    // Hands back the cached reference; no first-use check and no contention.
    // A profiler showed a small performance improvement.
    auto get_name_ref() const -> const std::string& {
        return ref_name;
    }
};
int main(int, char**)
{
Temp tmp;
constexpr int num_worker = 8;
std::vector<std::thread> threads;
for (int i = 0; i < num_worker; ++i) {
threads.emplace_back([&](){
for (int i = 0; i < 10000000; ++i) {
// name() is almost 5s slower
printf("%zu\n", tmp.get_name_ref().size());
}
});
}
for (int i = 0; i < num_worker; ++i) {
threads[i].join();
}
return 0;
}
The name() version is 5s slower than get_name_ref() on my machine.
$ time ./test > /dev/null
Also I used compiler explorer to see what gcc generates. Following proves double checking lock pattern: Pay attention to atomic loads and guards acquired.
name ()
{
bool retval.0;
bool retval.1;
bool D.25443;
struct allocator D.25437;
const struct string & D.29013;
static const struct string name;
_1 = __atomic_load_1 (&_ZGVZL4namevE4name, 2);
retval.0 = _1 == 0;
if (retval.0 != 0) goto <D.29003>; else goto <D.29004>;
<D.29003>:
_2 = __cxa_guard_acquire (&_ZGVZL4namevE4name);
retval.1 = _2 != 0;
if (retval.1 != 0) goto <D.29006>; else goto <D.29007>;
<D.29006>:
D.25443 = 0;
try
{
std::allocator<char>::allocator (&D.25437);
try
{
try
{
std::__cxx11::basic_string<char>::basic_string (&name, "name", &D.25437);
D.25443 = 1;
__cxa_guard_release (&_ZGVZL4namevE4name);
__cxa_atexit (__dt_comp , &name, &__dso_handle);
}
finally
{
std::allocator<char>::~allocator (&D.25437);
}
}
finally
{
D.25437 = {CLOBBER};
}
}
catch
{
if (D.25443 != 0) goto <D.29008>; else goto <D.29009>;
<D.29008>:
goto <D.29010>;
<D.29009>:
__cxa_guard_abort (&_ZGVZL4namevE4name);
<D.29010>:
}
goto <D.29011>;
<D.29007>:
<D.29011>:
goto <D.29012>;
<D.29004>:
<D.29012>:
D.29013 = &name;
return D.29013;
}
When I compile following code with gcc 6 -O3 -std=c++14, I get nice and empty main:
Dump of assembler code for function main():
0x00000000004003e0 <+0>: xor %eax,%eax
0x00000000004003e2 <+2>: retq
But uncommenting last line in main "breaks" optimization:
Dump of assembler code for function main():
0x00000000004005f0 <+0>: sub $0x78,%rsp
0x00000000004005f4 <+4>: lea 0x40(%rsp),%rdi
0x00000000004005f9 <+9>: movq $0x400838,0x10(%rsp)
0x0000000000400602 <+18>: movb $0x0,0x18(%rsp)
0x0000000000400607 <+23>: mov %fs:0x28,%rax
0x0000000000400610 <+32>: mov %rax,0x68(%rsp)
0x0000000000400615 <+37>: xor %eax,%eax
0x0000000000400617 <+39>: movl $0x0,(%rsp)
0x000000000040061e <+46>: movq $0x400838,0x30(%rsp)
0x0000000000400627 <+55>: movb $0x0,0x38(%rsp)
0x000000000040062c <+60>: movl $0x0,0x20(%rsp)
0x0000000000400634 <+68>: movq $0x400838,0x50(%rsp)
0x000000000040063d <+77>: movb $0x0,0x58(%rsp)
0x0000000000400642 <+82>: movl $0x0,0x40(%rsp)
0x000000000040064a <+90>: callq 0x400790 <ErasedObject::~ErasedObject()>
0x000000000040064f <+95>: lea 0x20(%rsp),%rdi
0x0000000000400654 <+100>: callq 0x400790 <ErasedObject::~ErasedObject()>
0x0000000000400659 <+105>: mov %rsp,%rdi
0x000000000040065c <+108>: callq 0x400790 <ErasedObject::~ErasedObject()>
0x0000000000400661 <+113>: mov 0x68(%rsp),%rdx
0x0000000000400666 <+118>: xor %fs:0x28,%rdx
0x000000000040066f <+127>: jne 0x400678 <main()+136>
0x0000000000400671 <+129>: xor %eax,%eax
0x0000000000400673 <+131>: add $0x78,%rsp
0x0000000000400677 <+135>: retq
0x0000000000400678 <+136>: callq 0x4005c0 <__stack_chk_fail#plt>
Code
#include <type_traits>
#include <new>
namespace
{
// Per-type table of type-erased operations; it holds only the destructor.
struct ErasedTypeVTable
{
// Signature of a function that destroys the object stored at obj.
using destructor_t = void (*)(void *obj);
destructor_t dtor;
};
// Runs T's destructor on the object at obj; the memory itself is not freed.
template <typename T>
void dtor(void *obj)
{
return static_cast<T *>(obj)->~T();
}
// One constant table per erased type T, pointing at dtor<T>.
template <typename T>
static const ErasedTypeVTable erasedTypeVTable = {
&dtor<T>
};
}
// Stores a small object of arbitrary type in an inline buffer and destroys it
// through the per-type table referenced by vtbl.
struct ErasedObject
{
// Raw inline buffer, sized for one pointer, in which the erased object lives.
std::aligned_storage<sizeof(void *)>::type storage;
// Table for the stored object's decayed type.
const ErasedTypeVTable& vtbl;
// Selects the destructor branch; nothing in this struct sets it to true.
bool flag = false;
// Placement-constructs a decayed copy of obj inside storage.
// NOTE(review): as an unconstrained forwarding constructor this is also a
// candidate when the argument is a non-const ErasedObject lvalue - confirm
// that is intended.
template <typename T, typename S = typename std::decay<T>::type>
ErasedObject(T&& obj)
: vtbl(erasedTypeVTable<S>)
{
// Compile-time check that the object fits the buffer's size and alignment.
static_assert(sizeof(T) <= sizeof(storage) && alignof(T) <= alignof(decltype(storage)), "");
new (object()) S(std::forward<T>(obj));
}
// NOTE(review): the defaulted move copies the members as they are and the
// source object still runs its destructor - confirm this is acceptable for
// every stored type.
ErasedObject(ErasedObject&& other) = default;
~ErasedObject()
{
if (flag)
{
// NOTE(review): object() points into this object's own storage member, not
// at memory obtained from operator new - confirm this branch is meant to be
// unreachable.
::operator delete(object());
}
else
{
// Destroy the stored object in place through its type's table.
vtbl.dtor(object());
}
}
// Address of the inline buffer as an untyped pointer.
void *object()
{
return reinterpret_cast<char *>(&storage);
}
};
// Trivial payload type stored inside ErasedObject by main().
struct myType
{
int a;
};
// Creates ErasedObject instances that are destroyed again at scope exit.
// The commented-out third object is the line whose presence changes what the
// optimizer produces.
int main()
{
ErasedObject c1(myType{});
ErasedObject c2(myType{});
//ErasedObject c3(myType{});
}
clang can optimize-out both versions.
Any ideas what's going on? Am I hitting some optimization limit? If so, is it configurable?
I ran g++ with -fdump-ipa-inline to get more information about why functions are or are not inlined.
For the testcase with main() function and three objects created I got:
(...)
150 Deciding on inlining of small functions. Starting with size 35.
151 Enqueueing calls in void {anonymous}::dtor(void*) [with T = myType]/40.
152 Enqueueing calls in int main()/35.
153 not inlinable: int main()/35 -> ErasedObject::~ErasedObject()/33, call is unlikely and code size would grow
154 not inlinable: int main()/35 -> ErasedObject::~ErasedObject()/33, call is unlikely and code size would grow
155 not inlinable: int main()/35 -> ErasedObject::~ErasedObject()/33, call is unlikely and code size would grow
(...)
This error code is set in gcc/gcc/ipa-inline.c:
else if (!e->maybe_hot_p ()
&& (growth >= MAX_INLINE_INSNS_SINGLE
|| growth_likely_positive (callee, growth)))
{
e->inline_failed = CIF_UNLIKELY_CALL;
want_inline = false;
}
Then I discovered, that the smallest change to make g++ inline these functions is to add a declaration:
int main() __attribute__((hot));
I wasn't able to find in code why int main() isn't considered hot, but probably this should be left for another question.
More interesting is the second part of the conditional I pasted above. The intent was to avoid inlining when the code would grow, but you produced an example where the code shrinks after complete inlining.
I think this deserves to be reported on GCC's bugzilla, but I'm not sure if you can call it a bug - estimation of inline impact is a heuristic and as such it is expected to work correctly in most cases, not all of them.
When using C++ exceptions to transport errno state, the compiled code that gets generated by g++ (4.5.3) for code such as the following
#include <cerrno>
#include <stdexcept>
#include <string>
// Exception type that carries an OS error number, rendered into the
// runtime_error message by errnotostr().
class oserror : public std::runtime_error {
    // Converts an errno value into the message text. Only declared here.
    static std::string errnotostr(int errno_);

public:
    // explicit, so an int never converts silently into an oserror.
    explicit oserror(int errno_)
        : std::runtime_error(errnotostr(errno_))
    {
    }
};
// Throws an oserror built from the current value of errno.
// NOTE(review): errno is read inside the throw-expression itself; copy it
// into a local variable first if it must be captured before the
// implementation sets up the exception object.
void test() {
throw oserror(errno);
}
is rather unexpectedly (on Linux, x86_64)
.type _Z4testv, #function
...
movl $16, %edi
call __cxa_allocate_exception
movq %rax, %rbx
movq %rbx, %r12
call __errno_location
movl (%rax), %eax
movl %eax, %esi
movq %r12, %rdi
call _ZN7oserrorC1Ei
What this basically means is that errno as an argument to a C++ exception is pretty much useless due to the call to __cxa_allocate_exception preceding the call to __errno_location (which is the macro content of errno), where the former calls std::malloc and does not save errno state (at least as far as I understood the sources of __cxa_allocate_exception in eh_alloc.cc of libstdc++).
This means that in the case that memory allocation fails, the error number that was actually to be passed into the exception object gets overwritten with the error number that std::malloc set up. std::malloc gives no guarantee to save an existing errno state, anyway, even in the case of successful exit - so the above code is definitely broken in the general case.
On Cygwin, x86, the code that gets compiled (also using g++ 4.5.3) for test() is okay, though:
.def __Z4testv; .scl 2; .type 32; .endef
...
call ___errno
movl (%eax), %esi
movl $8, (%esp)
call ___cxa_allocate_exception
movl %eax, %ebx
movl %ebx, %eax
movl %esi, 4(%esp)
movl %eax, (%esp)
call __ZN7oserrorC1Ei
Does this mean that for library code to properly wrap errno state in an exception, I'll always have to use a macro which expands to something like
int curerrno_ = errno;
throw oserror(curerrno_);
I actually can't seem to find the corresponding section of the C++ standard which says anything about evaluation order in the case of exceptions, but to me it seems that the g++ generated code on x86_64 (on Linux) is broken due to allocating memory for the exception object before collecting the parameters for its constructor, and that this is a compiler bug in some way. Am I right, or is this some fundamentally wrong thinking on my part?
What this basically means is that errno as an argument to a C++ exception is pretty much useless due to the call to __cxa_allocate_exception preceding the call to __errno_location (which is the macro content of errno), where the former calls std::malloc and does not save errno state (at least as far as I understood the sources of __cxa_allocate_exception in eh_alloc.cc of libstdc++).
This is not true. As far as I have checked the source code, the only "thing" inside __cxa_allocate_exception that can change errno is malloc(). Two cases may occur:
malloc() succeeds, then errno is unchanged;
malloc() fails, then std::terminate() is called and your oserror() is never constructed.
Therefore, since calling __cxa_allocate_exception before calling your constructor does not functionally change your program, I believe g++ has the right to do so.
Please note that __cxa_allocate_exception is done before your constructor is actually called.
32:std_errno.cpp **** throw oserror( errno );
352 0007 BF100000 movl $16, %edi
;;;; Exception space allocation:
355 000c E8000000 call __cxa_allocate_exception
356 0011 4889C3 movq %rax, %rbx
;;;; "errno" evaluation:
357 0014 E8000000 call __errno_location
358 0019 8B00 movl (%rax), %eax
359 001b 89C6 movl %eax, %esi
360 001d 4889DF movq %rbx, %rdi
;;;; Constructor called here:
362 0020 E8000000 call _ZN7oserrorC1Ei
So it makes sense. __cxa_allocate_exception just allocates space to an exception, but does not construct it (libc++abi Specification).
When your exception object is built, errno is already evaluated.
I took your example and implemented errnotostr:
// Unexpected flow of control (compiler-bug?) using errno as argument for exception in C++ (g++)
#include <cerrno>
#include <stdexcept>
#include <string>
#include <iostream>
#include <cstring>
#include <sstream>
// Exception wrapping an OS error number; what() reads
// "[<number>] <std::strerror text>".
class oserror : public std::runtime_error
{
    // Formats errno_ as "[N] " followed by its std::strerror description.
    static std::string errnotostr(int errno_)
    {
        std::ostringstream message;
        message << "[";
        message << errno_;
        message << "] ";
        message << std::strerror( errno_ );
        return message.str( );
    }

public:
    // explicit, so an int never converts silently into an oserror.
    explicit oserror( int errno_ )
        : std::runtime_error( errnotostr( errno_ ) )
    {
    }
};
// Throws an oserror built from the current value of errno.
void test( )
{
throw oserror( errno );
}
int main( )
{
try
{
std::cout << "Enter a value to errno: ";
std::cin >> errno;
std::cout << "Test with errno = " << errno << std::endl;
test( );
}
catch ( oserror &o )
{
std::cout << "Exception caught: " << o.what( ) << std::endl;
return 1;
}
return 0;
}
Then I compiled with -O0 and -O2, run and got the same results, all according to expectations:
> ./std_errno
Enter a value to errno: 1
Test with errno = 1
Exception caught: [1] Operation not permitted
> ./std_errno
Enter a value to errno: 11
Test with errno = 11
Exception caught: [11] Resource temporarily unavailable
> ./std_errno
Enter a value to errno: 111
Test with errno = 111
Exception caught: [111] Connection refused
(Running on 64-bits Opensuse 12.1, G++ 4.6.2)