Injecting a Dylib into Processes Running Under Rosetta - c++

I need to inject a dynamic library into a process exclusively targeting the x86_64 instruction set. My host architecture is aarch64.
I attempted injection using the following C++ code...
#define CHKERR(x) if (kr != KERN_SUCCESS) {std::cout << kr << std::endl; return x;};
#define STACK_SIZE 0x1000
#define asm_pthread_offset 6
#define asm_dylib_offset 19
#define asm_dlopen_offset 39
#define asm_mach_thread_self_offset 51
#define asm_thread_suspend_offset 66
inject_result inject_dylib(int pid, const char *dylib_path) {
    task_t remoteTask;
    struct stat buf;
    // check if the dynamic library exists...
    int check = stat(dylib_path, &buf);
    if (check != 0)
        return INJECT_ERROR_NOT_FOUND;
    mach_error_t kr = 0;
    // request the task port of the target process...
    kr = task_for_pid(mach_task_self(), pid, &remoteTask);
    CHKERR(INJECT_ERROR_MACH_TASK);
    // allocate space for library path in the task
    mach_vm_address_t dylib_address;
    kr = mach_vm_allocate(remoteTask, &dylib_address, strlen(dylib_path) + 1, 1);
    CHKERR(INJECT_ERROR_GENERIC)
    // write library path into the task
    kr = mach_vm_write(remoteTask, dylib_address, (vm_offset_t)dylib_path, strlen(dylib_path)+1);
    CHKERR(INJECT_ERROR_GENERIC)
    mach_vm_address_t stack_address;
    kr = mach_vm_allocate(remoteTask, &stack_address, STACK_SIZE, 1);
    CHKERR(INJECT_ERROR_STACK_ALLOC)
    unsigned char asm_instructions[ 100 ] =
        "\x55"                                      // push %rbp
        "\x48\x89\xe5"                              // mov %rbp, %rsp
        "\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00"  // mov %rax, _pthread_set_self
        "\xff\xd0"                                  // call %rax
        "\x5d"                                      // pop %rbp
        "\x48\xbf\x00\x00\x00\x00\x00\x00\x00\x00"  // mov %rdi, dylib_address
        "\x48\xbe\x02\x00\x00\x00\x00\x00\x00\x00"  // mov %rsi, 2
        "\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00"  // mov %rax, dlopen
        "\xff\xd0"                                  // call %rax
        "\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00"  // mov %rax, mach_thread_self
        "\xff\xd0"                                  // call %rax
        "\x48\x89\xc7"                              // mov %rdi, %rax
        "\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00"  // mov %rax, thread_suspend
        "\xff\xd0"                                  // call %rax
        ;
    // allocate space for assembly instructions...
    mach_vm_address_t code_address;
    kr = mach_vm_allocate(remoteTask, &code_address, sizeof(asm_instructions), 1);
    CHKERR(INJECT_ERROR_CODE_ALLOC)
    // set some values in our assembly instructions...
    mach_vm_address_t pthread_set_self_address = (mach_vm_address_t) dlsym(RTLD_DEFAULT, "_pthread_set_self");
    mach_vm_address_t mach_thread_self_address = (mach_vm_address_t) mach_thread_self;
    mach_vm_address_t thread_suspend_address = (mach_vm_address_t) thread_suspend;
    mach_vm_address_t dlopen_address = (mach_vm_address_t) dlopen;
    memcpy(&asm_instructions[asm_pthread_offset], &pthread_set_self_address, sizeof(mach_vm_address_t));
    memcpy(&asm_instructions[asm_dylib_offset], &dylib_address, sizeof(mach_vm_address_t));
    memcpy(&asm_instructions[asm_dlopen_offset], &dlopen_address, sizeof(mach_vm_address_t));
    memcpy(&asm_instructions[asm_mach_thread_self_offset], &mach_thread_self_address, sizeof(mach_vm_address_t));
    memcpy(&asm_instructions[asm_thread_suspend_offset], &thread_suspend_address, sizeof(mach_vm_address_t));
    kr = mach_vm_write(remoteTask, code_address, (vm_offset_t)asm_instructions, sizeof(asm_instructions));
    CHKERR(INJECT_ERROR_GENERIC)
    kr = mach_vm_protect(remoteTask, code_address, sizeof(asm_instructions), 0, VM_PROT_EXECUTE | VM_PROT_READ);
    CHKERR(INJECT_ERROR_GENERIC)
    // create thread, set registers, and start
    thread_t thread = {0};
    x86_thread_state64_t thread_state = {0};
    thread_state.__rip = code_address;
    thread_state.__rdi = stack_address;
    thread_state.__rsp = stack_address;
    thread_state.__rbp = stack_address;
    kr = thread_create_running(remoteTask, x86_THREAD_STATE64, (thread_state_t) &thread_state, x86_THREAD_STATE64_COUNT, &thread);
    CHKERR(INJECT_ERROR_CREATE_THREAD)
    mach_port_deallocate(mach_task_self(), remoteTask);
    return INJECT_SUCCESS;
}
The problem occurs at the call to thread_create_running, which consistently returns error 4 (KERN_INVALID_ARGUMENT). This is because the arm64 build of the XNU kernel does not support x86_THREAD_STATE64 as a thread state flavor.
I confirmed this by digging through the kernel source: x86_THREAD_STATE64 does not appear in any switch case, so the call falls through to the default and returns KERN_INVALID_ARGUMENT.
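A minimal sketch of just the failing call illustrates the claim (a hypothetical helper; it assumes you already hold the task port of a translated x86_64 target and that the x86 thread-state definitions are visible in your SDK, as in the code above):
#include <mach/mach.h>
#include <iostream>
// Hypothetical repro: on an arm64 kernel this returns 4 (KERN_INVALID_ARGUMENT)
// even when the target task is an x86_64 process running under Rosetta.
kern_return_t spawn_x86_thread(task_t remoteTask, mach_vm_address_t code_address, mach_vm_address_t stack_address)
{
    x86_thread_state64_t state = {};
    state.__rip = code_address;
    state.__rsp = stack_address;
    state.__rbp = stack_address;
    thread_act_t thread = MACH_PORT_NULL;
    kern_return_t kr = thread_create_running(remoteTask, x86_THREAD_STATE64, (thread_state_t)&state, x86_THREAD_STATE64_COUNT, &thread);
    if (kr != KERN_SUCCESS)
        std::cout << "thread_create_running failed: " << kr << std::endl;
    return kr;
}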
Are there any compatible alternatives to this function or method of injection?

Related

Different Stacktraces for NewHandler and UnhandledExceptionHandler

I have the following code:
#include <windows.h>
#include <minidumpapiset.h>
#include <strsafe.h>
#include <fileapi.h>
#include <iostream>
#include <signal.h>
#include <minwinbase.h>
#include <new.h>
#include "StackWalker.h"
int minidumpId = 0;
#ifndef _AddressOfReturnAddress
// Taken from: http://msdn.microsoft.com/en-us/library/s975zw7k(VS.71).aspx
#ifdef __cplusplus
#define EXTERNC extern "C"
#else
#define EXTERNC
#endif
// _ReturnAddress and _AddressOfReturnAddress should be prototyped before use
EXTERNC void* _AddressOfReturnAddress(void);
EXTERNC void* _ReturnAddress(void);
EXTERNC int __cdecl _purecall();
#endif
EXCEPTION_POINTERS ExceptionPointers;
EXCEPTION_RECORD ExceptionRecord;
CONTEXT ContextRecord;
void GetExceptionPointers(DWORD exceptionCode, EXCEPTION_POINTERS** exceptionPointers)
{
// The following code was taken from VC++ 8.0 CRT (invarg.c: line 104)
ZeroMemory(&ExceptionPointers, sizeof(EXCEPTION_POINTERS));
ZeroMemory(&ExceptionRecord, sizeof(EXCEPTION_RECORD));
ZeroMemory(&ContextRecord, sizeof(CONTEXT));
// Looks like a workaround for some bug in RtlCaptureContext. But no description.
#ifdef _X86_
__asm {
mov dword ptr[ContextRecord.Eax], eax
mov dword ptr[ContextRecord.Ecx], ecx
mov dword ptr[ContextRecord.Edx], edx
mov dword ptr[ContextRecord.Ebx], ebx
mov dword ptr[ContextRecord.Esi], esi
mov dword ptr[ContextRecord.Edi], edi
mov word ptr[ContextRecord.SegSs], ss
mov word ptr[ContextRecord.SegCs], cs
mov word ptr[ContextRecord.SegDs], ds
mov word ptr[ContextRecord.SegEs], es
mov word ptr[ContextRecord.SegFs], fs
mov word ptr[ContextRecord.SegGs], gs
pushfd
pop[ContextRecord.EFlags]
}
ContextRecord.ContextFlags = CONTEXT_CONTROL;
#pragma warning(push)
#pragma warning(disable : 4311)
ContextRecord.Eip = (ULONG)_ReturnAddress();
ContextRecord.Esp = (ULONG)_AddressOfReturnAddress();
#pragma warning(pop)
ContextRecord.Ebp = *(static_cast<ULONG*>(_AddressOfReturnAddress()) - 1);
#elif defined(_IA64_) || defined(_AMD64_) || defined(_ARM_) || defined(_ARM64_)
RtlCaptureContext(&ContextRecord);
#else /* defined (_IA64_) || defined (_AMD64_) || defined(_ARM_) || defined(_ARM64_) */
ZeroMemory(&ContextRecord, sizeof(ContextRecord));
#endif /* defined (_IA64_) || defined (_AMD64_) || defined(_ARM_) || defined(_ARM64_) */
ExceptionRecord.ExceptionCode = exceptionCode;
ExceptionRecord.ExceptionAddress = _ReturnAddress();
ExceptionRecord.ExceptionFlags = EXCEPTION_NONCONTINUABLE;
*exceptionPointers = &ExceptionPointers;
(*exceptionPointers)->ExceptionRecord = &ExceptionRecord;
(*exceptionPointers)->ContextRecord = &ContextRecord;
}
class DbgLibrary final
{
public:
DbgLibrary()
{
dbgLibrary = LoadLibraryW(L"dbghelp.dll");
}
~DbgLibrary()
{
FreeLibrary(dbgLibrary);
}
explicit operator bool() const
{
return dbgLibrary != NULL;
}
bool WriteMinidump(HANDLE file, EXCEPTION_POINTERS* exceptionPointers) const
{
MINIDUMP_EXCEPTION_INFORMATION exceptionInformation;
exceptionInformation.ThreadId = GetCurrentThreadId();
exceptionInformation.ExceptionPointers = exceptionPointers;
exceptionInformation.ClientPointers = FALSE;
MINIDUMP_CALLBACK_INFORMATION callbackInformation;
callbackInformation.CallbackRoutine = NULL;
callbackInformation.CallbackParam = NULL;
typedef BOOL(WINAPI* LPMINIDUMPWRITEDUMP)(HANDLE processHandle, DWORD ProcessId, HANDLE fileHandle,
MINIDUMP_TYPE DumpType, CONST PMINIDUMP_EXCEPTION_INFORMATION ExceptionParam,
CONST PMINIDUMP_USER_STREAM_INFORMATION UserEncoderParam,
CONST PMINIDUMP_CALLBACK_INFORMATION CallbackParam);
LPMINIDUMPWRITEDUMP pfnMiniDumpWriteDump =
(LPMINIDUMPWRITEDUMP)GetProcAddress(dbgLibrary, "MiniDumpWriteDump");
if (NULL == pfnMiniDumpWriteDump)
{
return false;
}
BOOL isWriteSucceed = pfnMiniDumpWriteDump(GetCurrentProcess(), GetCurrentProcessId(), file, MiniDumpNormal,
&exceptionInformation, NULL, &callbackInformation);
return isWriteSucceed;
}
private:
HMODULE dbgLibrary;
};
inline HANDLE CreateNativeFile(const wchar_t* filePath)
{
HANDLE file = NULL;
file = CreateFileW(filePath, GENERIC_WRITE, 0, NULL, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
return file;
}
void CreateMiniDump(PEXCEPTION_POINTERS exceptionPointers)
{
const DbgLibrary dbgLibrary;
if (dbgLibrary)
{
wchar_t FILE_PATH[4096];
// Write `exceptionPointers` to the minidump file
StringCbPrintfW(FILE_PATH, sizeof(FILE_PATH), L"%ls\\%ls_%ld.dmp", ".",
L"minidump", minidumpId++);
HANDLE hMinidump = CreateNativeFile(FILE_PATH);
if (hMinidump != INVALID_HANDLE_VALUE)
{
dbgLibrary.WriteMinidump(hMinidump, exceptionPointers);
CloseHandle(hMinidump);
}
}
}
LONG WINAPI SehHandler(PEXCEPTION_POINTERS exceptionPointers)
{
std::cerr << "SehHandler\n";
CreateMiniDump(exceptionPointers);
return EXCEPTION_EXECUTE_HANDLER;
}
void SigsegvHandler(int)
{
std::cerr << "SigsegvHandler\n";
PEXCEPTION_POINTERS exceptionPointers = static_cast<PEXCEPTION_POINTERS>(_pxcptinfoptrs);
// Write minidump file
CreateMiniDump(exceptionPointers);
}
int __cdecl NewHandler(size_t size)
{
std::cerr << "NewHandler\n";
// 'new' operator memory allocation exception
PEXCEPTION_POINTERS exceptionPointers;
GetExceptionPointers(STATUS_NO_MEMORY, &exceptionPointers);
CreateMiniDump(exceptionPointers);
return 0;
}
struct A5 {
void F()
{
while (true)
{
int* a = new int[50000000];
}
}
};
struct A4 {
A5 a;
void F()
{
a.F();
}
};
struct A3 {
A4 a;
void F()
{
a.F();
}
};
struct A2 {
A3 a;
void F()
{
a.F();
}
};
struct A1 {
A2 a;
void F()
{
a.F();
}
};
int main()
{
SetUnhandledExceptionFilter(SehHandler);
signal(SIGSEGV, SigsegvHandler);
_set_new_handler(NewHandler);
A1().F();
return 0;
}
Here two handlers get invoked: NewHandler and SehHandler. The first fires because of the bad_alloc in operator new[], the second because of the unhandled exception. In both handlers I create a minidump with information about the crash.
NewHandler:
Thread 0 (crashed)
0 StackWalker_VC2017.exe!_callnewh [new_handler.cpp : 79 + 0x2]
eip = 0x0040a636 esp = 0x0019fefc ebp = 0x0019ff08 ebx = 0x00311000
esi = 0x00401d10 edi = 0x00655368 eax = 0x0042eed0 ecx = 0x00000000
edx = 0x00655368 efl = 0x00000202
Found by: given as instruction pointer in context
1 StackWalker_VC2017.exe!operator new(unsigned int) [new_scalar.cpp : 40 + 0x8]
eip = 0x00404a05 esp = 0x0019ff10 ebp = 0x0019ff14
Found by: call frame info
2 StackWalker_VC2017.exe!A5::F() [main.cpp : 197 + 0xa]
eip = 0x00401d0a esp = 0x0019ff1c ebp = 0x0019ff28
Found by: call frame info
3 StackWalker_VC2017.exe!main [main.cpp : 239 + 0x8]
eip = 0x00402500 esp = 0x0019ff24 ebp = 0x0019ff28
Found by: call frame info
4 StackWalker_VC2017.exe!static int __scrt_common_main_seh() [exe_common.inl : 288 + 0x1c]
eip = 0x00404c5d esp = 0x0019ff30 ebp = 0x0019ff70
Found by: call frame info
5 kernel32.dll + 0x1fa29
eip = 0x7712fa29 esp = 0x0019ff78 ebp = 0x0019ff80
Found by: call frame info
6 ntdll.dll + 0x67a9e
eip = 0x77c97a9e esp = 0x0019ff88 ebp = 0x0019ffdc
Found by: previous frame's frame pointer
7 ntdll.dll + 0x67a6e
eip = 0x77c97a6e esp = 0x0019ffe4 ebp = 0x0019ffec
Found by: previous frame's frame pointer
SehHandler:
Thread 0 (crashed)
0 KERNELBASE.dll + 0x12b812
eip = 0x76ddb812 esp = 0x0019fe68 ebp = 0x0019fec4 ebx = 0x19930520
esi = 0x00645a90 edi = 0x0042c754 eax = 0x0019fe68 ecx = 0x00000003
edx = 0x00000000 efl = 0x00000212
Found by: given as instruction pointer in context
1 StackWalker_VC2017.exe!_CxxThrowException [throw.cpp : 74 + 0x19]
eip = 0x00405a98 esp = 0x0019fecc ebp = 0x0019fef4
Found by: previous frame's frame pointer
2 StackWalker_VC2017.exe!__scrt_throw_std_bad_alloc() [throw_bad_alloc.cpp : 35 + 0x16]
eip = 0x0040509c esp = 0x0019fefc ebp = 0x0019ff10
Found by: call frame info
3 StackWalker_VC2017.exe!main [main.cpp : 239 + 0x8]
eip = 0x00402500 esp = 0x0019ff24 ebp = 0x0019ff14
Found by: call frame info with scanning
The stacks above were extracted using Breakpad's minidump_stackwalk.
The question is: why does the SehHandler stack trace not contain all of the function calls?
The main problem is that in this project I use crash handlers for logging information into dumps. Creating a minidump on every NewHandler call is not an appropriate solution, because sometimes the bad_alloc can be handled and the exception caught in a try/catch block, which means it is expected behaviour rather than a crash. So I want to handle bad_alloc in the unhandled exception handler, where it is definitely a crash. The problem also only occurs in release builds.
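For illustration, this is the kind of recoverable allocation failure (a hypothetical sketch, not code from the project) that makes a dump on every NewHandler invocation a false positive:
#include <new>
#include <vector>
// The caller expects the allocation to potentially fail and recovers from it,
// so this std::bad_alloc is normal control flow, not a crash worth dumping.
std::vector<int> load_with_fallback(std::size_t preferred, std::size_t fallback)
{
    try
    {
        return std::vector<int>(preferred);   // may throw std::bad_alloc
    }
    catch (const std::bad_alloc&)
    {
        return std::vector<int>(fallback);    // recover with a smaller buffer
    }
}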
As mentioned in https://developercommunity.visualstudio.com/t/stdbad-alloc-failures-are-undebuggable/542559?viewtype=solutions, this is a bug in MSVC. Unfortunately, there is no good solution for release builds.

How to let MSVC compiler optimize multiple step POD initialization?

I've made this sample code:
#include <vector>
struct POD {
int a;
int b;
int c;
inline static POD make_pod_with_default()
{
POD p{ 41, 51, 61 };
return p;
}
inline void change_pod_a(POD &p, int a) {
p.a = a;
}
inline void change_pod_b(POD &p, int b) {
p.b = b;
}
static POD make_pod_with_a(int a) {
POD p = make_pod_with_default();
p.change_pod_a(p, a);
return p;
}
static POD make_pod_with_b(int a) {
POD p = make_pod_with_default();
p.change_pod_b(p, a);
return p;
}
};
int main()
{
std::vector<POD> vec{};
vec.reserve(2);
vec.push_back(POD::make_pod_with_a(71));
vec.push_back(POD::make_pod_with_b(81));
return vec[0].a + vec[0].b + vec[0].c + vec[1].a + vec[1].b + vec[1].c;
}
In the compiled assembly code we can see the following instructions are being generated for the first vec.push_back(...) call:
...
mov DWORD PTR $T2[esp+32], 41 ; 00000029H
...
mov DWORD PTR $T2[esp+36], 51 ; 00000033H
...
mov DWORD PTR $T5[esp+32], 71 ; 00000047H
...
mov DWORD PTR $T6[esp+44], 61 ; 0000003dH
...
There's a mov to [esp+32] for the 71, but the mov to [esp+32] for the 41 is still there, doing nothing useful! How can I write code that lets MSVC perform this kind of optimization? Is MSVC even capable of it?
Both GCC and Clang produce better-optimized versions, but Clang wins by a large margin, with literally no overhead, in a very clean and logical fashion:
CLANG generated code:
main: # #main
push rax
mov edi, 24
call operator new(unsigned long)
mov rdi, rax
call operator delete(void*)
mov eax, 366
pop rcx
ret
Everything is done at compile time as 71 + 51 + 61 + 41 + 81 + 61 = 366!
I must admit it's painful to see my program computed at compile time and still have the allocation from vec.reserve() thrown into the assembly... but Clang still takes the cake, by far! Come on MSVC, this is not a vector of volatile.
If you make your methods constexpr, you might do:
constexpr POD step_one()
{
POD p{2, 5, 11};
p.b = 3;
return p;
}
constexpr void step_two(POD &p)
{
p.c = 5;
}
constexpr POD make_pod(){
POD p = step_one();
step_two(p);
return p;
}
POD make_pod_final()
{
constexpr POD res = make_pod();
return res;
}
resulting in:
make_pod_final PROC
mov eax, DWORD PTR $T1[esp-4]
mov DWORD PTR [eax], 2
mov DWORD PTR [eax+4], 3
mov DWORD PTR [eax+8], 5
ret 0
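Applied to the question's own factory functions, the same idea could look like this (a sketch; the point is that every step is constexpr, so the whole object can be materialized as constants at compile time):
struct POD
{
    int a;
    int b;
    int c;
    static constexpr POD make_pod_with_default()
    {
        return POD{ 41, 51, 61 };
    }
    static constexpr POD make_pod_with_a(int a)
    {
        POD p = make_pod_with_default();
        p.a = a;                                        // mutating a local is fine in C++14 constexpr
        return p;
    }
};
POD make_pod_with_a_final()
{
    constexpr POD res = POD::make_pod_with_a(71);       // forces compile-time evaluation
    return res;
}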

Cost of thread-safe local static variable initialization in C++11?

We know that local static variable initialization is thread-safe in C++11, and modern compilers fully support this. (Is local static variable initialization thread-safe in C++11?)
What is the cost of making it thread-safe? I understand that this could very well be compiler implementation dependent.
Context: I have a multi-threaded application (10 threads) accessing a singleton object pool instance via the following function at very high rates, and I'm concerned about its performance implications.
template <class T>
ObjectPool<T>* ObjectPool<T>::GetInst()
{
static ObjectPool<T> instance;
return &instance;
}
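One mitigation, which an answer further down also arrives at, is to pay the guard check once and cache the result. A hypothetical usage sketch around the question's ObjectPool (not a measured fix):
// Hoist the lookup out of the hot path so the guard/atomic check on the
// function-local static runs once, not on every iteration.
template <class T>
int CountLiveObjects(int iterations)
{
    ObjectPool<T>* pool = ObjectPool<T>::GetInst();     // one guard check here
    int touched = 0;
    for (int i = 0; i < iterations; ++i)
        touched += (pool != nullptr);                   // stand-in for the real per-item pool work
    return touched;
}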
A look at the generated assembler code helps.
Source
#include <vector>
std::vector<int> &get(){
static std::vector<int> v;
return v;
}
int main(){
return get().size();
}
Assembler
std::vector<int, std::allocator<int> >::~vector():
movq (%rdi), %rdi
testq %rdi, %rdi
je .L1
jmp operator delete(void*)
.L1:
rep ret
get():
movzbl guard variable for get()::v(%rip), %eax
testb %al, %al
je .L15
movl get()::v, %eax
ret
.L15:
subq $8, %rsp
movl guard variable for get()::v, %edi
call __cxa_guard_acquire
testl %eax, %eax
je .L6
movl guard variable for get()::v, %edi
movq $0, get()::v(%rip)
movq $0, get()::v+8(%rip)
movq $0, get()::v+16(%rip)
call __cxa_guard_release
movl $__dso_handle, %edx
movl get()::v, %esi
movl std::vector<int, std::allocator<int> >::~vector(), %edi
call __cxa_atexit
.L6:
movl get()::v, %eax
addq $8, %rsp
ret
main:
subq $8, %rsp
call get()
movq 8(%rax), %rdx
subq (%rax), %rdx
addq $8, %rsp
movq %rdx, %rax
sarq $2, %rax
ret
Compared to
Source
#include <vector>
static std::vector<int> v;
std::vector<int> &get(){
return v;
}
int main(){
return get().size();
}
Assembler
std::vector<int, std::allocator<int> >::~vector():
movq (%rdi), %rdi
testq %rdi, %rdi
je .L1
jmp operator delete(void*)
.L1:
rep ret
get():
movl v, %eax
ret
main:
movq v+8(%rip), %rax
subq v(%rip), %rax
sarq $2, %rax
ret
movl $__dso_handle, %edx
movl v, %esi
movl std::vector<int, std::allocator<int> >::~vector(), %edi
movq $0, v(%rip)
movq $0, v+8(%rip)
movq $0, v+16(%rip)
jmp __cxa_atexit
I'm not that great with assembler, but I can see that in the first version v has a lock around it and get is not inlined, whereas in the second version get is essentially gone.
You can play around with various compilers and optimization flags, but it seems no compiler is able to inline or optimize out the locks, even though the program is obviously single-threaded.
You can add static to get, which makes gcc inline get while preserving the lock.
To know how much these locks and additional instructions cost for your compiler, flags, platform and surrounding code you would need to make a proper benchmark.
I would expect the locks to have some overhead and be significantly slower than the inlined code, which becomes insignificant when you actually do work with the vector, but you can never be sure without measuring.
From my experience, this is exactly as costly as a regular mutex (critical section). If the code is called very frequently, consider using a normal global variable instead.
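A sketch of that alternative for the example above (assuming eager construction at namespace scope is acceptable, and accepting the usual caveats about static initialization order across translation units):
#include <vector>
// Constructed eagerly during static initialization; the accessor is then a
// plain reference return with no guard variable and no __cxa_guard_* calls.
static std::vector<int> pool_instance;   // hypothetical eager global
std::vector<int>& GetInstEager()
{
    return pool_instance;
}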
Explained extensively here https://www.youtube.com/watch?v=B3WWsKFePiM by Jason Turner.
I put together some sample code to illustrate the video. Since thread-safety is the main issue, I call the method from multiple threads to see its effects.
You can think of the compiler as implementing double-checked locking for you, even though it is free to do whatever it wants to ensure thread-safety. At the very least it will add a branch to distinguish first-time initialization, unless the optimizer performs the initialization eagerly at global scope.
https://en.wikipedia.org/wiki/Double-checked_locking#Usage_in_C++11
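Roughly, the generated logic behaves like this hand-written double-checked sketch (an approximation only; real implementations use __cxa_guard_acquire/__cxa_guard_release and a compiler-managed guard byte rather than a user-visible mutex):
#include <atomic>
#include <mutex>
#include <new>
struct Widget { int value = 0; };                          // hypothetical payload type
alignas(Widget) static unsigned char storage[sizeof(Widget)];
static std::atomic<bool> initialized{false};
static std::mutex init_mutex;
Widget& get_instance()
{
    if (!initialized.load(std::memory_order_acquire))      // fast path: a single atomic load
    {
        std::lock_guard<std::mutex> lock(init_mutex);      // slow path: serialize first-time callers
        if (!initialized.load(std::memory_order_relaxed))
        {
            new (storage) Widget();                        // construct exactly once
            initialized.store(true, std::memory_order_release);
        }
    }
    return *reinterpret_cast<Widget*>(storage);
}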
#include <iostream>
#include <string>
#include <vector>
#include <thread>
struct Temp
{
// Everytime this method is called, compiler has to check whether `name` is
// constructed or not due to init-at-first-use idiom. This at least would
// involve an atomic load operation and maybe a lock acquisition.
static const std::string& name() {
static const std::string name = "name";
return name;
}
// Following does not create contention. Profiler showed little bit of
// performance improvement.
const std::string& ref_name = name();
const std::string& get_name_ref() const {
return ref_name;
}
};
int main(int, char**)
{
Temp tmp;
constexpr int num_worker = 8;
std::vector<std::thread> threads;
for (int i = 0; i < num_worker; ++i) {
threads.emplace_back([&](){
for (int i = 0; i < 10000000; ++i) {
// name() is almost 5s slower
printf("%zu\n", tmp.get_name_ref().size());
}
});
}
for (int i = 0; i < num_worker; ++i) {
threads[i].join();
}
return 0;
}
The name() version is 5s slower than get_name_ref() on my machine.
$ time ./test > /dev/null
I also used Compiler Explorer to see what gcc generates. The following dump shows the double-checked locking pattern; note the atomic load and the guard acquire.
name ()
{
bool retval.0;
bool retval.1;
bool D.25443;
struct allocator D.25437;
const struct string & D.29013;
static const struct string name;
_1 = __atomic_load_1 (&_ZGVZL4namevE4name, 2);
retval.0 = _1 == 0;
if (retval.0 != 0) goto <D.29003>; else goto <D.29004>;
<D.29003>:
_2 = __cxa_guard_acquire (&_ZGVZL4namevE4name);
retval.1 = _2 != 0;
if (retval.1 != 0) goto <D.29006>; else goto <D.29007>;
<D.29006>:
D.25443 = 0;
try
{
std::allocator<char>::allocator (&D.25437);
try
{
try
{
std::__cxx11::basic_string<char>::basic_string (&name, "name", &D.25437);
D.25443 = 1;
__cxa_guard_release (&_ZGVZL4namevE4name);
__cxa_atexit (__dt_comp , &name, &__dso_handle);
}
finally
{
std::allocator<char>::~allocator (&D.25437);
}
}
finally
{
D.25437 = {CLOBBER};
}
}
catch
{
if (D.25443 != 0) goto <D.29008>; else goto <D.29009>;
<D.29008>:
goto <D.29010>;
<D.29009>:
__cxa_guard_abort (&_ZGVZL4namevE4name);
<D.29010>:
}
goto <D.29011>;
<D.29007>:
<D.29011>:
goto <D.29012>;
<D.29004>:
<D.29012>:
D.29013 = &name;
return D.29013;
}

Is this a bug in gcc optimizer?

When I compile the following code with gcc 6 (-O3 -std=c++14), I get a nice, empty main:
Dump of assembler code for function main():
0x00000000004003e0 <+0>: xor %eax,%eax
0x00000000004003e2 <+2>: retq
But uncommenting the last line in main "breaks" the optimization:
Dump of assembler code for function main():
0x00000000004005f0 <+0>: sub $0x78,%rsp
0x00000000004005f4 <+4>: lea 0x40(%rsp),%rdi
0x00000000004005f9 <+9>: movq $0x400838,0x10(%rsp)
0x0000000000400602 <+18>: movb $0x0,0x18(%rsp)
0x0000000000400607 <+23>: mov %fs:0x28,%rax
0x0000000000400610 <+32>: mov %rax,0x68(%rsp)
0x0000000000400615 <+37>: xor %eax,%eax
0x0000000000400617 <+39>: movl $0x0,(%rsp)
0x000000000040061e <+46>: movq $0x400838,0x30(%rsp)
0x0000000000400627 <+55>: movb $0x0,0x38(%rsp)
0x000000000040062c <+60>: movl $0x0,0x20(%rsp)
0x0000000000400634 <+68>: movq $0x400838,0x50(%rsp)
0x000000000040063d <+77>: movb $0x0,0x58(%rsp)
0x0000000000400642 <+82>: movl $0x0,0x40(%rsp)
0x000000000040064a <+90>: callq 0x400790 <ErasedObject::~ErasedObject()>
0x000000000040064f <+95>: lea 0x20(%rsp),%rdi
0x0000000000400654 <+100>: callq 0x400790 <ErasedObject::~ErasedObject()>
0x0000000000400659 <+105>: mov %rsp,%rdi
0x000000000040065c <+108>: callq 0x400790 <ErasedObject::~ErasedObject()>
0x0000000000400661 <+113>: mov 0x68(%rsp),%rdx
0x0000000000400666 <+118>: xor %fs:0x28,%rdx
0x000000000040066f <+127>: jne 0x400678 <main()+136>
0x0000000000400671 <+129>: xor %eax,%eax
0x0000000000400673 <+131>: add $0x78,%rsp
0x0000000000400677 <+135>: retq
0x0000000000400678 <+136>: callq 0x4005c0 <__stack_chk_fail@plt>
Code
#include <type_traits>
#include <new>
namespace
{
struct ErasedTypeVTable
{
using destructor_t = void (*)(void *obj);
destructor_t dtor;
};
template <typename T>
void dtor(void *obj)
{
return static_cast<T *>(obj)->~T();
}
template <typename T>
static const ErasedTypeVTable erasedTypeVTable = {
&dtor<T>
};
}
struct ErasedObject
{
std::aligned_storage<sizeof(void *)>::type storage;
const ErasedTypeVTable& vtbl;
bool flag = false;
template <typename T, typename S = typename std::decay<T>::type>
ErasedObject(T&& obj)
: vtbl(erasedTypeVTable<S>)
{
static_assert(sizeof(T) <= sizeof(storage) && alignof(T) <= alignof(decltype(storage)), "");
new (object()) S(std::forward<T>(obj));
}
ErasedObject(ErasedObject&& other) = default;
~ErasedObject()
{
if (flag)
{
::operator delete(object());
}
else
{
vtbl.dtor(object());
}
}
void *object()
{
return reinterpret_cast<char *>(&storage);
}
};
struct myType
{
int a;
};
int main()
{
ErasedObject c1(myType{});
ErasedObject c2(myType{});
//ErasedObject c3(myType{});
}
clang can optimize out both versions.
Any ideas what's going on? Am I hitting some optimization limit? If so, is it configurable?
I ran g++ with -fdump-ipa-inline to get more information about why functions are or are not inlined.
For the testcase with main() function and three objects created I got:
(...)
150 Deciding on inlining of small functions. Starting with size 35.
151 Enqueueing calls in void {anonymous}::dtor(void*) [with T = myType]/40.
152 Enqueueing calls in int main()/35.
153 not inlinable: int main()/35 -> ErasedObject::~ErasedObject()/33, call is unlikely and code size would grow
154 not inlinable: int main()/35 -> ErasedObject::~ErasedObject()/33, call is unlikely and code size would grow
155 not inlinable: int main()/35 -> ErasedObject::~ErasedObject()/33, call is unlikely and code size would grow
(...)
This error code is set in gcc/gcc/ipa-inline.c:
else if (!e->maybe_hot_p ()
&& (growth >= MAX_INLINE_INSNS_SINGLE
|| growth_likely_positive (callee, growth)))
{
e->inline_failed = CIF_UNLIKELY_CALL;
want_inline = false;
}
Then I discovered that the smallest change that makes g++ inline these functions is to add a declaration:
int main() __attribute__((hot));
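For clarity, the declaration just sits in the same translation unit, before the existing definition (a placement-only sketch; ErasedObject and myType are the types from the question):
// Forward declaration carrying the attribute; the definition itself is unchanged.
int main() __attribute__((hot));
int main()
{
    ErasedObject c1(myType{});
    ErasedObject c2(myType{});
    ErasedObject c3(myType{});
}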
I wasn't able to find in code why int main() isn't considered hot, but probably this should be left for another question.
More interesting is the second part of the conditional I pasted above. The intent was to not inline when the code would grow, and you have produced an example where the code actually shrinks after complete inlining.
I think this deserves to be reported on GCC's bugzilla, but I'm not sure if you can call it a bug - estimation of inline impact is a heuristic and as such it is expected to work correctly in most cases, not all of them.

Local static pointer variable is not thread safe?

In my server module, the log4cxx library sometimes caused a crash.
It's because ...
LevelPtr Level::getTrace() {
static LevelPtr level(new Level(Level::TRACE_INT, LOG4CXX_STR("TRACE"), 7));
return level;
}
The static LevelPtr is sometimes returned as a null pointer.
I tested the following code.
#include <iostream>
#include <boost/thread.hpp>
int start_flag = 0;
class test_dummy {
public:
int mi;
test_dummy() : mi(1)
{
std::cout << "hey!\n";
}
static test_dummy* get_p()
{
static test_dummy* _p = new test_dummy();
return _p;
}
};
void thread_proc()
{
int i = 0;
while (start_flag == 0)
{
i++;
}
if (test_dummy::get_p() == 0)
{
std::cout << "error!!!\n";
}
else
{
std::cout << "mi:" << test_dummy::get_p()->mi << "\n";
}
}
int main()
{
boost::thread *pth_array[5] = {0,};
for (int i = 0; i < 5; i++)
{
pth_array[i] = new boost::thread(thread_proc);
}
start_flag = 1;
for (int i = 0; i < 5; i++)
{
pth_array[i]->join();
}
std::cin.ignore();
}
It's really thread-unsafe, but I'm curious why get_p() returns a null pointer rather than some other allocated address.
Is it because the value is set to 0 while another thread is in the middle of the new operation?
You have a race condition in the code that the compiler generates for this:
if (!level_initialized)
{
level_initialized = 1;
level = new Level(...);
}
return level;
(It doesn't look EXACTLY like that - it's more complex, but I think you get the general idea)
In clang++ 3.5, it seems like there are locks to prevent this sort of race, but without actually looking at the code generated by your compiler, it's impossible to say exactly what is going on. But I suspect that this is what happens.
Here's what clang++ 3.5 generates (minus some clutter)
_Z8getTracev: # #_Z8getTracev
pushq %rbp
movq %rsp, %rbp
subq $32, %rsp
cmpb $0, _ZGVZ8getTracevE5level # Guard variable
jne .LBB0_4
leaq _ZGVZ8getTracevE5level, %rdi
callq __cxa_guard_acquire
cmpl $0, %eax
je .LBB0_4
.Ltmp0:
movl $4, %eax
movl %eax, %edi
callq _Znwm # new
.Ltmp1:
movq %rax, -24(%rbp) # 8-byte Spill
jmp .LBB0_3
.LBB0_3: # %invoke.cont
leaq _ZGVZ8getTracevE5level, %rdi
movq -24(%rbp), %rax # 8-byte Reload
movq -24(%rbp), %rcx # 8-byte Reload
movl $0, (%rcx)
movq %rax, _ZZ8getTracevE5level
callq __cxa_guard_release
.LBB0_4: # %init.end
movq _ZZ8getTracevE5level, %rax
addq $32, %rsp
popq %rbp
retq
I modified the code to use Level as an int, etc, so it's simpler than the code you'd get from exactly the code you posted.
It's hard to say much, since the code clearly has undefined behavior, but the standard does require _p to be initialized to a null pointer before get_p is called. And in order to ensure that the local static is initialized exactly once, the compiler more or less has to add an extra flag; something like:
static test_dummy* _p = nullptr;
static bool isInitialized = false;
if ( !isInitialized ) {
_p = new test_dummy();
isInitialized = true;
}
(In fact, of course, the initializations shown above are the zero initialization, which occurs before anything else. And a clever compiler could realize that the explicit initialization which occurs the first time through cannot result in _p being a null pointer, and use _p as the control variable.)
The above isn't thread safe; in order to make it thread safe, the entire sequence must be protected. (There are also more or less complicated tricks to avoid the need for a full mutex, but in all cases, all accesses to isInitialized must be atomic.)
If the sequence isn't protected, the order in which another thread sees the writes isn't defined. So some thread can see isInitialized as true, but still see the null pointer in _p.
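To make the "entire sequence must be protected" point concrete, here is one sketch of a protected version of get_p using std::call_once (with a C++11-conforming compiler, the plain function-local static in the original code already gets equivalent treatment from the compiler):
#include <mutex>
class test_dummy_safe
{
public:
    int mi;
    test_dummy_safe() : mi(1) {}
    static test_dummy_safe* get_p()
    {
        static std::once_flag flag;                  // constant-initialized, no guard needed
        static test_dummy_safe* p = nullptr;         // zero-initialized before any thread runs
        std::call_once(flag, [] { p = new test_dummy_safe(); });  // construction is serialized
        return p;                                    // never observed as null by any caller
    }
};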