LLVM fails to optimize simple function

LLVM fails to optimize simple function - llvm

I'm using LLVM C++ API to generate a very simple function. Here's the IR output:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define i64 @"foo"(i64) {
%2 = alloca i64
store i64 %0, i64* %2
%3 = load i64, i64* %2
ret i64 %3
}
I expect LLVM to get rid of the superfluous stores/loads when I compile it to an object file; however, that's not what I observe. If I use llvm::CodeGenOpt::None I get
$ objdump -M intel -d out.o | grep -A 10 foo
0000000000000000 <foo>:
0: 48 89 7c 24 f8 mov QWORD PTR [rsp-0x8],rdi
5: 48 8b 44 24 f8 mov rax,QWORD PTR [rsp-0x8]
a: c3 ret
If I use llvm::CodeGenOpt::Aggressive I get
$ objdump -M intel -d out.o | grep -A 10 foo
0000000000000000 <foo>:
0: 48 89 7c 24 f8 mov QWORD PTR [rsp-0x8],rdi
5: 48 89 f8 mov rax,rdi
8: c3 ret
I'd expect LLVM to optimize away mov QWORD PTR [rsp-0x8],rdi. Am I missing some options? Here's the object file generation code:
// Emits `module` as a native object file named "out.o" for the host target.
// Returns an empty string on success, otherwise a description of the failure.

// Initialize every target component LLVM was built with (target info,
// targets, MC layers, asm printers/parsers and disassemblers).
llvm::InitializeAllTargetInfos();
llvm::InitializeAllTargets();
llvm::InitializeAllTargetMCs();
llvm::InitializeAllAsmPrinters();
llvm::InitializeAllAsmParsers();
llvm::InitializeAllDisassemblers();

// Resolve the target description for the default triple.
auto target_triple = llvm::sys::getDefaultTargetTriple();
string error;
auto* target = llvm::TargetRegistry::lookupTarget(target_triple, error);
if (target == nullptr) {
    return error;
}

// Build the feature string from whatever the host CPU reports.
auto cpu = llvm::sys::getHostCPUName();
llvm::SubtargetFeatures subtarget_features;
llvm::StringMap<bool> feature_map;
if (llvm::sys::getHostCPUFeatures(feature_map)) {
    for (auto &feature : feature_map) {
        // key = feature name, value = whether it is enabled
        subtarget_features.AddFeature(feature.first(), feature.second);
    }
}
auto features = subtarget_features.getString();

llvm::TargetOptions target_options;
auto reloc_model = llvm::Optional<llvm::Reloc::Model>();
// NOTE(review): the level passed here is a *code generation* setting. The
// stack store that survives in the emitted code suggests no IR-level
// optimization passes (e.g. mem2reg) run in this pipeline -- confirm, and add
// them to the pass manager before the emit passes if that is what is wanted.
auto* target_machine = target->createTargetMachine(
    target_triple,
    cpu,
    features,
    target_options,
    reloc_model,
    llvm::CodeModel::Default,
    // llvm::CodeGenOpt::Default,
    llvm::CodeGenOpt::Aggressive);
if (target_machine == nullptr) {
    error = "Failed to create target machine";
    return error;
}

llvm::legacy::PassManager pass_manager;
// The module must carry the data layout of the machine it is compiled for.
module.setDataLayout(target_machine->createDataLayout());
module.print(llvm::errs(), nullptr);

auto file = "out.o";
error_code ec;
llvm::raw_fd_ostream out(llvm::StringRef(file), ec, llvm::sys::fs::F_None);
if (ec) {
    error = ec.message();
    return error;
}

// addPassesToEmitFile() reports failure by returning true.
if (target_machine->addPassesToEmitFile(pass_manager, out, llvm::TargetMachine::CGFT_ObjectFile)) {
    error = "Failed to emit file";
    return error;
}
pass_manager.run(module);
out.close();
return error;

Related

Injecting a Dylib into Processes Running Under Rosetta

I need to inject a dynamic library into a process exclusively targeting the x86_64 instruction set. My host architecture is aarch64.
I attempted injection using the following C++ code...
// Checks the local variable `kr`; on anything other than KERN_SUCCESS prints
// the code and returns `x` from the *enclosing* function.
// NOTE(review): the expansion is a bare `if {...};`, not a do/while(0), and it
// returns immediately without releasing anything acquired earlier.
#define CHKERR(x) if (kr != KERN_SUCCESS) {std::cout << kr << std::endl; return x;};
// Size in bytes of the region allocated in the target for the new thread's stack.
#define STACK_SIZE 0x1000
// Byte offsets into asm_instructions[] of the 8-byte immediates that are
// patched with real addresses before the code is copied into the target.
#define asm_pthread_offset 6
#define asm_dylib_offset 19
#define asm_dlopen_offset 39
#define asm_mach_thread_self_offset 51
#define asm_thread_suspend_offset 66
// Makes process `pid` load the dynamic library at `dylib_path`.
//
// Steps, as performed below: verify the file exists, obtain the target's task
// port, copy the path string into the target, allocate a stack and a code
// region there, patch and copy a small x86-64 stub, then start a new thread in
// the target at that stub.
//
// Returns INJECT_SUCCESS, INJECT_ERROR_NOT_FOUND when stat() fails, or one of
// the other INJECT_ERROR_* codes for the Mach call that failed.
// NOTE(review): on every CHKERR failure path `remoteTask` and any memory
// already allocated in the target are left in place; mach_port_deallocate is
// only reached on success.
inject_result inject_dylib(int pid, const char *dylib_path) {
task_t remoteTask;
struct stat buf;
// check if the dynamic library exists...
int check = stat(dylib_path, &buf);
if (check != 0)
return INJECT_ERROR_NOT_FOUND;
mach_error_t kr = 0;
// request the task port of the target process...
kr = task_for_pid(mach_task_self(), pid, &remoteTask);
CHKERR(INJECT_ERROR_MACH_TASK);
// allocate space for library path in the task (length includes the NUL)
// presumably the trailing `1` is VM_FLAGS_ANYWHERE -- TODO confirm
mach_vm_address_t dylib_address;
kr = mach_vm_allocate(remoteTask, &dylib_address, strlen(dylib_path) + 1, 1);
CHKERR(INJECT_ERROR_GENERIC)
// write library path into the task
kr = mach_vm_write(remoteTask, dylib_address, (vm_offset_t)dylib_path, strlen(dylib_path)+1);
CHKERR(INJECT_ERROR_GENERIC)
// allocate the region used as the injected thread's stack
mach_vm_address_t stack_address;
kr = mach_vm_allocate(remoteTask, &stack_address, STACK_SIZE, 1);
CHKERR(INJECT_ERROR_STACK_ALLOC)
// x86-64 stub; the all-zero immediates are filled in by the memcpy calls
// below, at the asm_*_offset positions. The rest of the 100-byte array is
// zero-initialized.
unsigned char asm_instructions[ 100 ] =
"\x55" // push %rbp
"\x48\x89\xe5" // mov %rbp, %rsp
"\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00" // mov %rax, _pthread_set_self
"\xff\xd0" // call %rax
"\x5d" // pop %rbp
"\x48\xbf\x00\x00\x00\x00\x00\x00\x00\x00" // mov %rdi, dylib_address
"\x48\xbe\x02\x00\x00\x00\x00\x00\x00\x00" // mov %rsi, 2
"\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00" // mov %rax, dlopen
"\xff\xd0" // call %rax
"\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00" // mov %rax, mach_thread_self
"\xff\xd0" // call %rax
"\x48\x89\xc7" // mov %rdi, %rax
"\x48\xb8\x00\x00\x00\x00\x00\x00\x00\x00" // mov %rax, thread_suspend
"\xff\xd0" // call %rax
;
// allocate space for assembly instructions...
mach_vm_address_t code_address;
kr = mach_vm_allocate(remoteTask, &code_address, sizeof(asm_instructions), 1);
CHKERR(INJECT_ERROR_CODE_ALLOC)
// set some values in our assembly instructions...
// The function addresses are taken from *this* process.
// NOTE(review): this assumes the same addresses are valid inside the target -- confirm.
mach_vm_address_t pthread_set_self_address = (mach_vm_address_t) dlsym(RTLD_DEFAULT, "_pthread_set_self");
mach_vm_address_t mach_thread_self_address = (mach_vm_address_t) mach_thread_self;
mach_vm_address_t thread_suspend_address = (mach_vm_address_t) thread_suspend;
mach_vm_address_t dlopen_address = (mach_vm_address_t) dlopen;
memcpy(&asm_instructions[asm_pthread_offset], &pthread_set_self_address, sizeof(mach_vm_address_t));
memcpy(&asm_instructions[asm_dylib_offset], &dylib_address, sizeof(mach_vm_address_t));
memcpy(&asm_instructions[asm_dlopen_offset], &dlopen_address, sizeof(mach_vm_address_t));
memcpy(&asm_instructions[asm_mach_thread_self_offset], &mach_thread_self_address, sizeof(mach_vm_address_t));
memcpy(&asm_instructions[asm_thread_suspend_offset], &thread_suspend_address, sizeof(mach_vm_address_t));
// copy the patched stub into the target, then mark it read + execute
kr = mach_vm_write(remoteTask, code_address, (vm_offset_t)asm_instructions, sizeof(asm_instructions));
CHKERR(INJECT_ERROR_GENERIC)
kr = mach_vm_protect(remoteTask, code_address, sizeof(asm_instructions), 0, VM_PROT_EXECUTE | VM_PROT_READ);
CHKERR(INJECT_ERROR_GENERIC)
// create thread, set registers, and start
thread_t thread = {0};
x86_thread_state64_t thread_state = {0};
thread_state.__rip = code_address;
thread_state.__rdi = stack_address;
// NOTE(review): rsp/rbp are set to the *start* of the allocated region, not
// its end -- verify this leaves room for the stub's pushes and calls.
thread_state.__rsp = stack_address;
thread_state.__rbp = stack_address;
// NOTE(review): reported to fail with KERN_INVALID_ARGUMENT (4) on arm64
// hosts, where the x86_THREAD_STATE64 flavor is rejected -- confirm.
kr = thread_create_running(remoteTask, x86_THREAD_STATE64, (thread_state_t) &thread_state, x86_THREAD_STATE64_COUNT, &thread);
CHKERR(INJECT_ERROR_CREATE_THREAD)
mach_port_deallocate(mach_task_self(), remoteTask);
return INJECT_SUCCESS;
}
A problem occurs at the function thread_create_running, where it consistently returns error 4 (KERN_INVALID_ARGUMENT). This is because the arm64 version of the XNU kernel does not support x86_THREAD_STATE64 as a thread state flavor.
I confirmed this as the issue by digging through the kernel source. Where you can see that x86_THREAD_STATE64 is not included in any switch case, and it defaults to KERN_INVALID_ARGUMENT.
Are there any compatible alternatives to this function or method of injection?

Initializing plog::RollingFileAppender on Windows XP Triggers Access Violation (Null Pointer)

I am using [plog][1] on Windows XP, and initializing a plog::RollingFileAppender triggers an access violation. In this case, the code is:
void LogInit(void)
{
static plog::RollingFileAppender<plog::TxtFormatter> fileAppender("log.log");
Using Visual Studio 2019 but the project uses the platform toolset Visual Studio 2017 - Windows XP (v141_XP)
The output assembly is:
; COMDAT _LogInit
_TEXT SEGMENT
_status$1$ = -516 ; size = 4
_appender$66 = -516 ; size = 4
$T65 = -512 ; size = 256
$T64 = -512 ; size = 256
$T62 = -512 ; size = 256
$T60 = -512 ; size = 256
$T58 = -256 ; size = 256
$T57 = -256 ; size = 256
$T41 = -256 ; size = 256
_LogInit PROC ; COMDAT
; 108 : {
00000 55 push ebp
00001 8b ec mov ebp, esp
00003 83 e4 f8 and esp, -8 ; fffffff8H
; 109 : static plog::RollingFileAppender<plog::TxtFormatter> fileAppender("log.log");
00006 64 a1 00 00 00
00 mov eax, DWORD PTR fs:__tls_array
0000c 81 ec 04 02 00
00 sub esp, 516 ; 00000204H
00012 8b 0d 00 00 00
00 mov ecx, DWORD PTR __tls_index
00018 53 push ebx
00019 56 push esi
0001a 8b 34 88 mov esi, DWORD PTR [eax+ecx*4]
The null pointer is because EAX (__tls_array) and ECX (__tls_index) are both null. Output from WinDbg:
TGLOBALFLAG: 70
APPLICATION_VERIFIER_FLAGS: 0
CONTEXT: (.ecxr)
eax=00000000 ebx=00000000 ecx=00000000 edx=7c90e4f4 esi=0012f624 edi=00000000
eip=1000366a esp=001afda4 ebp=001affb4 iopl=0 nv up ei pl nz ac pe nc
cs=001b ss=0023 ds=0023 es=0023 fs=003b gs=0000 efl=00010216
LogTest!LogInit+0x1a:
1000366a 8b3488 mov esi,dword ptr [eax+ecx*4] ds:0023:00000000=????????
Resetting default scope
EXCEPTION_RECORD: (.exr -1)
ExceptionAddress: 1000366a (LogTest!LogInit+0x0000001a)
ExceptionCode: c0000005 (Access violation)
ExceptionFlags: 00000000
NumberParameters: 2
Parameter[0]: 00000000
Parameter[1]: 00000000
Attempt to read from address 00000000
PROCESS_NAME: notepad.exe
READ_ADDRESS: 00000000
ERROR_CODE: (NTSTATUS) 0xc0000005 - The instruction at 0x%p referenced memory at 0x%p. The memory could not be %s.
EXCEPTION_CODE_STR: c0000005
EXCEPTION_PARAMETER1: 00000000
EXCEPTION_PARAMETER2: 00000000
FAULTING_LOCAL_VARIABLE_NAME: fileAppender
STACK_TEXT:
001affb4 7c80b713 00000000 00000000 0012f624 LogTest!LogInit+0x1a
001affec 00000000 10003650 00000000 00000000 kernel32!BaseThreadStart+0x37
STACK_COMMAND: ~1s; .ecxr ; kb
FAULTING_SOURCE_LINE: d:\test\logtest.cpp
FAULTING_SOURCE_FILE: d:\test\logtest.cpp
FAULTING_SOURCE_LINE_NUMBER: 109
FAULTING_SOURCE_CODE:
105:
106: // This is an example of an exported function.
107: LogInit_API void LogInit(void)
108: {
> 109: static plog::RollingFileAppender<plog::TxtFormatter> fileAppender(";pg.log");
110: plog::init(plog::info, &fileAppender);
111:
112:
113:
114:
SYMBOL_NAME: LogTest!LogInit+1a
MODULE_NAME: LogTest
IMAGE_NAME: LogTest.dll
FAILURE_BUCKET_ID: NULL_POINTER_READ_c0000005_LogTest.dll!LogInit
OS_VERSION: 5.1.2600.5512
BUILDLAB_STR: xpsp
OSPLATFORM_TYPE: x86
OSNAME: Windows XP
FAILURE_ID_HASH: {0218fa42-bce4-328f-5683-a7e3657927fc}
Followup: MachineOwner
---------
Code for affected class is:
namespace plog
{
    // Appender that writes formatted records to a file and, once the file
    // grows past a size limit, rotates it through a numbered series
    // (name.ext -> name.1.ext -> ... -> name.<maxFiles-1>.ext).
    // Rotation only happens when maxFiles > 0.
    template<class Formatter, class Converter = NativeEOLConverter<UTF8Converter> >
    class PLOG_LINKAGE_HIDDEN RollingFileAppender : public IAppender
    {
    public:
        // fileName    - log file name in the native character type.
        // maxFileSize - rotation threshold in bytes (raised to at least 1000).
        // maxFiles    - number of files kept; 0 disables rotation.
        RollingFileAppender(const util::nchar* fileName, size_t maxFileSize = 0, int maxFiles = 0)
            : m_fileSize(), m_maxFileSize(), m_maxFiles(maxFiles), m_firstWrite(true)
        {
            setFileName(fileName);
            setMaxFileSize(maxFileSize);
        }

#ifdef _WIN32
        // Narrow-string overload; the name is widened by setFileName(const char*).
        RollingFileAppender(const char* fileName, size_t maxFileSize = 0, int maxFiles = 0)
            : m_fileSize(), m_maxFileSize(), m_maxFiles(maxFiles), m_firstWrite(true)
        {
            setFileName(fileName);
            setMaxFileSize(maxFileSize);
        }
#endif

        // Formats and appends one record. The file is opened lazily on the
        // first write; later writes rotate first when the size limit is exceeded.
        virtual void write(const Record& record)
        {
            util::MutexLock lock(m_mutex);

            // Value used throughout this class to mean "size unknown / write failed".
            const size_t invalidSize = static_cast<size_t>(-1);

            if (m_firstWrite)
            {
                openLogFile();
                m_firstWrite = false;
            }
            else
            {
                const bool rotationEnabled = m_maxFiles > 0;
                const bool overLimit = m_fileSize > m_maxFileSize;
                const bool sizeKnown = m_fileSize != invalidSize;
                if (rotationEnabled && overLimit && sizeKnown)
                {
                    rollLogFiles();
                }
            }

            size_t written = m_file.write(Converter::convert(Formatter::format(record)));
            if (written != invalidSize)
            {
                m_fileSize += written;
            }
        }

        // Switches to a new target file: stores the split name, closes the
        // current file and arranges for a reopen on the next write().
        void setFileName(const util::nchar* fileName)
        {
            util::MutexLock lock(m_mutex);

            util::splitFileName(fileName, m_fileNameNoExt, m_fileExt);
            m_file.close();
            m_firstWrite = true;
        }

#ifdef _WIN32
        void setFileName(const char* fileName)
        {
            setFileName(util::toWide(fileName).c_str());
        }
#endif

        void setMaxFiles(int maxFiles)
        {
            m_maxFiles = maxFiles;
        }

        void setMaxFileSize(size_t maxFileSize)
        {
            // set a lower limit for the maxFileSize
            const size_t lowerLimit = static_cast<size_t>(1000);
            m_maxFileSize = (maxFileSize < lowerLimit) ? lowerLimit : maxFileSize;
        }

        // Deletes the oldest file, shifts every remaining file up by one
        // index (highest index first) and reopens the base file.
        void rollLogFiles()
        {
            m_file.close();

            util::nstring oldestName = buildFileName(m_maxFiles - 1);
            util::File::unlink(oldestName.c_str());

            int index = m_maxFiles - 2;
            while (index >= 0)
            {
                util::nstring fromName = buildFileName(index);
                util::nstring toName = buildFileName(index + 1);
                util::File::rename(fromName.c_str(), toName.c_str());
                --index;
            }

            openLogFile();
            m_firstWrite = false;
        }

    private:
        // Opens the base file, records the size reported by open(), and
        // writes the format header when that size is zero.
        void openLogFile()
        {
            util::nstring path = buildFileName();
            m_fileSize = m_file.open(path.c_str());

            if (m_fileSize != 0)
            {
                return;
            }

            size_t headerBytes = m_file.write(Converter::header(Formatter::header()));
            if (headerBytes != static_cast<size_t>(-1))
            {
                m_fileSize += headerBytes;
            }
        }

        // Builds "<name>[.<fileNumber>][.<ext>]"; the number is omitted for
        // fileNumber <= 0 and the extension when it is empty.
        util::nstring buildFileName(int fileNumber = 0)
        {
            util::nostringstream name;
            name << m_fileNameNoExt;

            if (fileNumber > 0)
            {
                name << '.' << fileNumber;
            }

            if (!m_fileExt.empty())
            {
                name << '.' << m_fileExt;
            }

            return name.str();
        }

    private:
        util::Mutex m_mutex;
        util::File m_file;
        size_t m_fileSize;
        size_t m_maxFileSize;
        int m_maxFiles;
        util::nstring m_fileExt;
        util::nstring m_fileNameNoExt;
        bool m_firstWrite;
    };
}
Are there code changes or compiler settings that can fix/remove the references to __tls_array / __tls_index?
This occurs in both debug & release builds.
[1]: https://github.com/SergiusTheBest/plog

Setting compiler option /Zc:threadSafeInit- removes the references to __tls_array and __tls_index and stops the access violation crash.
Microsoft documentation here mentions:
In the C++11 standard, block scope variables with static or thread
storage duration must be zero-initialized before any other
initialization takes place. Initialization occurs when control first
passes through the declaration of the variable. If an exception is
thrown during initialization, the variable is considered
uninitialized, and initialization is re-attempted the next time
control passes through the declaration. If control enters the
declaration concurrently with initialization, the concurrent execution
blocks while initialization is completed. The behavior is undefined if
control re-enters the declaration recursively during initialization.
By default, Visual Studio starting in Visual Studio 2015 implements
this standard behavior. This behavior may be explicitly specified by
setting the /Zc:threadSafeInit compiler option.
The /Zc:threadSafeInit compiler option is on by default. The
/permissive- option does not affect /Zc:threadSafeInit.
Thread-safe initialization of static local variables relies on code
implemented in the Universal C run-time library (UCRT). To avoid
taking a dependency on the UCRT, or to preserve the non-thread-safe
initialization behavior of versions of Visual Studio prior to Visual
Studio 2015, use the /Zc:threadSafeInit- option. If you know that
thread-safety is not required, use this option to generate slightly
smaller, faster code around static local declarations.
Thread-safe static local variables use thread-local storage (TLS)
internally to provide efficient execution when the static has already
been initialized. The implementation of this feature relies on Windows
operating system support functions in Windows Vista and later
operating systems. Windows XP, Windows Server 2003, and older
operating systems do not have this support, so they do not get the
efficiency advantage. These operating systems also have a lower limit
on the number of TLS sections that can be loaded. Exceeding the TLS
section limit can cause a crash. If this is a problem in your code,
especially in code that must run on older operating systems, use
/Zc:threadSafeInit- to disable the thread-safe initialization code.

Different Stacktraces for NewHandler and UnhandledExceptionHandler

I have the following code:
#include <windows.h>
#include <minidumpapiset.h>
#include <strsafe.h>
#include <fileapi.h>
#include <iostream>
#include <signal.h>
#include <minwinbase.h>
#include <new.h>
#include "StackWalker.h"
int minidumpId = 0;
#ifndef _AddressOfReturnAddress
// Taken from: http://msdn.microsoft.com/en-us/library/s975zw7k(VS.71).aspx
#ifdef __cplusplus
#define EXTERNC extern "C"
#else
#define EXTERNC
#endif
// _ReturnAddress and _AddressOfReturnAddress should be prototyped before use
EXTERNC void* _AddressOfReturnAddress(void);
EXTERNC void* _ReturnAddress(void);
EXTERNC int __cdecl _purecall();
#endif
// File-scope storage handed out by GetExceptionPointers(). Every call
// overwrites these objects, so only the most recent snapshot is valid.
EXCEPTION_POINTERS ExceptionPointers;
EXCEPTION_RECORD ExceptionRecord;
CONTEXT ContextRecord;

// Builds a synthetic EXCEPTION_POINTERS describing the caller's current state,
// for handlers that are not given one by the system.
//
// exceptionCode     - stored as ExceptionRecord.ExceptionCode.
// exceptionPointers - receives the address of the file-scope ExceptionPointers,
//                     whose members point at ExceptionRecord / ContextRecord.
void GetExceptionPointers(DWORD exceptionCode, EXCEPTION_POINTERS** exceptionPointers)
{
    // The following code was taken from VC++ 8.0 CRT (invarg.c: line 104)
    ZeroMemory(&ExceptionPointers, sizeof(EXCEPTION_POINTERS));
    ZeroMemory(&ExceptionRecord, sizeof(EXCEPTION_RECORD));
    ZeroMemory(&ContextRecord, sizeof(CONTEXT));

    // Looks like a workaround for some bug in RtlCaptureContext. But no description.
#ifdef _X86_
    __asm {
        mov dword ptr[ContextRecord.Eax], eax
        mov dword ptr[ContextRecord.Ecx], ecx
        mov dword ptr[ContextRecord.Edx], edx
        mov dword ptr[ContextRecord.Ebx], ebx
        mov dword ptr[ContextRecord.Esi], esi
        mov dword ptr[ContextRecord.Edi], edi
        mov word ptr[ContextRecord.SegSs], ss
        mov word ptr[ContextRecord.SegCs], cs
        mov word ptr[ContextRecord.SegDs], ds
        mov word ptr[ContextRecord.SegEs], es
        mov word ptr[ContextRecord.SegFs], fs
        mov word ptr[ContextRecord.SegGs], gs
        pushfd
        pop[ContextRecord.EFlags]
    }
    ContextRecord.ContextFlags = CONTEXT_CONTROL;
#pragma warning(push)
#pragma warning(disable : 4311)
    // Control registers are reconstructed from the return-address slot:
    // Eip = where the caller resumes, Esp = address of that slot,
    // Ebp = the value stored just below it.
    ContextRecord.Eip = (ULONG)_ReturnAddress();
    ContextRecord.Esp = (ULONG)_AddressOfReturnAddress();
#pragma warning(pop)
    ContextRecord.Ebp = *(static_cast<ULONG*>(_AddressOfReturnAddress()) - 1);
#elif defined(_IA64_) || defined(_AMD64_) || defined(_ARM_) || defined(_ARM64_)
    // RtlCaptureContext is the Win32 routine that fills a CONTEXT for the
    // calling thread; there is no API named CaptureContext.
    RtlCaptureContext(&ContextRecord);
#else /* defined (_IA64_) || defined (_AMD64_) || defined(_ARM_) || defined(_ARM64_) */
    ZeroMemory(&ContextRecord, sizeof(ContextRecord));
#endif /* defined (_IA64_) || defined (_AMD64_) || defined(_ARM_) || defined(_ARM64_) */

    ExceptionRecord.ExceptionCode = exceptionCode;
    ExceptionRecord.ExceptionAddress = _ReturnAddress();
    ExceptionRecord.ExceptionFlags = EXCEPTION_NONCONTINUABLE;

    *exceptionPointers = &ExceptionPointers;
    (*exceptionPointers)->ExceptionRecord = &ExceptionRecord;
    (*exceptionPointers)->ContextRecord = &ContextRecord;
}
class DbgLibrary final
{
public:
DbgLibrary()
{
dbgLibrary = LoadLibraryW(L"dbghelp.dll");
}
~DbgLibrary()
{
FreeLibrary(dbgLibrary);
}
explicit operator bool() const
{
return dbgLibrary != NULL;
}
bool WriteMinidump(HANDLE file, EXCEPTION_POINTERS* exceptionPointers) const
{
MINIDUMP_EXCEPTION_INFORMATION exceptionInformation;
exceptionInformation.ThreadId = GetCurrentThreadId();
exceptionInformation.ExceptionPointers = exceptionPointers;
exceptionInformation.ClientPointers = FALSE;
MINIDUMP_CALLBACK_INFORMATION callbackInformation;
callbackInformation.CallbackRoutine = NULL;
callbackInformation.CallbackParam = NULL;
typedef BOOL(WINAPI* LPMINIDUMPWRITEDUMP)(HANDLE processHandle, DWORD ProcessId, HANDLE fileHandle,
MINIDUMP_TYPE DumpType, CONST PMINIDUMP_EXCEPTION_INFORMATION ExceptionParam,
CONST PMINIDUMP_USER_STREAM_INFORMATION UserEncoderParam,
CONST PMINIDUMP_CALLBACK_INFORMATION CallbackParam);
LPMINIDUMPWRITEDUMP pfnMiniDumpWriteDump =
(LPMINIDUMPWRITEDUMP)GetProcAddress(dbgLibrary, "MiniDumpWriteDump");
if (NULL == pfnMiniDumpWriteDump)
{
return false;
}
BOOL isWriteSucceed = pfnMiniDumpWriteDump(GetCurrentProcess(), GetCurrentProcessId(), file, MiniDumpNormal,
&exceptionInformation, NULL, &callbackInformation);
return isWriteSucceed;
}
private:
HMODULE dbgLibrary;
};
// Creates (or truncates) `filePath` for exclusive writing and returns the
// handle produced by CreateFileW unchanged, so callers must compare it
// against INVALID_HANDLE_VALUE.
inline HANDLE CreateNativeFile(const wchar_t* filePath)
{
    return CreateFileW(filePath,
                       GENERIC_WRITE,
                       0,                      // no sharing
                       NULL,                   // default security attributes
                       CREATE_ALWAYS,
                       FILE_ATTRIBUTE_NORMAL,
                       NULL);                  // no template file
}
// Writes a minidump for `exceptionPointers` to ".\minidump_<N>.dmp", where N
// is the current value of the global minidumpId (incremented on every attempt).
// Does nothing when dbghelp.dll cannot be loaded, the path cannot be
// formatted, or the file cannot be created.
void CreateMiniDump(PEXCEPTION_POINTERS exceptionPointers)
{
    const DbgLibrary dbgLibrary;
    if (!dbgLibrary)
    {
        return;
    }

    wchar_t FILE_PATH[4096];
    // Every %ls argument must be a wide string (hence L"."), and minidumpId is
    // an int, so it is formatted with %d.
    const HRESULT formatted = StringCbPrintfW(FILE_PATH, sizeof(FILE_PATH), L"%ls\\%ls_%d.dmp", L".",
        L"minidump", minidumpId++);
    if (FAILED(formatted))
    {
        return;
    }

    HANDLE hMinidump = CreateNativeFile(FILE_PATH);
    if (hMinidump != INVALID_HANDLE_VALUE)
    {
        // Write `exceptionPointers` to the minidump file
        dbgLibrary.WriteMinidump(hMinidump, exceptionPointers);
        CloseHandle(hMinidump);
    }
}
// Unhandled-exception filter: logs its name, dumps the exception state it was
// given, and tells the system to run the handler (EXCEPTION_EXECUTE_HANDLER).
LONG WINAPI SehHandler(PEXCEPTION_POINTERS exceptionPointers)
{
    std::cerr << "SehHandler\n";

    CreateMiniDump(exceptionPointers);

    return EXCEPTION_EXECUTE_HANDLER;
}
void SigsegvHandler(int)
{
std::cerr << "SigsegvHandler\n";
PEXCEPTION_POINTERS exceptionPointers = static_cast<PEXCEPTION_POINTERS>(_pxcptinfoptrs);
// Write minidump file
CreateMiniDump(exceptionPointers);
}
// Handler installed with _set_new_handler: logs its name, captures the current
// state as a STATUS_NO_MEMORY exception and writes a minidump from it.
// `size` is not used. Always returns 0.
int __cdecl NewHandler(size_t size)
{
    std::cerr << "NewHandler\n";

    // 'new' operator memory allocation exception
    PEXCEPTION_POINTERS captured = NULL;
    GetExceptionPointers(STATUS_NO_MEMORY, &captured);
    CreateMiniDump(captured);

    return 0;
}
// A1 -> A2 -> A3 -> A4 -> A5: a chain of nested objects whose F() methods each
// forward to the next level, giving the failing allocation a multi-frame call
// path.

// Innermost level: allocates in an endless loop and never frees, so the only
// way out of F() is a failed allocation.
struct A5 {
void F()
{
while (true)
{
int* a = new int[50000000];
}
}
};
// Forwards to A5::F().
struct A4 {
A5 a;
void F()
{
a.F();
}
};
// Forwards to A4::F().
struct A3 {
A4 a;
void F()
{
a.F();
}
};
// Forwards to A3::F().
struct A2 {
A3 a;
void F()
{
a.F();
}
};
// Outermost level, called from main(); forwards to A2::F().
struct A1 {
A2 a;
void F()
{
a.F();
}
};
int main()
{
SetUnhandledExceptionFilter(SehHandler);
signal(SIGSEGV, SigsegvHandler);
_set_new_handler(NewHandler);
A1().F();
return 0;
}
Here two handlers would be invoked: NewHandler and SehHandler. The first one because of bad_alloc in operator new[], the second one because of unhandled exception. In both handlers I create minidump with information about crash.
NewHandler:
Thread 0 (crashed)
0 StackWalker_VC2017.exe!_callnewh [new_handler.cpp : 79 + 0x2]
eip = 0x0040a636 esp = 0x0019fefc ebp = 0x0019ff08 ebx = 0x00311000
esi = 0x00401d10 edi = 0x00655368 eax = 0x0042eed0 ecx = 0x00000000
edx = 0x00655368 efl = 0x00000202
Found by: given as instruction pointer in context
1 StackWalker_VC2017.exe!operator new(unsigned int) [new_scalar.cpp : 40 + 0x8]
eip = 0x00404a05 esp = 0x0019ff10 ebp = 0x0019ff14
Found by: call frame info
2 StackWalker_VC2017.exe!A5::F() [main.cpp : 197 + 0xa]
eip = 0x00401d0a esp = 0x0019ff1c ebp = 0x0019ff28
Found by: call frame info
3 StackWalker_VC2017.exe!main [main.cpp : 239 + 0x8]
eip = 0x00402500 esp = 0x0019ff24 ebp = 0x0019ff28
Found by: call frame info
4 StackWalker_VC2017.exe!static int __scrt_common_main_seh() [exe_common.inl : 288 + 0x1c]
eip = 0x00404c5d esp = 0x0019ff30 ebp = 0x0019ff70
Found by: call frame info
5 kernel32.dll + 0x1fa29
eip = 0x7712fa29 esp = 0x0019ff78 ebp = 0x0019ff80
Found by: call frame info
6 ntdll.dll + 0x67a9e
eip = 0x77c97a9e esp = 0x0019ff88 ebp = 0x0019ffdc
Found by: previous frame's frame pointer
7 ntdll.dll + 0x67a6e
eip = 0x77c97a6e esp = 0x0019ffe4 ebp = 0x0019ffec
Found by: previous frame's frame pointer
SehHandler:
Thread 0 (crashed)
0 KERNELBASE.dll + 0x12b812
eip = 0x76ddb812 esp = 0x0019fe68 ebp = 0x0019fec4 ebx = 0x19930520
esi = 0x00645a90 edi = 0x0042c754 eax = 0x0019fe68 ecx = 0x00000003
edx = 0x00000000 efl = 0x00000212
Found by: given as instruction pointer in context
1 StackWalker_VC2017.exe!_CxxThrowException [throw.cpp : 74 + 0x19]
eip = 0x00405a98 esp = 0x0019fecc ebp = 0x0019fef4
Found by: previous frame's frame pointer
2 StackWalker_VC2017.exe!__scrt_throw_std_bad_alloc() [throw_bad_alloc.cpp : 35 + 0x16]
eip = 0x0040509c esp = 0x0019fefc ebp = 0x0019ff10
Found by: call frame info
3 StackWalker_VC2017.exe!main [main.cpp : 239 + 0x8]
eip = 0x00402500 esp = 0x0019ff24 ebp = 0x0019ff14
Found by: call frame info with scanning
Extracted stacks using breakpad minidump_stackwalk:
The question is why SehHandler stacktrace does not have all function calls?
The main problem is that in my project I use crash handlers to log information into dumps. But creating a minidump on each NewHandler call is not an appropriate solution, because sometimes bad_alloc can be recovered from (the exception is thrown inside a try/catch block), which means it is expected behaviour. So I want to handle bad_alloc in the unhandled exception handler, where it is definitely a crash. Also, the problem occurs only in release builds.

As mentioned in https://developercommunity.visualstudio.com/t/stdbad-alloc-failures-are-undebuggable/542559?viewtype=solutions, it is a bug in MSVC. Unfortunately, there is no good solution for release builds.

Intel-pin: INS_MemoryDisplacement(ins) is not working in every case [duplicate]

I asked this question few days ago.
I wanted to get the stack allocation size (after the function creation). The answer suggests to do:
if((INS_Opcode(ins) == XED_ICLASS_ADD || INS_Opcode(ins) == XED_ICLASS_SUB) &&
REG(INS_OperandReg(ins, 0)) == REG_STACK_PTR && INS_OperandIsImmediate(ins, 1)
Which in theory is correct and does make sense. But, it doesn't work in practice (correct me if I'm wrong here). It works perfectly fine if I remove REG(INS_OperandReg(ins, 0)) == REG_STACK_PTR check. Why? Because pin doesn't detect the REG_STACK_PTR register when REG(INS_OperandReg(ins, 0)) is used to detect it. rather, it detects ah (which I believe is RAX), when I do check against add rsp, 0xffffffffffffff80 instruction (so, every time it gives: register: ah), as can be seen in my output below:
in
register: rbp
40051e push rbp
register: *invalid*
value: -128
40051f mov rbp, rsp
register: ah
400522 add rsp, 0xffffffffffffff80
register: *invalid*
400526 mov dword ptr [rbp-0x28], 0x7
register: *invalid*
40052d mov dword ptr [rbp-0x64], 0x9
register: eax
400534 mov eax, 0x0
register: *invalid*
400539 call 0x4004e6
register: rbp
4004e6 push rbp
register: *invalid*
value: 64
4004e7 mov rbp, rsp
register: ah
4004ea sub rsp, 0x40
register: *invalid*
4004ee mov dword ptr [rbp-0xc], 0x4
register: rax
4004f5 lea rax, ptr [rbp-0xc]
register: *invalid*
4004f9 mov qword ptr [rbp-0x8], rax
register: rax
4004fd mov rax, qword ptr [rbp-0x8]
register: eax
400501 mov eax, dword ptr [rax]
register: *invalid*
400503 mov esi, eax
register: edi
400505 mov edi, 0x4005d0
register: eax
40050a mov eax, 0x0
register: rdi
40050f call 0x4003f0
register: rdi
4003f0 jmp qword ptr [rip+0x200c22]
register: *invalid*
4003f6 push 0x0
register: *invalid*
4003fb jmp 0x4003e0
register: *invalid*
4003e0 push qword ptr [rip+0x200c22]
register: rdi
4003e6 jmp qword ptr [rip+0x200c24]
4
register: *invalid*
400514 mov dword ptr [rbp-0x3c], 0x3
40051b nop
register: *invalid*
40051c leave
register: *invalid*
40051d ret
register: eax
40053e mov eax, 0x0
register: *invalid*
400543 leave
out
Interestingly, it does this for every occurrence of rsp (i.e. it detects ah instead of rsp). Also, the disassembly it prints, e.g. 400522 add rsp, 0xffffffffffffff80, always shows rsp (so why doesn't it print ah here?)
If ah represents rsp in some way, then I can always detect ah using: REG(INS_OperandReg(ins, 0)) == REG_AH. But, I want to understand what is going on here.
My code:
#include <iostream>
#include <fstream>
#include "pin.H"
#include <unordered_map>
// key to open the main Routine
static uint32_t key = 0;
// Ins object mapping
// Pairs an instruction handle with its disassembly text.
class Insr
{
public:
    Insr(string insDis, INS ins)
        : insDis(insDis)
        , ins(ins)
    {
    }

    // Both accessors return copies of the stored values.
    string get_insDis() { return insDis; }
    INS get_ins() { return ins; }

private:
    // Disassembled instruction
    string insDis;
    INS ins;
};
// Stack for the Insr structure
static std::unordered_map<ADDRINT, Insr*> insstack;
// This function is called before every instruction is executed
// Analysis routine, called before every instrumented instruction.
// Prints the first operand's register when it compares equal to REG_AH, the
// immediate of ADD/SUB instructions, and finally the address with its
// disassembly. Does nothing for addresses above 0x700000000000, while `key`
// is 0, or when no record was stored for `addr`.
VOID protect(uint64_t addr)
{
    if (addr > 0x700000000000)
        return;
    if (!key)
        return;

    // Single lookup. operator[] would insert a null entry for an unknown
    // address, which the code below would then dereference.
    auto entry = insstack.find(addr);
    if (entry == insstack.end() || entry->second == nullptr)
        return;
    Insr *record = entry->second;

    // Initialize the diassembled instruction
    string insdis = record->get_insDis();
    // NOTE(review): this INS handle was saved during instrumentation. Pin only
    // guarantees INS handles inside instrumentation routines, so the queries
    // below may describe a different instruction -- confirm, and prefer passing
    // the needed operands as IARG values.
    INS ins = record->get_ins();

    if (INS_OperandCount(ins) > 0)
    {
        if (REG(INS_OperandReg(ins, 0)) == REG_AH)
            std::cout << "register: " << REG_StringShort(REG(INS_OperandReg(ins, 0))) << '\n';
    }

    if ((INS_Opcode(ins) == XED_ICLASS_ADD || INS_Opcode(ins) == XED_ICLASS_SUB) &&
        INS_OperandIsImmediate(ins, 1))
    {
        int value = INS_OperandImmediate(ins, 1);
        std::cout << "value: " << dec << value << '\n';
    }

    std::cout << hex << addr << "\t" << insdis << std::endl;
}
// Pin calls this function every time a new instruction is encountered
// Pin calls this function every time a new instruction is encountered.
// Records the instruction's disassembly under its address (first sighting
// only) and inserts a call to protect() before it. Instructions above
// 0x700000000000 are ignored.
VOID Instruction(INS ins, VOID *v)
{
    const ADDRINT address = INS_Address(ins);
    if (address > 0x700000000000)
        return;

    // insert() keeps the existing entry when the address is already present,
    // so allocating unconditionally would leak the new Insr each time an
    // address is seen again.
    // NOTE(review): the stored INS is only meaningful during this call.
    if (insstack.find(address) == insstack.end())
    {
        insstack.insert(std::make_pair(address, new Insr(string(INS_Disassemble(ins)), ins)));
    }

    // if (REG_valid_for_iarg_reg_value(INS_MemoryIndexReg(ins)))
    //     std::cout << "true" << '\n';

    // Insert a call to protect before every instruction, passing its address
    INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)protect, IARG_ADDRINT, address,
                   IARG_END);
}
// Lock Routine
// Clears `key` (protect() returns early while it is 0) and prints "out".
void mutex_lock()
{
    key = 0;
    std::cout << "out\n";
}
// Sets `key` to 1 (enabling the output of protect()) and prints "in".
void mutex_unlock()
{
    key = 1;
    std::cout << "in\n";
}
// Routine-level instrumentation: for the routine named "main" only, calls
// mutex_unlock() on entry and mutex_lock() on exit. Other routines are left
// untouched.
void Routine(RTN rtn, VOID *V)
{
    if (RTN_Name(rtn) != "main")
    {
        return;
    }

    RTN_Open(rtn);
    RTN_InsertCall(rtn, IPOINT_BEFORE, (AFUNPTR)mutex_unlock, IARG_END);
    RTN_InsertCall(rtn, IPOINT_AFTER, (AFUNPTR)mutex_lock, IARG_END);
    RTN_Close(rtn);
}
// Prints the tool description followed by the knob summary to stderr and
// returns -1, which main() uses as the exit status.
INT32 Usage()
{
    cerr << "This tool counts the number of dynamic instructions executed" << endl
         << endl
         << KNOB_BASE::StringKnobSummary() << endl;
    return -1;
}
// Tool entry point: sets up Pin, registers the routine- and instruction-level
// instrumentation callbacks, then hands control to the application.
int main(int argc, char * argv[])
{
    // Initialize the symbol table
    PIN_InitSymbols();

    // Initialize pin; a non-zero result means the command line was rejected
    if (PIN_Init(argc, argv))
    {
        return Usage();
    }

    PIN_SetSyntaxIntel();

    // Routine instrumentation
    RTN_AddInstrumentFunction(Routine, 0);

    // Register Instruction to be called to instrument instructions
    INS_AddInstrumentFunction(Instruction, 0);

    // Start the program, never returns
    PIN_StartProgram();

    return 0;
}
I have a few questions regarding that.
How can I understand such a behavior? And how can I detect rsp if I want to? Lastly, how does the instruction prints rsp, but REG(INS_OperandReg(ins, 0)) == REG_STACK_PTR can not detect it?

The INS objects are only valid inside instrumentation routines, such as your Instruction routine. The INS type is nothing but a 32-bit integer that identifies an instruction. The Pin runtime internally maintains a table that maps these 32-bit integers to specific static instructions. It creates such a table whenever it's about to call an instrumentation routine. When the instrumentation routine returns, there is no guarantee that any of these identifiers map to the same static instructions and they may not even be valid. So when you save a copy of an INS object in the following line of code:
insstack.insert(std::make_pair(INS_Address(ins), new Insr(string(INS_Disassemble(ins)),
ins)));
that copy is only useful in the same instance of the Instruction routine. The next time the Instruction routine is called (or any other instrumentation routine), an instruction identifier might be reused for other instructions.
If you really want to pass an instruction to an analysis routine, you have two options:
Copy the actual bytes of the instruction to a buffer and pass the address of the buffer and later decode it using the XED API.
Pass the address of the instruction and later decode it using the XED API. This works if the instruction is guaranteed to be available at the same location later.

How to let MSVC compiler optimize multiple step POD initialization?

I've made this sample code:
#include <vector>
// Plain aggregate of three ints plus helpers that build one from the default
// values {41, 51, 61} and then overwrite a single field.
struct POD {
    int a;
    int b;
    int c;

    // Returns the default-valued object {a = 41, b = 51, c = 61}.
    static POD make_pod_with_default()
    {
        return POD{ 41, 51, 61 };
    }

    // Sets p.a; operates on the argument, not on *this.
    void change_pod_a(POD &p, int a)
    {
        p.a = a;
    }

    // Sets p.b; operates on the argument, not on *this.
    void change_pod_b(POD &p, int b)
    {
        p.b = b;
    }

    // Default values with field `a` replaced.
    static POD make_pod_with_a(int a)
    {
        POD result = make_pod_with_default();
        result.change_pod_a(result, a);
        return result;
    }

    // Default values with field `b` replaced by the argument.
    static POD make_pod_with_b(int a)
    {
        POD result = make_pod_with_default();
        result.change_pod_b(result, a);
        return result;
    }
};
int main()
{
std::vector<POD> vec{};
vec.reserve(2);
vec.push_back(POD::make_pod_with_a(71));
vec.push_back(POD::make_pod_with_b(81));
return vec[0].a + vec[0].b + vec[0].c + vec[1].a + vec[1].b + vec[1].c;
}
In the compiled assembly code we can see the following instructions are being generated for the first vec.push_back(...) call:
...
mov DWORD PTR $T2[esp+32], 41 ; 00000029H
...
mov DWORD PTR $T2[esp+36], 51 ; 00000033H
...
mov DWORD PTR $T5[esp+32], 71 ; 00000047H
...
mov DWORD PTR $T6[esp+44], 61 ; 0000003dH
...
There's a mov to [esp+32] for the 71, but the mov to [esp+32] for the 41 is still there, being useless! How can I write code for MSVC that will enable this kind of optimization, is MSVC even capable of it?
Both GCC and CLANG give more optimized versions, but CLANG defeats by a large margin with literally no overhead, in a very clean and logical fashion:
CLANG generated code:
main: # @main
push rax
mov edi, 24
call operator new(unsigned long)
mov rdi, rax
call operator delete(void*)
mov eax, 366
pop rcx
ret
Everything is done at compile time as 71 + 51 + 61 + 41 + 81 + 61 = 366!
I must admit it's painful to see my program being computed at compile time and still throw in that call to vec.reserve() in the assembly... but CLANG still takes the cake, by far! Come on MSVC, this is not a vector of volatile.

If you make your functions constexpr, you could write:
// First step: starts from {2, 5, 11} and overwrites b, yielding {2, 3, 11}.
constexpr POD step_one()
{
POD p{2, 5, 11};
p.b = 3;
return p;
}
// Second step: overwrites c in place.
constexpr void step_two(POD &p)
{
p.c = 5;
}
// Runs both steps; the result is {2, 3, 5}.
constexpr POD make_pod(){
POD p = step_one();
step_two(p);
return p;
}
// Runtime entry point. Binding the result to a constexpr variable requires
// make_pod() to be evaluated at compile time, so only the final values remain.
POD make_pod_final()
{
constexpr POD res = make_pod();
return res;
}
resulting in:
make_pod_final PROC
mov eax, DWORD PTR $T1[esp-4]
mov DWORD PTR [eax], 2
mov DWORD PTR [eax+4], 3
mov DWORD PTR [eax+8], 5
ret 0
Demo

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

LLVM fails to optimize simple function - llvm

Related

Injecting a Dylib into Processes Running Under Rosetta

Initializing plog::RollingFileAppender on Windows XP Triggers Access Violation (Null Pointer)

Different Stacktraces for NewHandler and UnhandledExceptionHandler

Intel-pin: INS_MemoryDisplacement(ins) is not working in every case [duplicate]

How to let MSVC compiler optimize multiple step POD initialization?

Categories

Resources