So i am currently learning about hooking and injection and build a little test application for myself. What i am doing in there is:
Allocating some memory inside the process (using VirtualAllocEx)
Writing code to this code-cave (using WriteProcessMemory)
Hooking a function inside the app to jump to this code-cave
Now i have some problems which i am unsure of why they occur.
My code for creating the cave is this:
DWORD procID = GetCurrentProcessId();
HANDLE procHandle = OpenProcess((PROCESS_VM_WRITE | PROCESS_VM_OPERATION), false, procID);
if (procHandle == NULL) {
std::cout << "OpenProcess failed..." << std::endl;
return -1;
}
DWORD leng = (unsigned int)dynASM_end - (unsigned int)dynASM;
leng += 270;
LPVOID allocAddr = VirtualAllocEx(procHandle, NULL, leng, MEM_COMMIT, PAGE_EXECUTE_READWRITE);
std::cout << "BaseAddr: " << allocAddr << std::endl;
int check = WriteProcessMemory(procHandle, (void*)((char*)allocAddr + 0x10E), dynASM, leng - 270, NULL);
unsigned int bExec = 1;
check = WriteProcessMemory(procHandle, allocAddr, &bExec, sizeof(unsigned int), NULL);
if (check) {
std::cout << "Successfully written asm to process!" << std::endl;
}
Now this is very basic. VirtualAllocEx always returns 0x00030000, is this possible?
This is my simple assembly code:
void testCall()
{
std::cout << "Hook worked!" << std::endl;
}
__declspec(naked) void dynASM()
{
__asm {
push ebp
mov ebp, esp
call testCall
mov esp, ebp
pop ebp
ret
}
}
__declspec(naked) void dynASM_end()
{
}
I believe i can call the function testCall this way, can't i? This is the code which i put into the codecave as you can see above.
Now i have a offset to a function inside another class in which i want to hook. This function only consists of:
__declspec(naked) void Core::HookFunc()
{
__asm {
mov edi, edi
push ebp
mov ebp, esp
mov esp, ebp
pop ebp
ret
}
}
And i try to hook like this:
DWORD hookFuncOffset = 0x14510;
DWORD jumpTo = (DWORD)((char*)allocAddr + 0x10E);
DWORD baseAddr = (DWORD)GetModuleHandle(NULL);
BYTE* finalAddr = (BYTE*)((DWORD)baseAddr + hookFuncOffset);
DWORD oldProtect, bkpProtect, relAdr;
VirtualProtect(finalAddr, 5, PAGE_EXECUTE_READWRITE, &oldProtect);
relAdr = (DWORD)(jumpTo - (DWORD)finalAddr) - 5;
*finalAddr = 0xE9;
*((DWORD*)finalAddr + 0x1) = relAdr;
VirtualProtect(finalAddr, 5, oldProtect, &bkpProtect);
allocAddr is the address for the code-cave i have allocated above.
Now while debugging it i can see that relAddr is 0xFFFABBF9 which seems really high to me. But if i look into the disassembly view of Visual Studio the jmp created leads to 0xF9939B14 which leads to an Access Violation Exception. I would believe that even if the jmp would lead to the "correct" address it would also throw an Exception. What am i doing wrong?
Related
I made a WOW64 syscall hook for the `NtCreateSection` function using the following code:
#include "Funcs.h"
#include <cstdio>
#include <Windows.h>
const int PAGE_SIZE = 0x1000;
const int SYSCALL_INTERCEPT = 0x4A;
const int NUM_WOW64_BYTES = 0x9;
using pNtCreateSection =
NTSTATUS (NTAPI*)(PHANDLE SectionHandle, ULONG DesiredAccess, POBJECT_ATTRIBUTES ObjectAttributes,
PLARGE_INTEGER MaximumSize, ULONG PageAttributess, ULONG SectionAttributes, HANDLE FileHandle);
pNtCreateSection NtCreateSection = nullptr;
DWORD_PTR dwWow64Address = 0;
LPVOID lpJmpRealloc = nullptr;
ULONG SectionAttributes;
void __declspec(naked) NtCreateSectionHook()
{
__asm
{
pushad
}
fprintf(stderr, "NtCreateSectionHook called !\n");
__asm
{
popad
jmp lpJmpRealloc
}
}
DWORD_PTR __declspec(naked) GetWow64Address()
{
__asm
{
mov eax, dword ptr fs:[0xC0]
ret
}
}
void __declspec(naked) Wow64Trampoline()
{
__asm
{
cmp eax, SYSCALL_INTERCEPT
jz NtCreateSectionHook
jmp lpJmpRealloc
}
}
LPVOID CreateNewJump(const DWORD_PTR dwWow64Address)
{
lpJmpRealloc = VirtualAlloc(nullptr, PAGE_SIZE, MEM_RESERVE | MEM_COMMIT,
PAGE_EXECUTE_READWRITE);
(void)memcpy(lpJmpRealloc, (const void *)dwWow64Address, NUM_WOW64_BYTES);
return lpJmpRealloc;
}
void EnableWow64Redirect(const DWORD_PTR dwWow64Address, const LPVOID lpNewJumpLocation)
{
unsigned char trampolineBytes[] =
{
0x68, 0xDD, 0xCC, 0xBB, 0xAA, /*push 0xAABBCCDD*/
0xC3, /*ret*/
0xCC, 0xCC, 0xCC /*padding*/
};
memcpy(&trampolineBytes[1], &lpNewJumpLocation, sizeof(DWORD_PTR));
WriteJump(dwWow64Address, trampolineBytes, sizeof trampolineBytes);
}
void WriteJump(const DWORD_PTR dwWow64Address, const void *pBuffer, size_t ulSize)
{
DWORD dwOldProtect = 0;
(void)VirtualProtect(reinterpret_cast<LPVOID>(dwWow64Address), PAGE_SIZE, PAGE_EXECUTE_READWRITE, &dwOldProtect);
(void)memcpy(reinterpret_cast<void *>(dwWow64Address), pBuffer, ulSize);
(void)VirtualProtect(reinterpret_cast<LPVOID>(dwWow64Address), PAGE_SIZE, dwOldProtect, &dwOldProtect);
}
int main(int argc, char *argv[])
{
const auto hModule = GetModuleHandle(L"ntdll.dll");
NtCreateSection = reinterpret_cast<pNtCreateSection>(GetProcAddress(hModule, "NtCreateSection"));
dwWow64Address = GetWow64Address();
const auto lpNewJumpLocation = CreateNewJump(dwWow64Address);
EnableWow64Redirect(dwWow64Address, static_cast<LPVOID>(Wow64Trampoline));
//Test syscall
HANDLE hSection;
NtCreateSection(&hSection, SECTION_ALL_ACCESS, nullptr, nullptr, PAGE_EXECUTE_READWRITE, SEC_COMMIT | SEC_NOCHANGE, nullptr);
getchar();
return 0;
}
The code works fine until I change the hooked function to this
void __declspec(naked) NtCreateSectionHook()
{
__asm
{
pushad
mov eax, [esp + 28]
mov SectionAttributes, eax
}
fprintf(stderr, "NtCreateSectionHook called !\n");
if ((SectionAttributes & SEC_NOCHANGE) != 0)
{
fprintf(stderr, "SEC_NOCHANGE found !\n");
}
__asm
{
popad
jmp lpJmpRealloc
}
}
The problem in my code is that the pushad instruction messes with the esp therefore I can't access the stack anymore and if I don't use pushad/popap the app crashes since I'm messing up with the stack then jumping to the real function address.
The argument I wanna access and change is the 6th argument of NtCreateSection
function.
pushad does not prevent you from accessing the stack. pushad pushes 32 bytes (8 registers, 4 bytes each) into the stack, hence, any offset after pushad should be corrected by adding 32.
Is it possible in WinAPI to set stack size for the current thread at runtime like setrlimit does on Linux?
I mean to increase the reserved stack size for the current thread if it is too small for the current requirements.
This is in a library that may be called by threads from other programming languages, so it's not an option to set stack size at compile time.
If not, any ideas about a solution like an assembly trampoline that changes the stack pointer to a dynamically allocated memory block?
FAQ: Proxy thread is a surefire solution (unless the caller thread has extremely small stack). However, thread switching seems a performance killer. I need substantial amount of stack for recursion or for _alloca. This is also for performance, because heap allocation is slow, especially if multiple threads allocate from heap in parallel (they get blocked by the same libc/CRT mutex, so the code becomes serial).
you can not full swap stack in current thread (allocate self, delete old) in library code because in old stack - return addresses, may be pointers to variables in stack, etc.
and you can not expand stack (virtual memory for it already allocated (reserved/commit) and not expandable.
however possible allocate temporary stack and switch to this stack during call. you must in this case save old StackBase and StackLimit from NT_TIB (look this structure in winnt.h), set new values (you need allocate memory for new stack), do call (for switch stack you need some assembly code - you can not do this only on c/c++) and return original StackBase and StackLimit. in kernelmode exist support for this - KeExpandKernelStackAndCallout
however in user mode exist Fibers - this is very rare used, but look like perfectly match to task. with Fiber we can create additional stack/execution context inside current thread.
so in general solution is next (for library):
on DLL_THREAD_ATTACH :
convert thread to fiber
(ConvertThreadToFiber) (if it return false check also
GetLastError for ERROR_ALREADY_FIBER - this is also ok code)
and create own Fiber by call CreateFiberEx
we do this only once. than, every time when your procedure is called, which require large stack space:
remember the current fiber by call GetCurrentFiber
setup task for your fiber
switch to your fiber by call SwitchToFiber
call procedure inside fiber
return to original fiber (saved from call GetCurrentFiber)
again by SwitchToFiber
and finally on DLL_THREAD_DETACH you need:
delete your fiber by DeleteFiber
convert fiber to thread by call ConvertFiberToThread but only
in case initial ConvertThreadToFiber return true (if was
ERROR_ALREADY_FIBER- let who first convert thread to fiber convert
it back - this is not your task in this case)
you need some (usual small) data associated with your fiber / thread. this must be of course per thread variable. so you need use __declspec(thread) for declare this data. or direct use TLS (or which modern c++ features exist for this)
demo implementation is next:
typedef ULONG (WINAPI * MY_EXPAND_STACK_CALLOUT) (PVOID Parameter);
class FIBER_DATA
{
public:
PVOID _PrevFiber, _MyFiber;
MY_EXPAND_STACK_CALLOUT _pfn;
PVOID _Parameter;
ULONG _dwError;
BOOL _bConvertToThread;
static VOID CALLBACK _FiberProc( PVOID lpParameter)
{
reinterpret_cast<FIBER_DATA*>(lpParameter)->FiberProc();
}
VOID FiberProc()
{
for (;;)
{
_dwError = _pfn(_Parameter);
SwitchToFiber(_PrevFiber);
}
}
public:
~FIBER_DATA()
{
if (_MyFiber)
{
DeleteFiber(_MyFiber);
}
if (_bConvertToThread)
{
ConvertFiberToThread();
}
}
FIBER_DATA()
{
_bConvertToThread = FALSE, _MyFiber = 0;
}
ULONG Create(SIZE_T dwStackCommitSize, SIZE_T dwStackReserveSize);
ULONG DoCallout(MY_EXPAND_STACK_CALLOUT pfn, PVOID Parameter)
{
_PrevFiber = GetCurrentFiber();
_pfn = pfn;
_Parameter = Parameter;
SwitchToFiber(_MyFiber);
return _dwError;
}
};
__declspec(thread) FIBER_DATA* g_pData;
ULONG FIBER_DATA::Create(SIZE_T dwStackCommitSize, SIZE_T dwStackReserveSize)
{
if (ConvertThreadToFiber(this))
{
_bConvertToThread = TRUE;
}
else
{
ULONG dwError = GetLastError();
if (dwError != ERROR_ALREADY_FIBER)
{
return dwError;
}
}
return (_MyFiber = CreateFiberEx(dwStackCommitSize, dwStackReserveSize, 0, _FiberProc, this)) ? NOERROR : GetLastError();
}
void OnDetach()
{
if (FIBER_DATA* pData = g_pData)
{
delete pData;
}
}
ULONG OnAttach()
{
if (FIBER_DATA* pData = new FIBER_DATA)
{
if (ULONG dwError = pData->Create(2*PAGE_SIZE, 512 * PAGE_SIZE))
{
delete pData;
return dwError;
}
g_pData = pData;
return NOERROR;
}
return ERROR_NO_SYSTEM_RESOURCES;
}
ULONG WINAPI TestCallout(PVOID param)
{
DbgPrint("TestCallout(%s)\n", param);
return NOERROR;
}
ULONG DoCallout(MY_EXPAND_STACK_CALLOUT pfn, PVOID Parameter)
{
if (FIBER_DATA* pData = g_pData)
{
return pData->DoCallout(pfn, Parameter);
}
return ERROR_GEN_FAILURE;
}
if (!OnAttach())//DLL_THREAD_ATTACH
{
DoCallout(TestCallout, "Demo Task #1");
DoCallout(TestCallout, "Demo Task #2");
OnDetach();//DLL_THREAD_DETACH
}
also note that all fibers executed in single thread context - multiple fibers associated with thread can not execute in concurrent - only sequential, and you yourself control switch time. so not need any additional synchronization. and SwitchToFiber - this is complete user mode proc. which executed very fast, never fail (because never allocate any resources)
update
despite use __declspec(thread) FIBER_DATA* g_pData; more simply (less code), better for implementation direct use TlsGetValue / TlsSetValue and allocate FIBER_DATA on first call inside thread, but not for all threads. also __declspec(thread) not correct worked (not worked at all) in XP for dll. so some modification can be
at DLL_PROCESS_ATTACH allocate your TLS slot gTlsIndex = TlsAlloc();
and free it on DLL_PROCESS_DETACH
if (gTlsIndex != TLS_OUT_OF_INDEXES) TlsFree(gTlsIndex);
on every DLL_THREAD_DETACH notification call
void OnThreadDetach()
{
if (FIBER_DATA* pData = (FIBER_DATA*)TlsGetValue(gTlsIndex))
{
delete pData;
}
}
and DoCallout need be modified in next way
ULONG DoCallout(MY_EXPAND_STACK_CALLOUT pfn, PVOID Parameter)
{
FIBER_DATA* pData = (FIBER_DATA*)TlsGetValue(gTlsIndex);
if (!pData)
{
// this code executed only once on first call
if (!(pData = new FIBER_DATA))
{
return ERROR_NO_SYSTEM_RESOURCES;
}
if (ULONG dwError = pData->Create(512*PAGE_SIZE, 4*PAGE_SIZE))// or what stack size you need
{
delete pData;
return dwError;
}
TlsSetValue(gTlsIndex, pData);
}
return pData->DoCallout(pfn, Parameter);
}
so instead allocate stack for every new thread on DLL_THREAD_ATTACH via OnAttach() much better alocate it only for threads when really need (at first call)
and this code can potential have problems with fibers, if someone else also try use fibers. say in msdn example code not check for ERROR_ALREADY_FIBER in case ConvertThreadToFiber return 0. so we can wait that this case will be incorrect handled by main application if we before it decide create fiber and it also try use fiber after us. also ERROR_ALREADY_FIBER not worked in xp (begin from vista).
so possible and another solution - yourself create thread stack, and temporary switch to it doring call which require large stack space. main need not only allocate space for stack and swap esp (or rsp) but not forget correct establish StackBase and StackLimit in NT_TIB - it is necessary and sufficient condition (otherwise exceptions and guard page extension will be not worked).
despite this alternate solution require more code (manually create thread stack and stack switch) it will be work on xp too and nothing affect in situation when somebody else also try using fibers in thread
typedef ULONG (WINAPI * MY_EXPAND_STACK_CALLOUT) (PVOID Parameter);
extern "C" PVOID __fastcall SwitchToStack(PVOID param, PVOID stack);
struct FIBER_DATA
{
PVOID _Stack, _StackLimit, _StackPtr, _StackBase;
MY_EXPAND_STACK_CALLOUT _pfn;
PVOID _Parameter;
ULONG _dwError;
static void __fastcall FiberProc(FIBER_DATA* pData, PVOID stack)
{
for (;;)
{
pData->_dwError = pData->_pfn(pData->_Parameter);
// StackLimit can changed during _pfn call
pData->_StackLimit = ((PNT_TIB)NtCurrentTeb())->StackLimit;
stack = SwitchToStack(0, stack);
}
}
ULONG Create(SIZE_T Reserve, SIZE_T Commit);
ULONG DoCallout(MY_EXPAND_STACK_CALLOUT pfn, PVOID Parameter)
{
_pfn = pfn;
_Parameter = Parameter;
PNT_TIB tib = (PNT_TIB)NtCurrentTeb();
PVOID StackBase = tib->StackBase, StackLimit = tib->StackLimit;
tib->StackBase = _StackBase, tib->StackLimit = _StackLimit;
_StackPtr = SwitchToStack(this, _StackPtr);
tib->StackBase = StackBase, tib->StackLimit = StackLimit;
return _dwError;
}
~FIBER_DATA()
{
if (_Stack)
{
VirtualFree(_Stack, 0, MEM_RELEASE);
}
}
FIBER_DATA()
{
_Stack = 0;
}
};
ULONG FIBER_DATA::Create(SIZE_T Reserve, SIZE_T Commit)
{
Reserve = (Reserve + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
Commit = (Commit + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1);
if (Reserve < Commit || !Reserve)
{
return ERROR_INVALID_PARAMETER;
}
if (PBYTE newStack = (PBYTE)VirtualAlloc(0, Reserve, MEM_RESERVE, PAGE_NOACCESS))
{
union {
PBYTE newStackBase;
void** ppvStack;
};
newStackBase = newStack + Reserve;
PBYTE newStackLimit = newStackBase - Commit;
if (newStackLimit = (PBYTE)VirtualAlloc(newStackLimit, Commit, MEM_COMMIT, PAGE_READWRITE))
{
if (Reserve == Commit || VirtualAlloc(newStackLimit - PAGE_SIZE, PAGE_SIZE, MEM_COMMIT, PAGE_READWRITE|PAGE_GUARD))
{
_StackBase = newStackBase, _StackLimit = newStackLimit, _Stack = newStack;
#if defined(_M_IX86)
*--ppvStack = FiberProc;
ppvStack -= 4;// ebp,esi,edi,ebx
#elif defined(_M_AMD64)
ppvStack -= 5;// x64 space
*--ppvStack = FiberProc;
ppvStack -= 8;// r15,r14,r13,r12,rbp,rsi,rdi,rbx
#else
#error "not supported"
#endif
_StackPtr = ppvStack;
return NOERROR;
}
}
VirtualFree(newStack, 0, MEM_RELEASE);
}
return GetLastError();
}
ULONG gTlsIndex;
ULONG DoCallout(MY_EXPAND_STACK_CALLOUT pfn, PVOID Parameter)
{
FIBER_DATA* pData = (FIBER_DATA*)TlsGetValue(gTlsIndex);
if (!pData)
{
// this code executed only once on first call
if (!(pData = new FIBER_DATA))
{
return ERROR_NO_SYSTEM_RESOURCES;
}
if (ULONG dwError = pData->Create(512*PAGE_SIZE, 4*PAGE_SIZE))
{
delete pData;
return dwError;
}
TlsSetValue(gTlsIndex, pData);
}
return pData->DoCallout(pfn, Parameter);
}
void OnThreadDetach()
{
if (FIBER_DATA* pData = (FIBER_DATA*)TlsGetValue(gTlsIndex))
{
delete pData;
}
}
and assembly code for SwitchToStack : on x86
#SwitchToStack#8 proc
push ebx
push edi
push esi
push ebp
xchg esp,edx
mov eax,edx
pop ebp
pop esi
pop edi
pop ebx
ret
#SwitchToStack#8 endp
and for x64:
SwitchToStack proc
push rbx
push rdi
push rsi
push rbp
push r12
push r13
push r14
push r15
xchg rsp,rdx
mov rax,rdx
pop r15
pop r14
pop r13
pop r12
pop rbp
pop rsi
pop rdi
pop rbx
ret
SwitchToStack endp
usage/test can be next:
gTlsIndex = TlsAlloc();//DLL_PROCESS_ATTACH
if (gTlsIndex != TLS_OUT_OF_INDEXES)
{
TestStackMemory();
DoCallout(TestCallout, "test #1");
//play with stack, excepions, guard pages
PSTR str = (PSTR)alloca(256);
DoCallout(zTestCallout, str);
DbgPrint("str=%s\n", str);
DoCallout(TestCallout, "test #2");
OnThreadDetach();//DLL_THREAD_DETACH
TlsFree(gTlsIndex);//DLL_PROCESS_DETACH
}
void TestMemory(PVOID AllocationBase)
{
MEMORY_BASIC_INFORMATION mbi;
PVOID BaseAddress = AllocationBase;
while (VirtualQuery(BaseAddress, &mbi, sizeof(mbi)) >= sizeof(mbi) && mbi.AllocationBase == AllocationBase)
{
BaseAddress = (PBYTE)mbi.BaseAddress + mbi.RegionSize;
DbgPrint("[%p, %p) %p %08x %08x\n", mbi.BaseAddress, BaseAddress, (PVOID)(mbi.RegionSize >> PAGE_SHIFT), mbi.State, mbi.Protect);
}
}
void TestStackMemory()
{
MEMORY_BASIC_INFORMATION mbi;
if (VirtualQuery(_AddressOfReturnAddress(), &mbi, sizeof(mbi)) >= sizeof(mbi))
{
TestMemory(mbi.AllocationBase);
}
}
ULONG WINAPI zTestCallout(PVOID Parameter)
{
TestStackMemory();
alloca(5*PAGE_SIZE);
TestStackMemory();
__try
{
*(int*)0=0;
}
__except(EXCEPTION_EXECUTE_HANDLER)
{
DbgPrint("exception %x handled\n", GetExceptionCode());
}
strcpy((PSTR)Parameter, "zTestCallout demo");
return NOERROR;
}
ULONG WINAPI TestCallout(PVOID param)
{
TestStackMemory();
DbgPrint("TestCallout(%s)\n", param);
return NOERROR;
}
The maximum stack size is determined when the thread is created. It cannot be modified after that time.
I tried to test the RegCreateKeyEx call using asm inline.
This is the code :
long regkey(HKEY lnKey, LPCTSTR lpsub, DWORD rise, LPTSTR lpc, DWORD dwopt, REGSAM sd, LPSECURITY_ATTRIBUTES lpas, HKEY * const &llkey, DWORD * const &dwDisposition)
{
__asm
{
lea eax, dwDisposition
push eax // put eax at the top of the stack
lea eax, llkey
push eax
lea eax, lpas
push eax
lea eax, sd
push eax
lea eax, dwopt
push eax
lea eax, lpc
push eax
lea eax, rise
push eax
lea eax, lpsub
push eax
lea eax, lnKey
push eax
call DWORD ptr RegCreateKeyEx
}
return;
}
int main()
{
HKEY lnKey;
LPCTSTR lpsub;
DWORD rise;
LPTSTR lpc;
DWORD dwopt;
REGSAM sd;
LPSECURITY_ATTRIBUTES lpas;
HKEY llkey;
DWORD dwDisposition;
long ret0 = regkey(HKEY_CURRENT_USER, TEXT(linkey.c_str()), 0, NULL, 0, KEY_WRITE, NULL, &llkey, &dwDisposition);
printf("CREATE %d\n", ret0);
return 0;
}
However, it returns error 87 (INVALID ARGUMENT).
The linkey.c_str () variable contains the string "Software\\Microsoft\\Windows\\CurrentVersion\\Run\\" path and passes it correctly because I tried to use it directly by calling directly the RegCreateKeyEx (HKEY_CURRENT_USER,
TEXT (linkey.c_str ()),
0, NULL, 0,
KEY_WRITE, NULL,
& Llkey, & dwDisposition) ad it's all OK
Where am I wrong in use asm inline?
If you wrote the code in C++, like so:
LONG regkey(HKEY lnKey, LPCTSTR lpsub, DWORD rise, LPTSTR lpc, DWORD dwopt, REGSAM sd, LPSECURITY_ATTRIBUTES lpas, HKEY * const &llkey, DWORD * const &dwDisposition)
{
return RegCreateKeyEx(lnKey, lpsub, rise, lpc, dwopt, sd, lpas, llkey, dwDisposition);
}
then you could compile it using the /FA switch, as advised in the comments, to get the compiler to generate an assembly listing of the code it would generate. Alternatively, you could compile it and then break in, using the debugger to show the disassembly of the actual binary. Either way, you'd see the following assembly code being generated by the compiler:
mov eax, DWORD PTR [esp+36] ; dwDisposition
push DWORD PTR [eax]
mov eax, DWORD PTR [esp+36] ; llkey
push DWORD PTR [eax]
push DWORD PTR [esp+36] ; lpas
push DWORD PTR [esp+36] ; sd
push DWORD PTR [esp+36] ; dwopt
push DWORD PTR [esp+36] ; lpc
push DWORD PTR [esp+36] ; rise
push DWORD PTR [esp+36] ; lpsub
push DWORD PTR [esp+36] ; lnKey
call DWORD PTR RegCreateKeyEx
ret 0
So, the inline assembly should be:
LONG regkey(HKEY lnKey, LPCTSTR lpsub, DWORD rise, LPTSTR lpc, DWORD dwopt, REGSAM sd, LPSECURITY_ATTRIBUTES lpas, HKEY * const &llkey, DWORD * const &dwDisposition)
{
__asm
{
mov eax, DWORD PTR [dwDisposition]
push DWORD PTR [eax]
mov eax, DWORD PTR [llkey]
push DWORD PTR [eax]
push DWORD PTR [lpas];
push DWORD PTR [sd];
push DWORD PTR [dwopt];
push DWORD PTR [lpc];
push DWORD PTR [rise];
push DWORD PTR [lpsub]
push DWORD PTR [lnKey]
call DWORD PTR RegCreateKeyEx
} // return value is left in EAX
}
which is pretty darn easy. You don't need to worry about calculating offsets from the stack pointer, because the inline assembler supports the use of C++ variables. There is never a need for a LEA instruction. In fact, the LEA instruction is wrong, because it causes you to pass pointers as parameters to the RegCreateKeyEx function, instead of the values themselves, which is why you were getting error code 87, "Invalid parameter".
The only complicated thing is the way the dwDisposition and llkey parameters are handled. First, the address has to be loaded into a register (EAX), and then that address is dereferenced as it is pushed onto the stack. This additional level of indirection is necessary because you passed these parameters as references to pointers. I have no idea why you've chosen to do that, but because you did, the reference has to be dereferenced. (Under the hood, a C++ compiler implements references like pointers.)
However, I have no idea why you would actually write this code in inline assembly. There is absolutely no reason to do so; it isn't buying you anything, it's just making things more complicated to write and maintain. It's also costing you something in performance. Above, I showed what a C++ compiler would generate for the function call. Here is what the compiler generates when you use inline assembly:
push ebp
mov ebp, esp
mov eax, DWORD PTR _dwDisposition$[ebp]
push DWORD PTR [eax]
mov eax, DWORD PTR _llkey$[ebp]
push DWORD PTR [eax]
push DWORD PTR _lpas$[ebp]
push DWORD PTR _sd$[ebp]
push DWORD PTR _dwopt$[ebp]
push DWORD PTR _lpc$[ebp]
push DWORD PTR _rise$[ebp]
push DWORD PTR _lpsub$[ebp]
push DWORD PTR _lnKey$[ebp]
call DWORD PTR RegCreateKey
pop ebp
ret 0
Notice the extra prologue and epilogue instructions that are made necessary by the use of inline assembly, since the compiler has no idea what you are actually doing inside of the inline assembly and therefore has to compensate for it by setting up and tearing down a stack frame. It isn't a massive performance cost or anything, but again, it's completely pointless.
Your follow-up question/problem (from a comment and an answer you posted) makes no sense to me. You shouldn't need any sort of exception handler, and since you didn't quote the exact error message you're getting, I have no idea what it might be trying to say. It is likely that the problem relates to strtemp, whose declaration you don't show us. Since you said you are calling the ANSI version of RegCreateKeyEx, strtemp should be a pointer to a char buffer, terminated with a NUL character. Also, the next parameter you are passing to RegSetValueEx is wrong: cbData must include the terminating NUL character, so this should properly be strlen(strtemp) + 1.
Other problems with your code include the fact that there is absolutely no error checking. If the attempt to create the registry key fails, then you should not attempt to write into it, nor should you attempt to close it.
Also, there is never a reason in the year 2017 to call the ANSI version of a Windows API function. Everything has been Unicode internally for almost two decades, and your code needs to get with the program. This means using strings consisting of wchar_t characters.
The question is fairly straight forward, what I'm trying to do is restore my process' detoured functions.
When I say detoured I mean the usual jmp instruction to an unknown location.
For example, when the ntdll.dll export NtOpenProcess() is not detoured, the first 5 bytes of the instruction of the function are along the lines of mov eax, *.
(The * offset depending on the OS version.)
When it gets detoured, that mov eax, * turns into a jmp.
What I'm trying to do is restore their bytes to what they were originally before any memory modifications.
My idea was to try and read the information I need from the disk, not from memory, however I do not know how to do that as I'm just a beginner.
Any help or explanation is greatly welcomed, if I did not explain my problem correctly please tell me!
I ended up figuring it out.
Example on NtOpenProcess.
Instead of restoring the bytes I decided to jump over them instead.
First we have to define the base of ntdll.
/* locate ntdll */
#define NTDLL _GetModuleHandleA("ntdll.dll")
Once we've done that, we're good to go. GetOffsetFromRva will calculate the offset of the file based on the address and module header passed to it.
DWORD GetOffsetFromRva(IMAGE_NT_HEADERS * nth, DWORD RVA)
{
PIMAGE_SECTION_HEADER sectionHeader = IMAGE_FIRST_SECTION(nth);
for (unsigned i = 0, sections = nth->FileHeader.NumberOfSections; i < sections; i++, sectionHeader++)
{
if (sectionHeader->VirtualAddress <= RVA)
{
if ((sectionHeader->VirtualAddress + sectionHeader->Misc.VirtualSize) > RVA)
{
RVA -= sectionHeader->VirtualAddress;
RVA += sectionHeader->PointerToRawData;
return RVA;
}
}
}
return 0;
}
We call this to get us the file offset that we need in order to find the original bytes of the function.
DWORD GetExportPhysicalAddress(HMODULE hmModule, char* szExportName)
{
if (!hmModule)
{
return 0;
}
DWORD dwModuleBaseAddress = (DWORD)hmModule;
IMAGE_DOS_HEADER* pHeaderDOS = (IMAGE_DOS_HEADER *)hmModule;
if (pHeaderDOS->e_magic != IMAGE_DOS_SIGNATURE)
{
return 0;
}
IMAGE_NT_HEADERS * pHeaderNT = (IMAGE_NT_HEADERS *)(dwModuleBaseAddress + pHeaderDOS->e_lfanew);
if (pHeaderNT->Signature != IMAGE_NT_SIGNATURE)
{
return 0;
}
/* get the export virtual address through a custom GetProcAddress function. */
void* pExportRVA = GetProcedureAddress(hmModule, szExportName);
if (pExportRVA)
{
/* convert the VA to RVA... */
DWORD dwExportRVA = (DWORD)pExportRVA - dwModuleBaseAddress;
/* get the file offset and return */
return GetOffsetFromRva(pHeaderNT, dwExportRVA);
}
return 0;
}
Using the function that gets us the file offset, we can now read the original export bytes.
size_t ReadExportFunctionBytes(HMODULE hmModule, char* szExportName, BYTE* lpBuffer, size_t t_Count)
{
/* get the offset */
DWORD dwFileOffset = GetExportPhysicalAddress(hmModule, szExportName);
if (!dwFileOffset)
{
return 0;
}
/* get the path of the targetted module */
char szModuleFilePath[MAX_PATH];
GetModuleFileNameA(hmModule, szModuleFilePath, MAX_PATH);
if (strnull(szModuleFilePath))
{
return 0;
}
/* try to open the file off the disk */
FILE *fModule = fopen(szModuleFilePath, "rb");
if (!fModule)
{
/* we couldn't open the file */
return 0;
}
/* go to the offset and read it */
fseek(fModule, dwFileOffset, SEEK_SET);
size_t t_Read = 0;
if ((t_Read = fread(lpBuffer, t_Count, 1, fModule)) == 0)
{
/* we didn't read anything */
return 0;
}
/* close file and return */
fclose(fModule);
return t_Read;
}
And we can retrieve the syscall index from the mov instruction originally placed in the first 5 bytes of the export on x86.
DWORD GetSyscallIndex(char* szFunctionName)
{
BYTE buffer[5];
ReadExportFunctionBytes(NTDLL, szFunctionName, buffer, 5);
if (!buffer)
{
return 0;
}
return BytesToDword(buffer + 1);
}
Get the NtOpenProcess address and add 5 to trampoline over it.
DWORD _ptrNtOpenProcess = (DWORD) GetProcAddress(NTDLL, "NtOpenProcess") + 5;
DWORD _oNtOpenProcess = GetSyscallIndex("NtOpenProcess");
The recovered/reconstructed NtOpenProcess.
__declspec(naked) NTSTATUS NTAPI _NtOpenProcess
(
_Out_ PHANDLE ProcessHandle,
_In_ ACCESS_MASK DesiredAccess,
_In_ POBJECT_ATTRIBUTES ObjectAttributes,
_In_opt_ PCLIENT_ID ClientId
) {
__asm
{
mov eax, [_oNtOpenProcess]
jmp dword ptr ds : [_ptrNtOpenProcess]
}
}
Let's call it.
int main()
{
printf("NtOpenProcess %x index: %x\n", _ptrNtOpenProcess, _oNtOpenProcess);
uint32_t pId = 0;
do
{
pId = GetProcessByName("notepad.exe");
Sleep(200);
} while (pId == 0);
OBJECT_ATTRIBUTES oa;
CLIENT_ID cid;
cid.UniqueProcess = (HANDLE)pId;
cid.UniqueThread = 0;
InitializeObjectAttributes(&oa, NULL, 0, NULL, NULL);
HANDLE hProcess;
NTSTATUS ntStat;
ntStat = _NtOpenProcess(&hProcess, PROCESS_ALL_ACCESS, &oa, &cid);
if (!NT_SUCCESS(ntStat))
{
printf("Couldn't open the process. NTSTATUS: %d", ntStat);
return 0;
}
printf("Successfully opened the process.");
/* clean up. */
NtClose(hProcess);
getchar();
return 0;
}
Debugging with visual studio 2005 The following Error Displayed :
Unhandled exception at 0x00000000 in procexp.exe: 0xC0000005: Access
violation reading location 0x00000000.
And Thread Information:
2704 Win32 Thread 00000000 Normal 0
extern "C" VDLL2_API BOOL WINAPI MyTerminateProcess(HANDLE hProcess,UINT uExitCode)
{
SetLastError(5);
return FALSE;
}
FARPROC HookFunction(char *UserDll,FARPROC pfn,FARPROC HookFunc)
{
DWORD dwSizeofExportTable=0;
DWORD dwRelativeVirtualAddress=0;
HMODULE hm=GetModuleHandle(NULL);
FARPROC pfnOriginalAddressToReturn;
PIMAGE_DOS_HEADER pim=(PIMAGE_DOS_HEADER)hm;
PIMAGE_NT_HEADERS pimnt=(PIMAGE_NT_HEADERS)((DWORD)pim +
(DWORD)pim->e_lfanew);
PIMAGE_DATA_DIRECTORY
pimdata=(PIMAGE_DATA_DIRECTORY)&(pimnt->OptionalHeader.DataDirectory);
PIMAGE_OPTIONAL_HEADER pot=&(pimnt->OptionalHeader);
PIMAGE_DATA_DIRECTORY
pim2=(PIMAGE_DATA_DIRECTORY)((DWORD)pot+(DWORD)104);
dwSizeofExportTable=pim2->Size;
dwRelativeVirtualAddress=pim2->VirtualAddress;
char *ascstr;
PIMAGE_IMPORT_DESCRIPTOR
pimexp=(PIMAGE_IMPORT_DESCRIPTOR)(pim2->VirtualAddress + (DWORD)pim);
while(pimexp->Name)
{
ascstr=(char *)((DWORD)pim + (DWORD)pimexp->Name);
if(strcmpi(ascstr,UserDll) == 0)
{
break;
}
pimexp++;
}
PIMAGE_THUNK_DATA
pname=(PIMAGE_THUNK_DATA)((DWORD)pim+(DWORD)pimexp->FirstThunk);
LPDWORD lpdw=&(pname->u1.Function);
DWORD dwError=0;
DWORD OldProtect=0;
while(pname->u1.Function)
{
if((DWORD)pname->u1.Function == (DWORD)pfn)
{
lpdw=&(pname->u1.Function);
VirtualProtect((LPVOID)lpdw,sizeof(DWORD),PAGE_READWRITE,&OldProtect);
pname->u1.Function=(DWORD)HookFunc;
VirtualProtect((LPVOID)lpdw,sizeof(DWORD),PAGE_READONLY,&OldProtect);
return pfn;
}
pname++;
}
return (FARPROC)0;
}
FARPROC CallHook(void)
{
HMODULE hm=GetModuleHandle(TEXT("Kernel32.dll"));
FARPROC fp=GetProcAddress(hm,"TerminateProcess");
HMODULE hm2=GetModuleHandle(TEXT("vdll2.dll"));
FARPROC fpHook=GetProcAddress(hm2,"MyTerminateProcess");
dwAddOfTerminateProcess=HookFunction("Kernel32.dll",fp,fpHook);
if(dwAddOfTerminateProcess == 0)
{
MessageBox(NULL,TEXT("Unable TO Hook Function."),TEXT("Parth"),MB_OK);
}
else
{
MessageBox(NULL,TEXT("Success Hooked."),TEXT("Parth"),MB_OK);
}
return 0;
}
Thanks in advance for any help.
004118AC mov esi,esp
004118AE push 0
004118B0 mov eax,dword ptr [hProc]
004118B3 push eax
004118B4 call dword ptr[__imp__TerminateProcess#8(4181E4h)]
004118BA cmp esi,esp
esi returned zero. why ?
What is VDLL2_API defined as? It may be interfering with the calling convention (which is meant to be WINAPI for this function, as you write it later on the same line).
Stack problems on exit (ESI, ESP) usually indicate that you have your calling conventions mixed up. You appear to have used FARPROC consistently everywhere else, but since you know the exact prototype of the function, try typedef-ing that as the type to use instead:
typedef BOOL (WINAPI *TERMINATEPROCESS_PROC)(HANDLE, UINT);
Now use TERMINATEPROCESS_PROC everywhere instead of FARPROC.
Don't write this kind of code yourself. Use the Detours library from Microsoft Research.