Call original function after hooked function - c++

I want to learn how to make hooks so I made a simple program to test me. The hook is working perfectly but I also wanted to call the original function after calling the hooked. Tried to do in many ways, moving the stack, restoring the original bytes and then calling the original function at the end of the hooked function but it did not work.
My program simply waits any key and print the text.
My hook (DLL):
#include <windows.h>
#include <stdio.h>
void WriteMem(DWORD dwAddr, BYTE *dwNew, int Size);
void MyPrintf(char *text)
{
printf("\n Original message: %s\n", buff);
}
void WriteMem(DWORD dwAddr, BYTE *dwNew, int Size)
{
DWORD OldProt;
VirtualProtect((void*)dwAddr, Size, PAGE_EXECUTE_READWRITE, &OldProt);
memset((void*)(dwAddr), 0x90, Size);
memcpy((void*)(dwAddr), (void*)(dwNew), Size);
VirtualProtect((void*)(dwAddr), Size, OldProt, &OldProt);
}
void SetJMP(INT32 dwOld, LPVOID dwNew, INT32 Size)
{
BYTE dwNewBytes[5] = {0xE9, 0x00, 0x00, 0x00, 0x00};
DWORD calc = ((DWORD)dwNew - dwOld - 5);
memcpy(&dwNewBytes[1], &calc, 4);
WriteMem(dwOld, dwNewBytes, Size);
}
int SetIntercepet()
{ // 0x40102A printf address
SetJMP(0x40102A, MyPrintf, 7);
return 0;
}
BOOL APIENTRY DllMain(HANDLE hModule, DWORD fdwReason, LPVOID lpReserved)
{
switch(fdwReason)
{
case DLL_PROCESS_ATTACH:
SetIntercepet();
break;
}
return TRUE;
}
My test program (C):
#include <stdio.h>
#include <windows.h>
int main()
{
while (1)
{
system("pause");
printf("ORIGINAL\n");
}
}
part of the test program decompiled:
00401000 /$ 55 PUSH EBP
00401001 |. 8BEC MOV EBP,ESP
00401003 |> B8 01000000 /MOV EAX,1
00401008 |. 85C0 |TEST EAX,EAX
0040100A |. 74 1C |JE SHORT test.00401028
0040100C |. 68 00E04000 |PUSH test.0040E000 ; ASCII "pause"
00401011 |. E8 D9000000 |CALL test.004010EF
00401016 |. 83C4 04 |ADD ESP,4
00401019 |. 68 08E04000 |PUSH test.0040E008 ; ASCII "ORIGINAL"
0040101E |. E8 07000000 |CALL test.0040102A
00401023 |. 83C4 04 |ADD ESP,4
00401026 |.^EB DB \JMP SHORT test.00401003
00401028 |> 5D POP EBP
00401029 \. C3 RETN
0040102A /$ 6A 0C PUSH 0C
0040102C |. 68 50D44000 PUSH test.0040D450
00401031 |. E8 52140000 CALL test.00402488
00401036 |. 33C0 XOR EAX,EAX
00401038 |. 33F6 XOR ESI,ESI
0040103A |. 3975 08 CMP DWORD PTR SS:[EBP+8],ESI

Since you are overwriteing the actual printf function, you'll have to copy the instructions there and then perform the relevant "fixup" to make it work in the new location, as well as jumping back to "after your patch". This would either involve knowing exactly what the original code is (in other words, push 0c, push test.0040d450), or understanding enough of the machine code to split instructions up on their boundaries.
Another, much easier method would be to replace the original call-spot with new code. So, instead of patching your code into 0x40102a, you patch your code into 40101E, saving the 40102a from the original call point, and once you have done what you need to do, you call back into 40102a.
Something like this would do that:
void* origPrintf;
void MyPrintf(char *text)
{
void (*orig)(char *text) = reinterpret_cast<void (*)(char *text)>(origPrintf);
printf("\n Original message: %s\n", buff);
orig(text);
}
void WriteMem(DWORD dwAddr, BYTE *dwNew, int Size, void &*oldCall)
{
DWORD OldProt;
int offset;
VirtualProtect((void*)dwAddr, Size, PAGE_EXECUTE_READWRITE, &OldProt);
memcpy(offset, (void*)(dwAddr + 1), sizeof(offset));
oldCall = (void*)dwAddr + 5 + offset; // 5 byte call instruction assumed.
memset((void*)(dwAddr), 0x90, Size);
memcpy((void*)(dwAddr), (void*)(dwNew), Size);
VirtualProtect((void*)(dwAddr), Size, OldProt, &OldProt);
}
void SetJMP(INT32 dwOld, LPVOID dwNew, INT32 Size, void&*oldCall)
{
BYTE dwNewBytes[5] = {0xE9, 0x00, 0x00, 0x00, 0x00};
DWORD calc = ((DWORD)dwNew - dwOld - 5);
memcpy(&dwNewBytes[1], &calc, 4);
WriteMem(dwOld, dwNewBytes, Size, oldCall);
}
int SetIntercepet()
{ // 0x40102A printf address
SetJMP(0x40102A, MyPrintf, 7);
return 0;
}
[I can't check the code, as I'm pretty sure the addresses are quite different on my 64-bit Linux machine, but it should give a reasonable principle]

Related

Detecting mov dword ptr [rbp - ...] instructions with a specific register in the addressing mode, using Intel PIN

I want to detect the instructions like mov dword ptr [rbp-0x28], 0x7 (so, all the instructions in mov dword ptr [rbp-0xxx], xxx format) using Intel PIN (mainly to get array writes information). In un-optimized code, this should get most stores to local variables.
I can do:
if (INS_Opcode(ins) == XED_ICLASS_MOV)
instruction detection;
to detect the mov instruction. But, along with that it also detects other instruction such as mov eax, 0x0. I want to detect the instructions with dword ptr size directive.
I checked the pin instruction inspection API and pin xed-iclass-enum. Using that documentation I tried something like:
if ((INS_Opcode(ins) == XED_ICLASS_MOV) && INS_OperandIsMemory(ins, 0))
instruction detection;
which gives me the desired result. But also gives me the instructions like mov esi, eax (which I don't desire).
My code:
#include <fstream>
#include <iostream>
#include "pin.H"
#include <stack>
#include <unordered_map>
// Additional library calls go here
// Stack allocation
struct Node
{
int value;
};
std::stack<Node> mainStack;
// Ins object mapping
class Insr
{
private:
INS insobject;
public:
Insr(INS insob)
{
insobject = insob;
}
INS get_insobject()
{
return insobject;
}
};
static std::unordered_map<ADDRINT, Insr*> insstack;
// Output file object
ofstream OutFile;
//static uint64_t counter = 0;
std::string rtin = "";
// Make this lock if you want to print from _start
uint32_t key = 0;
void printmaindisas(uint64_t addr, std::string disassins)
{
std::stringstream tempstream;
tempstream << std::hex << addr;
std::string address = tempstream.str();
// if (addr > 0x700000000000)
// return;
if (addr > 0x700000000000)
return;
if (!key)
return;
// if (insstack[addr]->get_opcode() == XED_ICLASS_ADD || insstack[addr]->get_opcode()
// == XED_ICLASS_SUB)
INS ins = insstack[addr]->get_insobject();
if((INS_Opcode(ins) == XED_ICLASS_ADD || INS_Opcode(ins) == XED_ICLASS_SUB)
&&(INS_OperandIsImmediate(ins, 1)))
{
int value = INS_OperandImmediate(ins, 1);
std::cout << "value: " << value << '\n';
Node node{value};
mainStack.push(node);
std::cout << "stack top: " << mainStack.top().value << '\n';
}
if ((INS_Opcode(ins) == XED_ICLASS_MOV) && INS_OperandIsMemory(ins, 0))
{
std::cout << "yes!" << '\n';
}
std::cout<<address<<"\t"<<disassins<<std::endl;
}
void mutex_lock()
{
key = 0;
std::cout<<"out\n";
}
void mutex_unlock()
{
key = 1;
std::cout<<"in\n";
}
void Instruction(INS ins, VOID *v)
{
insstack.insert(std::make_pair(INS_Address(ins), new Insr(ins)));
INS_InsertCall(ins, IPOINT_BEFORE, (AFUNPTR)printmaindisas, IARG_ADDRINT, INS_Address(ins),
IARG_PTR, new string(INS_Disassemble(ins)), IARG_END);
}
void Routine(RTN rtn, VOID *V)
{
if (RTN_Name(rtn) == "main")
{
//std::cout<<"Loading: "<<RTN_Name(rtn) << endl;
RTN_Open(rtn);
RTN_InsertCall(rtn, IPOINT_BEFORE, (AFUNPTR)mutex_unlock, IARG_END);
RTN_InsertCall(rtn, IPOINT_AFTER, (AFUNPTR)mutex_lock, IARG_END);
RTN_Close(rtn);
}
}
KNOB<string> KnobOutputFile(KNOB_MODE_WRITEONCE, "pintool", "o", "mytool.out", "specify output file name");
/*
VOID Fini(INT32 code, VOID *v)
{
// Write to a file since cout and cerr maybe closed by the application
OutFile.setf(ios::showbase);
OutFile << "Count " << count << endl;
OutFile.close();
}
*/
int32_t Usage()
{
cerr << "This is my custom tool" << endl;
cerr << endl << KNOB_BASE::StringKnobSummary() << endl;
return -1;
}
int main(int argc, char * argv[])
{
// It must be called for image instrumentation
// Initialize the symbol table
PIN_InitSymbols();
// Initialize pin
// PIN_Init must be called before PIN_StartProgram
// as mentioned in the documentation
if (PIN_Init(argc, argv)) return Usage();
// Open the output file to write
OutFile.open(KnobOutputFile.Value().c_str());
// Set instruction format as intel
// Not needed because my machine is intel
PIN_SetSyntaxIntel();
RTN_AddInstrumentFunction(Routine, 0);
// Add an isntruction instrumentation
INS_AddInstrumentFunction(Instruction, 0);
//PIN_AddFiniFunction(Fini, 0);
// Start the program here
PIN_StartProgram();
return 0;
}
And the output I'm getting:
in
40051e push rbp
value: -128
stack top: -128
40051f mov rbp, rsp
400522 add rsp, 0xffffffffffffff80
yes!
400526 mov dword ptr [rbp-0x28], 0x7
yes!
40052d mov dword ptr [rbp-0x64], 0x9
400534 mov eax, 0x0
400539 call 0x4004e6
4004e6 push rbp
value: 64
stack top: 64
4004e7 mov rbp, rsp
4004ea sub rsp, 0x40
yes!
4004ee mov dword ptr [rbp-0xc], 0x4
4004f5 lea rax, ptr [rbp-0xc]
yes!
4004f9 mov qword ptr [rbp-0x8], rax
4004fd mov rax, qword ptr [rbp-0x8]
400501 mov eax, dword ptr [rax]
yes!
400503 mov esi, eax
400505 mov edi, 0x4005d0
40050a mov eax, 0x0
40050f call 0x4003f0
4003f0 jmp qword ptr [rip+0x200c22]
4003f6 push 0x0
4003fb jmp 0x4003e0
4003e0 push qword ptr [rip+0x200c22]
4003e6 jmp qword ptr [rip+0x200c24]
4
yes!
400514 mov dword ptr [rbp-0x3c], 0x3
40051b nop
40051c leave
40051d ret
40053e mov eax, 0x0
400543 leave
out
Is this the correct way to do that (without any false positives)?
If you want to accept all of the following instructions:
mov [rbp + disp], reg/imm
mov [rbp*scale + disp], reg/imm
mov [reg + rbp*scale], reg/imm
mov [rbp + reg*scale + disp], reg/imm
then you need to perform the following checks:
if (INS_Opcode(ins) == XED_ICLASS_MOV && // Check that the instruction is MOV.
INS_OperandIsMemory(ins, 0) && // Check that the destination operand is a memory operand.
INS_OperandWidth(ins, 0) == 32 && // Check that the size of the operand is 32 bits.
(INS_OperandMemoryBaseReg(ins, 0) == REG_EBP ||
INS_OperandMemoryIndexReg(ins, 0) == REG_EBP)) // Check that the base or index register is RBP.
{
...
}
Note that these checks accept both MOV instructions with displacement (including a displacement of zero) and MOV instructions without displacement (which is semantically equivalent to a displacement of zero but the encoding is different).
I assumed that you want to accept RBP both as a base register or as an index register (potentially with a scale larger than 1). Note that in case RBP is used as a base register, the encoding of the instruction will always include a displacement. See: Why are rbp and rsp called general purpose registers?.
If you want to accept all of the following instructions where RBP is used as the base register:
mov [rbp + disp], reg/imm
mov [rbp + reg*scale + disp], reg/imm
then you need to perform the following checks:
if (INS_Opcode(ins) == XED_ICLASS_MOV && // Check that the instruction is MOV.
INS_OperandIsMemory(ins, 0) && // Check that the destination operand is a memory operand.
INS_OperandWidth(ins, 0) == 32 && // Check that the size of the operand is 32 bits.
INS_OperandMemoryBaseReg(ins, 0) == REG_EBP) // Check that the base is RBP.
{
...
}
If you want to accept only the following instruction:
mov [rbp + disp], reg/imm
then you need to perform the following checks:
if (INS_Opcode(ins) == XED_ICLASS_MOV && // Check that the instruction is MOV.
INS_OperandIsMemory(ins, 0) && // Check that the destination operand is a memory operand.
INS_OperandWidth(ins, 0) == 32 && // Check that the size of the operand is 32 bits.
INS_OperandMemoryBaseReg(ins, 0) == REG_EBP && // Check that the base is RBP.
INS_OperandMemoryIndexReg(ins, 0) == REG_INVALID()) // Check that there is no index register.
{
...
}
If you want to check whether the displacement is a negative number, use the following check:
INS_OperandMemoryDisplacement(ins, 0) < 0
Note that INS_OperandMemoryDisplacement does not distinguish between a memory operand that has no displacement and one that has a displacement of zero. If there is no displacement, it just returns zero. If you want to determine whether the instruction encoding actually includes a displacement field, then you should use the XED API instead.

Stack Overflow C00000FD in MFC application

I am using a Custom tree control derived class. Below is the code.
CCustomTreeCtrl.h
class CCustomTreeCtrl : public CTreeCtrl
{
public:
BOOL Check;
CCustomTreeCtrl();
virtual ~CCustomTreeCtrl();
protected:
//{{AFX_MSG(CCustomTreeCtrl)
afx_msg void OnLButtonDblClk(UINT nFlags, CPoint point);
//}}AFX_MSG
DECLARE_MESSAGE_MAP();
};
CCustomTreeCtrl.cpp
CCustomTreeCtrl::CCustomTreeCtrl()
{
}
CCustomTreeCtrl::~CCustomTreeCtrl()
{
}
BEGIN_MESSAGE_MAP(CCustomTreeCtrl , CTreeCtrl)
//{{AFX_MSG(CCustomTreeCtrl)
ON_WM_LBUTTONDBLCLK()
//}}AFX_MSG
END_MESSAGE_MAP()
void CCustomTreeCtrl::OnLButtonDblClk(UINT nFlags, CPoint point)
{
char buf[255];
FILE *stream;
CString FilePath;
TV_HITTESTINFO pHitTestInfo;
pHittestInfo.pt = point;
pHitTestInfo.flags = TVHT_ONITEMLABEL;
HitTest(&pHitTestInfo);
if(pHitTestInfo.hItem != NULL && !ItemHasChildren(pHitTestInfo.hItem))
{
RECT sRect;
GetItemRect(pHitTestInfo.hItem,&sRect,1);
CRect cRect(&sRect);
if(cRect.PtInRect(point))
{
CTestApp* the = (CTestApp*)AfxGetApp();
CMainFrame* pFrame = (CMainFrame*)AfxGetMainWnd();
CString FileName = GetItemtext(pHitTestInfo.hItem);
FilePath = pFrame->Filepath;
strcpy(buf,Filepath);
strcat(buf,Filename);
strcat(buf,".ext");
stream = fopen(buf, "r");
if(stream != NULL)
{
fclose(stream);
if(pFrame->isUpToDate(buf))
{
pFrame->DisplayFile(buf);
}
else
{
pFrame->CreateFile(buf);//-> Error in this line Exception: C00000FD Stack OverFlow
pFrame->DisplayFile(buf);
}
}
}
}
}
The application exits on double click on a tree item.
When debugging, on stepping into the CreateFile function raises ans stack overflow exception.
If the file is up to date the DisplayFile function is executed properly.
The error is only when I call the CreateFile function. The function just writes some data to text file. On debugging, the exception is raised the minute I step into the function.
Call Trace (IDE VC6)
CCustomTreeCtrl::OnLButtonDblClick(unsigned int 1, Cpoint {x=150 y=104}) line 117
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
USER32! xxxxxxxx()
USER32! xxxxxxxx()
USER32! xxxxxxxx()
USER32! xxxxxxxx()
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
CTestApp::PreTranslateMessage(tagMSG * 0x00484cd0 {msg = 0x00000203 wp = 0x00000001 lp= 0x00680096}) line 563
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
MFC42D! xxxxxxxx()
WinMain(HINSTANCE__ 0x00400000, HINSTANCE__ 0x00000000, char * 0x00151f2e, int 1) Line 30
WinMainCRTStartup() Line 330 + 54 bytes
KERNEL32! xxxxxxxx()
CreateFunction
void CMainFrame::CreateFile(CString FileName)
{
BeginWaitCursor();
if(FileName.getAt(1) == 'L')
{
CMyDllA myDlla;
myDlla.ConvertDataToFile(MyDataBasePath,FileName);
}
else if(FileName.getAt(1) == 'B')
{
CMyDllB myDllb;
myDllb.ConvertDataToFile(MyDataBasePath,FileName);
}
else if(FileName.getAt(1) == 'D')
{
CMyDllD myDlld;
myDlld.ConvertDataToFile(MyDataBasePath,FileName);
}
else if(FileName.getAt(1) == 'S')
{
CMyDllS myDlls;
myDlls.ConvertDataToFile(MyDataBasePath,FileName);
}
EndWaitCursor();
}
//ConvertDataToFile Function reads data for a database performs calculations and writes report to a text file.
#Michael Walz Yes, the application crashes before hitting the breakpoint.
In Disassembly:
147 pFrame->CreateFile(buf);
00456B74 push ecx
00456B75 mov ecx,esp
00456B77 mov dword ptr [ebp-112A8h],esp
00456B7D lea edx, [buf]
00456B83 push edx
00456B84 call CString::CString (0045da26)
00456B89 mov dword ptr [ebp-11328h],eap
00456B77 mov ecx, dword ptr [pFrame]
00456B95 call #ILT+1300(CMainFrame::CreateFile) (00401519)
As soon as i step into Create File function, in the variable window its showing "this CXX0069: Error variable needs stack frame"
CreateFile Disassembly:
10040: {
0043F813 push ebp
0043F814 mov ebp, esp
0043F816 push 0FFh
0043F818 push offset $L111205 (00465099)
0043F81D mov eax, fs:[00000000]
0043F823 push eax
0043F824 mov dword ptr fs:[0],esp
0043F824 mov eax, 109DC8h
0043F830 call $$$00001 (0045e840) //Breaks here and jumps to CHKSTK.ASM File
0043F835 mov dword ptr [ebp-109DBCh], ecx
0043F83B mov dword ptr [ebp-4], 0
Currently I don't have the source of the CMyDllx. Only the header and lib.
But I am also calling the same function from the application menu, it get carried out without any error. Its only when I call it using tree control, I face the error.
#IInspectable Sorry but this is work PC, and I cannot install any software in this machine.
0043F824 mov eax, 109DC8h
0043F830 call $$$00001 (0045e840) //Breaks here and jumps to CHKSTK.ASM File
109DC8h tells the tale, the argument passed to __chkstk() is the amount of stack space that CreateFile() requires. 0x109dc8 == 1,088,968 bytes. No can do, that's more than the entire space available in the stack (1 megabyte). So __chkstk() correctly slams the emergency stop button before your program hits the wall, CreateFile() will always fail.
Your snippet squarely points the finger at the guilty party, it is one of the CMyDllx objects that is too big. Or more likely, all of them, needing a quarter of a megabyte each. Rewrite the code to allocate them on the free store with the new operator.

Is it possible to transfer thread execution to another thread?

I'm currently experimenting for possibilities transferring a thread execution to another newly created thread from current thread (I hope its a correct word); Here's the illustration:
Thread1 running
Thread1 stop in the middle of the code and create Thread2
Thread2 continue from the middle of the code where Thread1 stop
EDIT: Updated the example.
#include "stdafx.h"
#include <memory>
#include <windows.h>
#include <cassert>
int _eax, _ebx, _ecx, _edx;
int _ebp, _esp, _esi, _edi;
int _eip;
int _flags;
int _jmp_addr;
bool thread_setup = false;
CONTEXT PrevThreadCtx;
HANDLE thread_handle;
int _newt_esp;
int _newt_ret;
DWORD WINAPI RunTheThread(LPVOID lpParam)
{
// 1000 is more than enough, call to CreateThread() should already return by now.
Sleep(1000);
ResumeThread(thread_handle);
return 0;
}
DWORD WINAPI DummyPrologueEpilogue(LPVOID lpParam)
{
return 123;
}
__declspec(naked) void TransferThread(LPVOID lpParam)
{
//longjmp(jmpbuf, 0);=
__asm
{
call get_eip;
cmp[_newt_esp], 0;
mov[_newt_ret], eax;
jz setup_new_thread;
jmp DummyPrologueEpilogue;
get_eip:
mov eax, [esp];
ret;
setup_new_thread:
pushad;
mov[_newt_esp], esp;
mov eax, [_flags];
push eax;
popfd;
mov eax, [_eax];
mov ebx, [_ebx];
mov ecx, [_ecx];
mov edx, [_edx];
mov ebp, [_ebp];
mov esp, [_esp];
mov esi, [_esi];
mov edi, [_edi];
jmp [_eip];
}
}
int _tmain(int argc, _TCHAR* argv[])
{
int x = 100;
char szTest[256];
sprintf_s(szTest, "x = %d", x);
//HideThread();
//setjmp(jmpbuf);
__asm
{
// Save all the register
mov[_eax], eax;
mov[_ebx], ebx;
mov[_ecx], ecx;
mov[_edx], edx;
mov[_ebp], ebp;
mov[_esp], esp;
mov[_esi], esi;
mov[_edi], edi;
push eax;
// Save the flags
pushfd;
pop eax;
mov[_flags], eax;
// If we on *new thread* jmp to end_asm, otherwise continue...
call get_eip;
mov[_eip], eax;
mov al, byte ptr[thread_setup];
test al, al;
jnz end_asm;
mov eax, [jmp_self];
mov[_jmp_addr], eax;
pop eax;
mov[_newt_esp], 0;
mov byte ptr[thread_setup], 1;
push 0;
push CREATE_SUSPENDED;
push 0;
push TransferThread;
push 0;
push 0;
call CreateThread;
mov [thread_handle], eax;
// Create another thread just to resume 'TransferThread()'/*new thread* to give time to
// __stdcall below to return properly, thus restoring the stack.
// So the *new thread* does not accidentally pop the value from stacks or the __stdcall cleanup
// code doesn't accidentally overwrites new pushed value from *new thread*.
push 0;
push 0;
push 0;
push RunTheThread;
push 0;
push 0;
call CreateThread;
// Jump to self, consumes CPU
jmp_self:
jmp jmp_self;
nop;
nop;
jmp end_asm;
get_eip:
mov eax, [esp];
ret;
end_asm:
}
// Test stack-based variable
MessageBoxA(0, szTest, "Hello World!", MB_OK);
assert(x = 100);
x += GetCurrentThreadId();
sprintf_s(szTest, "x = %d", x);
HMODULE hMod = LoadLibrary(TEXT("comctl32"));
FreeLibrary(hMod);
try
{
std::unique_ptr<char[]> pTest(new char[256]);
sprintf_s(pTest.get(), 256, "WinApi call test. Previous loadLibrary() call return %X", hMod);
MessageBoxA(0, pTest.get(), "Hello World!", MB_OK);
} catch (...) {}
char *pszTest = (char*) malloc(256);
if (pszTest)
{
float f = 1.0;
f *= (float) GetCurrentThreadId();
sprintf_s(pszTest, 256, "Current Thread ID = %X, Thread handle = %X, FP Test = %f", GetCurrentThreadId(), GetCurrentThread(), f);
MessageBoxA(0, pszTest, "Hello World!", MB_OK);
free( pszTest );
}
// printf() from *new thread* will fail on stkchk()
//printf("Simple test\n");
// Let's terminate this *new* thread and continue the old thread
if (thread_setup)
{
DWORD OldProtect;
thread_setup = false;
VirtualProtect((PVOID)_jmp_addr, 2, PAGE_EXECUTE_READWRITE, &OldProtect);
*(int*)(_jmp_addr) = 0x90909090; // Prev thread not suspended. Just hope this op is atomic.
// Operation below will change the stack pointer
//VirtualProtect((PVOID)_jmp_addr, 2, OldProtect, &OldProtect);
//FlushInstructionCache(GetCurrentProcess(), (PVOID)_jmp_addr, 2);
__asm {
push eax;
mov eax, jmp_self2;
mov[_jmp_addr], eax;
pop eax;
jmp_self2:
jmp jmp_self2;
nop;
nop;
mov esp, [_newt_esp];
popad;
jmp _newt_ret;
}
}
else
{
DWORD OldProtect;
VirtualProtect((PVOID)_jmp_addr, 2, PAGE_EXECUTE_READWRITE, &OldProtect);
*(int*)(_jmp_addr) = 0x90909090; // Prev thread not suspended. Just hope this op is atomic.
}
// Show both thread can be exited cleanly... with some hacks.
DWORD dwStatus;
while (GetExitCodeThread(thread_handle, &dwStatus) && dwStatus == STILL_ACTIVE) Sleep(10);
printf("*New Thread* exited with status %d (Expected 123), Error=%X\n", dwStatus, GetLastError());
assert(dwStatus == 123);
printf("Test printf from original thread!\n");
printf("printf again!\n");
printf("and again!\n");
Sleep( 1000 );
return 0;
}
The code might be pain to read since it consists mostly asm. So I added a little comment to help. Now that I test, it is quite possible but with some problems. Calling few win api seems fine, but calling printf will certainly crash on stkchk() function (access denied). I will try alternative if there is any suggestion.
It won't be possible. (EDIT: It might be possible to switch successfully with OS APIs like GetThreadContext as JS1 mentionned, but others limitations still apply)
The thing is, the new thread needs the previous thread stack to run. You can do that by either using the old stack directly, or copying the old stack to the new stack. Neither of these are possible : you can't copy the stack because of stack-dependent pointers (frame pointers, for example), and you can't use the old stack, because the OS will detect that the thread went out of its stack, and throw a stack overflow or underflow.
It might be possible if the OS doesn't detect the stack misplacement. If that's the case, then you can load the old ESP and EBP to use the old stack (like you did). You have some problem with your current code (provided it can even work at all), because you push some registers AFTER you saved the stack pointer (ESP). When you reload ESP, it's like you never pushed anything. The ESP pointer really is a special case that need to be handled carefully. Note that you don't even need to care about the new stack in this case, it will just be ignored. That means you don't need any special naked declaration.
Another note, if you are able to do this, neither thread will be able to terminate if you don't restore the threads previous code flows. The old thread should not use the stack while the new is running, so it can't terminate, and the new can't terminate on the old stack. Each stack contains thread-dependent clean-up code at the bottom (or top, for top-down stack).
As an FYI, I have not tried the following, but it's possible that you might be able to get something to work like this with a naked function (AFAIK only Microsoft compilers):
https://msdn.microsoft.com/en-us/library/5ekezyy2.aspx
There are a significant number of limitations: https://msdn.microsoft.com/en-us/library/4d12973a.aspx but starting a thread with a naked function isn't listed as a limitation. A naked function would remove the prolog/epilog and allow you to try and transfer the context from the previous thread.
You can potentially also do this through an interpreter: basically save the interpreted state of the program and start on a separate thread.
As I can think of no actual use case, I'm not sure why you would ever want to do this.

How to get "cryptographically strong" random bytes with Windows APIs?

I need to get a small number of "cryptographically good" random bytes. (8 bytes in my case.) Are there any Windows APIs for that?
PS. It'd be nice if those APIs were backward compatible with Windows XP. But if not, it'd still work. Thanks.
I know that I originally asked about the Windows API, but since my original post I had some time to do the research. So I want to share my findings.
It turns out that since their Ivy Bridge chipset, Intel included a pretty cool hardware random number generator available via the RDRAND CPU instruction.
Since this is the question about Windows implementation and most of the Windows PCs run on the Intel chipsets, I decided to code a small class that (I can't believe that I'm saying it) seems to be generating true random numbers. Here's the description of how it works, and here's the analysis of the Intel's RNG.
I'm also assuming that this code is compiled for a 32-bit process (in case someone needs it for a 64-bit implementation, you'll have to adjust the asm parts.) It is also prudent to say that one should not assume that it will run on any Intel hardware. As I said above, it requires a relatively recent Intel's Ivy Bridge, or later chipset to run. (I tested it on the later Haswell system board.) The good news is that it takes almost no time to find out if the RDRAND instruction is supported, and if not, your most obvious route should be to use any of the OS provided APIs, described in other posts. (Also combining the results from both methods could also increase the entropy of your final result.)
So here's how I call the method to generate random numbers:
CHardwareRandomNumberGenerator h;
BYTE arr[4096] = {0};
UINT ncbSz = sizeof(arr);
int r = h.GetHardwareRandomBytes(arr, &ncbSz);
if(ncbSz != sizeof(arr)) //We'll need only the full array
{
//Use an alternate RNG method:
//- RtlGenRandom()
//or
//- CryptGenRandom()
}
_tprintf(L"RdRand result is %d\n", r);
if(ncbSz > 0)
{
_tprintf(L"Random Bytes (%d): ", ncbSz);
for(UINT i = 0; i < ncbSz; i++)
{
_tprintf(L"%02x", arr[i]);
}
_tprintf(L"\n");
}
This is the header file:
//This class uses the Intel RdRand CPU instruction for
//the random number generator that is compliant with security
//and cryptographic standards:
//
// http://en.wikipedia.org/wiki/RdRand
//
#pragma once
class CHardwareRandomNumberGenerator
{
public:
CHardwareRandomNumberGenerator(void);
~CHardwareRandomNumberGenerator(void);
int GetHardwareRandomBytes(BYTE* pOutRndVals = NULL, UINT* pncbInOutSzRndVals = NULL, DWORD dwmsMaxWait = 5 * 1000);
private:
BOOL bRdRandSupported;
static BOOL __is_cpuid_supported(void);
static BOOL __cpuid(int data[4], int nID);
int __fillHardwareRandomBytes(BYTE* pOutRndVals, UINT* pncbInOutSzRndVals, UINT& ncbOutSzWritten, DWORD dwmsMaxWait);
};
And the implementation file:
//This class uses the Intel RdRand CPU instruction for
//the random number generator that is compliant with security
//and cryptographic standards:
//
// http://en.wikipedia.org/wiki/RdRand
//
//[32-bit Intel-only implementation]
//
#include "HardwareRandomNumberGenerator.h"
CHardwareRandomNumberGenerator::CHardwareRandomNumberGenerator(void) :
bRdRandSupported(FALSE)
{
//Check that RdRand instruction is supported
if(__is_cpuid_supported())
{
//It must be Intel CPU
int name[4] = {0};
if(__cpuid(name, 0))
{
if(name[1] == 0x756e6547 && //uneG
name[2] == 0x6c65746e && //letn
name[3] == 0x49656e69) //Ieni
{
//Get flag itself
int data[4] = {0};
if(__cpuid(data, 1))
{
//Check bit 30 on the 2nd index (ECX register)
if(data[2] & (0x1 << 30))
{
//Supported!
bRdRandSupported = TRUE;
}
}
}
}
}
}
CHardwareRandomNumberGenerator::~CHardwareRandomNumberGenerator(void)
{
}
int CHardwareRandomNumberGenerator::GetHardwareRandomBytes(BYTE* pOutRndVals, UINT* pncbInOutSzRndVals, DWORD dwmsMaxWait)
{
//Generate random numbers into the 'pOutRndVals' buffer
//INFO: This function uses CPU/hardware to generate a set of
// random numbers that are cryptographically strong.
//INFO: For more details refer to:
// http://electronicdesign.com/learning-resources/understanding-intels-ivy-bridge-random-number-generator
//INFO: To review the "ANALYSIS OF INTEL’S IVY BRIDGE DIGITAL RANDOM NUMBER GENERATOR" check:
// http://www.cryptography.com/public/pdf/Intel_TRNG_Report_20120312.pdf
//'pOutRndVals' = if not NULL, points to the buffer that receives random bytes
//'pncbInOutSzRndVals' = if not NULL, on the input must contain the number of BYTEs to write into the 'pOutRndVals' buffer
// on the output will contain the number of BYTEs actually written into the 'pOutRndVals' buffer
//'dwmsMaxWait' = timeout for this method, expressed in milliseconds
//RETURN:
// = 1 if hardware random number generator is supported & the buffer in 'pOutRndVals' was successfully filled out with random numbers
// = 0 if hardware random number generator is supported, but timed out while filling out the buffer in 'pOutRndVals'
// INFO: Check 'pncbInOutSzRndVals', it will contain the number of BYTEs actually written into the 'pOutRndVals' array
// = -1 if general error
// = -2 if hardware random number generator is not supported on this hardware
// INFO: Requires Intel Ivy Bridge, or later chipset.
UINT ncbSzWritten = 0;
int nRes = __fillHardwareRandomBytes(pOutRndVals, pncbInOutSzRndVals, ncbSzWritten, dwmsMaxWait);
if(pncbInOutSzRndVals)
*pncbInOutSzRndVals = ncbSzWritten;
return nRes;
}
int CHardwareRandomNumberGenerator::__fillHardwareRandomBytes(BYTE* pOutRndVals, UINT* pncbInOutSzRndVals, UINT& ncbOutSzWritten, DWORD dwmsMaxWait)
{
//INTERNAL METHOD
ncbOutSzWritten = 0;
//Check support
if(!bRdRandSupported)
return -2;
__try
{
//We must have a buffer to fill out
if(pOutRndVals &&
pncbInOutSzRndVals &&
(int*)*pncbInOutSzRndVals > 0)
{
//Begin timing ticks in ms
DWORD dwmsIniTicks = ::GetTickCount();
UINT ncbSzRndVals = *pncbInOutSzRndVals;
//Fill in data array
for(UINT i = 0; i < ncbSzRndVals; i += sizeof(DWORD))
{
DWORD random_value;
int got_value;
int nFailureCount = 0;
//Since RdRand instruction may not have enough random numbers
//in its buffer, we may need to "loop" while waiting for it to
//generate more results...
//For the first 10 failures we'll simply loop around, after which we
//will wait for 1 ms per each failed iteration to save on the overall
//CPU cycles that this method may consume.
for(;; nFailureCount++ < 10 ? 1 : ::Sleep(1))
{
__asm
{
push eax
push edx
xor eax, eax
;RDRAND instruction = Set random value into EAX. Will set overflow [C] flag if success
_emit 0x0F
_emit 0xC7
_emit 0xF0
mov edx, 1
;Check if the value was available in the RNG buffer
jc lbl_set_it
;It wasn't available
xor edx, edx
xor eax, eax
lbl_set_it:
mov dword ptr [got_value], edx
mov dword ptr [random_value], eax
pop edx
pop eax
}
if(got_value)
{
//Got random value OK
break;
}
//Otherwise RdRand instruction failed to produce a random value
//See if we timed out?
if(::GetTickCount() - dwmsIniTicks > dwmsMaxWait)
{
//Timed out
return 0;
}
//Try again
}
//We now have a 4-byte, or DWORD, random value
//So let's put it into our array
if(i + sizeof(DWORD) <= ncbSzRndVals)
{
*(DWORD*)(pOutRndVals + i) = random_value;
ncbOutSzWritten += sizeof(DWORD);
}
else if(i + sizeof(WORD) + sizeof(BYTE) <= ncbSzRndVals)
{
*(WORD*)(pOutRndVals + i) = (WORD)random_value;
*(BYTE*)(pOutRndVals + i + sizeof(WORD)) = (BYTE)(random_value >> 16);
ncbOutSzWritten += sizeof(WORD) + sizeof(BYTE);
}
else if(i + sizeof(WORD) <= ncbSzRndVals)
{
*(WORD*)(pOutRndVals + i) = (WORD)random_value;
ncbOutSzWritten += sizeof(WORD);
}
else if(i + sizeof(BYTE) <= ncbSzRndVals)
{
*(BYTE*)(pOutRndVals + i) = (BYTE)random_value;
ncbOutSzWritten += sizeof(BYTE);
}
else
{
//Shouldn't even be here
ASSERT(NULL);
return -1;
}
}
}
}
__except(1)
{
//A generic catch-all just to be sure...
return -1;
}
return 1;
}
BOOL CHardwareRandomNumberGenerator::__is_cpuid_supported(void)
{
//See if CPUID command is supported
//INFO: Some really old CPUs may not support it!
//RETURN: = TRUE if yes, and __cpuid() can be called
BOOL bSupported;
DWORD nEFlags = 0;
__try
{
#define FLAG_VALUE (0x1 << 21)
_asm
{
//remember EFLAGS & EAX
pushfd
push eax
//Set bit 21 in EFLAGS
pushfd
pop eax
or eax, FLAG_VALUE
push eax
popfd
//Check if bit 21 in EFLAGS was set
pushfd
pop eax
mov nEFlags, eax
//Restore EFLAGS & EAX
pop eax
popfd
}
bSupported = (nEFlags & FLAG_VALUE) ? TRUE : FALSE;
}
__except(1)
{
//A generic catch-all just to be sure...
bSupported = FALSE;
}
return bSupported;
}
BOOL CHardwareRandomNumberGenerator::__cpuid(int data[4], int nID)
{
//INFO: Call __is_cpuid_supported() first to see if this function is supported
//RETURN:
// = TRUE if success, check 'data' for results
BOOL bRes = TRUE;
__try
{
_asm
{
push eax
push ebx
push ecx
push edx
push esi
//Call CPUID
mov eax, nID
_emit 0x0f ;CPUID
_emit 0xa2
//Save 4 registers
mov esi, data
mov dword ptr [esi], eax
mov dword ptr [esi + 4], ebx
mov dword ptr [esi + 8], ecx
mov dword ptr [esi + 12], edx
pop esi
pop edx
pop ecx
pop ebx
pop eax
}
}
__except(1)
{
//A generic catch-all just to be sure...
bRes = FALSE;
}
return bRes;
}
So idk, guys, I haven't done any extensive cryptographic analysis of the data produced by the method above ... so you'll be the judge. Any updates are welcome!
Here's a little bit of code that produces a sequence of "cryptographically strong" bytes, using the Microsoft Cryptography API... I've used this myself, as aside from anything else it's a nice way to just get a decent random sequence of numbers... I wasn't using it for cryptography:
#include <wincrypt.h>
class RandomSequence
{
HCRYPTPROV hProvider;
public:
RandomSequence(void) : hProvider(NULL) {
if (FALSE == CryptAcquireContext(&hProvider, NULL, NULL, PROV_RSA_FULL, 0)) {
// failed, should we try to create a default provider?
if (NTE_BAD_KEYSET == GetLastError()) {
if (FALSE == CryptAcquireContext(&hProvider, NULL, NULL, PROV_RSA_FULL, CRYPT_NEWKEYSET)) {
// ensure the provider is NULL so we could use a backup plan
hProvider = NULL;
}
}
}
}
~RandomSequence(void) {
if (NULL != hProvider) {
CryptReleaseContext(hProvider, 0U);
}
}
BOOL generate(BYTE* buf, DWORD len) {
if (NULL != hProvider) {
return CryptGenRandom(hProvider, len, buf);
}
return FALSE;
}
};
It's a simple little class that tries to get an RSA Crytographic "provider", and if that fails it tries to create one. Then if all is well, generate will fill your buffer with love. Uhm... I mean random bytes.
This has worked for me on XP, Win7 and Win8, tho' I've not actually used it for cryptography, I just needed a decent sequence of random-ish bytes.
#include <stdexcept>
#include <string>
#include <sstream>
#ifndef __linux__
// For Windows
// Also Works with: MinGW Compiler
#include <windows.h>
#include <wincrypt.h> /* CryptAcquireContext, CryptGenRandom */
int RandBytes(void* const byte_buf, const size_t byte_len) {
HCRYPTPROV p;
ULONG i;
if (CryptAcquireContext(&p, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT) == FALSE) {
throw runtime_error{"RandBtyes(): CryptAcquireContext failed."};
}
if (CryptGenRandom(p, byte_len, (BYTE*)byte_buf) == FALSE) {
throw runtime_error{"RandBytes(): CryptGenRandom failed."};
}
CryptReleaseContext(p, 0);
return 0;
}
#endif // Not Linux
#if __linux__
#include <fctl.h>
int RandBytes(void* const byte_buf, const size_t byte_len) {
// NOTE: /dev/random is supposately cryptographically safe
int fd = open("/dev/urandom", O_RDONLY);
if (fd < 0) {
throw runtime_error{"RandBytes(): failed to open"};
}
int rd_len = 0;
while(rd_len < byte_len) {
int n = read(fd, byte_buf, byte_len);
if (n < 0){
stringstream ss;
ss << "RandBytes(): failed (n=" << n << ") " << "(rd_len=" << rd_len << ")";
throw runtime_error{ss.str()};
}
rd_len += n;
}
close(fd);
return 0;
}
#endif
Not sure how portable this is, probably just BSD/Mac; but here's arc4random_buf:
void arc4random_buf(void *buf, size_t nbytes);
MacOS man page says:
These functions use a cryptographic pseudo-random number generator to generate high quality random bytes very quickly.

Is my stdcall wrapper for usercall function correct?

I need to wrap the below __usercall function to _cdecl/_stdcall:
char __usercall sub_4017B0<al>(int a1<ebx>, int a2)
a1 is integer,
a2 is actually an arry of ints ('int args[10]')
Is this correct? What does the <al> mean behind sub_4017B0 ?
int __stdcall func_hook_payload(int callnum, int* args);
// Wrapper for
// char __usercall sub_4017B0<al>(int callnum<ebx>, int a2)
__declspec(naked) void func_hook()
{__asm{
push ebp
mov ebp, esp
push dword ptr[ebp + 0x28] // args[9]
push dword ptr[ebp + 0x24] // args[8]
push dword ptr[ebp + 0x20] // args[7]
push dword ptr[ebp + 0x1C] // args[6]
push dword ptr[ebp + 0x18] // args[5]
push dword ptr[ebp + 0x14] // args[4]
push dword ptr[ebp + 0x10] // args[3]
push dword ptr[ebp + 0x0C] // args[2]
push dword ptr[ebp + 0x08] // args[1]
push dword ptr[ebp + 0x04] // args[0]
push ebx // callnum
call func_hook_payload
leave
ret // note: __usercall is cdecl-like
}}
How would a wrapper look like for calling sub_4017B0 ?
The wrapper should have this signature:
int sub_4017B0_wrapper(int callnum, int* args);
does the function take an actual int* or does it take va_args? in cases like this you need to provide the original calling code to.
from what I can gather, your wrapper should look like this(I don't use stack frames, but your frame is wrong as you don't pop ebp before returning):
__declspec(naked) void func_hook()
{
__asm
{
push dword [esp + 4] //int* - pArgs
push ebx //int - nArgs
call func_hook_payload //you can even just jump to this, the stack should clean itself up correctly
retn
}
}
should it be va_args you can do something like this:
__declspec(naked) void func_hook()
{
__asm
{
lea eax,[esp + 4] //int* - &nArg[0]: here we abuse the way the windows stack grows, creating a stack based buffer
push eax //int* - pArgs
push ebx //int - nArgs
call func_hook_payload
retn
}
}
Calling the old func is pretty simple too, you can do it without a nake function, but really I prefer naked funcs :)
void __declspec(naked) __stdcall CallTheOldVMFunc(int nArgs, int* pArgs)
{
__asm
{
push ebx //save ebx, its not a scratch register
mov ebx,[esp + 8] //set the number of args
push [esp + 12] //push the arg ptr
call TheOldVMFunc
pop ebx //restore ebx
retn 8 //ret and cleanup
}
}