Calling API from a x86 DLL loaded into a x64 application - c++

This is an experiment to exploit the availability of a 32-bit "compatibility" mode selector in GDT in Windows x64 which has the value of 0x23.
I 've successfully loaded a x86 DLL into the address space of a x64 application. This is done by first taking all imports of the x86 dll and saving it into a XML, then using a modified MemoryModule to load the DLL from memory and patch the Import Address Table with those pointers.
Then, in the x64 host I jump to a compatibility code segment:
push 0x23
xor rcx,rcx
mov ecx,Start32
push rcx
retf
Start32 is a 32-bit entry point that would call a configured function:
USE32
push eax
push dword [WhatToPass]
mov eax, [WhatToCall]
call eax
pop eax
Then go back to 64-bit
USE64
db 0eah
ret_64:
dd 0
dw 0x33
nop
This works, as long as the configured exp1 function does practically nothing. The concept is fully discussed in my CodeProject article and the source is on GitHub.
It doesn't work when run in Visual Studio, but it does work when run in WinDbg; WinDbg shows a 'x86' running mode when switching.
Now the ambitious project is to call the API from the x86 module. This fails even with a simple MessageBeep(0);. I don't of course expect it at this point to go fully to the function but it doesn't even enter; I get an invalid execution instruction on the call. The address is valid.
What could be wrong? Yes this is too much perhaps but I'm still curious.
Patching the IAT works correctly. Sample code:
void PatchIAT(HINSTANCE h,std::vector<FANDP>* CustomLoading = 0)
{
PCHAR codeBase = (PCHAR)h;
XML3::XML x(xResult);
PIMAGE_NT_HEADERS32 ntheaders = (PIMAGE_NT_HEADERS32)(PCHAR(h) + PIMAGE_DOS_HEADER(h)->e_lfanew);
PIMAGE_SECTION_HEADER pSech = IMAGE_FIRST_SECTION(ntheaders);//Pointer to first section header
DWORD ulsize = 0;
PIMAGE_IMPORT_DESCRIPTOR importDesc = (PIMAGE_IMPORT_DESCRIPTOR)ImageDirectoryEntryToData(h, TRUE, IMAGE_DIRECTORY_ENTRY_IMPORT, &ulsize);
if (!importDesc)
return;
for (; importDesc && importDesc->Name; importDesc++) {
PSTR pszModName = (PSTR)((PBYTE)h + importDesc->Name);
if (!pszModName)
break;
XML3::XMLElement* module = ... ; // find the XML entry
if (!module)
continue;
DWORD* thunkRef;
DWORD* funcRef = 0;
if (importDesc->OriginalFirstThunk) {
thunkRef = (DWORD*)(codeBase + importDesc->OriginalFirstThunk);
funcRef = (DWORD*)(codeBase + importDesc->FirstThunk);
}
else {
// no hint table
thunkRef = (DWORD*)(codeBase + importDesc->FirstThunk);
funcRef = (DWORD *)(codeBase + importDesc->FirstThunk);
}
DWORD V = 0;
for (; *thunkRef; thunkRef++, funcRef++) {
DWORD* wr = (DWORD*)funcRef;
if (IMAGE_SNAP_BY_ORDINAL(*thunkRef)) {
// *funcRef = module->getProcAddress(handle, (LPCSTR)IMAGE_ORDINAL(*thunkRef), module->userdata);
const char* fe = (LPCSTR)IMAGE_ORDINAL(*thunkRef);
// find V from xml
...
}
else {
PIMAGE_IMPORT_BY_NAME thunkData = (PIMAGE_IMPORT_BY_NAME)(codeBase + (*thunkRef));
// *wr = module->getProcAddress(handle, (LPCSTR)&thunkData->Name, module->userdata);
... // Find V from XML
}
// Patch it now...
DWORD dwOldProtect = 0;
if (VirtualProtect((LPVOID)wr, 4, PAGE_READWRITE, &dwOldProtect))
{
memcpy((void*)wr, &V, 4);
VirtualProtect((LPVOID)wr, 4, dwOldProtect, &dwOldProtect);
}
VirtualProtect((LPVOID)V, 4, PAGE_EXECUTE_READWRITE, &dwOldProtect);
}
}
}

Related

Loading a 64-bit Windows PE file from memory

I wanted to load a Windows PE file (a.k.a. EXE) from a memory buffer in C++ so I found this code. Using a simple hello world example, it works fine. However, when loading a more sophisticated EXE with static dependencies the code crashes instead of loading the PE file successfully. My updated PE loader code is the following:
#include <Windows.h>
#include <stdexcept>
inline auto fix_image_iat(PIMAGE_DOS_HEADER dos_header, PIMAGE_NT_HEADERS nt_header)
{
auto import_table = reinterpret_cast<PIMAGE_IMPORT_DESCRIPTOR>(nt_header->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_IMPORT].
VirtualAddress + reinterpret_cast<UINT_PTR>(dos_header));
const DWORD iat_loc = nt_header->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_IAT].VirtualAddress ?
IMAGE_DIRECTORY_ENTRY_IAT : IMAGE_DIRECTORY_ENTRY_IMPORT;
const DWORD iat_rva = nt_header->OptionalHeader.DataDirectory[iat_loc].VirtualAddress;
const SIZE_T iat_size = nt_header->OptionalHeader.DataDirectory[iat_loc].Size;
const auto iat = reinterpret_cast<LPVOID>(iat_rva + reinterpret_cast<UINT_PTR>(dos_header));
DWORD op;
VirtualProtect(iat, iat_size, PAGE_READWRITE, &op);
PIMAGE_THUNK_DATA thunk;
while (import_table->Name)
{
const auto import_base = LoadLibraryA(
reinterpret_cast<LPCSTR>(import_table->Name + reinterpret_cast<UINT_PTR>(dos_header)));
auto fix_up = reinterpret_cast<PIMAGE_THUNK_DATA>(import_table->FirstThunk + reinterpret_cast<UINT_PTR>(
dos_header));
if (import_table->OriginalFirstThunk)
{
thunk = reinterpret_cast<PIMAGE_THUNK_DATA>(import_table->OriginalFirstThunk + reinterpret_cast<UINT_PTR>(dos_header));
}
else
{
thunk = reinterpret_cast<PIMAGE_THUNK_DATA>(import_table->FirstThunk + reinterpret_cast<UINT_PTR>(dos_header));
}
while (thunk->u1.Function)
{
if (thunk->u1.Ordinal & IMAGE_ORDINAL_FLAG64)
{
fix_up->u1.Function =
reinterpret_cast<UINT_PTR>(GetProcAddress(import_base, reinterpret_cast<LPCSTR>(thunk->u1.Ordinal & 0xFFFF)));
}
else
{
const PCHAR func_name = reinterpret_cast<PIMAGE_IMPORT_BY_NAME>(thunk->u1.AddressOfData)->Name + reinterpret_cast<
UINT_PTR>(dos_header);
fix_up->u1.Function = reinterpret_cast<UINT_PTR>(GetProcAddress(import_base, func_name));
}
fix_up++;
thunk++;
}
import_table++;
}
}
inline auto map_image_to_memory(const char* pe_buffer)
{
auto raw_image_base = PIMAGE_DOS_HEADER(pe_buffer);
if (IMAGE_DOS_SIGNATURE != raw_image_base->e_magic)
{
throw std::runtime_error(("Invalid DOS signature"));
}
auto nt_header = reinterpret_cast<PIMAGE_NT_HEADERS>(raw_image_base->e_lfanew + reinterpret_cast<UINT_PTR>(raw_image_base));
if (IMAGE_NT_SIGNATURE != nt_header->Signature)
{
throw std::runtime_error(("Invalid NT header"));
}
if (IMAGE_FILE_MACHINE_AMD64 != nt_header->FileHeader.Machine)
{
throw std::runtime_error(("Not a 64-bit module"));
}
if (nt_header->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_COM_DESCRIPTOR].VirtualAddress)
{
throw std::runtime_error((".NET is not supported"));
}
auto section_header = reinterpret_cast<PIMAGE_SECTION_HEADER>(raw_image_base->e_lfanew + sizeof * nt_header
+ reinterpret_cast<UINT_PTR>(raw_image_base));
auto mem_image_base = VirtualAlloc(reinterpret_cast<LPVOID>(nt_header->OptionalHeader.ImageBase),
nt_header->OptionalHeader.SizeOfImage, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
if (mem_image_base == nullptr)
{
throw std::runtime_error(("VirtualAlloc() failed"));
}
memcpy(mem_image_base, raw_image_base, nt_header->OptionalHeader.SizeOfHeaders);
for (WORD section_index = 0; section_index < nt_header->FileHeader.NumberOfSections; section_index++)
{
memcpy(reinterpret_cast<LPVOID>(section_header->VirtualAddress + reinterpret_cast<UINT_PTR>(mem_image_base)),
reinterpret_cast<LPVOID>(section_header->PointerToRawData + reinterpret_cast<UINT_PTR>(raw_image_base)),
section_header->SizeOfRawData);
section_header++;
}
return static_cast<PIMAGE_DOS_HEADER>(mem_image_base);
}
//works with manually mapped files
HANDLE GetImageActCtx(HMODULE module)
{
WCHAR temp_path[MAX_PATH];
WCHAR temp_filename[MAX_PATH];
for (int i = 1; i <= 3; i++) {
HRSRC resource_info = FindResource(module, MAKEINTRESOURCE(i), RT_MANIFEST);
if (resource_info) {
HGLOBAL resource = LoadResource(module, resource_info);
DWORD resource_size = SizeofResource(module, resource_info);
const PBYTE resource_data = (const PBYTE)LockResource(resource);
if (resource_data && resource_size) {
FILE* fp;
errno_t err;
DWORD ret_val = GetTempPath(MAX_PATH, temp_path);
if (0 == GetTempFileName(temp_path, L"manifest.tmp", 0, temp_filename))
return NULL;
err = _wfopen_s(&fp, temp_filename, L"w");
if (errno)
return NULL;
fprintf(fp, (const char *)resource_data);
fclose(fp);
break;
}
else {
return NULL;
}
}
}
ACTCTXW act = { sizeof(act) };
act.lpSource = temp_filename;
return CreateActCtx(&act);
}
BOOL FixImageRelocations(PIMAGE_DOS_HEADER dos_header, PIMAGE_NT_HEADERS nt_header, ULONG_PTR delta)
{
ULONG_PTR size;
PULONG_PTR intruction;
PIMAGE_BASE_RELOCATION reloc_block =
(PIMAGE_BASE_RELOCATION)(nt_header->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_BASERELOC].VirtualAddress +
(UINT_PTR)dos_header);
while (reloc_block->VirtualAddress) {
size = (reloc_block->SizeOfBlock - sizeof(reloc_block)) / sizeof(WORD);
PWORD fixup = (PWORD)((ULONG_PTR)reloc_block + sizeof(reloc_block));
for (int i = 0; i < size; i++, fixup++) {
if (IMAGE_REL_BASED_DIR64 == *fixup >> 12) {
intruction = (PULONG_PTR)(reloc_block->VirtualAddress + (ULONG_PTR)dos_header + (*fixup & 0xfff));
*intruction += delta;
}
}
reloc_block = (PIMAGE_BASE_RELOCATION)(reloc_block->SizeOfBlock + (ULONG_PTR)reloc_block);
}
return TRUE;
}
void load_portable_executable(const char* pe_buffer)
{
const auto dos_header = map_image_to_memory(pe_buffer);
auto nt_header = reinterpret_cast<PIMAGE_NT_HEADERS>(dos_header->e_lfanew + reinterpret_cast<UINT_PTR>(dos_header));
HANDLE actctx = NULL;
UINT_PTR cookie = 0;
BOOL changed_ctx = FALSE;
if (nt_header->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_RESOURCE].VirtualAddress) {
actctx = GetImageActCtx(reinterpret_cast<HMODULE>(dos_header));
if (actctx)
changed_ctx = ActivateActCtx(actctx, &cookie);
}
fix_image_iat(dos_header, nt_header);
if (nt_header->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_BASERELOC].VirtualAddress) {
ptrdiff_t delta = (ptrdiff_t)((PBYTE)dos_header - (PBYTE)nt_header->OptionalHeader.ImageBase);
if (delta)
FixImageRelocations(dos_header, nt_header, delta);
}
// Calculate the absolute entry point address
const auto entry_point_address = reinterpret_cast<LPVOID>(nt_header->OptionalHeader.AddressOfEntryPoint + reinterpret_cast<UINT_PTR>(dos_header));
// Launch the PE file
static_cast<void(*)()>(entry_point_address)(); // TODO Crashes here
}
Note that C++20 is required to successfully compile.
Any idea why it crashes at the last line of code when passing control to the entry point address? Is there any library or clean reference implementation for PE loading (on Windows)?
I am doing a similar thing, loading an exe file (with LoadLibraryEx flag LOAD_IGNORE_CODE_AUTHZ_LEVEL) into the address space of the loading application. In this way I can observe all the activities of the program from its inception.
The steps are:
LoadLibrary, and if successful then
Patch the IAT
Get the address of entry-point of the loaded program
CreateThread, CREATE_SUSPENDED with the thread-start-address being the address of entry-point.
ResumeThread and off she goes.
So far there are 2 types of files that can be loaded, those with relocation table and those without. The ones with the relocation table are the modern ones compiled with /DynamicBase and the older ones do not have this information.
I compiled the loader to have fixed-base address way above the default 0x140000000 for x64 to ensure that there is ample space below it for loaded exes that have no relocation and have to be loaded at the default address.
The OS loader applies relocation fixups but does not fix import addresses which we have to do manually.
Any dll we load to fix the imports need no further cooking as the OS loader takes care of that.
Now just like the experience BullyWiiPlaza had I found some programs can be loaded and run perfectly yet many others crash on startup. And the annoying feature that when you exit your loaded program it tends to take with it the loading application on its way out.
To fix this I intercepted the CRT functions abort, _cexit, _c_exit and the kernel ExitProcess
and just the sheer virtue of using these functions enables the loaded program to go out alone but I use the opportunity to UnregisterClass in case I have to re-load the previous not so well written app.
Remember the RegisterClass function can fail with an error message ERROR_CLASS_ALREADY_EXISTS which isn't a sufficient reason to abort the application.
Another important function to intercept is GetModuleHandle because when the loaded app uses this function as GetModuleHandle(0) to query its base address then without intercepting it and returning its hModule the OS will give it the Loader's base address which will cause a crash.
What I find confusing is that some lost versions of my applications could load apps that have no relocation info and run successfully but after ceaseless tweaking I could no longer run them.
The problem appears to be write access violation.
So I checked the protection parameters given to VirtualProtect and found from this following code that the iat_size is small and does not include the thunk I wish modify.
DWORD iat_loc = (nt_header->OptionalHeader.DataDirectory[IMAGE_DIRECTORY_ENTRY_IAT].VirtualAddress)
? IMAGE_DIRECTORY_ENTRY_IAT : IMAGE_DIRECTORY_ENTRY_IMPORT;
UINT_PTR iat_rva = nt_header->OptionalHeader.DataDirectory[iat_loc].VirtualAddress;
SIZE_T iat_size = nt_header->OptionalHeader.DataDirectory[iat_loc].Size;
LPVOID iat = (LPVOID)(iat_rva + (UINT_PTR)dos_header);
DWORD op;
VirtualProtect(iat, iat_size, PAGE_READWRITE, &op);
If anyone wants to recreate the problem then the specimen I used was Celemony Melodyne 5.
It does not have a relocation table and loads at address 0x140000000.
What is confusing is that other x64 applications such as DarkWave Studio run normally except it takes out the loader when it exits.
And by the way Microsoft has made a fine mess of the pragmas data_seg and data_section with respect to x64 altho they claim it works in the documentation.

Converting "application's" memory address

So I am writing my very first trainer for Microsoft's Spider Solitaire. First I needed to backwards-engineer all memory adresses until I found a static one. I used offsets so I can easily revert them back.
I've found this:
1000157F78 <-- starting value(never changes)
+ E8 <-- offsets to pointers
+ 14
002DC3D4 <-- final adress(changes every time)
This is how my trainer gets his final memory address:
DWORD FindFinalAddr(HANDLE hProc, BYTE offsets[], DWORD baseAddress, unsigned char pointerLevel)
{
DWORD pointer = baseAddress;
DWORD pTemp = 0;
DWORD pointerAddr = 0;
// set base address
ReadProcessMemory(hProc, (LPCVOID)pointer, &pTemp, (DWORD)sizeof(pTemp), NULL);
for (int c = 0; c < pointerLevel; c++)
{
pointerAddr = pTemp + (DWORD)offsets[c];
ReadProcessMemory(hProc, (LPCVOID)pointerAddr, &pTemp, (DWORD)sizeof(pTemp), NULL);
}
return pointerAddr;
}
In this case, I do(roughly) this: FindFinalAddr(hProc, {0xE8, 0x14}, 0x1000157F78, 2);
This works fine when Spider Solitaire is open and I have just found the static value.
But when I close it and re-open it's no longer valid.
I found out that 1000157F78 is actually SpiderSolitaire.exe+B5F78 It's like a offset. If I enter this in cheat engine I get the right memory address, but I can't just simply enter it in my code.
Now is my question: How do I convert SpiderSolitaire.exe+B5F78 to the right memory adress?
Note: SpiderSolitaire.exe is 64 bit.
EDIT:
I've tried the following:
void * entryPoint = (void*) hProc;
DWORD base_addr = ((DWORD)(entryPoint) + 0xB5F78);
But that doesn't work, because the entry point is 5C. The adress it should give(in this session) is FF7A5F78, but what really happens is 5C + B5F78 = B5F4D.
I think you can query the load address using GetModuleInformation, passing NULL for the module handle parameter. If that doesn't work, you can take the longer route through EnumProcessModules and GetModuleBaseName.
After a long period of research I've found my own answer!
This piece of code gets the module base address(AKA entry point)(you need to include TlHelp32.h and tchar.h):
DWORD getModuleBaseAddr(DWORD procId, TCHAR * lpszModuleName)
{
HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPMODULE, procId);
DWORD moduleBaseAddr = 0;
if (hSnapshot != INVALID_HANDLE_VALUE)
{
MODULEENTRY32 mentry32 = { 0 };
mentry32.dwSize = sizeof(MODULEENTRY32);
if (Module32First(hSnapshot, &mentry32))
{
do
{
if (_tcscmp(mentry32.szModule, lpszModuleName) == 0)
{
moduleBaseAddr = (DWORD)mentry32.modBaseAddr;
break;
}
} while (Module32Next(hSnapshot, &mentry32));
}
}
else
{
std::cout << "Error on finding module base address: " << GetLastError() << "\n";
}
return moduleBaseAddr;
}
You give it the pid and the name of the module(like game.exe), then it browses through modules and check if they are the same, and then it returns the base address.
Now, I tested this with Spider Solitaire. It gave me an error.
That is because my compiled code was 32 bit and SpiderSolitaire.exe was 64 bit, which was caused because my Windows 7 was 64 bit.
So make sure your code has the same platform target as the code you're aiming for!

Convert assembly to machine code in C++

I seek for any lib or function to convert a string of assembly code to machine code,
like the following:
char asmString[] = {"mov eax,13H"};
byte[] output; // array of byte
output = asm2mach(asmString); // {0xB8, 0x13, 0x00, 0x00, 0x00}
The motivation is to inject machine code to call asm function in the program. This injection mainly has 3 steps: VirtualAllocEx, WriteProcessMemory and CreateRemoteThread. Here are the code:
bool injectAsm(const char* exeName,const byte* code, int size)
{
LPVOID allocAddr = NULL;
HANDLE ThreadProcess = NULL;
HANDLE hProcess = OpenProcessEasy(exeName);
allocAddr = VirtualAllocEx(hProcess, NULL, size, MEM_COMMIT, PAGE_READWRITE);
if(allocAddr){
if(WriteProcessMemory(hProcess, allocAddr, code, size, NULL)) {
ThreadProcess = CreateRemoteThread(hProcess, NULL, 0, (LPTHREAD_START_ROUTINE)allocAddr, NULL, 0, NULL);
WaitForSingleObject(ThreadProcess, INFINITE);
VirtualFreeEx(hProcess,allocAddr, 0, MEM_RELEASE);
CloseHandle(ThreadProcess);
return true;
}
}
if(allocAddr){
VirtualFreeEx(hProcess, allocAddr, 0, MEM_RELEASE);
}
return false;
}
int main()
{
byte code[] = {0xB8, 0x10, 0xED, 0x4A, 0x00, 0xFF, 0xD0, 0xC3, 0x90};
injectAsm("game.exe",code,sizeof(code));
system("pause");
return 0;
}
I would recommend using both AsmJit and AsmTK. AsmTK is a new project that uses AsmJit as a toolkit and adds additional functionality on top of it. NOTE that at the moment AsmTK requies asmjit:next branch to work, as this is a new functionality.
This is a minimal example that uses AsmTK to parse some asm:
#include <stdio.h>
#include <stdlib.h>
#include <asmtk/asmtk.h>
using namespace asmjit;
using namespace asmtk;
static const char someAsm[] =
"test eax, eax\n"
"jz L1\n"
"mov eax, ebx\n"
"mov eax, 0xFFFFFFFF\n"
"pand mm0, mm1\n"
"paddw xmm0, xmm1\n"
"vpaddw ymm0, ymm1, ymm7\n"
"vaddpd zmm0 {k1}{z}, zmm1, [rax] {1tox}\n"
"L1:\n";
int main(int argc, char* argv[]) {
CodeHolder code;
// Here select the target architecture - either X86 or X64.
code.init(CodeInfo(Arch::kTypeX64));
X86Assembler a(&code);
AsmParser p(&a);
Error err = p.parse(someAsm);
if (err) {
printf("ERROR: %0.8x (%s)\n", err, DebugUtils::errorAsString(err));
return 1;
}
// The machine-code is now stored in CodeHolder's first section:
code.sync();
CodeBuffer& buffer = code.getSection(0)->buffer;
// You can do whatever you need with the buffer:
uint8_t* data = buffer.data;
size_t length = buffer.length;
return 0;
}
AsmJit uses JitRuntime to allocate executable memory and to relocate the resulting machine code into it. However, AsmJit's virtual memory allocator can be created with hProcess, which could be your remote process handle, so it can also allocate memory of that process. Here is a small example of how it could be done:
bool addToRemote(HPROCESS hRemoteProcess, CodeHolder& code) {
// VMemMgr is AsmJit's low-level VM manager.
VMemMgr vm(hRemoteProcess);
// This will tell `vm` to not destroy allocated blocks when it
// gets destroyed.
vm.setKeepVirtualMemory(true);
// Okay, suppose we have the CodeHolder from previous example.
size_t codeSize = code.getCodeSize();
// Allocate a permanent memory of `hRemoteProcess`.
uint64_t remoteAddr = (uint64_t)
vm.alloc(codeSize, VMemMgr::kAllocPermanent);
// Temporary buffer for relocation.
uint8_t* tmp = ::malloc(code.getCodeSize());
if (!tmp) return false;
// First argument is where to relocate the code (it must be
// current's process memory), second argument is the base
// address of the relocated code - it's the remote process's
// memory. We need `tmp` as it will temporarily hold code
// that we want to write to the remote process.
code.relocate(tmp, remoteAddr);
// Now write to the remote process.
SIZE_T bytesWritten;
BOOL ok = WriteProcessMemory(
hRemoteProcess, (LPVOID)remoteMem, tmp, codeSize, &bytesWritten);
// Release temporary resources.
::free(tmp);
// Now the only thing needed is the CreateRemoteThread thingy...
return ok;
}
I would recommend wrapping such functionality into a new RemoteRuntime to make it a matter of a single function call, I can help with that if needed.
You should define what you really want:
Do you want to generate machine code at runtime? Then use some JIT compilation library like libgccjit, libjit, LLVM, GNU lightning, or asmjit. asmjit is a library emitting x86 machine code, probably what you need. There is no absolute need to use a string containing assembler code.
Or do you want to translate some assembler syntax (and there are several assembler syntaxes even for x86) to object code or machine code? Then you'll better run a real assembler as an external program. The produced object code will contain relocation directives, and you'll need something to handle these (e.g. a linker).
Alternative, you should consider generating some (e.g.) C code at runtime, then forking a compilation, and dynamically loading and using the resulting function at runtime (e.g. with dlopen(3) and dlsym). See this
Details are obviously operating system, ABI, and processor specific.
Here is a project that can convert a string of assembly code (Intel or ARM) into its corresponding bytes.
https://github.com/bsmt/Assembler
It's written in Objective-C, but the source is there. I hope this helps.

How to get "cryptographically strong" random bytes with Windows APIs?

I need to get a small number of "cryptographically good" random bytes. (8 bytes in my case.) Are there any Windows APIs for that?
PS. It'd be nice if those APIs were backward compatible with Windows XP. But if not, it'd still work. Thanks.
I know that I originally asked about the Windows API, but since my original post I had some time to do the research. So I want to share my findings.
It turns out that since their Ivy Bridge chipset, Intel included a pretty cool hardware random number generator available via the RDRAND CPU instruction.
Since this is the question about Windows implementation and most of the Windows PCs run on the Intel chipsets, I decided to code a small class that (I can't believe that I'm saying it) seems to be generating true random numbers. Here's the description of how it works, and here's the analysis of the Intel's RNG.
I'm also assuming that this code is compiled for a 32-bit process (in case someone needs it for a 64-bit implementation, you'll have to adjust the asm parts.) It is also prudent to say that one should not assume that it will run on any Intel hardware. As I said above, it requires a relatively recent Intel's Ivy Bridge, or later chipset to run. (I tested it on the later Haswell system board.) The good news is that it takes almost no time to find out if the RDRAND instruction is supported, and if not, your most obvious route should be to use any of the OS provided APIs, described in other posts. (Also combining the results from both methods could also increase the entropy of your final result.)
So here's how I call the method to generate random numbers:
CHardwareRandomNumberGenerator h;
BYTE arr[4096] = {0};
UINT ncbSz = sizeof(arr);
int r = h.GetHardwareRandomBytes(arr, &ncbSz);
if(ncbSz != sizeof(arr)) //We'll need only the full array
{
//Use an alternate RNG method:
//- RtlGenRandom()
//or
//- CryptGenRandom()
}
_tprintf(L"RdRand result is %d\n", r);
if(ncbSz > 0)
{
_tprintf(L"Random Bytes (%d): ", ncbSz);
for(UINT i = 0; i < ncbSz; i++)
{
_tprintf(L"%02x", arr[i]);
}
_tprintf(L"\n");
}
This is the header file:
//This class uses the Intel RdRand CPU instruction for
//the random number generator that is compliant with security
//and cryptographic standards:
//
// http://en.wikipedia.org/wiki/RdRand
//
#pragma once
class CHardwareRandomNumberGenerator
{
public:
CHardwareRandomNumberGenerator(void);
~CHardwareRandomNumberGenerator(void);
int GetHardwareRandomBytes(BYTE* pOutRndVals = NULL, UINT* pncbInOutSzRndVals = NULL, DWORD dwmsMaxWait = 5 * 1000);
private:
BOOL bRdRandSupported;
static BOOL __is_cpuid_supported(void);
static BOOL __cpuid(int data[4], int nID);
int __fillHardwareRandomBytes(BYTE* pOutRndVals, UINT* pncbInOutSzRndVals, UINT& ncbOutSzWritten, DWORD dwmsMaxWait);
};
And the implementation file:
//This class uses the Intel RdRand CPU instruction for
//the random number generator that is compliant with security
//and cryptographic standards:
//
// http://en.wikipedia.org/wiki/RdRand
//
//[32-bit Intel-only implementation]
//
#include "HardwareRandomNumberGenerator.h"
CHardwareRandomNumberGenerator::CHardwareRandomNumberGenerator(void) :
bRdRandSupported(FALSE)
{
//Check that RdRand instruction is supported
if(__is_cpuid_supported())
{
//It must be Intel CPU
int name[4] = {0};
if(__cpuid(name, 0))
{
if(name[1] == 0x756e6547 && //uneG
name[2] == 0x6c65746e && //letn
name[3] == 0x49656e69) //Ieni
{
//Get flag itself
int data[4] = {0};
if(__cpuid(data, 1))
{
//Check bit 30 on the 2nd index (ECX register)
if(data[2] & (0x1 << 30))
{
//Supported!
bRdRandSupported = TRUE;
}
}
}
}
}
}
CHardwareRandomNumberGenerator::~CHardwareRandomNumberGenerator(void)
{
}
int CHardwareRandomNumberGenerator::GetHardwareRandomBytes(BYTE* pOutRndVals, UINT* pncbInOutSzRndVals, DWORD dwmsMaxWait)
{
//Generate random numbers into the 'pOutRndVals' buffer
//INFO: This function uses CPU/hardware to generate a set of
// random numbers that are cryptographically strong.
//INFO: For more details refer to:
// http://electronicdesign.com/learning-resources/understanding-intels-ivy-bridge-random-number-generator
//INFO: To review the "ANALYSIS OF INTEL’S IVY BRIDGE DIGITAL RANDOM NUMBER GENERATOR" check:
// http://www.cryptography.com/public/pdf/Intel_TRNG_Report_20120312.pdf
//'pOutRndVals' = if not NULL, points to the buffer that receives random bytes
//'pncbInOutSzRndVals' = if not NULL, on the input must contain the number of BYTEs to write into the 'pOutRndVals' buffer
// on the output will contain the number of BYTEs actually written into the 'pOutRndVals' buffer
//'dwmsMaxWait' = timeout for this method, expressed in milliseconds
//RETURN:
// = 1 if hardware random number generator is supported & the buffer in 'pOutRndVals' was successfully filled out with random numbers
// = 0 if hardware random number generator is supported, but timed out while filling out the buffer in 'pOutRndVals'
// INFO: Check 'pncbInOutSzRndVals', it will contain the number of BYTEs actually written into the 'pOutRndVals' array
// = -1 if general error
// = -2 if hardware random number generator is not supported on this hardware
// INFO: Requires Intel Ivy Bridge, or later chipset.
UINT ncbSzWritten = 0;
int nRes = __fillHardwareRandomBytes(pOutRndVals, pncbInOutSzRndVals, ncbSzWritten, dwmsMaxWait);
if(pncbInOutSzRndVals)
*pncbInOutSzRndVals = ncbSzWritten;
return nRes;
}
int CHardwareRandomNumberGenerator::__fillHardwareRandomBytes(BYTE* pOutRndVals, UINT* pncbInOutSzRndVals, UINT& ncbOutSzWritten, DWORD dwmsMaxWait)
{
//INTERNAL METHOD
ncbOutSzWritten = 0;
//Check support
if(!bRdRandSupported)
return -2;
__try
{
//We must have a buffer to fill out
if(pOutRndVals &&
pncbInOutSzRndVals &&
(int*)*pncbInOutSzRndVals > 0)
{
//Begin timing ticks in ms
DWORD dwmsIniTicks = ::GetTickCount();
UINT ncbSzRndVals = *pncbInOutSzRndVals;
//Fill in data array
for(UINT i = 0; i < ncbSzRndVals; i += sizeof(DWORD))
{
DWORD random_value;
int got_value;
int nFailureCount = 0;
//Since RdRand instruction may not have enough random numbers
//in its buffer, we may need to "loop" while waiting for it to
//generate more results...
//For the first 10 failures we'll simply loop around, after which we
//will wait for 1 ms per each failed iteration to save on the overall
//CPU cycles that this method may consume.
for(;; nFailureCount++ < 10 ? 1 : ::Sleep(1))
{
__asm
{
push eax
push edx
xor eax, eax
;RDRAND instruction = Set random value into EAX. Will set overflow [C] flag if success
_emit 0x0F
_emit 0xC7
_emit 0xF0
mov edx, 1
;Check if the value was available in the RNG buffer
jc lbl_set_it
;It wasn't available
xor edx, edx
xor eax, eax
lbl_set_it:
mov dword ptr [got_value], edx
mov dword ptr [random_value], eax
pop edx
pop eax
}
if(got_value)
{
//Got random value OK
break;
}
//Otherwise RdRand instruction failed to produce a random value
//See if we timed out?
if(::GetTickCount() - dwmsIniTicks > dwmsMaxWait)
{
//Timed out
return 0;
}
//Try again
}
//We now have a 4-byte, or DWORD, random value
//So let's put it into our array
if(i + sizeof(DWORD) <= ncbSzRndVals)
{
*(DWORD*)(pOutRndVals + i) = random_value;
ncbOutSzWritten += sizeof(DWORD);
}
else if(i + sizeof(WORD) + sizeof(BYTE) <= ncbSzRndVals)
{
*(WORD*)(pOutRndVals + i) = (WORD)random_value;
*(BYTE*)(pOutRndVals + i + sizeof(WORD)) = (BYTE)(random_value >> 16);
ncbOutSzWritten += sizeof(WORD) + sizeof(BYTE);
}
else if(i + sizeof(WORD) <= ncbSzRndVals)
{
*(WORD*)(pOutRndVals + i) = (WORD)random_value;
ncbOutSzWritten += sizeof(WORD);
}
else if(i + sizeof(BYTE) <= ncbSzRndVals)
{
*(BYTE*)(pOutRndVals + i) = (BYTE)random_value;
ncbOutSzWritten += sizeof(BYTE);
}
else
{
//Shouldn't even be here
ASSERT(NULL);
return -1;
}
}
}
}
__except(1)
{
//A generic catch-all just to be sure...
return -1;
}
return 1;
}
BOOL CHardwareRandomNumberGenerator::__is_cpuid_supported(void)
{
//See if CPUID command is supported
//INFO: Some really old CPUs may not support it!
//RETURN: = TRUE if yes, and __cpuid() can be called
BOOL bSupported;
DWORD nEFlags = 0;
__try
{
#define FLAG_VALUE (0x1 << 21)
_asm
{
//remember EFLAGS & EAX
pushfd
push eax
//Set bit 21 in EFLAGS
pushfd
pop eax
or eax, FLAG_VALUE
push eax
popfd
//Check if bit 21 in EFLAGS was set
pushfd
pop eax
mov nEFlags, eax
//Restore EFLAGS & EAX
pop eax
popfd
}
bSupported = (nEFlags & FLAG_VALUE) ? TRUE : FALSE;
}
__except(1)
{
//A generic catch-all just to be sure...
bSupported = FALSE;
}
return bSupported;
}
BOOL CHardwareRandomNumberGenerator::__cpuid(int data[4], int nID)
{
//INFO: Call __is_cpuid_supported() first to see if this function is supported
//RETURN:
// = TRUE if success, check 'data' for results
BOOL bRes = TRUE;
__try
{
_asm
{
push eax
push ebx
push ecx
push edx
push esi
//Call CPUID
mov eax, nID
_emit 0x0f ;CPUID
_emit 0xa2
//Save 4 registers
mov esi, data
mov dword ptr [esi], eax
mov dword ptr [esi + 4], ebx
mov dword ptr [esi + 8], ecx
mov dword ptr [esi + 12], edx
pop esi
pop edx
pop ecx
pop ebx
pop eax
}
}
__except(1)
{
//A generic catch-all just to be sure...
bRes = FALSE;
}
return bRes;
}
So idk, guys, I haven't done any extensive cryptographic analysis of the data produced by the method above ... so you'll be the judge. Any updates are welcome!
Here's a little bit of code that produces a sequence of "cryptographically strong" bytes, using the Microsoft Cryptography API... I've used this myself, as aside from anything else it's a nice way to just get a decent random sequence of numbers... I wasn't using it for cryptography:
#include <wincrypt.h>
class RandomSequence
{
HCRYPTPROV hProvider;
public:
RandomSequence(void) : hProvider(NULL) {
if (FALSE == CryptAcquireContext(&hProvider, NULL, NULL, PROV_RSA_FULL, 0)) {
// failed, should we try to create a default provider?
if (NTE_BAD_KEYSET == GetLastError()) {
if (FALSE == CryptAcquireContext(&hProvider, NULL, NULL, PROV_RSA_FULL, CRYPT_NEWKEYSET)) {
// ensure the provider is NULL so we could use a backup plan
hProvider = NULL;
}
}
}
}
~RandomSequence(void) {
if (NULL != hProvider) {
CryptReleaseContext(hProvider, 0U);
}
}
BOOL generate(BYTE* buf, DWORD len) {
if (NULL != hProvider) {
return CryptGenRandom(hProvider, len, buf);
}
return FALSE;
}
};
It's a simple little class that tries to get an RSA Crytographic "provider", and if that fails it tries to create one. Then if all is well, generate will fill your buffer with love. Uhm... I mean random bytes.
This has worked for me on XP, Win7 and Win8, tho' I've not actually used it for cryptography, I just needed a decent sequence of random-ish bytes.
#include <stdexcept>
#include <string>
#include <sstream>
#ifndef __linux__
// For Windows
// Also Works with: MinGW Compiler
#include <windows.h>
#include <wincrypt.h> /* CryptAcquireContext, CryptGenRandom */
int RandBytes(void* const byte_buf, const size_t byte_len) {
HCRYPTPROV p;
ULONG i;
if (CryptAcquireContext(&p, NULL, NULL, PROV_RSA_FULL, CRYPT_VERIFYCONTEXT) == FALSE) {
throw runtime_error{"RandBtyes(): CryptAcquireContext failed."};
}
if (CryptGenRandom(p, byte_len, (BYTE*)byte_buf) == FALSE) {
throw runtime_error{"RandBytes(): CryptGenRandom failed."};
}
CryptReleaseContext(p, 0);
return 0;
}
#endif // Not Linux
#if __linux__
#include <fctl.h>
int RandBytes(void* const byte_buf, const size_t byte_len) {
// NOTE: /dev/random is supposately cryptographically safe
int fd = open("/dev/urandom", O_RDONLY);
if (fd < 0) {
throw runtime_error{"RandBytes(): failed to open"};
}
int rd_len = 0;
while(rd_len < byte_len) {
int n = read(fd, byte_buf, byte_len);
if (n < 0){
stringstream ss;
ss << "RandBytes(): failed (n=" << n << ") " << "(rd_len=" << rd_len << ")";
throw runtime_error{ss.str()};
}
rd_len += n;
}
close(fd);
return 0;
}
#endif
Not sure how portable this is, probably just BSD/Mac; but here's arc4random_buf:
void arc4random_buf(void *buf, size_t nbytes);
MacOS man page says:
These functions use a cryptographic pseudo-random number generator to generate high quality random bytes very quickly.

Read eax register

I would like to know if it is possible to read the eax register of another process immediately after an assembly instruction has been executed.
In my case I have the following assembly code:
mov byte ptr ss:[EBP-4]
call dword ptr ds:[<&MSVCR100.??2#YAPAXI#Z>]
add esp, 4
The idea is to get the eax value just after the "call dword ptr ds:[<&MSVCR100.??2#YAPAXI#Z>]" instruction has been executed.
Indeed, I have to retrieve the memory address returned by the instanciation of an object created in another process, in my C++ code.
Dunno if I have been clear enough. And please forgive my bad english.
You could debug the process using a hardware breakpoint.
Example using winapi:
DWORD address = 0x12345678; // address of the instruction after the call
DebugActiveProcess(pid); // PID of target process
CONTEXT ctx = {0};
ctx.ContextFlags = CONTEXT_DEBUG_REGISTERS | CONTEXT_INTEGER;
ctx.Dr0 = address;
ctx.Dr7 = 0x00000001;
SetThreadContext(hThread, &ctx); // hThread with enough permissions
DEBUG_EVENT dbgEvent;
while (true)
{
if (WaitForDebugEvent(&dbgEvent, INFINITE) == 0)
break;
if (dbgEvent.dwDebugEventCode == EXCEPTION_DEBUG_EVENT &&
dbgEvent.u.Exception.ExceptionRecord.ExceptionCode == EXCEPTION_SINGLE_STEP)
{
if (dbgEvent.u.Exception.ExceptionRecord.ExceptionAddress == (LPVOID)address)
{
GetThreadContext(hThread, &ctx);
DWORD eax = ctx.Eax; // eax get
}
}
ContinueDebugEvent(dbgEvent.dwProcessId, dbgEvent.dwThreadId, DBG_CONTINUE);
}