Excessive stack usage for simple function in debug build - c++

I have simple class using a kind of ATL database access.
All functions are defined in a header file.
The problematic functions all do the same. There are some macros in use. The generated code looks like this
// Macro-expanded sketch of a binding initializer: stores the table name when
// sName is non-null, then issues one AddCol call per column. The "..." parts
// are elided in this excerpt, so the block is illustrative rather than
// compilable.
void InitBindings()
{
if (sName) // Static global char*
m_sTableName = sName; // Save into member
// Each AddCol call sits in its own brace-delimited scope, and the value it
// returns is not used.
{ AddCol("Name", some_constant_data... _GetOleDBType(...), ...); };
{ AddCol("Name1", some_other_constant_data_GetOleDBType(...), ...); };
...
}
AddCol returns a reference to a structure, but as you see it is ignored.
When I look into the assembler code of a function that uses 6 AddCol calls, I can see that the function requires 2176 bytes of stack space. I have functions that require 20 KB and more. And in the debugger I can see that the stack isn't used at all. (All initialized to 0xCC and never touched.)
See assembler code at the end.
The problem can be seen with VS-2015 and VS-2017, but only in Debug mode.
In Release mode the function reserves no extra stack space at all.
The only rule I see is: more AddCol calls cause more stack to be reserved. I can see that approximately 500 bytes are reserved per AddCol call.
Again: The function returns no object, it returns a reference to the binding information.
I already used the following pragmas in front of the function (but inside the class definition in the header):
__pragma(runtime_checks("", off)) __pragma(optimize("ts", on)) __pragma(strict_gs_check(push, off))
But to no avail. These pragmas should turn optimization on and switch off runtime checks and stack checks. How can I reduce this unneeded stack space that is allocated? In some cases I can see stack overflows in the debug version when these functions are used. There are no problems in the release version.
; 325 : BIND_BEGIN(CMasterData, _T("tblMasterData"))
push ebp
mov ebp, esp
sub esp, 2176 ; 00000880H
push ebx
push esi
push edi
mov DWORD PTR _this$[ebp], ecx
mov eax, OFFSET ??_C#_1BM#GOLNKAI#?$AAt?$AAb?$AAl?$AAM?$AAa?$AAs?$AAt?$AAe?$AAr?$AAD?$AAa?$AAt?$AAa?$AA?$AA#
test eax, eax
je SHORT $LN2#InitBindin
push OFFSET ??_C#_1BM#GOLNKAI#?$AAt?$AAb?$AAl?$AAM?$AAa?$AAs?$AAt?$AAe?$AAr?$AAD?$AAa?$AAt?$AAa?$AA?$AA#
mov ecx, DWORD PTR _this$[ebp]
add ecx, 136 ; 00000088H
call DWORD PTR __imp_??4?$CStringT#_WV?$StrTraitMFC_DLL#_WV?$ChTraitsCRT#_W#ATL#####ATL##QAEAAV01#PB_W#Z
$LN2#InitBindin:
; 326 : // Columns:
; 327 : B$C_IDENT (_T("Id"), m_lId);
push 0
push 0
push 1
push 4
push 0
call ?_GetOleDBType#ATL##YAGAAJ#Z ; ATL::_GetOleDBType
add esp, 4
movzx eax, ax
push eax
push 0
push OFFSET ??_C#_15NCCOGFKM#?$AAI?$AAd?$AA?$AA#
mov ecx, DWORD PTR _this$[ebp]
call ?AddCol#CDBAccess#DB##QAEAAUS_BIND#2#PB_WKGKW4TYPE#32#0_N#Z ; DB::CDBAccess::AddCol
; 328 : B$C (_T("Name"), m_szName);
push 0
push 0
push 0
push 122 ; 0000007aH
mov eax, 4
push eax
call ?_GetOleDBType#ATL##YAGQA_W#Z ; ATL::_GetOleDBType
add esp, 4
movzx ecx, ax
push ecx
push 4
push OFFSET ??_C#_19DINFBLAK#?$AAN?$AAa?$AAm?$AAe?$AA?$AA#
mov ecx, DWORD PTR _this$[ebp]
call ?AddCol#CDBAccess#DB##QAEAAUS_BIND#2#PB_WKGKW4TYPE#32#0_N#Z ; DB::CDBAccess::AddCol
; 329 : B$C (_T("Data"), m_data);
push 0
push 0
push 0
push 4
push 128 ; 00000080H
call ?_GetOleDBType#ATL##YAGAAVCComBSTR#1##Z ; ATL::_GetOleDBType
add esp, 4
movzx eax, ax
push eax
push 128 ; 00000080H
push OFFSET ??_C#_19IEEMEPMH#?$AAD?$AAa?$AAt?$AAa?$AA?$AA#
mov ecx, DWORD PTR _this$[ebp]
call ?AddCol#CDBAccess#DB##QAEAAUS_BIND#2#PB_WKGKW4TYPE#32#0_N#Z ; DB::CDBAccess::AddCol

It is a compiler bug. Already known in connect.
EDIT: The problem seems to be fixed in VS-2017 15.5.1
The problem has to do with a bug in the built in offsetof.
It is not possible for me to #undef _CRT_USE_BUILTIN_OFFSETOF as written in this case.
For me it only works to #undef offsetof and to use one of these:
#define myoffsetof1(s,m) ((size_t)&reinterpret_cast<char const volatile&>((((s*)0)->m)))
#define myoffsetof2(s, m) ((size_t)&(((s*)0)->m))
#undef offsetof
#define offsetof myoffsetof1
All ATL DB consumers are affected.
Here is a minimal repro that shows the bug. Set a breakpoint on the Init function, look into the assembler code, and wonder how much stack is used!
// StackUsage.cpp : Defines the entry point for the console application.
//
#include "stdafx.h"
#include <string>
#include <list>
#include <iostream>
using namespace std;
// Plain record used by the repro: five members of each of four kinds, giving
// CFoo::Init twenty distinct members to describe through the COL macro.
// GUID and DBTIMESTAMP are declared outside this excerpt.
struct CRec
{
// Fixed-size character buffers.
char t1[20];
char t2[20];
char t3[20];
char t4[20];
char t5[20];
// Integer members.
int i1, i2, i3, i4, i5;
// GUID members.
GUID g1, g2, g3, g4, g5;
// Timestamp members.
DBTIMESTAMP d1, d2, d3, d4, d5;
};
// sizeofmember(s,m): size in bytes of member m of type s. The null-pointer
// expression is only an operand of sizeof, so it is never evaluated.
#define sizeofmember(s,m) sizeof(reinterpret_cast<const s *>(0)->m)
// typeofmember(c,m): result of _GetOleDBType for member m of type c, named
// through a null c*. _GetOleDBType is declared outside this excerpt.
#define typeofmember(c,m) _GetOleDBType(((c*)0)->m)
// Two hand-written replacements for offsetof, both computing the address of
// member m relative to a null s*.
#define myoffsetof1(s,m) ((size_t)&reinterpret_cast<char const volatile&>((((s*)0)->m)))
#define myoffsetof2(s, m) ((size_t)&(((s*)0)->m))
// Uncomment these lines to fix the bug
// #undef offsetof
// #define offsetof myoffsetof1
// COL(n,v): a braced AddCol call passing the column name n together with the
// offset, type and size of CRec member v.
#define COL(n,v) { AddCol(n,offsetof(CRec,v),typeofmember(CRec,v),sizeofmember(CRec,v)); }
// Minimal repro class for the debug-build stack-usage problem: constructing a
// CFoo runs Init(), which describes every member of CRec through the COL
// macro. To observe the problem, inspect the stack reservation in the prologue
// of Init() in a debug build.
class CFoo
{
public:
    // Registers all columns immediately on construction.
    CFoo()
    {
        Init();
    }

    // Emits one COL(...) per CRec member. Each column name matches the member
    // it describes, so the printed table can be read back against CRec.
    void Init()
    {
        COL("t1", t1);
        COL("t2", t2);
        COL("t3", t3);
        COL("t4", t4);
        COL("t5", t5);
        COL("i1", i1);
        COL("i2", i2);
        COL("i3", i3);
        COL("i4", i4);
        COL("i5", i5);
        COL("g1", g1);
        COL("g2", g2);
        COL("g3", g3);
        COL("g4", g4);
        COL("g5", g5);
        COL("d1", d1);
        COL("d2", d2);
        COL("d3", d3);
        COL("d4", d4);
        COL("d5", d5);
    }

    // Prints one tab-separated line for a column.
    //   szName  - column name
    //   nOffset - byte offset of the member inside CRec
    //   wType   - type code produced by typeofmember
    //   nSize   - size of the member in bytes
    void AddCol(PCSTR szName, ULONG nOffset, DBTYPE wType, ULONG nSize)
    {
        cout << szName << '\t' << nOffset << '\t' << wType << '\t' << nSize << endl;
    }
};
// Entry point of the repro: constructing a CFoo is all that is needed, because
// its constructor calls Init().
int main()
{
    CFoo repro;
    return 0;
}

Related

c++ stack memory scope with curly braces

I'm looking for a double check on my understanding. I ran across code of this form:
#define BUFLEN_256 256
// Exhibit for the question: charPtr is pointed at an array declared inside the
// if-block and is then read after that block has closed.
int main()
{
const char* charPtr = "";
if (true /* some real test here */)
{
// buf is local to this block; it ceases to exist at the closing brace below.
char buf[BUFLEN_256] = { 0 };
snprintf(buf, BUFLEN_256, "Some string goes here..");
charPtr = buf;
}
// charPtr still holds the address of buf here, although buf is out of scope.
std::cout << charPtr << std::endl; // Is accessing charPtr technically dangerous here?
}
My immediate thought was bug, the stack memory assigned to buf[] is no longer guaranteed to belong to the array once you exit the if(){}. But the code builds and runs without problem, and in double checking myself I got confused. I'm not good at assembly, but if I'm reading it correctly it does not appear that the stack pointer is reset after leaving the curly braces. Can someone double check me on that and chime in as to whether this code is technically valid? Here is the code with the assembly (built with Visual Studio 2019). My thought is this code is not OK, but I've been wrong on odd issues before.
#define BUFLEN_256 256
int main()
{
00DA25C0 push ebp
00DA25C1 mov ebp,esp
00DA25C3 sub esp,1D8h
00DA25C9 push ebx
00DA25CA push esi
00DA25CB push edi
00DA25CC lea edi,[ebp-1D8h]
00DA25D2 mov ecx,76h
00DA25D7 mov eax,0CCCCCCCCh
00DA25DC rep stos dword ptr es:[edi]
00DA25DE mov eax,dword ptr [__security_cookie (0DAC004h)]
00DA25E3 xor eax,ebp
00DA25E5 mov dword ptr [ebp-4],eax
00DA25E8 mov ecx,offset _1FACD15F_scratch#cpp (0DAF029h)
00DA25ED call #__CheckForDebuggerJustMyCode#4 (0DA138Eh)
const char* charPtr = "";
00DA25F2 mov dword ptr [charPtr],offset string "" (0DA9B30h)
if (true /* some real test here */)
00DA25F9 mov eax,1
00DA25FE test eax,eax
00DA2600 je main+7Ah (0DA263Ah)
{
char buf[BUFLEN_256] = { 0 };
00DA2602 push 100h
00DA2607 push 0
00DA2609 lea eax,[ebp-114h]
00DA260F push eax
00DA2610 call _memset (0DA1186h)
00DA2615 add esp,0Ch
snprintf(buf, BUFLEN_256, "Some string goes here..");
00DA2618 push offset string "Some string goes here.." (0DA9BB8h)
00DA261D push 100h
00DA2622 lea eax,[ebp-114h]
00DA2628 push eax
00DA2629 call _snprintf (0DA1267h)
00DA262E add esp,0Ch
charPtr = buf;
00DA2631 lea eax,[ebp-114h]
00DA2637 mov dword ptr [charPtr],eax
}
std::cout << charPtr << std::endl;
00DA263A mov esi,esp
00DA263C push offset std::endl<char,std::char_traits<char> > (0DA103Ch)
00DA2641 mov eax,dword ptr [charPtr]
00DA2644 push eax
00DA2645 mov ecx,dword ptr [__imp_std::cout (0DAD0D4h)]
00DA264B push ecx
00DA264C call std::operator<<<std::char_traits<char> > (0DA11AEh)
00DA2651 add esp,8
00DA2654 mov ecx,eax
00DA2656 call dword ptr [__imp_std::basic_ostream<char,std::char_traits<char> >::operator<< (0DAD0A0h)]
00DA265C cmp esi,esp
00DA265E call __RTC_CheckEsp (0DA129Eh)
}
00DA2663 xor eax,eax
00DA2665 push edx
00DA2666 mov ecx,ebp
00DA2668 push eax
00DA2669 lea edx,ds:[0DA2694h]
00DA266F call #_RTC_CheckStackVars#8 (0DA1235h)
00DA2674 pop eax
00DA2675 pop edx
00DA2676 pop edi
00DA2677 pop esi
00DA2678 pop ebx
00DA2679 mov ecx,dword ptr [ebp-4]
00DA267C xor ecx,ebp
00DA267E call #__security_check_cookie#4 (0DA1181h)
00DA2683 add esp,1D8h
00DA2689 cmp ebp,esp
00DA268B call __RTC_CheckEsp (0DA129Eh)
00DA2690 mov esp,ebp
00DA2692 pop ebp
00DA2693 ret
00DA2694 add dword ptr [eax],eax
00DA2696 add byte ptr [eax],al
00DA2698 pushfd
00DA2699 fiadd dword ptr es:[eax]
00DA269C in al,dx
00DA269D ?? ??????
00DA269E ?? ??????
}
My immediate thought was bug, the stack memory assigned to buf[] is no longer guaranteed to belong to the array once you exit the if(){}.
That is correct.
But the code builds and runs without problem
Undefined Behavior. In the cout << charPtr statement, charPtr is a dangling pointer to invalid memory. Whether or not the memory has been physically freed is irrelevant. The array has gone out of scope, so its lifetime has ended.
I'm not good at assembly, but if I'm reading it correctly it does not appear that the stack pointer is reset after leaving the curly braces.
That is correct.
The memory for the array is being pre-allocated at the top of the stack frame when the function is entered (as part of the sub esp, 1D8h instruction), and then gets released during cleanup of the stack frame when the function exits (as part of the add esp, 1D8h instruction).
As you can see, when the if is entered, the very first thing it does is to call _memset() to zero out an array which already exists at [ebp-114h].
But that is an implementation detail, don't rely on that.
Can someone double check me on that and chime in as to whether this code is technically valid?
It is not.
What you're seeing is 'undefined' behavior. Stack memory is typically allocated all in one go at the start. So when a variable goes out-of-scope on the stack, that memory becomes available for re-use. Since you're not overwriting the stack with anything after the if statement, the data previously stored there is still intact. If you were to allocate additional memory/data to the stack after the if statement, you'd see a much different result.
See this post here:
What happens when a variable goes out of scope?
Edit:
To elaborate and demonstrate this, consider the following modification of your code (Compiled on VS2019 v142 x64):
#include <iostream>
#define BUFLEN_256 256
// Demonstration for the answer: prints through the dangling charPtr, writes
// far past the end of a second stack array on purpose, then prints again to
// show that the bytes charPtr refers to have changed.
int main()
{
char* charPtr;
char other_buf[BUFLEN_256] = { 0 };
char* charPtr2 = other_buf;
if (true /* some real test here */)
{
// buf is local to this block; charPtr keeps its address after the block ends.
char buf[BUFLEN_256] = { 0 };
snprintf(buf, BUFLEN_256, "Some string goes here..");
charPtr = buf;
}
std::cout << charPtr << std::endl;
// Intentional overrun: 3000 bytes are written starting at other_buf, which
// holds only BUFLEN_256 (256) bytes.
for (int n = 0; n < 3000; ++n)
{
*charPtr2 = 'a';
charPtr2++;
}
std::cout << charPtr << std::endl;
}
Output
Some string goes here..
Some string goes haaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaca
Of course, keeping in mind that every compiler handles optimizations differently, and this may or may not happen in every case. That is why the behavior is 'undefined'. This example is more-so demonstrating overrunning the stack intentionally (buffer-overrun), but it illustrates the same effect. I'd produce a more direct example of legitimate cases where this could happen, but ironically undefined behavior is difficult to intentionally reproduce.
Yes, accessing charPtr in this way is undefined behaviour - and hence dangerous - because buf goes out of scope at the closing brace.
In practise, the code may work (or appear to work) because the memory used for buf is not re-used immediately but you should not, of course, rely on that. Whoever wrote this code made a mistake.

C++ ASM push offset string problem in release mode

The function I want to call is a function of a class:
void D3DBase::SetTexture(const std::string& path);
When I call it from an asm block it works, but it gave an error when I built it in release mode. When I inspected memory I realized that I needed to shift the string offset by 4 bytes, and when I tried that it worked.
My question is: why do I have to do that? What is the reason for this?
// Global copy of the path; the asm block below pushes an address derived from
// this object.
std::string __tmpString = "";
// Copies str into the global, then calls the routine at the hard-coded address
// 0x401FC0 with `table` in ecx and one pushed stack argument.
// NOTE(review): presumably 0x401FC0 is D3DBase::SetTexture in the target
// binary and `table` is its object pointer — confirm.
void SetTexture(DWORD table, const std::string& str)
{
__tmpString = str;
// With NDEBUG defined, the pushed address is 4 bytes before the global;
// otherwise it is the address of the global itself.
__asm {
#ifdef NDEBUG
push offset __tmpString - 0x4
#else
push offset __tmpString
#endif
mov ecx, table
mov eax, 0x401FC0
call eax
}
}

Frame not found in module

#include <iostream>
#include <Windows.h>
using namespace std;
// Exhibit for the question: tries to define a `print` routine and call it,
// all inside a single __asm block.
int main() {
const char* message = ("hi");
// The body of print (through its ret) is laid out first in the block; the
// instructions that push `message` and call print come after that ret.
// NOTE(review): presumably this layout is the source of the reported error —
// confirm.
__asm {
print:
push ebp;
mov ebp, esp;
mov ebx, [ebp + 8]; <- First argument
push ebx;
call printf;
mov esp, ebp;
pop ebp;
ret;
mov eax, message;
push eax;
call print;
};
std::cin.get();
return (0);
};
I'm using an inline assembler obviously.
I don't understand why this is an error, I've tried using leave instead of destroying the stack frame manually, no luck. I don't understand where I went wrong here.

Trying to create a windows 8 syscall callgate function

I have a windows 7 callgate function that I use to call NT functions directly:
// Windows 7 syscall thunk.
// Naked __fastcall function, so ecxId arrives in ecx and edxArgs in edx and
// the compiler emits no prologue or epilogue. It moves the id into eax, loads
// the global m_param (declared outside this excerpt) into ecx, and calls
// through the pointer stored at fs:[0xc0].
__declspec(naked)
NTSTATUS __fastcall wow64 ( DWORD ecxId, char *edxArgs )
{
__asm
{
mov eax, ecx;
mov ecx, m_param;
call DWORD ptr fs:[0xc0];
// Adjusts esp by 4 after the call, as the Windows 7 ntdll stub quoted in
// the question does.
add esp, 0x4;
retn;
};
}
// Variadic front end for wow64: forwards the call index `id` and the variadic
// arguments that follow it to the syscall thunk, returning its NTSTATUS.
// The va_list is handed to wow64 as its char* argument block.
NTSTATUS callGate ( DWORD id, ... )
{
    va_list valist;
    va_start(valist, id);
    const NTSTATUS status = wow64(id, valist);
    // Every va_start must be paired with va_end before the function returns.
    va_end(valist);
    return status;
}
// Example wrapper for NtClose: clears m_param, then issues call index 0xc
// through callGate with the handle as the only argument.
NTSTATUS closeHandle ( void *object )
{
    const DWORD closeCallIndex = 0xc;
    m_param = 0;
    return callGate ( closeCallIndex, object );
}
I am trying to do the same thing for windows 8.1. I have updated all of the function call indexes; however I noticed the actual callgate function is quite different on windows 8.1:
Here is what the actual call gate looks like (located in ntdll.dll) for the function ZwCreateThreadEx
mov eax, 0xA5 //the call index
xor ecx, ecx //(m_param)
lea edx, dword ptr ss:[esp + 0x4] //this causes an sp-analysis failure in IDA
call dword ptr fs:[0xC0]
add esp, 0x4
retn 0x2C
Now here is the EXACT same NT function (ZwCreateThreadEx) on windows 8.1
mov eax, 0xB0 //the call index
call dword ptr fs:[0xC0]
retn 0x2C //2c/4 = 11 parameters
I have been trying all kinds of things to get this working on Windows 8.1, but to no avail. I cannot explain what the issue is or what is going wrong; all I know is that I am doing it correctly on Windows 7.
From the looks of the W8.1 function, I have attempted to come up with this single function (Does not work):
// Scratch globals used by the asm below to hold values popped off the stack.
DWORD dwebp,dwret,dwparams; //for saving stuff
// Windows 8.1 attempt from the question (stated not to work): pops the saved
// ebp, return address, id and parameter count off the stack, pushes the return
// address back, and jumps through the pointer at fs:[0xc0].
NTSTATUS __cdecl callGate ( DWORD id, DWORD numparams, ... )
{
_asm
{
pop dwebp; //save ebp off stack
pop dwret; //save return address
pop eax; //save id
pop dwparams; //save param count
push dwret; //push return addy back onto stack cuz thats how windows has it
JMP DWORD ptr fs:[0xc0]; //call with correct stackframe (i think)
// The remaining instructions follow an unconditional JMP rather than a CALL.
mov ecx, numparams; //store num params
imul ecx, 4; //multiply numparams by sizeof(int)
add esp, ecx; //add to esp
ret;
};
}
Any help would be appreciated greatly.
Your new callGate function doesn't set up the stack frame you want, the return address at the top of the stack is return address of callGate not the instruction after the call.
This is what the stack looks like after the CALL instruction is executed in your example ZwCreateThreadEx from Windows 8.1:
return address (retn 0x2c instruction)
return address (caller of ZwCreateThreadEx)
arguments (11 DWORDs)
Here's what the stack looks like after the JMP instruction is executed in your new callGate function:
return address (caller of callGate)
arguments
There are other problems with your new callGate function. It saves values in global variables, which means your function isn't thread-safe: two threads can't call callGate at the same time without trashing these saved values. It also uses inline assembly, which both makes your code more complicated than it needs to be and makes it dependent on undocumented behaviour: how the compiler will set up the stack for the function.
Here's how I write your Windows 8.1 version of callGate in MASM:
_TEXT   SEGMENT

; Largest argument count for which a call stub and a table entry are generated.
MAXARGS = 16

; do_call argcount
;   Emits the stub labelled @@call<argcount>: a call through the pointer at
;   fs:[0C0h] followed by a ret that also removes argcount DWORD arguments.
do_call MACRO   argcount
@@call&argcount:
        call    DWORD PTR fs:[0C0h]
        ret     argcount * 4
        ENDM

; call_table_entry argcount
;   Emits one jump-table slot holding the address of @@call<argcount>.
call_table_entry MACRO argcount
        DD      OFFSET @@call&argcount
        ENDM

; _callGate(id, numparams, ...)
;   Removes id and numparams from the stack so that the return address sits
;   directly above the remaining arguments, then jumps to the stub generated
;   for numparams arguments. eax carries id into the call.
_callGate PROC
        pop     edx             ; return address
        pop     eax             ; id
        pop     ecx             ; numparams
        push    edx             ; return address
        cmp     ecx, MAXARGS
        ; numparams is unsigned: use an unsigned comparison so that values of
        ; 80000000h and above are rejected instead of indexing before the
        ; table.
        ja      @@fail
        jmp     [@@call_table + ecx * 4]

        ; One stub per argument count, 0 through MAXARGS.
@@args = 0
        REPT    MAXARGS + 1
        do_call %@@args
@@args = @@args + 1
        ENDM

@@fail:
        ; add better error handling
        int     3
        jmp     @@fail

        ; Jump table indexed by argument count, 0 through MAXARGS.
@@call_table:
@@args = 0
        REPT    MAXARGS + 1
        call_table_entry %@@args
@@args = @@args + 1
        ENDM

_callGate ENDP

_TEXT   ENDS

        END
This implementation is limited to MAXARGS arguments (change the value if any Windows system call takes more than 16 arguments). It uses macros to generate a table of CALL/RET code blocks to avoid having to store the number of arguments somewhere across the call. I have a version that supports any number of arguments, but it's more complicated and a fair bit slower. This implementation is untested; I don't have Windows 8.1.

Replacing assembly instruction changes other instructions

I am trying to place a call instruction at the start of a function to simulate a hook, so I should be replacing 6 bytes at the beginning of the function with my call, which is 2 bytes for the opcode and a dword for the address. However, here is the disassembly of the function before I hook it:
void realFunction()
{
00B533C0 push ebp
00B533C1 mov ebp,esp
00B533C3 sub esp,0C0h
00B533C9 push ebx
00B533CA push esi
00B533CB push edi
00B533CC lea edi,[ebp-0C0h]
00B533D2 mov ecx,30h
00B533D7 mov eax,0CCCCCCCCh
00B533DC rep stos dword ptr es:[edi]
MessageBox(NULL, "realFunction()", "Trace", MB_OK);
00B533DE mov esi,esp
00B533E0 push 0
00B533E2 push 0B56488h
00B533E7 push 0B56490h
00B533EC push 0
00B533EE call dword ptr ds:[0B5613Ch]
00B533F4 cmp esi,esp
00B533F6 call _RTC_CheckEsp (0B53A10h)
}
and strangely here is it after i just replace 6 bytes
void realFunction()
{
00B533C0 call fakeFunction (0B52EF0h)
00B533C5 rol byte ptr [eax],0 <--
00B533C8 add byte ptr [ebx+56h],dl <--
00B533CB push edi <--
00B533CC lea edi,[ebp-0C0h] <--
00B533D2 mov ecx,30h
00B533D7 mov eax,0CCCCCCCCh
00B533DC rep stos dword ptr es:[edi]
MessageBox(NULL, "realFunction()", "Trace", MB_OK);
00B533DE mov esi,esp
00B533E0 push 0
00B533E2 push 0B56488h
00B533E7 push 0B56490h
00B533EC push 0
00B533EE call dword ptr ds:[0B5613Ch]
00B533F4 cmp esi,esp
00B533F6 call _RTC_CheckEsp (0B53A10h)
}
code for the hook
#include <iostream>
#include <windows.h>
using namespace std;
// The function whose first bytes get patched by main(); it only shows a
// message box identifying itself.
void realFunction()
{
MessageBox(NULL, "realFunction()", "Trace", MB_OK);
}
// Hook target. Declared naked, so the compiler emits no prologue or epilogue:
// the asm blocks save the general-purpose registers and flags, a message box
// is shown, and the saved state is restored before returning.
__declspec(naked) void fakeFunction()
{
__asm {
pushad;
pushfd;
}
MessageBox(NULL, "fakeFunction()", "Trace", MB_OK);
// Restore in the reverse order of the saves above.
__asm{
popfd;
popad;
ret; //This should return back and resumes the execution of the original function;
}
}
void main()
{
DWORD size = sizeof(double);
DWORD oldProtection;
DWORD realFunctionAddr = (DWORD)realFunction;
DWORD fakeFunctionAddr = (DWORD)fakeFunction;
VirtualProtect((LPVOID)realFunctionAddr, size, PAGE_EXECUTE_READWRITE, &oldProtection);
*((PBYTE)(realFunctionAddr)) = 0xE8;
*((PDWORD)(realFunctionAddr + 1)) = fakeFunctionAddr - realFunctionAddr - 5;
VirtualProtect((LPVOID)fakeFunctionAddr, size, oldProtection, &oldProtection);
realFunction();
while (true){
cin.get();
}
}
I want to understand why this happens: why aren't just the 6 bytes I replaced changed?
As you can see, the sub esp,0C0h instruction begins at address 00B533C3, but the next instruction push ebx begins at address 00B533C9. You have overwritten addresses 00B533C0 through 00B533C5, so immediately after your 6 bytes you are in the middle of the sub esp,0C0h instruction.
The disassembler has no way of knowing that a certain byte is garbage and not an instruction, so it tries to interpret the bytes as instructions, as best as it can, and what you see is, of course, nonsensical instructions. After a while it just so happens (by coincidence) that the end of a nonsensical instruction coincides with the end of an actual instruction that used to be there, so from that point on the disassembler interprets instructions successfully, that's why the remainder of your function looks okay.
If you look at the actual bytes, and not at the assembly language mnemonic interpretations of these bytes, you will see that nothing funky is going on.
(Except, perhaps, for the fact that you appear to have replaced 5, not 6 bytes.)