I've made this sample code:
#include <vector>
// Aggregate of three ints plus factory/mutator helpers. The sample exists to
// show what a compiler emits for "build with defaults, then overwrite one field".
struct POD {
int a;
int b;
int c;
// Returns a POD initialised to {41, 51, 61}.
inline static POD make_pod_with_default()
{
POD p{ 41, 51, 61 };
return p;
}
// Non-static member that stores `a` into the POD passed as `p`;
// the object it is invoked on is not otherwise touched.
inline void change_pod_a(POD &p, int a) {
p.a = a;
}
// Non-static member that stores `b` into the POD passed as `p`.
inline void change_pod_b(POD &p, int b) {
p.b = b;
}
// Builds the default POD, then overwrites field `a` with the argument.
static POD make_pod_with_a(int a) {
POD p = make_pod_with_default();
p.change_pod_a(p, a);
return p;
}
// Builds the default POD, then overwrites field `b` with the argument
// (the parameter is named `a` here, but it is stored into `b`).
static POD make_pod_with_b(int a) {
POD p = make_pod_with_default();
p.change_pod_b(p, a);
return p;
}
};
// Pushes two factory-built PODs into a vector with capacity reserved for two,
// then returns the sum of all six fields (71 + 51 + 61 + 41 + 81 + 61 = 366).
int main()
{
std::vector<POD> vec{};
vec.reserve(2);
vec.push_back(POD::make_pod_with_a(71));
vec.push_back(POD::make_pod_with_b(81));
return vec[0].a + vec[0].b + vec[0].c + vec[1].a + vec[1].b + vec[1].c;
}
In the compiled assembly code we can see the following instructions are being generated for the first vec.push_back(...) call:
...
mov DWORD PTR $T2[esp+32], 41 ; 00000029H
...
mov DWORD PTR $T2[esp+36], 51 ; 00000033H
...
mov DWORD PTR $T5[esp+32], 71 ; 00000047H
...
mov DWORD PTR $T6[esp+44], 61 ; 0000003dH
...
There's a mov to [esp+32] for the 71, but the mov to [esp+32] for the 41 is still there, and it is useless! How can I write code for MSVC that enables this kind of optimization? Is MSVC even capable of it?
Both GCC and CLANG give more optimized versions, but CLANG wins by a large margin, with literally no overhead, in a very clean and logical fashion:
CLANG generated code:
main: # #main
push rax
mov edi, 24
call operator new(unsigned long)
mov rdi, rax
call operator delete(void*)
mov eax, 366
pop rcx
ret
Everything is done at compile time as 71 + 51 + 61 + 41 + 81 + 61 = 366!
I must admit it's painful to see my program being computed at compile time and still throw in that call to vec.reserve() in the assembly... but CLANG still takes the cake, by far! Come on MSVC, this is not a vector of volatile.
If you turn your methods constexpr, you might do:
// First construction step: starts from {2, 5, 11} and then replaces `b` with 3,
// so the returned value is {2, 3, 11}.
constexpr POD step_one()
{
POD p{2, 5, 11};
p.b = 3;
return p;
}
// Second construction step: sets field `c` of `p` to 5 in place.
constexpr void step_two(POD &p)
{
p.c = 5;
}
// Chains step_one() and step_two(); the result is {2, 3, 5}.
constexpr POD make_pod(){
POD p = step_one();
step_two(p);
return p;
}
// Runtime entry point. Storing the result in a constexpr variable requires
// make_pod() to be evaluated as a constant expression, so only the final
// field values remain to be returned.
POD make_pod_final()
{
constexpr POD res = make_pod();
return res;
}
resulting in:
make_pod_final PROC
mov eax, DWORD PTR $T1[esp-4]
mov DWORD PTR [eax], 2
mov DWORD PTR [eax+4], 3
mov DWORD PTR [eax+8], 5
ret 0
Demo
Related
When I compile following code with gcc 6 -O3 -std=c++14, I get nice and empty main:
Dump of assembler code for function main():
0x00000000004003e0 <+0>: xor %eax,%eax
0x00000000004003e2 <+2>: retq
But uncommenting last line in main "breaks" optimization:
Dump of assembler code for function main():
0x00000000004005f0 <+0>: sub $0x78,%rsp
0x00000000004005f4 <+4>: lea 0x40(%rsp),%rdi
0x00000000004005f9 <+9>: movq $0x400838,0x10(%rsp)
0x0000000000400602 <+18>: movb $0x0,0x18(%rsp)
0x0000000000400607 <+23>: mov %fs:0x28,%rax
0x0000000000400610 <+32>: mov %rax,0x68(%rsp)
0x0000000000400615 <+37>: xor %eax,%eax
0x0000000000400617 <+39>: movl $0x0,(%rsp)
0x000000000040061e <+46>: movq $0x400838,0x30(%rsp)
0x0000000000400627 <+55>: movb $0x0,0x38(%rsp)
0x000000000040062c <+60>: movl $0x0,0x20(%rsp)
0x0000000000400634 <+68>: movq $0x400838,0x50(%rsp)
0x000000000040063d <+77>: movb $0x0,0x58(%rsp)
0x0000000000400642 <+82>: movl $0x0,0x40(%rsp)
0x000000000040064a <+90>: callq 0x400790 <ErasedObject::~ErasedObject()>
0x000000000040064f <+95>: lea 0x20(%rsp),%rdi
0x0000000000400654 <+100>: callq 0x400790 <ErasedObject::~ErasedObject()>
0x0000000000400659 <+105>: mov %rsp,%rdi
0x000000000040065c <+108>: callq 0x400790 <ErasedObject::~ErasedObject()>
0x0000000000400661 <+113>: mov 0x68(%rsp),%rdx
0x0000000000400666 <+118>: xor %fs:0x28,%rdx
0x000000000040066f <+127>: jne 0x400678 <main()+136>
0x0000000000400671 <+129>: xor %eax,%eax
0x0000000000400673 <+131>: add $0x78,%rsp
0x0000000000400677 <+135>: retq
0x0000000000400678 <+136>: callq 0x4005c0 <__stack_chk_fail#plt>
Code
#include <type_traits>
#include <new>
namespace
{
// Hand-rolled "vtable" with a single type-specific operation: destruction.
struct ErasedTypeVTable
{
using destructor_t = void (*)(void *obj);
destructor_t dtor;
};
// Runs T's destructor on the object at `obj`; it does not free any memory.
template <typename T>
void dtor(void *obj)
{
return static_cast<T *>(obj)->~T();
}
// Variable template: one table per erased type T, pointing at dtor<T>.
template <typename T>
static const ErasedTypeVTable erasedTypeVTable = {
&dtor<T>
};
}
// Type-erased holder: keeps a small object in inline, pointer-sized storage and
// destroys it through the ErasedTypeVTable bound at construction.
struct ErasedObject
{
std::aligned_storage<sizeof(void *)>::type storage;
const ErasedTypeVTable& vtbl;
// Selects the destructor branch; initialised to false and never modified in this struct.
bool flag = false;
// Placement-constructs an S (the decayed T) in `storage` from `obj` and binds
// the vtable for S.
// NOTE(review): std::forward is declared in <utility>, which is not among the
// includes shown with this snippet -- confirm it is pulled in.
template <typename T, typename S = typename std::decay<T>::type>
ErasedObject(T&& obj)
: vtbl(erasedTypeVTable<S>)
{
// Reject types that would not fit, or would be misaligned, in the inline buffer.
static_assert(sizeof(T) <= sizeof(storage) && alignof(T) <= alignof(decltype(storage)), "");
new (object()) S(std::forward<T>(obj));
}
ErasedObject(ErasedObject&& other) = default;
// With `flag` set, hands the storage address to ::operator delete; otherwise
// runs the stored type's destructor through the vtable.
~ErasedObject()
{
if (flag)
{
::operator delete(object());
}
else
{
vtbl.dtor(object());
}
}
// Address of the inline storage as an untyped pointer.
void *object()
{
return reinterpret_cast<char *>(&storage);
}
};
// Trivial payload type used to instantiate ErasedObject in the test case.
struct myType
{
int a;
};
// Test driver: creates two ErasedObjects. Enabling the commented-out third
// object is the change that alters the generated code discussed in the text.
int main()
{
ErasedObject c1(myType{});
ErasedObject c2(myType{});
//ErasedObject c3(myType{});
}
clang can optimize-out both versions.
Any ideas what's going on? Am I hitting some optimization limit? If so, is it configurable?
I ran g++ with -fdump-ipa-inline to get more information about why functions are or are not inlined.
For the testcase with main() function and three objects created I got:
(...)
150 Deciding on inlining of small functions. Starting with size 35.
151 Enqueueing calls in void {anonymous}::dtor(void*) [with T = myType]/40.
152 Enqueueing calls in int main()/35.
153 not inlinable: int main()/35 -> ErasedObject::~ErasedObject()/33, call is unlikely and code size would grow
154 not inlinable: int main()/35 -> ErasedObject::~ErasedObject()/33, call is unlikely and code size would grow
155 not inlinable: int main()/35 -> ErasedObject::~ErasedObject()/33, call is unlikely and code size would grow
(...)
This error code is set in gcc/gcc/ipa-inline.c:
else if (!e->maybe_hot_p ()
&& (growth >= MAX_INLINE_INSNS_SINGLE
|| growth_likely_positive (callee, growth)))
{
e->inline_failed = CIF_UNLIKELY_CALL;
want_inline = false;
}
Then I discovered, that the smallest change to make g++ inline these functions is to add a declaration:
int main() __attribute__((hot));
I wasn't able to find in code why int main() isn't considered hot, but probably this should be left for another question.
More interesting is the second part of the conditional I pasted above. Its intent is to avoid inlining when the code would grow, and you have produced an example where the code shrinks after complete inlining.
I think this deserves to be reported on GCC's bugzilla, but I'm not sure if you can call it a bug - estimation of inline impact is a heuristic and as such it is expected to work correctly in most cases, not all of them.
I have a simple class with one private member that is accessible via get() and set() in a multithreaded environment (multi readers/multi writers). how do I lock a Get() as it only has a return statement?
// Holder for one int that is read and written from several threads, with
// access guarded by a critical section.
class MyValue
{
private:
System::CriticalSection lock;
int val { 0 };
public:
// Stores `arg` into `val` while holding the lock.
// NOTE(review): declared to return int, but no value is returned.
int SetValue(int arg)
{
lock.Enter();
val = arg;
lock.Leave();
}
// Reads `val` after entering the lock.
// NOTE(review): the function returns before any lock.Leave(), so as written
// the lock is not released on this path.
int GetValue()
{
lock.Enter();
return val;
//Where should I do lock.Leave()?
}
}
Don't lock anything. In your example, it is enough if you make your member an std::atomic integer.
You do not need anything else here. As a matter of fact, due to Intel architecture (strong memory ordering model), this std::atomic is not even likely to cause any performance issues.
I'm not a multithreading expert, but I think following should work.
// Reads the guarded value: take a private copy while the lock is held, release
// the lock, then return the copy.
int GetValue()
{
    lock.Enter();
    const int snapshot = val;
    lock.Leave();
    return snapshot;
}
This is a demonstration of the synchronization object from hauron's answer -- I wanted to show that object construction and destruction overhead simply does not exist with an optimized build.
In the code below, CCsGrabber is an RAII-like class which enters a critical section (wrapped by a CCritical object) when constructed, then leaves it when destroyed:
// Scoped guard for a CCritical: each public constructor enters the section and
// the destructor leaves it, so the section is released when the guard goes out
// of scope.
class CCsGrabber {
    class CCritical& m_Cs;   // the section this guard holds
    CCsGrabber();            // private: a guard cannot exist without a section
public:
    CCsGrabber(CCritical& cs);
    // Pointer overload. It is defined out of line next to the reference
    // overload and must be declared here for that definition to compile.
    // The pointer is dereferenced unconditionally, so it must not be null.
    CCsGrabber(CCritical *pcs);
    ~CCsGrabber();
};
// Owner of a Win32 CRITICAL_SECTION: initialised in the constructor, deleted in
// the destructor, with Enter/Leave (and Lock/Unlock aliases) in between.
class CCritical {
    CRITICAL_SECTION cs;
    // A copy would hold a bitwise duplicate of the CRITICAL_SECTION and call
    // DeleteCriticalSection on it a second time, so copying is disabled.
    CCritical(const CCritical&) = delete;
    CCritical& operator=(const CCritical&) = delete;
public:
    CCritical() {
        InitializeCriticalSection(&cs);
    }
    ~CCritical() { DeleteCriticalSection(&cs); }
    void Enter() { EnterCriticalSection(&cs); }
    void Leave() { LeaveCriticalSection(&cs); }
    // Aliases for Enter()/Leave().
    void Lock() { Enter(); }
    void Unlock() { Leave(); }
};
// Reference overload: bind to the section and enter it immediately.
inline CCsGrabber::CCsGrabber(CCritical& cs) : m_Cs(cs) { m_Cs.Enter(); }
// Pointer overload: dereferences `pcs` without a null check, then enters.
// NOTE(review): confirm the CCsGrabber class body declares this CCritical* constructor.
inline CCsGrabber::CCsGrabber(CCritical *pcs) : m_Cs(*pcs) { m_Cs.Enter(); }
// Leaves the section entered by the constructor.
inline CCsGrabber::~CCsGrabber() { m_Cs.Leave(); }
Now, a global CCritical object is created (cs), which is used in SerialFunc(), along with a local CCsGrabber instance (csg) to take care of locking and unlocking:
// Global section guarding last_tick, and the value it protects.
CCritical cs;
DWORD last_tick = 0;
// Stores the current tick count in last_tick while holding `cs`; the local
// guard enters the section on construction and leaves it when it goes out of scope.
void SerialFunc() {
CCsGrabber csg(cs);
last_tick = GetTickCount();
}
// Calls SerialFunc() once and prints the recorded tick count.
int main() {
SerialFunc();
std::cout << last_tick << std::endl;
}
And below is the disassembly of main() from an optimized 32-bit build. (I apologize for pasting in the whole thing -- I wanted to show that I wasn't hiding anything.)
int main() {
00401C80 push ebp
00401C81 mov ebp,esp
00401C83 and esp,0FFFFFFF8h
00401C86 push 0FFFFFFFFh
00401C88 push 41B038h
00401C8D mov eax,dword ptr fs:[00000000h]
00401C93 push eax
00401C94 mov dword ptr fs:[0],esp
00401C9B sub esp,0Ch
00401C9E push esi
00401C9F push edi
SerialFunc();
00401CA0 push 427B78h ; pointer to CS object
00401CA5 call dword ptr ds:[41C00Ch] ; _RtlEnterCriticalSection#4:
00401CAB call dword ptr ds:[41C000h] ; _GetTickCountStub#0:
00401CB1 push 427B78h ; pointer to CS object
00401CB6 mov dword ptr ds:[00427B74h],eax ; return value => last_tick
00401CBB call dword ptr ds:[41C008h] ; _RtlLeaveCriticalSection#4:
std::cout << last_tick << std::endl;
00401CC1 push ecx
00401CC2 call std::basic_ostream<char,std::char_traits<char> >::operator<< (0401D90h)
00401CC7 mov esi,eax
00401CC9 lea eax,[esp+0Ch]
00401CCD push eax
00401CCE mov ecx,dword ptr [esi]
00401CD0 mov ecx,dword ptr [ecx+4]
00401CD3 add ecx,esi
00401CD5 call std::ios_base::getloc (0401BD0h)
00401CDA push eax
00401CDB mov dword ptr [esp+20h],0
00401CE3 call std::use_facet<std::ctype<char> > (0403E40h)
00401CE8 mov dword ptr [esp+20h],0FFFFFFFFh
00401CF0 add esp,4
00401CF3 mov ecx,dword ptr [esp+0Ch]
00401CF7 mov edi,eax
00401CF9 test ecx,ecx
00401CFB je main+8Eh (0401D0Eh)
00401CFD mov edx,dword ptr [ecx]
00401CFF call dword ptr [edx+8]
00401D02 test eax,eax
00401D04 je main+8Eh (0401D0Eh)
00401D06 mov edx,dword ptr [eax]
00401D08 mov ecx,eax
00401D0A push 1
00401D0C call dword ptr [edx]
00401D0E mov eax,dword ptr [edi]
00401D10 mov ecx,edi
00401D12 push 0Ah
00401D14 mov eax,dword ptr [eax+20h]
00401D17 call eax
00401D19 movzx eax,al
00401D1C mov ecx,esi
00401D1E push eax
00401D1F call std::basic_ostream<char,std::char_traits<char> >::put (0404220h)
00401D24 mov ecx,esi
00401D26 call std::basic_ostream<char,std::char_traits<char> >::flush (0402EB0h)
}
00401D2B mov ecx,dword ptr [esp+14h]
00401D2F xor eax,eax
00401D31 pop edi
00401D32 mov dword ptr fs:[0],ecx
00401D39 pop esi
00401D3A mov esp,ebp
00401D3C pop ebp
00401D3D ret
So we can see that SerialFunc() was inlined directly into main, after the prologue at the beginning and before the cout code -- and nowhere to be found is any superfluous object creation, memory allocation or anything -- it just looks like the minimum amount of assembly code required to enter the critical section, get the tick count in a variable, and then leave the critical section.
Then I changed SerialFunc() to:
// Same work as the guard-based version, with the section entered and left by
// explicit calls instead of a scoped object.
void SerialFunc() {
cs.Enter();
last_tick = GetTickCount();
cs.Leave();
}
With explicitly-placed cs.Enter() and cs.Leave(), just to compare with the RAII version. The generated code turned out to be identical:
int main() {
00401C80 push ebp
00401C81 mov ebp,esp
00401C83 and esp,0FFFFFFF8h
00401C86 push 0FFFFFFFFh
00401C88 push 41B038h
00401C8D mov eax,dword ptr fs:[00000000h]
00401C93 push eax
00401C94 mov dword ptr fs:[0],esp
00401C9B sub esp,0Ch
00401C9E push esi
00401C9F push edi
SerialFunc();
00401CA0 push 427B78h
00401CA5 call dword ptr ds:[41C00Ch]
00401CAB call dword ptr ds:[41C000h]
00401CB1 push 427B78h
00401CB6 mov dword ptr ds:[00427B74h],eax
00401CBB call dword ptr ds:[41C008h]
std::cout << last_tick << std::endl;
00401CC1 push ecx
00401CC2 call std::basic_ostream<char,std::char_traits<char> >::operator<< (0401D90h)
...
In my opinion, SergeyA's answer is best for the given situation -- a critical section for synchronizing reads and writes from/to 32-bit variables is excessive. However, if something comes up which calls for a critical section or mutex, using an RAII-like object to simplify your code is probably not going to incur significant (or even any) object creation overhead.
(I used Visual C++ 2013 to compile the code above)
Consider using a class wrapper locking in ctor, and unlocking in dtor. See standard implementation: http://en.cppreference.com/w/cpp/thread/unique_lock
This way you don't need to remember about unlocking in case of complex code or exceptions thrown within your code, altering the normal execution.
I found this call sub_10636F0 in 5 different places trying to figure out how to call it from C++ DLL which is injected into the target application so It has full access to all the calls in that application.
I had a chart of all the places where it's called most of these call's are cut right after another call above it, to ensure it's completeness.
I read tons of questions on stackoverflow about this subject found a few good answers from Necrolis, saying if its a EDX then you could use __fastcall.
I googled to find out about ECX and it seems to also be used to __fastcall so either ECX or EDX mean __fastcall.
But the function it calls uses the wrapper
sub esp, 5F4h
add esp, 5F4h
retn 8
I have no idea what this is about again doing tons of research I think SUB ESP, XXX at beginning and ADD ESP, XXX at end are used only for _cdecl conversions
My current code looks like this
// Function-pointer types for __fastcall targets taking three, two, one or zero int arguments.
typedef void(__fastcall *TThreeParamter)(int, int, int);
typedef void(__fastcall *TTwoParamter)(int, int);
typedef void(__fastcall *TOneParamter)(int);
typedef void(__fastcall *TZeroParamter)();
// Target address built from the module base plus a fixed offset.
// NOTE(review): the casts bind tighter than '+', so the offset is added after
// the conversion to TTwoParamter rather than to the integer base -- confirm
// this is the intended address computation.
TTwoParamter sub_10636F0 = (TTwoParamter)(DWORD)GetModuleHandle(NULL) + 0xC636EF;
//the call
sub_10636F0(0x11223344, 0x55667788);
Don't ask me why the 0xC636EF is different from 10636F0 in the sub, I can tell you it's going into the correct sub upon inspection in a debugger, the sub's keep moving around everytime the program is re-launched it seems to be either a protection method, or possibly because this program loads over 50 dll's and the addresses need to move around.
I tried all different configurations, 2 int's, 3 int's nothing works..
IDA detects this method as having 3 parameters, but the last parameter isn't used anywhere in the decompiled pseudo-code, which I cannot figure out.
Pseudo-code looks like this ( I did heavy modifications to it, like change it to _fastcall from __thiscall )
Pseudo-code from IDA
//probably wrong.. packet is a variable not a parameter which will crash
void __fastcall sub_10636F0(int var1)
{
__int128 v1; // xmm0#0
int v2; // esi#1
int v3; // ebx#1
SOCKET v4; // ebp#1
int v5; // eax#2
int v6; // ecx#3
int v7; // [sp+8h] [bp-5FCh]#7
char a2a[1492]; // [sp+10h] [bp-5F4h]#2
int v9; // [sp+5E4h] [bp-20h]#2
int v10; // [sp+5E8h] [bp-1Ch]#2
struct _FILETIME SystemTimeAsFileTime; // [sp+5F0h] [bp-14h]#2
__int16 v12; // [sp+5F8h] [bp-Ch]#2
int packet; // [sp+608h] [bp+4h]#0
int to; // [sp+60Ch] [bp+8h]#0
v2 = to;
v3 = var1;
v4 = *(_DWORD *)(packet + 220);
if ( v4 != -1 )
{
//snipped lots of code
}
}
//probably wrong.. packet is a variable not a paramter which will crash
void __fastcall sub_10636F0(int var1, int var2)
{
__int128 v2; // xmm0#0
int v3; // esi#1
int v4; // ebx#1
SOCKET v5; // ebp#1
int v6; // eax#2
int v7; // ecx#3
int v8; // [sp+8h] [bp-5FCh]#7
char a2a[1492]; // [sp+10h] [bp-5F4h]#2
int v10; // [sp+5E4h] [bp-20h]#2
int v11; // [sp+5E8h] [bp-1Ch]#2
struct _FILETIME SystemTimeAsFileTime; // [sp+5F0h] [bp-14h]#2
__int16 v13; // [sp+5F8h] [bp-Ch]#2
int packet; // [sp+608h] [bp+4h]#0
int to; // [sp+60Ch] [bp+8h]#0
v3 = to;
v4 = var1;
v5 = *(_DWORD *)(packet + 220);
if ( v5 != -1 )
{
//snipped lots of code
}
}
//this looks the best, but still `to` isn't detected as paramter
void __fastcall sub_10636F0(int var1, int var2, int var3)
{
__int128 v3; // xmm0#0
int v4; // esi#1
int v5; // ebx#1
SOCKET v6; // ebp#1
int v7; // eax#2
int v8; // ecx#3
int v9; // [sp+8h] [bp-5FCh]#7
char a2a[1492]; // [sp+10h] [bp-5F4h]#2
int v11; // [sp+5E4h] [bp-20h]#2
int v12; // [sp+5E8h] [bp-1Ch]#2
struct _FILETIME SystemTimeAsFileTime; // [sp+5F0h] [bp-14h]#2
__int16 v14; // [sp+5F8h] [bp-Ch]#2
int to; // [sp+60Ch] [bp+8h]#0
v4 = to; //still doesn't detect this..
v5 = var1; //okay this isn't bad another parameter
v6 = *(_DWORD *)(var3 + 220); //like this detects this as parameter class atleast
if ( v6 != -1 )
{
//snipped lots of code
}
}
This is the code IDA recommends by default
char __userpurge sub_10636F0#<al>(int a1#<ecx>, __int128 a2#<xmm0>, int a3, int a4)
{
int v4; // esi#1
int v5; // ebx#1
SOCKET v6; // ebp#1
int v7; // eax#2
int v8; // ecx#3
int v9; // eax#8
char v11; // [sp+8h] [bp-5FCh]#7
int v12; // [sp+10h] [bp-5F4h]#4
int v13; // [sp+24h] [bp-5E0h]#2
int v14; // [sp+28h] [bp-5DCh]#2
int v15; // [sp+2Ch] [bp-5D8h]#2
int v16; // [sp+30h] [bp-5D4h]#2
char v17; // [sp+34h] [bp-5D0h]#2
signed int v18; // [sp+5E4h] [bp-20h]#2
int v19; // [sp+5E8h] [bp-1Ch]#2
int v20; // [sp+5F0h] [bp-14h]#2
__int16 v21; // [sp+5F8h] [bp-Ch]#2
v4 = a4;
v5 = a1;
v6 = *(_DWORD *)(a3 + 220);
if ( v6 == -1 )
return 0;
//Snipped code
if ( v9 >= 0 && v9 == *(_DWORD *)(v4 + 1492) )
return 1;
return 0;
}
Function in ASM
.text:010636F0 ; void __fastcall sub_10636F0(int var1, int var2, int var3)
.text:010636F0 sub_10636F0 proc near ; CODE XREF: sub_1062960+E0p
.text:010636F0 ; sub_10637E0+D4p ...
.text:010636F0
.text:010636F0 a2 = byte ptr -5F4h
.text:010636F0 var_20 = dword ptr -20h
.text:010636F0 var_1C = dword ptr -1Ch
.text:010636F0 SystemTimeAsFileTime= _FILETIME ptr -14h
.text:010636F0 var_C = word ptr -0Ch
.text:010636F0 var_4 = dword ptr -4
.text:010636F0 packet = dword ptr 4
.text:010636F0 to = dword ptr 8
.text:010636F0 test = dword ptr 0Ch
.text:010636F0
.text:010636F0 sub esp, 5F4h
.text:010636F6 mov eax, ___security_cookie
.text:010636FB xor eax, esp
.text:010636FD mov [esp+5F4h+var_4], eax
.text:01063704 push ebx
.text:01063705 push ebp ; a5
.text:01063706 push esi ; a4
.text:01063707 mov esi, [esp+600h+to]
.text:0106370E push edi ; a3
.text:0106370F mov edi, [esp+604h+packet]
.text:01063716 mov ebx, ecx
.text:01063718 mov ebp, [edi+0DCh]
.text:0106371E cmp ebp, 0FFFFFFFFh
.text:01063721 jz loc_10637BB
SNIP TONS OF CODE HERE
.text:010637BB
.text:010637BB loc_10637BB: ; CODE XREF: sub_10636F0+31j
.text:010637BB ; sub_10636F0+BDj ...
.text:010637BB xor al, al
.text:010637BD
.text:010637BD loc_10637BD: ; CODE XREF: sub_10636F0+C9j
.text:010637BD mov ecx, [esp+604h+var_4]
.text:010637C4 pop edi
.text:010637C5 pop esi
.text:010637C6 pop ebp
.text:010637C7 pop ebx
.text:010637C8 xor ecx, esp
.text:010637CA call #__security_check_cookie#4 ; __security_check_cookie(x)
.text:010637CF add esp, 5F4h
.text:010637D5 retn 8
.text:010637D5 sub_10636F0 endp ; sp-analysis failed
Calls to this function in ASM
.text:010638B0 push esi ; packet
.text:010638B1 push ebx ; this
.text:010638B2 mov ecx, ebp ; this
.text:010638B4 call sub_10636F0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.text:01062A2E mov byte ptr [esi+5E9h], 1
.text:01062A35
.text:01062A35 loc_1062A35: ; CODE XREF: sub_1062960+C1j
.text:01062A35 add dword ptr [esi+5D4h], 2
.text:01062A3C push esi ; packet
.text:01062A3D push edi ; this
.text:01062A3E mov ecx, ebx ; this
.text:01062A40 call sub_10636F0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.text:01063AF4 mov eax, [ebx+1128h]
.text:01063AFA mov [esp+1A4h+var_AC], eax
.text:01063B01 push esi ; packet
.text:01063B02 lea eax, [esp+1A8h+to]
.text:01063B06 push eax ; this
.text:01063B07 mov ecx, ebx ; this
.text:01063B09 mov [esp+1ACh+var_4], 1
.text:01063B14 call sub_10636F0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.text:01089145 loc_1089145: ; CODE XREF: sub_10890B0+4Fj
.text:01089145 ; sub_10890B0+67j
.text:01089145 mov ecx, [edi+110h] ; this
.text:0108914B push esi ; packet
.text:0108914C push edi ; this
.text:0108914D call sub_10636F0
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.text:01089CBA mov ecx, [esi+110h] ; this
.text:01089CC0 push edi ; packet
.text:01089CC1 push esi ; this
.text:01089CC2 call sub_10636F0
I have no idea what this is about again doing tons of research I think
SUB ESP, XXX at beginning and ADD ESP, XXX at end are used only for
_cdecl conversions
No, it's used for ALL functions that use local variables (with minor variations as to exactly how it's done, but stack space needs to be allocated by subtracting from ESP, and "freed" by adding the same amount to the stack pointer).
However, the RET 8 does indeed indicate that the calling convention is NOT _cdecl, but one where the stack is cleaned up by the callee. There are a few different calling conventions that match this, but I have a feeling it's C++ code and a member function, which would make it thiscall - that does make it a little hard to simulate, since you want this in ECX.
The ret 8 says that the function has 8 bytes worth of arguments, so two int or void * variables.
I'm far from convinced there is a simple way to do this. You may be able to do something like this. Create a class X with a virtual function that takes two arguments:
// Minimal polymorphic class used only to obtain a vtable slot whose signature
// takes two ints; the surrounding text proposes repointing that slot at the
// target function. No other virtual members are declared, so Func is the
// only entry the class contributes.
class X
{
public:
    // Must be public so it can be called through an X*; the body is a
    // placeholder that does nothing.
    virtual void Func(int x, int y) { }
};
Then figure out where the compiler put the vtable, and modify the vtable for func to point at your target function, rather than the empty implementation of the class.
Now you can use X to create an instance:
X* p = new X;
and then call func.
p->func(1, 2);
However, if you are unlucky, the compiler doesn't realize that you have messed with the vtable, and end up calling the function directly. So you may need to do some trickery with separate compilation and other stuff.
In other words, you have your work cut out. But then reverse engineering wouldn't be any fun at all if you didn't have to trick around a bit.
Of course, the cheaters method is to just write a few lines of inline assembler, like such:
void CallMyFunc(void *func, int a, int b, int c)
{
__asm(mov ecx, a
push b
push c
call *func);
}
[It's about 10 years since I last wrote Windows inline assembly code, so apologies if the syntax isn't quite right - consider it a "rough sketch" and do modify it until it actually compiles...]
Hello, I'm trying to show a 640x480 BMP image (16-color bitmap) with inline ASM in C++; it has to be inline asm because it's homework. I have this assembly code to do that:
; ---- data ----
; Error text ('$'-terminated); not referenced in the lines shown here.
cad db 'Error, file not found, press a key to finish.$'
; NOTE(review): no 0 byte follows the path; DOS open (AH=3Dh) expects a
; zero-terminated name -- confirm.
filename db "C:\image.bmp"
handle dw ?
; Current pixel column and row; drawing starts on row 479 and moves upward.
col dw 0
ren dw 479
col1 dw ?
ren1 dw ?
col2 dw ?
ren2 dw ?
; One byte read from the file (two 4-bit pixels) and the colour being plotted.
buffer db ?
colo db ?
; ---- open the file (int 21h, AH=3Dh, AL=0); on carry jump to err ----
eti0:
mov ah,3dh
mov al,0
mov dx,offset filename
int 21h
jc err
mov handle,ax
; ---- read and discard 118 bytes, one per iteration ----
; presumably the BMP headers plus the 16-entry palette -- confirm
mov cx,118d
eti1:
; CX is the loop counter, so it is saved around the read call that needs CX=1.
push cx
mov ah,3fh
mov bx,handle
mov dx,offset buffer
mov cx,1
int 21h
pop cx
loop eti1
; ---- set video mode 18 (12h) through int 10h, AH=00h ----
mov ah,00h
mov al,18d
int 10h
; ---- main loop: read one byte, plot its two 4-bit pixels ----
eti2:
mov ah,3fh
mov bx,handle
mov dx,offset buffer
mov cx,1
int 21h
; High nibble -> first pixel at (col, ren), written with int 10h, AH=0Ch.
mov al,buffer
and al,11110000b
ror al,4
mov colo,al
mov ah,0ch
mov al,colo
mov cx,col
mov dx,ren
int 10h
; Low nibble -> second pixel at (col+1, ren).
mov al,buffer
and al,00001111b
mov colo,al
inc col
mov ah,0ch
mov al,colo
mov cx,col
mov dx,ren
int 10h
; col advances again and the same colour is written at the new column; the
; next iteration's first pixel is plotted at that same column.
inc col
mov ah,0ch
mov al,colo
mov cx,col
mov dx,ren
int 10h
; Stay on this row while col <= 639; otherwise reset col and move up one row.
cmp col,639d
jbe eti2
mov col,0
dec ren
; ren wraps to -1 (0FFFFh) after row 0 has been drawn, which ends the loop.
cmp ren,-1
jne eti2
Now to put it in inline ASM i'm trying with the next code:
#include<stdio.h>
#include<conio.h>
#include<iostream.h>
#include<dos.h>
#include<stdlib.h>
// DOS attempt at the BMP viewer, with the assembly moved into asm{} blocks and
// progress messages printed between the stages.
void main(void)
{
clrscr();
unsigned char buffer,colo;
unsigned int handle,col=0,ren=479,col1,col2,ren2;
// NOTE(review): this literal is far outside the range of int, and DX below is
// loaded with this variable rather than with the address of a path string --
// confirm how the file name is meant to reach the open call.
int filename=675892105109971031011104698109112;
// Open the file (int 21h, AH=3Dh, AL=0) and keep the value returned in AX as the handle.
asm{
mov ah,3dh
mov al,0
mov dx,filename
int 21h
mov handle,ax
mov cx,118d
}
cout<<"si mino1";
// Reads one byte per iteration into `buffer` (int 21h, AH=3Fh).
// NOTE(review): i starts at 118 and is incremented while the test is i>0, so
// the counter moves away from the exit condition.
for(int i=118;i>0;i++){
asm{
mov ah,3fh
mov bx,handle
mov dx,offset buffer
mov cx,1
int 21h
}
}
// Set video mode 18 through int 10h, AH=00h.
asm{
mov ah,00h
mov al,18d
int 10h
}
cout<<"si mino2";
// Pixel loop: read one byte, plot the high nibble at (col, ren) and the low
// nibble at (col+1, ren), then step upward a row once col passes 639. The
// jumps inside the asm block target the C label eti2 below.
eti2:
asm{
mov ah,3fh
mov bx,handle
mov dx,offset buffer
mov cx,1
int 21h
mov al,buffer
and al,11110000b
ror al,4
mov colo,al
mov ah,0ch
mov al,colo
mov cx,col
mov dx,ren
int 10h
mov al,buffer
and al,00001111b
mov colo,al
inc col
mov ah,0ch
mov al,colo
mov cx,col
mov dx,ren
int 10h
inc col
mov ah,0ch
mov al,colo
mov cx,col
mov dx,ren
int 10h
cmp col,639d
jbe eti2
mov col,0
dec ren
cmp ren,-1
jne eti2
}
cout<<"si mino3";
getch();
}
The code reaches the first cout and then enters an infinite loop.
Do you really mean the following line of code:
for(int i=118;i>0;i++)
This initializes i to 118, and every iteration, adds 1. It will only ever get larger (until i overflows). The test for whether the loop should continue is i > 0, which will always be true (until i overflows).
Are you sure you're in an infinite loop? Maybe the many, many int 21h just take a very, very long time.
I figured out how to show an 640x200 16 colors BMP with the next code:
//+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
// << DISPLAY 4-Bit BMP (16 colors) >>
// This program shows how to display a 16 color bitmap.
// I am not using palettes for bmp,hence all default 16 colors are used.
// If you know BMP structure you can try to add palettes.
//+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
#include <alloc.h>
#include <conio.h>
#include <graphics.h>
#include <stdio.h>
#include <stdlib.h>
//+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
#define UL unsigned long
#define UI unsigned int
#define UC unsigned char
//+-+-+-+-+-+-+-+-+-+-+-+-+-+< BMP Structures >+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
// First header block, read from the start of the file with a single fread().
// Only OffSet is used by the code in this file (as the seek position of the
// pixel data).
// NOTE(review): the fread relies on this struct having no padding between
// members -- confirm the compiler's alignment setting.
typedef struct
{
char Type[2];   // presumably the "BM" signature -- confirm
UL Size;
UI R1;
UI R2;
UL OffSet;      // file offset passed to fseek() before reading pixel rows
}BMP1;
//+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
// Second header block, read immediately after BMP1. The code in this file uses
// Hlen and Vlen as the loop bounds for columns and rows, and BPP to accept only
// 4-bit images; the remaining fields are read but not used.
typedef struct
{
UL headsize;
UL Hlen;      // horizontal extent, in pixels
UL Vlen;      // vertical extent, in rows
UI planes;
UI BPP;       // bits per pixel; must be 4 for ShowBMP to proceed
UL Method;
UL BmpSize;
UL HRes;
UL VRes;
UL Colors;
UL IColors;
}BMP2;
//+-+-+-+-+-+-+-+-+-+-+-+-+-< Display BMP >+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
int ShowBMP(int x, int y, char* FileName)
{
int b,a;
BMP1 Obj1;
BMP2 Obj2;
UC * Holder;
int in=0;
UC c=0;
FILE * fp;
fp = fopen(FileName,"rb");
if(fp==NULL)
return 0;
fread(&Obj1, sizeof(Obj1), 1, fp);
fread(&Obj2, sizeof(Obj2), 1, fp);
if(Obj2.BPP!=4) // This isn't a 16 color bmp we can read;
{
fclose(fp);
return 0;
};
fseek(fp,Obj1.OffSet,SEEK_SET);
Holder=(UC *) calloc(Obj2.Hlen/2+1, sizeof(UC));
for(b=Obj2.Vlen;b>=0;b--)
{
fread(Holder, sizeof(UC), Obj2.Hlen/2, fp);
c=0;
in=0;
for(a=0;a<=Obj2.Hlen;a+=2)
{
c = (Holder[in] | 0x00) >>4;
putpixel(a+x,b+y,c);
c = (Holder[in] | 0xF0) & 0x0F;
putpixel(a+1+x,b+y,c);
in++;
}
}
free (Holder);
fclose(fp);
return 1;
}
//+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-< **** >+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
// Two bmp demo.bmp & demo1.bmp are provided.
// open these bmp in windows paint & change.(do not change size of bmp)
//+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-< Main >+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
// Demo driver: start BGI graphics, draw the bitmap at the origin, wait for a
// key, leave graphics mode, then report the outcome on the text screen.
void main()
{
    int gdriver = 3;
    int gmode = 0;
    int failed;

    //registerfarbgidriver(EGAVGA_driver_far);
    initgraph(&gdriver, &gmode, "C:\\BC31\\BGI");

    failed = ShowBMP(0, 0, "C:\\imagen.bmp") ? 0 : 1;

    getch();
    closegraph();

    if (!failed)
        printf("Sucess !");
    else
        printf("\nError.");
}
//+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Why does this code call the wrong virtual function? It calls the one at offset 8 while it should be calling the one at offset 4. If I rename the function at 8 it correctly calls the one at 4. Code gen bug? Something silly I'm missing?
Source:
// Abstract interface with a pure virtual destructor and two overloads of blit.
// The trailing numbers are the vtable byte offsets the author expects from
// declaration order. The disassembly quoted with this snippet dispatches
// blit(int) through offset 8, so the compiler's actual slot order need not
// match these annotations.
class surface_c
{
public:
virtual ~surface_c() = 0; // 0
virtual bool blit(int) = 0; // 4
virtual bool blit() = 0; // 8
};
// Minimal reproduction: makes a virtual call through `surface`.
// `surface` is never assigned, so the call reads the vtable through an
// indeterminate pointer.
int main()
{
surface_c* surface;
surface->blit(0);
return 0;
}
Disassembly:
int main()
{
00A11250 push ebp
00A11251 mov ebp,esp
00A11253 sub esp,44h
00A11256 push ebx
00A11257 push esi
00A11258 push edi
surface_c* surface;
surface->blit(0);
00A11259 push 0
00A1125B mov eax,dword ptr [surface]
00A1125E mov edx,dword ptr [eax]
00A11260 mov ecx,dword ptr [surface]
00A11263 mov eax,dword ptr [edx+8]
00A11266 call eax
return 0;
00A11268 xor eax,eax
}
00A1126A pop edi
00A1126B pop esi
00A1126C pop ebx
00A1126D mov esp,ebp
00A1126F pop ebp
00A11270 ret
// The question's main(), annotated to point at the problem.
int main()
{
surface_c* surface; // surface contains garbage, as it is uninitialized
surface->blit(0);
return 0;
}
surface must be pointing to some non-abstract subclass of surface_c, e.g.
surface = new surface_f();
where surface_f is some non-abstract subclass of surface_c, and the inherited pure virtual functions must be overridden by concrete implementations
You are calling the method on an uninitialized pointer. It is pure luck that a method is called, and the program does not crash.