When I tried to create my own alternative to a classic array, I noticed that one extra instruction appeared in the disassembly: mov edx, dword ptr [myarray]. Why was this additional instruction added?
I want to keep the functionality of my alternative, but I don't want to lose performance! How can I resolve this? Every processor cycle matters for this application.
For example:
for (unsigned i = 0; i < 10; ++i)
{
array1[i] = i;
array2[i] = 10 - i;
}
Assembly (classic int arrays):
mov edx, dword ptr [ebp-480h]
mov eax, dword ptr [ebp-480h]
mov dword ptr array1[edx*4], eax
mov ecx, 10
sub ecx, dword ptr [ebp-480h]
mov edx, dword ptr [ebp-480h]
mov dword ptr array2[edx*4], ecx
Assembly (my class):
mov edx,dword ptr [array1]
mov eax,dword ptr [ebp-43Ch]
mov ecx,dword ptr [ebp-43Ch]
mov dword ptr [edx+eax*4], ecx
mov edx, 10
sub edx, dword ptr [ebp-43Ch]
mov eax, dword ptr [array2]
mov ecx, dword ptr [ebp-43Ch]
mov dword ptr [eax+ecx*4], edx
One instruction is not a performance loss on today's processors. I would not worry about it and instead suggest you read Coding Horror's article on micro-optimization.
However, that instruction is just loading the array's base address (myarray+0) into edx so the indexed store can use it.
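Since your class isn't shown, here is a hypothetical minimal wrapper that would produce exactly that pattern: the object stores a pointer, so operator[] forces one extra load of that pointer before the indexed store, whereas a plain stack array has an address the compiler already knows relative to ebp.

// Sketch only: a wrapper that keeps its elements behind a stored pointer.
struct MyArray
{
    int* data;                                        // base pointer lives inside the object
    explicit MyArray(unsigned n) : data(new int[n]) {}
    ~MyArray() { delete[] data; }
    int& operator[](unsigned i) { return data[i]; }   // indexing first loads the stored pointer,
                                                      // e.g. mov edx, dword ptr [array1]
};

// int array1[10];     // address is ebp-relative, no extra load needed
// MyArray array2(10); // array2[i] must first load array2.data into a register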
I am trying to practice inline ASM in C++ :) It may be outdated, but it is interesting to know how the CPU executes the code.
So, what I am trying to do here is loop through the processes and get a handle to the one I need :) For that I am using the existing functions from tlhelp32.
I have this code:
HANDLE RetHandle = nullptr, snap;
int SizeOfPE = sizeof(PROCESSENTRY32), pid; PROCESSENTRY32 pe;
int PA = PROCESS_ALL_ACCESS;
const char* Pname = "explorer.exe";
__asm
{
mov eax, pe
mov ebx, this
mov ecx, [ebx]pe.dwSize
mov ecx, SizeOfPE
mov[ebx]pe.dwSize, ecx
mov eax, PA
mov ebx,0
call CreateToolhelp32Snapshot
mov eax,snap
label1:
mov eax, snap
mov ebx, [pe]
call Process32First
cmp eax,1
jne exitLabel
Process32NextLoop:
mov eax, snap
mov ebx, [pe]
call Process32Next
cmp eax, 1
jne Process32NextLoop
mov edx, pe
mov ecx, [edx].szExeFile
cmp ecx, Pname
je ExitLoop
jne Process32NextLoop
ExitLoop:
mov eax, [ebx].th32ProcessID
mov pid, eax
ExitLabel:
ret
}
Apparently it is throwing an error on th32ProcessID as well, even though that is just a regular integer field.
I have been searching, but I haven't found the equivalent of movl in C++.
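For reference, the plain C++ version of what I am trying to reproduce in asm looks roughly like this (a sketch assuming an ANSI build; FindProcessId is just a helper name I made up):

#include <windows.h>
#include <tlhelp32.h>
#include <string.h>

// Sketch: walk the process snapshot and return the PID of the named process.
DWORD FindProcessId(const char* name)
{
    HANDLE snap = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0);
    if (snap == INVALID_HANDLE_VALUE)
        return 0;

    PROCESSENTRY32 pe;
    pe.dwSize = sizeof(PROCESSENTRY32);      // must be set before Process32First

    DWORD pid = 0;
    if (Process32First(snap, &pe))
    {
        do
        {
            if (_stricmp(pe.szExeFile, name) == 0)   // compare the names, not the pointers
            {
                pid = pe.th32ProcessID;
                break;
            }
        } while (Process32Next(snap, &pe));
    }
    CloseHandle(snap);
    return pid;
}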
I found something interesting on LeetCode and hope someone can help explain the cause:
I was basically doing merge sort and used the fast/slow pointer technique to find the middle node. Here are two versions of that code snippet:
1. Update in the afterthought
for (ListNode* fast=head;
fast->next && fast->next->next;
fast = fast->next->next, slow = slow->next) { }
2. Update in the body
for (ListNode* fast=head; fast->next && fast->next->next; ) {
fast = fast->next->next;
slow = slow->next;
}
Why is version 2 faster than the first one?
Compiler: g++ 4.9.2
It is unlikely that the comma operator can significantly reduce the speed of a for loop.
I built both variants and opened the disassembly (in Visual Studio 2012) to see the difference.
The first variant looks like this:
for (ListNode* fast = head;
0022545E mov eax,dword ptr [head]
00225461 mov dword ptr [ebp-2Ch],eax
fast->next && fast->next->next;
00225464 jmp main+17Bh (022547Bh)
fast = fast->next->next, slow = slow->next) {
00225466 mov eax,dword ptr [ebp-2Ch]
00225469 mov ecx,dword ptr [eax+4]
0022546C mov edx,dword ptr [ecx+4]
0022546F mov dword ptr [ebp-2Ch],edx
00225472 mov eax,dword ptr [slow]
00225475 mov ecx,dword ptr [eax+4]
00225478 mov dword ptr [slow],ecx
0022547B mov eax,dword ptr [ebp-2Ch]
0022547E cmp dword ptr [eax+4],0
00225482 je main+192h (0225492h)
00225484 mov eax,dword ptr [ebp-2Ch]
00225487 mov ecx,dword ptr [eax+4]
0022548A cmp dword ptr [ecx+4],0
0022548E je main+192h (0225492h)
}
The second variant is:
for (ListNode* fast = head; fast->next && fast->next->next;) {
0024545E mov eax,dword ptr [head]
00245461 mov dword ptr [ebp-2Ch],eax
00245464 mov eax,dword ptr [ebp-2Ch]
00245467 cmp dword ptr [eax+4],0
0024546B je main+190h (0245490h)
0024546D mov eax,dword ptr [ebp-2Ch]
00245470 mov ecx,dword ptr [eax+4]
00245473 cmp dword ptr [ecx+4],0
00245477 je main+190h (0245490h)
fast = fast->next->next;
00245479 mov eax,dword ptr [ebp-2Ch]
0024547C mov ecx,dword ptr [eax+4]
0024547F mov edx,dword ptr [ecx+4]
00245482 mov dword ptr [ebp-2Ch],edx
slow = slow->next;
00245485 mov eax,dword ptr [slow]
00245488 mov ecx,dword ptr [eax+4]
0024548B mov dword ptr [slow],ecx
}
The only distinction is a single jmp.
Sorry, but I cannot see any significant difference, so perhaps the performance problem is not in those two statements.
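For completeness, here are the two variants as self-contained functions (with a hypothetical minimal ListNode, since the original definition isn't shown); written either way they produce essentially the same code, as the listings above show.

// Hypothetical minimal node type; the real ListNode from the problem isn't shown.
struct ListNode { int val; ListNode* next; };

// Variant 1: updates in the for-loop afterthought via the comma operator.
ListNode* middleAfterthought(ListNode* head)
{
    ListNode* slow = head;
    for (ListNode* fast = head;
         fast->next && fast->next->next;
         fast = fast->next->next, slow = slow->next) { }
    return slow;
}

// Variant 2: updates in the loop body.
ListNode* middleBody(ListNode* head)
{
    ListNode* slow = head;
    for (ListNode* fast = head; fast->next && fast->next->next; ) {
        fast = fast->next->next;
        slow = slow->next;
    }
    return slow;
}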
Consider the following code:
#include <string>
#include <vector>
std::basic_string<char> sBasicString = "basic_string";
char* buffer = new char[1000];
for (size_t i = 0 ; i < sBasicString.size() ; ++i)
{
char c;
c = sBasicString[i];
buffer[i] = c;
}
(Please ignore the memory leak - it is not relevant)
I'm compiling it with VS2012 (64-bit) in both Release and Debug (default configuration).
When I'm running the debugger in Debug mode, I can watch the sBasicString and buffer variables as expected (query their values, etc.).
But when I'm running the debugger in Release mode, I can still watch sBasicString but not buffer.
Why?
Since Release mode has optimization set to "Full Optimization" (the default value) and "Generate Debug Info" set to "YES", I would expect either both variables to be watchable or neither.
EDIT
Trying to add a proper use of the buffer variable (to avoid the compiler optimizing it away), I still get the same behavior.
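For clarity, the code with the added usage is essentially the following (reconstructed from the disassembly in EDIT 2 and EDIT 3, so treat it as a sketch):

#include <iostream>
#include <string>

int main()
{
    std::basic_string<char> sBasicString = "basic_string";
    char* buffer = new char[1000];
    for (size_t i = 0 ; i < sBasicString.size() ; ++i)
    {
        char c;
        c = sBasicString[i];
        buffer[i] = c;
    }
    std::cout << buffer << std::endl;   // the added use of buffer (note: not null-terminated)
    return 0;
}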
EDIT 2 Adding the 64-bit disassembly output of the Release mode compilation
int main()
{
000000013F091000 mov rax,rsp
000000013F091003 push rbx
000000013F091004 sub rsp,50h
000000013F091008 mov qword ptr [rax-38h],0FFFFFFFFFFFFFFFEh
std::basic_string<char> sBasicString = "basic_string";
000000013F091010 xor ebx,ebx
000000013F091012 mov qword ptr [rax-20h],rbx
000000013F091016 mov qword ptr [rax-18h],rbx
000000013F09101A mov qword ptr [rax-18h],0Fh
000000013F091022 mov qword ptr [rax-20h],rbx
000000013F091026 mov byte ptr [rax-30h],bl
000000013F091029 lea r8d,[rbx+0Ch]
000000013F09102D lea rdx,[__xi_z+40h (013F093238h)]
000000013F091034 lea rcx,[rax-30h]
000000013F091038 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::assign (013F0916A0h)
000000013F09103D nop
char* buffer = new char[1000];
000000013F09103E mov ecx,3E8h
000000013F091043 call operator new[] (013F091AD8h)
for (size_t i = 0 ; i < sBasicString.size() ; ++i)
000000013F091048 mov edx,ebx
000000013F09104A cmp qword ptr [rsp+38h],rbx
000000013F09104F jbe main+73h (013F091073h)
{
char c;
c = sBasicString[i];
000000013F091051 lea rcx,[sBasicString]
000000013F091056 cmp qword ptr [rsp+40h],10h
000000013F09105C cmovae rcx,qword ptr [sBasicString]
buffer[i] = c;
000000013F091062 movzx ecx,byte ptr [rcx+rdx]
buffer[i] = c;
000000013F091066 mov byte ptr [rdx+rax],cl
for (size_t i = 0 ; i < sBasicString.size() ; ++i)
000000013F091069 inc rdx
000000013F09106C cmp rdx,qword ptr [rsp+38h]
000000013F091071 jb main+51h (013F091051h)
}
std::cout << buffer << std::endl;
000000013F091073 mov rdx,rax
000000013F091076 mov rcx,qword ptr [__imp_std::cout (013F093068h)]
000000013F09107D call std::operator<<<std::char_traits<char> > (013F0910C0h)
000000013F091082 mov rcx,rax
000000013F091085 mov rdx,qword ptr [__imp_std::endl (013F093060h)]
000000013F09108C call qword ptr [__imp_std::basic_ostream<char,std::char_traits<char> >::operator<< (013F093098h)]
000000013F091092 nop
return 0;
000000013F091093 cmp qword ptr [rsp+40h],10h
000000013F091099 jb main+0A5h (013F0910A5h)
000000013F09109B mov rcx,qword ptr [sBasicString]
000000013F0910A0 call operator delete (013F091AEAh)
000000013F0910A5 mov qword ptr [rsp+40h],0Fh
000000013F0910AE mov qword ptr [rsp+38h],rbx
000000013F0910B3 mov byte ptr [sBasicString],0
return 0;
000000013F0910B8 xor eax,eax
}
000000013F0910BA add rsp,50h
000000013F0910BE pop rbx
000000013F0910BF ret
EDIT 3 Adding the 32-bit disassembly output
int main()
{
013B1000 push 0FFFFFFFFh
013B1002 push 13B2558h
013B1007 mov eax,dword ptr fs:[00000000h]
013B100D push eax
013B100E mov dword ptr fs:[0],esp
013B1015 sub esp,18h
013B1018 push esi
std::basic_string<char> sBasicString = "basic_string";
013B1019 push 0Ch
013B101B mov dword ptr [esp+18h],0
013B1023 mov dword ptr [esp+1Ch],0
013B102B push 13B3158h
013B1030 lea ecx,[esp+0Ch]
013B1034 mov dword ptr [esp+20h],0Fh
013B103C mov dword ptr [esp+1Ch],0
013B1044 mov byte ptr [esp+0Ch],0
013B1049 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::assign (013B17B0h)
013B104E mov dword ptr [esp+24h],0
char* buffer = new char[1000];
013B1056 push 3E8h
013B105B call operator new[] (013B1BD6h)
for (size_t i = 0 ; i < sBasicString.size() ; ++i)
013B1060 xor edx,edx
013B1062 add esp,4
013B1065 mov esi,eax
013B1067 cmp dword ptr [esp+14h],edx
013B106B jbe main+8Dh (013B108Dh)
for (size_t i = 0 ; i < sBasicString.size() ; ++i)
013B106D lea ecx,[ecx]
{
char c;
c = sBasicString[i];
013B1070 cmp dword ptr [esp+18h],10h
013B1075 lea ecx,[esp+4]
013B1079 cmovae ecx,dword ptr [esp+4]
for (size_t i = 0 ; i < sBasicString.size() ; ++i)
013B107E inc edx
buffer[i] = c;
013B107F mov al,byte ptr [ecx+edx-1]
013B1083 mov byte ptr [edx+esi-1],al
013B1087 cmp edx,dword ptr [esp+14h]
013B108B jb main+70h (013B1070h)
}
std::cout << buffer << std::endl;
013B108D push dword ptr ds:[13B3030h]
013B1093 push esi
013B1094 push dword ptr ds:[13B3034h]
013B109A call std::operator<<<std::char_traits<char> > (013B10F0h)
013B109F add esp,8
013B10A2 mov ecx,eax
013B10A4 call dword ptr ds:[13B3028h]
return 0;
013B10AA mov dword ptr [esp+24h],0FFFFFFFFh
013B10B2 cmp dword ptr [esp+18h],10h
013B10B7 pop esi
013B10B8 jb main+0C5h (013B10C5h)
013B10BA push dword ptr [esp]
013B10BD call operator delete (013B1BECh)
013B10C2 add esp,4
}
013B10C5 mov ecx,dword ptr [esp+18h]
return 0;
013B10C9 mov dword ptr [esp+14h],0Fh
013B10D1 mov dword ptr [esp+10h],0
013B10D9 mov byte ptr [esp],0
013B10DD xor eax,eax
}
013B10DF mov dword ptr fs:[0],ecx
013B10E6 add esp,24h
013B10E9 ret
Looking at the x86 assembly posted in this question, I can apply my rudimentary assembly knowledge to work out where the buffer variable is hiding:
char* buffer = new char[1000];
013B105B call operator new[] (013B1BD6h)
013B1065 mov esi,eax
My candidate is the esi register: operator new returned its result in eax, and it is moved to esi. Let's follow this register:
for (size_t i = 0 ; i < sBasicString.size() ; ++i)
013B107E inc edx
buffer[i] = c;
013B107F mov al,byte ptr [ecx+edx-1]
013B1083 mov byte ptr [edx+esi-1],al
The last line stores the char value from al into the buffer. edx is obviously the loop counter, see inc edx. So esi points to the buffer allocated by operator new. And finally:
013B1093 push esi
013B1094 push dword ptr ds:[13B3034h]
013B109A call std::operator<<<std::char_traits<char> > (013B10F0h)
Here esi is printed. So, the answer to your question: the buffer variable is kept in the esi CPU register. You can add the line delete[] buffer; to the program and see how operator delete is applied to esi in the assembly.
Since the whole loop doesn't contain function calls that could clobber the CPU registers, the optimized code produced by the compiler simply keeps buffer in a register. The debugger doesn't know this and cannot display it.
The x64 assembly works the same way, but it is more complicated and requires more time to understand. I hope you now have an idea of what happens.
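If you really need to watch buffer in a Release build, one common workaround (just a sketch, not part of the code above) is to disable optimization for the function under investigation with MSVC's #pragma optimize, which forces locals back into memory where the debugger can see them:

#include <string>

#pragma optimize("", off)   // MSVC: compile the functions below without optimization
void debugMe()
{
    std::basic_string<char> sBasicString = "basic_string";
    char* buffer = new char[1000];          // now spilled to the stack and watchable
    for (size_t i = 0; i < sBasicString.size(); ++i)
        buffer[i] = sBasicString[i];
    delete[] buffer;
}
#pragma optimize("", on)    // restore the project's /O settings for the rest of the file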
The compiler is smart enough to see that you are not doing anything with the buffer, so it simply optimized it away in Release mode.
std::string, on the other hand, comes from a library, and it is harder to detect that assigning to it or reading from it has no side effects. That's why the compiler didn't remove it.
I have disassembled two different variations of a Swap function (a simple value swap through two pointers).
1). __fastcall http://pastebin.com/ux5LMktz
2). __stdcall (a function without an explicit calling-convention modifier will be __stdcall by default, because of the MS C++ compiler for Windows) http://pastebin.com/eGR6VUjX
As far as I know, __fastcall is implemented differently depending on the compiler, but basically it puts the first two arguments (left to right) into the ECX and EDX registers. The stack may also be used, but only if the arguments are too large.
But in the listing for the first option you can see that the value is pushed and then popped back into the ECX register, and there is no real difference between the two variations of the swap function.
Also, the __fastcall variant uses:
00AA261F pop ecx
00AA2620 mov dword ptr [ebp-14h],edx
00AA2623 mov dword ptr [ebp-8],ecx
which are not used in the __stdcall version.
So it doesn't look any more optimized (as __fastcall should be, by its definition).
I'm a newbie in assembly language and calling conventions, so I'm asking for a piece of advice. Maybe __fastcall is indeed faster in my exact sample, but I just don't see it, do I?
Thanks!
Try turning on optimization, then compare the results. Your fastcall version has many redundant operations because it's not optimized.
Here's the output of VS 2010 with /Ox.
fastcall:
; _firstValue$ = ecx
; _secondValue$ = edx
?CallMe1@@YIXPAH0@Z PROC ; CallMe1
mov eax, DWORD PTR [ecx]
push esi
mov esi, DWORD PTR [edx]
cmp eax, esi
je SHORT $LN1@CallMe1
mov DWORD PTR [ecx], esi
mov DWORD PTR [edx], eax
$LN1@CallMe1:
pop esi
ret 0
?CallMe1@@YIXPAH0@Z ENDP ; CallMe1
stdcall:
_firstValue$ = 8 ; size = 4
_secondValue$ = 12 ; size = 4
?CallMe2@@YGXPAH0@Z PROC ; CallMe2
mov edx, DWORD PTR _firstValue$[esp-4]
mov eax, DWORD PTR [edx]
push esi
mov esi, DWORD PTR _secondValue$[esp]
mov ecx, DWORD PTR [esi]
cmp eax, ecx
je SHORT $LN1@CallMe2
mov DWORD PTR [edx], ecx
mov DWORD PTR [esi], eax
$LN1@CallMe2:
pop esi
ret 8
?CallMe2@@YGXPAH0@Z ENDP ; CallMe2
cdecl (what you mistakenly call stdcall in your example):
_firstValue$ = 8 ; size = 4
_secondValue$ = 12 ; size = 4
?CallMe3@@YAXPAH0@Z PROC ; CallMe3
mov edx, DWORD PTR _firstValue$[esp-4]
mov eax, DWORD PTR [edx]
push esi
mov esi, DWORD PTR _secondValue$[esp]
mov ecx, DWORD PTR [esi]
cmp eax, ecx
je SHORT $LN1@CallMe3
mov DWORD PTR [edx], ecx
mov DWORD PTR [esi], eax
$LN1@CallMe3:
pop esi
ret 0
?CallMe3@@YAXPAH0@Z ENDP ; CallMe3
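For reference, the decorated names in the listings (?CallMe1@@YIXPAH0@Z and friends) decode to the declarations below, so the source was presumably something along these lines (a reconstruction, not the asker's exact code; the body is a guess that matches the cmp/je pattern in the listings):

// Reconstructed sketch: swap only when the pointed-to values differ.
void __fastcall CallMe1(int* firstValue, int* secondValue)
{
    if (*firstValue != *secondValue)
    {
        int temp = *firstValue;
        *firstValue = *secondValue;
        *secondValue = temp;
    }
}

void __stdcall CallMe2(int* firstValue, int* secondValue)
{
    if (*firstValue != *secondValue)
    {
        int temp = *firstValue;
        *firstValue = *secondValue;
        *secondValue = temp;
    }
}

void __cdecl CallMe3(int* firstValue, int* secondValue)
{
    if (*firstValue != *secondValue)
    {
        int temp = *firstValue;
        *firstValue = *secondValue;
        *secondValue = temp;
    }
}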
I have ths code:
__asm
{
PUSHAD
MOV EAX, DWORD PTR DS:[hStorm_LOBBYPTR]
TEST EAX, EAX
JE nick_false
MOV ECX, DWORD PTR DS:[EAX+0xC464]
TEST ECX, ECX
JE nick_false
MOV EAX, DWORD PTR DS:[ECX+0x170+0xB0]
TEST EAX, EAX
JE nick_false
MOV EDX, i
MOV ECX, DWORD PTR DS:[EAX+EDX*4]
MOV EAX, DWORD PTR DS:[ECX+0x1A0]
MOV ECX, DWORD PTR DS:[EAX+0x1E4]
MOV EAX, DWORD PTR DS:[ECX+0x1E4]
MOV ECX, DWORD PTR DS:[EAX+0x1E8]
MOV tempdw, ECX
POPAD
JMP nick_true
nick_false:
XOR EAX, EAX
MOV tempdw, EAX
POPAD
}
nick_true:
/* do check if tempdw is NULL and then proceed with your stuff */
How can I wrap it into a DLL (Visual Studio C++ 2008)?
After that I need to inject the DLL into some process and then retrieve tempdw; how can I do that?
You'll need to wrap that in a normal C function; however, judging by the labels, it won't be a naked function:
void MyHook()
{
__asm
{
//asm here
}
nick_true:
//the other stuff
}
This then needs to be put into a basic DLL project that writes the needed hooks using WriteProcessMemory (nothing more specific can be given, as there isn't enough info).
To inject it, you can use RemoteDll or edit the launcher from w3l.
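A minimal DLL skeleton around that function might look like this (just a sketch; the export, the shared tempdw variable and calling the hook from DllMain are assumptions, not the only way to do it):

#include <windows.h>

DWORD tempdw = 0;   // written by the asm block

extern "C" __declspec(dllexport) void MyHook()
{
    __asm
    {
        // ... the asm block from the question goes here ...
    }
nick_true:
    // tempdw now holds the value read by the asm (or 0 if nick_false was taken)
    ;
}

BOOL WINAPI DllMain(HINSTANCE, DWORD reason, LPVOID)
{
    if (reason == DLL_PROCESS_ATTACH)
        MyHook();   // or create a thread / install the hook here instead
    return TRUE;
}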