Related
So I'm debugging my DPLL implementation and it's not quite working right, so I stepped through the code line by line in the debugger. It gets to a return statement, but it doesn't return; it just keeps on executing the same function. WTF, I thought, am I really seeing this? So I looked at the disassembly, and sure enough one of the return statements jumps to the wrong place. I have never seen VS generate incorrect code, so I'm wondering if I screwed up somewhere, but I can't find anything. The jump is incorrect even when compiling with all optimizations off.
This illustrates what's going on.
// One step of the DPLL procedure over the clause set `f`.
// `f` is taken by value, so unitPropagate() works on this call's own copy.
// Returns false if checkFalseClause(f) holds after unit propagation, true if
// checkAllClausesTrue(f) holds, and otherwise recurses on two copies of `f`
// in which the chosen literal has been replaced by true and by false.
// The trailing comments are the jump targets the asker observed in the debugger.
bool dpll(std::vector<clause> f)
{
unitPropagate(f);
if(checkFalseClause(f))
{
return false; //je dpll+5Fh (0C01D1Fh) <-- Totally wrong jump address
}
else if(checkAllClausesTrue(f))
{
return true; //jmp dpll+206h (0C01EC6h) <-- this is fine
}
else
{
atom l = chooseLiteral(f); //this is where the jump ends up (0C01D1Fh)
// Branch 1: literal l replaced by true.
std::vector<clause> a = makeDuplicate(f);
replaceInstancesOf(a, l, true);
// Branch 2: literal l replaced by false.
std::vector<clause> b = makeDuplicate(f);
replaceInstancesOf(b, l, false);
// NOTE(review): `|` is the bitwise OR, so both recursive calls always run;
// `||` would skip dpll(b) when dpll(a) already returned true.
return dpll(a) | dpll(b);
}
//this is where the jump is supposed to go (0C01EC6h)
}
So my question is, is Visual Studio actually broken or have I misunderstood something? Has anyone run into something like this before?
The version is Visual Studio Enterprise 2015 if that makes a difference, the code is generated for x86_32.
Here's the full disassembly if anyone's interested:
00C01CC0 push ebp
00C01CC1 mov ebp,esp
00C01CC3 push 0FFFFFFFFh
00C01CC5 push 0C08FF0h
00C01CCA mov eax,dword ptr fs:[00000000h]
00C01CD0 push eax
00C01CD1 sub esp,40h
00C01CD4 mov eax,dword ptr [__security_cookie (0C0D008h)]
00C01CD9 xor eax,ebp
00C01CDB mov dword ptr [ebp-10h],eax
00C01CDE push ebx
00C01CDF push esi
00C01CE0 push eax
00C01CE1 lea eax,[ebp-0Ch]
00C01CE4 mov dword ptr fs:[00000000h],eax
bool dpll(std::vector<clause> f)
00C01CEA lea ecx,[f]
00C01CED mov dword ptr [ebp-4],0
00C01CF4 call unitPropagate (0C01950h)
{
unitPropagate(f);
00C01CF9 lea ecx,[f]
00C01CFC call checkFalseClause (0C01660h)
00C01D01 test al,al
00C01D03 je dpll+4Ch (0C01D0Ch)
00C01D05 xor bh,bh
00C01D07 jmp dpll+206h (0C01EC6h)
if(checkFalseClause(f))
{
return false;
00C01D0C lea ecx,[f]
00C01D0F call checkAllClausesTrue (0C014F0h)
00C01D14 test al,al
00C01D16 je dpll+5Fh (0C01D1Fh)
}
else if(checkAllClausesTrue(f))
00C01D18 mov bh,1
00C01D1A jmp dpll+206h (0C01EC6h)
{
return true;
}
else
00C01D1F lea edx,[f]
00C01D22 lea ecx,[l]
00C01D25 call chooseLiteral (0C013D0h)
00C01D2A mov byte ptr [ebp-4],1
{
atom l = chooseLiteral(f);
00C01D2E lea edx,[f]
00C01D31 xorps xmm0,xmm0
00C01D34 mov dword ptr [ebp-20h],0
00C01D3B lea ecx,[a]
00C01D3E movq mmword ptr [a],xmm0
00C01D43 call makeDuplicate (0C01A30h)
00C01D48 mov byte ptr [ebp-4],2
00C01D4C sub esp,20h
00C01D4F mov esi,esp
00C01D51 mov bl,1
00C01D53 mov dword ptr [ebp-4Ch],esi
00C01D56 lea ecx,[esi+4]
00C01D59 mov al,byte ptr [l]
00C01D5C mov byte ptr [esi],al
00C01D5E mov dword ptr [ecx+14h],0Fh
00C01D65 mov dword ptr [ecx+10h],0
00C01D6C cmp dword ptr [ecx+14h],10h
00C01D70 jb dpll+0B6h (0C01D76h)
00C01D72 mov eax,dword ptr [ecx]
00C01D74 jmp dpll+0B8h (0C01D78h)
00C01D76 mov eax,ecx
00C01D78 push 0FFFFFFFFh
00C01D7A mov byte ptr [eax],0
00C01D7D lea eax,[ebp-44h]
00C01D80 push 0
00C01D82 push eax
00C01D83 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::assign (0C02A80h)
00C01D88 mov al,byte ptr [ebp-2Ch]
00C01D8B lea ecx,[a]
00C01D8E mov byte ptr [esi+1Ch],al
00C01D91 mov dl,bl
00C01D93 mov al,byte ptr [ebp-2Bh]
00C01D96 mov byte ptr [esi+1Dh],al
00C01D99 call replaceInstancesOf (0C017D0h)
00C01D9E xorps xmm0,xmm0
00C01DA1 mov dword ptr [ebp-14h],0
std::vector<clause> a = makeDuplicate(f);
replaceInstancesOf(a, l, true);
00C01DA8 lea edx,[f]
std::vector<clause> a = makeDuplicate(f);
replaceInstancesOf(a, l, true);
00C01DAB movq mmword ptr [b],xmm0
00C01DB0 lea ecx,[b]
00C01DB3 call makeDuplicate (0C01A30h)
00C01DB8 mov esi,esp
00C01DBA mov byte ptr [ebp-4],3
00C01DBE mov dword ptr [ebp-4Ch],esi
00C01DC1 lea ecx,[esi+4]
00C01DC4 mov al,byte ptr [l]
00C01DC7 xor bl,bl
00C01DC9 push 0FFFFFFFFh
00C01DCB mov byte ptr [esi],al
00C01DCD lea eax,[ebp-44h]
00C01DD0 push 0
00C01DD2 mov dword ptr [ecx+14h],0Fh
00C01DD9 mov dword ptr [ecx+10h],0
00C01DE0 push eax
00C01DE1 mov byte ptr [ecx],bl
00C01DE3 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::assign (0C02A80h)
00C01DE8 mov al,byte ptr [ebp-2Ch]
00C01DEB lea ecx,[b]
00C01DEE mov byte ptr [esi+1Ch],al
00C01DF1 mov dl,bl
00C01DF3 mov al,byte ptr [ebp-2Bh]
00C01DF6 mov byte ptr [esi+1Dh],al
00C01DF9 call replaceInstancesOf (0C017D0h)
std::vector<clause> b = makeDuplicate(f);
replaceInstancesOf(b, l, false);
00C01DFE add esp,14h
00C01E01 lea eax,[a]
00C01E04 mov ecx,esp
00C01E06 push eax
00C01E07 call std::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > >::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > > (0C02420h)
00C01E0C call dpll (0C01CC0h)
00C01E11 mov bl,al
00C01E13 mov ecx,esp
00C01E15 lea eax,[b]
00C01E18 push eax
00C01E19 call std::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > >::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > > (0C02420h)
00C01E1E call dpll (0C01CC0h)
00C01E23 mov ecx,dword ptr [b]
00C01E26 mov bh,al
00C01E28 add esp,0Ch
00C01E2B or bh,bl
00C01E2D test ecx,ecx
00C01E2F je dpll+1B4h (0C01E74h)
00C01E31 push dword ptr [ebp-4Ch]
00C01E34 mov edx,dword ptr [ebp-18h]
00C01E37 push ecx
00C01E38 call std::_Destroy_range1<std::allocator<std::vector<atom,std::allocator<atom> > >,std::vector<atom,std::allocator<atom> > *> (0C035E0h)
00C01E3D mov ecx,dword ptr [ebp-14h]
00C01E40 mov eax,2AAAAAABh
00C01E45 mov esi,dword ptr [b]
00C01E48 add esp,8
00C01E4B sub ecx,esi
00C01E4D imul ecx
00C01E4F sar edx,1
00C01E51 mov eax,edx
00C01E53 shr eax,1Fh
00C01E56 add eax,edx
00C01E58 push eax
00C01E59 push esi
00C01E5A call std::_Wrap_alloc<std::allocator<std::vector<atom,std::allocator<atom> > > >::deallocate (0C02D20h)
00C01E5F mov dword ptr [b],0
00C01E66 mov dword ptr [ebp-18h],0
00C01E6D mov dword ptr [ebp-14h],0
00C01E74 mov ecx,dword ptr [a]
00C01E77 test ecx,ecx
00C01E79 je dpll+1FEh (0C01EBEh)
00C01E7B push dword ptr [ebp-4Ch]
00C01E7E mov edx,dword ptr [ebp-24h]
00C01E81 push ecx
00C01E82 call std::_Destroy_range1<std::allocator<std::vector<atom,std::allocator<atom> > >,std::vector<atom,std::allocator<atom> > *> (0C035E0h)
00C01E87 mov ecx,dword ptr [ebp-20h]
00C01E8A mov eax,2AAAAAABh
00C01E8F mov esi,dword ptr [a]
00C01E92 add esp,8
00C01E95 sub ecx,esi
00C01E97 imul ecx
00C01E99 sar edx,1
00C01E9B mov eax,edx
00C01E9D shr eax,1Fh
00C01EA0 add eax,edx
00C01EA2 push eax
00C01EA3 push esi
00C01EA4 call std::_Wrap_alloc<std::allocator<std::vector<atom,std::allocator<atom> > > >::deallocate (0C02D20h)
00C01EA9 mov dword ptr [a],0
00C01EB0 mov dword ptr [ebp-24h],0
00C01EB7 mov dword ptr [ebp-20h],0
00C01EBE lea ecx,[ebp-44h]
00C01EC1 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::~basic_string<char,std::char_traits<char>,std::allocator<char> > (0C027A0h)
00C01EC6 mov ecx,dword ptr [f]
00C01EC9 test ecx,ecx
00C01ECB je dpll+23Bh (0C01EFBh)
00C01ECD push dword ptr [ebp-4Ch]
00C01ED0 mov edx,dword ptr [ebp+0Ch]
00C01ED3 push ecx
00C01ED4 call std::_Destroy_range1<std::allocator<std::vector<atom,std::allocator<atom> > >,std::vector<atom,std::allocator<atom> > *> (0C035E0h)
00C01ED9 mov ecx,dword ptr [ebp+10h]
00C01EDC mov eax,2AAAAAABh
00C01EE1 mov esi,dword ptr [f]
00C01EE4 add esp,8
00C01EE7 sub ecx,esi
00C01EE9 imul ecx
00C01EEB sar edx,1
00C01EED mov ecx,edx
00C01EEF shr ecx,1Fh
00C01EF2 add ecx,edx
00C01EF4 push ecx
00C01EF5 push esi
00C01EF6 call std::_Wrap_alloc<std::allocator<std::vector<atom,std::allocator<atom> > > >::deallocate (0C02D20h)
00C01EFB mov al,bh
00C01EFD mov ecx,dword ptr [ebp-0Ch]
00C01F00 mov dword ptr fs:[0],ecx
00C01F07 pop ecx
00C01F08 pop esi
00C01F09 pop ebx
00C01F0A mov ecx,dword ptr [ebp-10h]
00C01F0D xor ecx,ebp
00C01F0F call __security_check_cookie (0C080CCh)
00C01F14 mov esp,ebp
00C01F16 pop ebp
00C01F17 ret
The source interleaving is wrong. This is the correct place you want to look at:
00C01CFC call checkFalseClause (0C01660h)
00C01D01 test al,al
00C01D03 je dpll+4Ch (0C01D0Ch)
00C01D05 xor bh,bh
00C01D07 jmp dpll+206h (0C01EC6h)
As you can see, it goes to the expected address if the return value was nonzero.
The part you looked at is actually for the else if(checkAllClausesTrue(f)) and the jump is the one going to the else clause because the compiler negated the condition.
Our code is written in C++ 11 (VS2012/Win 7-64bit). The C++ library provides a sleep_for function that we use. We observed that the C++ sleep_for sometimes shows a large overshoot. In other words we request to sleep for say 15 ms but the sleep turns out to be e.g. 100 ms. We see this when the load on the system is high.
My first reaction: “of course the sleeps "take longer" if there is a lot of load on the system and other threads are using the CPU”.
However, the “funny” thing is that if we replace the sleep_for with a Windows API “Sleep” call, then we do not see this behavior. I also saw that, under the hood, the sleep_for function makes a call to the Windows API Sleep method.
The documentation for sleep_for states:
The function blocks the calling thread for at least the time that's specified by Rel_time. This function does not throw any exceptions.
So technically the function is working. However we did not expect to see a difference between C++ sleep_for and the regular Sleep(Ex) function.
Can somebody explain this behavior?
There is quite a bit of additional code executed if using sleep_for vs SleepEx.
For example calling SleepEx(15) generates the following assembly in debug mode (Visual Studio 2015):
; 9 : SleepEx(15, false);
mov esi, esp
push 0
push 15 ; 0000000fH
call DWORD PTR __imp__SleepEx@8
cmp esi, esp
call __RTC_CheckEsp
By contrast this code
const std::chrono::milliseconds duration(15);
std::this_thread::sleep_for(duration);
Generates the following:
; 9 : std::this_thread::sleep_for(std::chrono::milliseconds(15));
mov DWORD PTR $T1[ebp], 15 ; 0000000fH
lea eax, DWORD PTR $T1[ebp]
push eax
lea ecx, DWORD PTR $T2[ebp]
call duration
push eax
call sleep_for
add esp, 4
This calls into:
duration PROC ; std::chrono::duration<__int64,std::ratio<1,1000> >::duration<__int64,std::ratio<1,1000> ><int,void>, COMDAT
; _this$ = ecx
; 113 : { // construct from representation
push ebp
mov ebp, esp
sub esp, 204 ; 000000ccH
push ebx
push esi
push edi
push ecx
lea edi, DWORD PTR [ebp-204]
mov ecx, 51 ; 00000033H
mov eax, -858993460 ; ccccccccH
rep stosd
pop ecx
mov DWORD PTR _this$[ebp], ecx
; 112 : : _MyRep(static_cast<_Rep>(_Val))
mov eax, DWORD PTR __Val$[ebp]
mov eax, DWORD PTR [eax]
cdq
mov ecx, DWORD PTR _this$[ebp]
mov DWORD PTR [ecx], eax
mov DWORD PTR [ecx+4], edx
; 114 : }
mov eax, DWORD PTR _this$[ebp]
pop edi
pop esi
pop ebx
mov esp, ebp
pop ebp
ret 4
duration ENDP
And calls into
sleep_for PROC ; std::this_thread::sleep_for<__int64,std::ratio<1,1000> >, COMDAT
; 151 : { // sleep for duration
push ebp
mov ebp, esp
sub esp, 268 ; 0000010cH
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-268]
mov ecx, 67 ; 00000043H
mov eax, -858993460 ; ccccccccH
rep stosd
mov eax, DWORD PTR ___security_cookie
xor eax, ebp
mov DWORD PTR __$ArrayPad$[ebp], eax
; 152 : stdext::threads::xtime _Tgt = _To_xtime(_Rel_time);
mov eax, DWORD PTR __Rel_time$[ebp]
push eax
lea ecx, DWORD PTR $T1[ebp]
push ecx
call to_xtime
add esp, 8
mov edx, DWORD PTR [eax]
mov DWORD PTR $T2[ebp], edx
mov ecx, DWORD PTR [eax+4]
mov DWORD PTR $T2[ebp+4], ecx
mov edx, DWORD PTR [eax+8]
mov DWORD PTR $T2[ebp+8], edx
mov eax, DWORD PTR [eax+12]
mov DWORD PTR $T2[ebp+12], eax
mov ecx, DWORD PTR $T2[ebp]
mov DWORD PTR __Tgt$[ebp], ecx
mov edx, DWORD PTR $T2[ebp+4]
mov DWORD PTR __Tgt$[ebp+4], edx
mov eax, DWORD PTR $T2[ebp+8]
mov DWORD PTR __Tgt$[ebp+8], eax
mov ecx, DWORD PTR $T2[ebp+12]
mov DWORD PTR __Tgt$[ebp+12], ecx
; 153 : sleep_until(&_Tgt);
lea eax, DWORD PTR __Tgt$[ebp]
push eax
call sleep_until
add esp, 4
; 154 : }
push edx
mov ecx, ebp
push eax
lea edx, DWORD PTR $LN5#sleep_for
call #_RTC_CheckStackVars#8
pop eax
pop edx
pop edi
pop esi
pop ebx
mov ecx, DWORD PTR __$ArrayPad$[ebp]
xor ecx, ebp
call #__security_check_cookie#4
add esp, 268 ; 0000010cH
cmp ebp, esp
call __RTC_CheckEsp
mov esp, ebp
pop ebp
ret 0
npad 3
$LN5#sleep_for:
DD 1
DD $LN4#sleep_for
$LN4#sleep_for:
DD -24 ; ffffffe8H
DD 16 ; 00000010H
DD $LN3#sleep_for
$LN3#sleep_for:
DB 95 ; 0000005fH
DB 84 ; 00000054H
DB 103 ; 00000067H
DB 116 ; 00000074H
DB 0
sleep_for ENDP
Some conversion happens:
to_xtime PROC ; std::_To_xtime<__int64,std::ratio<1,1000> >, COMDAT
; 758 : { // convert duration to xtime
push ebp
mov ebp, esp
sub esp, 348 ; 0000015cH
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-348]
mov ecx, 87 ; 00000057H
mov eax, -858993460 ; ccccccccH
rep stosd
mov eax, DWORD PTR ___security_cookie
xor eax, ebp
mov DWORD PTR __$ArrayPad$[ebp], eax
; 759 : xtime _Xt;
; 760 : if (_Rel_time <= chrono::duration<_Rep, _Period>::zero())
lea eax, DWORD PTR $T7[ebp]
push eax
call duration_zero ; std::chrono::duration<__int64,std::ratio<1,1000> >::zero
add esp, 4
push eax
mov ecx, DWORD PTR __Rel_time$[ebp]
push ecx
call chronos_operator ; std::chrono::operator<=<__int64,std::ratio<1,1000>,__int64,std::ratio<1,1000> >
add esp, 8
movzx edx, al
test edx, edx
je SHORT $LN2#To_xtime
; 761 : { // negative or zero relative time, return zero
; 762 : _Xt.sec = 0;
xorps xmm0, xmm0
movlpd QWORD PTR __Xt$[ebp], xmm0
; 763 : _Xt.nsec = 0;
mov DWORD PTR __Xt$[ebp+8], 0
; 764 : }
; 765 : else
jmp $LN3#To_xtime
$LN2#To_xtime:
; 766 : { // positive relative time, convert
; 767 : chrono::nanoseconds _T0 =
; 768 : chrono::system_clock::now().time_since_epoch();
lea eax, DWORD PTR $T5[ebp]
push eax
lea ecx, DWORD PTR $T6[ebp]
push ecx
call system_clock_now ; std::chrono::system_clock::now
add esp, 4
mov ecx, eax
call time_since_ephoch ; std::chrono::time_point<std::chrono::system_clock,std::chrono::duration<__int64,std::ratio<1,10000000> > >::time_since_epoch
push eax
lea ecx, DWORD PTR __T0$8[ebp]
call duration ; std::chrono::duration<__int64,std::ratio<1,1000000000> >::duration<__int64,std::ratio<1,1000000000> ><__int64,std::ratio<1,10000000>,void>
; 769 : _T0 += _Rel_time;
mov eax, DWORD PTR __Rel_time$[ebp]
push eax
lea ecx, DWORD PTR $T4[ebp]
call duration_ratio ; std::chrono::duration<__int64,std::ratio<1,1000000000> >::duration<__int64,std::ratio<1,1000000000> ><__int64,std::ratio<1,1000>,void>
lea ecx, DWORD PTR $T4[ebp]
push ecx
lea ecx, DWORD PTR __T0$8[ebp]
call duration_ratio ; std::chrono::duration<__int64,std::ratio<1,1000000000> >::operator+=
; 770 : _Xt.sec = chrono::duration_cast<chrono::seconds>(_T0).count();
lea eax, DWORD PTR __T0$8[ebp]
push eax
lea ecx, DWORD PTR $T3[ebp]
push ecx
call duration_cast ; std::chrono::duration_cast<std::chrono::duration<__int64,std::ratio<1,1> >,__int64,std::ratio<1,1000000000> >
add esp, 8
mov ecx, eax
call duration_count ; std::chrono::duration<__int64,std::ratio<1,1> >::count
mov DWORD PTR __Xt$[ebp], eax
mov DWORD PTR __Xt$[ebp+4], edx
; 771 : _T0 -= chrono::seconds(_Xt.sec);
lea eax, DWORD PTR __Xt$[ebp]
push eax
lea ecx, DWORD PTR $T1[ebp]
call duration_ratio ; std::chrono::duration<__int64,std::ratio<1,1> >::duration<__int64,std::ratio<1,1> ><__int64,void>
push eax
lea ecx, DWORD PTR $T2[ebp]
call duration_ratio ; std::chrono::duration<__int64,std::ratio<1,1000000000> >::duration<__int64,std::ratio<1,1000000000> ><__int64,std::ratio<1,1>,void>
lea ecx, DWORD PTR $T2[ebp]
push ecx
lea ecx, DWORD PTR __T0$8[ebp]
call duration_ratio ; std::chrono::duration<__int64,std::ratio<1,1000000000> >::operator-=
; 772 : _Xt.nsec = (long)_T0.count();
lea ecx, DWORD PTR __T0$8[ebp]
call duration_ratio ; std::chrono::duration<__int64,std::ratio<1,1000000000> >::count
mov DWORD PTR __Xt$[ebp+8], eax
$LN3#To_xtime:
; 773 : }
; 774 : return (_Xt);
mov eax, DWORD PTR $T9[ebp]
mov ecx, DWORD PTR __Xt$[ebp]
mov DWORD PTR [eax], ecx
mov edx, DWORD PTR __Xt$[ebp+4]
mov DWORD PTR [eax+4], edx
mov ecx, DWORD PTR __Xt$[ebp+8]
mov DWORD PTR [eax+8], ecx
mov edx, DWORD PTR __Xt$[ebp+12]
mov DWORD PTR [eax+12], edx
mov eax, DWORD PTR $T9[ebp]
; 775 : }
push edx
mov ecx, ebp
push eax
lea edx, DWORD PTR $LN8#To_xtime
call #_RTC_CheckStackVars#8
pop eax
pop edx
pop edi
pop esi
pop ebx
mov ecx, DWORD PTR __$ArrayPad$[ebp]
xor ecx, ebp
call #__security_check_cookie#4
add esp, 348 ; 0000015cH
cmp ebp, esp
call __RTC_CheckEsp
mov esp, ebp
pop ebp
ret 0
$LN8#To_xtime:
DD 2
DD $LN7#To_xtime
$LN7#To_xtime:
DD -24 ; ffffffe8H
DD 16 ; 00000010H
DD $LN5#To_xtime
DD -40 ; ffffffd8H
DD 8
DD $LN6#To_xtime
$LN6#To_xtime:
DB 95 ; 0000005fH
DB 84 ; 00000054H
DB 48 ; 00000030H
DB 0
$LN5#To_xtime:
DB 95 ; 0000005fH
DB 88 ; 00000058H
DB 116 ; 00000074H
DB 0
to_xtime ENDP
Eventually the imported function gets called, the same one SleepEx has used.
sleep_until PROC ; std::this_thread::sleep_until, COMDAT
; 131 : { // sleep until _Abs_time
push ebp
mov ebp, esp
sub esp, 192 ; 000000c0H
push ebx
push esi
push edi
lea edi, DWORD PTR [ebp-192]
mov ecx, 48 ; 00000030H
mov eax, -858993460 ; ccccccccH
rep stosd
; 132 : _Thrd_sleep(_Abs_time);
mov esi, esp
mov eax, DWORD PTR __Abs_time$[ebp]
push eax
call DWORD PTR __imp___Thrd_sleep
add esp, 4
cmp esi, esp
call __RTC_CheckEsp
; 133 : }
pop edi
pop esi
pop ebx
add esp, 192 ; 000000c0H
cmp ebp, esp
call __RTC_CheckEsp
mov esp, ebp
pop ebp
ret 0
sleep_until ENDP
You should also be aware even SleepEx may not give 100% exact results as per the MSDN documentation https://msdn.microsoft.com/en-us/library/windows/desktop/ms686307(v=vs.85).aspx
This function causes a thread to relinquish the remainder of its time slice and become unrunnable for an interval based on the value of dwMilliseconds. The system clock "ticks" at a constant rate. If dwMilliseconds is less than the resolution of the system clock, the thread may sleep for less than the specified length of time. If dwMilliseconds is greater than one tick but less than two, the wait can be anywhere between one and two ticks, and so on. To increase the accuracy of the sleep interval, call the timeGetDevCaps function to determine the supported minimum timer resolution and the timeBeginPeriod function to set the timer resolution to its minimum. Use caution when calling timeBeginPeriod, as frequent calls can significantly affect the system clock, system power usage, and the scheduler. If you call timeBeginPeriod, call it one time early in the application and be sure to call the timeEndPeriod function at the very end of the application.
I'm using a proprietary DLL (CP5200.dll) to communicate with 10 scrolling message LED signs. I'm using openFrameworks to generate and save images of the text I want to display, and then using the DLL to package the images into data the sign can process.
I call the following functions a few times a minute, and they return a file of 2-4kb , depending on image size, but at a certain point - around 3hrs 45 minutes after startup, they start returning files of 128 bytes, which result in a blank LED display when uploaded. I'm hypothesizing that there's a buffer inside the dll that doesn't get emptied, or something of the sort, but I can't make sense of the decompiled code.
Here are the functions:
int CP5200_Program_AddPicture(HOBJECT hObj, int nWinNo, const char* pPictFile, int nMode, int nEffect, int nSpeed, int nStay, int nCompress)
int CP5200_Program_SaveToFile(HOBJECT hObj, const char* pFilename)
Decompiled functions:
Exported fn(): CP5200_Program_AddImage - Ord:00C3h
:1000FD20 51 push ecx
:1000FD21 55 push ebp
:1000FD22 8B6C240C mov ebp, dword ptr [esp+0C]
:1000FD26 85ED test ebp, ebp
:1000FD28 7508 jne 1000FD32
:1000FD2A 83C8FF or eax, FFFFFFFF
:1000FD2D 5D pop ebp
:1000FD2E 59 pop ecx
:1000FD2F C23000 ret 0030
Function fully disassembled. I couldn't manage to decompile.
:1000FC50 ; Exported entry 15. CP5200_Program_AddPicture
:1000FC50
:1000FC50 ; ¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦ S U B R O U T I N E ¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦¦
:1000FC50
:1000FC50
:1000FC50 public CP5200_Program_AddPicture
:1000FC50 CP5200_Program_AddPicture proc near
:1000FC50
:1000FC50 arg_0 = dword ptr 8
:1000FC50 arg_4 = dword ptr 0Ch
:1000FC50 arg_8 = dword ptr 10h
:1000FC50 arg_C = dword ptr 14h
:1000FC50 arg_10 = dword ptr 18h
:1000FC50 arg_14 = dword ptr 1Ch
:1000FC50 arg_18 = dword ptr 20h
:1000FC50 arg_1C = dword ptr 24h
:1000FC50
:1000FC50 push ebx
:1000FC51 mov ebx, [esp+arg_0]
:1000FC55 test ebx, ebx
:1000FC57 jnz short loc_1000FC60
:1000FC59 or eax, 0FFFFFFFFh
:1000FC5C pop ebx
:1000FC5D retn 20h
:1000FC60 ; ---------------------------------------------------------------------------
:1000FC60
:1000FC60 loc_1000FC60: ; CODE XREF: CP5200_Program_AddPicture+7j
:1000FC60 push esi
:1000FC61 push edi
:1000FC62 mov edi, ebx
:1000FC64 mov esi, offset aCprogram ; "CProgram"
:1000FC69 mov ecx, 9
:1000FC6E xor eax, eax
:1000FC70 repe cmpsb
:1000FC72 jz short loc_1000FC79
:1000FC74 sbb eax, eax
:1000FC76 sbb eax, 0FFFFFFFFh
:1000FC79
:1000FC79 loc_1000FC79: ; CODE XREF: CP5200_Program_AddPicture+22j
:1000FC79 test eax, eax
:1000FC7B jz short loc_1000FC86
:1000FC7D pop edi
:1000FC7E pop esi
:1000FC7F or eax, 0FFFFFFFFh
:1000FC82 pop ebx
:1000FC83 retn 20h
:1000FC86 ; ---------------------------------------------------------------------------
:1000FC86
:1000FC86 loc_1000FC86: ; CODE XREF: CP5200_Program_AddPicture+2Bj
:1000FC86 mov esi, [esp+8+arg_4]
:1000FC8A test esi, esi
:1000FC8C jl short loc_1000FD07
:1000FC8E mov ecx, ebx
:1000FC90 call sub_10018020
:1000FC95 cmp esi, eax
:1000FC97 jge short loc_1000FD07
:1000FC99 push esi
:1000FC9A mov ecx, ebx
:1000FC9C call sub_10018030
:1000FCA1 push 3Eh
:1000FCA3 mov edi, eax
:1000FCA5 call ??2@YAPAXI@Z ; operator new(uint)
:1000FCAA add esp, 4
:1000FCAD test eax, eax
:1000FCAF jz short loc_1000FCBE
:1000FCB1 mov ecx, eax
:1000FCB3 call sub_100012E0
:1000FCB8 mov esi, eax
:1000FCBA test esi, esi
:1000FCBC jnz short loc_1000FCC9
:1000FCBE
:1000FCBE loc_1000FCBE: ; CODE XREF: CP5200_Program_AddPicture+5Fj
:1000FCBE pop edi
:1000FCBF pop esi
:1000FCC0 mov eax, 0FFFFFFFCh
:1000FCC5 pop ebx
:1000FCC6 retn 20h
:1000FCC9 ; ---------------------------------------------------------------------------
:1000FCC9
:1000FCC9 loc_1000FCC9: ; CODE XREF: CP5200_Program_AddPicture+6Cj
:1000FCC9 mov eax, [esp+8+arg_8]
:1000FCCD push eax
:1000FCCE lea ecx, [esi+25h]
:1000FCD1 call sub_100076A0
:1000FCD6 mov ecx, [esp+8+arg_C]
:1000FCDA mov edx, [esp+8+arg_10]
:1000FCDE mov eax, [esp+8+arg_14]
:1000FCE2 mov [esi+2Ah], ecx
:1000FCE5 mov ecx, [esp+8+arg_18]
:1000FCE9 mov [esi+2Eh], edx
:1000FCEC mov edx, [esp+8+arg_1C]
:1000FCF0 mov [esi+36h], ecx
:1000FCF3 push esi
:1000FCF4 mov ecx, edi
:1000FCF6 mov [esi+32h], eax
:1000FCF9 mov [esi+3Ah], edx
:1000FCFC call sub_100015A0
:1000FD01 pop edi
:1000FD02 pop esi
:1000FD03 pop ebx
:1000FD04 retn 20h
:1000FD0F CP5200_Program_AddPicture endp
Exported fn(): CP5200_Program_SaveToFile - Ord:0013h
:1000CE80 8B542404 mov edx, dword ptr [esp+04]
:1000CE84 85D2 test edx, edx
:1000CE86 741F je 1000CEA7
:1000CE88 56 push esi
:1000CE89 57 push edi
:1000CE8A 8BFA mov edi, edx
:1000CE8C BE8C060610 mov esi, 1006068C
:1000CE91 B909000000 mov ecx, 00000009
:1000CE96 33C0 xor eax, eax
:1000CE98 F3 repz
:1000CE99 A6 cmpsb
:1000CE9A 5F pop edi
:1000CE9B 5E pop esi
:1000CE9C 7405 je 1000CEA3
:1000CE9E 1BC0 sbb eax, eax
:1000CEA0 83D8FF sbb eax, FFFFFFFF
I'm writing in VS 2012, with openframeworks version of_v0.8.3_vs_release.
Does the decompiled code give any clues as to what's happening in the DLL that causes this behavior?
EDIT
I switched from memcmp to a home-brewed 13-byte compare function, and the homebrew doesn't have the extra instructions. So all I can guess is that the extra assembly is just a flaw in the optimizer.
if (!EQ13(&ti, &m_ti)) { // in 2014, memcmp was not being optimzied here
000007FEF91B2CFE mov rdx,qword ptr [rsp]
000007FEF91B2D02 movzx eax,byte ptr [rsp+0Ch]
000007FEF91B2D07 mov ecx,dword ptr [rsp+8]
000007FEF91B2D0B cmp rdx,qword ptr [r10+28h]
000007FEF91B2D0F jne TSccIter::SetTi+9Dh (7FEF91B2D1Dh)
000007FEF91B2D11 cmp ecx,dword ptr [r10+30h]
000007FEF91B2D15 jne TSccIter::SetTi+9Dh (7FEF91B2D1Dh)
000007FEF91B2D17 cmp al,byte ptr [r10+34h]
000007FEF91B2D1B je TSccIter::SetTi+0B1h (7FEF91B2D31h)
My homebrew isn't perfect in this case since it does 3 movs at the start even though it is unlikely to ever check past the first mov. I need to work on that part.
ORIGINAL QUESTION
Here is asm code from MSVC 2010 showing how it can optimize a small, fixed-size memcmp (in this case, 13 bytes). I've seen this type of optimization a lot in our code, but never with the last 6 lines. Can anyone tell me why the last 6 lines of assembly are there? TransferItem is 13 bytes, so that explains the QWORD, DWORD, then BYTE cmps.
// Record that SetTi compares and copies as raw bytes (memcmp/memcpy over
// sizeof(TransferItem)). Every member is a char or char array, so the size is
// 3 + 3 + 1 + 3 + 3 = 13 bytes.
struct TransferItem {
char m_szCxrMkt1[3];
char m_szCxrOp1[3];
char m_chDelimiter; // SetTi stores '*' or ':' here
char m_szCxrMkt2[3];
char m_szCxrOp2[3];
};
...
if (memcmp(&ti, &m_ti, sizeof(TransferItem))) {
2B8E lea rax,[rsp]
2B92 mov rdx,qword ptr [rax]
2B95 cmp rdx,qword ptr [r10+28h]
2B99 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2B9B mov edx,dword ptr [rax+8]
2B9E cmp edx,dword ptr [r10+30h]
2BA2 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BA4 movzx edx,byte ptr [rax+0Ch]
2BA8 cmp dl,byte ptr [r10+34h]
2BAC jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BAE xor eax,eax
2BB0 jmp TSccIter::SetTi+0A7h (7FEF9302BB7h)
2BB2 sbb eax,eax
2BB4 sbb eax,0FFFFFFFFh
2BB7 test eax,eax
2BB9 je TSccIter::SetTi+0CCh (7FEF9302BDCh)
Also, what is the point of xor eax,eax, which we know will produce zero, and then testing that known-zero value on line 2BB7?
Here is the whole function
// Builds a candidate TransferItem from the arguments and, if it differs from
// the stored m_ti, stores it and flags the query as changed.
//
// fWildCard means match certain fields to '**' in the db
// szCxrMkt1,2 are required and cannot be null, ' ', or '\0\0'.
// szCxrOp1,2 can be null, ' ', or '\0\0'.
//
// Returns *this in every case. Does nothing when m_fSkipSet is set.
// NOTE(review): szCxrMkt1 and szCxrMkt2 are dereferenced by CPY2 without a
// null check, relying on the "required" contract above.
TSccIter& SetTi(bool fWildCard, LPCSTR szCxrMkt1, LPCSTR szCxrOp1, LPCSTR szCxrMkt2, LPCSTR szCxrOp2) {
if (m_fSkipSet)
return *this;
m_iSid = -1; // resets the iterator to search from the start
// Pad the struct to 16 bytes so we can clear it with 2 QWORDS
// We use a temp, ti, to detect if the new transferitem has changed
class TransferItemPadded : public TransferItem {
char padding[16 - sizeof(TransferItem)]; // get us to 16 bytes
} ti;
// Pre-fill all 16 bytes with 0x20 so any byte not overwritten below is a space.
U8(&ti) = U8(BUMP(&ti, 8)) = 0x2020202020202020; // 8 spaces
// copy in the params
// CPY2 copies one WORD, so only the first two chars of each 3-char field are
// written; the third keeps the space from the fill above.
CPY2(ti.m_szCxrMkt1, szCxrMkt1);
// Optional fields are copied only when non-null and non-empty.
if (szCxrOp1 && *szCxrOp1)
CPY2(ti.m_szCxrOp1, szCxrOp1);
ti.m_chDelimiter = (fWildCard) ? '*' : ':'; // this controls wild card matching
CPY2(ti.m_szCxrMkt2, szCxrMkt2);
if (szCxrOp2 && *szCxrOp2)
CPY2(ti.m_szCxrOp2, szCxrOp2);
// see if different
// Only the sizeof(TransferItem) bytes are compared and copied; the padding
// bytes of the temporary are ignored.
if (memcmp(&ti, &m_ti, sizeof(TransferItem))) {
memcpy(&m_ti, &ti, sizeof(TransferItem));
m_fQryChanged = true;
}
return *this;
}
typedef unsigned __int64 U8;
#define CPY2(a,b) ((*(WORD*)a) = (*(WORD*)b))
And here's the whole asm
TSccIter& SetTi(bool fWildCard, LPCSTR szCxrMkt1, LPCSTR szCxrOp1, LPCSTR szCxrMkt2, LPCSTR szCxrOp2) {
2B10 sub rsp,18h
if (m_fSkipSet)
2B14 cmp byte ptr [rcx+0EAh],0
2B1B mov r10,rcx
return *this;
2B1E jne TSccIter::SetTi+0CCh (7FEF9302BDCh)
m_iSid = -1;
class TransferItemPadded : public TransferItem {
char padding[16 - sizeof(TransferItem)];
} ti;
U8(&ti) = U8(BUMP(&ti, 8)) = 0x2020202020202020;
2B24 mov rax,2020202020202020h
2B2E mov byte ptr [rcx+36h],0FFh
2B32 mov qword ptr [rsp],rax
2B36 mov qword ptr [rsp+8],rax
CPY2(ti.m_szCxrMkt1, szCxrMkt1);
2B3B movzx eax,word ptr [r8]
2B3F mov word ptr [rsp],ax
if (szCxrOp1 && *szCxrOp1)
2B43 test r9,r9
2B46 je TSccIter::SetTi+47h (7FEF9302B57h)
2B48 cmp byte ptr [r9],0
2B4C je TSccIter::SetTi+47h (7FEF9302B57h)
CPY2(ti.m_szCxrOp1, szCxrOp1);
2B4E movzx eax,word ptr [r9]
2B52 mov word ptr [rsp+3],ax
ti.m_chDelimiter = (fWildCard) ? '*' : ':';
2B57 mov eax,3Ah
2B5C mov ecx,2Ah
2B61 test dl,dl
2B63 cmovne eax,ecx
2B66 mov byte ptr [rsp+6],al
CPY2(ti.m_szCxrMkt2, szCxrMkt2);
2B6A mov rax,qword ptr [szCxrMkt2]
2B6F movzx ecx,word ptr [rax]
if (szCxrOp2 && *szCxrOp2)
2B72 mov rax,qword ptr [szCxrOp2]
2B77 mov word ptr [rsp+7],cx
2B7C test rax,rax
2B7F je TSccIter::SetTi+7Eh (7FEF9302B8Eh)
2B81 cmp byte ptr [rax],0
2B84 je TSccIter::SetTi+7Eh (7FEF9302B8Eh)
CPY2(ti.m_szCxrOp2, szCxrOp2);
2B86 movzx eax,word ptr [rax]
2B89 mov word ptr [rsp+0Ah],ax
if (memcmp(&ti, &m_ti, sizeof(TransferItem))) {
2B8E lea rax,[rsp]
2B92 mov rdx,qword ptr [rax]
2B95 cmp rdx,qword ptr [r10+28h]
2B99 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2B9B mov edx,dword ptr [rax+8]
2B9E cmp edx,dword ptr [r10+30h]
2BA2 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BA4 movzx edx,byte ptr [rax+0Ch]
2BA8 cmp dl,byte ptr [r10+34h]
2BAC jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BAE xor eax,eax
2BB0 jmp TSccIter::SetTi+0A7h (7FEF9302BB7h)
2BB2 sbb eax,eax
2BB4 sbb eax,0FFFFFFFFh
2BB7 test eax,eax
2BB9 je TSccIter::SetTi+0CCh (7FEF9302BDCh)
memcpy(&m_ti, &ti, sizeof(TransferItem));
2BBB mov rax,qword ptr [rsp]
m_fQryChanged = true;
2BBF mov byte ptr [r10+0E9h],1
2BC7 mov qword ptr [r10+28h],rax
2BCB mov eax,dword ptr [rsp+8]
2BCF mov dword ptr [r10+30h],eax
2BD3 movzx eax,byte ptr [rsp+0Ch]
2BD8 mov byte ptr [r10+34h],al
}
return *this;
2BDC mov rax,r10
}
2bb7 can be reached by different code paths: via taken jumps at 2b99, 2ba2 and 2bac, as well as directly when none of the conditional jumps is taken. The xor eax,eax is only executed at the last path, and it ensures that eax is 0 - which is apparently not the case otherwise.
The last 6 lines return the value in eax == 0 for a match, and also set the SF and ZF condition codes.
test eax, eax will test whether eax AND eax == 0. The following je will jump if zero.
And xor eax, eax is an efficient way to encode "eax = 0". It is more efficient than mov eax, 0
EDIT: Initially misread the question. It looks like something will happen at "TSccIter::SetTi+0A7h" which should change the value?
Also, the SBB trick to replicate the carry(2BB2-2BB4) is explained here:
http://compgroups.net/comp.lang.asm.x86/trick-with-sbb-instruction/20164
I'm using Visual Studio 2010. I wrote a program that does a simple binary search algorithm. I'm trying to convert it into assembly code. I used the disassembler to get the assembly code, and I'm trying to paste it into _asm. I've tried so many ways and it's just not working.
I tried
_asm(" . . . .");
_asm( );
_asm{ } <--- -Currently going with this way for c++. seems to work well.
I've seen people online saying to put '\' at the end of each line. That hasn't worked for me.
Here's the code. I'll comment where the errors are. Well, I have 13 as of now. Some I won't list because they're the same as other errors. Once I fix one or two I should be able to fix them all. The original C++ code for the function is also in there. It's commented out.
// Asker's attempt: Visual Studio disassembler output pasted verbatim into an
// `_asm` block. It does not compile; the trailing "Error N" comments mark the
// lines where the compiler reported C2400. The intended C++ logic is kept in
// the block comment at the end of the function.
// NOTE(review): the jump operands (e.g. `binarySearch+80h (0B51970h)`) are
// disassembler display text rather than labels, and the locals first/last/
// middle/found referenced here are only declared inside the commented-out C++.
bool binarySearch(int searchNum,int myArray[],int size){
_asm{
push ebp
mov ebp,esp
sub esp,0F0h
push ebx
push esi
push edi
lea edi,[ebp-0F0h]
mov ecx,3Ch
mov eax,0CCCCCCCCh
rep stos dword ptr es:[edi]
// 217: int first=0,last=size-1,middle;
mov dword ptr [first],0
mov eax,dword ptr [size] // ERROR! Error 2 error C2400: inline assembler syntax error in 'second operand'; found ']'
sub eax,1
mov dword ptr [last],eax
// 218: bool found = false;
mov byte ptr [found],0
// 220: while (first <= last)
mov eax,dword ptr [first]
cmp eax,dword ptr [last]
jg binarySearch+80h (0B51970h) //ERROR! Error 4 error C2400: inline assembler syntax error in 'second operand'; found '('
// 222: middle = (first + last)/2;
mov eax,dword ptr [first] ; // Error 5 error C2400: inline assembler syntax error in 'opcode'; found '('
add eax,dword ptr [last] ;
cdq
sub eax,edx
sar eax,1
mov dword ptr [middle],eax
// 224: if(searchNum > myArray[middle])
mov eax,dword ptr [middle]
mov ecx,dword ptr [myArray]
mov edx,dword ptr [searchNum]
cmp edx,dword ptr [ecx+eax*4]
jle binarySearch+61h (0B51951h) // Error 8 error C2400: inline assembler syntax error in 'opcode'; found '('
// 226: first = middle +1;
mov eax,dword ptr [middle]
add eax,1
mov dword ptr [first],eax
jmp binarySearch+7Eh (0B5196Eh)
// 228: else if (searchNum < myArray[middle])
mov eax,dword ptr [middle]
mov ecx,dword ptr [myArray]
mov edx,dword ptr [searchNum]
cmp edx,dword ptr [ecx+eax*4]
jge binarySearch+7Ah (0B5196Ah)
// 230: last = middle -1;
mov eax,dword ptr [middle]
sub eax,1
mov dword ptr [last],eax
// 232: else
jmp binarySearch+7Eh (0B5196Eh) // Error 18 error C2400: inline assembler syntax error in 'second operand'; found '('
// 233: return true;
mov al,1 // Error 19 error C2400: inline assembler syntax error in 'opcode'; found '('
jmp binarySearch+82h (0B51972h)
jmp binarySearch+32h (0B51922h) // Error 22 error C2400: inline assembler syntax error in 'opcode'; found '('
// 236: return false;
xor al,al
pop edi
pop esi
pop ebx
mov esp,ebp
pop ebp
ret
};
/*
int first=0,last=size-1,middle;
bool found = false;
while (first <= last)
{
middle = (first + last)/2;
if(searchNum > myArray[middle])
{
first = middle +1;
}
else if (searchNum < myArray[middle])
{
last = middle -1;
}
else
return true;
}
return false;
*/
}
Here is a working version (almost 1:1) of the code you posted, both as inline assembly and as a standalone assembly file.
binsearch.cpp
extern "C"
{
bool BinSearch(int searchNum, int myArray[], int arraySize);
};
// This is the inlined version.
// Binary search for searchNum in myArray[0 .. arraySize-1], written as an
// MSVC `_asm` block that refers to the C++ parameters and locals by name.
// There is no C++ `return` statement: the result is whatever the asm block
// leaves in AL (1 on the Found path, 0 on the NotFound path).
// NOTE(review): presumably myArray must be sorted ascending, as in the test
// data used by main() — confirm with callers.
bool BinSearchInline(int searchNum, int myArray[], int arraySize)
{
int middle;
int first;
int last;
char found; // written once below, never read
_asm
{
push ebx
push esi
push edi
// first = 0; last = arraySize - 1; found = 0
mov first,0
mov eax, arraySize
sub eax,1
mov last ,eax
mov found,0
// while (first <= last)
LocalLoop:
mov eax, first
cmp eax, last
jg NotFound
// middle = (first + last) / 2
mov eax, first
add eax, last
cdq
sub eax,edx
sar eax,1
mov middle,eax
// if (searchNum > myArray[middle]) first = middle + 1
mov eax,middle
mov ecx,myArray
mov edx,searchNum
cmp edx, dword ptr [ecx+eax*4]
jle MaybeLower
mov eax, middle
add eax,1
mov first, eax
jmp WhileLoop
// else if (searchNum < myArray[middle]) last = middle - 1; otherwise found
MaybeLower:
mov eax, middle
mov ecx, myArray
mov edx, searchNum
cmp edx,dword ptr [ecx+eax*4]
jge Found
mov eax, middle
sub eax,1
mov last, eax
jmp WhileLoop
Found:
mov al,1
jmp Done
// Trampoline back to the loop head.
WhileLoop:
jmp LocalLoop
NotFound:
xor al,al
Done:
pop edi
pop esi
pop ebx
};
}
int main(int argc, char*arg[])
{
    // Sorted test data: testvalues[k] == k for k in 0..6.
    int testvalues[7];
    int idx = 0;
    while (idx < 7)
    {
        testvalues[idx] = idx;
        ++idx;
    }

    // Standalone assembly routine first: a miss, then a hit.
    bool missStandalone = BinSearch(8, testvalues, 7); // false, value not in array
    bool hitStandalone = BinSearch(3, testvalues, 7); // true, value is in array.

    // Same two probes against the inline-assembly variant.
    bool missInline = BinSearchInline(8, testvalues, 7); // false
    bool hitInline = BinSearchInline(3, testvalues, 7); // true

    return 0;
}
binsearch.asm
.486
.model flat, C
option casemap :none
.code
; BinSearch - binary search for searchNum in myArray[0 .. arraySize-1].
;   searchNum : value to look for
;   myArray   : pointer to the first DWORD element
;   arraySize : number of elements
; Returns AL = 1 if an element equal to searchNum is found, AL = 0 otherwise.
; NOTE(review): presumably the array must be sorted ascending - confirm with callers.
; Labels carry an @@ prefix so that "Loop" does not clash with the LOOP mnemonic.
BinSearch PROC, searchNum:DWORD, myArray:PTR DWORD, arraySize:DWORD
LOCAL first:DWORD
LOCAL middle:DWORD
LOCAL last:DWORD
LOCAL found:BYTE
push ebx
push esi
push edi
; This block is only for debugging stack errors and should be removed.
; lea edi,[ebp-0F0h]
; mov ecx,3Ch
; mov eax,0CCCCCCCCh
; rep stos dword ptr es:[edi]
; first = 0, last = arraySize - 1
mov dword ptr [first],0
mov eax,dword ptr [arraySize]
sub eax,1
mov dword ptr [last],eax
mov byte ptr [found],0 ; not even used.
; while (first <= last)
@@Loop:
mov eax,dword ptr [first]
cmp eax,dword ptr [last]
jg @@NotFound
; middle = (first + last) / 2  (cdq/sub/sar = signed divide by two)
mov eax,dword ptr [first]
add eax,dword ptr [last]
cdq
sub eax,edx
sar eax,1
mov dword ptr [middle],eax
; if (searchNum > myArray[middle]) first = middle + 1
mov eax,dword ptr [middle]
mov ecx,dword ptr [myArray]
mov edx,dword ptr [searchNum]
cmp edx,dword ptr [ecx+eax*4]
jle @@MaybeLower
mov eax,dword ptr [middle]
add eax,1
mov dword ptr [first],eax
jmp @@WhileLoop
; else if (searchNum < myArray[middle]) last = middle - 1, otherwise it is a match
@@MaybeLower:
mov eax,dword ptr [middle]
mov ecx,dword ptr [myArray]
mov edx,dword ptr [searchNum]
cmp edx,dword ptr [ecx+eax*4]
jge @@Found
mov eax,dword ptr [middle]
sub eax,1
mov dword ptr [last],eax
jmp @@WhileLoop
@@Found:
mov al,1
jmp @@Done
; Trampoline back to the loop head.
@@WhileLoop:
jmp @@Loop
@@NotFound:
xor al,al
@@Done:
pop edi
pop esi
pop ebx
ret
BinSearch ENDP
END