Assembly of constructor for virtual inheritance - c++

Here's a simple inheritance using a virtual base class (code available on Compiler Explorer).
// Plain base class with a single int member; D below inherits it virtually.
class B {
public:
int i = 1; // default member initializer: i starts out as 1
};
// D derives from B as a *virtual* base class, so converting a D* to a B*
// needs the run-time offset lookup seen in main's assembly rather than a
// fixed compile-time adjustment.
class D : virtual public B {
public:
int j = 2; // default member initializer: j starts out as 2
};
// Stores 2 into the i member of the B object that b points to.
// b is dereferenced unconditionally, so it must not be null.
void Assign(B *b) {
b->i = 2;
}
// Allocates a D, converts the resulting D* to a B* and hands it to Assign.
int main() {
B *b = new D(); // implicit D* -> B* conversion across a virtual base
Assign(b);
// NOTE(review): the object allocated above is never deleted in this snippet.
return 0;
}
The assembly listing of the main() function looks like this:
09 main: # #main
10 push rbp
11 mov rbp, rsp
12 sub rsp, 32
13 mov eax, 16
14 mov edi, eax # argument for operator new: 16 bytes
15 mov dword ptr [rbp - 4], 0
16 call operator new(unsigned long)
17 xor esi, esi # memset fill value = 0
18 mov ecx, 16
19 mov edx, ecx # memset length = 16
20 mov rdi, rax # memset destination = pointer returned by operator new
21 mov qword ptr [rbp - 24], rax # 8-byte Spill
22 call memset
23 mov rdi, qword ptr [rbp - 24] # 8-byte Reload
24 call D::D() [complete object constructor]
25 xor ecx, ecx
26 mov eax, ecx # rax = 0: the B* result used when the D* is null
27 mov rdx, qword ptr [rbp - 24] # 8-byte Reload
28 cmp rdx, 0 # null check on the D*
29 mov qword ptr [rbp - 32], rax # 8-byte Spill
30 je .LBB1_2
31 mov rax, qword ptr [rbp - 24] # 8-byte Reload
32 mov rcx, qword ptr [rax] # first 8 bytes of the object (presumably its vtable pointer)
33 mov rcx, qword ptr [rcx - 24] # value stored 24 bytes before that address (presumably the virtual-base offset)
34 add rax, rcx # rax = D* + offset = address of the B subobject
35 mov qword ptr [rbp - 32], rax # 8-byte Spill
36 .LBB1_2:
37 mov rax, qword ptr [rbp - 32] # 8-byte Reload
38 mov qword ptr [rbp - 16], rax # store the converted pointer into the local at [rbp - 16]
39 mov rdi, qword ptr [rbp - 16]
40 call Assign(B*)
41 xor eax, eax # return 0
42 add rsp, 32
43 pop rbp
44 ret
What is the effect of line 27-38 of the assembly?
What is the value of rax in line 29?
Why is there a branch statement?

The effect of lines 27-38 is to convert a D * to a B *. Because B is a virtual base class, it can have a variable offset from the start of D. Those 12 lines calculate where the B object is, in an unoptimized way.
The value of rax on line 29 is 0 (see lines 25-26: ecx is zeroed and copied into eax, and writing eax also clears the upper half of rax).
The branch statement on line 30 is the result of a NULL pointer check. If the pointer to D is NULL, the conversion to a B * will also be NULL and the extra code to determine the correct offset is not wanted in that case.

Related

vector bool compiler xor specialization?

I was thinking again about implementing the quadratic sieve for fun, which requires Gaussian elimination over a binary field, that is the operations required are 1. swapping rows and 2. XORing rows.
My ideas were either to maintain a bit array using a vector of 64-bit ints and bit twiddling, or use vector<bool>, which is probably space-optimized on my system. The bit array must be able to be dynamically sized, so std::bitset won't work. The advantage of maintaining my own ints is that I can XOR 64 bits at a time which is a neat trick. I wanted to see what a compiler would do for a loop that XOR'd bool vectors: (I wasn't able to use ^=, see operator |= on std::vector<bool>)
// XORs b into a element by element: a[i] = a[i] ^ b[i] for every index of a.
// NOTE(review): b is indexed up to a.size(), so b.size() >= a.size() is
// presumably a precondition — confirm with callers.
void xor_vector(std::vector<bool>& a, std::vector<bool>& b) {
for (std::size_t i=0; i<a.size(); ++i)
a[i] = a[i] ^ b[i];
}
I have a very basic understanding of x86 but it looks like the compiler isn't actually XORing words together? Is there a way to get the compiler to XOR entire words at a time?
https://godbolt.org/z/PbGdv3sKT
xor_vector(std::vector<bool, std::allocator<bool> >&, std::vector<bool, std::allocator<bool> >&):
# Handles ONE bit per loop iteration: it builds a single-bit mask, tests that
# bit in the current 64-bit word of a and of b, then sets or clears the bit in
# a's word. No whole-word XOR of the two vectors is performed.
mov r8, QWORD PTR [rdi] # first field of a; used below as the base address of its words
mov rax, QWORD PTR [rdi+16]
mov edx, DWORD PTR [rdi+24]
sub rax, r8
lea rdi, [rdx+rax*8] # rdi = loop bound: byte distance * 8 plus the value at [a+24]
test rdi, rdi
je .L11 # nothing to do when the bound is zero
push rbp
mov r10d, 1 # constant 1, shifted below to form the bit mask
push rbx
mov r9, QWORD PTR [rsi] # first field of b; base address of its words
xor esi, esi # rsi = i = 0
jmp .L7
.L16:
mov rdx, r10
sal rdx, cl # rdx = mask = 1 << (i % 64)
mov rcx, QWORD PTR [r11] # rcx = current word of a
mov rbp, rdx
test rdx, rcx
setne bl # bl = bit i of a
and rbp, QWORD PTR [rax]
setne bpl # bpl = bit i of b
.L4:
mov rax, rdx
not rdx
or rax, rcx # rax = a's word with the bit set
and rdx, rcx # rdx = a's word with the bit cleared
cmp bpl, bl
cmovne rdx, rax # bits differ -> result bit is 1, otherwise 0
add rsi, 1 # ++i
mov QWORD PTR [r11], rdx # write the updated word back to a
cmp rsi, rdi
je .L15
.L7:
# Split i into a word byte-offset (rax) and a bit position (rcx) using signed
# division/remainder by 64.
test rsi, rsi
lea rax, [rsi+63]
mov rdx, rsi
cmovns rax, rsi
sar rdx, 63
shr rdx, 58
sar rax, 6
lea rcx, [rsi+rdx]
sal rax, 3
and ecx, 63
lea r11, [r8+rax] # r11 = address of a's word
add rax, r9 # rax = address of b's word
sub rcx, rdx
jns .L16 # non-negative remainder: use the common path above
add rcx, 64 # negative remainder: step back one word and fix up the bit position
mov rdx, r10
sal rdx, cl
mov rcx, QWORD PTR [r11-8]
mov rbp, rdx
test rcx, rdx
setne bl
and rbp, QWORD PTR [rax-8]
setne bpl
sub r11, 8
jmp .L4
.L15:
pop rbx
pop rbp
ret
.L11:
ret
My question is similar to bitwise operations on vector<bool> but the answers are dated and don't seem to answer my question.
Update: I tested with a 256 bit sized bitset too. Still I don't see XORing whole machine words.
// Same bit-by-bit XOR as the vector<bool> version, on fixed-size 256-bit bitsets:
// a[i] = a[i] ^ b[i] for i in [0, a.size()).
void xor_vector(std::bitset<256>& a, std::bitset<256>& b) {
for (std::size_t i=0; i<a.size(); ++i)
a[i] = a[i] ^ b[i];
}
https://godbolt.org/z/jKEf89E1j
xor_vector(std::bitset<256ul>&, std::bitset<256ul>&):
# One bit per iteration, 256 iterations: the bit is tested in both words and
# then set or cleared in a's word. The two bitsets are never XORed word-wise.
push rbx
mov r8, rdi # r8 = &a
mov r11, rsi # r11 = &b
xor edx, edx # rdx = i = 0
mov ebx, 1 # constant 1, shifted below to form the bit mask
.L4:
mov rsi, rdx
mov rcx, rdx
mov rax, rbx
shr rsi, 6 # rsi = word index = i / 64
and ecx, 63 # cl = bit position = i % 64
sal rax, cl # rax = mask = 1 << (i % 64)
mov rdi, QWORD PTR [r8+rsi*8] # rdi = current word of a
mov rcx, rax
and rcx, QWORD PTR [r11+rsi*8] # sets flags from (mask & b's word)
mov rcx, rax # mov leaves the flags untouched
setne r10b # r10b = bit i of b
test rax, rdi
not rax # not leaves the flags untouched
setne r9b # r9b = bit i of a
or rcx, rdi # rcx = a's word with the bit set
and rax, rdi # rax = a's word with the bit cleared
cmp r10b, r9b
cmovne rax, rcx # bits differ -> result bit is 1, otherwise 0
add rdx, 1 # ++i
mov QWORD PTR [r8+rsi*8], rax # write the updated word back to a
cmp rdx, 256
jne .L4
pop rbx
ret

Modulus in Assembly x64 linux question C++ [duplicate]

This question already has answers here:
Why does GCC use multiplication by a strange number in implementing integer division?
(5 answers)
Divide Signed Integer By 2 compiles to complex assembly output, not just a shift
(1 answer)
Closed 1 year ago.
I have these functions in C++
// Computes a / 2 (signed integer division) into a local that is never used.
// NOTE(review): declared to return int but has no return statement; flowing
// off the end of a value-returning function is undefined behavior in C++.
int f1(int a)
{
int x = a / 2;
}
// Computes a % 2 (signed remainder) into a local that is never used.
// NOTE(review): declared to return int but has no return statement; flowing
// off the end of a value-returning function is undefined behavior in C++.
int f2(int a)
{
int y = a % 2;
}
// Computes a % 7 (signed remainder by a constant) into a local that is never used.
// NOTE(review): declared to return int but has no return statement; flowing
// off the end of a value-returning function is undefined behavior in C++.
int f3(int a)
{
int z = a % 7;
}
// Computes a % b (signed remainder by a run-time divisor) into a local that is never used.
// b == 0 is not checked.
// NOTE(review): declared to return int but has no return statement; flowing
// off the end of a value-returning function is undefined behavior in C++.
int f4(int a,int b)
{
int xy = a % b;
}
I looked at their assembly code but couldn't understand what it is doing. I couldn't even find a good reference or an explained example for it. Here is the assembly:
f1(int):
# x = a / 2 with round-toward-zero semantics, done with shifts instead of idiv.
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi # spill argument a
mov eax, DWORD PTR [rbp-20]
mov edx, eax
shr edx, 31 # edx = sign bit of a (1 if a < 0, else 0)
add eax, edx # add 1 to negative values so the shift rounds toward zero
sar eax # arithmetic shift right by 1
mov DWORD PTR [rbp-4], eax # x
nop
pop rbp
ret
f2(int):
# y = a % 2 with the sign of a (result is -1, 0 or 1), without idiv.
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi # spill argument a
mov eax, DWORD PTR [rbp-20]
cdq # edx = sign extension of eax (0 or -1)
shr edx, 31 # edx = 1 if a < 0, else 0
add eax, edx # bias negative values by +1
and eax, 1 # keep the low bit
sub eax, edx # remove the bias: odd negative a gives -1, even gives 0
mov DWORD PTR [rbp-4], eax # y
nop
pop rbp
ret
f3(int):
# z = a % 7, computed as a - 7 * (a / 7); the quotient comes from a multiply
# by a fixed-point reciprocal of 7 instead of idiv.
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi # spill argument a
mov eax, DWORD PTR [rbp-20]
movsx rdx, eax # rdx = a sign-extended to 64 bits
imul rdx, rdx, -1840700269 # multiply by 0x92492493 read as signed, i.e. ceil(2^34 / 7) - 2^32
shr rdx, 32 # keep the high 32 bits of the product
add edx, eax # add a back to compensate for the - 2^32 in the constant
sar edx, 2 # edx = floor(a * ceil(2^34 / 7) / 2^34)
mov esi, eax
sar esi, 31 # esi = -1 if a < 0, else 0
mov ecx, edx
sub ecx, esi # ecx = q = a / 7 (adds 1 for negative a: round toward zero)
mov edx, ecx
sal edx, 3 # edx = 8 * q
sub edx, ecx # edx = 7 * q
sub eax, edx # eax = a - 7 * q = a % 7
mov DWORD PTR [rbp-4], eax # z
nop
pop rbp
ret
f4(int, int):
# xy = a % b with a run-time divisor: a real idiv is required here.
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi # spill argument a
mov DWORD PTR [rbp-24], esi # spill argument b
mov eax, DWORD PTR [rbp-20]
cdq # sign-extend eax into edx:eax, the dividend idiv expects
idiv DWORD PTR [rbp-24] # eax = quotient, edx = remainder
mov DWORD PTR [rbp-4], edx # xy = remainder
nop
pop rbp
ret
Can you please explain, with an example or step by step, how each of these cases computes its answer, and why this works just as well as a normal divide?

MSVC++ inline assembly unhandled exception 0x80000004: Single step

I am writing code using inline asm with VC++ 2019 (32-bit). I have written a function to switch coroutines; the source code is shown below.
I tested it and it works well. The argument is a uintptr_t array that contains the register values. This function will exchange the register values except ebx.
The problem is the "Unhandled exception at 0x5514704E (pevm.dll) in tool.exe: 0x80000004: Single step.".
Register value : EAX = 00000246 EBX = 0019F5A0 ECX = E2F13240 EDX = 0019F5A0 ESI = 0019F3A8 EDI = 0019F3C8 EIP = 5514704E ESP = 0019F2BC EBP = 0019F2C0 EFL = 00000202
I cannot understand why "pop eax" throws an exception.
Maybe my code destroys some "internal data structure" and the program just happens to stop here, as with a double free. Any suggestions on how to debug this?
// Swaps the current register context with the one stored in vreg and then
// returns to the address found on the newly installed stack.
// Slots of vreg used below (the disassembly shows `type int` evaluating to 4):
//   [0] eflags  [1] eax  [3] ecx  [4] edx  [5] esi  [6] edi  [7] ebp  [8] esp
// Slot [2] is not touched here (presumably reserved for ebx — confirm).
// ebx is overwritten with vreg and is not saved or restored.
inline __declspec(naked) void switchCoroutine(uintptr_t* vreg)
{
//discard ebx
__asm
{
push ebp
mov ebp, esp
//save
push eax
//argument
mov ebx, [ebp + 8]
//exchange eflags
pushfd
pop eax
push[ebx]
// popfd installs the saved flags image from slot [0]; if that image has TF
// set, a single-step trap is raised after the instruction that follows.
popfd
mov[ebx], eax
pop eax
//exchange eax ,ecx,edx,esi,edi
XCHG eax, [ebx + type int]
xchg ecx, [ebx + 3 * type int]
xchg edx, [ebx + 4 * type int]
xchg esi, [ebx + 5 * type int]
xchg edi, [ebx + 6 * type int]
//exchange ebp,esp
// tear down the local frame first so the swapped esp is the value right
// after the caller's ebp was popped (pointing at the return address)
mov esp, ebp
pop ebp
xchg ebp, [ebx + 7 * type int]
xchg esp, [ebx + 8 * type int]
//go eip
// ret pops its target from the stack just switched to
ret
}
}
55147031 C2 04 00 ret 4
--- No source file -------------------------------------------------------------
55147034 CC int 3
55147035 CC int 3
55147036 CC int 3
55147037 CC int 3
55147038 CC int 3
55147039 CC int 3
5514703A CC int 3
5514703B CC int 3
5514703C CC int 3
5514703D CC int 3
5514703E CC int 3
5514703F CC int 3
--- D:\code\c++\PEVM\core\vm\vdata.h -------------------------------------------
643: //discard ebx
644: __asm
645: {
646: push ebp
55147040 55 push ebp
647: mov ebp, esp
55147041 8B EC mov ebp,esp
648: //save
649: push eax
55147043 50 push eax
650: //argument
651: mov ebx, [ebp + 8]
55147044 8B 5D 08 mov ebx,dword ptr [vreg]
652:
653: //exchange eflags
654: pushfd
55147047 9C pushfd
655: pop eax
55147048 58 pop eax
656: push[ebx]
55147049 FF 33 push dword ptr [ebx]
657: popfd
5514704B 9D popfd
658: mov[ebx], eax
5514704C 89 03 mov dword ptr [ebx],eax
659:
660: pop eax
5514704E 58 pop eax //HERE **Unhandled exception at 0x5514704E (pevm.dll) in tool.exe: 0x80000004: Single step.**
661: //exchange eax ,ecx,edx,esi,edi
662: XCHG eax, [ebx + type int]
5514704F 87 43 04 xchg eax,dword ptr [ebx+4]
663: xchg ecx, [ebx + 3 * type int]
55147052 87 4B 0C xchg ecx,dword ptr [ebx+0Ch]
664: xchg edx, [ebx + 4 * type int]
55147055 87 53 10 xchg edx,dword ptr [ebx+10h]
665: xchg esi, [ebx + 5 * type int]
55147058 87 73 14 xchg esi,dword ptr [ebx+14h]
666: xchg edi, [ebx + 6 * type int]
5514705B 87 7B 18 xchg edi,dword ptr [ebx+18h]
667:
668: //exchange ebp,esp
669: mov esp, ebp
5514705E 8B E5 mov esp,ebp
670: pop ebp
55147060 5D pop ebp
671: xchg ebp, [ebx + 7 * type int]
55147061 87 6B 1C xchg ebp,dword ptr [ebx+1Ch]
672: xchg esp, [ebx + 8 * type int]
55147064 87 63 20 xchg esp,dword ptr [ebx+20h]
673:
674: //go eip
675: ret
55147067 C3 ret
--- No source file -------------------------------------------------------------
55147068 CC int 3
55147069 CC int 3
5514706A CC int 3
5514706B CC int 3
5514706C CC int 3
5514706D CC int 3
5514706E CC int 3
5514706F CC int 3
At 0x5514704B you set EFLAGS. When it has the TF flag set, a debug exception (#DB) will be generated by the CPU after the next executed instruction. The next instruction after popfd is mov[ebx], eax, thus the exception is generated after its execution. Since #DB is a trap, eip points to the address after the executed instruction, pop eax in your case.
Check whether the value pushed by push[ebx] at 0x55147049 has the TF bit (bit 8, mask 0x100) set.

One writer and multiple readers - 256bit - AVX - atomic [duplicate]

This question already has answers here:
Why does clang produce inefficient asm with -O0 (for this simple floating point sum)?
(1 answer)
SSE instructions: which CPUs can do atomic 16B memory operations?
(7 answers)
Largest data type which can be fetch-ANDed atomically?
(2 answers)
Closed 2 years ago.
Would like to write 256bit of data on one core and read it on another one. So there will be only one process to write and can be multiple readers.
Was thinking to implement it using AVX. The reads and writes should be atomic since they are only 1 instruction (vmovdqa) and if aligned by cache line cache coherency would move the data atomically between cores.
Looked at the generated assembly but can see 2 writes and 2 reads. Why is not there just one? Would this solution for atomic read/write work given the assumptions?
#include <immintrin.h>
#include <cstdint>
// Payload to be transferred: 4 x 64-bit integers = 32 bytes (256 bits).
struct Data {
int64_t a[4];
};
// Holds one Data value aligned to 64 bytes and copies it in and out with
// 256-bit AVX intrinsics.
// NOTE(review): both accessors dereference the caller's pointer as an __m256i
// and use the aligned-store intrinsic, so `in`/`out` presumably must be
// 32-byte aligned — confirm; Data itself declares no such alignment.
struct DataHolder {
// Copies *in into data_ using a single 256-bit aligned store intrinsic.
void set_data(Data* in) {
_mm256_store_si256(reinterpret_cast<__m256i *>(&data_), *reinterpret_cast<__m256i *>(in));
}
// Copies data_ into *out using a single 256-bit aligned store intrinsic.
void get_data(Data* out) {
_mm256_store_si256(reinterpret_cast<__m256i *>(out), *reinterpret_cast<__m256i *>(&data_));
}
alignas(64) Data data_;
// 64 - sizeof(Data) = 32 bytes of filler after the 32-byte payload
char padding [64 - sizeof(Data)];
};
// Minimal driver: stores a into the holder, then reads it back into b.
int main() {
// NOTE(review): a is never initialized before set_data reads it.
Data a, b;
DataHolder ab;
ab.set_data(&a);
ab.get_data(&b);
}
DataHolder::set_data(Data*):
# Two vmovdqa loads and two vmovdqa stores appear because the 256-bit value is
# copied through a stack temporary at [rsp-64] (this looks like unoptimized
# output); only the final store writes the destination object.
push rbp
mov rbp, rsp
and rsp, -32 # align the stack to 32 bytes for the YMM temporary
mov QWORD PTR [rsp-72], rdi # spill this
mov QWORD PTR [rsp-80], rsi # spill in
mov rax, QWORD PTR [rsp-80]
vmovdqa ymm0, YMMWORD PTR [rax] # load 1: read *in
mov rax, QWORD PTR [rsp-72]
mov QWORD PTR [rsp-8], rax # destination pointer (this; data_ is the first member)
vmovdqa YMMWORD PTR [rsp-64], ymm0 # store 1: copy into the stack temporary
mov rax, QWORD PTR [rsp-8]
vmovdqa ymm0, YMMWORD PTR [rsp-64] # load 2: reload the temporary
vmovdqa YMMWORD PTR [rax], ymm0 # store 2: the actual write to data_
nop
nop
leave
ret
DataHolder::get_data(Data*):
# Mirror image of set_data: the value read from data_ goes through a stack
# temporary at [rsp-64] before being stored to *out, hence two loads and two
# stores; only the first load reads the shared object.
push rbp
mov rbp, rsp
and rsp, -32 # align the stack to 32 bytes for the YMM temporary
mov QWORD PTR [rsp-72], rdi # spill this
mov QWORD PTR [rsp-80], rsi # spill out
mov rax, QWORD PTR [rsp-72]
vmovdqa ymm0, YMMWORD PTR [rax] # load 1: the actual read of data_
mov rax, QWORD PTR [rsp-80]
mov QWORD PTR [rsp-8], rax # destination pointer (out)
vmovdqa YMMWORD PTR [rsp-64], ymm0 # store 1: copy into the stack temporary
mov rax, QWORD PTR [rsp-8]
vmovdqa ymm0, YMMWORD PTR [rsp-64] # load 2: reload the temporary
vmovdqa YMMWORD PTR [rax], ymm0 # store 2: write *out
nop
nop
leave
ret
main:
# Stack layout after the prologue: ab at [rsp], b at [rsp+64], a at [rsp+96].
push rbp
mov rbp, rsp
and rsp, -64 # align the frame to 64 bytes (DataHolder is alignas(64))
add rsp, -128 # reserve 128 bytes of locals
lea rdx, [rsp+96] # &a
mov rax, rsp # &ab
mov rsi, rdx
mov rdi, rax
call DataHolder::set_data(Data*)
lea rdx, [rsp+64] # &b
mov rax, rsp # &ab
mov rsi, rdx
mov rdi, rax
call DataHolder::get_data(Data*)
mov eax, 0 # return 0
leave
ret

Trying to understand ASM code

EDIT
I switched from memcmp to a home-brewed 13-byte compare function, and the homebrew doesn't have the extra instructions. So all I can guess is that the extra assembly is just a flaw in the optimizer.
if (!EQ13(&ti, &m_ti)) { // in 2014, memcmp was not being optimzied here
000007FEF91B2CFE mov rdx,qword ptr [rsp]
000007FEF91B2D02 movzx eax,byte ptr [rsp+0Ch]
000007FEF91B2D07 mov ecx,dword ptr [rsp+8]
000007FEF91B2D0B cmp rdx,qword ptr [r10+28h]
000007FEF91B2D0F jne TSccIter::SetTi+9Dh (7FEF91B2D1Dh)
000007FEF91B2D11 cmp ecx,dword ptr [r10+30h]
000007FEF91B2D15 jne TSccIter::SetTi+9Dh (7FEF91B2D1Dh)
000007FEF91B2D17 cmp al,byte ptr [r10+34h]
000007FEF91B2D1B je TSccIter::SetTi+0B1h (7FEF91B2D31h)
My homebrew isn't perfect in this case since it does 3 movs at the start even though it is unlikely to ever check past the first mov. I need to work on that part.
ORIGINAL QUESTION
Here is asm code from msvc 2010 showing how it can optimize a small, fixed-sized memcmp (in this case, 13 bytes). I've seen this type of optimization a lot in our code, but never with the last 6 lines. Can anyone tell me why the last 6 lines of assembly are there? TransferItem is 13 bytes so that explains the QWORD, DWORD, then BYTE cmps.
// Five char fields totalling 3 + 3 + 1 + 3 + 3 = 13 bytes.
struct TransferItem {
char m_szCxrMkt1[3];
char m_szCxrOp1[3];
char m_chDelimiter; // set to '*' or ':' by SetTi
char m_szCxrMkt2[3];
char m_szCxrOp2[3];
};
...
if (memcmp(&ti, &m_ti, sizeof(TransferItem))) {
2B8E lea rax,[rsp]
2B92 mov rdx,qword ptr [rax]
2B95 cmp rdx,qword ptr [r10+28h]
2B99 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2B9B mov edx,dword ptr [rax+8]
2B9E cmp edx,dword ptr [r10+30h]
2BA2 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BA4 movzx edx,byte ptr [rax+0Ch]
2BA8 cmp dl,byte ptr [r10+34h]
2BAC jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BAE xor eax,eax
2BB0 jmp TSccIter::SetTi+0A7h (7FEF9302BB7h)
2BB2 sbb eax,eax
2BB4 sbb eax,0FFFFFFFFh
2BB7 test eax,eax
2BB9 je TSccIter::SetTi+0CCh (7FEF9302BDCh)
Also, what is the point of xor eax,eax, which we know will yield zero, followed by testing that known-zero value at 2BB7?
Here is the whole function
// fWildCard means match certain fields to '**' in the db
// szCxrMkt1,2 are required and cannot be null, ' ', or '\0\0'.
// szCxrOp1,2 can be null, ' ', or '\0\0'.
// Builds a space-filled TransferItem from the arguments and, only when it
// differs from the stored m_ti, copies it into m_ti and sets m_fQryChanged.
// Returns *this in every case; returns immediately when m_fSkipSet is set.
TSccIter& SetTi(bool fWildCard, LPCSTR szCxrMkt1, LPCSTR szCxrOp1, LPCSTR szCxrMkt2, LPCSTR szCxrOp2) {
if (m_fSkipSet)
return *this;
m_iSid = -1; // resets the iterator to search from the start
// Pad the struct to 16 bytes so we can clear it with 2 QWORDS
// We use a temp, ti, to detect if the new transferitem has changed
class TransferItemPadded : public TransferItem {
char padding[16 - sizeof(TransferItem)]; // get us to 16 bytes
} ti;
// NOTE(review): BUMP is not defined in this excerpt; presumably it offsets the pointer by 8 bytes — confirm.
U8(&ti) = U8(BUMP(&ti, 8)) = 0x2020202020202020; // 8 spaces
// copy in the params
// CPY2 copies 2 bytes (one WORD) from the argument string into the field
CPY2(ti.m_szCxrMkt1, szCxrMkt1);
// optional fields are left as spaces when the pointer is null or the string is empty
if (szCxrOp1 && *szCxrOp1)
CPY2(ti.m_szCxrOp1, szCxrOp1);
ti.m_chDelimiter = (fWildCard) ? '*' : ':'; // this controls wild card matching
CPY2(ti.m_szCxrMkt2, szCxrMkt2);
if (szCxrOp2 && *szCxrOp2)
CPY2(ti.m_szCxrOp2, szCxrOp2);
// see if different
// only sizeof(TransferItem) bytes are compared and copied, so the padding member is ignored
if (memcmp(&ti, &m_ti, sizeof(TransferItem))) {
memcpy(&m_ti, &ti, sizeof(TransferItem));
m_fQryChanged = true;
}
return *this;
}
typedef unsigned __int64 U8;
#define CPY2(a,b) ((*(WORD*)a) = (*(WORD*)b))
And here's the whole asm
TSccIter& SetTi(bool fWildCard, LPCSTR szCxrMkt1, LPCSTR szCxrOp1, LPCSTR szCxrMkt2, LPCSTR szCxrOp2) {
2B10 sub rsp,18h
if (m_fSkipSet)
2B14 cmp byte ptr [rcx+0EAh],0
2B1B mov r10,rcx
return *this;
2B1E jne TSccIter::SetTi+0CCh (7FEF9302BDCh)
m_iSid = -1;
class TransferItemPadded : public TransferItem {
char padding[16 - sizeof(TransferItem)];
} ti;
U8(&ti) = U8(BUMP(&ti, 8)) = 0x2020202020202020;
2B24 mov rax,2020202020202020h
2B2E mov byte ptr [rcx+36h],0FFh
2B32 mov qword ptr [rsp],rax
2B36 mov qword ptr [rsp+8],rax
CPY2(ti.m_szCxrMkt1, szCxrMkt1);
2B3B movzx eax,word ptr [r8]
2B3F mov word ptr [rsp],ax
if (szCxrOp1 && *szCxrOp1)
2B43 test r9,r9
2B46 je TSccIter::SetTi+47h (7FEF9302B57h)
2B48 cmp byte ptr [r9],0
2B4C je TSccIter::SetTi+47h (7FEF9302B57h)
CPY2(ti.m_szCxrOp1, szCxrOp1);
2B4E movzx eax,word ptr [r9]
2B52 mov word ptr [rsp+3],ax
ti.m_chDelimiter = (fWildCard) ? '*' : ':';
2B57 mov eax,3Ah
2B5C mov ecx,2Ah
2B61 test dl,dl
2B63 cmovne eax,ecx
2B66 mov byte ptr [rsp+6],al
CPY2(ti.m_szCxrMkt2, szCxrMkt2);
2B6A mov rax,qword ptr [szCxrMkt2]
2B6F movzx ecx,word ptr [rax]
if (szCxrOp2 && *szCxrOp2)
2B72 mov rax,qword ptr [szCxrOp2]
2B77 mov word ptr [rsp+7],cx
2B7C test rax,rax
2B7F je TSccIter::SetTi+7Eh (7FEF9302B8Eh)
2B81 cmp byte ptr [rax],0
2B84 je TSccIter::SetTi+7Eh (7FEF9302B8Eh)
CPY2(ti.m_szCxrOp2, szCxrOp2);
2B86 movzx eax,word ptr [rax]
2B89 mov word ptr [rsp+0Ah],ax
if (memcmp(&ti, &m_ti, sizeof(TransferItem))) {
2B8E lea rax,[rsp]
2B92 mov rdx,qword ptr [rax]
2B95 cmp rdx,qword ptr [r10+28h]
2B99 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2B9B mov edx,dword ptr [rax+8]
2B9E cmp edx,dword ptr [r10+30h]
2BA2 jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BA4 movzx edx,byte ptr [rax+0Ch]
2BA8 cmp dl,byte ptr [r10+34h]
2BAC jne TSccIter::SetTi+0A2h (7FEF9302BB2h)
2BAE xor eax,eax
2BB0 jmp TSccIter::SetTi+0A7h (7FEF9302BB7h)
2BB2 sbb eax,eax
2BB4 sbb eax,0FFFFFFFFh
2BB7 test eax,eax
2BB9 je TSccIter::SetTi+0CCh (7FEF9302BDCh)
memcpy(&m_ti, &ti, sizeof(TransferItem));
2BBB mov rax,qword ptr [rsp]
m_fQryChanged = true;
2BBF mov byte ptr [r10+0E9h],1
2BC7 mov qword ptr [r10+28h],rax
2BCB mov eax,dword ptr [rsp+8]
2BCF mov dword ptr [r10+30h],eax
2BD3 movzx eax,byte ptr [rsp+0Ch]
2BD8 mov byte ptr [r10+34h],al
}
return *this;
2BDC mov rax,r10
}
2bb7 can be reached by different code paths: via taken jumps at 2b99, 2ba2 and 2bac, as well as directly when none of the conditional jumps is taken. The xor eax,eax is only executed at the last path, and it ensures that eax is 0 - which is apparently not the case otherwise.
The last 6 lines return the value in eax == 0 for a match, and also set the SF and ZF condition codes.
test eax, eax will test whether eax AND eax == 0. The following je will jump if zero.
And xor eax, eax is an efficient way to encode "eax = 0". It is more efficient than mov eax, 0
EDIT: Initially misread the question. It looks like something will happen at "TSccIter::SetTi+0A7h" which should change the value?
Also, the SBB trick to replicate the carry(2BB2-2BB4) is explained here:
http://compgroups.net/comp.lang.asm.x86/trick-with-sbb-instruction/20164