Plain C++ Code 10 times faster than inline assembler. Why? - c++

These two code snippets do the same thing: Adding two float arrays together and storing the result back into them.
Inline Assembler:
// Adds two 4-float vectors with SSE and stores the sum back into BOTH arrays.
// Assumes v1 and v2 each point to at least 4 floats; movups is the unaligned
// load/store form, so no alignment is required. MSVC 32-bit inline assembly.
void vecAdd_SSE(float* v1, float* v2) {
_asm {
mov esi, v1          // esi = v1
mov edi, v2          // edi = v2
movups xmm0, [esi]   // xmm0 = v1[0..3] (unaligned packed load)
movups xmm1, [edi]   // xmm1 = v2[0..3]
addps xmm0, xmm1     // xmm0 = v1 + v2, element-wise
movups [esi], xmm0   // v1[0..3] = sum
movups [edi], xmm0   // v2[0..3] = sum (both arrays receive the result)
}
}
Plain C++ Code:
// Scalar equivalent of vecAdd_SSE: element-wise add of two 4-float arrays,
// then copy the sums into v2 as well, so both arrays end up holding v1+v2.
void vecAdd_Std(float* v1, float* v2) {
v1[0] = v1[0]+ v2[0];  // v1[i] += v2[i], one element at a time
v1[1] = v1[1]+ v2[1];
v1[2] = v1[2]+ v2[2];
v1[3] = v1[3]+ v2[3];
v2[0] = v1[0];         // mirror the sums into v2
v2[1] = v1[1];
v2[2] = v1[2];
v2[3] = v1[3];
}
Disassembly for C++ Code (Disassembly made in Debug mode because I cannot view the Disassembly in Release mode for some reason):
void vecAdd_Std(float* v1, float* v2) {
push ebp
mov ebp,esp
sub esp,0C0h
push ebx
push esi
push edi
lea edi,[ebp-0C0h]
mov ecx,30h
mov eax,0CCCCCCCCh
rep stos dword ptr es:[edi]
v1[0] = v1[0]+ v2[0];
mov eax,4
imul ecx,eax,0
mov edx,4
imul eax,edx,0
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+ecx]
addss xmm0,dword ptr [esi+eax]
mov eax,4
imul ecx,eax,0
mov edx,dword ptr [v1]
movss dword ptr [edx+ecx],xmm0
v1[1] = v1[1]+ v2[1];
mov eax,4
shl eax,0
v1[1] = v1[1]+ v2[1];
mov ecx,4
shl ecx,0
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+eax]
addss xmm0,dword ptr [esi+ecx]
mov eax,4
shl eax,0
mov ecx,dword ptr [v1]
movss dword ptr [ecx+eax],xmm0
v1[2] = v1[2]+ v2[2];
mov eax,4
shl eax,1
mov ecx,4
shl ecx,1
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+eax]
addss xmm0,dword ptr [esi+ecx]
mov eax,4
shl eax,1
mov ecx,dword ptr [v1]
movss dword ptr [ecx+eax],xmm0
v1[3] = v1[3]+ v2[3];
mov eax,4
imul ecx,eax,3
mov edx,4
imul eax,edx,3
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+ecx]
addss xmm0,dword ptr [esi+eax]
mov eax,4
imul ecx,eax,3
mov edx,dword ptr [v1]
movss dword ptr [edx+ecx],xmm0
v2[0] = v1[0];
mov eax,4
imul ecx,eax,0
mov edx,4
imul eax,edx,0
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov ecx,dword ptr [esi+ecx]
mov dword ptr [edx+eax],ecx
v2[1] = v1[1];
mov eax,4
shl eax,0
mov ecx,4
shl ecx,0
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov eax,dword ptr [esi+eax]
mov dword ptr [edx+ecx],eax
v2[2] = v1[2];
mov eax,4
shl eax,1
mov ecx,4
shl ecx,1
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov eax,dword ptr [esi+eax]
mov dword ptr [edx+ecx],eax
v2[3] = v1[3];
mov eax,4
imul ecx,eax,3
mov edx,4
imul eax,edx,3
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov ecx,dword ptr [esi+ecx]
mov dword ptr [edx+eax],ecx
}
Now I made a time measurement on those two functions and noticed that the inline assembler code takes approximately 10 times longer (in Release mode).
Does anybody know why?

On my machine (VS2015 64-bit mode), the compiler inlines vecAdd_Std and produces
00007FF625921C8F vmovups xmm1,xmmword ptr [__xmm#4100000040c000004080000040000000 (07FF625929D60h)]
00007FF625921C97 vmovups xmm4,xmm1
00007FF625921C9B vcvtss2sd xmm1,xmm1,xmm4
Test code
// Minimal driver: add two 4-float arrays and print one element so the
// result is observable and the computation cannot be optimized away entirely.
int main() {
float x[4] = {1.0, 2.0, 3.0, 4.0};
float y[4] = {1.0, 2.0, 3.0, 4.0};
vecAdd_Std(x, y);
std::cout << x[0];  // observable use of the result
}

You aren't really calling a function that executes one SSE instruction, are you? There's non-trivial overhead involved in setting up the xmm registers, and you're copying the values from memory to the registers and back, which will take far longer than the actual calculation.
I wouldn't be at all surprised to find that the compiler inlines the C++ version of the function, but doesn't (can't, really) do the same for functions that contain inline assembly.

Related

Different assembly when rangifying a simple algorithm

When I was preparing supplementary info for this question, I noticed that “rangified” implementations of a very simple algorithm resulted in important differences (to my eyes) in the resulting assembly, compared with “legacy” implementations.
I expanded the tests a bit, with the following results (GCC 9.1 -O3):
Case 1. Simple for loop (https://godbolt.org/z/rAVaT2)
#include <vector>
// Case 1: u[i] += v[i] over all elements of u, written as a plain index loop.
// Assumes v has at least u.size() elements — not checked here; TODO confirm.
void foo(std::vector<double> &u, std::vector<double> const &v)
{
for (std::size_t i = 0u; i < u.size(); ++i)
u[i] += v[i];
}
mov rdx, QWORD PTR [rdi]
mov rdi, QWORD PTR [rdi+8]
sub rdi, rdx
sar rdi, 3
je .L1
mov rcx, QWORD PTR [rsi]
lea rax, [rcx+15]
sub rax, rdx
cmp rax, 30
jbe .L7
lea rax, [rdi-1]
cmp rax, 1
jbe .L7
mov rsi, rdi
xor eax, eax
shr rsi
sal rsi, 4
.L4:
movupd xmm0, XMMWORD PTR [rcx+rax]
movupd xmm1, XMMWORD PTR [rdx+rax]
addpd xmm0, xmm1
movups XMMWORD PTR [rdx+rax], xmm0
add rax, 16
cmp rsi, rax
jne .L4
mov rsi, rdi
and rsi, -2
and edi, 1
je .L1
lea rax, [rdx+rsi*8]
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rcx+rsi*8]
movsd QWORD PTR [rax], xmm0
ret
.L7:
xor eax, eax
.L3:
movsd xmm0, QWORD PTR [rdx+rax*8]
addsd xmm0, QWORD PTR [rcx+rax*8]
movsd QWORD PTR [rdx+rax*8], xmm0
add rax, 1
cmp rdi, rax
jne .L3
.L1:
ret
Case 2. std::transform (https://godbolt.org/z/2iZaqo)
#include <algorithm>
#include <vector>
// Case 2: same element-wise u[i] += v[i], expressed with std::transform
// writing back into u (output iterator aliases the first input range).
void foo(std::vector<double> &u, std::vector<double> const &v)
{
std::transform(std::begin(u), std::end(u),
std::begin(v),
std::begin(u),
std::plus());
}
mov rdx, QWORD PTR [rdi]
mov rax, QWORD PTR [rdi+8]
mov rsi, QWORD PTR [rsi]
cmp rax, rdx
je .L1
sub rax, 8
lea rcx, [rsi+15]
sub rax, rdx
sub rcx, rdx
shr rax, 3
cmp rcx, 30
jbe .L7
movabs rcx, 2305843009213693950
test rax, rcx
je .L7
lea rcx, [rax+1]
xor eax, eax
mov rdi, rcx
shr rdi
sal rdi, 4
.L4:
movupd xmm0, XMMWORD PTR [rdx+rax]
movupd xmm1, XMMWORD PTR [rsi+rax]
addpd xmm0, xmm1
movups XMMWORD PTR [rdx+rax], xmm0
add rax, 16
cmp rax, rdi
jne .L4
mov rdi, rcx
and rdi, -2
lea rax, [0+rdi*8]
add rdx, rax
add rsi, rax
cmp rcx, rdi
je .L1
movsd xmm0, QWORD PTR [rdx]
addsd xmm0, QWORD PTR [rsi]
movsd QWORD PTR [rdx], xmm0
ret
.L7:
xor ecx, ecx
.L3:
movsd xmm0, QWORD PTR [rdx+rcx*8]
addsd xmm0, QWORD PTR [rsi+rcx*8]
mov rdi, rcx
movsd QWORD PTR [rdx+rcx*8], xmm0
add rcx, 1
cmp rax, rdi
jne .L3
.L1:
ret
Case 3. Range-v3 view::zip (https://godbolt.org/z/0BEkfT)
#define RANGES_ASSERT(...) ((void)0)
#include <algorithm>
#include <range/v3/view/zip.hpp>
#include <vector>
// Case 3: same operation via range-v3 — zip the two vectors into pairs and
// add the second component of each pair into the first in-place.
void foo(std::vector<double> &u, std::vector<double> const &v)
{
auto w = ranges::view::zip(u, v);
std::for_each(std::begin(w), std::end(w),
[](auto &&x) { std::get<0u>(x) += std::get<1u>(x); });
}
mov rdx, QWORD PTR [rsi]
mov rsi, QWORD PTR [rsi+8]
mov rax, QWORD PTR [rdi]
mov rcx, QWORD PTR [rdi+8]
cmp rdx, rsi
je .L1
cmp rax, rcx
je .L1
.L3:
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rdx]
add rax, 8
add rdx, 8
movsd QWORD PTR [rax-8], xmm0
cmp rax, rcx
je .L1
cmp rdx, rsi
jne .L3
.L1:
ret
Case 4. cmcstl2 ranges::transform (https://godbolt.org/z/MjYO1G)
#include <experimental/ranges/algorithm>
#include <vector>
namespace std
{
namespace ranges = experimental::ranges;
}
// Case 4: same operation via cmcstl2 ranges::transform, writing back into u.
// Fixed a transcription typo in the signature: "&u,s td::vector" -> "&u, std::vector".
void foo(std::vector<double> &u, std::vector<double> const &v)
{
std::ranges::transform(std::ranges::begin(u), std::ranges::end(u),
std::ranges::begin(v), std::ranges::end(v),
std::ranges::begin(u),
std::plus());
}
mov r8, QWORD PTR [rsi+8]
mov rdx, QWORD PTR [rsi]
mov rax, QWORD PTR [rdi]
mov rcx, QWORD PTR [rdi+8]
cmp rdx, r8
je .L1
cmp rcx, rax
jne .L3
jmp .L1
.L16:
cmp rdx, r8
je .L1
.L3:
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rdx]
add rax, 8
add rdx, 8
movsd QWORD PTR [rax-8], xmm0
cmp rax, rcx
jne .L16
.L1:
ret
I can’t read assembly, but I seem to understand that the assemblies of Case 1 and Case 2 are almost equivalent and involve packed sums, whilst the assembly of the ranges versions (Cases 3 and 4) is much terser, but not vectorized.
I would really love to understand what those differences mean. Does my interpretation of the assembly make any sense? What are the additional instructions in the non-ranges versions? Why are there those differences?

Optimizations in case of multiple virtual target invocations (i.e. 1, 2, 3)

I have below code which actually uses 1, 2, 3 virtual call targets:
#include <random>
#include <memory>
#include <ctime>
// Interface with a single virtual computation; the functions below measure
// how the optimizer handles 1, 2, 3 and 4 distinct virtual call targets.
struct IMath
{
virtual ~IMath() = default;
virtual int compute(int) = 0;
};
// Each concrete algorithm multiplies by a different constant so the bodies
// stay distinct; 'final' permits devirtualization where the dynamic type
// is provable.
struct MathAlg1: IMath
{
int compute(int i) final
{
return i * 17;
}
};
struct MathAlg2: IMath
{
int compute(int i) final
{
return i * 19;
}
};
struct MathAlg3: IMath
{
int compute(int i) final
{
return i * 23;
}
};
struct MathAlg4: IMath
{
int compute(int i) final
{
return i * 29;
}
};
namespace
{
// Argument kept behind a shared_ptr — presumably so its value is loaded at
// run time instead of being constant-folded; TODO confirm intent.
static std::shared_ptr<int> param = std::make_shared<int>(3);
// The virtual dispatch under study happens here, through the IMath reference.
int compute(IMath& alg, int i)
{
return alg.compute(i);
}
std::unique_ptr<IMath> alg1 = std::make_unique<MathAlg1>();
std::unique_ptr<IMath> alg2 = std::make_unique<MathAlg2>();
std::unique_ptr<IMath> alg3 = std::make_unique<MathAlg3>();
std::unique_ptr<IMath> alg4 = std::make_unique<MathAlg4>();
// One possible dynamic type reaches compute().
int monomorphicCall()
{
return compute(*alg1, *param);
}
// Two distinct dynamic types reach compute().
int bimorphicCall()
{
return compute(*alg1, *param) + compute(*alg2, *param);
}
// Three distinct dynamic types.
int megamorphic3Call()
{
return compute(*alg1, *param) + compute(*alg2, *param) + compute(*alg3, *param);
}
// Four distinct dynamic types.
int megamorphic4Call()
{
return compute(*alg1, *param) + compute(*alg2, *param) + compute(*alg3, *param) + compute(*alg4, *param);
}
}
// Enable exactly one of the calls below to inspect the code generated
// for that call pattern.
int main(){
return monomorphicCall();
//return bimorphicCall();
//return megamorphic3Call();
//return megamorphic4Call();
}
Generated ASM: (clang 6.0.0 w/ -O3)
monomorphicCall()
main: # #main
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
mov rax, qword ptr [rax + 16]
jmp rax # TAILCALL
bimorphicCall()
main: # #main
push rbx
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
mov rdi, qword ptr [rip + (anonymous namespace)::alg2]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
add eax, ebx
pop rbx
ret
megamorphic3Call()
main: # #main
push rbp
push rbx
push rax
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
mov rdi, qword ptr [rip + (anonymous namespace)::alg2]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebp, eax
add ebp, ebx
mov rdi, qword ptr [rip + (anonymous namespace)::alg3]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
add eax, ebp
add rsp, 8
pop rbx
pop rbp
ret
megamorphic4Call()
main: # #main
push rbp
push rbx
push rax
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
mov rdi, qword ptr [rip + (anonymous namespace)::alg2]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebp, eax
add ebp, ebx
mov rdi, qword ptr [rip + (anonymous namespace)::alg3]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
add ebx, ebp
mov rdi, qword ptr [rip + (anonymous namespace)::alg4]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
add eax, ebx
add rsp, 8
pop rbx
pop rbp
ret
Questions/Confirmation points:
In the case of monomorphicCall() to me, it looks like there is no actual
virtual table call, but a jmp rax # TAILCALL. Is this a correct assessment?
In case of bimorphicCall(), megamorphic3Call(), megamorphic4Call() it triggers the vcalls in all cases. Is this a correct assessment?

is Visual C++ actually generating blatantly incorrect code?

So I'm debugging my DPLL implementation and it's not quite working right, so I step through the code line by line in the debugger; it gets to a return statement but the thing is it doesn't return, it just keeps on executing the same function. WTF, I thought, am I really seeing this? So I looked at the disassembly and sure enough one of the return statements jumps to the wrong place. Never have I seen VS generate incorrect code, so I'm wondering if I screwed up somewhere, but I can't find anything. The jump is incorrect even when compiling with all optimizations off.
This illustrates what's going on.
// Recursive DPLL-style SAT procedure (quoted from the question): propagate
// unit clauses, return false on a falsified clause, true when every clause
// is satisfied, otherwise branch on a chosen literal with both truth values.
// The address comments record where the debugger showed each jump landing.
bool dpll(std::vector<clause> f)
{
unitPropagate(f);
if(checkFalseClause(f))
{
return false; //je dpll+5Fh (0C01D1Fh) <-- Totally wrong jump address
}
else if(checkAllClausesTrue(f))
{
return true; //jmp dpll+206h (0C01EC6h) <-- this is fine
}
else
{
atom l = chooseLiteral(f); //this is where the jump ends up (0C01D1Fh)
std::vector<clause> a = makeDuplicate(f);
replaceInstancesOf(a, l, true);
std::vector<clause> b = makeDuplicate(f);
replaceInstancesOf(b, l, false);
// Note: '|' (non-short-circuiting) forces BOTH recursive calls to run.
return dpll(a) | dpll(b);
}
//this is where the jump is supposed to go (0C01EC6h)
}
So my question is, is Visual Studio actually broken or have I misunderstood something? Has anyone run into something like this before?
The version is Visual Studio Enterprise 2015 if that makes a difference, the code is generated for x86_32.
Here's the full disassembly if anyone's interested:
00C01CC0 push ebp
00C01CC1 mov ebp,esp
00C01CC3 push 0FFFFFFFFh
00C01CC5 push 0C08FF0h
00C01CCA mov eax,dword ptr fs:[00000000h]
00C01CD0 push eax
00C01CD1 sub esp,40h
00C01CD4 mov eax,dword ptr [__security_cookie (0C0D008h)]
00C01CD9 xor eax,ebp
00C01CDB mov dword ptr [ebp-10h],eax
00C01CDE push ebx
00C01CDF push esi
00C01CE0 push eax
00C01CE1 lea eax,[ebp-0Ch]
00C01CE4 mov dword ptr fs:[00000000h],eax
bool dpll(std::vector<clause> f)
00C01CEA lea ecx,[f]
00C01CED mov dword ptr [ebp-4],0
00C01CF4 call unitPropagate (0C01950h)
{
unitPropagate(f);
00C01CF9 lea ecx,[f]
00C01CFC call checkFalseClause (0C01660h)
00C01D01 test al,al
00C01D03 je dpll+4Ch (0C01D0Ch)
00C01D05 xor bh,bh
00C01D07 jmp dpll+206h (0C01EC6h)
if(checkFalseClause(f))
{
return false;
00C01D0C lea ecx,[f]
00C01D0F call checkAllClausesTrue (0C014F0h)
00C01D14 test al,al
00C01D16 je dpll+5Fh (0C01D1Fh)
}
else if(checkAllClausesTrue(f))
00C01D18 mov bh,1
00C01D1A jmp dpll+206h (0C01EC6h)
{
return true;
}
else
00C01D1F lea edx,[f]
00C01D22 lea ecx,[l]
00C01D25 call chooseLiteral (0C013D0h)
00C01D2A mov byte ptr [ebp-4],1
{
atom l = chooseLiteral(f);
00C01D2E lea edx,[f]
00C01D31 xorps xmm0,xmm0
00C01D34 mov dword ptr [ebp-20h],0
00C01D3B lea ecx,[a]
00C01D3E movq mmword ptr [a],xmm0
00C01D43 call makeDuplicate (0C01A30h)
00C01D48 mov byte ptr [ebp-4],2
00C01D4C sub esp,20h
00C01D4F mov esi,esp
00C01D51 mov bl,1
00C01D53 mov dword ptr [ebp-4Ch],esi
00C01D56 lea ecx,[esi+4]
00C01D59 mov al,byte ptr [l]
00C01D5C mov byte ptr [esi],al
00C01D5E mov dword ptr [ecx+14h],0Fh
00C01D65 mov dword ptr [ecx+10h],0
00C01D6C cmp dword ptr [ecx+14h],10h
00C01D70 jb dpll+0B6h (0C01D76h)
00C01D72 mov eax,dword ptr [ecx]
00C01D74 jmp dpll+0B8h (0C01D78h)
00C01D76 mov eax,ecx
00C01D78 push 0FFFFFFFFh
00C01D7A mov byte ptr [eax],0
00C01D7D lea eax,[ebp-44h]
00C01D80 push 0
00C01D82 push eax
00C01D83 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::assign (0C02A80h)
00C01D88 mov al,byte ptr [ebp-2Ch]
00C01D8B lea ecx,[a]
00C01D8E mov byte ptr [esi+1Ch],al
00C01D91 mov dl,bl
00C01D93 mov al,byte ptr [ebp-2Bh]
00C01D96 mov byte ptr [esi+1Dh],al
00C01D99 call replaceInstancesOf (0C017D0h)
00C01D9E xorps xmm0,xmm0
00C01DA1 mov dword ptr [ebp-14h],0
std::vector<clause> a = makeDuplicate(f);
replaceInstancesOf(a, l, true);
00C01DA8 lea edx,[f]
std::vector<clause> a = makeDuplicate(f);
replaceInstancesOf(a, l, true);
00C01DAB movq mmword ptr [b],xmm0
00C01DB0 lea ecx,[b]
00C01DB3 call makeDuplicate (0C01A30h)
00C01DB8 mov esi,esp
00C01DBA mov byte ptr [ebp-4],3
00C01DBE mov dword ptr [ebp-4Ch],esi
00C01DC1 lea ecx,[esi+4]
00C01DC4 mov al,byte ptr [l]
00C01DC7 xor bl,bl
00C01DC9 push 0FFFFFFFFh
00C01DCB mov byte ptr [esi],al
00C01DCD lea eax,[ebp-44h]
00C01DD0 push 0
00C01DD2 mov dword ptr [ecx+14h],0Fh
00C01DD9 mov dword ptr [ecx+10h],0
00C01DE0 push eax
00C01DE1 mov byte ptr [ecx],bl
00C01DE3 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::assign (0C02A80h)
00C01DE8 mov al,byte ptr [ebp-2Ch]
00C01DEB lea ecx,[b]
00C01DEE mov byte ptr [esi+1Ch],al
00C01DF1 mov dl,bl
00C01DF3 mov al,byte ptr [ebp-2Bh]
00C01DF6 mov byte ptr [esi+1Dh],al
00C01DF9 call replaceInstancesOf (0C017D0h)
std::vector<clause> b = makeDuplicate(f);
replaceInstancesOf(b, l, false);
00C01DFE add esp,14h
00C01E01 lea eax,[a]
00C01E04 mov ecx,esp
00C01E06 push eax
00C01E07 call std::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > >::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > > (0C02420h)
00C01E0C call dpll (0C01CC0h)
00C01E11 mov bl,al
00C01E13 mov ecx,esp
00C01E15 lea eax,[b]
00C01E18 push eax
00C01E19 call std::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > >::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > > (0C02420h)
00C01E1E call dpll (0C01CC0h)
00C01E23 mov ecx,dword ptr [b]
00C01E26 mov bh,al
00C01E28 add esp,0Ch
00C01E2B or bh,bl
00C01E2D test ecx,ecx
00C01E2F je dpll+1B4h (0C01E74h)
00C01E31 push dword ptr [ebp-4Ch]
00C01E34 mov edx,dword ptr [ebp-18h]
00C01E37 push ecx
00C01E38 call std::_Destroy_range1<std::allocator<std::vector<atom,std::allocator<atom> > >,std::vector<atom,std::allocator<atom> > *> (0C035E0h)
00C01E3D mov ecx,dword ptr [ebp-14h]
00C01E40 mov eax,2AAAAAABh
00C01E45 mov esi,dword ptr [b]
00C01E48 add esp,8
00C01E4B sub ecx,esi
00C01E4D imul ecx
00C01E4F sar edx,1
00C01E51 mov eax,edx
00C01E53 shr eax,1Fh
00C01E56 add eax,edx
00C01E58 push eax
00C01E59 push esi
00C01E5A call std::_Wrap_alloc<std::allocator<std::vector<atom,std::allocator<atom> > > >::deallocate (0C02D20h)
00C01E5F mov dword ptr [b],0
00C01E66 mov dword ptr [ebp-18h],0
00C01E6D mov dword ptr [ebp-14h],0
00C01E74 mov ecx,dword ptr [a]
00C01E77 test ecx,ecx
00C01E79 je dpll+1FEh (0C01EBEh)
00C01E7B push dword ptr [ebp-4Ch]
00C01E7E mov edx,dword ptr [ebp-24h]
00C01E81 push ecx
00C01E82 call std::_Destroy_range1<std::allocator<std::vector<atom,std::allocator<atom> > >,std::vector<atom,std::allocator<atom> > *> (0C035E0h)
00C01E87 mov ecx,dword ptr [ebp-20h]
00C01E8A mov eax,2AAAAAABh
00C01E8F mov esi,dword ptr [a]
00C01E92 add esp,8
00C01E95 sub ecx,esi
00C01E97 imul ecx
00C01E99 sar edx,1
00C01E9B mov eax,edx
00C01E9D shr eax,1Fh
00C01EA0 add eax,edx
00C01EA2 push eax
00C01EA3 push esi
00C01EA4 call std::_Wrap_alloc<std::allocator<std::vector<atom,std::allocator<atom> > > >::deallocate (0C02D20h)
00C01EA9 mov dword ptr [a],0
00C01EB0 mov dword ptr [ebp-24h],0
00C01EB7 mov dword ptr [ebp-20h],0
00C01EBE lea ecx,[ebp-44h]
00C01EC1 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::~basic_string<char,std::char_traits<char>,std::allocator<char> > (0C027A0h)
00C01EC6 mov ecx,dword ptr [f]
00C01EC9 test ecx,ecx
00C01ECB je dpll+23Bh (0C01EFBh)
00C01ECD push dword ptr [ebp-4Ch]
00C01ED0 mov edx,dword ptr [ebp+0Ch]
00C01ED3 push ecx
00C01ED4 call std::_Destroy_range1<std::allocator<std::vector<atom,std::allocator<atom> > >,std::vector<atom,std::allocator<atom> > *> (0C035E0h)
00C01ED9 mov ecx,dword ptr [ebp+10h]
00C01EDC mov eax,2AAAAAABh
00C01EE1 mov esi,dword ptr [f]
00C01EE4 add esp,8
00C01EE7 sub ecx,esi
00C01EE9 imul ecx
00C01EEB sar edx,1
00C01EED mov ecx,edx
00C01EEF shr ecx,1Fh
00C01EF2 add ecx,edx
00C01EF4 push ecx
00C01EF5 push esi
00C01EF6 call std::_Wrap_alloc<std::allocator<std::vector<atom,std::allocator<atom> > > >::deallocate (0C02D20h)
00C01EFB mov al,bh
00C01EFD mov ecx,dword ptr [ebp-0Ch]
00C01F00 mov dword ptr fs:[0],ecx
00C01F07 pop ecx
00C01F08 pop esi
00C01F09 pop ebx
00C01F0A mov ecx,dword ptr [ebp-10h]
00C01F0D xor ecx,ebp
00C01F0F call __security_check_cookie (0C080CCh)
00C01F14 mov esp,ebp
00C01F16 pop ebp
00C01F17 ret
The source interleaving is wrong. This is the correct place you want to look at:
00C01CFC call checkFalseClause (0C01660h)
00C01D01 test al,al
00C01D03 je dpll+4Ch (0C01D0Ch)
00C01D05 xor bh,bh
00C01D07 jmp dpll+206h (0C01EC6h)
As you can see, it goes to the expected address if the return value was nonzero.
The part you looked at is actually for the else if(checkAllClausesTrue(f)) and the jump is the one going to the else clause because the compiler negated the condition.

c++ for loop efficiency: body vs. afterthought

I found something interesting while on leetcode and wish someone can help explain the cause:
I was basically doing merge sort and used the fast slow pointer to find the mid pointer. Here're two versions of such code snippets:
1. update in afterthought
for (ListNode* fast=head;
fast->next && fast->next->next;
fast = fast->next->next, slow = slow->next) { }
2. update in body
for (ListNode* fast=head; fast->next && fast->next->next; ) {
fast = fast->next->next;
slow = slow->next;
}
Why is version 2 faster than the first one?
Compiler: g++ 4.9.2
It is unlikely that the comma operation can significantly reduce the speed of the for-loop.
I have made both variants and opened disassembly (in Visual Studio 2012) for them to see difference.
looks as:
for (ListNode* fast = head;
0022545E mov eax,dword ptr [head]
00225461 mov dword ptr [ebp-2Ch],eax
fast->next && fast->next->next;
00225464 jmp main+17Bh (022547Bh)
fast = fast->next->next, slow = slow->next) {
00225466 mov eax,dword ptr [ebp-2Ch]
00225469 mov ecx,dword ptr [eax+4]
0022546C mov edx,dword ptr [ecx+4]
0022546F mov dword ptr [ebp-2Ch],edx
00225472 mov eax,dword ptr [slow]
00225475 mov ecx,dword ptr [eax+4]
00225478 mov dword ptr [slow],ecx
0022547B mov eax,dword ptr [ebp-2Ch]
0022547E cmp dword ptr [eax+4],0
00225482 je main+192h (0225492h)
00225484 mov eax,dword ptr [ebp-2Ch]
00225487 mov ecx,dword ptr [eax+4]
0022548A cmp dword ptr [ecx+4],0
0022548E je main+192h (0225492h)
}
is:
for (ListNode* fast = head; fast->next && fast->next->next;) {
0024545E mov eax,dword ptr [head]
00245461 mov dword ptr [ebp-2Ch],eax
00245464 mov eax,dword ptr [ebp-2Ch]
00245467 cmp dword ptr [eax+4],0
0024546B je main+190h (0245490h)
0024546D mov eax,dword ptr [ebp-2Ch]
00245470 mov ecx,dword ptr [eax+4]
00245473 cmp dword ptr [ecx+4],0
00245477 je main+190h (0245490h)
fast = fast->next->next;
00245479 mov eax,dword ptr [ebp-2Ch]
0024547C mov ecx,dword ptr [eax+4]
0024547F mov edx,dword ptr [ecx+4]
00245482 mov dword ptr [ebp-2Ch],edx
slow = slow->next;
00245485 mov eax,dword ptr [slow]
00245488 mov ecx,dword ptr [eax+4]
0024548B mov dword ptr [slow],ecx
}
Only one jmp is the distinction.
Sorry, but I cannot see significant differences, so perhaps the performance problem is not in the place of that two statements.

x86-64 assembler for polymorphic call

I have the C++ code:
int main(){
M* m;                       // base-class pointer; dynamic type picked at run time
O* o = new IO();
H* h = new H("A");
if(__rdtsc() % 5 == 0){     // timestamp-counter test makes the branch
m = new Y(o, h);            // unpredictable at compile time, so the call
}                           // below cannot be devirtualized
else{
m = new Z(o, h);
}
m->my_virtual();            // true virtual dispatch through the vtable
return 1;
}
where the virtual call is represented by this asm:
mov rax,qword ptr [x]
mov rax,qword ptr [rax]
mov rcx,qword ptr [x]
call qword ptr [rax]
It is one more line than I was expecting for the vtable method invocation. Are all four of the ASM lines specific to the polymorphic call?
How do the above four lines read pseudo-ly?
This is the complete ASM and C++ (the virtual call is made right at the end):
int main(){
add byte ptr [rax-33333334h],bh
rep stos dword ptr [rdi]
mov qword ptr [rsp+0A8h],0FFFFFFFFFFFFFFFEh
M* x;
o* o = new IO();
mov ecx,70h
call operator new (013F6B7A70h)
mov qword ptr [rsp+40h],rax
cmp qword ptr [rsp+40h],0
je main+4Fh (013F69687Fh)
mov rcx,qword ptr [rsp+40h]
call IO::IO (013F6814F6h)
mov qword ptr [rsp+0B0h],rax
jmp main+5Bh (013F69688Bh)
mov qword ptr [rsp+0B0h],0
mov rax,qword ptr [rsp+0B0h]
mov qword ptr [rsp+38h],rax
mov rax,qword ptr [rsp+38h]
mov qword ptr [o],rax
H* h = new H("A");
mov ecx,150h
call operator new (013F6B7A70h)
mov qword ptr [rsp+50h],rax
cmp qword ptr [rsp+50h],0
je main+0CEh (013F6968FEh)
lea rax,[rsp+58h]
mov qword ptr [rsp+80h],rax
lea rdx,[ec_table+11Ch (013F7C073Ch)]
mov rcx,qword ptr [rsp+80h]
call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::basic_string<char,std::char_traits<char>,std::allocator<char> > (013F681104h)
mov qword ptr [rsp+0B8h],rax
mov rdx,qword ptr [rsp+0B8h]
mov rcx,qword ptr [rsp+50h]
call H::H (013F6826A3h)
mov qword ptr [rsp+0C0h],rax
jmp main+0DAh (013F69690Ah)
mov qword ptr [rsp+0C0h],0
mov rax,qword ptr [rsp+0C0h]
mov qword ptr [rsp+48h],rax
mov rax,qword ptr [rsp+48h]
mov qword ptr [h],rax
if(__rdtsc() % 5 == 0){
rdtsc
shl rdx,20h
or rax,rdx
xor edx,edx
mov ecx,5
div rax,rcx
mov rax,rdx
test rax,rax
jne main+175h (013F6969A5h)
x = new Y(o, h);
mov ecx,18h
call operator new (013F6B7A70h)
mov qword ptr [rsp+90h],rax
cmp qword ptr [rsp+90h],0
je main+14Ah (013F69697Ah)
mov r8,qword ptr [h]
mov rdx,qword ptr [o]
mov rcx,qword ptr [rsp+90h]
call Y::Y (013F681B4Fh)
mov qword ptr [rsp+0C8h],rax
jmp main+156h (013F696986h)
mov qword ptr [rsp+0C8h],0
mov rax,qword ptr [rsp+0C8h]
mov qword ptr [rsp+88h],rax
mov rax,qword ptr [rsp+88h]
mov qword ptr [x],rax
}
else{
jmp main+1DCh (013F696A0Ch)
x = new Z(o, h);
mov ecx,18h
call operator new (013F6B7A70h)
mov qword ptr [rsp+0A0h],rax
cmp qword ptr [rsp+0A0h],0
je main+1B3h (013F6969E3h)
mov r8,qword ptr [h]
mov rdx,qword ptr [o]
mov rcx,qword ptr [rsp+0A0h]
call Z::Z (013F68160Eh)
mov qword ptr [rsp+0D0h],rax
jmp main+1BFh (013F6969EFh)
mov qword ptr [rsp+0D0h],0
mov rax,qword ptr [rsp+0D0h]
mov qword ptr [rsp+98h],rax
mov rax,qword ptr [rsp+98h]
mov qword ptr [x],rax
}
x->my_virtual();
mov rax,qword ptr [x]
mov rax,qword ptr [rax]
mov rcx,qword ptr [x]
call qword ptr [rax]
return 1;
mov eax,1
}
You're probably looking at unoptimized code:
mov rax,qword ptr [x] ; load rax with object pointer
mov rax,qword ptr [rax] ; load rax with the vtable pointer
mov rcx,qword ptr [x] ; load rcx with the object pointer (the 'this' pointer)
call qword ptr [rax] ; call through the vtable slot for the virtual function
mov rax,qword ptr [x]
get the address pointed to by x
mov rax,qword ptr [rax]
get the address of the vtable for x's class (using rax we just worked out). Put it in rax
mov rcx,qword ptr [x]
get the pointer x and put it in rcx, so it can be used as the "this" pointer in the called function.
call qword ptr [rax]
call the function using the address from the vtable we found earlier (no offset as it is the first virtual function).
There are definitely shorter ways to do it, which the compiler might use if you switch optimizations on (e.g. only get [x] once).
Updated with more info from Ben Voigt
In pseudo-code:
(*(*m->__vtbl)[0])(m)
Optimized version (can rcx be used for indexing?):
mov rcx,qword ptr [x] ; load rcx with object pointer
mov rax,qword ptr [rcx] ; load rax with the vtable pointer
call qword ptr [rax] ; call through the vtable slot for the virtual function
or
mov rax,qword ptr [x] ; load rax with object pointer
mov rcx,rax ; copy object pointer to rcx (the 'this' pointer)
mov rax,qword ptr [rax] ; load rax with the vtable pointer
call qword ptr [rax] ; call through the vtable slot for the virtual function