x86-64 assembler for polymorphic call - C++

I have the C++ code:
int main(){
M* m;
O* o = new IO();
H* h = new H("A");
if(__rdtsc() % 5 == 0){
m = new Y(o, h);
}
else{
m = new Z(o, h);
}
m->my_virtual();
return 1;
}
where the virtual call is represented by this asm:
mov rax,qword ptr [x]
mov rax,qword ptr [rax]
mov rcx,qword ptr [x]
call qword ptr [rax]
It is one more line than I was expecting for the vtable method invocation. Are all four of the ASM lines specific to the polymorphic call?
How would the above four lines read in pseudo-code?
This is the complete ASM and C++ (the virtual call is made right at the end):
int main(){
add byte ptr [rax-33333334h],bh
rep stos dword ptr [rdi]
mov qword ptr [rsp+0A8h],0FFFFFFFFFFFFFFFEh
M* x;
O* o = new IO();
mov ecx,70h
call operator new (013F6B7A70h)
mov qword ptr [rsp+40h],rax
cmp qword ptr [rsp+40h],0
je main+4Fh (013F69687Fh)
mov rcx,qword ptr [rsp+40h]
call IO::IO (013F6814F6h)
mov qword ptr [rsp+0B0h],rax
jmp main+5Bh (013F69688Bh)
mov qword ptr [rsp+0B0h],0
mov rax,qword ptr [rsp+0B0h]
mov qword ptr [rsp+38h],rax
mov rax,qword ptr [rsp+38h]
mov qword ptr [o],rax
H* h = new H("A");
mov ecx,150h
call operator new (013F6B7A70h)
mov qword ptr [rsp+50h],rax
cmp qword ptr [rsp+50h],0
je main+0CEh (013F6968FEh)
lea rax,[rsp+58h]
mov qword ptr [rsp+80h],rax
lea rdx,[ec_table+11Ch (013F7C073Ch)]
mov rcx,qword ptr [rsp+80h]
call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::basic_string<char,std::char_traits<char>,std::allocator<char> > (013F681104h)
mov qword ptr [rsp+0B8h],rax
mov rdx,qword ptr [rsp+0B8h]
mov rcx,qword ptr [rsp+50h]
call H::H (013F6826A3h)
mov qword ptr [rsp+0C0h],rax
jmp main+0DAh (013F69690Ah)
mov qword ptr [rsp+0C0h],0
mov rax,qword ptr [rsp+0C0h]
mov qword ptr [rsp+48h],rax
mov rax,qword ptr [rsp+48h]
mov qword ptr [h],rax
if(__rdtsc() % 5 == 0){
rdtsc
shl rdx,20h
or rax,rdx
xor edx,edx
mov ecx,5
div rax,rcx
mov rax,rdx
test rax,rax
jne main+175h (013F6969A5h)
x = new Y(o, h);
mov ecx,18h
call operator new (013F6B7A70h)
mov qword ptr [rsp+90h],rax
cmp qword ptr [rsp+90h],0
je main+14Ah (013F69697Ah)
mov r8,qword ptr [h]
mov rdx,qword ptr [o]
mov rcx,qword ptr [rsp+90h]
call Y::Y (013F681B4Fh)
mov qword ptr [rsp+0C8h],rax
jmp main+156h (013F696986h)
mov qword ptr [rsp+0C8h],0
mov rax,qword ptr [rsp+0C8h]
mov qword ptr [rsp+88h],rax
mov rax,qword ptr [rsp+88h]
mov qword ptr [x],rax
}
else{
jmp main+1DCh (013F696A0Ch)
x = new Z(o, h);
mov ecx,18h
call operator new (013F6B7A70h)
mov qword ptr [rsp+0A0h],rax
cmp qword ptr [rsp+0A0h],0
je main+1B3h (013F6969E3h)
mov r8,qword ptr [h]
mov rdx,qword ptr [o]
mov rcx,qword ptr [rsp+0A0h]
call Z::Z (013F68160Eh)
mov qword ptr [rsp+0D0h],rax
jmp main+1BFh (013F6969EFh)
mov qword ptr [rsp+0D0h],0
mov rax,qword ptr [rsp+0D0h]
mov qword ptr [rsp+98h],rax
mov rax,qword ptr [rsp+98h]
mov qword ptr [x],rax
}
x->my_virtual();
mov rax,qword ptr [x]
mov rax,qword ptr [rax]
mov rcx,qword ptr [x]
call qword ptr [rax]
return 1;
mov eax,1
}

You're probably looking at unoptimized code:
mov rax,qword ptr [x] ; load rax with object pointer
mov rax,qword ptr [rax] ; load rax with the vtable pointer
mov rcx,qword ptr [x] ; load rcx with the object pointer (the 'this' pointer)
call qword ptr [rax] ; call through the vtable slot for the virtual function

mov rax,qword ptr [x]
load rax with the pointer stored in x, i.e. the address of the object.
mov rax,qword ptr [rax]
load rax with the first quadword of that object, which is the address of the vtable for x's dynamic class.
mov rcx,qword ptr [x]
load rcx with the pointer x again, so it can be used as the "this" pointer in the called function.
call qword ptr [rax]
call the function whose address is in the vtable slot we just found (no offset, as it is the first virtual function).
There are definitely shorter ways to do it, which the compiler might use if you switch optimizations on (e.g. only get [x] once).
Updated with more info from Ben Voigt

In pseudo-code:
(*(*m->__vtbl)[0])(m)
Optimized version (can rcx be used for indexing?):
mov rcx,qword ptr [x] ; load rcx with object pointer
mov rax,qword ptr [rcx] ; load rax with the vtable pointer
call qword ptr [rax] ; call through the vtable slot for the virtual function
or
mov rax,qword ptr [x] ; load rax with object pointer
mov rcx,rax ; copy object pointer to rcx (the 'this' pointer)
mov rax,qword ptr [rax] ; load rax with the vtable pointer
call qword ptr [rax] ; call through the vtable slot for the virtual function
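To make that concrete, here is a hand-written C++ sketch that performs the same dispatch manually. This is illustration only: it relies on undefined behaviour in portable C++ and assumes the usual layout where the vtable pointer is the first quadword of the object and my_virtual occupies slot 0 (the class names here are stand-ins, not the asker's real types).

#include <cstdio>

struct M { virtual void my_virtual() = 0; };
struct Y : M { void my_virtual() override { std::puts("Y::my_virtual"); } };

using Slot0Fn = void (*)(M*);                          // my_virtual takes only 'this'

void call_my_virtual(M* x) {
    void** vtbl = *reinterpret_cast<void***>(x);       // mov rax,[x] ; mov rax,[rax]
    Slot0Fn f   = reinterpret_cast<Slot0Fn>(vtbl[0]);  // slot 0, hence no offset in the call
    f(x);                                              // mov rcx,[x] ; call qword ptr [rax]
}

int main() {
    Y y;
    call_my_virtual(&y);  // prints "Y::my_virtual" on mainstream x86-64 ABIs
}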

Related

C++ large deque - program takes very long time to exit?

Consider the following C++ program:
#include <deque>
#include <iostream>
using namespace std;
int main()
{
deque<double> d(30000000);
cout << "Done\n";
}
The memory allocation in the first line only takes a second, but after it prints Done, it takes 33 seconds (!) to exit back to the terminal. Decreasing the number of elements to 20000000 reduces that time to 22 seconds, so clearly it's linear in the number of elements.
I am compiling on Windows 10, and the same thing happened with both GCC 10.2.0 and Visual Studio 2019.
What's going on here? Am I using deque in a way it's not supposed to be used?
EDIT:
#include <deque>
#include <iostream>
using namespace std;
void test_deque()
{
deque<double> d(30000000);
cout << "Function done\n";
}
int main()
{
test_deque();
cout << "Main done\n";
}
Now it prints Function done and then there is the 33 second delay. So I assume this has to do with the destructor that gets executed when the function exits. But why does it take so long to destruct 240 MB of memory?
EDIT 2: Tried it (the second version) with GCC on Ubuntu and it only takes a fraction of a second to run! Same with some online C++ compilers. Is this a problem specific to Windows?
EDIT 3: With vector it also takes a fraction of a second to run. However, with list (and forward_list) I get a similar extremely long delay.
EDIT 4: Compiling with MSVC in Release (rather than Debug) configuration also takes a fraction of a second. I'm not sure what the GCC equivalent is, but with -O3 (max optimizations) the execution time remains 33 seconds.
Fundamentally the answer isn't very interesting. Your program is a no-op, so a compiler may optimize out the deque construction. But it doesn't have to.
But first, a legal, sane implementation may do any of the following:
- Do an allocation for 30000000 double elements and nothing else. The allocator might:
  - do the allocation in the laziest way possible, doing essentially nothing but some bookkeeping;
  - eagerly allocate and page in memory, causing 30000000 / page-size operations;
  - zero-initialize or pattern-initialize the memory (e.g. with 0xdeadbeef) to help detect uninitialized use, causing 30000000 writes.
- Allocate (as above) and also zero-initialize or pattern-initialize the memory.
- Run some sort of destructor over all elements (e.g. zeroing out memory).
- Not run a destructor on any elements, since double is a built-in type.
Now all of the above are possible options. And since your program is a no-op, a legal compiler may optimize out any or none of these steps. Your system allocator might vary in capabilities, supporting lazy allocation, overcommit, automatic zeroing, etc. So the end result is that you could get any kind of behavior depending on your operating system, compiler version, compiler flags, standard library, etc.
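If you want to see where the time goes on your own setup, one minimal sketch (assuming only <chrono> and <deque>; the split between construction and destruction will vary wildly with compiler, flags, standard library and OS) is to time the two phases separately:

#include <chrono>
#include <deque>
#include <iostream>

int main() {
    using clock = std::chrono::steady_clock;

    auto t0 = clock::now();
    auto* d = new std::deque<double>(30000000);   // construction (allocation + value-init)
    auto t1 = clock::now();
    delete d;                                     // destruction + deallocation
    auto t2 = clock::now();

    auto ms = [](auto a, auto b) {
        return std::chrono::duration_cast<std::chrono::milliseconds>(b - a).count();
    };
    std::cout << "construct: " << ms(t0, t1) << " ms, destroy: " << ms(t1, t2) << " ms\n";
}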
MSVC has a built-in profiler. We can run it (press Alt-F2) to see that the majority of CPU time is spent inside the constructor and destructor, which invoke deque::resize() and deque::_Tidy() functions, respectively.
If we drill down further, we see that deque::emplace_back() results in quite a lot of code:
#define _PUSH_BACK_BEGIN \
if ((_Myoff() + _Mysize()) % _DEQUESIZ == 0 && _Mapsize() <= (_Mysize() + _DEQUESIZ) / _DEQUESIZ) { \
_Growmap(1); \
} \
_Myoff() &= _Mapsize() * _DEQUESIZ - 1; \
size_type _Newoff = _Myoff() + _Mysize(); \
size_type _Block = _Getblock(_Newoff); \
if (_Map()[_Block] == nullptr) { \
_Map()[_Block] = _Getal().allocate(_DEQUESIZ); \
}
#define _PUSH_BACK_END ++_Mysize()
template <class... _Valty>
decltype(auto) emplace_back(_Valty&&... _Val) {
_Orphan_all();
_PUSH_BACK_BEGIN;
_Alty_traits::construct(
_Getal(), _Unfancy(_Map()[_Block] + _Newoff % _DEQUESIZ), _STD forward<_Valty>(_Val)...);
_PUSH_BACK_END;
#if _HAS_CXX17
return back();
#endif // _HAS_CXX17
}
Disassembly view:
template <class... _Valty>
decltype(auto) emplace_back(_Valty&&... _Val) {
00007FF674A238E0 mov qword ptr [rsp+8],rcx
00007FF674A238E5 push rbp
00007FF674A238E6 push rdi
00007FF674A238E7 sub rsp,138h
00007FF674A238EE lea rbp,[rsp+20h]
00007FF674A238F3 mov rdi,rsp
00007FF674A238F6 mov ecx,4Eh
00007FF674A238FB mov eax,0CCCCCCCCh
00007FF674A23900 rep stos dword ptr [rdi]
00007FF674A23902 mov rcx,qword ptr [rsp+158h]
00007FF674A2390A lea rcx,[__0657B1E2_deque (07FF674A3E02Fh)]
00007FF674A23911 call __CheckForDebuggerJustMyCode (07FF674A21159h)
_Orphan_all();
00007FF674A23916 mov rcx,qword ptr [this]
00007FF674A2391D call std::deque<double,std::allocator<double> >::_Orphan_all (07FF674A217FDh)
_PUSH_BACK_BEGIN;
00007FF674A23922 mov rcx,qword ptr [this]
00007FF674A23929 call std::deque<double,std::allocator<double> >::_Myoff (07FF674A2139Dh)
00007FF674A2392E mov qword ptr [rbp+0F8h],rax
00007FF674A23935 mov rcx,qword ptr [this]
00007FF674A2393C call std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)
00007FF674A23941 mov rcx,qword ptr [rbp+0F8h]
00007FF674A23948 mov rcx,qword ptr [rcx]
00007FF674A2394B add rcx,qword ptr [rax]
00007FF674A2394E mov rax,rcx
00007FF674A23951 xor edx,edx
00007FF674A23953 mov ecx,2
00007FF674A23958 div rax,rcx
00007FF674A2395B mov rax,rdx
00007FF674A2395E test rax,rax
00007FF674A23961 jne std::deque<double,std::allocator<double> >::emplace_back<>+0D0h (07FF674A239B0h)
00007FF674A23963 mov rcx,qword ptr [this]
00007FF674A2396A call std::deque<double,std::allocator<double> >::_Mapsize (07FF674A214BFh)
00007FF674A2396F mov qword ptr [rbp+0F8h],rax
00007FF674A23976 mov rcx,qword ptr [this]
00007FF674A2397D call std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)
00007FF674A23982 mov rax,qword ptr [rax]
00007FF674A23985 add rax,2
00007FF674A23989 xor edx,edx
00007FF674A2398B mov ecx,2
00007FF674A23990 div rax,rcx
00007FF674A23993 mov rcx,qword ptr [rbp+0F8h]
00007FF674A2399A cmp qword ptr [rcx],rax
00007FF674A2399D ja std::deque<double,std::allocator<double> >::emplace_back<>+0D0h (07FF674A239B0h)
00007FF674A2399F mov edx,1
00007FF674A239A4 mov rcx,qword ptr [this]
00007FF674A239AB call std::deque<double,std::allocator<double> >::_Growmap (07FF674A21640h)
00007FF674A239B0 mov rcx,qword ptr [this]
00007FF674A239B7 call std::deque<double,std::allocator<double> >::_Mapsize (07FF674A214BFh)
00007FF674A239BC mov rax,qword ptr [rax]
00007FF674A239BF lea rax,[rax+rax-1]
00007FF674A239C4 mov qword ptr [rbp+0F8h],rax
00007FF674A239CB mov rcx,qword ptr [this]
00007FF674A239D2 call std::deque<double,std::allocator<double> >::_Myoff (07FF674A2139Dh)
00007FF674A239D7 mov qword ptr [rbp+100h],rax
00007FF674A239DE mov rax,qword ptr [rbp+100h]
00007FF674A239E5 mov rax,qword ptr [rax]
00007FF674A239E8 mov qword ptr [rbp+108h],rax
00007FF674A239EF mov rax,qword ptr [rbp+0F8h]
00007FF674A239F6 mov rcx,qword ptr [rbp+108h]
00007FF674A239FD and rcx,rax
00007FF674A23A00 mov rax,rcx
00007FF674A23A03 mov rcx,qword ptr [rbp+100h]
00007FF674A23A0A mov qword ptr [rcx],rax
00007FF674A23A0D mov rcx,qword ptr [this]
00007FF674A23A14 call std::deque<double,std::allocator<double> >::_Myoff (07FF674A2139Dh)
00007FF674A23A19 mov qword ptr [rbp+0F8h],rax
00007FF674A23A20 mov rcx,qword ptr [this]
00007FF674A23A27 call std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)
00007FF674A23A2C mov rcx,qword ptr [rbp+0F8h]
00007FF674A23A33 mov rcx,qword ptr [rcx]
00007FF674A23A36 add rcx,qword ptr [rax]
00007FF674A23A39 mov rax,rcx
00007FF674A23A3C mov qword ptr [_Newoff],rax
00007FF674A23A40 mov rdx,qword ptr [_Newoff]
00007FF674A23A44 mov rcx,qword ptr [this]
00007FF674A23A4B call std::deque<double,std::allocator<double> >::_Getblock (07FF674A21334h)
00007FF674A23A50 mov qword ptr [_Block],rax
00007FF674A23A54 mov rcx,qword ptr [this]
00007FF674A23A5B call std::deque<double,std::allocator<double> >::_Map (07FF674A21753h)
00007FF674A23A60 mov rax,qword ptr [rax]
00007FF674A23A63 mov rcx,qword ptr [_Block]
00007FF674A23A67 cmp qword ptr [rax+rcx*8],0
00007FF674A23A6C jne std::deque<double,std::allocator<double> >::emplace_back<>+1D7h (07FF674A23AB7h)
00007FF674A23A6E mov rcx,qword ptr [this]
00007FF674A23A75 call std::deque<double,std::allocator<double> >::_Getal (07FF674A216CCh)
00007FF674A23A7A mov qword ptr [rbp+0F8h],rax
00007FF674A23A81 mov edx,2
00007FF674A23A86 mov rcx,qword ptr [rbp+0F8h]
00007FF674A23A8D call std::allocator<double>::allocate (07FF674A216C7h)
00007FF674A23A92 mov qword ptr [rbp+100h],rax
00007FF674A23A99 mov rcx,qword ptr [this]
00007FF674A23AA0 call std::deque<double,std::allocator<double> >::_Map (07FF674A21753h)
00007FF674A23AA5 mov rax,qword ptr [rax]
00007FF674A23AA8 mov rcx,qword ptr [_Block]
00007FF674A23AAC mov rdx,qword ptr [rbp+100h]
00007FF674A23AB3 mov qword ptr [rax+rcx*8],rdx
_Alty_traits::construct(
00007FF674A23AB7 mov rcx,qword ptr [this]
00007FF674A23ABE call std::deque<double,std::allocator<double> >::_Map (07FF674A21753h)
00007FF674A23AC3 mov rax,qword ptr [rax]
00007FF674A23AC6 mov qword ptr [rbp+0F8h],rax
00007FF674A23ACD xor edx,edx
00007FF674A23ACF mov rax,qword ptr [_Newoff]
00007FF674A23AD3 mov ecx,2
00007FF674A23AD8 div rax,rcx
00007FF674A23ADB mov rax,rdx
00007FF674A23ADE mov rcx,qword ptr [_Block]
00007FF674A23AE2 mov rdx,qword ptr [rbp+0F8h]
00007FF674A23AE9 mov rcx,qword ptr [rdx+rcx*8]
00007FF674A23AED lea rax,[rcx+rax*8]
00007FF674A23AF1 mov rcx,rax
00007FF674A23AF4 call std::_Unfancy<double> (07FF674A214A6h)
00007FF674A23AF9 mov qword ptr [rbp+100h],rax
00007FF674A23B00 mov rcx,qword ptr [this]
00007FF674A23B07 call std::deque<double,std::allocator<double> >::_Getal (07FF674A216CCh)
00007FF674A23B0C mov qword ptr [rbp+108h],rax
00007FF674A23B13 mov rdx,qword ptr [rbp+100h]
00007FF674A23B1A mov rcx,qword ptr [rbp+108h]
00007FF674A23B21 call std::_Default_allocator_traits<std::allocator<double> >::construct<double> (07FF674A211E5h)
_Getal(), _Unfancy(_Map()[_Block] + _Newoff % _DEQUESIZ), _STD forward<_Valty>(_Val)...);
_PUSH_BACK_END;
00007FF674A23B26 mov rcx,qword ptr [this]
00007FF674A23B2D call std::deque<double,std::allocator<double> >::_Mysize (07FF674A211B8h)
00007FF674A23B32 mov qword ptr [rbp+0F8h],rax
00007FF674A23B39 mov rax,qword ptr [rbp+0F8h]
00007FF674A23B40 mov rax,qword ptr [rax]
00007FF674A23B43 inc rax
00007FF674A23B46 mov rcx,qword ptr [rbp+0F8h]
00007FF674A23B4D mov qword ptr [rcx],rax
#if _HAS_CXX17
return back();
00007FF674A23B50 mov rcx,qword ptr [this]
00007FF674A23B57 call std::deque<double,std::allocator<double> >::back (07FF674A2127Bh)
#endif // _HAS_CXX17
}
00007FF674A23B5C lea rsp,[rbp+118h]
00007FF674A23B63 pop rdi
00007FF674A23B64 pop rbp
00007FF674A23B65 ret
Apparently std::deque doesn't pre-allocate elements, and instead uses a loop to add them one-by-one. So no wonder it is slow.
You can speed up the Debug build by enabling some optimizations (e.g. /Ob1) and reducing runtime checks (e.g. remove /RTC1).
But really, std::deque is just a horrible structure from a performance point of view (a vector of tiny vectors - not cache-friendly at all).
It is really slow in debug mode.
MSDN:
Processes that the debugger creates (also known as spawned processes) behave a little differently than processes that the debugger does not create.
Instead of using the standard heap API, processes that the debugger creates use a special debug heap. You can force a spawned process to use the standard heap instead of the debug heap by using the _NO_DEBUG_HEAP environment variable.
std::deque allocates data in chunks of fixed size varying with platform and type. For double that could be 4KB. So allocating 30,000,000 doubles takes about 240 MB of memory and thus roughly 60,000 allocations/deallocations. With std::list that would be 30,000,000 allocations/deallocations and take a couple GB more memory, making it all ridiculously slow.
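As a back-of-the-envelope check of those numbers (the 4 KB chunk size is an assumption; real implementations use different block sizes, and the MSVC listing above suggests a far smaller _DEQUESIZ for double):

#include <cstddef>

// Rough arithmetic only; chunk size is assumed, not taken from any particular STL.
constexpr std::size_t elements    = 30'000'000;
constexpr std::size_t total_bytes = elements * sizeof(double);  // 240,000,000 bytes, ~240 MB
constexpr std::size_t chunk_bytes = 4096;                       // assumed 4 KB deque block
constexpr std::size_t chunk_count = total_bytes / chunk_bytes;  // ~58,600 block allocations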
This might even cause memory fragmentation issues depending on your hardware. And if you run without optimizations it will be even slower.
There is also the privacy issue. Deallocation might be clearing data to ensure that your program didn't leak any information to outside programs.
As mentioned by @orlp, since your program is a no-op, the whole allocation/deallocation can be optimized out completely, which might explain why it speeds up significantly at times.

Optimizations in the case of multiple virtual call targets (i.e. 1, 2, 3)

I have the code below, which uses 1, 2, 3 virtual call targets:
#include <random>
#include <memory>
#include <ctime>
struct IMath
{
virtual ~IMath() = default;
virtual int compute(int) = 0;
};
struct MathAlg1: IMath
{
int compute(int i) final
{
return i * 17;
}
};
struct MathAlg2: IMath
{
int compute(int i) final
{
return i * 19;
}
};
struct MathAlg3: IMath
{
int compute(int i) final
{
return i * 23;
}
};
struct MathAlg4: IMath
{
int compute(int i) final
{
return i * 29;
}
};
namespace
{
static std::shared_ptr<int> param = std::make_shared<int>(3);
int compute(IMath& alg, int i)
{
return alg.compute(i);
}
std::unique_ptr<IMath> alg1 = std::make_unique<MathAlg1>();
std::unique_ptr<IMath> alg2 = std::make_unique<MathAlg2>();
std::unique_ptr<IMath> alg3 = std::make_unique<MathAlg3>();
std::unique_ptr<IMath> alg4 = std::make_unique<MathAlg4>();
int monomorphicCall()
{
return compute(*alg1, *param);
}
int bimorphicCall()
{
return compute(*alg1, *param) + compute(*alg2, *param);
}
int megamorphic3Call()
{
return compute(*alg1, *param) + compute(*alg2, *param) + compute(*alg3, *param);
}
int megamorphic4Call()
{
return compute(*alg1, *param) + compute(*alg2, *param) + compute(*alg3, *param) + compute(*alg4, *param);
}
}
int main(){
return monomorphicCall();
//return bimorphicCall();
//return megamorphic3Call();
//return megamorphic4Call();
}
Generated ASM (clang 6.0.0 with -O3):
monomorphicCall()
main: # #main
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
mov rax, qword ptr [rax + 16]
jmp rax # TAILCALL
bimorphicCall()
main: # #main
push rbx
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
mov rdi, qword ptr [rip + (anonymous namespace)::alg2]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
add eax, ebx
pop rbx
ret
megamorphic3Call()
main: # #main
push rbp
push rbx
push rax
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
mov rdi, qword ptr [rip + (anonymous namespace)::alg2]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebp, eax
add ebp, ebx
mov rdi, qword ptr [rip + (anonymous namespace)::alg3]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
add eax, ebp
add rsp, 8
pop rbx
pop rbp
ret
megamorphic4Call()
main: # #main
push rbp
push rbx
push rax
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
mov rdi, qword ptr [rip + (anonymous namespace)::alg2]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebp, eax
add ebp, ebx
mov rdi, qword ptr [rip + (anonymous namespace)::alg3]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
add ebx, ebp
mov rdi, qword ptr [rip + (anonymous namespace)::alg4]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
add eax, ebx
add rsp, 8
pop rbx
pop rbp
ret
Questions/Confirmation points:
In the case of monomorphicCall(), it looks to me like there is no actual
virtual table call, just a jmp rax # TAILCALL. Is this a correct assessment?
In the case of bimorphicCall(), megamorphic3Call() and megamorphic4Call(), it triggers the vcalls in all cases. Is this a correct assessment?

Is Visual C++ actually generating blatantly incorrect code?

So I'm debugging my DPLL implementation and it's not quite working right, so I step through the code line by line in the debugger. It gets to a return statement, but the thing is it doesn't return; it just keeps on executing the same function. WTF, I thought, am I really seeing this? So I looked at the disassembly and sure enough one of the return statements jumps to the wrong place. Never have I seen VS generate incorrect code, so I'm wondering if I screwed up somewhere, but I can't find anything. The jump is incorrect even when compiling with all optimizations off.
This illustrates what's going on.
bool dpll(std::vector<clause> f)
{
unitPropagate(f);
if(checkFalseClause(f))
{
return false; //je dpll+5Fh (0C01D1Fh) <-- Totally wrong jump address
}
else if(checkAllClausesTrue(f))
{
return true; //jmp dpll+206h (0C01EC6h) <-- this is fine
}
else
{
atom l = chooseLiteral(f); //this is where the jump ends up (0C01D1Fh)
std::vector<clause> a = makeDuplicate(f);
replaceInstancesOf(a, l, true);
std::vector<clause> b = makeDuplicate(f);
replaceInstancesOf(b, l, false);
return dpll(a) | dpll(b);
}
//this is where the jump is supposed to go (0C01EC6h)
}
So my question is, is Visual Studio actually broken or have I misunderstood something? Has anyone run into something like this before?
The version is Visual Studio Enterprise 2015 if that makes a difference, the code is generated for x86_32.
Here's the full disassembly if anyone's interested:
00C01CC0 push ebp
00C01CC1 mov ebp,esp
00C01CC3 push 0FFFFFFFFh
00C01CC5 push 0C08FF0h
00C01CCA mov eax,dword ptr fs:[00000000h]
00C01CD0 push eax
00C01CD1 sub esp,40h
00C01CD4 mov eax,dword ptr [__security_cookie (0C0D008h)]
00C01CD9 xor eax,ebp
00C01CDB mov dword ptr [ebp-10h],eax
00C01CDE push ebx
00C01CDF push esi
00C01CE0 push eax
00C01CE1 lea eax,[ebp-0Ch]
00C01CE4 mov dword ptr fs:[00000000h],eax
bool dpll(std::vector<clause> f)
00C01CEA lea ecx,[f]
00C01CED mov dword ptr [ebp-4],0
00C01CF4 call unitPropagate (0C01950h)
{
unitPropagate(f);
00C01CF9 lea ecx,[f]
00C01CFC call checkFalseClause (0C01660h)
00C01D01 test al,al
00C01D03 je dpll+4Ch (0C01D0Ch)
00C01D05 xor bh,bh
00C01D07 jmp dpll+206h (0C01EC6h)
if(checkFalseClause(f))
{
return false;
00C01D0C lea ecx,[f]
00C01D0F call checkAllClausesTrue (0C014F0h)
00C01D14 test al,al
00C01D16 je dpll+5Fh (0C01D1Fh)
}
else if(checkAllClausesTrue(f))
00C01D18 mov bh,1
00C01D1A jmp dpll+206h (0C01EC6h)
{
return true;
}
else
00C01D1F lea edx,[f]
00C01D22 lea ecx,[l]
00C01D25 call chooseLiteral (0C013D0h)
00C01D2A mov byte ptr [ebp-4],1
{
atom l = chooseLiteral(f);
00C01D2E lea edx,[f]
00C01D31 xorps xmm0,xmm0
00C01D34 mov dword ptr [ebp-20h],0
00C01D3B lea ecx,[a]
00C01D3E movq mmword ptr [a],xmm0
00C01D43 call makeDuplicate (0C01A30h)
00C01D48 mov byte ptr [ebp-4],2
00C01D4C sub esp,20h
00C01D4F mov esi,esp
00C01D51 mov bl,1
00C01D53 mov dword ptr [ebp-4Ch],esi
00C01D56 lea ecx,[esi+4]
00C01D59 mov al,byte ptr [l]
00C01D5C mov byte ptr [esi],al
00C01D5E mov dword ptr [ecx+14h],0Fh
00C01D65 mov dword ptr [ecx+10h],0
00C01D6C cmp dword ptr [ecx+14h],10h
00C01D70 jb dpll+0B6h (0C01D76h)
00C01D72 mov eax,dword ptr [ecx]
00C01D74 jmp dpll+0B8h (0C01D78h)
00C01D76 mov eax,ecx
00C01D78 push 0FFFFFFFFh
00C01D7A mov byte ptr [eax],0
00C01D7D lea eax,[ebp-44h]
00C01D80 push 0
00C01D82 push eax
00C01D83 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::assign (0C02A80h)
00C01D88 mov al,byte ptr [ebp-2Ch]
00C01D8B lea ecx,[a]
00C01D8E mov byte ptr [esi+1Ch],al
00C01D91 mov dl,bl
00C01D93 mov al,byte ptr [ebp-2Bh]
00C01D96 mov byte ptr [esi+1Dh],al
00C01D99 call replaceInstancesOf (0C017D0h)
00C01D9E xorps xmm0,xmm0
00C01DA1 mov dword ptr [ebp-14h],0
std::vector<clause> a = makeDuplicate(f);
replaceInstancesOf(a, l, true);
00C01DA8 lea edx,[f]
std::vector<clause> a = makeDuplicate(f);
replaceInstancesOf(a, l, true);
00C01DAB movq mmword ptr [b],xmm0
00C01DB0 lea ecx,[b]
00C01DB3 call makeDuplicate (0C01A30h)
00C01DB8 mov esi,esp
00C01DBA mov byte ptr [ebp-4],3
00C01DBE mov dword ptr [ebp-4Ch],esi
00C01DC1 lea ecx,[esi+4]
00C01DC4 mov al,byte ptr [l]
00C01DC7 xor bl,bl
00C01DC9 push 0FFFFFFFFh
00C01DCB mov byte ptr [esi],al
00C01DCD lea eax,[ebp-44h]
00C01DD0 push 0
00C01DD2 mov dword ptr [ecx+14h],0Fh
00C01DD9 mov dword ptr [ecx+10h],0
00C01DE0 push eax
00C01DE1 mov byte ptr [ecx],bl
00C01DE3 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::assign (0C02A80h)
00C01DE8 mov al,byte ptr [ebp-2Ch]
00C01DEB lea ecx,[b]
00C01DEE mov byte ptr [esi+1Ch],al
00C01DF1 mov dl,bl
00C01DF3 mov al,byte ptr [ebp-2Bh]
00C01DF6 mov byte ptr [esi+1Dh],al
00C01DF9 call replaceInstancesOf (0C017D0h)
std::vector<clause> b = makeDuplicate(f);
replaceInstancesOf(b, l, false);
00C01DFE add esp,14h
00C01E01 lea eax,[a]
00C01E04 mov ecx,esp
00C01E06 push eax
00C01E07 call std::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > >::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > > (0C02420h)
00C01E0C call dpll (0C01CC0h)
00C01E11 mov bl,al
00C01E13 mov ecx,esp
00C01E15 lea eax,[b]
00C01E18 push eax
00C01E19 call std::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > >::vector<std::vector<atom,std::allocator<atom> >,std::allocator<std::vector<atom,std::allocator<atom> > > > (0C02420h)
00C01E1E call dpll (0C01CC0h)
00C01E23 mov ecx,dword ptr [b]
00C01E26 mov bh,al
00C01E28 add esp,0Ch
00C01E2B or bh,bl
00C01E2D test ecx,ecx
00C01E2F je dpll+1B4h (0C01E74h)
00C01E31 push dword ptr [ebp-4Ch]
00C01E34 mov edx,dword ptr [ebp-18h]
00C01E37 push ecx
00C01E38 call std::_Destroy_range1<std::allocator<std::vector<atom,std::allocator<atom> > >,std::vector<atom,std::allocator<atom> > *> (0C035E0h)
00C01E3D mov ecx,dword ptr [ebp-14h]
00C01E40 mov eax,2AAAAAABh
00C01E45 mov esi,dword ptr [b]
00C01E48 add esp,8
00C01E4B sub ecx,esi
00C01E4D imul ecx
00C01E4F sar edx,1
00C01E51 mov eax,edx
00C01E53 shr eax,1Fh
00C01E56 add eax,edx
00C01E58 push eax
00C01E59 push esi
00C01E5A call std::_Wrap_alloc<std::allocator<std::vector<atom,std::allocator<atom> > > >::deallocate (0C02D20h)
00C01E5F mov dword ptr [b],0
00C01E66 mov dword ptr [ebp-18h],0
00C01E6D mov dword ptr [ebp-14h],0
00C01E74 mov ecx,dword ptr [a]
00C01E77 test ecx,ecx
00C01E79 je dpll+1FEh (0C01EBEh)
00C01E7B push dword ptr [ebp-4Ch]
00C01E7E mov edx,dword ptr [ebp-24h]
00C01E81 push ecx
00C01E82 call std::_Destroy_range1<std::allocator<std::vector<atom,std::allocator<atom> > >,std::vector<atom,std::allocator<atom> > *> (0C035E0h)
00C01E87 mov ecx,dword ptr [ebp-20h]
00C01E8A mov eax,2AAAAAABh
00C01E8F mov esi,dword ptr [a]
00C01E92 add esp,8
00C01E95 sub ecx,esi
00C01E97 imul ecx
00C01E99 sar edx,1
00C01E9B mov eax,edx
00C01E9D shr eax,1Fh
00C01EA0 add eax,edx
00C01EA2 push eax
00C01EA3 push esi
00C01EA4 call std::_Wrap_alloc<std::allocator<std::vector<atom,std::allocator<atom> > > >::deallocate (0C02D20h)
00C01EA9 mov dword ptr [a],0
00C01EB0 mov dword ptr [ebp-24h],0
00C01EB7 mov dword ptr [ebp-20h],0
00C01EBE lea ecx,[ebp-44h]
00C01EC1 call std::basic_string<char,std::char_traits<char>,std::allocator<char> >::~basic_string<char,std::char_traits<char>,std::allocator<char> > (0C027A0h)
00C01EC6 mov ecx,dword ptr [f]
00C01EC9 test ecx,ecx
00C01ECB je dpll+23Bh (0C01EFBh)
00C01ECD push dword ptr [ebp-4Ch]
00C01ED0 mov edx,dword ptr [ebp+0Ch]
00C01ED3 push ecx
00C01ED4 call std::_Destroy_range1<std::allocator<std::vector<atom,std::allocator<atom> > >,std::vector<atom,std::allocator<atom> > *> (0C035E0h)
00C01ED9 mov ecx,dword ptr [ebp+10h]
00C01EDC mov eax,2AAAAAABh
00C01EE1 mov esi,dword ptr [f]
00C01EE4 add esp,8
00C01EE7 sub ecx,esi
00C01EE9 imul ecx
00C01EEB sar edx,1
00C01EED mov ecx,edx
00C01EEF shr ecx,1Fh
00C01EF2 add ecx,edx
00C01EF4 push ecx
00C01EF5 push esi
00C01EF6 call std::_Wrap_alloc<std::allocator<std::vector<atom,std::allocator<atom> > > >::deallocate (0C02D20h)
00C01EFB mov al,bh
00C01EFD mov ecx,dword ptr [ebp-0Ch]
00C01F00 mov dword ptr fs:[0],ecx
00C01F07 pop ecx
00C01F08 pop esi
00C01F09 pop ebx
00C01F0A mov ecx,dword ptr [ebp-10h]
00C01F0D xor ecx,ebp
00C01F0F call __security_check_cookie (0C080CCh)
00C01F14 mov esp,ebp
00C01F16 pop ebp
00C01F17 ret
The source interleaving is wrong. This is the correct place you want to look at:
00C01CFC call checkFalseClause (0C01660h)
00C01D01 test al,al
00C01D03 je dpll+4Ch (0C01D0Ch)
00C01D05 xor bh,bh
00C01D07 jmp dpll+206h (0C01EC6h)
As you can see, it goes to the expected address if the return value was nonzero.
The part you looked at is actually for the else if(checkAllClausesTrue(f)) and the jump is the one going to the else clause because the compiler negated the condition.
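To restate the point with a toy example (hand-written sketch, not compiler output): an if / else-if chain is normally emitted with each condition negated, so the je after a test is taken when the tested call returned false, i.e. it jumps into the next branch rather than out of the function:

#include <iostream>

bool checkA() { return false; }   // stand-in for checkFalseClause(f)
bool checkB() { return false; }   // stand-in for checkAllClausesTrue(f)

bool shape() {
    if (checkA()) {        // asm: call checkA ; test al,al ; je L_elseif  (jump if it returned false)
        return false;      //      otherwise set the result and jmp to the common epilogue
    } else if (checkB()) { // asm: call checkB ; test al,al ; je L_else    (the "wrong-looking" jump)
        return true;
    } else {
        return false;      // L_else: the chooseLiteral(...) branch in the real code
    }
}

int main() { std::cout << shape() << '\n'; }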

Plain C++ Code 10 times faster than inline assembler. Why?

These two code snippets do the same thing: adding two float arrays together and storing the result back into both of them.
Inline Assembler:
void vecAdd_SSE(float* v1, float* v2) {
_asm {
mov esi, v1
mov edi, v2
movups xmm0, [esi]
movups xmm1, [edi]
addps xmm0, xmm1
movups [esi], xmm0
movups [edi], xmm0
}
}
Plain C++ Code:
void vecAdd_Std(float* v1, float* v2) {
v1[0] = v1[0]+ v2[0];
v1[1] = v1[1]+ v2[1];
v1[2] = v1[2]+ v2[2];
v1[3] = v1[3]+ v2[3];
v2[0] = v1[0];
v2[1] = v1[1];
v2[2] = v1[2];
v2[3] = v1[3];
}
Disassembly for the C++ code (taken in Debug mode because I cannot view the disassembly in Release mode for some reason):
void vecAdd_Std(float* v1, float* v2) {
push ebp
mov ebp,esp
sub esp,0C0h
push ebx
push esi
push edi
lea edi,[ebp-0C0h]
mov ecx,30h
mov eax,0CCCCCCCCh
rep stos dword ptr es:[edi]
v1[0] = v1[0]+ v2[0];
mov eax,4
imul ecx,eax,0
mov edx,4
imul eax,edx,0
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+ecx]
addss xmm0,dword ptr [esi+eax]
mov eax,4
imul ecx,eax,0
mov edx,dword ptr [v1]
movss dword ptr [edx+ecx],xmm0
v1[1] = v1[1]+ v2[1];
mov eax,4
shl eax,0
v1[1] = v1[1]+ v2[1];
mov ecx,4
shl ecx,0
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+eax]
addss xmm0,dword ptr [esi+ecx]
mov eax,4
shl eax,0
mov ecx,dword ptr [v1]
movss dword ptr [ecx+eax],xmm0
v1[2] = v1[2]+ v2[2];
mov eax,4
shl eax,1
mov ecx,4
shl ecx,1
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+eax]
addss xmm0,dword ptr [esi+ecx]
mov eax,4
shl eax,1
mov ecx,dword ptr [v1]
movss dword ptr [ecx+eax],xmm0
v1[3] = v1[3]+ v2[3];
mov eax,4
imul ecx,eax,3
mov edx,4
imul eax,edx,3
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+ecx]
addss xmm0,dword ptr [esi+eax]
mov eax,4
imul ecx,eax,3
mov edx,dword ptr [v1]
movss dword ptr [edx+ecx],xmm0
v2[0] = v1[0];
mov eax,4
imul ecx,eax,0
mov edx,4
imul eax,edx,0
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov ecx,dword ptr [esi+ecx]
mov dword ptr [edx+eax],ecx
v2[1] = v1[1];
mov eax,4
shl eax,0
mov ecx,4
shl ecx,0
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov eax,dword ptr [esi+eax]
mov dword ptr [edx+ecx],eax
v2[2] = v1[2];
mov eax,4
shl eax,1
mov ecx,4
shl ecx,1
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov eax,dword ptr [esi+eax]
mov dword ptr [edx+ecx],eax
v2[3] = v1[3];
mov eax,4
imul ecx,eax,3
mov edx,4
imul eax,edx,3
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov ecx,dword ptr [esi+ecx]
mov dword ptr [edx+eax],ecx
}
Now I made a time measurement on those two functions and noticed that the inline assembler code takes approximately 10 times longer (in Release mode).
Does anybody know why?
On my machine (VS2015 64-bit mode), the compiler inlines vecAdd_Std and produces
00007FF625921C8F vmovups xmm1,xmmword ptr [__xmm#4100000040c000004080000040000000 (07FF625929D60h)]
00007FF625921C97 vmovups xmm4,xmm1
00007FF625921C9B vcvtss2sd xmm1,xmm1,xmm4
Test code
int main() {
float x[4] = {1.0, 2.0, 3.0, 4.0};
float y[4] = {1.0, 2.0, 3.0, 4.0};
vecAdd_Std(x, y);
std::cout << x[0];
}
You aren't really calling a function that executes one SSE instruction, are you? There's non-trivial overhead involved in setting up the xmm registers, and you're copying the values from memory to the registers and back, which will take far longer than the actual calculation.
I wouldn't be at all surprised to find that the compiler inlines the C++ version of the function, but doesn't (can't, really) do the same for functions that contain inline assembly.
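As an aside (not part of the original answer), the usual way to keep hand-written SSE while still letting the compiler inline, schedule and allocate registers is to use intrinsics instead of inline asm. A rough equivalent of vecAdd_SSE:

#include <xmmintrin.h>  // SSE intrinsics

void vecAdd_intrin(float* v1, float* v2) {
    __m128 a = _mm_loadu_ps(v1);   // unaligned load of 4 floats from v1
    __m128 b = _mm_loadu_ps(v2);   // unaligned load of 4 floats from v2
    __m128 s = _mm_add_ps(a, b);   // packed single-precision add
    _mm_storeu_ps(v1, s);          // store the sum back to both arrays,
    _mm_storeu_ps(v2, s);          // mirroring the original functions
}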

Default initialization with default constructor for primitive data types

Are there any drawbacks/disadvantages to using the default constructor for default initialization of primitive data types?
For example
class MyClass
{
public:
MyClass();
private:
int miInt;
double mdDouble;
bool mbBool;
};
Using this constructor:
MyClass::MyClass()
: miInt(int())
, mdDouble(double())
, mbBool(bool())
{}
instead of this:
MyClass::MyClass()
: miInt(0)
, mdDouble(0.0)
, mbBool(false)
{}
No, and the compiler will most probably generate the same code for both.
With optimization off, the following code is generated:
MyClass::MyClass()
: miInt(0)
, mdDouble(0.0)
, mbBool(false)
{}
012313A0 push ebp
012313A1 mov ebp,esp
012313A3 sub esp,0CCh
012313A9 push ebx
012313AA push esi
012313AB push edi
012313AC push ecx
012313AD lea edi,[ebp-0CCh]
012313B3 mov ecx,33h
012313B8 mov eax,0CCCCCCCCh
012313BD rep stos dword ptr es:[edi]
012313BF pop ecx
012313C0 mov dword ptr [ebp-8],ecx
012313C3 mov eax,dword ptr [this]
012313C6 mov dword ptr [eax],0
012313CC mov eax,dword ptr [this]
012313CF fldz
012313D1 fstp qword ptr [eax+8]
012313D4 mov eax,dword ptr [this]
012313D7 mov byte ptr [eax+10h],0
012313DB mov eax,dword ptr [this]
012313DE pop edi
012313DF pop esi
012313E0 pop ebx
012313E1 mov esp,ebp
012313E3 pop ebp
012313E4 ret
and
MyClass::MyClass()
: miInt(int())
, mdDouble(double())
, mbBool(bool())
{}
001513A0 push ebp
001513A1 mov ebp,esp
001513A3 sub esp,0CCh
001513A9 push ebx
001513AA push esi
001513AB push edi
001513AC push ecx
001513AD lea edi,[ebp-0CCh]
001513B3 mov ecx,33h
001513B8 mov eax,0CCCCCCCCh
001513BD rep stos dword ptr es:[edi]
001513BF pop ecx
001513C0 mov dword ptr [ebp-8],ecx
001513C3 mov eax,dword ptr [this]
001513C6 mov dword ptr [eax],0
001513CC mov eax,dword ptr [this]
001513CF fldz
001513D1 fstp qword ptr [eax+8]
001513D4 mov eax,dword ptr [this]
001513D7 mov byte ptr [eax+10h],0
001513DB mov eax,dword ptr [this]
001513DE pop edi
001513DF pop esi
001513E0 pop ebx
001513E1 mov esp,ebp
001513E3 pop ebp
001513E4 ret
As you can see, it's identical.
There is a more consistent syntax for creating default objects:
MyClass::MyClass()
: miInt()
, mdDouble()
, mbBool()
{
}
That is, don't pass anything. Just write T() and the object will be created with its default value. It is also consistent with class types (think of POD types)!
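For example (an illustrative sketch, not from the original answer), the same empty-parentheses initializer value-initializes both built-in and class types, which is what makes it attractive in template code where T could be either:

#include <iostream>
#include <string>

template <typename T>
struct Holder {
    T value;
    Holder() : value() {}  // value-initialization: 0 for int/double/bool, default ctor for class types
};

int main() {
    Holder<int> i;
    Holder<double> d;
    Holder<std::string> s;
    std::cout << i.value << ' ' << d.value << ' ' << s.value.size() << '\n';  // prints "0 0 0"
}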