Related
Disclaimer: full code can be found here.
16 byte alignment
Given a fairly simple type to support proper SSE alignment
struct alignas(16) simd_pack
{
std::int32_t data[4];
};
and a function that adds two arrays together
void add_packed(simd_pack* lhs_and_result, simd_pack* rhs, std::size_t size)
{
for (std::size_t i = 0; i < size; i++)
for (std::size_t j = 0; j < 4; j++)
lhs_and_result[i].data[j] += rhs[i].data[j];
}
compile the code with clang and gcc using -O3.
Clang produces the following assembly:
add_packed(simd_pack*, simd_pack*, unsigned long): # #add_packed(simd_pack*, simd_pack*, unsigned long)
test rdx, rdx
je .LBB0_3
mov eax, 12
.LBB0_2: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rsi + rax - 12]
add dword ptr [rdi + rax - 12], ecx
mov ecx, dword ptr [rsi + rax - 8]
add dword ptr [rdi + rax - 8], ecx
mov ecx, dword ptr [rsi + rax - 4]
add dword ptr [rdi + rax - 4], ecx
mov ecx, dword ptr [rsi + rax]
add dword ptr [rdi + rax], ecx
add rax, 16
add rdx, -1
jne .LBB0_2
.LBB0_3:
ret
I'm not very literate in assembly but to me it looks like clang is simply unrolling the inner for loop. If we take a look at gcc we get:
add_packed(simd_pack*, simd_pack*, unsigned long):
test rdx, rdx
je .L1
sal rdx, 4
xor eax, eax
.L3:
movdqa xmm0, XMMWORD PTR [rdi+rax]
paddd xmm0, XMMWORD PTR [rsi+rax]
movaps XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rdx
jne .L3
.L1:
ret
which is what I expect.
64 byte alignment
The difference gets even bigger (obviously) if we go to 64 byte alignment (which usually is a cache line if I'm not mistaken)
struct alignas(64) cache_line
{
std::int32_t data[16];
};
void add_cache_line(cache_line* lhs_and_result, cache_line* rhs, std::size_t size)
{
for (std::size_t i = 0; i < size; i++)
for (std::size_t j = 0; j < 16; j++)
lhs_and_result[i].data[j] += rhs[i].data[j];
}
Clang keeps simply unrolling:
add_cache_line(cache_line*, cache_line*, unsigned long): # #add_cache_line(cache_line*, cache_line*, unsigned long)
test rdx, rdx
je .LBB1_3
mov eax, 60
.LBB1_2: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rsi + rax - 60]
add dword ptr [rdi + rax - 60], ecx
mov ecx, dword ptr [rsi + rax - 56]
add dword ptr [rdi + rax - 56], ecx
mov ecx, dword ptr [rsi + rax - 52]
add dword ptr [rdi + rax - 52], ecx
mov ecx, dword ptr [rsi + rax - 48]
add dword ptr [rdi + rax - 48], ecx
mov ecx, dword ptr [rsi + rax - 44]
add dword ptr [rdi + rax - 44], ecx
mov ecx, dword ptr [rsi + rax - 40]
add dword ptr [rdi + rax - 40], ecx
mov ecx, dword ptr [rsi + rax - 36]
add dword ptr [rdi + rax - 36], ecx
mov ecx, dword ptr [rsi + rax - 32]
add dword ptr [rdi + rax - 32], ecx
mov ecx, dword ptr [rsi + rax - 28]
add dword ptr [rdi + rax - 28], ecx
mov ecx, dword ptr [rsi + rax - 24]
add dword ptr [rdi + rax - 24], ecx
mov ecx, dword ptr [rsi + rax - 20]
add dword ptr [rdi + rax - 20], ecx
mov ecx, dword ptr [rsi + rax - 16]
add dword ptr [rdi + rax - 16], ecx
mov ecx, dword ptr [rsi + rax - 12]
add dword ptr [rdi + rax - 12], ecx
mov ecx, dword ptr [rsi + rax - 8]
add dword ptr [rdi + rax - 8], ecx
mov ecx, dword ptr [rsi + rax - 4]
add dword ptr [rdi + rax - 4], ecx
mov ecx, dword ptr [rsi + rax]
add dword ptr [rdi + rax], ecx
add rax, 64
add rdx, -1
jne .LBB1_2
.LBB1_3:
ret
while gcc uses SSE and also unrolls that:
add_cache_line(cache_line*, cache_line*, unsigned long):
mov rcx, rdx
test rdx, rdx
je .L9
sal rcx, 6
mov rax, rdi
mov rdx, rsi
add rcx, rdi
.L11:
movdqa xmm2, XMMWORD PTR [rdx+16]
movdqa xmm3, XMMWORD PTR [rax]
add rax, 64
add rdx, 64
movdqa xmm1, XMMWORD PTR [rdx-32]
movdqa xmm0, XMMWORD PTR [rdx-16]
paddd xmm3, XMMWORD PTR [rdx-64]
paddd xmm2, XMMWORD PTR [rax-48]
paddd xmm1, XMMWORD PTR [rax-32]
paddd xmm0, XMMWORD PTR [rax-16]
movaps XMMWORD PTR [rax-64], xmm3
movaps XMMWORD PTR [rax-48], xmm2
movaps XMMWORD PTR [rax-32], xmm1
movaps XMMWORD PTR [rax-16], xmm0
cmp rax, rcx
jne .L11
.L9:
ret
No alignment
It's getting interesting if we use plain 32 bit integer arrays with no alignment at all. We use the exact same compiler flags.
void add_unaligned(std::int32_t* lhs_and_result, std::int32_t* rhs, std::size_t size)
{
for (std::size_t i = 0; i < size; i++)
lhs_and_result[i] += rhs[i];
}
Clang
Clang's assembly exploded a fair bit by adding some branches:
add_unaligned(int*, int*, unsigned long): # #add_unaligned(int*, int*, unsigned long)
test rdx, rdx
je .LBB2_16
cmp rdx, 7
jbe .LBB2_2
lea rax, [rsi + 4*rdx]
cmp rax, rdi
jbe .LBB2_9
lea rax, [rdi + 4*rdx]
cmp rax, rsi
jbe .LBB2_9
.LBB2_2:
xor r10d, r10d
.LBB2_3:
mov r8, r10
not r8
add r8, rdx
mov rcx, rdx
and rcx, 3
je .LBB2_5
.LBB2_4: # =>This Inner Loop Header: Depth=1
mov eax, dword ptr [rsi + 4*r10]
add dword ptr [rdi + 4*r10], eax
add r10, 1
add rcx, -1
jne .LBB2_4
.LBB2_5:
cmp r8, 3
jb .LBB2_16
.LBB2_6: # =>This Inner Loop Header: Depth=1
mov eax, dword ptr [rsi + 4*r10]
add dword ptr [rdi + 4*r10], eax
mov eax, dword ptr [rsi + 4*r10 + 4]
add dword ptr [rdi + 4*r10 + 4], eax
mov eax, dword ptr [rsi + 4*r10 + 8]
add dword ptr [rdi + 4*r10 + 8], eax
mov eax, dword ptr [rsi + 4*r10 + 12]
add dword ptr [rdi + 4*r10 + 12], eax
add r10, 4
cmp rdx, r10
jne .LBB2_6
jmp .LBB2_16
.LBB2_9:
mov r10, rdx
and r10, -8
lea rax, [r10 - 8]
mov r9, rax
shr r9, 3
add r9, 1
mov r8d, r9d
and r8d, 1
test rax, rax
je .LBB2_10
sub r9, r8
xor ecx, ecx
.LBB2_12: # =>This Inner Loop Header: Depth=1
movdqu xmm0, xmmword ptr [rsi + 4*rcx]
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 16]
movdqu xmm2, xmmword ptr [rdi + 4*rcx]
paddd xmm2, xmm0
movdqu xmm0, xmmword ptr [rdi + 4*rcx + 16]
paddd xmm0, xmm1
movdqu xmm1, xmmword ptr [rdi + 4*rcx + 32]
movdqu xmm3, xmmword ptr [rdi + 4*rcx + 48]
movdqu xmmword ptr [rdi + 4*rcx], xmm2
movdqu xmmword ptr [rdi + 4*rcx + 16], xmm0
movdqu xmm0, xmmword ptr [rsi + 4*rcx + 32]
paddd xmm0, xmm1
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 48]
paddd xmm1, xmm3
movdqu xmmword ptr [rdi + 4*rcx + 32], xmm0
movdqu xmmword ptr [rdi + 4*rcx + 48], xmm1
add rcx, 16
add r9, -2
jne .LBB2_12
test r8, r8
je .LBB2_15
.LBB2_14:
movdqu xmm0, xmmword ptr [rsi + 4*rcx]
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 16]
movdqu xmm2, xmmword ptr [rdi + 4*rcx]
paddd xmm2, xmm0
movdqu xmm0, xmmword ptr [rdi + 4*rcx + 16]
paddd xmm0, xmm1
movdqu xmmword ptr [rdi + 4*rcx], xmm2
movdqu xmmword ptr [rdi + 4*rcx + 16], xmm0
.LBB2_15:
cmp r10, rdx
jne .LBB2_3
.LBB2_16:
ret
.LBB2_10:
xor ecx, ecx
test r8, r8
jne .LBB2_14
jmp .LBB2_15
What is happening at .LBB2_4 and .LBB2_6? It looks like it's unrolling a loop again but I'm not sure what happens there (mainly because of the registers used).
In .LBB2_12 it even unrolls the SSE part. I think it's only unrolled two-fold though because it needs two SIMD registers to load each operand because they are unaligned now. .LBB2_14 contains the SSE part without the unrolling.
How is the control flow here? I'm assuming it should be:
keep using the unrolled SSE part until the remaining data is too small to fill all the registers (xmm0..3)
switch to the single stage SSE part and do it once if we have enough data remaining to fill xmm0 (4 integers in our case)
process the remaining data (3 operations at max, otherwise it would be SSE suitable again)
The order of the labels and the jump instructions are confusing, is that (approx.) what happens here?
GCC
Gcc's assembly is a bit easier to read:
add_unaligned(int*, int*, unsigned long):
test rdx, rdx
je .L16
lea rcx, [rsi+4]
mov rax, rdi
sub rax, rcx
cmp rax, 8
jbe .L22
lea rax, [rdx-1]
cmp rax, 2
jbe .L22
mov rcx, rdx
xor eax, eax
shr rcx, 2
sal rcx, 4
.L19:
movdqu xmm0, XMMWORD PTR [rdi+rax]
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddd xmm0, xmm1
movups XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rcx
jne .L19
mov rax, rdx
and rax, -4
test dl, 3
je .L16
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
lea rcx, [rax+1]
cmp rdx, rcx
jbe .L16
add rax, 2
mov r8d, DWORD PTR [rsi+rcx*4]
add DWORD PTR [rdi+rcx*4], r8d
cmp rdx, rax
jbe .L16
mov edx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], edx
ret
.L22:
xor eax, eax
.L18:
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
add rax, 1
cmp rdx, rax
jne .L18
.L16:
ret
I assume the control flow is similar to clang
keep using the single stage SSE part until the remaining data is too small to fill xmm0 and xmm1
process the remaining data (3 operations at max, otherwise it would be SSE suitable again)
It looks like exactly this is happening in .L19 but what is .L18 doing then?
Summary
Here is the full code, including assembly. My questions are:
Why is clang unrolling the functions that use aligned data instead of using SSE or a combination of both (like gcc)?
What are .LBB2_4 and .LBB2_6 in clang's assembly doing?
Are my assumptions about the control flow of the function with the unaligned data correct?
What is .L18 in gcc's assembly doing?
Here's a simple inheritance using a virtual base class (code available on Compiler Explorer).
class B {
public:
int i = 1;
};
class D : virtual public B {
public:
int j = 2;
};
void Assign(B *b) {
b->i = 2;
}
int main() {
B *b = new D();
Assign(b);
return 0;
}
The assembly listing of the main() function looks like this:
09 main: # #main
10 push rbp
11 mov rbp, rsp
12 sub rsp, 32
13 mov eax, 16
14 mov edi, eax
15 mov dword ptr [rbp - 4], 0
16 call operator new(unsigned long)
17 xor esi, esi
18 mov ecx, 16
19 mov edx, ecx
20 mov rdi, rax
21 mov qword ptr [rbp - 24], rax # 8-byte Spill
22 call memset
23 mov rdi, qword ptr [rbp - 24] # 8-byte Reload
24 call D::D() [complete object constructor]
25 xor ecx, ecx
26 mov eax, ecx
27 mov rdx, qword ptr [rbp - 24] # 8-byte Reload
28 cmp rdx, 0
29 mov qword ptr [rbp - 32], rax # 8-byte Spill
30 je .LBB1_2
31 mov rax, qword ptr [rbp - 24] # 8-byte Reload
32 mov rcx, qword ptr [rax]
33 mov rcx, qword ptr [rcx - 24]
34 add rax, rcx
35 mov qword ptr [rbp - 32], rax # 8-byte Spill
36 .LBB1_2:
37 mov rax, qword ptr [rbp - 32] # 8-byte Reload
38 mov qword ptr [rbp - 16], rax
39 mov rdi, qword ptr [rbp - 16]
40 call Assign(B*)
41 xor eax, eax
42 add rsp, 32
43 pop rbp
44 ret
What is the effect of line 27-38 of the assembly?
What is the value of rax in line 29?
Why is there a branch statement?
The effect of lines 27-38 is to convert a D * to a B *. Because B is a virtual base class, it can have a variable offset from the start of D. Those 12 lines calculate where the B object is, in an unoptimized way.
The value of eax on line 29 is 0 (see lines 25-26).
The branch statement on line 30 is the result of a NULL pointer check. If the pointer to D is NULL, the conversion to a B * will also be NULL and the extra code to determine the correct offset is not wanted in that case.
I have below code which actually uses 1, 2, 3 virtual call targets:
#include <random>
#include <memory>
#include <ctime>
struct IMath
{
virtual ~IMath() = default;
virtual int compute(int) = 0;
};
struct MathAlg1: IMath
{
int compute(int i) final
{
return i * 17;
}
};
struct MathAlg2: IMath
{
int compute(int i) final
{
return i * 19;
}
};
struct MathAlg3: IMath
{
int compute(int i) final
{
return i * 23;
}
};
struct MathAlg4: IMath
{
int compute(int i) final
{
return i * 29;
}
};
namespace
{
static std::shared_ptr<int> param = std::make_shared<int>(3);
int compute(IMath& alg, int i)
{
return alg.compute(i);
}
std::unique_ptr<IMath> alg1 = std::make_unique<MathAlg1>();
std::unique_ptr<IMath> alg2 = std::make_unique<MathAlg2>();
std::unique_ptr<IMath> alg3 = std::make_unique<MathAlg3>();
std::unique_ptr<IMath> alg4 = std::make_unique<MathAlg4>();
int monomorphicCall()
{
return compute(*alg1, *param);
}
int bimorphicCall()
{
return compute(*alg1, *param) + compute(*alg2, *param);
}
int megamorphic3Call()
{
return compute(*alg1, *param) + compute(*alg2, *param) + compute(*alg3, *param);
}
int megamorphic4Call()
{
return compute(*alg1, *param) + compute(*alg2, *param) + compute(*alg3, *param) + compute(*alg4, *param);
}
}
int main(){
return monomorphicCall();
//return bimorphicCall();
//return megamorphic3Call();
//return megamorphic4Call();
}
Generated ASM: (clang 6.0.0 w/ -O3)
monomorphicCall()
main: # #main
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
mov rax, qword ptr [rax + 16]
jmp rax # TAILCALL
bimorphicCall()
main: # #main
push rbx
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
mov rdi, qword ptr [rip + (anonymous namespace)::alg2]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
add eax, ebx
pop rbx
ret
megamorphic3Call()
main: # #main
push rbp
push rbx
push rax
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
mov rdi, qword ptr [rip + (anonymous namespace)::alg2]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebp, eax
add ebp, ebx
mov rdi, qword ptr [rip + (anonymous namespace)::alg3]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
add eax, ebp
add rsp, 8
pop rbx
pop rbp
ret
megamorphic4Call()
main: # #main
push rbp
push rbx
push rax
mov rdi, qword ptr [rip + (anonymous namespace)::alg1]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
mov rdi, qword ptr [rip + (anonymous namespace)::alg2]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebp, eax
add ebp, ebx
mov rdi, qword ptr [rip + (anonymous namespace)::alg3]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
mov ebx, eax
add ebx, ebp
mov rdi, qword ptr [rip + (anonymous namespace)::alg4]
mov rax, qword ptr [rip + (anonymous namespace)::param]
mov esi, dword ptr [rax]
mov rax, qword ptr [rdi]
call qword ptr [rax + 16]
add eax, ebx
add rsp, 8
pop rbx
pop rbp
ret
Questions/Confirmation points:
In the case of monomorphicCall() to me, it looks like there is no actual
virtual table call, but a jmp rax # TAILCALL. Is this a correct assessment?
In case of bimorphicCall(), megamorphic3Call(), megamorphic4Call() it triggers the vcalls in all cases. Is this a correct assessment?
I am printing some information about CPU in my OS using CPUID instruction.
Reading and printing the vendor string (GenuineIntel) works well, but reading the brand string gives me a somewhat strange string.
ok cpu-info <= Run command
CPU Vendor name: GenuineIntel <= Vendor string is good
CPU Brand: D: l(R) Core(TMD: CPU MD: <= What..?
ok
The brand string is supposed to be:
Intel(R) Core(TM) i5 CPU M 540
But what I got is:
D: l(R) Core(TMD: CPU MD:
C++ code:
char vendorString[13] = { 0, };
Dword eax, ebx, ecx, edx;
ACpuid(0, &eax, &ebx, &ecx, &edx);
*((Dword*)vendorString) = ebx;
*((Dword*)vendorString + 1) = edx;
*((Dword*)vendorString + 2) = ecx;
Console::Output.Write(L"CPU vendor name: ");
for (int i = 0; i < 13; i++) {
Console::Output.Write((wchar_t)(vendorString[i]));
}
Console::Output.WriteLine();
char brandString[48] = { 0, };
ACpuid(0x80000002, &eax, &ebx, &ecx, &edx);
*((Dword*)brandString) = eax;
*((Dword*)brandString + 1) = ebx;
*((Dword*)brandString + 2) = ecx;
*((Dword*)brandString + 3) = edx;
ACpuid(0x80000003, &eax, &ebx, &ecx, &edx);
*((Dword*)brandString + 4) = eax;
*((Dword*)brandString + 5) = ebx;
*((Dword*)brandString + 6) = ecx;
*((Dword*)brandString + 7) = edx;
ACpuid(0x80000004, &eax, &ebx, &ecx, &edx);
*((Dword*)brandString + 8) = eax;
*((Dword*)brandString + 9) = ebx;
*((Dword*)brandString + 10) = ecx;
*((Dword*)brandString + 11) = edx;
Console::Output.Write(L"CPU brand: ");
for (int i = 0; i < 48; i++) {
Console::Output.Write((wchar_t) brandString[i]);
}
Console::Output.WriteLine();
NOTE:
This program is UEFI application. No problem with permissions.
Console is a wrapper class for the EFI console. Not C# stuff.
Dword = unsigned 32bit integer
Assembly code(MASM):
;Cpuid command
;ACpuid(Type, pEax, pEbx, pEcx, pEdx)
ACpuid Proc
;Type => Rcx
;pEax => Rdx
;pEbx => R8
;pEcx => R9
;pEdx => [ rbp + 48 ] ?
push rbp
mov rbp, rsp
push rax
push rsi
mov rax, rcx
cpuid
mov [ rdx ], eax
mov [ r8 ], ebx
mov [ r9 ], ecx
mov rsi, [ rbp + 48 ]
mov [ rsi ], rdx
pop rsi
pop rax
pop rbp
ret
ACpuid Endp
I agree with Ross Ridge that you should use the compiler intrinsic __cpuid. As for why your code likely doesn't work as is - there are some bugs that will cause problems.
CPUID destroys the contents of RAX, RBX, RCX, and RDX and yet you do this in your code:
cpuid
mov [ rdx ], eax
RDX has been destroyed by the time mov [ rdx ], eax is executed, rendering the pointer in RDX invalid. You'll need to move RDX to another register before using the CPUID instruction.
Per the Windows 64-bit Calling Convention these are the volatile registers that need to be preserved by the caller:
The registers RAX, RCX, RDX, R8, R9, R10, R11 are considered volatile and must be considered destroyed on function calls (unless otherwise safety-provable by analysis such as whole program optimization).
These are the non-volatile ones that need to be preserved by the callee:
The registers RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15 are considered nonvolatile and must be saved and restored by a function that uses them.
We can use R10 (a volatile register) to store RDX temporarily. Rather than use RSI in the code we can reuse R10 for updating the value at pEdx. We won't need to preserve RSI if we don't use it. CPUID does destroy RBX, and RBX is non-volatile, so we need to preserve it. RAX is volatile so we don't need to preserve it.
In your code you have this line:
mov [ rsi ], rdx
RSI is a memory address (pEdx) provided by the caller to store the value in EDX. The code you have would move the contents of the 8-byte register RDX to a memory location that was expecting a 4-byte DWORD. This could potentially trash data in the caller. This really should have been:
mov [ rsi ], edx
With all of the above in mind we could code the ACpuid routine this way:
option casemap:none
.code
;Cpuid command
;ACpuid(Type, pEax, pEbx, pEcx, pEdx)
ACpuid Proc
;Type => Rcx
;pEax => Rdx
;pEbx => R8
;pEcx => R9
;pEdx => [ rbp + 48 ] ?
push rbp
mov rbp, rsp
push rbx ; Preserve RBX (destroyed by CPUID)
mov r10, rdx ; Save RDX before CPUID
mov rax, rcx
cpuid
mov [ r10 ], eax
mov [ r8 ], ebx
mov [ r9 ], ecx
mov r10, [ rbp + 48 ]
mov [ r10 ], edx ; Last parameter is pointer to 32-bit DWORD,
; Move EDX to the memory location, not RDX
pop rbx
pop rbp
ret
ACpuid Endp
end
In attempt to look at this, I wrote this simple code where I just created variables of different types and passed them into a function by value, by reference, and by pointer:
int i = 1;
char c = 'a';
int* p = &i;
float f = 1.1;
TestClass tc; // has 2 private data members: int i = 1 and int j = 2
the function bodies were left blank because i am just looking at how parameters are passed in.
passByValue(i, c, p, f, tc);
passByReference(i, c, p, f, tc);
passByPointer(&i, &c, &p, &f, &tc);
wanted to see how this is different for an array and also how the parameters are then accessed.
int numbers[] = {1, 2, 3};
passArray(numbers);
assembly:
passByValue(i, c, p, f, tc)
mov EAX, DWORD PTR [EBP - 16]
mov DL, BYTE PTR [EBP - 17]
mov ECX, DWORD PTR [EBP - 24]
movss XMM0, DWORD PTR [EBP - 28]
mov ESI, DWORD PTR [EBP - 40]
mov DWORD PTR [EBP - 48], ESI
mov ESI, DWORD PTR [EBP - 36]
mov DWORD PTR [EBP - 44], ESI
lea ESI, DWORD PTR [EBP - 48]
mov DWORD PTR [ESP], EAX
movsx EAX, DL
mov DWORD PTR [ESP + 4], EAX
mov DWORD PTR [ESP + 8], ECX
movss DWORD PTR [ESP + 12], XMM0
mov EAX, DWORD PTR [ESI]
mov DWORD PTR [ESP + 16], EAX
mov EAX, DWORD PTR [ESI + 4]
mov DWORD PTR [ESP + 20], EAX
call _Z11passByValueicPif9TestClass
passByReference(i, c, p, f, tc)
lea EAX, DWORD PTR [EBP - 16]
lea ECX, DWORD PTR [EBP - 17]
lea ESI, DWORD PTR [EBP - 24]
lea EDI, DWORD PTR [EBP - 28]
lea EBX, DWORD PTR [EBP - 40]
mov DWORD PTR [ESP], EAX
mov DWORD PTR [ESP + 4], ECX
mov DWORD PTR [ESP + 8], ESI
mov DWORD PTR [ESP + 12], EDI
mov DWORD PTR [ESP + 16], EBX
call _Z15passByReferenceRiRcRPiRfR9TestClass
passByPointer(&i, &c, &p, &f, &tc)
lea EAX, DWORD PTR [EBP - 16]
lea ECX, DWORD PTR [EBP - 17]
lea ESI, DWORD PTR [EBP - 24]
lea EDI, DWORD PTR [EBP - 28]
lea EBX, DWORD PTR [EBP - 40]
mov DWORD PTR [ESP], EAX
mov DWORD PTR [ESP + 4], ECX
mov DWORD PTR [ESP + 8], ESI
mov DWORD PTR [ESP + 12], EDI
mov DWORD PTR [ESP + 16], EBX
call _Z13passByPointerPiPcPS_PfP9TestClass
passArray(numbers)
mov EAX, .L_ZZ4mainE7numbers
mov DWORD PTR [EBP - 60], EAX
mov EAX, .L_ZZ4mainE7numbers+4
mov DWORD PTR [EBP - 56], EAX
mov EAX, .L_ZZ4mainE7numbers+8
mov DWORD PTR [EBP - 52], EAX
lea EAX, DWORD PTR [EBP - 60]
mov DWORD PTR [ESP], EAX
call _Z9passArrayPi
// parameter access
push EAX
mov EAX, DWORD PTR [ESP + 8]
mov DWORD PTR [ESP], EAX
pop EAX
I'm assuming I'm looking at the right assembly pertaining to the parameter passing because there are calls at the end of each!
But due to my very limited knowledge of assembly, I can't tell what's going on here. I learned about ccall convention, so I'm assuming something is going on that has to do with preserving the caller-saved registers and then pushing the parameters onto the stack. Because of this, I'm expecting to see things loaded into registers and "push" everywhere, but have no idea what's going on with the movs and leas. Also, I don't know what DWORD PTR is.
I've only learned about registers: eax, ebx, ecx, edx, esi, edi, esp and ebp, so seeing something like XMM0 or DL just confuses me as well. I guess it makes sense to see lea when it comes to passing by reference/pointer because they use memory addresses, but I can't actually tell what is going on. When it comes to passing by value, it seems like there are many instructions, so this could have to do with copying the value into registers. No idea when it comes to how arrays are passed and accessed as parameters.
If someone could explain the general idea of what's going on with each block of assembly to me, I would highly appreciate it.
Using CPU registers for passing arguments is faster than using memory, i.e. the stack. However, there is a limited number of registers in a CPU (especially in x86-compatible CPUs), so when a function has many parameters the stack is used instead of CPU registers. In your case there are 5 function arguments, so the compiler uses the stack for the arguments instead of registers.
In principle compilers can use push instructions to push arguments to stack before actual call to function, but many compilers (incl. gnu c++) use mov to push arguments to stack. This way is convenient as it does not change ESP register (top of the stack) in the part of code which calls the function.
In case of passByValue(i, c, p, f, tc) values of arguments are placed on the stack. You can see many mov instruction from a memory location to a register and from the register to an appropriate location of the stack. The reason for this is that x86 assembly forbids direct moving from one memory location to another (exception is movs which moves values from one array (or string as you wish) to another).
In case of passByReference(i, c, p, f, tc) you can see 5 lea instructions which copy the addresses of the arguments to CPU registers, and these values of the registers are moved onto the stack.
The case of passByPointer(&i, &c, &p, &f, &tc) is similar to passByValue(i, c, p, f, tc). Internally, on the assembly level, pass by reference uses pointers, while on the higher, C++, level a programmer does not need to explicitly use the & and * operators on references.
After the parameters are moved to the stack call is issued, which pushes instruction pointer EIP to stack before transferring the program execution to the subroutine. All moves of the parameters to the stack account for the coming EIP on stack after the call instruction.
There's too much in your example above to dissect all of them. Instead I'll just go over passByValue since that seems to be the most interesting. Afterwards, you should be able to figure out the rest.
First some important points to keep in mind while studying the disassembly so you don't get completely lost in the sea of code:
There are no instructions to directly copy data from one mem location to another mem location. eg. mov [ebp - 44], [ebp - 36] is not a legal instruction. An intermediate register is needed to store the data first and then subsequently copied into the memory destination.
Bracket operator [] in conjunction with a mov means to access data from a computed memory address. This is analogous to derefing a pointer in C/C++.
When you see lea x, [y] that usually means compute address of y and save into x. This is analogous to taking the address of a variable in C/C++.
Data and objects that needs to be copied but are too big to fit into a register are copied onto the stack in a piece-meal fashion. IOW, it'll copy a native machine word at a time until all the bytes representing the object/data is copied. Usually that means either 4 or 8 bytes on modern processors.
The compiler will typically interleave instructions together to keep the processor pipeline busy and to minimize stalls. Good for code efficiency but bad if you're trying to understand the disassembly.
With the above in mind here's the call to passByValue function rearranged a bit to make it more understandable:
.define arg1 esp
.define arg2 esp + 4
.define arg3 esp + 8
.define arg4 esp + 12
.define arg5.1 esp + 16
.define arg5.2 esp + 20
; copy first parameter
mov EAX, [EBP - 16]
mov [arg1], EAX
; copy second parameter
mov DL, [EBP - 17]
movsx EAX, DL
mov [arg2], EAX
; copy third
mov ECX, [EBP - 24]
mov [arg3], ECX
; copy fourth
movss XMM0, DWORD PTR [EBP - 28]
movss DWORD PTR [arg4], XMM0
; intermediate copy of TestClass?
mov ESI, [EBP - 40]
mov [EBP - 48], ESI
mov ESI, [EBP - 36]
mov [EBP - 44], ESI
;copy fifth
lea ESI, [EBP - 48]
mov EAX, [ESI]
mov [arg5.1], EAX
mov EAX, [ESI + 4]
mov [arg5.2], EAX
call passByValue(int, char, int*, float, TestClass)
The code above is unmangled and instruction mixing undone to make it clear what is actually happening but some still needs explaining. First, the char is signed and it is a single byte in size. The instructions here:
; copy second parameter
mov DL, [EBP - 17]
movsx EAX, DL
mov [arg2], EAX
reads a byte from [ebp - 17] (somewhere on the stack) and stores it into the lowest byte of edx (dl). That byte is then copied into eax using a sign-extending move. The full 32-bit value in eax is finally copied onto the stack where passByValue can access it. See register layout if you need more detail.
The fourth argument:
movss XMM0, DWORD PTR [EBP - 28]
movss DWORD PTR [arg4], XMM0
Uses the SSE movss instruction to copy the floating point value from stack into a xmm0 register. In brief, SSE instructions let you perform the same operation on multiple pieces of data simultaneously but here the compiler is using it as an intermediate storage for copying floating-point values on the stack.
The last argument:
; copy intermediate copy of TestClass?
mov ESI, [EBP - 40]
mov [EBP - 48], ESI
mov ESI, [EBP - 36]
mov [EBP - 44], ESI
corresponds to the TestClass. Apparently this class is 8-bytes in size located on the stack from [ebp - 40] to [ebp - 33]. The class here is being copied 4-bytes at a time since the object cannot fit into a single register.
Here's what the stack approximately looks like prior to call passByValue:
lower addr esp => int:arg1 <--.
esp + 4 char:arg2 |
esp + 8 int*:arg3 | copies passed
esp + 12 float:arg4 | to 'passByValue'
esp + 16 TestClass:arg5.1 |
esp + 20 TestClass:arg5.2 <--.
...
...
ebp - 48 TestClass:arg5.1 <-- intermediate copy of
ebp - 44 TestClass:arg5.2 <-- TestClass?
ebp - 40 original TestClass:arg5.1
ebp - 36 original TestClass:arg5.2
...
ebp - 28 original arg4 <--.
ebp - 24 original arg3 | original (local?) variables
ebp - 20 original arg2 | from calling function
ebp - 16 original arg1 <--.
...
higher addr ebp prev frame
What you're looking for are ABI calling conventions. Different platforms have different conventions. e.g. Windows on x86-64 has different conventions than Unix/Linux on x86-64.
http://www.agner.org/optimize/ has a calling-conventions doc detailing the various ones for x86 / amd64.
You can write code in ASM that does whatever you want, but if you want to call other functions, and be called by them, then pass parameters / return values according to the ABI.
It could be useful to make an internal-use-only helper function that doesn't use the standard ABI, but instead uses values in the registers that the calling function allocates them in. This is esp. likely if you're writing the main program in something other than ASM, with just a small part in ASM. Then the asm part only needs to care about being portable to systems with different ABIs for being called from the main program, not for its own internals.