Stack allocation is larger than expected in the disassembly - C++

I have the following code. I expected the stack space reserved in the main function to be 8 bytes on a 64-bit system, but in the disassembly I see something strange: it is 16 bytes. I am using https://godbolt.org/ with x86-64 GCC 9.3. So my question is: why?
#include <memory>
struct my_struct {
char a[10];
int b;
char c;
short d;
};
int main() {
struct my_struct* s = (struct my_struct*)malloc(sizeof(struct my_struct));
printf("%lu\n", sizeof(s));
return 0;
}
.LC0:
.string "%lu\n"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov edi, 20
call malloc
mov QWORD PTR [rbp-8], rax
mov esi, 8
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
leave
ret

Related

Why is the C++ function parameter stored 20 bytes off of the rbp in x86-64 when the method body only has one 4 byte variable?

Consider the following program, compiled using x86-64 GCC 12.2 with flags --std=c++17 -O0:
int square(int num, int num2) {
int foo = 37;
return num * num;
}
int main () {
return square(10, 5);
}
The resulting assembly using godbolt is:
square(int, int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
mov DWORD PTR [rbp-24], esi
mov DWORD PTR [rbp-4], 37
mov eax, DWORD PTR [rbp-20]
imul eax, eax
pop rbp
ret
main:
push rbp
mov rbp, rsp
mov esi, 5
mov edi, 10
call square(int, int)
nop
pop rbp
ret
I read about shadow spaces and it appears that in x64 there must be at minimum 32 bytes allocated: "32 bytes above the return address which the called function owns" ...
With that said, how is the offset -20 determined for the parameter num? If there's 32 bytes from rbp, wouldn't that be -24?
I noticed even if you add more local variables, it'll remain -20 until it gets pushed over to -36, but I cannot understand why. Thanks!

A temporary array is assigned but not a temporary primary value

I am amazed that this C++ code is compiled:
int main()
{
(int[10]){}[0]=15;
return 0;
}
The equivalent assembly is
main:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-48], 0
mov QWORD PTR [rbp-40], 0
mov QWORD PTR [rbp-32], 0
mov QWORD PTR [rbp-24], 0
mov QWORD PTR [rbp-16], 0
mov DWORD PTR [rbp-48], 15
mov eax, 0
pop rbp
ret
According to this code, an array is defined without having any name and then assigned.
Interestingly, when there is no array, the code does not compile:
int main()
{
(int){}=15; /* <Compilation failed> */
return 0;
}
1- Why is the first expression (maybe you call it assigning to an xvalue) legal in C++ for a temporary array but not the second one for a basic primitive type? Why is the language designed this way?
2- What is the application of such a temporary array?

Instructions after main in disassembly

I have a very simple program where my goal was to study how the compiler pushes the values to the different registers. But now the behaviour is much more complicated than expected, at least in debug-mode.
What is going on here?
#include <cstdio>
struct A
{
int B;
A() : B(0) { }
};
int main()
{
A a;
A b(a);
printf("%d", b.B);
printf("%d", a.B);
return 0;
}
This is what the disassembly looks like in Visual Studio:
int main()
{
01048210 push ebp
01048211 mov ebp,esp
01048213 sub esp,0D8h
01048219 push ebx
0104821A push esi
0104821B push edi
0104821C lea edi,[ebp-0D8h]
01048222 mov ecx,36h
01048227 mov eax,0CCCCCCCCh
0104822C rep stos dword ptr es:[edi]
A a;
0104822E lea ecx,[a]
01048231 call A::A (104678Ah)
A b(a);
01048236 mov eax,dword ptr [a]
01048239 mov dword ptr [b],eax
printf("%d", b.B);
0104823C mov eax,dword ptr [b]
0104823F push eax
01048240 push offset string "%d" (1093C6Ch)
01048245 call #ILT+3885(_printf) (1046F32h)
0104824A add esp,8
printf("%d", a.B);
0104824D mov eax,dword ptr [a]
01048250 push eax
01048251 push offset string "%d" (1093C6Ch)
01048256 call #ILT+3885(_printf) (1046F32h)
0104825B add esp,8
}
The first lines are explained in this answer; they are there to keep the frame pointer so that nice stack traces can be generated.
But the next lines are confusing: why subtract 216 (0D8h) from esp?
What are these lines after main, but before the first line of code A a; doing?
Edit: after setting the runtime checks to default the disassembly is much smaller:
int main()
{
00247110 push ebp
00247111 mov ebp,esp
00247113 sub esp,48h
00247116 push ebx
00247117 push esi
00247118 push edi
A a;
Edit 2: in Release mode (/Ox) a and b are completely optimized away and no memory is allocated on the stack at all:
int main()
{
A a;
A b(a);
printf("%d", b.B);
00B41000 push 0
00B41002 push 0B499A0h
00B41007 call printf (0B4102Dh)
printf("%d", a.B);
00B4100C push 0
00B4100E push 0B499A4h
00B41013 call printf (0B4102Dh)
00B41018 add esp,10h
return 0;
0127101B xor eax,eax
}
0127101D ret
Edit 3: this is the result using gcc -m32 -O3 -mpreferred-stack-boundary=2 (thanks to @CodyGray).
.LC0:
.string "%d"
Test():
push 0
push OFFSET FLAT:.LC0
call printf
pop eax
pop edx
push 0
push OFFSET FLAT:.LC0
call printf
pop ecx
pop eax
ret
00CC8223 sub esp,0D8h
Allocates the stack space for the local variables.
What are these lines after main, but before the first instruction doing?
What are you referring to?

GCC generated assembly

Why does the printf function cause the prologue to change?
C code_1:
#include <cstdio>
int main(){
int a = 11;
printf("%d", a);
}
GCC -m32 generated one:
.LC0:
.string "%d"
main:
lea ecx, [esp+4] // What's purpose of this three
and esp, -16 // lines?
push DWORD PTR [ecx-4] //
push ebp
mov ebp, esp
push ecx
sub esp, 20 // why sub 20?
mov DWORD PTR [ebp-12], 11
sub esp, 8
push DWORD PTR [ebp-12]
push OFFSET FLAT:.LC0
call printf
add esp, 16
mov eax, 0
mov ecx, DWORD PTR [ebp-4]
leave
lea esp, [ecx-4]
ret
C code_2:
#include <cstdio>
int main(){
int a = 11;
}
GCC -m32:
main:
push ebp
mov ebp, esp
sub esp, 16
mov DWORD PTR [ebp-4], 11
mov eax, 0
leave
ret
What is the purpose of the first three lines added in the first code?
Please, explain first assembly code, if you can.
EDIT:
64-bit mode:
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 11
mov eax, DWORD PTR [rbp-4]
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
leave
ret
The insight is that the compiler keeps the stack aligned at function calls.
The alignment is 16 bytes.
lea ecx, [esp+4] ;Save original ESP to ECX (ESP+4 actually)
and esp, -16 ;Align stack on 16 bytes (Lower esp)
push DWORD PTR [ecx-4] ;Push main return address (Stack at 16B + 4)
;My guess is to aid debugging tools that expect the RA
;to be at [ebp+04h]
push ebp
mov ebp, esp ;Prolog (Stack at 16B+8)
push ecx ;Save ECX (Original stack pointer) (Stack at 16B+12)
sub esp, 20 ;Reserve 20 bytes (Stack at 16B+0, ALIGNED AGAIN)
;4 for alignment + 1x16 for a variable (variable space is
;allocated in multiple of 16)
mov DWORD PTR [ebp-12], 11 ;a = 11
sub esp, 8 ;Stack at 16B+8 for later alignment
push DWORD PTR [ebp-12] ;a
push OFFSET FLAT:.LC0 ;"%d" (Stack at 16B)
call printf
add esp, 16 ;Remove args+pad from the stack (Stack at 16B)
mov eax, 0 ;Return 0
mov ecx, DWORD PTR [ebp-4] ;Restore ECX without the need to add to esp
leave ;Restore EBP
lea esp, [ecx-4] ;Restore original ESP
ret
I don't know why the compiler saves esp+4 in ecx instead of esp (esp+4 is the address of the first parameter of main).

Pass 64-bit int as output to 32-bit inline asm

#include <stdarg.h>
#include <stdint.h>
uint64_t test_func(int n)
{
return 9223372036854775805;
}
int main()
{
uint64_t r = test_func(10);
return 0;
}
converts to:
test_func(int):
push ebp
mov ebp, esp
mov eax, -3
mov edx, 2147483647
pop ebp
ret
main:
push ebp
mov ebp, esp
and esp, -8
sub esp, 24
mov DWORD PTR [esp], 10
call test_func(int)
mov DWORD PTR [esp+16], eax
mov DWORD PTR [esp+20], edx
mov eax, 0
leave
ret
You can see it uses 2 registers to store that 64-bit integer. However, in the C/C++ code, it is only ONE variable.
I tried to replicate this in inline-assembly but I had to do:
#include <stdarg.h>
#include <stdint.h>
int64_t test_func(int n)
{
return 9223372036854775805;
}
int main()
{
int32_t rlow = 0, rhigh = 0;
asm(
"push $10\n"
"\tcall %P2"
: "=a"(rlow), "=d"(rhigh)
: "i"(&test_func) : "memory");
return 0;
}
And the output is:
test_func(int):
push ebp
mov ebp, esp
mov eax, -3
mov edx, 2147483647
pop ebp
ret
main:
push ebp
mov ebp, esp
sub esp, 16
mov DWORD PTR [ebp-8], 0
mov DWORD PTR [ebp-4], 0
push $10
call test_func(int)
mov DWORD PTR [ebp-8], eax
mov DWORD PTR [ebp-4], edx
mov eax, 0
leave
ret
Now you can see I had to manually place the lower and higher order bits into two separate integers. Then I perform shifting to make it into one 64-bit integer.
Is there a way to automatically get it to place into a single 64-bit integer without me having to supply it two 32-bit integers and then shifting the bits?
You want the "A" constraint, which binds a 64-bit value to the eax/edx register pair. Something like:
uint64_t r;
asm("push $10\n"
"\tcall %P1"
: "=A"(r) : "i"(&test_func) : "memory");
should do the trick.