Why does a QWORD seem to use 12 bytes of memory? - c++

I am new to assembly language, and looking at GCC's unoptimized code-gen for a function that initializes int, long, and char variables.
int main()
{
int intVariable {};
long longVariable {};
char charVariable{};
return 0;
}
You can check the code on Godbolt:
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-4], 0
mov QWORD PTR [rbp-16], 0
mov BYTE PTR [rbp-17], 0
mov eax, 0
pop rbp
ret
I create an integer and it takes 4 bytes: [rbp-4], offsets 0 -> 4.
Creating a long seems to use 12 bytes: [rbp-16], offsets 4 -> 16.
Creating a char takes 1 byte: [rbp-17], offsets 16 -> 17.
Can somebody explain to me why the long data type seems to use 12 bytes?

Related

Why is the C++ function parameter stored 20 bytes off of the rbp in x86-64 when the method body only has one 4 byte variable?

Consider the following program, compiled using x86-64 GCC 12.2 with flags --std=c++17 -O0:
int square(int num, int num2) {
int foo = 37;
return num * num;
}
int main () {
return square(10, 5);
}
The resulting assembly using godbolt is:
square(int, int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
mov DWORD PTR [rbp-24], esi
mov DWORD PTR [rbp-4], 37
mov eax, DWORD PTR [rbp-20]
imul eax, eax
pop rbp
ret
main:
push rbp
mov rbp, rsp
mov esi, 5
mov edi, 10
call square(int, int)
nop
pop rbp
ret
I read about shadow spaces and it appears that in x64 there must be at minimum 32 bytes allocated: "32 bytes above the return address which the called function owns" ...
With that said, how is the offset -20 determined for the parameter num? If there's 32 bytes from rbp, wouldn't that be -24?
I noticed even if you add more local variables, it'll remain -20 until it gets pushed over to -36, but I cannot understand why. Thanks!

allocating memory on stack is bigger when disassembling code

I have the following code. I expected the size of the stack in the main function to be 8 bytes on a 64-bit system, but when disassembling I see a strange thing: it is 16 (sub rsp, 16). I am using https://godbolt.org/ with x86-64 GCC 9.3. So my question is: why?
#include <memory>
struct my_struct {
char a[10];
int b;
char c;
short d;
};
int main() {
struct my_struct* s = (struct my_struct*)malloc(sizeof(struct my_struct));
printf("%lu\n", sizeof(s));
return 0;
}
.LC0:
.string "%lu\n"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov edi, 20
call malloc
mov QWORD PTR [rbp-8], rax
mov esi, 8
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
leave
ret

A temporary array is assigned but not a temporary primary value

I am amazed that this C++ code compiles:
int main()
{
(int[10]){}[0]=15;
return 0;
}
The equivalent assembly is
main:
push rbp
mov rbp, rsp
mov QWORD PTR [rbp-48], 0
mov QWORD PTR [rbp-40], 0
mov QWORD PTR [rbp-32], 0
mov QWORD PTR [rbp-24], 0
mov QWORD PTR [rbp-16], 0
mov DWORD PTR [rbp-48], 15
mov eax, 0
pop rbp
ret
According to this code, an array is defined without having any name and then assigned.
Interestingly, when there is no array, the code does not compile:
int main()
{
(int){}=15; /* <Compilation failed> */
return 0;
}
1- Why is the first expression (perhaps you would call it assigning to an xvalue) legal in C++ for a temporary array, while the second one, for a basic primitive type, is not? Why is the language designed this way?
2- What is the application of such a temporary array?

GCC generated assembly

Why does calling the printf function cause the prologue to change?
C code_1:
#include <cstdio>
int main(){
int a = 11;
printf("%d", a);
}
GCC -m32 generated one:
.LC0:
.string "%d"
main:
lea ecx, [esp+4] // What's purpose of this three
and esp, -16 // lines?
push DWORD PTR [ecx-4] //
push ebp
mov ebp, esp
push ecx
sub esp, 20 // why sub 20?
mov DWORD PTR [ebp-12], 11
sub esp, 8
push DWORD PTR [ebp-12]
push OFFSET FLAT:.LC0
call printf
add esp, 16
mov eax, 0
mov ecx, DWORD PTR [ebp-4]
leave
lea esp, [ecx-4]
ret
C code_2:
#include <cstdio>
int main(){
int a = 11;
}
GCC -m32:
main:
push ebp
mov ebp, esp
sub esp, 16
mov DWORD PTR [ebp-4], 11
mov eax, 0
leave
ret
What is the purpose of the first three lines added in the first listing?
Please explain the first assembly listing, if you can.
EDIT:
64-bit mode:
.LC0:
.string "%d"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov DWORD PTR [rbp-4], 11
mov eax, DWORD PTR [rbp-4]
mov esi, eax
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
leave
ret
The key insight is that the compiler keeps the stack aligned at function calls.
The alignment is 16 bytes.
lea ecx, [esp+4] ;Save original ESP to ECX (ESP+4 actually)
and esp, -16 ;Align stack on 16 bytes (Lower esp)
push DWORD PTR [ecx-4] ;Push main return address (Stack at 16B + 4)
;My guess is to aid debugging tools that expect the RA
;to be at [ebp+04h]
push ebp
mov ebp, esp ;Prolog (Stack at 16B+8)
push ecx ;Save ECX (Original stack pointer) (Stack at 16B+12)
sub esp, 20 ;Reserve 20 bytes (Stack at 16B+0, ALIGNED AGAIN)
;4 for alignment + 1x16 for a variable (variable space is
;allocated in multiple of 16)
mov DWORD PTR [ebp-12], 11 ;a = 11
sub esp, 8 ;Stack at 16B+8 for later alignment
push DWORD PTR [ebp-12] ;a
push OFFSET FLAT:.LC0 ;"%d" (Stack at 16B)
call printf
add esp, 16 ;Remove args+pad from the stack (Stack at 16B)
mov eax, 0 ;Return 0
mov ecx, DWORD PTR [ebp-4] ;Restore ECX without the need to add to esp
leave ;Restore EBP
lea esp, [ecx-4] ;Restore original ESP
ret
I don't know why the compiler saves esp+4 in ecx instead of esp (esp+4 is the address of the first parameter of main).

Pass 64-bit int as output to 32-bit inline asm

#include <stdarg.h>
#include <stdint.h>
uint64_t test_func(int n)
{
return 9223372036854775805;
}
int main()
{
uint64_t r = test_func(10);
return 0;
}
converts to:
test_func(int):
push ebp
mov ebp, esp
mov eax, -3
mov edx, 2147483647
pop ebp
ret
main:
push ebp
mov ebp, esp
and esp, -8
sub esp, 24
mov DWORD PTR [esp], 10
call test_func(int)
mov DWORD PTR [esp+16], eax
mov DWORD PTR [esp+20], edx
mov eax, 0
leave
ret
You can see it uses 2 registers to store that 64-bit integer. However, in the C/C++ code, it is only ONE variable.
I tried to replicate this in inline-assembly but I had to do:
#include <stdarg.h>
#include <stdint.h>
int64_t test_func(int n)
{
return 9223372036854775805;
}
int main()
{
int32_t rlow = 0, rhigh = 0;
asm(
"push $10\n"
"\tcall %P2"
: "=a"(rlow), "=d"(rhigh)
: "i"(&test_func) : "memory");
return 0;
}
And the output is:
test_func(int):
push ebp
mov ebp, esp
mov eax, -3
mov edx, 2147483647
pop ebp
ret
main:
push ebp
mov ebp, esp
sub esp, 16
mov DWORD PTR [ebp-8], 0
mov DWORD PTR [ebp-4], 0
push $10
call test_func(int)
mov DWORD PTR [ebp-8], eax
mov DWORD PTR [ebp-4], edx
mov eax, 0
leave
ret
Now you can see I had to manually place the lower and higher order bits into two separate integers. Then I perform shifting to make it into one 64-bit integer.
Is there a way to automatically get it to place into a single 64-bit integer without me having to supply it two 32-bit integers and then shifting the bits?
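You want the "A" constraint, which, in 32-bit code, binds a 64-bit value to the edx:eax register pair (on x86-64 a 64-bit value fits in one register, so "A" would pick either rax or rdx instead). Something like: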
uint64_t r;
asm("push $10\n"
"\tcall %P1"
: "=A"(r) : "i"(&test_func) : "memory");
should do the trick.