Escape Analysis with std::vector in C++ - c++

I am wondering whether there is any optimization option in Clang or GCC for escape analysis on std::vector in C++.
In the example below, nothing requires the actual data of the std::vector<int> v to live on the heap, so the compiler could allocate v.data() on the stack for better performance.
Assuming that Clang/GCC does not do escape analysis, is there any particular motivation not to use escape analysis?
Assuming that Clang/GCC does do escape analysis, why are the values of v.data() and &x so different?
#include<cstdio>
#include<vector>
// Prints the address of a stack local and the address of the vector's element
// storage so the two can be compared.
int main() {
    int x = 0;
    std::vector<int> v(3, 0);
    // %p requires a void* argument; passing an int* directly is undefined
    // behaviour for printf, so the pointers are cast explicitly.
    std::printf("&x: %p\n", static_cast<void*>(&x));
    // std::printf("&v: %p\n", static_cast<void*>(&v)); // we intentionally don't print the pointer to v here.
    std::printf("v.data(): %p\n", static_cast<void*>(v.data()));
    return x + v[0]; // we want compiler not to optimize everything out
}
Expected result
&x: <some address>
v.data(): <some address> + 4
Actual result from Clang and GCC
[*****#localhost test]$ g++ test.cc -O3
[khanh#localhost test]$ ./a.out
&x: 0x7ffe2af5a59c
v.data(): 0xadde70
[*****#localhost test]$ clang++ test.cc -O3
[*****#localhost test]$ ./a.out
&x: 0x7fff66ce1ab4
v.data(): 0xfeee70
Thanks!

The Clang compiler does perform escape analysis.
Sample code: from @geza https://godbolt.org/z/N1GLUI
// Sums three ints by round-tripping them through a short-lived heap array.
// The array pointer never leaves this function: it is allocated, read back
// and freed entirely within the body.
int fn(int a, int b, int c) {
    int* const scratch = new int[3]{a, b, c};
    const int total = scratch[0] + scratch[1] + scratch[2];
    delete[] scratch;
    return total;
}
GCC
fn(int, int, int):
push r12
mov r12d, edx
push rbp
mov ebp, esi
push rbx
mov ebx, edi
mov edi, 12
call operator new[](unsigned long)
mov DWORD PTR [rax], ebx
add ebx, ebp
mov rdi, rax
mov DWORD PTR [rax+4], ebp
mov DWORD PTR [rax+8], r12d
add r12d, ebx
call operator delete[](void*)
mov eax, r12d
pop rbx
pop rbp
pop r12
ret
Clang
fn(int, int, int): # #fn(int, int, int)
lea eax, [rdi + rsi]
add eax, edx
ret

Related

Why is the C++ function parameter stored 20 bytes off of the rbp in x86-64 when the method body only has one 4 byte variable?

Consider the following program, compiled using x86-64 GCC 12.2 with flags --std=c++17 -O0:
// Returns num multiplied by itself. num2 is accepted but never used, and foo
// is written once and never read.
int square(int num, int num2) {
int foo = 37; // the only local variable in the body; its value is never used
return num * num;
}
// Calls square(10, 5) and returns its result from main.
int main () {
return square(10, 5);
}
The resulting assembly using godbolt is:
square(int, int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
mov DWORD PTR [rbp-24], esi
mov DWORD PTR [rbp-4], 37
mov eax, DWORD PTR [rbp-20]
imul eax, eax
pop rbp
ret
main:
push rbp
mov rbp, rsp
mov esi, 5
mov edi, 10
call square(int, int)
nop
pop rbp
ret
I read about shadow spaces and it appears that in x64 there must be at minimum 32 bytes allocated: "32 bytes above the return address which the called function owns" ...
With that said, how is the offset -20 determined for the parameter num? If there's 32 bytes from rbp, wouldn't that be -24?
I noticed even if you add more local variables, it'll remain -20 until it gets pushed over to -36, but I cannot understand why. Thanks!

How do I convert C++ function to assembly(x86_64)?

This is my .CPP file
#include <iostream>
using namespace std;
extern "C" void KeysAsm(int arr[], int n, int thetha, int rho);
// Keep this and call it from assembler
// Stores *yp into *xp, and the previous value of *xp plus 2 into *yp.
// Declared with C linkage so the symbol keeps its plain, unmangled name.
extern "C" void crim(int *xp, int *yp) {
    const int bumped = *xp + 2;  // taken before *xp is overwritten
    *xp = *yp;
    *yp = bumped;
}
// Translate this into Intel assembler
// For each pass in [0, n - 1): scans adjacent pairs in the first n - pass
// elements and hands every pair whose left element is greater than its right
// element to crim(); then adjusts arr[pass] by thetha / rho * 2 - 4 using
// integer arithmetic.
void KeysCpp(int arr[], int n, int thetha, int rho) {
    int pass = 0;
    while (pass < n - 1) {
        int k = 0;
        while (k < n - pass - 1) {
            int *left = &arr[k];
            int *right = &arr[k + 1];
            if (*left > *right) {
                crim(left, right);
            }
            ++k;
        }
        arr[pass] = arr[pass] + thetha / rho * 2 - 4;
        ++pass;
    }
}
// Function to print an array
void printArray(int arr[], int size) {
    // One element per line.
    int pos = 0;
    while (pos < size) {
        std::cout << arr[pos] << "\n";
        ++pos;
    }
    // Trailing newline; endl also flushes the stream.
    std::cout << std::endl;
}
// Driver: prints gamma1, runs the assembly routine KeysAsm on it and prints
// the result, then runs the C++ routine KeysCpp on gamma2 and prints that.
// Both routines are called with n = 10, thetha = 5, rho = 6.
// NOTE(review): printf is used here although the only header included above
// is <iostream>; confirm that <cstdio> is reachable.
int main() {
// Input for the assembly version (10 elements).
int gamma1[]{
9,
270,
88,
-12,
456,
80,
45,
123,
427,
999
};
// Input for the C++ version (10 elements). The values differ from gamma1, so
// the two printed results are not directly comparable.
int gamma2[]{
900,
312,
542,
234,
234,
1,
566,
123,
427,
111
};
printf("Array:\n");
printArray(gamma1, 10);
KeysAsm(gamma1, 10, 5, 6);
printf("Array Result Asm:\n");
printArray(gamma1, 10);
KeysCpp(gamma2, 10, 5, 6);
printf("Array Result Cpp:\n");
printArray(gamma2, 10);
}
What I want to do is, convert the KeysCpp function into assembly language and call it from this very .CPP file. I want to keep the crim function as it is in .CPP, while only converting the KeysCpp.
Here is my .ASM file
PUBLIC KeysAsm
includelib kernel32.lib
_DATA SEGMENT
EXTERN crim:PROC
_DATA ENDS
_TEXT SEGMENT
; KeysAsm: assembly counterpart of KeysCpp(arr, n, thetha, rho).
; Stack slots used below: [rbp-24] = arr, [rbp-28] = n, [rbp-32] = thetha,
; [rbp-36] = rho, [rbp-4] and [rbp-8] = the two loop counters (they are
; compared against n - 1 and n - [rbp-4] - 1 respectively).
; NOTE(review): the arguments are read from RDI/ESI/EDX/ECX, while code built
; with Visual Studio passes them in RCX/EDX/R8D/R9D.
; NOTE(review): the procedure contains no ret, [rbp-4] is never incremented,
; and [rbp-32] / [rbp-36] are stored but never read.
; NOTE(review): under the Windows x64 convention crim may use 32 bytes of
; shadow space above its return address; the only stack reserved here is the
; 40 bytes that also hold the slots listed above.
KeysAsm PROC
push rbp
mov rbp, rsp
sub rsp, 40
; Spill the four incoming arguments to their stack slots.
mov QWORD PTR [rbp-24], rdi
mov DWORD PTR [rbp-28], esi
mov DWORD PTR [rbp-32], edx
mov DWORD PTR [rbp-36], ecx
mov DWORD PTR [rbp-4], 0
jmp L3
L3:
; if [rbp-4] < n - 1 go to L7, otherwise fall through into L4
mov eax, DWORD PTR [rbp-28]
sub eax, 1
cmp DWORD PTR [rbp-4], eax
jl L7
L4:
; if [rbp-8] < n - [rbp-4] - 1 go to L6, otherwise fall through into L5
mov eax, DWORD PTR [rbp-28]
sub eax, DWORD PTR [rbp-4]
sub eax, 1
cmp DWORD PTR [rbp-8], eax
jl L6
L5:
add DWORD PTR [rbp-8], 1
L6:
; edx = arr[j], with j = [rbp-8]
mov eax, DWORD PTR [rbp-8]
cdqe
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rdx
mov edx, DWORD PTR [rax]
; eax = arr[j + 1]
mov eax, DWORD PTR [rbp-8]
cdqe
add rax, 1
lea rcx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rcx
mov eax, DWORD PTR [rax]
; skip the call when arr[j] <= arr[j + 1]
cmp edx, eax
jle L5
; rdx = &arr[j + 1]
mov eax, DWORD PTR [rbp-8]
cdqe
add rax, 1
lea rdx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rdx, rax
; rax = &arr[j]
mov eax, DWORD PTR [rbp-8]
cdqe
lea rcx, [0+rax*4]
mov rax, QWORD PTR [rbp-24]
add rax, rcx
; crim(&arr[j], &arr[j + 1]) with the arguments placed in RDI and RSI
mov rsi, rdx
mov rdi, rax
call crim
; execution falls through into L7 after the call returns
L7:
mov DWORD PTR [rbp-8], 0
jmp L4
KeysAsm ENDP
_TEXT ENDS
END
I am using Visual Studio 2017 to run this project.
I get the following error when I run this code.
Unhandled exception at 0x00007FF74B0E429C in MatrixMultiplication.exe: Stack cookie instrumentation code detected a stack-based buffer overrun. occurred
Your asm looks like it's expecting the x86-64 System V calling convention, with args in RDI, ESI, EDX, ECX. But you said you're compiling with Visual Studio, so the compiler-generated code will use the Windows x64 calling convention: RCX, EDX, R8D, R9D.
And when you call crim, it can use shadow space (32 bytes above its return address, which you didn't reserve space for).
It looks like you got this asm from un-optimized compiler output, probably from https://godbolt.org/z/ea4MPh81r using GCC for Linux, without using -mabi=ms to override the default -mabi=sysv when compiling for non-Windows targets. And then you modified it to make the loop infinite, with a jmp at the bottom instead of a ret? Maybe a different GCC version than 12.2 since the label numbers and code don't match exactly.
(The signs of it being un-optimized compiler output are all the reloads from [rbp-whatever], and redoing sign-extension with cdqe before using an int to index an array. A human would know the int must be non-negative. And the signs of it being GCC specifically are the numbered labels like .L1: etc. where you just removed the ., and heavily using RAX for as much as possible in a debug build. Choices like lea rdx, [0+rax*4] to copy-and-shift, and the exact syntax it used to print that instruction in Intel syntax, also match GCC.)
To compile a single function for Windows x64, isolate it and give the compiler only prototypes for anything it calls
extern "C" void crim(int *xp, int *yp); // prototype only
// For each i in [0, n - 1): passes every adjacent pair in the first n - i
// elements whose left element is greater than its right element to crim(),
// then adjusts arr[i] by thetha / rho * 2 - 4 (integer arithmetic).
void KeysCpp(int arr[], int n, int thetha, int rho){
for (int i = 0; i < n - 1; i++) {
for (int j = 0; j < n - i - 1; j++) {
if (arr[j] > arr[j + 1]) {
crim(&arr[j], &arr[j + 1]);
}
}
// thetha and rho are by-value parameters that are never modified here, so
// thetha / rho * 2 - 4 has the same value on every iteration.
arr[i]= arr[i] + thetha / rho * 2 - 4;
}
}
Then on Godbolt, use gcc -O3 -mabi=ms, or use MSVC which always targets Windows. https://godbolt.org/z/Mj5Gb54b5 shows both GCC and MSVC with optimization enabled.
KeysCpp(int*, int, int, int): ; demangled name
cmp edx, 1
jle .L11 ; "shrink wrap" optimization: early-out on n<=1 before saving regs
push r15 ; save some call-preserved regs
push r14
lea r14, [rcx+4] ; arr + 1
push r13
mov r13, rcx
Unfortunately GCC fails to hoist the thetha / rho * 2 - 4 loop-invariant, instead redoing idiv every time through the loop. Seems like an obvious optimization since those are local vars whose address hasn't been taken at all, and it keeps thetha (typo for theta?) and rho in registers. So MSVC is much more efficient here. Clang also misses this optimization.

GCC turning on O2 causes bug when 128-bit integer is subscripted

#include <cstdio>
__int128 idx;
// Increments the global idx and uses it as the subscript of a two-element
// array, then prints both elements.
int main() {
int a[2] = {1, 2};
idx++; // idx is a namespace-scope object, so it is zero before this and 1 after
a[idx] = 0; // with idx == 1 this writes a[1], which is inside the array
printf("%d %d", a[0], a[1]);
}
After turning on -O2, a[idx] = 0 is not executed.
I guess it shouldn't be undefined behavior.
Is this a bug in the compiler?
https://godbolt.org/z/qqccd9oEj
Looking at the compiler output for gcc-12.1 -std=c++20 -O2 -W -Wall
.LC0:
.string "%d %d"
main:
sub rsp, 8
mov edx, 2
add QWORD PTR idx[rip], 1
mov esi, 1
adc QWORD PTR idx[rip+8], 0
mov edi, OFFSET FLAT:.LC0
xor eax, eax
call printf
xor eax, eax
add rsp, 8
ret
idx:
.zero 16
The problem is mov edx, 2. That is just wrong, it should read a[1] and optimize that to 0 not 2.
clang gets it right but still generates horrible code. idx should get optimized out.
You should file that as compiler bug.

allocating memory on stack is bigger when disassembling code

I have the following code. I expected the size of the stack in the main function to be 8 bytes on a 64-bit system, but when disassembling I see something strange: it is 16. I am using https://godbolt.org/ with x86-64 GCC 9.3. So my question is: why?
#include <memory>
// Plain data struct; the only thing the program does with it is take its
// sizeof for the malloc call.
struct my_struct {
char a[10];
int b;
char c;
short d;
};
// Allocates one my_struct with malloc and prints sizeof(s), i.e. the size of
// the pointer variable s itself, not of the struct it points to.
// NOTE(review): malloc and printf are used although only <memory> is included
// above, and the allocation is never freed.
int main() {
struct my_struct* s = (struct my_struct*)malloc(sizeof(struct my_struct));
printf("%lu\n", sizeof(s)); // NOTE(review): sizeof yields size_t, for which %zu is the matching conversion
return 0;
}
.LC0:
.string "%lu\n"
main:
push rbp
mov rbp, rsp
sub rsp, 16
mov edi, 20
call malloc
mov QWORD PTR [rbp-8], rax
mov esi, 8
mov edi, OFFSET FLAT:.LC0
mov eax, 0
call printf
mov eax, 0
leave
ret

Instructions after main in disassembly

I have a very simple program where my goal was to study how the compiler pushes the values to the different registers. But now the behaviour is much more complicated than expected, at least in debug-mode.
What is going on here?
#include <cstdio>
// Struct holding a single int; the default constructor sets B to 0.
struct A
{
int B;
A() : B(0) { }
};
// Default-constructs a, copy-constructs b from a, and prints B of each.
int main()
{
A a;
A b(a); // A declares no copy constructor, so the implicitly generated one is used
printf("%d", b.B);
printf("%d", a.B);
return 0;
}
This is what the disassembly looks like in Visual Studio:
int main()
{
01048210 push ebp
01048211 mov ebp,esp
01048213 sub esp,0D8h
01048219 push ebx
0104821A push esi
0104821B push edi
0104821C lea edi,[ebp-0D8h]
01048222 mov ecx,36h
01048227 mov eax,0CCCCCCCCh
0104822C rep stos dword ptr es:[edi]
A a;
0104822E lea ecx,[a]
01048231 call A::A (104678Ah)
A b(a);
01048236 mov eax,dword ptr [a]
01048239 mov dword ptr [b],eax
printf("%d", b.B);
0104823C mov eax,dword ptr [b]
0104823F push eax
01048240 push offset string "%d" (1093C6Ch)
01048245 call #ILT+3885(_printf) (1046F32h)
0104824A add esp,8
printf("%d", a.B);
0104824D mov eax,dword ptr [a]
01048250 push eax
01048251 push offset string "%d" (1093C6Ch)
01048256 call #ILT+3885(_printf) (1046F32h)
0104825B add esp,8
}
The first lines are explained in this answer, they are there to keep the frame pointer so that nice stack traces can be generated.
But the next lines are confusing: why subtract 216 (0D8h) from esp?
What are these lines after main, but before the first line of code A a; doing?
Edit: after setting the runtime checks to default the disassembly is much smaller:
int main()
{
00247110 push ebp
00247111 mov ebp,esp
00247113 sub esp,48h
00247116 push ebx
00247117 push esi
00247118 push edi
A a;
Edit 2: in Release mode (/Ox) a and b are completely optimized away and no memory is allocated on the stack at all:
int main()
{
A a;
A b(a);
printf("%d", b.B);
00B41000 push 0
00B41002 push 0B499A0h
00B41007 call printf (0B4102Dh)
printf("%d", a.B);
00B4100C push 0
00B4100E push 0B499A4h
00B41013 call printf (0B4102Dh)
00B41018 add esp,10h
return 0;
0127101B xor eax,eax
}
0127101D ret
Edit 3: this is the result using gcc -m32 -O3 -mpreferred-stack-boundary=2 (thanks to @CodyGray).
.LC0:
.string "%d"
Test():
push 0
push OFFSET FLAT:.LC0
call printf
pop eax
pop edx
push 0
push OFFSET FLAT:.LC0
call printf
pop ecx
pop eax
ret
00CC8223 sub esp,0D8h
Allocates the stack space for the local variables.
What are these lines after main, but before the first instruction doing?
What are you referring to?