Related
I have some C++ code in which I must be sure that a specific destructor is called before exiting and I was wondering whether or not it was called before a [[noreturn]] function.
So I wrote this simple dummy example
#include <cstdio>
#include <cstdlib>
// Owns a 4-char heap buffer: allocated in the constructor, released in the destructor.
class A {
// Start of the heap buffer. Nothing in this class ever writes to its contents.
char *i;
public:
// Allocates four chars with new[]; the elements are left uninitialized.
A() : i{new char[4]} {}
// Frees the buffer. This is the destructor the question is about.
~A() { delete[] i; }
// Passes the buffer to puts(), which expects a NUL-terminated string.
// NOTE(review): no visible code stores a terminator in the buffer — confirm before relying on hello().
void hello() { puts(i); }
};
// Demonstration: constructs a local A, then terminates the process with exit(1).
// The -O0 listing that follows contains calls to A's constructor and to exit only;
// no destructor call for `b` is emitted, and nothing follows the call to exit.
int func()
{
A b;
exit(1);
b.hello(); // Not reached
}
I compiled with g++ /tmp/l.cc -S -O0 and I got this assembly
.file "l.cc"
.text
#-----------------------------------------------------------------------
# A::A()  -- emitted as _ZN1AC2Ev; _ZN1AC1Ev is made an alias at the end.
# ABI:   System V AMD64, AT&T syntax (unoptimized compiler output)
# In:    %rdi = this
# Does:  this->i = operator new[](4)          (_Znam)
# Stack: 16 bytes of locals; `this` is kept at -8(%rbp)
# Note:  ELF type keywords are spelled with `@` (@progbits, @function);
#        `#` starts a comment in x86 GAS and would hide the operand.
#-----------------------------------------------------------------------
        .section .text._ZN1AC2Ev,"axG",@progbits,_ZN1AC5Ev,comdat
        .align 2
        .weak _ZN1AC2Ev
        .type _ZN1AC2Ev, @function
_ZN1AC2Ev:
.LFB18:
        .cfi_startproc
        pushq %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq %rsp, %rbp
        .cfi_def_cfa_register 6
        subq $16, %rsp
        movq %rdi, -8(%rbp)             # spill `this`
        movl $4, %edi                   # requested size: 4 bytes
        call _Znam                      # rax = operator new[](4)
        movq %rax, %rdx
        movq -8(%rbp), %rax             # rax = this
        movq %rdx, (%rax)               # this->i = new buffer (member at offset 0)
        nop
        leave
        .cfi_def_cfa 7, 8
        ret
        .cfi_endproc
.LFE18:
        .size _ZN1AC2Ev, .-_ZN1AC2Ev
        .weak _ZN1AC1Ev
        .set _ZN1AC1Ev,_ZN1AC2Ev
#-----------------------------------------------------------------------
# int func()
# ABI:   System V AMD64, AT&T syntax (unoptimized compiler output)
# Stack: 16 bytes of locals; the A object `b` lives at -8(%rbp)
# Flow:  A::A(&b); exit(1);
#        Nothing is emitted after the call to exit: there is no call to
#        A's destructor and no epilogue/ret.
# Note:  ELF type keywords are spelled with `@` (@function, @progbits);
#        `#` starts a comment in x86 GAS and would hide the operand.
#-----------------------------------------------------------------------
        .text
        .globl func
        .type func, @function
func:
.LFB24:
        .cfi_startproc
        pushq %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq %rsp, %rbp
        .cfi_def_cfa_register 6
        subq $16, %rsp
        leaq -8(%rbp), %rax             # rax = &b
        movq %rax, %rdi
        call _ZN1AC1Ev                  # A::A(&b)
        movl $1, %edi
        call exit                       # exit(1); last instruction of the function
        .cfi_endproc
.LFE24:
        .size func, .-func
        .ident "GCC: (GNU) 12.2.1 20221121 (Red Hat 12.2.1-4)"
        .section .note.GNU-stack,"",@progbits
There was clearly no call to the destructor.
In this stupid case it doesn't matter much, but what if I had to close a file before exiting?
Apart from the fact that terminating a program with exit() is generally considered bad practice, you could try the following:
// Variant in which `b` lives in an inner block: its destructor runs at the
// block's closing brace, i.e. before exit(1) is reached.
int func()
{
{
A b;
/* ... */
} // Leaving scope => destructing b
exit(1);
}
PS: Assuming that you aren't writing a driver, most kernels (including Microsoft Windows NT, Unix (e.g. BSD), XNU (macOS) and Linux) automatically deallocate any allocated memory as the program exits.
The output of the following program is
Global new
The code doesn't seem to call operator delete. Invoking ::operator delete directly does call the function, but using the regular delete operator doesn't.
I assume it's related to the compiler optimizing away the delete call. I tried to test that by placing all sorts of code after the delete call, including resetting the a pointer to a different expression. I would assume that would make the compiler insert the delete, because otherwise we have a leak. Still - the same result.
So, I assume this is the compiler eliminating an indeed unnecessary call to delete by rather sophisticated analysis. But, I'd like to be sure. I attempted reading the output assembly from g++, but I wasn't able to understand it.
Please explain this phenomenon.
#include <iostream>
using namespace std;
// Replacement for the global single-object allocation function:
// prints "Global new" and forwards the request to malloc().
// NOTE(review): the malloc() result is returned unchecked — confirm that a
// null return is acceptable for callers of this operator new.
void* operator new(size_t size) {
cout << "Global new\n";
return malloc(size);
}
// Replacement for the unsized global deallocation function (mangled _ZdlPv):
// prints "Global delete" and forwards to free().
// NOTE: only this one-argument form is replaced. In the listing below, f()
// calls _ZdlPvy — the two-argument (pointer, size) form — which is only
// declared as an external symbol there and is not defined in this file.
void operator delete(void* p) {
cout << "Global delete\n";
free(p);
}
// Allocates one int with a new-expression and immediately deletes it.
// In the listing below this becomes a call to _Znwy followed by a
// null-checked call to _ZdlPvy with the size 4 as second argument.
void f() {
int* x = new int;
delete x;
}
// Entry point: runs f() once and returns 0. argc and argv are not used.
int main(int argc, char** argv) {
f();
return 0;
}
I use MinGW-w64 on Windows 7 with g++ 9.3.0, compiling with -std=c++17.
EDIT:
People have responded it does produce the expected output on their machines. So possibly a compiler bug on my machine?
Output assembly if anyone's interested:
.file "main.cpp"
.text
.lcomm _ZStL8__ioinit,1,1
# Text printed by the replaced operator new ("\12" is the octal escape for newline).
.section .rdata,"dr"
.LC0:
.ascii "Global new\12\0"
.text
# operator new(size), mangled _Znwy.
# ABI: Microsoft x64, AT&T syntax.  In: %rcx = size.  Out: %rax = malloc() result.
.globl _Znwy
.def _Znwy; .scl 2; .type 32; .endef
.seh_proc _Znwy
_Znwy:
.LFB1882:
pushq %rbp
.seh_pushreg %rbp
movq %rsp, %rbp
.seh_setframe %rbp, 0
# 32 bytes of shadow space for the two calls below.
subq $32, %rsp
.seh_stackalloc 32
.seh_endprologue
# Save size in the caller-provided slot above the return address.
movq %rcx, 16(%rbp)
# operator<<(cout, "Global new\n"): %rcx = &cout (via .refptr), %rdx = string.
leaq .LC0(%rip), %rdx
movq .refptr._ZSt4cout(%rip), %rcx
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
# malloc(size); its result stays in %rax as this function's return value.
movq 16(%rbp), %rcx
call malloc
addq $32, %rsp
popq %rbp
ret
.seh_endproc
# Text printed by the replaced operator delete ("\12" is the octal escape for newline).
.section .rdata,"dr"
.LC1:
.ascii "Global delete\12\0"
.text
# operator delete(void*), mangled _ZdlPv — the unsized form.
# ABI: Microsoft x64, AT&T syntax.  In: %rcx = pointer to release.
# Nothing in this listing calls _ZdlPv; f() calls _ZdlPvy instead.
.globl _ZdlPv
.def _ZdlPv; .scl 2; .type 32; .endef
.seh_proc _ZdlPv
_ZdlPv:
.LFB1883:
pushq %rbp
.seh_pushreg %rbp
movq %rsp, %rbp
.seh_setframe %rbp, 0
# 32 bytes of shadow space for the two calls below.
subq $32, %rsp
.seh_stackalloc 32
.seh_endprologue
# Save the pointer in the caller-provided slot above the return address.
movq %rcx, 16(%rbp)
# operator<<(cout, "Global delete\n"): %rcx = &cout (via .refptr), %rdx = string.
leaq .LC1(%rip), %rdx
movq .refptr._ZSt4cout(%rip), %rcx
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
# free(pointer)
movq 16(%rbp), %rcx
call free
nop
addq $32, %rsp
popq %rbp
ret
.seh_endproc
# void f(), mangled _Z1fv.
# ABI: Microsoft x64, AT&T syntax.  Local x is kept at -8(%rbp).
.globl _Z1fv
.def _Z1fv; .scl 2; .type 32; .endef
.seh_proc _Z1fv
_Z1fv:
.LFB1884:
pushq %rbp
.seh_pushreg %rbp
movq %rsp, %rbp
.seh_setframe %rbp, 0
subq $48, %rsp
.seh_stackalloc 48
.seh_endprologue
# x = operator new(4) — this is the _Znwy defined in this file.
movl $4, %ecx
call _Znwy
movq %rax, -8(%rbp)
# delete x: skipped when x is null.
movq -8(%rbp), %rax
testq %rax, %rax
je .L6
# Calls _ZdlPvy with (%rcx = x, %rdx = 4): the two-argument, sized form of
# operator delete. That symbol is external to this file; the _ZdlPv defined
# here (the one that prints "Global delete") is not the call target.
movl $4, %edx
movq %rax, %rcx
call _ZdlPvy
.L6:
nop
addq $48, %rsp
popq %rbp
ret
.seh_endproc
# int main(int argc, char** argv)
# ABI: Microsoft x64, AT&T syntax.  In: %ecx = argc, %rdx = argv.  Out: %eax = 0.
.def __main; .scl 2; .type 32; .endef
.globl main
.def main; .scl 2; .type 32; .endef
.seh_proc main
main:
.LFB1885:
pushq %rbp
.seh_pushreg %rbp
movq %rsp, %rbp
.seh_setframe %rbp, 0
# 32 bytes of shadow space for the calls below.
subq $32, %rsp
.seh_stackalloc 32
.seh_endprologue
# Store argc/argv in the caller-provided slots; they are not read again.
movl %ecx, 16(%rbp)
movq %rdx, 24(%rbp)
# __main is called before any user code; it is external to this listing.
call __main
# f()
call _Z1fv
# return 0
movl $0, %eax
addq $32, %rsp
popq %rbp
ret
.seh_endproc
# Exit-time cleanup helper: calls _ZNSt8ios_base4InitD1Ev (the destructor of
# std::ios_base::Init) on the file-local object _ZStL8__ioinit.
# Its address is handed to atexit by the static-initialization routine below.
.def __tcf_0; .scl 3; .type 32; .endef
.seh_proc __tcf_0
__tcf_0:
.LFB2377:
pushq %rbp
.seh_pushreg %rbp
movq %rsp, %rbp
.seh_setframe %rbp, 0
subq $32, %rsp
.seh_stackalloc 32
.seh_endprologue
# %rcx = &__ioinit (the object being destroyed)
leaq _ZStL8__ioinit(%rip), %rcx
call _ZNSt8ios_base4InitD1Ev
nop
addq $32, %rsp
popq %rbp
ret
.seh_endproc
# Static-initialization routine for this translation unit.
# In: %ecx, %edx = two int arguments. The body runs only when they are
# 1 and 65535 respectively; otherwise the function returns immediately.
.def _Z41__static_initialization_and_destruction_0ii; .scl 3; .type 32; .endef
.seh_proc _Z41__static_initialization_and_destruction_0ii
_Z41__static_initialization_and_destruction_0ii:
.LFB2376:
pushq %rbp
.seh_pushreg %rbp
movq %rsp, %rbp
.seh_setframe %rbp, 0
subq $32, %rsp
.seh_stackalloc 32
.seh_endprologue
# Spill both arguments into the caller-provided slots, then test them.
movl %ecx, 16(%rbp)
movl %edx, 24(%rbp)
cmpl $1, 16(%rbp)
jne .L12
cmpl $65535, 24(%rbp)
jne .L12
# Construct the std::ios_base::Init object __ioinit ...
leaq _ZStL8__ioinit(%rip), %rcx
call _ZNSt8ios_base4InitC1Ev
# ... and register __tcf_0 with atexit so it is destroyed at program exit.
leaq __tcf_0(%rip), %rcx
call atexit
.L12:
nop
addq $32, %rsp
popq %rbp
ret
.seh_endproc
# Start-up stub: calls the static-initialization routine with (1, 65535),
# the argument pair that routine checks for before doing its work.
.def _GLOBAL__sub_I__Znwy; .scl 3; .type 32; .endef
.seh_proc _GLOBAL__sub_I__Znwy
_GLOBAL__sub_I__Znwy:
.LFB2378:
pushq %rbp
.seh_pushreg %rbp
movq %rsp, %rbp
.seh_setframe %rbp, 0
subq $32, %rsp
.seh_stackalloc 32
.seh_endprologue
# Microsoft x64 argument order: %ecx = first argument (1), %edx = second (65535).
movl $65535, %edx
movl $1, %ecx
call _Z41__static_initialization_and_destruction_0ii
nop
addq $32, %rsp
popq %rbp
ret
.seh_endproc
# The stub's address is placed in the .ctors section.
.section .ctors,"w"
.align 8
.quad _GLOBAL__sub_I__Znwy
.ident "GCC: (Rev1, Built by MSYS2 project) 9.3.0"
.def _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc; .scl 2; .type 32; .endef
.def malloc; .scl 2; .type 32; .endef
.def free; .scl 2; .type 32; .endef
.def _ZdlPvy; .scl 2; .type 32; .endef
.def _ZNSt8ios_base4InitD1Ev; .scl 2; .type 32; .endef
.def _ZNSt8ios_base4InitC1Ev; .scl 2; .type 32; .endef
.def atexit; .scl 2; .type 32; .endef
# 8-byte slot holding the address of _ZSt4cout (std::cout). _Znwy and _ZdlPv
# load it with `movq .refptr._ZSt4cout(%rip), %rcx` to obtain &cout.
.section .rdata$.refptr._ZSt4cout, "dr"
.globl .refptr._ZSt4cout
.linkonce discard
.refptr._ZSt4cout:
.quad _ZSt4cout
I've created the following program:
// Accumulates characters into a fixed 1024-byte token buffer.
class CLexer
{
public:
// Resets the cursor to 0. iCursorPos also has a default member initializer
// of 0, which is why the listing below stores 0 to offset 1024 twice.
CLexer( ) {
iCursorPos = 0;
}
// Appends `character` at the cursor and advances it; a space or newline
// instead writes a terminating '\0' at the cursor and resets the cursor to 0.
// NOTE: iCursorPos is never compared against the buffer size (1024) before
// it is used as an index into m_strToken.
void putCharacter(char character)
{
if(character != ' ' && character != '\n') {
m_strToken[iCursorPos] = character;
iCursorPos++;
}
else {
m_strToken[iCursorPos] = '\0';
iCursorPos = 0;
}
}
private:
// Token storage; occupies offsets 0..1023 of the object.
char m_strToken[1024];
// Index of the next write into m_strToken; located at offset 1024 in the listing below.
int iCursorPos = 0;
};
// Entry point: constructs a CLexer on the stack, feeds it the single
// character 'm', and returns 0. argc and argv are not used.
int main(int argc, char * argv[]) {
CLexer lex;
lex.putCharacter('m');
return 0;
}
Assembler output produced by the compiler:
.file "main.cpp"
#-----------------------------------------------------------------------
# CLexer::CLexer()  -- emitted as _ZN6CLexerC2Ev; _ZN6CLexerC1Ev is an alias.
# ABI:   System V AMD64, AT&T syntax (unoptimized compiler output)
# In:    %rdi = this
# Does:  stores 0 to the int at this+1024 (iCursorPos) twice: once for the
#        default member initializer, once for the assignment in the body.
# Stack: leaf function; `this` is spilled to -8(%rbp) without moving %rsp.
# Note:  ELF type keywords are spelled with `@` (@progbits, @function);
#        `#` starts a comment in x86 GAS and would hide the operand.
#-----------------------------------------------------------------------
        .section .text._ZN6CLexerC2Ev,"axG",@progbits,_ZN6CLexerC5Ev,comdat
        .align 2
        .weak _ZN6CLexerC2Ev
        .type _ZN6CLexerC2Ev, @function
_ZN6CLexerC2Ev:
.LFB1:
        .cfi_startproc
        pushq %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq %rsp, %rbp
        .cfi_def_cfa_register 6
        movq %rdi, -8(%rbp)             # spill `this`
        movq -8(%rbp), %rax
        movl $0, 1024(%rax)             # this->iCursorPos = 0
        movq -8(%rbp), %rax
        movl $0, 1024(%rax)             # this->iCursorPos = 0 (second store)
        nop
        popq %rbp
        .cfi_def_cfa 7, 8
        ret
        .cfi_endproc
.LFE1:
        .size _ZN6CLexerC2Ev, .-_ZN6CLexerC2Ev
        .weak _ZN6CLexerC1Ev
        .set _ZN6CLexerC1Ev,_ZN6CLexerC2Ev
#-----------------------------------------------------------------------
# void CLexer::putCharacter(char character)
# ABI:   System V AMD64, AT&T syntax (unoptimized compiler output)
# In:    %rdi = this, %sil = character
# Stack: leaf function; this at -8(%rbp), character at -12(%rbp)
# Layout: m_strToken at this+0, iCursorPos (int) at this+1024
# Note:  iCursorPos is sign-extended and used as an index with no range
#        check on either path.
#        ELF type keywords are spelled with `@` (@progbits, @function);
#        `#` starts a comment in x86 GAS and would hide the operand.
#-----------------------------------------------------------------------
        .section .text._ZN6CLexer12putCharacterEc,"axG",@progbits,_ZN6CLexer12putCharacterEc,comdat
        .align 2
        .weak _ZN6CLexer12putCharacterEc
        .type _ZN6CLexer12putCharacterEc, @function
_ZN6CLexer12putCharacterEc:
.LFB3:
        .cfi_startproc
        pushq %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq %rsp, %rbp
        .cfi_def_cfa_register 6
        movq %rdi, -8(%rbp)             # spill `this`
        movl %esi, %eax
        movb %al, -12(%rbp)             # spill `character`
        cmpb $32, -12(%rbp)             # character == ' ' ?
        je .L3
        cmpb $10, -12(%rbp)             # character == '\n' ?
        je .L3

        # m_strToken[iCursorPos] = character
        movq -8(%rbp), %rax
        movl 1024(%rax), %eax           # eax = iCursorPos
        movq -8(%rbp), %rdx             # rdx = this (= &m_strToken[0])
        cltq                            # sign-extend the index to 64 bits
        movzbl -12(%rbp), %ecx
        movb %cl, (%rdx,%rax)

        # iCursorPos++
        movq -8(%rbp), %rax
        movl 1024(%rax), %eax
        leal 1(%rax), %edx
        movq -8(%rbp), %rax
        movl %edx, 1024(%rax)
        jmp .L4
.L3:
        # m_strToken[iCursorPos] = '\0'
        movq -8(%rbp), %rax
        movl 1024(%rax), %eax
        movq -8(%rbp), %rdx
        cltq
        movb $0, (%rdx,%rax)

        # iCursorPos = 0
        movq -8(%rbp), %rax
        movl $0, 1024(%rax)
.L4:
        nop
        popq %rbp
        .cfi_def_cfa 7, 8
        ret
        .cfi_endproc
.LFE3:
        .size _ZN6CLexer12putCharacterEc, .-_ZN6CLexer12putCharacterEc
#-----------------------------------------------------------------------
# int main(int argc, char *argv[])
# ABI:   System V AMD64, AT&T syntax (unoptimized compiler output)
# In:    %edi = argc, %rsi = argv
# Out:   %eax = 0
# Stack: 1056 bytes of locals; argv at -1056(%rbp), argc at -1044(%rbp),
#        the CLexer object `lex` starts at -1040(%rbp)
# Note:  ELF type keywords are spelled with `@` (@function, @progbits);
#        `#` starts a comment in x86 GAS and would hide the operand.
#-----------------------------------------------------------------------
        .text
        .globl main
        .type main, @function
main:
.LFB4:
        .cfi_startproc
        pushq %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq %rsp, %rbp
        .cfi_def_cfa_register 6
        subq $1056, %rsp
        movl %edi, -1044(%rbp)          # spill argc
        movq %rsi, -1056(%rbp)          # spill argv
        leaq -1040(%rbp), %rax
        movq %rax, %rdi
        call _ZN6CLexerC1Ev             # CLexer::CLexer(&lex)
        leaq -1040(%rbp), %rax
        movl $109, %esi                 # 109 = 'm'
        movq %rax, %rdi
        call _ZN6CLexer12putCharacterEc # lex.putCharacter('m')
        movl $0, %eax                   # return 0
        leave
        .cfi_def_cfa 7, 8
        ret
        .cfi_endproc
.LFE4:
        .size main, .-main
        .ident "GCC: (GNU) 6.1.1 20160501"
        .section .note.GNU-stack,"",@progbits
When the program is executed, the first call to putCharacter, with 'm' as the argument, segfaults.
Running it under gdb gives the following output:
Program received signal SIGSEGV, Segmentation fault.
0x00000000004018e5 in CLexer::putCharacter (this=0x7fffffffe370,
character=109 'm') at src/main.cpp:60
60 m_strToken[iCursorPos] = character;
I've managed to fix this error by moving the iCursorPos variable above m_strToken in the class declaration, but I don't think that is the proper way to fix this issue.
I'm using g++ (GCC) 6.1.1 20160501 on the latest, fully updated version of Arch Linux x86_64.
if(character != ' ' && character != '\n') {
m_strToken[iCursorPos] = character;
iCursorPos++;
}
You don't check that iCursorPos < 1024 here. So you write past the end of the buffer, into iCursorPos itself.
The next access m_strToken[iCursorPos] = character; probably writes way past the end of the buffer, and you get a segfault (luckily).
Your "fix" still isn't correct, since you corrupt other parts of your object's memory regardless.
I have a real world program that is similar to this one, which I'll call test.cpp:
#include <stdlib.h>
extern void f(size_t i);
// Calls f(i) starting at i = x-2 and counting down.
// NOTE: i is a size_t, which is unsigned, so the condition `i>=0` is always
// true and the loop has no exit; the listing below accordingly contains no
// comparison, only an unconditional jump back to the loop head.
// NOTE: the function is declared to return int but has no return statement.
int sample(size_t x)
{
size_t a = x;
size_t i;
for (i = a-2; i>=0; i--) {
f(i);
}
}
My problem is that the loop over i never terminates.
If I run the following command:
g++ -S -o test.s test.cpp
I get the following assembly sequence:
.file "test.cpp"
#-----------------------------------------------------------------------
# int sample(size_t x)  -- mangled _Z6samplem
# ABI:   System V AMD64, AT&T syntax (unoptimized compiler output)
# In:    %rdi = x
# Stack: 32 bytes of locals; x at -24(%rbp), a at -8(%rbp), i at -16(%rbp)
# Flow:  i = a - 2; for (;;) { f(i); i--; }
#        The loop at .L2 has no compare and no conditional branch, and the
#        function has no epilogue or ret: control never leaves the loop.
# Note:  ELF type keywords are spelled with `@` (@function, @progbits);
#        `#` starts a comment in x86 GAS and would hide the operand.
#-----------------------------------------------------------------------
        .text
        .globl _Z6samplem
        .type _Z6samplem, @function
_Z6samplem:
.LFB0:
        .cfi_startproc
        pushq %rbp
        .cfi_def_cfa_offset 16
        .cfi_offset 6, -16
        movq %rsp, %rbp
        .cfi_def_cfa_register 6
        subq $32, %rsp
        movq %rdi, -24(%rbp)            # spill x
        movq -24(%rbp), %rax
        movq %rax, -8(%rbp)             # a = x
        movq -8(%rbp), %rax
        subq $2, %rax
        movq %rax, -16(%rbp)            # i = a - 2
.L2:
        movq -16(%rbp), %rax
        movq %rax, %rdi
        call _Z1fm                      # f(i)
        subq $1, -16(%rbp)              # i--
        jmp .L2                         # unconditional: the i >= 0 test was dropped
        .cfi_endproc
.LFE0:
        .size _Z6samplem, .-_Z6samplem
        .ident "GCC: (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3"
        .section .note.GNU-stack,"",@progbits
I'm no expert in assembly language, but I would expect to see code for the comparison i >= 0 and a conditional jump out of the loop. What's going on here??
GNU C++ 4.6.3 on Ubuntu Linux
size_t is unsigned, so the condition i>=0 is always true. It is impossible for i to be negative.
After reading a question related to the performance of sin/cos (Why is std::sin() and std::cos() slower than sin() and cos()?), I made some tests with its code and found a weird thing: if I call sin/cos with a float value, it is much slower than with a double when compiled with optimization.
#include <cmath>
#include <cstdio>
// Table dimension; both tables are N x N.
const int N = 4000;
// Result tables of 4000 x 4000 floats each (64,000,000 bytes apiece, matching
// the `.size` directives in the listings below).
float cosine[N][N];
float sine[N][N];
// Fills cosine[i][j] and sine[i][j] for every i and j in [0, N).
int main() {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
// i*j*2 is integer arithmetic; the multiply by M_PI and the divide by N are
// done in double, and the result is narrowed to float on assignment
// (cvtsi2sd / mulsd / divsd / cvtpd2ps in both listings below).
float ang = i*j*2*M_PI/N;
// Unqualified cos/sin applied to a float: the -O0 listing widens ang to
// double and calls cos and sin; the -O3 listing makes one sincosf call.
cosine[i][j] = cos(ang);
sine[i][j] = sin(ang);
}
}
}
With the above code I get:
With -O0: 2.402s
With -O1: 9.004s
With -O2: 9.013s
With -O3: 9.001s
Now if I change
float ang = i*j*2*M_PI/N;
To
double ang = i*j*2*M_PI/N;
I get:
With -O0: 2.362s
With -O1: 1.188s
With -O2: 1.197s
With -O3: 1.197s
How can the first test be so much faster without optimizations?
I'm using g++ (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2, 64 bits.
EDIT: Changed the title to better describe the problem.
EDIT: Added assembly code
Assembly for first test with O0:
.file "main.cpp"
# Zero-initialized result tables in .bss: 4000 x 4000 floats = 64,000,000
# bytes each, 32-byte aligned.
# Note: the ELF symbol type is spelled `@object`; `#` starts a comment in
# x86 GAS and would hide the operand.
        .globl cosine
        .bss
        .align 32
        .type cosine, @object
        .size cosine, 64000000
cosine:
        .zero 64000000
        .globl sine
        .align 32
        .type sine, @object
        .size sine, 64000000
sine:
        .zero 64000000
#-----------------------------------------------------------------------
# int main()  -- float version of the test, compiled at -O0
# ABI:   System V AMD64, AT&T syntax (unoptimized compiler output)
# Out:   %eax = 0
# Stack: 16 bytes of locals; i at -4(%rbp), j at -8(%rbp), ang at -12(%rbp)
# Flow:  for i in [0,4000): for j in [0,4000):
#            ang = (float)((double)(i*j*2) * .LC0 / .LC1)
#            cosine[i][j] = (float)cos((double)ang)
#            sine[i][j]   = (float)sin((double)ang)
#        Both calls go to the double-precision cos/sin, with a
#        float->double widening before and a double->float narrowing after.
# Note:  the ELF symbol type is spelled `@function`; `#` starts a comment
#        in x86 GAS and would hide the operand.
#-----------------------------------------------------------------------
        .text
        .globl main
        .type main, @function
main:
.LFB87:
        .cfi_startproc
        pushq %rbp
        .cfi_def_cfa_offset 16
        movq %rsp, %rbp
        .cfi_offset 6, -16
        .cfi_def_cfa_register 6
        subq $16, %rsp
        movl $0, -4(%rbp)               # i = 0
        jmp .L2
.L5:
        movl $0, -8(%rbp)               # j = 0
        jmp .L3
.L4:
        # ang = i*j*2 * .LC0 / .LC1, computed in double, stored as float
        movl -4(%rbp), %eax
        imull -8(%rbp), %eax            # eax = i*j
        addl %eax, %eax                 # eax = i*j*2
        cvtsi2sd %eax, %xmm0
        movsd .LC0(%rip), %xmm1
        mulsd %xmm1, %xmm0
        movsd .LC1(%rip), %xmm1
        divsd %xmm1, %xmm0
        unpcklpd %xmm0, %xmm0
        cvtpd2ps %xmm0, %xmm0           # double -> float
        movss %xmm0, -12(%rbp)          # ang

        # cosine[i][j] = (float)cos((double)ang)
        movss -12(%rbp), %xmm0
        cvtps2pd %xmm0, %xmm0           # float -> double
        call cos
        unpcklpd %xmm0, %xmm0
        cvtpd2ps %xmm0, %xmm0           # double -> float
        movl -8(%rbp), %eax
        cltq                            # rax = j
        movl -4(%rbp), %edx
        movslq %edx, %rdx
        imulq $4000, %rdx, %rdx         # rdx = i*4000
        leaq (%rdx,%rax), %rax          # rax = i*4000 + j
        movss %xmm0, cosine(,%rax,4)

        # sine[i][j] = (float)sin((double)ang)
        movss -12(%rbp), %xmm0
        cvtps2pd %xmm0, %xmm0           # float -> double
        call sin
        unpcklpd %xmm0, %xmm0
        cvtpd2ps %xmm0, %xmm0           # double -> float
        movl -8(%rbp), %eax
        cltq
        movl -4(%rbp), %edx
        movslq %edx, %rdx
        imulq $4000, %rdx, %rdx
        leaq (%rdx,%rax), %rax
        movss %xmm0, sine(,%rax,4)
        addl $1, -8(%rbp)               # j++
.L3:
        cmpl $3999, -8(%rbp)            # j <= 3999 ?
        setle %al
        testb %al, %al
        jne .L4
        addl $1, -4(%rbp)               # i++
.L2:
        cmpl $3999, -4(%rbp)            # i <= 3999 ?
        setle %al
        testb %al, %al
        jne .L5
        movl $0, %eax                   # return 0
        leave
        .cfi_def_cfa 7, 8
        ret
        .cfi_endproc
.LFE87:
        .size main, .-main
# Read-only constants used by main.
#   _ZL1N : the int constant N = 4000
#   .LC0  : double with bit pattern 0x400921FB54442D18 (pi), low word first
#   .LC1  : double with bit pattern 0x40AF400000000000 (4000.0), low word first
# Note: ELF type keywords are spelled with `@` (@object, @progbits); `#`
# starts a comment in x86 GAS and would hide the operand.
        .section .rodata
        .align 4
        .type _ZL1N, @object
        .size _ZL1N, 4
_ZL1N:
        .long 4000
        .align 8
.LC0:
        .long 1413754136
        .long 1074340347
        .align 8
.LC1:
        .long 0
        .long 1085227008
        .ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
        .section .note.GNU-stack,"",@progbits
Assembly for first test with O3:
.file "main.cpp"
#-----------------------------------------------------------------------
# int main()  -- float version of the test, compiled at -O3
# ABI:   System V AMD64, AT&T syntax (compiler output)
# Out:   %eax = 0
# Regs:  r15d = i                      r13d = 2*i
#        r12d = running 2*i*(j+1)      r14  = end of the current cosine row
#        rbx  = &cosine[i][j]          rbp  = &sine[i][j]
# Stack: six callee-saved pushes + 24 bytes; sincosf writes its two float
#        results to 12(%rsp) (sine) and 8(%rsp) (cosine).
# Flow:  the j = 0 element of each row is stored directly as 1.0f / 0.0f;
#        every other element costs one sincosf call on a float argument.
# Note:  the ELF symbol type is spelled `@function`; `#` starts a comment
#        in x86 GAS and would hide the operand.
#-----------------------------------------------------------------------
        .text
        .p2align 4,,15
        .globl main
        .type main, @function
main:
.LFB121:
        .cfi_startproc
        pushq %r15
        .cfi_def_cfa_offset 16
        xorl %r15d, %r15d               # i = 0
        .cfi_offset 15, -16
        pushq %r14
        .cfi_def_cfa_offset 24
        movl $cosine+16000, %r14d       # end of row 0 (4000 floats = 16000 bytes)
        .cfi_offset 14, -24
        pushq %r13
        .cfi_def_cfa_offset 32
        xorl %r13d, %r13d               # 2*i = 0
        .cfi_offset 13, -32
        pushq %r12
        .cfi_def_cfa_offset 40
        pushq %rbp
        .cfi_def_cfa_offset 48
        pushq %rbx
        .cfi_def_cfa_offset 56
        subq $24, %rsp                  # out-slots; keeps %rsp 16-byte aligned at the call
        .cfi_def_cfa_offset 80
        .p2align 4,,10
        .p2align 3
.L2:                                    # ---- start of row i ----
        movslq %r15d, %rbp
        .cfi_offset 3, -56
        .cfi_offset 6, -48
        .cfi_offset 12, -40
        movl %r13d, %r12d               # r12d = 2*i*(j+1) for j = 0
        movl $0x3f800000, %edx          # bit pattern of 1.0f: cosine value for j = 0
        imulq $16000, %rbp, %rbp        # byte offset of row i
        xorl %eax, %eax                 # bit pattern of 0.0f: sine value for j = 0
        leaq cosine(%rbp), %rbx         # rbx = &cosine[i][0]
        addq $sine, %rbp                # rbp = &sine[i][0]
        jmp .L5
        .p2align 4,,10
        .p2align 3
.L3:                                    # ---- compute element j >= 1 ----
        movl %r12d, %eax
        leaq 8(%rsp), %rsi              # third sincosf argument: where the cosine goes
        leaq 12(%rsp), %rdi             # second sincosf argument: where the sine goes
        subl %r13d, %eax                # eax = 2*i*j
        cvtsi2sd %eax, %xmm0
        mulsd .LC2(%rip), %xmm0
        divsd .LC3(%rip), %xmm0
        unpcklpd %xmm0, %xmm0
        cvtpd2ps %xmm0, %xmm0           # double -> float: ang
        call sincosf                    # single-precision sine and cosine in one call
        movl 8(%rsp), %edx              # cosine result bits
        movl 12(%rsp), %eax             # sine result bits
.L5:                                    # ---- store element, advance j ----
        movl %edx, (%rbx)               # cosine[i][j]
        addq $4, %rbx
        movl %eax, 0(%rbp)              # sine[i][j]
        addl %r13d, %r12d
        addq $4, %rbp
        cmpq %r14, %rbx                 # reached the end of the row?
        jne .L3
        addl $1, %r15d                  # i++
        addl $2, %r13d                  # keep r13d = 2*i
        leaq 16000(%rbx), %r14          # end of the next row
        cmpl $4000, %r15d
        jne .L2
        addq $24, %rsp
        .cfi_def_cfa_offset 56
        xorl %eax, %eax                 # return 0
        popq %rbx
        .cfi_def_cfa_offset 48
        popq %rbp
        .cfi_def_cfa_offset 40
        popq %r12
        .cfi_def_cfa_offset 32
        popq %r13
        .cfi_def_cfa_offset 24
        popq %r14
        .cfi_def_cfa_offset 16
        popq %r15
        .cfi_def_cfa_offset 8
        ret
        .cfi_endproc
.LFE121:
        .size main, .-main
# Data for the -O3 build.
#   cosine, sine : zero-initialized 4000 x 4000 float tables in .bss
#                  (64,000,000 bytes each, 32-byte aligned)
#   .LC2 : double with bit pattern 0x400921FB54442D18 (pi), low word first
#   .LC3 : double with bit pattern 0x40AF400000000000 (4000.0), low word first
# Note: ELF type keywords are spelled with `@` (@object, @progbits); `#`
# starts a comment in x86 GAS and would hide the operand.
        .globl cosine
        .bss
        .align 32
        .type cosine, @object
        .size cosine, 64000000
cosine:
        .zero 64000000
        .globl sine
        .align 32
        .type sine, @object
        .size sine, 64000000
sine:
        .zero 64000000
        .section .rodata.cst8,"aM",@progbits,8
        .align 8
.LC2:
        .long 1413754136
        .long 1074340347
        .align 8
.LC3:
        .long 0
        .long 1085227008
        .ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
        .section .note.GNU-stack,"",@progbits
Here's a possibility:
In C, cos is double precision and cosf is single precision. In C++, std::cos has overloads for both double and single.
You aren't calling std::cos. If <cmath> doesn't also overload ::cos (as far as I know, it is not required to), then you are just calling the C double precision function. If this is the case, then you're suffering the cost of converting between float, double, and back.
Now, some standard libraries implement cos(float x) as (float)cos((double)x), so even if you are calling the float function it might still be doing conversions behind the scenes.
This shouldn't account for a 9x performance difference, though.
AFAIK it's because computers work at double precision natively. Using float requires conversions.