This question already has an answer here: Dual emission of constructor symbols (1 answer). Closed 9 years ago.
C++ code:
#include <cstdio>
#include <cstdlib>
struct trivialStruct
{
trivialStruct();
~trivialStruct();
int *a;
float *b;
float *c;
};
trivialStruct::trivialStruct() : a((int*)malloc(sizeof(int))), b((float*)malloc(sizeof(float))), c((float*)malloc(sizeof(float)))
{
*a = 100;
*b = 200;
*c = 300;
}
trivialStruct::~trivialStruct()
{
free(a);
free(b);
free(c);
a = nullptr;
b = nullptr;
c = nullptr;
}
int main()
{
trivialStruct A;
printf("%d, %f, %f", *A.a, *A.b, *A.c);
return 0;
}
Assembly:
.section __TEXT,__text,regular,pure_instructions
.globl __ZN13trivialStructC1Ev
.align 4, 0x90
__ZN13trivialStructC1Ev: ## #_ZN13trivialStructC1Ev
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp5:
.cfi_def_cfa_register rbp
push R15
push R14
push RBX
push RAX
Ltmp6:
.cfi_offset rbx, -40
Ltmp7:
.cfi_offset r14, -32
Ltmp8:
.cfi_offset r15, -24
mov RBX, RDI
mov EDI, 4
call _malloc
mov R14, RAX
mov QWORD PTR [RBX], R14
mov EDI, 4
call _malloc
mov R15, RAX
mov QWORD PTR [RBX + 8], R15
mov EDI, 4
call _malloc
mov QWORD PTR [RBX + 16], RAX
mov DWORD PTR [R14], 100
mov DWORD PTR [R15], 1128792064
mov DWORD PTR [RAX], 1133903872
add RSP, 8
pop RBX
pop R14
pop R15
pop RBP
ret
.cfi_endproc
.globl __ZN13trivialStructC2Ev
.align 4, 0x90
__ZN13trivialStructC2Ev: ## #_ZN13trivialStructC2Ev
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp12:
.cfi_def_cfa_offset 16
Ltmp13:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp14:
.cfi_def_cfa_register rbp
push R15
push R14
push RBX
push RAX
Ltmp15:
.cfi_offset rbx, -40
Ltmp16:
.cfi_offset r14, -32
Ltmp17:
.cfi_offset r15, -24
mov RBX, RDI
mov EDI, 4
call _malloc
mov R14, RAX
mov QWORD PTR [RBX], R14
mov EDI, 4
call _malloc
mov R15, RAX
mov QWORD PTR [RBX + 8], R15
mov EDI, 4
call _malloc
mov QWORD PTR [RBX + 16], RAX
mov DWORD PTR [R14], 100
mov DWORD PTR [R15], 1128792064
mov DWORD PTR [RAX], 1133903872
add RSP, 8
pop RBX
pop R14
pop R15
pop RBP
ret
.cfi_endproc
.globl __ZN13trivialStructD1Ev
.align 4, 0x90
__ZN13trivialStructD1Ev: ## #_ZN13trivialStructD1Ev
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp21:
.cfi_def_cfa_offset 16
Ltmp22:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp23:
.cfi_def_cfa_register rbp
push RBX
push RAX
Ltmp24:
.cfi_offset rbx, -24
mov RBX, RDI
mov RDI, QWORD PTR [RBX]
call _free
mov RDI, QWORD PTR [RBX + 8]
call _free
mov RDI, QWORD PTR [RBX + 16]
call _free
mov QWORD PTR [RBX + 16], 0
mov QWORD PTR [RBX + 8], 0
mov QWORD PTR [RBX], 0
add RSP, 8
pop RBX
pop RBP
ret
.cfi_endproc
.globl __ZN13trivialStructD2Ev
.align 4, 0x90
__ZN13trivialStructD2Ev: ## #_ZN13trivialStructD2Ev
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp28:
.cfi_def_cfa_offset 16
Ltmp29:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp30:
.cfi_def_cfa_register rbp
push RBX
push RAX
Ltmp31:
.cfi_offset rbx, -24
mov RBX, RDI
mov RDI, QWORD PTR [RBX]
call _free
mov RDI, QWORD PTR [RBX + 8]
call _free
mov RDI, QWORD PTR [RBX + 16]
call _free
mov QWORD PTR [RBX + 16], 0
mov QWORD PTR [RBX + 8], 0
mov QWORD PTR [RBX], 0
add RSP, 8
pop RBX
pop RBP
ret
.cfi_endproc
.section __TEXT,__literal8,8byte_literals
.align 3
LCPI4_0:
.quad 4641240890982006784 ## double 200
LCPI4_1:
.quad 4643985272004935680 ## double 300
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## #main
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp34:
.cfi_def_cfa_offset 16
Ltmp35:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp36:
.cfi_def_cfa_register rbp
lea RDI, QWORD PTR [RIP + L_.str]
movsd XMM0, QWORD PTR [RIP + LCPI4_0]
movsd XMM1, QWORD PTR [RIP + LCPI4_1]
mov ESI, 100
mov AL, 2
call _printf
xor EAX, EAX
pop RBP
ret
.cfi_endproc
.section __TEXT,__cstring,cstring_literals
L_.str: ## #.str
.asciz "%d, %f, %f"
.subsections_via_symbols
Command:
clang++ -S -O2 -std=c++11 -mllvm --x86-asm-syntax=intel -fno-exceptions main.cpp
As you can see, two parts of the code are the same (constructor and destructor):
__ZN13trivialStructC1Ev: ## #_ZN13trivialStructC1Ev
__ZN13trivialStructC2Ev: ## #_ZN13trivialStructC2Ev
__ZN13trivialStructD1Ev: ## #_ZN13trivialStructD1Ev
__ZN13trivialStructD2Ev: ## #_ZN13trivialStructD2Ev
I have no idea why the compiler generates two copies of the code rather than just one.
I am not familiar with assembly, but it looks like this just makes the code fatter (and maybe slower).
This is part of the ABI for your platform, and is outside the scope of the standard. Both constructors and destructors can generate multiple symbols in the binary. For example, the Itanium C++ ABI will generate up to three constructors and three destructors:
complete object constructor
base object constructor
complete object allocating constructor
deleting destructor
complete object destructor
base object destructor
The different symbols take on slightly different responsibilities as the implementation might need to do different things depending on how the object is being created/destroyed. In your particular case, the code is simple enough that all constructors might generate exactly the same code, but they need to be there to comply with the ABI, and the ABI has them to enable more complex use cases.
For example, a complete object constructor will initialize virtual bases, while the base object constructor will skip this part of construction. If there is multiple/virtual inheritance and virtual functions, the vptr in the complete object may have to jump through different sets of intermediate tables depending on how this subobject is being instantiated.
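For a concrete illustration, here is a minimal sketch (hypothetical types, not from the question) where the complete and base object constructors genuinely diverge because of a virtual base:
struct VBase { int v; VBase() : v(1) {} };
struct Mid : virtual VBase {
    // C1 (complete object ctor): constructs the VBase subobject first.
    // C2 (base object ctor): skips VBase entirely; the most-derived
    // class's constructor is responsible for it.
    Mid() {}
};
struct Most : Mid {
    Most() {} // constructs VBase itself, then invokes Mid's C2
};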
If you want an explanation other than "the ABI mandates it", you should take a look at the documentation for your particular ABI. You can also take a look at Inside the C++ Object Model, which, even if old, contains a good description of what the problems to solve are and some of the solutions provided.
Related
Any idea why code that looks like this
list<Foo> fooList;
processList(&fooList);
Generates the following machine code
lea rax, [rbp-48]
mov rdi, rax
call processList(std::__cxx11::list<Foo, std::allocator<Foo> >*)
lea rax, [rbp-48]
mov rdi, rax
call std::__cxx11::list<Foo, std::allocator<Foo> >::~list()
jmp .L11
mov rbx, rax
lea rax, [rbp-48]
mov rdi, rax
call std::__cxx11::list<Foo, std::allocator<Foo> >::~list()
mov rax, rbx
mov rdi, rax
call _Unwind_Resume
.L11:
add rsp, 40
pop rbx
pop rbp
ret
In particular, I don't see any paths leading to the line after the unconditional jmp .L11
(this is with GCC 6.2 with no optimization, generated on Compiler Explorer)
For comparison, clang 5.0.0 produces
call processList(std::__cxx11::list<Foo, std::allocator<Foo> >*)
jmp .LBB5_1
.LBB5_1:
lea rdi, [rbp - 24]
call std::__cxx11::list<Foo, std::allocator<Foo> >::~list()
add rsp, 48
pop rbp
ret
lea rdi, [rbp - 24]
mov ecx, edx
mov qword ptr [rbp - 32], rax
mov dword ptr [rbp - 36], ecx
call std::__cxx11::list<Foo, std::allocator<Foo> >::~list()
mov rdi, qword ptr [rbp - 32]
call _Unwind_Resume
Again there is an unconditional jump to a return block, and an unwind block (starting with the second lea rdi) that seems unreachable.
After a bit of research on C++ exception mechanisms, my conclusion is that the process is as follows:
At the point of the exception throw, __cxa_throw gets called. This is somewhat like longjmp() in that the function gets called but never returns. The function performs two main tasks:
It walks up the call stack looking for a catch. If it doesn't find any, std::terminate gets called.
If it does find a catch block then it calls all of the unwind handlers between the current function and the catch block, then calls the catch block.
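To tie this back to the machine code above: any function with a local that needs cleanup gets such a landing pad, and the landing pad is reached only by the unwinder, never by ordinary control flow, which is why no path appears to lead to it. A minimal sketch of the pattern (reusing the names from the question, simplified to int):
#include <list>

void processList(std::list<int>*); // assumed to potentially throw

void example()
{
    std::list<int> fooList; // nontrivial destructor
    processList(&fooList);
    // normal path: ~list() runs, then jmp over the landing pad
    // exceptional path: the unwinder enters the landing pad, runs
    // ~list(), then calls _Unwind_Resume to continue unwinding
}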
Back to my original machine code (with filtering turned off in Compiler Explorer). My comments follow the hashes.
# this is the normative path
call std::list<Handle, std::allocator<Handle> >::~list()
# unconditional jump around the unwind handler
jmp .L11
.L10:
# unwind handler code, calls the local variable destructor
mov rbx, rax
.loc 2 30 0
lea rax, [rbp-32]
mov rdi, rax
call std::list<Handle, std::allocator<Handle> >::~list()
mov rax, rbx
mov rdi, rax
.LEHB1:
# carry on unwinding
call _Unwind_Resume
.L11:
Then there is the exception table:
.section .gcc_except_table,"a",#progbits
.LLSDA1386:
.byte 0xff
.byte 0xff
.byte 0x1
.uleb128 .LLSDACSE1386-.LLSDACSB1386
.LLSDACSB1386:
# entry for unwind handler: each call-site record is
# (region start, region length, landing pad, action)
.uleb128 .LEHB0-.LFB1386   # region start, relative to function start
.uleb128 .LEHE0-.LEHB0     # region length
.uleb128 .L10-.LFB1386     # landing pad: the cleanup code at .L10
.uleb128 0                 # action 0: cleanup only, no catch handler
.uleb128 .LEHB1-.LFB1386   # second region, around the _Unwind_Resume call
.uleb128 .LEHE1-.LEHB1
.uleb128 0                 # landing pad 0: no handler for this region
.uleb128 0
I guess that the unwind handler function can work out the positions of the unwind handler blocks from the addresses on the stack and the offsets in this table.
I am trying to debug a tricky core dump (from an -O2 optimized binary).
// Caller Function
void caller(Container* c)
{
std::list<Message*> msgs;
if(!decoder.called(c->buf_, msgs))
{
....
.....
    }
}
// Called Function
bool
Decoder::called(Buffer* buf, list<Message*>& msgs)
{
add_data(buf); // Inlined code to append buf to decoders buf chain
while(m_data_in && m_data_in->length() > 0)
{
.....
}
}
In both the caller and the callee, the first argument is optimized out, which means it must be in a register somewhere.
Caller Disassembly:
push %r15
mov %rdi,%r15
push %r14
push %r13
push %r12
push %rbp
push %rbx
sub $0x68,%rsp
test %rsi,%rsi
je 0x8ccd62
cmpq $0x0,(%rsi)
je 0x8ccd62
lea 0x40(%rsp),%rax
lea 0x1b8(%rdi),%rdi
mov %rax,(%rsp)
mov %rax,0x40(%rsp)
mov %rax,%rdx
mov %rax,0x48(%rsp)
mov (%rsi),%rsi
callq 0x8cc820
Caller Register Info:
rax 0x7fbfffc7e0 548682057696
rbx 0x2a97905ba0 182931446688
rcx 0x0 0
rdx 0x2 2
rsi 0x1 1
rdi 0x7fbfffc7e2 548682057698
rbp 0x4f 0x4f
rsp 0x7fbfffc870 0x7fbfffc870
r8 0x40 64
r9 0x20 32
r10 0x7fbfffc7e0 548682057696
r11 0x2abe466600 183580911104
r12 0x7fbfffd910 548682062096 // THIS IS HOLDING buf_
r13 0x7fbfffdec0 548682063552
r14 0x5dc 1500
r15 0x2a97905ba0 182931446688
rip 0x8cca89 0x8cca89
eflags 0x206 [ PF IF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
Called function Disassembly:
push %r14
push %r13
mov %rdx,%r13
push %r12
mov %rdi,%r12
push %rbp
push %rbx
sub $0x10,%rsp
mov 0x8(%rdi),%rdx
test %rdx,%rdx
jne 0x8cc843
jmpq 0x8cc9cb
mov %rax,%rdx
mov 0x8(%rdx),%rax
test %rax,%rax
mov %rsi,0x8(%rdx)
mov 0x8(%r12),%rax
test %rax,%rax
xor %edx,%edx
add 0x4(%rax),%edx
mov 0x8(%rax),%rax
lea 0x8(%rsp),%rsi
mov %r12,%rdi
movq $0x0,0x8(%rsp)
Called function Register Info:
rax 0x7fbfffc7e0 548682057696
rbx 0x2abc49f9c0 183547591104
rcx 0x0 0
rdx 0x2 2
rsi 0x1 1
rdi 0x7fbfffc7e2 548682057698
rbp 0xffffffff 0xffffffff
rsp 0x7fbfffc830 0x7fbfffc830
r8 0x40 64
r9 0x20 32
r10 0x7fbfffc7e0 548682057696
r11 0x2abe466600 183580911104
r12 0x2a97905d58 182931447128
r13 0x7fbfffc8b0 548682057904
r14 0x5dc 1500
r15 0x2a97905ba0 182931446688
rip 0x8cc88a 0x8cc88a
eflags 0x206 [ PF IF ]
cs 0x33 51
ss 0x2b 43
ds 0x0 0
es 0x0 0
fs 0x0 0
gs 0x0 0
The issue is that, in the called function, it appears the add_data function achieved nothing.
So I wanted to know whether, in the disassembly of the called function, we see the buf_ pointer being used anywhere (register r12 in the callee).
I do understand assembly to some level, but all the code inlining has left me confused.
I would appreciate some help in demystifying the called function's disassembly.
UPDATE:
add_data does the following:
if (m_data_in) {
m_data_in->next = data;
} else {
m_data_in = data;
}
This looks like the if (m_data_in) check:
mov 0x8(%rdi),%rdx
test %rdx,%rdx
jne 0x8cc843
jmpq 0x8cc9cb
Now, I don't quite know where 0x8cc843 and 0x8cc9cb are located in your code, so I can't really follow the code further. There is still not enough code and information to say exactly what is going on in the original question. I'm happy to fill in more of this answer if more information is provided.
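For what it's worth, here is a hypothetical layout consistent with that load (an assumption reconstructed from the disassembly, not your actual class): this arrives in %rdi, and m_data_in is read from offset 0x8, so some other 8-byte member (or a vptr) must occupy offset 0x0.
struct Buffer;

struct DecoderLayoutSketch {        // hypothetical, for illustration only
    void*   vptr_or_first_member;   // offset 0x0 (assumed)
    Buffer* m_data_in;              // offset 0x8: mov 0x8(%rdi),%rdx
};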
For testing purposes, I have written a simple method to calculate the transpose of an n×n matrix:
#include <cstddef> // for size_t
void transpose(const size_t _n, double* _A) {
for(unsigned int i=0; i < _n; ++i) {
for(unsigned int j=i+1; j < _n; ++j) {
double tmp = _A[i*_n+j];
_A[i*_n+j] = _A[j*_n+i];
_A[j*_n+i] = tmp;
}
}
}
When using optimization level O3 or Ofast, I expected the compiler to unroll some loops, which would lead to higher performance, especially when the matrix size is a multiple of 2 (i.e., the double loop body can be performed each iteration) or similar. Instead, what I measured was the exact opposite: powers of 2 actually show a significant spike in execution time.
These spikes are also at regular intervals of 64, more pronounced at intervals of 128, and so on. Each spike extends to the neighboring matrix sizes, as in the following table:
size n  time (µs)
1020 2649
1021 2815
1022 3100
1023 5428
1024 15791
1025 6778
1026 3106
1027 2847
1028 2660
1029 3038
1030 2613
I compiled with gcc 4.8.2, but the same thing happens with clang 3.5, so this might be some generic thing?
So my question basically is: Why is there this periodic increase in execution time? Is it some generic thing coming with any of the optimization options (as it happens with clang and gcc alike)? If so which optimization option is causing this?
And how can this be so significant that even the O0 version of the program outperforms the O3 version at multiples of 512?
EDIT: Note the magnitude of the spikes in this (logarithmic) plot. Transposing a 1024x1024 matrix with optimization actually takes as much time as transposing a 1300x1300 matrix without optimization. If this is a cache-fault / page-fault problem, then someone needs to explain to me why the memory layout is so significantly different for the optimized version of the program that it fails for powers of two, only to recover high performance for slightly larger matrices. Shouldn't cache-faults create more of a step-like pattern? Why do the execution times go down again at all? (And why should optimization create cache-faults that weren't there before?)
EDIT: The following should be the assembly code that gcc produced:
no optimization (O0):
_Z9transposemRPd:
.LFB0:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
mov rbp, rsp
.cfi_def_cfa_register 6
mov QWORD PTR [rbp-24], rdi
mov QWORD PTR [rbp-32], rsi
mov DWORD PTR [rbp-4], 0
jmp .L2
.L5:
mov eax, DWORD PTR [rbp-4]
add eax, 1
mov DWORD PTR [rbp-8], eax
jmp .L3
.L4:
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-4]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-8]
add rax, rcx
sal rax, 3
add rax, rdx
mov rax, QWORD PTR [rax]
mov QWORD PTR [rbp-16], rax
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-4]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-8]
add rax, rcx
sal rax, 3
add rdx, rax
mov rax, QWORD PTR [rbp-32]
mov rcx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
imul rax, QWORD PTR [rbp-24]
mov rsi, rax
mov eax, DWORD PTR [rbp-4]
add rax, rsi
sal rax, 3
add rax, rcx
mov rax, QWORD PTR [rax]
mov QWORD PTR [rdx], rax
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-4]
add rax, rcx
sal rax, 3
add rdx, rax
mov rax, QWORD PTR [rbp-16]
mov QWORD PTR [rdx], rax
add DWORD PTR [rbp-8], 1
.L3:
mov eax, DWORD PTR [rbp-8]
cmp rax, QWORD PTR [rbp-24]
jb .L4
add DWORD PTR [rbp-4], 1
.L2:
mov eax, DWORD PTR [rbp-4]
cmp rax, QWORD PTR [rbp-24]
jb .L5
pop rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size _Z9transposemRPd, .-_Z9transposemRPd
.ident "GCC: (Debian 4.8.2-15) 4.8.2"
.section .note.GNU-stack,"",#progbits
with optimization (O3):
_Z9transposemRPd:
.LFB0:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
xor r11d, r11d
xor ebx, ebx
.L2:
cmp r11, rdi
mov r9, r11
jae .L10
.p2align 4,,10
.p2align 3
.L7:
add ebx, 1
mov r11d, ebx
cmp rdi, r11
mov rax, r11
jbe .L2
mov r10, r9
mov r8, QWORD PTR [rsi]
mov edx, ebx
imul r10, rdi
.p2align 4,,10
.p2align 3
.L6:
lea rcx, [rax+r10]
add edx, 1
imul rax, rdi
lea rcx, [r8+rcx*8]
movsd xmm0, QWORD PTR [rcx]
add rax, r9
lea rax, [r8+rax*8]
movsd xmm1, QWORD PTR [rax]
movsd QWORD PTR [rcx], xmm1
movsd QWORD PTR [rax], xmm0
mov eax, edx
cmp rdi, rax
ja .L6
cmp r11, rdi
mov r9, r11
jb .L7
.L10:
pop rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size _Z9transposemRPd, .-_Z9transposemRPd
.ident "GCC: (Debian 4.8.2-15) 4.8.2"
.section .note.GNU-stack,"",#progbits
The periodic increase in execution time must be due to the cache being only N-way associative instead of fully associative. You are witnessing a hash collision in the cache line selection algorithm.
The fastest L1 cache has a smaller number of cache lines than the next level L2. In each level each cache line can be filled only from a limited set of sources.
Typical HW implementations of cache line selection algorithms will just use a few bits from the memory address to determine in which cache slot the data should be written -- in HW, bit shifts are free.
This causes a competition between memory ranges e.g. between addresses 0x300010 and 0x341010.
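A small sketch shows why those two addresses can collide, assuming a typical 32 KiB, 8-way L1 with 64-byte lines (so 64 sets, selected by address bits 6-11; the concrete geometry is an assumption and varies between CPUs):
#include <cstdio>
#include <cstdint>

static unsigned set_index(std::uintptr_t addr)
{
    return static_cast<unsigned>((addr >> 6) & 63); // bits 6..11 pick the set
}

int main()
{
    // prints "0 0": both addresses map to the same cache set, because
    // they differ by a multiple of 4 KiB (64 sets * 64-byte lines)
    std::printf("%u %u\n", set_index(0x300010), set_index(0x341010));
}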
In a fully sequential algorithm this doesn't matter -- N is large enough for practically all algorithms of the form:
for (i=0;i<1000;i++) a[i] += b[i] * c[i] + d[i];
But when the number of the inputs (or outputs) gets larger, which happens internally when the algorithm is optimized, having one input in the cache forces another input out of the cache.
// one possible method of optimization with 2 outputs and 6 inputs
// with two unrelated execution paths -- should be faster, but maybe it isn't
for (i=0;i<500;i++) {
a[i] += b[i] * c[i] + d[i];
a[i+500] += b[i+500] * c[i+500] + d[i+500];
}
A graph in Example 5: Cache Associativity illustrates a 512-byte offset between matrix lines being a global worst-case dimension for the particular system. When this is known, a working mitigation is to over-allocate the matrix horizontally to some other dimension, e.g. char matrix[512][512 + 64].
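Applied to the transpose from the question, the same mitigation looks roughly like this sketch (the stride of n + 8 is just an illustrative choice, not a tuned value):
#include <cstddef>
#include <vector>

void transpose_padded(std::size_t n, std::size_t stride, double* a)
{
    // requires stride >= n; the matrix is stored as n rows of `stride` doubles
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t j = i + 1; j < n; ++j) {
            double tmp = a[i * stride + j];
            a[i * stride + j] = a[j * stride + i];
            a[j * stride + i] = tmp;
        }
}

// usage: std::vector<double> m(n * (n + 8)); transpose_padded(n, n + 8, m.data());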
The improvement in performance is likely related to CPU/RAM caching.
When the data size is not a power of 2, a cache line load (like 16, 32, or 64 words) transfers more data than is required, tying up the bus uselessly, as it turns out. For a data set which is a power of 2, all of the prefetched data is used.
I bet if you were to disable L1 and L2 caching, the performance would be completely smooth and predictable. But it would be much slower. Caching really helps performance!
Comment with code: In the -O3 case, with
#include <cstdlib>
extern void transpose(const size_t n, double* a)
{
for (size_t i = 0; i < n; ++i) {
for (size_t j = i + 1; j < n; ++j) {
std::swap(a[i * n + j], a[j * n + i]); // or your expanded version.
}
}
}
compiling with
$ g++ --version
g++ (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1
...
$ g++ -g1 -std=c++11 -Wall -o test.S -S test.cpp -O3
I get
_Z9transposemPd:
.LFB68:
.cfi_startproc
.LBB2:
testq %rdi, %rdi
je .L1
leaq 8(,%rdi,8), %r10
xorl %r8d, %r8d
.LBB3:
addq $1, %r8
leaq -8(%r10), %rcx
cmpq %rdi, %r8
leaq (%rsi,%rcx), %r9
je .L1
.p2align 4,,10
.p2align 3
.L10:
movq %r9, %rdx
movq %r8, %rax
.p2align 4,,10
.p2align 3
.L5:
.LBB4:
movsd (%rdx), %xmm1
movsd (%rsi,%rax,8), %xmm0
movsd %xmm1, (%rsi,%rax,8)
.LBE4:
addq $1, %rax
.LBB5:
movsd %xmm0, (%rdx)
addq %rcx, %rdx
.LBE5:
cmpq %rdi, %rax
jne .L5
addq $1, %r8
addq %r10, %r9
addq %rcx, %rsi
cmpq %rdi, %r8
jne .L10
.L1:
rep ret
.LBE3:
.LBE2:
.cfi_endproc
And something quite different if I add -m32.
(Note: it makes no difference to the assembly whether I use std::swap or your variant)
In order to understand what is causing the spikes, though, you probably want to visualize the memory operations going on.
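For example (assuming Linux and a benchmark binary named ./transpose, both hypothetical here), hardware performance counters give a quick first look at whether the spikes track cache misses:
$ perf stat -e cache-references,cache-misses ./transpose 1023
$ perf stat -e cache-references,cache-misses ./transpose 1024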
To add to the other answers: g++ -std=c++11 -march=core2 -O3 -c -S, with gcc 4.8.2 (MacPorts gcc48 4.8.2_0) on x86_64-apple-darwin13.0.0:
__Z9transposemPd:
LFB0:
testq %rdi, %rdi
je L1
leaq 8(,%rdi,8), %r10
xorl %r8d, %r8d
leaq -8(%r10), %rcx
addq $1, %r8
leaq (%rsi,%rcx), %r9
cmpq %rdi, %r8
je L1
.align 4,0x90
L10:
movq %r9, %rdx
movq %r8, %rax
.align 4,0x90
L5:
movsd (%rdx), %xmm0
movsd (%rsi,%rax,8), %xmm1
movsd %xmm0, (%rsi,%rax,8)
addq $1, %rax
movsd %xmm1, (%rdx)
addq %rcx, %rdx
cmpq %rdi, %rax
jne L5
addq $1, %r8
addq %r10, %r9
addq %rcx, %rsi
cmpq %rdi, %r8
jne L10
L1:
rep; ret
Basically the same as #ksfone's code, for:
#include <cstddef>
void transpose(const size_t _n, double* _A) {
for(size_t i=0; i < _n; ++i) {
for(size_t j=i+1; j < _n; ++j) {
double tmp = _A[i*_n+j];
_A[i*_n+j] = _A[j*_n+i];
_A[j*_n+i] = tmp;
}
}
}
Apart from the Mach-O 'as' differences (extra underscore, align, and DWARF locations), it's the same, but very different from the OP's assembly output: a much 'tighter' inner loop.
Doing some tests with new and malloc, I expected that new would need more assembly code than malloc, but it does not.
clang++ -S -mllvm --x86-asm-syntax=intel main.cpp
new
int main()
{
char *a = new char[1024];
delete []a;
return 0;
}
malloc
#include <cstdlib> // pulls in declaration of malloc, free
int main()
{
char *a = (char*)malloc(1024);
free(a);
return 0;
}
OK, char is a POD, so I guess I don't need to worry about the cost of a constructor and destructor, but new does need to deal with exceptions (even if I don't add -fno-exceptions, the result is the same). How can the assembly be the same?
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## #main
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp4:
.cfi_def_cfa_register rbp
xor EAX, EAX
pop RBP
ret
.cfi_endproc
.subsections_via_symbols
Edit:
I changed the code again, and this time it works. I have to say, the compiler is very smart.
new
#include <cstdio>
int main()
{
char *a = new char[1024];
for(int i = 0; i != 1024; ++i){
printf("%c", a[i]);
}
delete []a;
return 0;
}
malloc
#include <cstdio>
#include <cstdlib> // pulls in declaration of malloc, free
int main()
{
char *a = (char*)malloc(1024);
for(int i = 0; i != 1024; ++i){
printf("%c", a[i]);
}
free(a);
return 0;
}
clang++ -S -O3 -mllvm --x86-asm-syntax=intel -fno-exceptions main.cpp
assembly of the C version (malloc):
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## #main
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp5:
.cfi_def_cfa_register rbp
push R14
push RBX
Ltmp6:
.cfi_offset rbx, -32
Ltmp7:
.cfi_offset r14, -24
mov EDI, 1024
call _malloc
mov R14, RAX
mov EBX, 1
xor EDI, EDI
jmp LBB0_1
.align 4, 0x90
LBB0_2: ## %for.body.for.body_crit_edge
## in Loop: Header=BB0_1 Depth=1
movsx EDI, BYTE PTR [R14 + RBX]
inc RBX
LBB0_1: ## %for.body
## =>This Inner Loop Header: Depth=1
call _putchar
cmp EBX, 1024
jne LBB0_2
## BB#3: ## %for.end
mov RDI, R14
call _free
xor EAX, EAX
pop RBX
pop R14
pop RBP
ret
.cfi_endproc
.subsections_via_symbols
assembly of the C++ version (new):
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## #main
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp5:
.cfi_def_cfa_register rbp
push R14
push RBX
Ltmp6:
.cfi_offset rbx, -32
Ltmp7:
.cfi_offset r14, -24
mov EDI, 1024
call __Znam
mov R14, RAX
mov EBX, 1
xor EDI, EDI
jmp LBB0_1
.align 4, 0x90
LBB0_2: ## %for.body.for.body_crit_edge
## in Loop: Header=BB0_1 Depth=1
movsx EDI, BYTE PTR [R14 + RBX]
inc RBX
LBB0_1: ## %for.body
## =>This Inner Loop Header: Depth=1
call _putchar
cmp EBX, 1024
jne LBB0_2
## BB#3: ## %for.end
test R14, R14
je LBB0_5
## BB#4: ## %delete.notnull
mov RDI, R14
call __ZdaPv
LBB0_5: ## %delete.end
xor EAX, EAX
pop RBX
pop R14
pop RBP
ret
.cfi_endproc
.subsections_via_symbols
The C++ version calls new and delete rather than malloc and free. Before deleting the buffer, the C++ assembly checks whether the pointer is null: if it is, it skips the delete; otherwise it deletes the buffer.
Your assembly code looks like it's not actually doing anything. The compiler detected that you weren't doing anything important inside main and optimized out everything except "return 0". You might try disabling optimizations or printing the value of the pointer a.
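For example, a minimal variant that keeps the allocation observable (so the optimizer cannot simply discard it) might look like this sketch:
#include <cstdio>

int main()
{
    char *a = new char[1024];
    std::printf("%p\n", static_cast<void*>(a)); // using the pointer keeps
                                                // the allocation alive
    delete [] a;
    return 0;
}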
Given that you specified -fno-exceptions on your command line, it's not surprising that the code is the same. Look at the difference in assembly between -fno-exceptions and exceptions enabled.
I have been reading through the following series of articles: http://www.altdevblogaday.com/2011/11/09/a-low-level-curriculum-for-c-and-c
The disassembled code shown in the article and the disassembled code I manage to produce while running the same code vary quite significantly, and I lack the understanding to explain the differences.
Is there anyone who can step through it line by line and perhaps explain what it's doing at each step? I get the feeling, from the searching around I have done, that the first few lines have something to do with frame pointers. There also seem to be a few extra lines in my disassembled code that ensure registers are empty before placing new values into them (absent from the code in the article).
I am running this on OS X (the original author is using Windows) using the g++ compiler from within Xcode 4. I am really clueless as to whether these variances are due to the OS, the architecture (32-bit vs 64-bit, maybe?), or the compiler itself. It could even be the code, I guess: mine is wrapped inside the main function declaration, whereas the original code makes no mention of this.
My code:
int main(int argc, const char * argv[])
{
int x = 1;
int y = 2;
int z = 0;
z = x + y;
}
My disassembled code:
0x100000f40: pushq %rbp
0x100000f41: movq %rsp, %rbp
0x100000f44: movl $0, %eax
0x100000f49: movl %edi, -4(%rbp)
0x100000f4c: movq %rsi, -16(%rbp)
0x100000f50: movl $1, -20(%rbp)
0x100000f57: movl $2, -24(%rbp)
0x100000f5e: movl $0, -28(%rbp)
0x100000f65: movl -20(%rbp), %edi
0x100000f68: addl -24(%rbp), %edi
0x100000f6b: movl %edi, -28(%rbp)
0x100000f6e: popq %rbp
0x100000f6f: ret
The disassembled code from the original article:
mov dword ptr [ebp-8],1
mov dword ptr [ebp-14h],2
mov dword ptr [ebp-20h],0
mov eax, dword ptr [ebp-8]
add eax, dword ptr [ebp-14h]
mov dword ptr [ebp-20h],eax
A full line-by-line breakdown would be extremely enlightening, but any help in understanding this would be appreciated.
All of the code from the original article is in your code; there's just some extra stuff around it. This:
0x100000f50: movl $1, -20(%rbp)
0x100000f57: movl $2, -24(%rbp)
0x100000f5e: movl $0, -28(%rbp)
0x100000f65: movl -20(%rbp), %edi
0x100000f68: addl -24(%rbp), %edi
0x100000f6b: movl %edi, -28(%rbp)
corresponds directly to the six instructions discussed in the article.
There are two major differences between your disassembled code and the article's code.
One is that the article is using the Intel assembler syntax, while your disassembled code is using the traditional Unix/AT&T assembler syntax. Some differences between the two are documented on Wikipedia.
The other difference is that the article omits the function prologue, which sets up the stack frame, and the function epilogue, which destroys the stack frame and returns to the caller. The program he's disassembling has to contain instructions to do those things, but his disassembler isn't showing them. (Actually the stack frame could and probably would be omitted if the optimizer were enabled, but it's clearly not enabled.)
There are also some minor differences: your code is using a slightly different layout for local variables, and your code is computing the sum in a different register.
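A brief line-by-line reading of your disassembly (remember it is AT&T syntax, so the source operand comes first and the destination last):
0x100000f40: pushq %rbp              # save the caller's frame pointer
0x100000f41: movq  %rsp, %rbp        # establish this function's frame
0x100000f44: movl  $0, %eax          # preload main's return value (0)
0x100000f49: movl  %edi, -4(%rbp)    # spill argc to the stack
0x100000f4c: movq  %rsi, -16(%rbp)   # spill argv to the stack
0x100000f50: movl  $1, -20(%rbp)     # x = 1
0x100000f57: movl  $2, -24(%rbp)     # y = 2
0x100000f5e: movl  $0, -28(%rbp)     # z = 0
0x100000f65: movl  -20(%rbp), %edi   # load x into edi
0x100000f68: addl  -24(%rbp), %edi   # edi += y
0x100000f6b: movl  %edi, -28(%rbp)   # store the sum into z
0x100000f6e: popq  %rbp              # restore the caller's frame pointer
0x100000f6f: ret                     # return to the caller (eax still 0)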
On the Mac, g++ doesn't support emitting Intel mnemonics, but clang does:
:; clang -S -mllvm --x86-asm-syntax=intel t.c
:; cat t.s
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## #main
.cfi_startproc
## BB#0:
push RBP
Ltmp2:
.cfi_def_cfa_offset 16
Ltmp3:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp4:
.cfi_def_cfa_register rbp
mov EAX, 0
mov DWORD PTR [RBP - 4], EDI
mov QWORD PTR [RBP - 16], RSI
mov DWORD PTR [RBP - 20], 1
mov DWORD PTR [RBP - 24], 2
mov DWORD PTR [RBP - 28], 0
mov EDI, DWORD PTR [RBP - 20]
add EDI, DWORD PTR [RBP - 24]
mov DWORD PTR [RBP - 28], EDI
pop RBP
ret
.cfi_endproc
.subsections_via_symbols
If you add the -g flag, the compiler will add debug information including source filenames and line numbers. It's too big to put here in its entirety, but this is the relevant part:
.loc 1 4 14 prologue_end ## t.c:4:14
Ltmp5:
mov DWORD PTR [RBP - 20], 1
.loc 1 5 14 ## t.c:5:14
mov DWORD PTR [RBP - 24], 2
.loc 1 6 14 ## t.c:6:14
mov DWORD PTR [RBP - 28], 0
.loc 1 8 5 ## t.c:8:5
mov EDI, DWORD PTR [RBP - 20]
add EDI, DWORD PTR [RBP - 24]
mov DWORD PTR [RBP - 28], EDI
First of all, the assembler listed as "from the original article" is using "Intel" syntax, whereas the "disassembled output" in your post is "AT&T" syntax. This explains the order of arguments to instructions being "back to front" [let's not argue about which is right or wrong, ok?], and register names being prefixed by %, constants by $. There is also a difference in how memory locations/offsets from registers are referenced: dword ptr [reg+offs] in Intel assembler translates to an l suffix on the instruction and offs(%reg) in AT&T.
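For example, the same store written in both syntaxes:
mov dword ptr [rbp-8], 1   ; Intel: destination first, size from 'dword ptr'
movl $1, -8(%rbp)          # AT&T: source first, size from the 'l' suffix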
The 32-bit vs. 64-bit difference renames some of the registers: %rbp is the 64-bit counterpart of ebp in the article's code.
The actual offsets (e.g. -20) are different partly because the registers are bigger in 64-bit, but also because you have argc and argv as part of your function's arguments, which are stored at the start of the function; I have a feeling the original article is actually disassembling a different function than main.
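If you want output closer to the article's listing, one option (a guess at what the article actually disassembled) is to compile a free function with no parameters, so no argc/argv spills appear:
int add_demo() // hypothetical stand-in for the article's function
{
    int x = 1;
    int y = 2;
    int z = 0;
    z = x + y;
    return z; // return it so the computation isn't dead code
}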