(For testing purposes) I have written a simple method to calculate the transpose of an n×n matrix:
#include <cstddef>

void transpose(const size_t _n, double* _A) {
    for (unsigned int i = 0; i < _n; ++i) {
        for (unsigned int j = i + 1; j < _n; ++j) {
            double tmp = _A[i*_n + j];
            _A[i*_n + j] = _A[j*_n + i];
            _A[j*_n + i] = tmp;
        }
    }
}
When using optimization level O3 or Ofast I expected the compiler to unroll some loops, which would lead to higher performance, especially when the matrix size is a multiple of 2 (so that an unrolled loop body can be executed in every iteration) or similar. Instead, what I measured was the exact opposite: powers of 2 actually show a significant spike in execution time.
These spikes also occur at regular intervals of 64, more pronounced at intervals of 128, and so on. Each spike extends to the neighboring matrix sizes, as in the following table:
size n    time (µs)
1020        2649
1021        2815
1022        3100
1023        5428
1024       15791
1025        6778
1026        3106
1027        2847
1028        2660
1029        3038
1030        2613
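(A minimal harness along these lines reproduces this kind of measurement; this is a sketch using std::chrono, not the exact benchmark used here.)

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <vector>

void transpose(const size_t _n, double* _A);   // the function above

int main() {
    for (size_t n = 1020; n <= 1030; ++n) {
        std::vector<double> A(n * n, 1.0);
        transpose(n, A.data());                // warm-up: fault the pages in
        auto t0 = std::chrono::steady_clock::now();
        transpose(n, A.data());
        auto t1 = std::chrono::steady_clock::now();
        long long us = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
        std::printf("%zu %lld\n", n, us);
    }
    return 0;
}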
I compiled with gcc version 4.8.2, but the same thing happens with clang 3.5, so this might be something generic?
So my question basically is: why is there this periodic increase in execution time? Is it some generic thing that comes with any of the optimization options (since it happens with clang and gcc alike)? If so, which optimization option is causing it?
And how can this be so significant that even the O0 version of the program outperforms the O3 version at multiples of 512?
EDIT: Note the magnitude of the spikes in this (logarithmic) plot. Transposing a 1024×1024 matrix with optimization actually takes as much time as transposing a 1300×1300 matrix without optimization. If this is a cache-fault / page-fault problem, then someone needs to explain to me why the memory layout is so significantly different for the optimized version of the program that it fails for powers of two, only to recover high performance for slightly larger matrices. Shouldn't cache faults create more of a step-like pattern? Why do the execution times go down again at all? (And why should optimization create cache faults that weren't there before?)
EDIT: The following is the assembly that gcc produced.
No optimization (O0):
_Z9transposemRPd:
.LFB0:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
mov rbp, rsp
.cfi_def_cfa_register 6
mov QWORD PTR [rbp-24], rdi
mov QWORD PTR [rbp-32], rsi
mov DWORD PTR [rbp-4], 0
jmp .L2
.L5:
mov eax, DWORD PTR [rbp-4]
add eax, 1
mov DWORD PTR [rbp-8], eax
jmp .L3
.L4:
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-4]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-8]
add rax, rcx
sal rax, 3
add rax, rdx
mov rax, QWORD PTR [rax]
mov QWORD PTR [rbp-16], rax
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-4]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-8]
add rax, rcx
sal rax, 3
add rdx, rax
mov rax, QWORD PTR [rbp-32]
mov rcx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
imul rax, QWORD PTR [rbp-24]
mov rsi, rax
mov eax, DWORD PTR [rbp-4]
add rax, rsi
sal rax, 3
add rax, rcx
mov rax, QWORD PTR [rax]
mov QWORD PTR [rdx], rax
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-4]
add rax, rcx
sal rax, 3
add rdx, rax
mov rax, QWORD PTR [rbp-16]
mov QWORD PTR [rdx], rax
add DWORD PTR [rbp-8], 1
.L3:
mov eax, DWORD PTR [rbp-8]
cmp rax, QWORD PTR [rbp-24]
jb .L4
add DWORD PTR [rbp-4], 1
.L2:
mov eax, DWORD PTR [rbp-4]
cmp rax, QWORD PTR [rbp-24]
jb .L5
pop rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size _Z9transposemRPd, .-_Z9transposemRPd
.ident "GCC: (Debian 4.8.2-15) 4.8.2"
.section .note.GNU-stack,"",#progbits
With optimization (O3):
_Z9transposemRPd:
.LFB0:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
xor r11d, r11d
xor ebx, ebx
.L2:
cmp r11, rdi
mov r9, r11
jae .L10
.p2align 4,,10
.p2align 3
.L7:
add ebx, 1
mov r11d, ebx
cmp rdi, r11
mov rax, r11
jbe .L2
mov r10, r9
mov r8, QWORD PTR [rsi]
mov edx, ebx
imul r10, rdi
.p2align 4,,10
.p2align 3
.L6:
lea rcx, [rax+r10]
add edx, 1
imul rax, rdi
lea rcx, [r8+rcx*8]
movsd xmm0, QWORD PTR [rcx]
add rax, r9
lea rax, [r8+rax*8]
movsd xmm1, QWORD PTR [rax]
movsd QWORD PTR [rcx], xmm1
movsd QWORD PTR [rax], xmm0
mov eax, edx
cmp rdi, rax
ja .L6
cmp r11, rdi
mov r9, r11
jb .L7
.L10:
pop rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size _Z9transposemRPd, .-_Z9transposemRPd
.ident "GCC: (Debian 4.8.2-15) 4.8.2"
.section .note.GNU-stack,"",#progbits
The periodic increase of execution time must be due to the cache being only N-way associative instead of fully associative. You are witnessing hash collisions in the cache-line selection algorithm.
The fastest cache, L1, has a smaller number of cache lines than the next level, L2. At each level, each cache line can be filled only from a limited set of sources.
Typical hardware implementations of the cache-line selection algorithm just use a few bits of the memory address to determine which cache slot the data should be written to; in hardware, bit shifts are free.
This causes competition between memory ranges, e.g. between addresses 0x300010 and 0x341010.
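To make this concrete, assume a typical 32 KiB, 8-way L1 data cache with 64-byte lines (these parameters are my assumption, not numbers from the question). That gives 32768 / 64 / 8 = 64 sets, and the set index is simply bits 6-11 of the address: set = (addr >> 6) & 63. Walking down one column of an n×n double matrix advances the address by 8n bytes per element. Whenever 8n is a multiple of 64 × 64 = 4096 bytes, i.e. whenever n is a multiple of 512, every element of a column maps to the same set, and only 8 of them (the associativity) can reside in L1 at once; for n = 1024 the stride is 8192 = 2 × 4096, so the whole column fights over one set. For n = 1020 the stride is 8160 bytes, not a multiple of 4096, so successive elements spread across the sets.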
In a fully sequential algorithm this doesn't matter; N is large enough for practically all algorithms of the form:
for (i=0;i<1000;i++) a[i] += b[i] * c[i] + d[i];
But when the number of inputs (or outputs) gets larger, which happens internally when the algorithm is optimized, having one input in the cache can force another input out of the cache.
// one possible method of optimization with 2 outputs and 6 inputs,
// giving two unrelated execution paths -- should be faster, but maybe it isn't
for (i = 0; i < 500; i++) {
    a[i]       += b[i]       * c[i]       + d[i];
    a[i + 500] += b[i + 500] * c[i + 500] + d[i + 500];
}
A graph in Example 5: Cache Associativity illustrates a 512-byte offset between matrix rows being a global worst-case dimension for that particular system. When this is known, a working mitigation is to over-allocate the matrix horizontally to some other dimension, e.g. char matrix[512][512 + 64].
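As a sketch of that mitigation applied to the transpose above (the pad of 8 doubles, i.e. one assumed 64-byte line, is my choice and should be tuned per CPU):

#include <cstddef>
#include <vector>

// Same swap loops as before, but the row stride is decoupled from n,
// so a column walk no longer lands repeatedly in one cache set.
void transpose_padded(const size_t n, double* A, const size_t stride) {
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = i + 1; j < n; ++j) {
            double tmp = A[i * stride + j];
            A[i * stride + j] = A[j * stride + i];
            A[j * stride + i] = tmp;
        }
    }
}

int main() {
    const size_t n = 1024, pad = 8;            // 8 doubles = 64 bytes = one line
    std::vector<double> A((n + pad) * n);      // over-allocate each row
    transpose_padded(n, A.data(), n + pad);
    return 0;
}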
The improvement in performance is likely related to CPU/RAM caching.
When the data size is not a power of 2, a cache-line load (of, say, 16, 32, or 64 words) transfers more than the data that is required, tying up the bus, uselessly as it turns out. For a data set that is a power of 2, all of the prefetched data is used.
I bet if you were to disable L1 and L2 caching, the performance would be completely smooth and predictable. But it would be much slower. Caching really helps performance!
Comment with code: In the -O3 case, with
#include <cstdlib>
#include <utility>   // for std::swap in C++11

extern void transpose(const size_t n, double* a)
{
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = i + 1; j < n; ++j) {
            std::swap(a[i * n + j], a[j * n + i]); // or your expanded version.
        }
    }
}
compiling with
$ g++ --version
g++ (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1
...
$ g++ -g1 -std=c++11 -Wall -o test.S -S test.cpp -O3
I get
_Z9transposemPd:
.LFB68:
.cfi_startproc
.LBB2:
testq %rdi, %rdi
je .L1
leaq 8(,%rdi,8), %r10
xorl %r8d, %r8d
.LBB3:
addq $1, %r8
leaq -8(%r10), %rcx
cmpq %rdi, %r8
leaq (%rsi,%rcx), %r9
je .L1
.p2align 4,,10
.p2align 3
.L10:
movq %r9, %rdx
movq %r8, %rax
.p2align 4,,10
.p2align 3
.L5:
.LBB4:
movsd (%rdx), %xmm1
movsd (%rsi,%rax,8), %xmm0
movsd %xmm1, (%rsi,%rax,8)
.LBE4:
addq $1, %rax
.LBB5:
movsd %xmm0, (%rdx)
addq %rcx, %rdx
.LBE5:
cmpq %rdi, %rax
jne .L5
addq $1, %r8
addq %r10, %r9
addq %rcx, %rsi
cmpq %rdi, %r8
jne .L10
.L1:
rep ret
.LBE3:
.LBE2:
.cfi_endproc
And something quite different if I add -m32.
(Note: it makes no difference to the assembly whether I use std::swap or your variant)
In order to understand what is causing the spikes, though, you probably want to visualize the memory operations going on.
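A cheap first step in that direction is a sketch like the following (it assumes the 64-byte lines and 64 L1 sets from the earlier answer; real geometry varies by CPU), which histograms the L1 set that each access of one column walk would hit:

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    const std::size_t n = 1024;                 // matrix dimension to test
    unsigned hist[64] = {};                     // one counter per assumed L1 set
    for (std::size_t j = 0; j < n; ++j) {
        // byte offset of A[j*n + 0] from the start of the matrix
        std::uintptr_t addr = j * n * sizeof(double);
        ++hist[(addr >> 6) & 63];               // 64-byte lines, 64 sets
    }
    for (int s = 0; s < 64; ++s)
        if (hist[s]) std::printf("set %2d: %u accesses\n", s, hist[s]);
    return 0;
}

For n = 1024 this prints a single line (set 0: 1024 accesses), while for n = 1020 the accesses spread over all 64 sets.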
To add to the others, here is what g++ -std=c++11 -march=core2 -O3 -c -S produces with gcc version 4.8.2 (MacPorts gcc48 4.8.2_0) on x86_64-apple-darwin13.0.0:
__Z9transposemPd:
LFB0:
testq %rdi, %rdi
je L1
leaq 8(,%rdi,8), %r10
xorl %r8d, %r8d
leaq -8(%r10), %rcx
addq $1, %r8
leaq (%rsi,%rcx), %r9
cmpq %rdi, %r8
je L1
.align 4,0x90
L10:
movq %r9, %rdx
movq %r8, %rax
.align 4,0x90
L5:
movsd (%rdx), %xmm0
movsd (%rsi,%rax,8), %xmm1
movsd %xmm0, (%rsi,%rax,8)
addq $1, %rax
movsd %xmm1, (%rdx)
addq %rcx, %rdx
cmpq %rdi, %rax
jne L5
addq $1, %r8
addq %r10, %r9
addq %rcx, %rsi
cmpq %rdi, %r8
jne L10
L1:
rep; ret
Basically the same as #ksfone's code, for:
#include <cstddef>

void transpose(const size_t _n, double* _A) {
    for (size_t i = 0; i < _n; ++i) {
        for (size_t j = i + 1; j < _n; ++j) {
            double tmp = _A[i*_n + j];
            _A[i*_n + j] = _A[j*_n + i];
            _A[j*_n + i] = tmp;
        }
    }
}
Apart from the Mach-O 'as' differences (extra underscore, align, and DWARF locations), it's the same. But it is very different from the OP's assembly output: a much 'tighter' inner loop.
This question already has an answer here: Dual emission of constructor symbols.
C++ code:
#include <cstdio>
#include <cstdlib>

struct trivialStruct
{
    trivialStruct();
    ~trivialStruct();
    int *a;
    float *b;
    float *c;
};

trivialStruct::trivialStruct()
    : a((int*)malloc(sizeof(int))),
      b((float*)malloc(sizeof(float))),
      c((float*)malloc(sizeof(float)))
{
    *a = 100;
    *b = 200;
    *c = 300;
}

trivialStruct::~trivialStruct()
{
    free(a);
    free(b);
    free(c);
    a = nullptr;
    b = nullptr;
    c = nullptr;
}

int main()
{
    trivialStruct A;
    printf("%d, %f, %f", *A.a, *A.b, *A.c);
    return 0;
}
Assembly:
.section __TEXT,__text,regular,pure_instructions
.globl __ZN13trivialStructC1Ev
.align 4, 0x90
__ZN13trivialStructC1Ev: ## #_ZN13trivialStructC1Ev
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp3:
.cfi_def_cfa_offset 16
Ltmp4:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp5:
.cfi_def_cfa_register rbp
push R15
push R14
push RBX
push RAX
Ltmp6:
.cfi_offset rbx, -40
Ltmp7:
.cfi_offset r14, -32
Ltmp8:
.cfi_offset r15, -24
mov RBX, RDI
mov EDI, 4
call _malloc
mov R14, RAX
mov QWORD PTR [RBX], R14
mov EDI, 4
call _malloc
mov R15, RAX
mov QWORD PTR [RBX + 8], R15
mov EDI, 4
call _malloc
mov QWORD PTR [RBX + 16], RAX
mov DWORD PTR [R14], 100
mov DWORD PTR [R15], 1128792064
mov DWORD PTR [RAX], 1133903872
add RSP, 8
pop RBX
pop R14
pop R15
pop RBP
ret
.cfi_endproc
.globl __ZN13trivialStructC2Ev
.align 4, 0x90
__ZN13trivialStructC2Ev: ## #_ZN13trivialStructC2Ev
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp12:
.cfi_def_cfa_offset 16
Ltmp13:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp14:
.cfi_def_cfa_register rbp
push R15
push R14
push RBX
push RAX
Ltmp15:
.cfi_offset rbx, -40
Ltmp16:
.cfi_offset r14, -32
Ltmp17:
.cfi_offset r15, -24
mov RBX, RDI
mov EDI, 4
call _malloc
mov R14, RAX
mov QWORD PTR [RBX], R14
mov EDI, 4
call _malloc
mov R15, RAX
mov QWORD PTR [RBX + 8], R15
mov EDI, 4
call _malloc
mov QWORD PTR [RBX + 16], RAX
mov DWORD PTR [R14], 100
mov DWORD PTR [R15], 1128792064
mov DWORD PTR [RAX], 1133903872
add RSP, 8
pop RBX
pop R14
pop R15
pop RBP
ret
.cfi_endproc
.globl __ZN13trivialStructD1Ev
.align 4, 0x90
__ZN13trivialStructD1Ev: ## #_ZN13trivialStructD1Ev
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp21:
.cfi_def_cfa_offset 16
Ltmp22:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp23:
.cfi_def_cfa_register rbp
push RBX
push RAX
Ltmp24:
.cfi_offset rbx, -24
mov RBX, RDI
mov RDI, QWORD PTR [RBX]
call _free
mov RDI, QWORD PTR [RBX + 8]
call _free
mov RDI, QWORD PTR [RBX + 16]
call _free
mov QWORD PTR [RBX + 16], 0
mov QWORD PTR [RBX + 8], 0
mov QWORD PTR [RBX], 0
add RSP, 8
pop RBX
pop RBP
ret
.cfi_endproc
.globl __ZN13trivialStructD2Ev
.align 4, 0x90
__ZN13trivialStructD2Ev: ## #_ZN13trivialStructD2Ev
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp28:
.cfi_def_cfa_offset 16
Ltmp29:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp30:
.cfi_def_cfa_register rbp
push RBX
push RAX
Ltmp31:
.cfi_offset rbx, -24
mov RBX, RDI
mov RDI, QWORD PTR [RBX]
call _free
mov RDI, QWORD PTR [RBX + 8]
call _free
mov RDI, QWORD PTR [RBX + 16]
call _free
mov QWORD PTR [RBX + 16], 0
mov QWORD PTR [RBX + 8], 0
mov QWORD PTR [RBX], 0
add RSP, 8
pop RBX
pop RBP
ret
.cfi_endproc
.section __TEXT,__literal8,8byte_literals
.align 3
LCPI4_0:
.quad 4641240890982006784 ## double 200
LCPI4_1:
.quad 4643985272004935680 ## double 300
.section __TEXT,__text,regular,pure_instructions
.globl _main
.align 4, 0x90
_main: ## #main
.cfi_startproc
## BB#0: ## %entry
push RBP
Ltmp34:
.cfi_def_cfa_offset 16
Ltmp35:
.cfi_offset rbp, -16
mov RBP, RSP
Ltmp36:
.cfi_def_cfa_register rbp
lea RDI, QWORD PTR [RIP + L_.str]
movsd XMM0, QWORD PTR [RIP + LCPI4_0]
movsd XMM1, QWORD PTR [RIP + LCPI4_1]
mov ESI, 100
mov AL, 2
call _printf
xor EAX, EAX
pop RBP
ret
.cfi_endproc
.section __TEXT,__cstring,cstring_literals
L_.str: ## #.str
.asciz "%d, %f, %f"
.subsections_via_symbols
Command:
clang++ -S -O2 -std=c++11 -mllvm --x86-asm-syntax=intel -fno-exceptions main.cpp
As you can see, two pairs of functions are identical (the constructor and the destructor):
__ZN13trivialStructC1Ev: ## #_ZN13trivialStructC1Ev
__ZN13trivialStructC2Ev: ## #_ZN13trivialStructC2Ev
__ZN13trivialStructD1Ev: ## #_ZN13trivialStructD1Ev
__ZN13trivialStructD2Ev: ## #_ZN13trivialStructD2Ev
I have no idea why the compiler generates two copies of the code rather than just one. I am not familiar with assembly, but it looks like this just makes the code fatter (and maybe slower).
This is part of the ABI for your platform, and is outside the scope of the standard. Both constructors and destructors can generate multiple symbols in the binary. For example, the Itanium C++ ABI will generate up to three constructors/destructors:
complete object constructor
base object constructor
complete object allocating constructor
deleting destructor
complete object destructor
base object destructor
The different symbols take on slightly different responsibilities as the implementation might need to do different things depending on how the object is being created/destroyed. In your particular case, the code is simple enough that all constructors might generate exactly the same code, but they need to be there to comply with the ABI, and the ABI has them to enable more complex use cases.
For example, a complete object constructor will initialize virtual bases, while the base object constructor will skip this part of construction. If there is multiple/virtual inheritance and virtual functions, the vptr in the complete object may have to jump through different sets of intermediate tables depending on how this subobject is being instantiated.
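A minimal case where the complete object and base object constructors genuinely differ (my illustration, not code from the question) involves a virtual base:

struct V { int v; };

struct D : virtual V {
    // C1 (complete object ctor) constructs the V subobject first;
    // C2 (base object ctor) assumes a more-derived ctor already did that.
    D() : d(0) {}
    int d;
};

// Constructing an E runs E's complete object ctor, which constructs V
// itself and then invokes D's *base* object ctor (C2), not D's C1.
struct E : D {};

int main() {
    D d;   // uses D's C1
    E e;   // uses E's C1, which calls D's C2
    return 0;
}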
If you want an explanation other than "the ABI mandates it", take a look at the documentation for your particular ABI. You can also take a look at Inside the C++ Object Model, which, even if old, contains a good description of the problems to solve and some of the solutions used.
So, I had this code:
constexpr unsigned N = 1000;

void f1(char* sum, char* a, char* b) {
    for(int i = 0; i < N; ++i) {
        sum[i] = a[i] + b[i];
    }
}

void f2(char* sum, char* a, char* b) {
    char* end = sum + N;
    while(sum != end) {
        *sum++ = *a++ + *b++;
    }
}
I wanted to see the code that GCC 4.7.2 would generate, so I ran g++ -march=native -O3 -masm=intel -S a.c++ -std=c++11 and got the following output:
.file "a.c++"
.intel_syntax noprefix
.text
.p2align 4,,15
.globl _Z2f1PcS_S_
.type _Z2f1PcS_S_, #function
_Z2f1PcS_S_:
.LFB0:
.cfi_startproc
lea rcx, [rdx+16]
lea rax, [rdi+16]
cmp rdi, rcx
setae r8b
cmp rdx, rax
setae cl
or cl, r8b
je .L5
lea rcx, [rsi+16]
cmp rdi, rcx
setae cl
cmp rsi, rax
setae al
or cl, al
je .L5
xor eax, eax
.p2align 4,,10
.p2align 3
.L3:
movdqu xmm0, XMMWORD PTR [rdx+rax]
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddb xmm0, xmm1
movdqu XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, 992
jne .L3
mov ax, 8
mov r9d, 992
.L2:
sub eax, 1
lea rcx, [rdx+r9]
add rdi, r9
lea r8, [rax+1]
add rsi, r9
xor eax, eax
.p2align 4,,10
.p2align 3
.L4:
movzx edx, BYTE PTR [rcx+rax]
add dl, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], dl
add rax, 1
cmp rax, r8
jne .L4
rep
ret
.L5:
mov eax, 1000
xor r9d, r9d
jmp .L2
.cfi_endproc
.LFE0:
.size _Z2f1PcS_S_, .-_Z2f1PcS_S_
.p2align 4,,15
.globl _Z2f2PcS_S_
.type _Z2f2PcS_S_, #function
_Z2f2PcS_S_:
.LFB1:
.cfi_startproc
lea rcx, [rdx+16]
lea rax, [rdi+16]
cmp rdi, rcx
setae r8b
cmp rdx, rax
setae cl
or cl, r8b
je .L19
lea rcx, [rsi+16]
cmp rdi, rcx
setae cl
cmp rsi, rax
setae al
or cl, al
je .L19
xor eax, eax
.p2align 4,,10
.p2align 3
.L17:
movdqu xmm0, XMMWORD PTR [rdx+rax]
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddb xmm0, xmm1
movdqu XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, 992
jne .L17
add rdi, 992
add rsi, 992
add rdx, 992
mov r8d, 8
.L16:
xor eax, eax
.p2align 4,,10
.p2align 3
.L18:
movzx ecx, BYTE PTR [rdx+rax]
add cl, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], cl
add rax, 1
cmp rax, r8
jne .L18
rep
ret
.L19:
mov r8d, 1000
jmp .L16
.cfi_endproc
.LFE1:
.size _Z2f2PcS_S_, .-_Z2f2PcS_S_
.ident "GCC: (GNU) 4.7.2"
.section .note.GNU-stack,"",#progbits
I suck at reading assembly, so I decided to add some markers to know where the bodies of the loops went:
constexpr unsigned N = 1000;

void f1(char* sum, char* a, char* b) {
    for(int i = 0; i < N; ++i) {
        asm("# im in ur loop");
        sum[i] = a[i] + b[i];
    }
}

void f2(char* sum, char* a, char* b) {
    char* end = sum + N;
    while(sum != end) {
        asm("# im in ur loop");
        *sum++ = *a++ + *b++;
    }
}
And GCC spat this out:
.file "a.c++"
.intel_syntax noprefix
.text
.p2align 4,,15
.globl _Z2f1PcS_S_
.type _Z2f1PcS_S_, #function
_Z2f1PcS_S_:
.LFB0:
.cfi_startproc
xor eax, eax
.p2align 4,,10
.p2align 3
.L2:
#APP
# 4 "a.c++" 1
# im in ur loop
# 0 "" 2
#NO_APP
movzx ecx, BYTE PTR [rdx+rax]
add cl, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], cl
add rax, 1
cmp rax, 1000
jne .L2
rep
ret
.cfi_endproc
.LFE0:
.size _Z2f1PcS_S_, .-_Z2f1PcS_S_
.p2align 4,,15
.globl _Z2f2PcS_S_
.type _Z2f2PcS_S_, #function
_Z2f2PcS_S_:
.LFB1:
.cfi_startproc
xor eax, eax
.p2align 4,,10
.p2align 3
.L6:
#APP
# 12 "a.c++" 1
# im in ur loop
# 0 "" 2
#NO_APP
movzx ecx, BYTE PTR [rdx+rax]
add cl, BYTE PTR [rsi+rax]
mov BYTE PTR [rdi+rax], cl
add rax, 1
cmp rax, 1000
jne .L6
rep
ret
.cfi_endproc
.LFE1:
.size _Z2f2PcS_S_, .-_Z2f2PcS_S_
.ident "GCC: (GNU) 4.7.2"
.section .note.GNU-stack,"",#progbits
This is considerably shorter, and has some significant differences like the lack of SIMD instructions. I was expecting the same output, with some comments somewhere in the middle of it. Am I making some wrong assumption here? Is GCC's optimizer hindered by asm comments?
The interactions with optimisations are explained about halfway down the "Assembler Instructions with C Expression Operands" page in the documentation.
GCC doesn't try to understand any of the actual assembly inside the asm; the only thing it knows about the content is what you (optionally) tell it in the output and input operand specification and the register clobber list.
In particular, note:
An asm instruction without any output operands will be treated identically to a volatile asm instruction.
and
The volatile keyword indicates that the instruction has important side-effects [...]
So the presence of the asm inside your loop has inhibited a vectorisation optimisation, because GCC assumes it has side effects.
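To illustrate the documented distinction (a sketch built on the manual's rules; whether any particular GCC version then vectorizes the second loop is not guaranteed):

void f(char* sum, char* a, char* b) {
    for (int i = 0; i < 1000; ++i) {
        asm("# marker");        // no output operands: treated as volatile,
                                // assumed to have side effects, pins the body
        sum[i] = a[i] + b[i];
    }
}

void g(char* sum, char* a, char* b) {
    for (int i = 0; i < 1000; ++i) {
        int dummy;
        asm("" : "=r"(dummy));  // has an output and is not volatile: GCC is
        (void)dummy;            // allowed to move it, or delete it outright
        sum[i] = a[i] + b[i];   // since the output is never used
    }
}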
Note that gcc vectorized the code, splitting the loop body into two parts, the first processing 16 items at a time, and the second doing the remainder later.
As Ira commented, the compiler doesn't parse the asm block, so it does not know that it's just a comment. Even if it did, it would have no way of knowing what you intended. The optimized loops have the body doubled: should it put your asm in each copy? Would you be happy if it weren't executed exactly 1000 times? It doesn't know, so it takes the safe route and falls back to the simple single loop.
I don't agree with "gcc doesn't understand what is in the asm() block". For example, gcc can deal quite well with optimising parameters, and can even rearrange asm() blocks so that they intermingle with the generated C code. This is why, if you look at inline assembler in, for example, the Linux kernel, it is nearly always prefixed with __volatile__ to ensure that the compiler "doesn't move the code around". I have had gcc move my "rdtsc" around, which broke my measurements of the time it took to do certain things.
As documented, gcc treats certain types of asm() blocks as "special", and thus doesn't optimise the code either side of the block.
That's not to say that gcc won't sometimes get confused by inline assembler blocks, or simply give up on some particular optimisation because it can't follow the consequences of the assembler code. More importantly, it can often get confused by missing clobber tags: if you have an instruction like cpuid that changes the values of EAX-EDX, but you wrote the code so that it only uses EAX, the compiler may keep things in EBX, ECX and EDX, and then your code acts very strangely when those registers are overwritten. If you are lucky, it crashes immediately; then it's easy to figure out what is going on. If you are unlucky, it crashes way down the line. Another tricky one is the divide instruction, which gives a second result in EDX; if you don't care about the modulo, it's easy to forget that EDX was changed.
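As a concrete illustration of the clobber point (an extended-asm sketch for x86-64 gcc/clang; cpuid leaf 0 returns the highest supported leaf in EAX):

#include <cstdio>

int main() {
    unsigned eax, ebx, ecx, edx;
    // cpuid writes all of EAX-EDX. Declaring only "=a"(eax) would let the
    // compiler keep live values in EBX/ECX/EDX across the instruction, with
    // the strange failures described above. So declare all four as outputs:
    asm volatile("cpuid"
                 : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
                 : "a"(0));
    std::printf("max cpuid leaf: %u\n", eax);
    return 0;
}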
This answer is now modified: it was originally written with a mindset considering inline Basic Asm as a pretty strongly specified tool, but it's nothing like that in GCC. Basic Asm is weak and so the answer was edited.
Each assembly comment acts as a breakpoint.
EDIT: But a broken one, as you are using basic asm. An asm statement inside a function body without an explicit clobber list is a weakly specified feature in GCC, and its behavior is hard to define. It doesn't seem to be attached to anything in particular (I don't fully grasp its guarantees), so while the assembly code must be run at some point if the function is run, it isn't clear when it is run at any non-trivial optimization level. A breakpoint that can be reordered with neighboring instructions isn't a very useful "breakpoint". END EDIT
You could run your program in an interpreter that breaks at each comment and prints out the state of every variable (using debug information). These points must exist so that you observe the environment (state of registers and memory).
Without the comment, no observation point exists, and the loop is compiled as a single mathematical function taking an environment and producing a modified environment.
You want the answer to a meaningless question: you want to know how each instruction (or maybe each block, or maybe each range of instructions) is compiled, but no single isolated instruction (or block) is compiled; the whole thing is compiled as a whole.
A better question would be:
Hello GCC. Why do you believe this asm output is implementing the source code? Please explain step by step, with every assumption.
But then you wouldn't want to read a proof longer than the asm output, written in term of GCC internal representation.
While looking at some questions on optimization, this accepted answer for the question on coding practices for most effective use of the optimizer piqued my curiosity. The assertion is that local variables should be used for computations in a function, not output arguments. It was suggested this would allow the compiler to make additional optimizations otherwise not possible.
So, writing a simple bit of code for the example Foo class and compiling the code fragments with g++ v4.4 and -O2 gave some assembler output (use -S). The parts of the assembler listing showing just the loop portion appear below. On examination of the output, the loop seems nearly identical in both cases, differing in just one address: a pointer to the output argument in the first example, or to the local variable in the second.
There seems to be no change in the actual effect whether the local variable is used or not. So the question breaks down into three parts:
a) is GCC not doing additional optimization, even given the hint suggested;
b) is GCC successfully optimizing in both cases, but should not be;
c) is GCC successfully optimizing in both cases, and is producing compliant output as defined by the C++ standard?
Here is the unoptimized function:
void DoSomething(const Foo& foo1, const Foo* foo2, int numFoo, Foo& barOut)
{
    for (int i = 0; i < numFoo; i++)
    {
        barOut.munge(foo1, foo2[i]);
    }
}
And corresponding assembly:
.L3:
movl (%esi), %eax
addl $1, %ebx
addl $4, %esi
movl %eax, 8(%esp)
movl (%edi), %eax
movl %eax, 4(%esp)
movl 20(%ebp), %eax ; Note address is that of the output argument
movl %eax, (%esp)
call _ZN3Foo5mungeES_S_
cmpl %ebx, 16(%ebp)
jg .L3
Here is the re-written function:
void DoSomethingFaster(const Foo& foo1, const Foo* foo2, int numFoo, Foo& barOut)
{
    Foo barTemp = barOut;
    for (int i = 0; i < numFoo; i++)
    {
        barTemp.munge(foo1, foo2[i]);
    }
    barOut = barTemp;
}
And here is the compiler output for the function using a local variable:
.L3:
movl (%esi), %eax ; Load foo2[i] pointer into EAX
addl $1, %ebx ; increment i
addl $4, %esi ; increment foo2[i] (32-bit system, 8 on 64-bit systems)
movl %eax, 8(%esp) ; PUSH foo2[i] onto stack (careful! from EAX, not ESI)
movl (%edi), %eax ; Load foo1 pointer into EAX
movl %eax, 4(%esp) ; PUSH foo1
leal -28(%ebp), %eax ; Load barTemp pointer into EAX
movl %eax, (%esp) ; PUSH the this pointer for barTemp
call _ZN3Foo5mungeES_S_ ; munge()!
cmpl %ebx, 16(%ebp) ; i < numFoo
jg .L3 ; recall incrementing i by one coming into the loop
; so test if greater
The example given in that answer was not a very good one because of the call to an unknown function the compiler cannot reason much about. Here's a better example:
void FillOneA(int *array, int length, int& startIndex)
{
    for (int i = 0; i < length; i++) array[startIndex + i] = 1;
}

void FillOneB(int *array, int length, int& startIndex)
{
    int localIndex = startIndex;
    for (int i = 0; i < length; i++) array[localIndex + i] = 1;
}
The first version optimizes poorly because it needs to protect against the possibility that somebody called it as
int array[10] = { 0 };
FillOneA(array, 5, array[1]);
resulting in {1, 1, 0, 1, 1, 1, 0, 0, 0, 0 } since the iteration with i=1 modifies the startIndex parameter.
The second one doesn't need to worry about the possibility that the array[localIndex + i] = 1 will modify localIndex because localIndex is a local variable whose address has never been taken.
In assembly (Intel notation, because that's what I use):
FillOneA:
mov edx, [esp+8]
xor eax, eax
test edx, edx
jle $b
push esi
mov esi, [esp+16]
push edi
mov edi, [esp+12]
$a: mov ecx, [esi]
add ecx, eax
inc eax
mov [edi+ecx*4], 1
cmp eax, edx
jl $a
pop edi
pop esi
$b: ret
FillOneB:
mov ecx, [esp+8]
mov eax, [esp+12]
mov edx, [eax]
test ecx, ecx
jle $a
mov eax, [esp+4]
push edi
lea edi, [eax+edx*4]
mov eax, 1
rep stosd
pop edi
$a: ret
ADDED: Here's an example where the compiler's insight is into Bar, and not munge:
class Bar
{
public:
    float getValue() const
    {
        return valueBase * boost;
    }
private:
    float valueBase;
    float boost;
};

class Foo
{
public:
    void munge(float adjustment);
};

void Adjust10A(Foo& foo, const Bar& bar)
{
    for (int i = 0; i < 10; i++)
        foo.munge(bar.getValue());
}

void Adjust10B(Foo& foo, const Bar& bar)
{
    Bar localBar = bar;
    for (int i = 0; i < 10; i++)
        foo.munge(localBar.getValue());
}
The resulting code is
Adjust10A:
push ecx
push ebx
mov ebx, [esp+12] ;; foo
push esi
mov esi, [esp+20] ;; bar
push edi
mov edi, 10
$a: fld [esi+4] ;; bar.valueBase
push ecx
fmul [esi] ;; valueBase * boost
mov ecx, ebx
fstp [esp+16]
fld [esp+16]
fstp [esp]
call Foo::munge
dec edi
jne $a
pop edi
pop esi
pop ebx
pop ecx
ret 0
Adjust10B:
sub esp, 8
mov ecx, [esp+16] ;; bar
mov eax, [ecx] ;; bar.valueBase
mov [esp], eax ;; localBar.valueBase
fld [esp] ;; localBar.valueBase
mov eax, [ecx+4] ;; bar.boost
mov [esp+4], eax ;; localBar.boost
fmul [esp+4] ;; localBar.getValue()
push esi
push edi
mov edi, [esp+20] ;; foo
fstp [esp+24]
fld [esp+24] ;; cache localBar.getValue()
mov esi, 10 ;; loop counter
$a: push ecx
mov ecx, edi ;; foo
fstp [esp] ;; use cached value
call Foo::munge
fld [esp]
dec esi
jne $a ;; loop
pop edi
fstp ST(0)
pop esi
add esp, 8
ret 0
Observe that the inner loop in Adjust10A must recalculate the value since it must protect against the possibility that foo.munge changed bar.
That said, this style of optimization is not a slam dunk. (For example, we could have gotten the same effect by manually caching bar.getValue() into a localValue.) It tends to be most helpful for vectorized operations, since those can be parallelized.
First, I'm going to assume munge() cannot be inlined - that is, its definition is not in the same translation unit; you haven't provided complete source, so I can't be entirely sure, but it would explain these results.
Since foo1 is passed to munge as a reference, at the implementation level the compiler just passes a pointer. If we just forward our argument, this is nice and fast; any aliasing issues are munge()'s problem, and they have to be, since munge() can't assume anything about its arguments and we can't assume anything about what munge() might do with them (since munge()'s definition is not available).
However, if we copy to a local variable, we must perform that copy and pass a pointer to the local variable. This is because munge() could observe a difference in behavior: if it takes the pointer to its first argument, it can see that it's not equal to &foo1. Since munge()'s implementation is not in scope, the compiler can't assume it won't do this.
This local-variable-copy trick here thus ends up pessimizing, rather than optimizing - the optimizations it tries to help are not possible, because munge() cannot be inlined; for the same reason, the local variable actively hurts performance.
It would be instructive to try this again, making sure munge() is non-virtual and available as an inlinable function.
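Something along these lines (hypothetical definitions, since the complete source for Foo::munge was never shown) would give the compiler that visibility:

struct Foo {
    int x = 0;
    // Definition visible in the translation unit and non-virtual,
    // so the compiler can inline it and see exactly what it touches.
    void munge(Foo a, Foo b) { x += a.x + b.x; }
};

void DoSomething(const Foo& foo1, const Foo* foo2, int numFoo, Foo& barOut)
{
    for (int i = 0; i < numFoo; i++)
        barOut.munge(foo1, foo2[i]);   // now inlinable; compare the -S output
}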
Dump of assembler code for function main:
0x0804833e <+0>: push %ebp
0x0804833f <+1>: mov %esp,%ebp
0x08048341 <+3>: sub $0x8,%esp
0x08048344 <+6>: and $0xfffffff0,%esp
0x08048347 <+9>: mov $0x0,%eax
0x0804834c <+14>: add $0xf,%eax
0x0804834f <+17>: add $0xf,%eax
0x08048352 <+20>: shr $0x4,%eax
0x08048355 <+23>: shl $0x4,%eax
0x08048358 <+26>: sub %eax,%esp
=> 0x0804835a <+28>: movl $0x10,-0x4(%ebp)
0x08048361 <+35>: movl $0x0,-0x8(%ebp)
0x08048368 <+42>: pushl -0x4(%ebp)
0x0804836b <+45>: call 0x8048334 <myfunc1 at test.c:4>
0x08048370 <+50>: add $0x4,%esp
0x08048373 <+53>: pushl -0x8(%ebp)
0x08048376 <+56>: call 0x8048339 <myfunc2 at test.c:8>
0x0804837b <+61>: add $0x4,%esp
0x0804837e <+64>: mov $0x0,%eax
0x08048383 <+69>: leave
0x08048384 <+70>: ret
End of assembler dump.
(gdb) info line
Line 16 of "test.c" starts at address 0x804835a <main+28 at test.c:16> and ends at 0x8048361 <main+35 at test.c:17>.------------------------------------(1)
(gdb) shell cat test.c
#include <stdio.h>

void myfunc1(int recv_arg1)
{
    /* does nothing */
}

void myfunc2(int recv_arg1)
{
    /* does nothing */
}

int main(int argc, char **argv)
{
    int var1;
    int var2;
    var1 = 16;
    var2 = 0;
    myfunc1(var1);
    myfunc2(var2);
    return 0;
}
Note in (1) that the asm code for line 16 only starts at <+28>, so the asm code before this range is for something else? What? Surely something mysterious!!
Allow me to comment this for you.
0x0804833e <+0>: push %ebp ; Establish standard
0x0804833f <+1>: mov %esp,%ebp ; stack frame record
0x08048341 <+3>: sub $0x8,%esp ; Make room for locals
0x08048344 <+6>: and $0xfffffff0,%esp ; Align esp to 16-byte memory
0x08048347 <+9>: mov $0x0,%eax ; eax=0
0x0804834c <+14>: add $0xf,%eax ; eax=f
0x0804834f <+17>: add $0xf,%eax ; eax= (eax + 0xf)
0x08048352 <+20>: shr $0x4,%eax ; ( >> 4)
0x08048355 <+23>: shl $0x4,%eax ; ( << 4)
;The above math rounds up eax as set by 0x0804834c to the next 16-byte boundary
;In this case, eax will be 0x10, rounded up from 0x0f. You compiled without
;optimizations? This could be a "probe" checking whether the upcoming call
;will fail?
0x08048358 <+26>: sub %eax,%esp ; Make room for "0x10 more mystery bytes"
0x0804835a <+28>: movl $0x10,-0x4(%ebp) ; var1 = 16
0x08048361 <+35>: movl $0x0,-0x8(%ebp) ; var2 = 0
0x08048368 <+42>: pushl -0x4(%ebp) ; push var1
0x0804836b <+45>: call 0x8048334 <myfunc1 at test.c:4> ;myfunc1( );
0x08048370 <+50>: add $0x4,%esp ; pop (var1)
0x08048373 <+53>: pushl -0x8(%ebp) ; push var2
0x08048376 <+56>: call 0x8048339 <myfunc2 at test.c:8> ;myfunc2( );
0x0804837b <+61>: add $0x4,%esp ; pop (var2)
0x0804837e <+64>: mov $0x0,%eax ; return 0;
0x08048383 <+69>: leave ; undo standard stack frame
0x08048384 <+70>: ret ; actual return
I think it is a good question why 0x08048358 is executed at all, since it allocates seemingly useless space. I suspect this is a check for an esp-out-of-range exception before performing the call. If you specify the processor you are using, I wonder if this will "go away"; it smells like it might be a workaround for a specific chip's errata.
The code from 0x0804833e <+0> upto (and including) 0x08048358 <+26> is setting up what is known as a stack frame.
The first four statements are very standard. First you save the old base pointer (which is called the frame pointer in the Wikipedia article). You then set up a new base pointer by using the current value of the stack pointer.
Next, you decrement the stack pointer to make room for your local variables (notice you subtract 0x8, which is enough for your two ints). Finally, it makes sure the stack pointer is aligned to a 16-byte boundary.
The next group of lines (from 0x08048347 <+9> to 0x08048358 <+26>) are a bit odd. The effect is to grow the stack more, but I'm at a loss to explain why it used 5 instructions to compute the value (since there is no variable, it should be able to do that at compile time) or why it needs to grow the stack more.
This is a guess... I'm not quite sure I understand the question correctly.
<+3> to <+26> look a bit frivolous. Perhaps it's to make the variable declarations explicit in code to ease debugging? I bet nearly all that code would disappear if optimizations were enabled.
Edit:
Now that I've learned to horizontally scroll, I see this does appear to be what you're referring to. That message is saying line 16 (the first assignment) starts at main+28.
All the code before that is setting up the stack to hold the local variables.
Functions often need a prologue and an epilogue (this depends on the calling conventions, a bit on the processor too, ...). The prologue sets up everything needed for local variables, for the arguments passed in, and eventually other stuff. The epilogue undoes what the prologue has done.
The exact produced code depends on the compiler and its version. E.g. doing gcc -S with your C code, I obtain different output, and of course if I add -On options, the output changes too.
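For example (commands only; the exact output varies with compiler version and target):

$ gcc -S -O0 test.c -o test-O0.s   # full prologue, including the alignment dance
$ gcc -S -O2 test.c -o test-O2.s   # typically a much shorter prologue
$ diff test-O0.s test-O2.s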
0x0804833e <+0>: push %ebp
save current ebp register
0x0804833f <+1>: mov %esp,%ebp
copy esp register to ebp (aka base pointer or frame pointer)
0x08048341 <+3>: sub $0x8,%esp
make rooms on the stack (for 2 32bit integers)
0x08048344 <+6>: and $0xfffffff0,%esp
align stack to multiple of 16
0x08048347 <+9>: mov $0x0,%eax
eax = 0
0x0804834c <+14>: add $0xf,%eax
eax += 15
0x0804834f <+17>: add $0xf,%eax
eax += 15 (eax == 30)
0x08048352 <+20>: shr $0x4,%eax
0x08048355 <+23>: shl $0x4,%eax
total effect: zeros less significant nibble of eax;
30 = b:11110 -> eax = b:10000
0x08048358 <+26>: sub %eax,%esp
more 16 bytes room on the esp
esp -> dword room made by last esp-eax
dword
dword
dword
... maybe stuffs because of alignment
dword first two dword created by esp-8 (var2)
dword (var1)
ebp -> dword original ebp ptr
...
=> 0x0804835a <+28>: movl $0x10,-0x4(%ebp)
put 16 in -4(ebp), so we realize that it is var1
0x08048361 <+35>: movl $0x0,-0x8(%ebp)
put 0 in -8(ebp) so we realize it is var2
0x08048368 <+42>: pushl -0x4(%ebp)
0x0804836b <+45>: call 0x8048334 <myfunc1 at test.c:4>
pass var1 to myfunc1 (args are passed on stack, by convention)
0x08048370 <+50>: add $0x4,%esp
and cleaning the stack is up to the caller
0x08048373 <+53>: pushl -0x8(%ebp)
0x08048376 <+56>: call 0x8048339 <myfunc2 at test.c:8>
0x0804837b <+61>: add $0x4,%esp
pass var2 to myfunc2 and "clears" the stack
0x0804837e <+64>: mov $0x0,%eax
return value (0)
0x08048383 <+69>: leave
is the same as doing esp = ebp; pop ebp, i.e. take the stack
back at the initial point after the first push, and then retrieve
back original ebp value
0x08048384 <+70>: ret
return to the caller (return 0 <- eax)
This code is suboptimal. It does unneeded things, and it is not what I get with gcc v4.3.2 without optimizations. In particular, the two immediate adds can become a single add (even at the most basic stage of default optimization), and the shr-shl pair can become a single and. Effectively, this code looks stranger to me than "normal" compiler output does.