Updated: 19 Aug. 2017, 16:49 UTC
I’m writing an AVX code to multiply a vector with 4 billion components by a constant, however, I see no difference between my small -- I hope -- optimized AVX code and the long scalar compiler optimized version.
Both versions run between 410 ms - 400 ms.
Can someone tell me why it is occurring?
And why the large assembly generated by the compiler code takes almost the same time even it's larger ?
It's an important question, because if small computations -- like this multiplication -- have no improvement then it has no sense to use made the manual code in an Intel Core CPU. Perhaps in an Intel Xeon ( with 16 components ) or for more complex computations.
I'm compiling with G++ with parameters:
g++ -O3 -mtune=native -march=native -mavx -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"src/Test AVX.d" -MT"src/Test\ AVX.d" -o "src/Test AVX.o" "../src/Test AVX.cpp"
My CPU is a Intel(R) Core(TM) i5-5200U CPU # 2.20GHz.
There is the AVX code:
/**
* Run AVX Code
*/
void AVX() {
// Loop control
uint_fast32_t loop = 0;
// The constant
__m256 _const = _mm256_set1_ps(5.0f);
// The register for multiplication
__m256 _ymm0 = _mm256_setzero_ps();
// A "buffer" between the vector and the YMM0 register
float f_data[8];
// The main loop
for ( loop = 0 ; loop < SIZE ; loop = loop + 8 ) {
// Load to buffer
f_data[0] = vector[loop];
f_data[1] = vector[loop+1];
f_data[2] = vector[loop+2];
f_data[3] = vector[loop+3];
f_data[4] = vector[loop+4];
f_data[5] = vector[loop+5];
f_data[6] = vector[loop+6];
f_data[7] = vector[loop+7];
/*
* I tried to use pointers insted to copy
* the data, but the software crash
*
* float **f_data;
* f_data = float*[8];
*
* f_data[0] = &vector[loop];
* ...
*
*/
// Load to XMM and YMM Registers
_ymm0 = _mm256_load_ps(f_data);
// Do the multiplication
_ymm0 = _mm256_mul_ps(_ymm0,_const);
// Copy the results from the register to the "buffer"
_mm256_store_ps(f_data,_ymm0);
// Copy from the "buffer" to the vector
vector[loop] = f_data[0];
vector[loop+1] = f_data[1];
vector[loop+2] = f_data[2];
vector[loop+3] = f_data[3];
vector[loop+4] = f_data[4];
vector[loop+5] = f_data[5];
vector[loop+6] = f_data[6];
vector[loop+7] = f_data[7];
}
}
The AVX assembled:
0000000000400de0 <_Z3AVXv>:
400de0: 48 8b 05 b1 13 20 00 mov rax,QWORD PTR [rip+0x2013b1] # 602198 <vector>
400de7: c5 fc 28 0d 71 06 00 vmovaps ymm1,YMMWORD PTR [rip+0x671] # 401460 <_IO_stdin_used+0x40>
400dee: 00
400def: 48 8d 90 00 00 00 40 lea rdx,[rax+0x40000000]
400df6: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
400dfd: 00 00 00
400e00: c5 f4 59 00 vmulps ymm0,ymm1,YMMWORD PTR [rax]
400e04: 48 83 c0 20 add rax,0x20
400e08: c5 fc 11 40 e0 vmovups YMMWORD PTR [rax-0x20],ymm0
400e0d: 48 39 c2 cmp rdx,rax
400e10: 75 ee jne 400e00 <_Z3AVXv+0x20>
400e12: c5 f8 77 vzeroupper
400e15: c3 ret
400e16: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
400e1d: 00 00 00
The Serial Version:
/**
* Run Compiler optimized version
*/
void Serial() {
uint_fast32_t loop;
// Do the multiplication
for ( loop = 0 ; loop < SIZE ; loop ++)
vector[loop] *= 5;
}
The serial assembled:
It's more large, move the data more times and take almost the same time. How it's possible ?
0000000000400e80 <_Z6Serialv>:
400e80: 48 8b 35 11 13 20 00 mov rsi,QWORD PTR [rip+0x201311] # 602198 <vector>
400e87: 48 89 f0 mov rax,rsi
400e8a: 48 c1 e8 02 shr rax,0x2
400e8e: 48 f7 d8 neg rax
400e91: 83 e0 07 and eax,0x7
400e94: 0f 84 96 01 00 00 je 401030 <_Z6Serialv+0x1b0>
400e9a: c5 fa 10 05 7a 04 00 vmovss xmm0,DWORD PTR [rip+0x47a] # 40131c <_IO_stdin_used+0x1c>
400ea1: 00
400ea2: c5 fa 59 0e vmulss xmm1,xmm0,DWORD PTR [rsi]
400ea6: c5 fa 11 0e vmovss DWORD PTR [rsi],xmm1
400eaa: 48 83 f8 01 cmp rax,0x1
400eae: 0f 84 8c 01 00 00 je 401040 <_Z6Serialv+0x1c0>
400eb4: c5 fa 59 4e 04 vmulss xmm1,xmm0,DWORD PTR [rsi+0x4]
400eb9: c5 fa 11 4e 04 vmovss DWORD PTR [rsi+0x4],xmm1
400ebe: 48 83 f8 02 cmp rax,0x2
400ec2: 0f 84 89 01 00 00 je 401051 <_Z6Serialv+0x1d1>
400ec8: c5 fa 59 4e 08 vmulss xmm1,xmm0,DWORD PTR [rsi+0x8]
400ecd: c5 fa 11 4e 08 vmovss DWORD PTR [rsi+0x8],xmm1
400ed2: 48 83 f8 03 cmp rax,0x3
400ed6: 0f 84 86 01 00 00 je 401062 <_Z6Serialv+0x1e2>
400edc: c5 fa 59 4e 0c vmulss xmm1,xmm0,DWORD PTR [rsi+0xc]
400ee1: c5 fa 11 4e 0c vmovss DWORD PTR [rsi+0xc],xmm1
400ee6: 48 83 f8 04 cmp rax,0x4
400eea: 0f 84 2d 01 00 00 je 40101d <_Z6Serialv+0x19d>
400ef0: c5 fa 59 4e 10 vmulss xmm1,xmm0,DWORD PTR [rsi+0x10]
400ef5: c5 fa 11 4e 10 vmovss DWORD PTR [rsi+0x10],xmm1
400efa: 48 83 f8 05 cmp rax,0x5
400efe: 0f 84 6f 01 00 00 je 401073 <_Z6Serialv+0x1f3>
400f04: c5 fa 59 4e 14 vmulss xmm1,xmm0,DWORD PTR [rsi+0x14]
400f09: c5 fa 11 4e 14 vmovss DWORD PTR [rsi+0x14],xmm1
400f0e: 48 83 f8 06 cmp rax,0x6
400f12: 0f 84 6c 01 00 00 je 401084 <_Z6Serialv+0x204>
400f18: c5 fa 59 46 18 vmulss xmm0,xmm0,DWORD PTR [rsi+0x18]
400f1d: 41 b9 f9 ff ff 0f mov r9d,0xffffff9
400f23: 41 ba 07 00 00 00 mov r10d,0x7
400f29: c5 fa 11 46 18 vmovss DWORD PTR [rsi+0x18],xmm0
400f2e: 41 b8 00 00 00 10 mov r8d,0x10000000
400f34: c5 fc 28 0d 04 04 00 vmovaps ymm1,YMMWORD PTR [rip+0x404] # 401340 <_IO_stdin_used+0x40>
400f3b: 00
400f3c: 48 8d 0c 86 lea rcx,[rsi+rax*4]
400f40: 31 d2 xor edx,edx
400f42: 49 29 c0 sub r8,rax
400f45: 31 c0 xor eax,eax
400f47: 4c 89 c7 mov rdi,r8
400f4a: 48 c1 ef 03 shr rdi,0x3
400f4e: 66 90 xchg ax,ax
400f50: c5 f4 59 04 01 vmulps ymm0,ymm1,YMMWORD PTR [rcx+rax*1]
400f55: 48 83 c2 01 add rdx,0x1
400f59: c5 fc 29 04 01 vmovaps YMMWORD PTR [rcx+rax*1],ymm0
400f5e: 48 83 c0 20 add rax,0x20
400f62: 48 39 d7 cmp rdi,rdx
400f65: 77 e9 ja 400f50 <_Z6Serialv+0xd0>
400f67: 4c 89 c1 mov rcx,r8
400f6a: 4c 89 ca mov rdx,r9
400f6d: 48 83 e1 f8 and rcx,0xfffffffffffffff8
400f71: 49 8d 04 0a lea rax,[r10+rcx*1]
400f75: 48 29 ca sub rdx,rcx
400f78: 49 39 c8 cmp r8,rcx
400f7b: 0f 84 98 00 00 00 je 401019 <_Z6Serialv+0x199>
400f81: 48 8d 0c 86 lea rcx,[rsi+rax*4]
400f85: c5 fa 10 05 8f 03 00 vmovss xmm0,DWORD PTR [rip+0x38f] # 40131c <_IO_stdin_used+0x1c>
400f8c: 00
400f8d: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
400f91: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
400f95: 48 8d 48 01 lea rcx,[rax+0x1]
400f99: 48 83 fa 01 cmp rdx,0x1
400f9d: 74 7a je 401019 <_Z6Serialv+0x199>
400f9f: 48 8d 0c 8e lea rcx,[rsi+rcx*4]
400fa3: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
400fa7: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
400fab: 48 8d 48 02 lea rcx,[rax+0x2]
400faf: 48 83 fa 02 cmp rdx,0x2
400fb3: 74 64 je 401019 <_Z6Serialv+0x199>
400fb5: 48 8d 0c 8e lea rcx,[rsi+rcx*4]
400fb9: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
400fbd: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
400fc1: 48 8d 48 03 lea rcx,[rax+0x3]
400fc5: 48 83 fa 03 cmp rdx,0x3
400fc9: 74 4e je 401019 <_Z6Serialv+0x199>
400fcb: 48 8d 0c 8e lea rcx,[rsi+rcx*4]
400fcf: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
400fd3: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
400fd7: 48 8d 48 04 lea rcx,[rax+0x4]
400fdb: 48 83 fa 04 cmp rdx,0x4
400fdf: 74 38 je 401019 <_Z6Serialv+0x199>
400fe1: 48 8d 0c 8e lea rcx,[rsi+rcx*4]
400fe5: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
400fe9: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
400fed: 48 8d 48 05 lea rcx,[rax+0x5]
400ff1: 48 83 fa 05 cmp rdx,0x5
400ff5: 74 22 je 401019 <_Z6Serialv+0x199>
400ff7: 48 8d 0c 8e lea rcx,[rsi+rcx*4]
400ffb: 48 83 c0 06 add rax,0x6
400fff: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
401003: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
401007: 48 83 fa 06 cmp rdx,0x6
40100b: 74 0c je 401019 <_Z6Serialv+0x199>
40100d: 48 8d 04 86 lea rax,[rsi+rax*4]
401011: c5 fa 59 00 vmulss xmm0,xmm0,DWORD PTR [rax]
401015: c5 fa 11 00 vmovss DWORD PTR [rax],xmm0
401019: c5 f8 77 vzeroupper
40101c: c3 ret
40101d: 41 ba 04 00 00 00 mov r10d,0x4
401023: 41 b9 fc ff ff 0f mov r9d,0xffffffc
401029: e9 00 ff ff ff jmp 400f2e <_Z6Serialv+0xae>
40102e: 66 90 xchg ax,ax
401030: 41 b9 00 00 00 10 mov r9d,0x10000000
401036: 45 31 d2 xor r10d,r10d
401039: e9 f0 fe ff ff jmp 400f2e <_Z6Serialv+0xae>
40103e: 66 90 xchg ax,ax
401040: 41 b9 ff ff ff 0f mov r9d,0xfffffff
401046: 41 ba 01 00 00 00 mov r10d,0x1
40104c: e9 dd fe ff ff jmp 400f2e <_Z6Serialv+0xae>
401051: 41 ba 02 00 00 00 mov r10d,0x2
401057: 41 b9 fe ff ff 0f mov r9d,0xffffffe
40105d: e9 cc fe ff ff jmp 400f2e <_Z6Serialv+0xae>
401062: 41 ba 03 00 00 00 mov r10d,0x3
401068: 41 b9 fd ff ff 0f mov r9d,0xffffffd
40106e: e9 bb fe ff ff jmp 400f2e <_Z6Serialv+0xae>
401073: 41 ba 05 00 00 00 mov r10d,0x5
401079: 41 b9 fb ff ff 0f mov r9d,0xffffffb
40107f: e9 aa fe ff ff jmp 400f2e <_Z6Serialv+0xae>
401084: 41 ba 06 00 00 00 mov r10d,0x6
40108a: 41 b9 fa ff ff 0f mov r9d,0xffffffa
401090: e9 99 fe ff ff jmp 400f2e <_Z6Serialv+0xae>
401095: 90 nop
401096: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
40109d: 00 00 00
The full code:
#include <iostream>
#include <xmmintrin.h>
#include <immintrin.h>
using namespace std;
/**
* The vector size
* 268435456 -> 32*8388608 -> 2^32
*/
#define SIZE 268435456
/**
* The vector for computations
*/
float *vector;
/**
* Run AVX Code
*/
void AVX() { ... }
/**
* Run Compiler optimized version
*/
void Serial() { ... }
/**
* Create the vector
*/
void create() {
vector = new float[SIZE];
}
/**
* Fill the vector with data
* to be used for validation
*/
void fill() {
uint_fast32_t loop = 0;
// Fill the vector
for ( loop = 0 ; loop < SIZE ; loop++ )
vector[loop] = 1;
}
/**
* A validation to ensure the compiler have
* computed all the vector data
*/
void validation() {
// The loop variable
unsigned long loop = 0;
unsigned long errors = 0;
unsigned long checks = 0;
for ( loop = 0 ; loop < SIZE ; loop ++ ) {
// All the vector must be 5
if ( vector[loop] != 5 ) {
errors ++;
// To avoid to show too many errors
if ( errors < 12 )
std::cout << loop << ": " << vector[loop] << std::endl;
}
checks ++;
}
// The result
std::cout << "Errors: " << errors << "\nChecks: " << checks << std::endl;
}
int main() {
// Create the vector
create();
// Fill with data
//fill();
// The tests
//Serial();
AVX();
/*
* To ensure that the g++ optimization have executed the loop
*/
//validation();
}
Compiled with:
g++ -O3 -mtune=native -march=native -mavx -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"src/Test AVX.d" -MT"src/Test\ AVX.d" -o "src/Test AVX.o" "../src/Test AVX.cpp"
Multiplying by 5 is so trivial that you should do that on the fly next time you read the array, or fold it into the code that wrote this array. Loading all that data from RAM into the CPU and storing it back again just to multiply by 5.0 is not efficient.
If you can't just fold it into a different pass of your algorithm, try cache-blocking aka loop-tiling to run multiple steps of your algorithm over a part of this array that fits into cache, before moving on to the next cache-sized block.
Your scalar code auto-vectorizes to nearly the same inner loop as your manually-vectorized version. Neither one is unrolled at all.
The extra code size in gcc's version is just scalar startup / cleanup so its inner loop can use aligned loads/stores. gcc fully unrolls those loops.
Also note that your manually-vectorized code doesn't handle the case where SIZE is not a multiple of 8. (gcc does handle the cleanup at the end even then, because it doesn't know where the alignment boundary will be.)
clang usually just uses unaligned loads/stores on arrays that it can't prove at compile time are always aligned. gcc's default behaviour is maybe good for large arrays that actually are misaligned at run-time, but a total waste of I-cache and branches for cases where the data is in fact aligned at run time most of the time, or for small arrays where doing a bunch of branching and scalar iterations isn't worth it.
The inner loops are nearly the same. In your manually vectorized version, gcc managed to optimize away the element-by-element copy through f_data and emit what you would get from _mm256_loadu_ps(&vector[loop]), instead of actually copying to a local and then doing a vector load. And same for storing back into vector[], luckily for you.
# top of inner loop in the manually-vectorized version:
400e00: c5 f4 59 00 vmulps ymm0,ymm1,YMMWORD PTR [rax]
400e04: 48 83 c0 20 add rax,0x20
400e08: c5 fc 11 40 e0 vmovups YMMWORD PTR [rax-0x20],ymm0
400e0d: 48 39 c2 cmp rdx,rax
400e10: 75 ee jne 400e00 <_Z3AVXv+0x20>
gcc's inner loop uses a loop counter separate from the pointer, so it has an extra instruction, and it uses an indexed addressing mode. vmulps ymm0,ymm1,YMMWORD PTR [rcx+rax*1] can't stay micro-fused on Haswell, so it will issue as 2 fused-domain uops.
# top of gcc's inner loop:
400f50: c5 f4 59 04 01 vmulps ymm0,ymm1,YMMWORD PTR [rcx+rax*1]
400f55: 48 83 c2 01 add rdx,0x1
400f59: c5 fc 29 04 01 vmovaps YMMWORD PTR [rcx+rax*1],ymm0
400f5e: 48 83 c0 20 add rax,0x20
400f62: 48 39 d7 cmp rdi,rdx
400f65: 77 e9 ja 400f50 <_Z6Serialv+0xd0>
The extra add instruction is another extra uop. This is 6 fused-domain uops (and thus can run at best one iteration per 1.5 cycles, bottlenecked on the front-end).
Your manual version is only 4 fused-domain uops, so it can issue at 1 per clock. It can in theory run that fast if the buffer is hot in L1D cache (or maybe L2), also limited by 1 store per clock.
Of course, since you're running it over a giant buffer, you just bottleneck on memory bandwidth. The minor front-end bottleneck in the auto-vectorized version is a total non-issue. Even an SSE2 version would barely run slower.
You said something about a Xeon with 16 cores. If you want gcc to auto-parallelize as well as SIMD vectorize, you could use OpenMP. As it is, your code is purely single-threaded.
Related
Consider the following MWE:
#include <stdexcept>
#include <iostream>
int main() {
int a = 2;
try {
throw std::runtime_error("Example");
a = 4;
} catch (const std::exception &) {
a = 5;
}
std::cout << a << std::endl;
}
If I compile this with g++ -g -O0 test.cpp and then run the resulting binary through gdb via gdb ./a.out I can start stepping through the program. However, as soon as I reach the line that throws the exception inputting next or step does not lead me to the line a = 5 but instead immediately prints 5 to the console and then gdb tells me that the process has exited normally.
This is quite annoying as oftentimes it would actually be useful to figure out where the thrown exception is handled.
Is there a way to get gdb to follow the full program flow even when exceptions are involved?
My GDB version is GNU gdb (Ubuntu 12.1-0ubuntu1~22.04) 12.1
EDIT: To ensure that the thrown exception has not been optimized out of my binary, I checked the respective assembly via objdump and if interpret that corretly, throwing the exception is still part of the binary:
objdump -drwC -Mintel a.out
a.out: file format elf64-x86-64
Disassembly of section .init:
0000000000001000 <_init>:
1000: f3 0f 1e fa endbr64
1004: 48 83 ec 08 sub rsp,0x8
1008: 48 8b 05 d1 2f 00 00 mov rax,QWORD PTR [rip+0x2fd1] # 3fe0 <__gmon_start__#Base>
100f: 48 85 c0 test rax,rax
1012: 74 02 je 1016 <_init+0x16>
1014: ff d0 call rax
1016: 48 83 c4 08 add rsp,0x8
101a: c3 ret
Disassembly of section .plt:
0000000000001020 <.plt>:
1020: ff 35 2a 2f 00 00 push QWORD PTR [rip+0x2f2a] # 3f50 <_GLOBAL_OFFSET_TABLE_+0x8>
1026: f2 ff 25 2b 2f 00 00 bnd jmp QWORD PTR [rip+0x2f2b] # 3f58 <_GLOBAL_OFFSET_TABLE_+0x10>
102d: 0f 1f 00 nop DWORD PTR [rax]
1030: f3 0f 1e fa endbr64
1034: 68 00 00 00 00 push 0x0
1039: f2 e9 e1 ff ff ff bnd jmp 1020 <_init+0x20>
103f: 90 nop
1040: f3 0f 1e fa endbr64
1044: 68 01 00 00 00 push 0x1
1049: f2 e9 d1 ff ff ff bnd jmp 1020 <_init+0x20>
104f: 90 nop
1050: f3 0f 1e fa endbr64
1054: 68 02 00 00 00 push 0x2
1059: f2 e9 c1 ff ff ff bnd jmp 1020 <_init+0x20>
105f: 90 nop
1060: f3 0f 1e fa endbr64
1064: 68 03 00 00 00 push 0x3
1069: f2 e9 b1 ff ff ff bnd jmp 1020 <_init+0x20>
106f: 90 nop
1070: f3 0f 1e fa endbr64
1074: 68 04 00 00 00 push 0x4
1079: f2 e9 a1 ff ff ff bnd jmp 1020 <_init+0x20>
107f: 90 nop
1080: f3 0f 1e fa endbr64
1084: 68 05 00 00 00 push 0x5
1089: f2 e9 91 ff ff ff bnd jmp 1020 <_init+0x20>
108f: 90 nop
1090: f3 0f 1e fa endbr64
1094: 68 06 00 00 00 push 0x6
1099: f2 e9 81 ff ff ff bnd jmp 1020 <_init+0x20>
109f: 90 nop
10a0: f3 0f 1e fa endbr64
10a4: 68 07 00 00 00 push 0x7
10a9: f2 e9 71 ff ff ff bnd jmp 1020 <_init+0x20>
10af: 90 nop
10b0: f3 0f 1e fa endbr64
10b4: 68 08 00 00 00 push 0x8
10b9: f2 e9 61 ff ff ff bnd jmp 1020 <_init+0x20>
10bf: 90 nop
10c0: f3 0f 1e fa endbr64
10c4: 68 09 00 00 00 push 0x9
10c9: f2 e9 51 ff ff ff bnd jmp 1020 <_init+0x20>
10cf: 90 nop
10d0: f3 0f 1e fa endbr64
10d4: 68 0a 00 00 00 push 0xa
10d9: f2 e9 41 ff ff ff bnd jmp 1020 <_init+0x20>
10df: 90 nop
Disassembly of section .plt.got:
00000000000010e0 <__cxa_finalize#plt>:
10e0: f3 0f 1e fa endbr64
10e4: f2 ff 25 cd 2e 00 00 bnd jmp QWORD PTR [rip+0x2ecd] # 3fb8 <__cxa_finalize#GLIBC_2.2.5>
10eb: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
Disassembly of section .plt.sec:
00000000000010f0 <std::runtime_error::runtime_error(char const*)#plt>:
10f0: f3 0f 1e fa endbr64
10f4: f2 ff 25 65 2e 00 00 bnd jmp QWORD PTR [rip+0x2e65] # 3f60 <std::runtime_error::runtime_error(char const*)#GLIBCXX_3.4.21>
10fb: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
0000000000001100 <__cxa_begin_catch#plt>:
1100: f3 0f 1e fa endbr64
1104: f2 ff 25 5d 2e 00 00 bnd jmp QWORD PTR [rip+0x2e5d] # 3f68 <__cxa_begin_catch#CXXABI_1.3>
110b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
0000000000001110 <__cxa_allocate_exception#plt>:
1110: f3 0f 1e fa endbr64
1114: f2 ff 25 55 2e 00 00 bnd jmp QWORD PTR [rip+0x2e55] # 3f70 <__cxa_allocate_exception#CXXABI_1.3>
111b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
0000000000001120 <__cxa_free_exception#plt>:
1120: f3 0f 1e fa endbr64
1124: f2 ff 25 4d 2e 00 00 bnd jmp QWORD PTR [rip+0x2e4d] # 3f78 <__cxa_free_exception#CXXABI_1.3>
112b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
0000000000001130 <__cxa_atexit#plt>:
1130: f3 0f 1e fa endbr64
1134: f2 ff 25 45 2e 00 00 bnd jmp QWORD PTR [rip+0x2e45] # 3f80 <__cxa_atexit#GLIBC_2.2.5>
113b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
0000000000001140 <std::ostream::operator<<(std::ostream& (*)(std::ostream&))#plt>:
1140: f3 0f 1e fa endbr64
1144: f2 ff 25 3d 2e 00 00 bnd jmp QWORD PTR [rip+0x2e3d] # 3f88 <std::ostream::operator<<(std::ostream& (*)(std::ostream&))#GLIBCXX_3.4>
114b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
0000000000001150 <std::ios_base::Init::Init()#plt>:
1150: f3 0f 1e fa endbr64
1154: f2 ff 25 35 2e 00 00 bnd jmp QWORD PTR [rip+0x2e35] # 3f90 <std::ios_base::Init::Init()#GLIBCXX_3.4>
115b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
0000000000001160 <__cxa_end_catch#plt>:
1160: f3 0f 1e fa endbr64
1164: f2 ff 25 2d 2e 00 00 bnd jmp QWORD PTR [rip+0x2e2d] # 3f98 <__cxa_end_catch#CXXABI_1.3>
116b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
0000000000001170 <__cxa_throw#plt>:
1170: f3 0f 1e fa endbr64
1174: f2 ff 25 25 2e 00 00 bnd jmp QWORD PTR [rip+0x2e25] # 3fa0 <__cxa_throw#CXXABI_1.3>
117b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
0000000000001180 <std::ostream::operator<<(int)#plt>:
1180: f3 0f 1e fa endbr64
1184: f2 ff 25 1d 2e 00 00 bnd jmp QWORD PTR [rip+0x2e1d] # 3fa8 <std::ostream::operator<<(int)#GLIBCXX_3.4>
118b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
0000000000001190 <_Unwind_Resume#plt>:
1190: f3 0f 1e fa endbr64
1194: f2 ff 25 15 2e 00 00 bnd jmp QWORD PTR [rip+0x2e15] # 3fb0 <_Unwind_Resume#GCC_3.0>
119b: 0f 1f 44 00 00 nop DWORD PTR [rax+rax*1+0x0]
Disassembly of section .text:
00000000000011a0 <_start>:
11a0: f3 0f 1e fa endbr64
11a4: 31 ed xor ebp,ebp
11a6: 49 89 d1 mov r9,rdx
11a9: 5e pop rsi
11aa: 48 89 e2 mov rdx,rsp
11ad: 48 83 e4 f0 and rsp,0xfffffffffffffff0
11b1: 50 push rax
11b2: 54 push rsp
11b3: 45 31 c0 xor r8d,r8d
11b6: 31 c9 xor ecx,ecx
11b8: 48 8d 3d ca 00 00 00 lea rdi,[rip+0xca] # 1289 <main>
11bf: ff 15 03 2e 00 00 call QWORD PTR [rip+0x2e03] # 3fc8 <__libc_start_main#GLIBC_2.34>
11c5: f4 hlt
11c6: 66 2e 0f 1f 84 00 00 00 00 00 cs nop WORD PTR [rax+rax*1+0x0]
00000000000011d0 <deregister_tm_clones>:
11d0: 48 8d 3d 49 2e 00 00 lea rdi,[rip+0x2e49] # 4020 <__TMC_END__>
11d7: 48 8d 05 42 2e 00 00 lea rax,[rip+0x2e42] # 4020 <__TMC_END__>
11de: 48 39 f8 cmp rax,rdi
11e1: 74 15 je 11f8 <deregister_tm_clones+0x28>
11e3: 48 8b 05 ee 2d 00 00 mov rax,QWORD PTR [rip+0x2dee] # 3fd8 <_ITM_deregisterTMCloneTable#Base>
11ea: 48 85 c0 test rax,rax
11ed: 74 09 je 11f8 <deregister_tm_clones+0x28>
11ef: ff e0 jmp rax
11f1: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
11f8: c3 ret
11f9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
0000000000001200 <register_tm_clones>:
1200: 48 8d 3d 19 2e 00 00 lea rdi,[rip+0x2e19] # 4020 <__TMC_END__>
1207: 48 8d 35 12 2e 00 00 lea rsi,[rip+0x2e12] # 4020 <__TMC_END__>
120e: 48 29 fe sub rsi,rdi
1211: 48 89 f0 mov rax,rsi
1214: 48 c1 ee 3f shr rsi,0x3f
1218: 48 c1 f8 03 sar rax,0x3
121c: 48 01 c6 add rsi,rax
121f: 48 d1 fe sar rsi,1
1222: 74 14 je 1238 <register_tm_clones+0x38>
1224: 48 8b 05 bd 2d 00 00 mov rax,QWORD PTR [rip+0x2dbd] # 3fe8 <_ITM_registerTMCloneTable#Base>
122b: 48 85 c0 test rax,rax
122e: 74 08 je 1238 <register_tm_clones+0x38>
1230: ff e0 jmp rax
1232: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
1238: c3 ret
1239: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
0000000000001240 <__do_global_dtors_aux>:
1240: f3 0f 1e fa endbr64
1244: 80 3d 05 2f 00 00 00 cmp BYTE PTR [rip+0x2f05],0x0 # 4150 <completed.0>
124b: 75 2b jne 1278 <__do_global_dtors_aux+0x38>
124d: 55 push rbp
124e: 48 83 3d 62 2d 00 00 00 cmp QWORD PTR [rip+0x2d62],0x0 # 3fb8 <__cxa_finalize#GLIBC_2.2.5>
1256: 48 89 e5 mov rbp,rsp
1259: 74 0c je 1267 <__do_global_dtors_aux+0x27>
125b: 48 8b 3d a6 2d 00 00 mov rdi,QWORD PTR [rip+0x2da6] # 4008 <__dso_handle>
1262: e8 79 fe ff ff call 10e0 <__cxa_finalize#plt>
1267: e8 64 ff ff ff call 11d0 <deregister_tm_clones>
126c: c6 05 dd 2e 00 00 01 mov BYTE PTR [rip+0x2edd],0x1 # 4150 <completed.0>
1273: 5d pop rbp
1274: c3 ret
1275: 0f 1f 00 nop DWORD PTR [rax]
1278: c3 ret
1279: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
0000000000001280 <frame_dummy>:
1280: f3 0f 1e fa endbr64
1284: e9 77 ff ff ff jmp 1200 <register_tm_clones>
0000000000001289 <main>:
1289: f3 0f 1e fa endbr64
128d: 55 push rbp
128e: 48 89 e5 mov rbp,rsp
1291: 41 55 push r13
1293: 41 54 push r12
1295: 53 push rbx
1296: 48 83 ec 18 sub rsp,0x18
129a: c7 45 d4 02 00 00 00 mov DWORD PTR [rbp-0x2c],0x2
12a1: bf 10 00 00 00 mov edi,0x10
12a6: e8 65 fe ff ff call 1110 <__cxa_allocate_exception#plt>
12ab: 48 89 c3 mov rbx,rax
12ae: 48 8d 05 4f 0d 00 00 lea rax,[rip+0xd4f] # 2004 <_IO_stdin_used+0x4>
12b5: 48 89 c6 mov rsi,rax
12b8: 48 89 df mov rdi,rbx
12bb: e8 30 fe ff ff call 10f0 <std::runtime_error::runtime_error(char const*)#plt>
12c0: 48 8b 05 09 2d 00 00 mov rax,QWORD PTR [rip+0x2d09] # 3fd0 <std::runtime_error::~runtime_error()#GLIBCXX_3.4>
12c7: 48 89 c2 mov rdx,rax
12ca: 48 8d 05 4f 2a 00 00 lea rax,[rip+0x2a4f] # 3d20 <typeinfo for std::runtime_error#GLIBCXX_3.4>
12d1: 48 89 c6 mov rsi,rax
12d4: 48 89 df mov rdi,rbx
12d7: e8 94 fe ff ff call 1170 <__cxa_throw#plt>
12dc: f3 0f 1e fa endbr64
12e0: 49 89 c5 mov r13,rax
12e3: 49 89 d4 mov r12,rdx
12e6: 48 89 df mov rdi,rbx
12e9: e8 32 fe ff ff call 1120 <__cxa_free_exception#plt>
12ee: 4c 89 e8 mov rax,r13
12f1: 4c 89 e2 mov rdx,r12
12f4: eb 04 jmp 12fa <main+0x71>
12f6: f3 0f 1e fa endbr64
12fa: 48 83 fa 01 cmp rdx,0x1
12fe: 74 08 je 1308 <main+0x7f>
1300: 48 89 c7 mov rdi,rax
1303: e8 88 fe ff ff call 1190 <_Unwind_Resume#plt>
1308: 48 89 c7 mov rdi,rax
130b: e8 f0 fd ff ff call 1100 <__cxa_begin_catch#plt>
1310: 48 89 45 d8 mov QWORD PTR [rbp-0x28],rax
1314: c7 45 d4 05 00 00 00 mov DWORD PTR [rbp-0x2c],0x5
131b: e8 40 fe ff ff call 1160 <__cxa_end_catch#plt>
1320: 8b 45 d4 mov eax,DWORD PTR [rbp-0x2c]
1323: 89 c6 mov esi,eax
1325: 48 8d 05 14 2d 00 00 lea rax,[rip+0x2d14] # 4040 <std::cout#GLIBCXX_3.4>
132c: 48 89 c7 mov rdi,rax
132f: e8 4c fe ff ff call 1180 <std::ostream::operator<<(int)#plt>
1334: 48 8b 15 85 2c 00 00 mov rdx,QWORD PTR [rip+0x2c85] # 3fc0 <std::basic_ostream<char, std::char_traits<char> >& std::endl<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&)#GLIBCXX_3.4>
133b: 48 89 d6 mov rsi,rdx
133e: 48 89 c7 mov rdi,rax
1341: e8 fa fd ff ff call 1140 <std::ostream::operator<<(std::ostream& (*)(std::ostream&))#plt>
1346: b8 00 00 00 00 mov eax,0x0
134b: 48 83 c4 18 add rsp,0x18
134f: 5b pop rbx
1350: 41 5c pop r12
1352: 41 5d pop r13
1354: 5d pop rbp
1355: c3 ret
0000000000001356 <__static_initialization_and_destruction_0(int, int)>:
1356: f3 0f 1e fa endbr64
135a: 55 push rbp
135b: 48 89 e5 mov rbp,rsp
135e: 48 83 ec 10 sub rsp,0x10
1362: 89 7d fc mov DWORD PTR [rbp-0x4],edi
1365: 89 75 f8 mov DWORD PTR [rbp-0x8],esi
1368: 83 7d fc 01 cmp DWORD PTR [rbp-0x4],0x1
136c: 75 3b jne 13a9 <__static_initialization_and_destruction_0(int, int)+0x53>
136e: 81 7d f8 ff ff 00 00 cmp DWORD PTR [rbp-0x8],0xffff
1375: 75 32 jne 13a9 <__static_initialization_and_destruction_0(int, int)+0x53>
1377: 48 8d 05 d3 2d 00 00 lea rax,[rip+0x2dd3] # 4151 <std::__ioinit>
137e: 48 89 c7 mov rdi,rax
1381: e8 ca fd ff ff call 1150 <std::ios_base::Init::Init()#plt>
1386: 48 8d 05 7b 2c 00 00 lea rax,[rip+0x2c7b] # 4008 <__dso_handle>
138d: 48 89 c2 mov rdx,rax
1390: 48 8d 05 ba 2d 00 00 lea rax,[rip+0x2dba] # 4151 <std::__ioinit>
1397: 48 89 c6 mov rsi,rax
139a: 48 8b 05 4f 2c 00 00 mov rax,QWORD PTR [rip+0x2c4f] # 3ff0 <std::ios_base::Init::~Init()#GLIBCXX_3.4>
13a1: 48 89 c7 mov rdi,rax
13a4: e8 87 fd ff ff call 1130 <__cxa_atexit#plt>
13a9: 90 nop
13aa: c9 leave
13ab: c3 ret
00000000000013ac <_GLOBAL__sub_I_main>:
13ac: f3 0f 1e fa endbr64
13b0: 55 push rbp
13b1: 48 89 e5 mov rbp,rsp
13b4: be ff ff 00 00 mov esi,0xffff
13b9: bf 01 00 00 00 mov edi,0x1
13be: e8 93 ff ff ff call 1356 <__static_initialization_and_destruction_0(int, int)>
13c3: 5d pop rbp
13c4: c3 ret
Disassembly of section .fini:
00000000000013c8 <_fini>:
13c8: f3 0f 1e fa endbr64
13cc: 48 83 ec 08 sub rsp,0x8
13d0: 48 83 c4 08 add rsp,0x8
13d4: c3 ret
I wanted to know how methods are implemented in C++. I wanted to know how methods are implemented "under the hood".
So, I have made a simple C++ program which has a class with 1 non static field and 1 non static, non virtual method.
Then I instantiated the class in the main function and called the method. I have used objdump -d option in order to see the CPU instructions of this program. I have a x86-64 processor.
Here's the code:
#include<stdio.h>
class TestClass {
public:
int x;
int xPlus2(){
return x + 2;
}
};
int main(){
TestClass tc1 = {5};
int variable = tc1.xPlus2();
printf("%d \n", variable);
return 0;
}
Here are instructions for the method xPlus2:
0000000000402c30 <_ZN9TestClass6xPlus2Ev>:
402c30: 55 push %rbp
402c31: 48 89 e5 mov %rsp,%rbp
402c34: 48 89 4d 10 mov %rcx,0x10(%rbp)
402c38: 48 8b 45 10 mov 0x10(%rbp),%rax
402c3c: 8b 00 mov (%rax),%eax
402c3e: 83 c0 02 add $0x2,%eax
402c41: 5d pop %rbp
402c42: c3 retq
402c43: 90 nop
402c44: 90 nop
402c45: 90 nop
402c46: 90 nop
402c47: 90 nop
402c48: 90 nop
402c49: 90 nop
402c4a: 90 nop
402c4b: 90 nop
402c4c: 90 nop
402c4d: 90 nop
402c4e: 90 nop
402c4f: 90 nop
If I understand it correctly, these instructions can be replaced by just 3 instructions, because I believe that I don't need to use the stack, I think the compiler used it redundantly:
mov (%rcx), eax
add $2, eax
retq
and then maybe I still need lots of nop instructions for synchronization purposes or whatnot. If you look at the CPU instructions, it looks like the value that x field has is stored at the location in memory which rcx register holds. You will see the rest of the CPU instructions in a moment. It is a little bit hard for me to track what has happened here (especially what is going on with the call of _main function), I don't even know what parts of assembly are important to look at. Compiler produces main function (as I expected), but then it also produced _main function which is called from the main, there are some weird functions in between those two as well.
Here are other parts of the assembly that I think may be interesting:
0000000000401550 <main>:
401550: 55 push %rbp
401551: 48 89 e5 mov %rsp,%rbp
401554: 48 83 ec 30 sub $0x30,%rsp
401558: e8 e3 00 00 00 callq 401640 <__main>
40155d: c7 45 f8 05 00 00 00 movl $0x5,-0x8(%rbp)
401564: 48 8d 45 f8 lea -0x8(%rbp),%rax
401568: 48 89 c1 mov %rax,%rcx
40156b: e8 c0 16 00 00 callq 402c30 <_ZN9TestClass6xPlus2Ev>
401570: 89 45 fc mov %eax,-0x4(%rbp)
401573: 8b 45 fc mov -0x4(%rbp),%eax
401576: 89 c2 mov %eax,%edx
401578: 48 8d 0d 81 2a 00 00 lea 0x2a81(%rip),%rcx # 404000 <.rdata>
40157f: e8 ec 14 00 00 callq 402a70 <printf>
401584: b8 00 00 00 00 mov $0x0,%eax
401589: 48 83 c4 30 add $0x30,%rsp
40158d: 5d pop %rbp
40158e: c3 retq
40158f: 90 nop
0000000000401590 <__do_global_dtors>:
401590: 48 83 ec 28 sub $0x28,%rsp
401594: 48 8b 05 75 1a 00 00 mov 0x1a75(%rip),%rax # 403010 <p.93846>
40159b: 48 8b 00 mov (%rax),%rax
40159e: 48 85 c0 test %rax,%rax
4015a1: 74 1d je 4015c0 <__do_global_dtors+0x30>
4015a3: ff d0 callq *%rax
4015a5: 48 8b 05 64 1a 00 00 mov 0x1a64(%rip),%rax # 403010 <p.93846>
4015ac: 48 8d 50 08 lea 0x8(%rax),%rdx
4015b0: 48 8b 40 08 mov 0x8(%rax),%rax
4015b4: 48 89 15 55 1a 00 00 mov %rdx,0x1a55(%rip) # 403010 <p.93846>
4015bb: 48 85 c0 test %rax,%rax
4015be: 75 e3 jne 4015a3 <__do_global_dtors+0x13>
4015c0: 48 83 c4 28 add $0x28,%rsp
4015c4: c3 retq
4015c5: 90 nop
4015c6: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
4015cd: 00 00 00
00000000004015d0 <__do_global_ctors>:
4015d0: 56 push %rsi
4015d1: 53 push %rbx
4015d2: 48 83 ec 28 sub $0x28,%rsp
4015d6: 48 8b 0d 23 2d 00 00 mov 0x2d23(%rip),%rcx # 404300 <.refptr.__CTOR_LIST__>
4015dd: 48 8b 11 mov (%rcx),%rdx
4015e0: 83 fa ff cmp $0xffffffff,%edx
4015e3: 89 d0 mov %edx,%eax
4015e5: 74 39 je 401620 <__do_global_ctors+0x50>
4015e7: 85 c0 test %eax,%eax
4015e9: 74 20 je 40160b <__do_global_ctors+0x3b>
4015eb: 89 c2 mov %eax,%edx
4015ed: 83 e8 01 sub $0x1,%eax
4015f0: 48 8d 1c d1 lea (%rcx,%rdx,8),%rbx
4015f4: 48 29 c2 sub %rax,%rdx
4015f7: 48 8d 74 d1 f8 lea -0x8(%rcx,%rdx,8),%rsi
4015fc: 0f 1f 40 00 nopl 0x0(%rax)
401600: ff 13 callq *(%rbx)
401602: 48 83 eb 08 sub $0x8,%rbx
401606: 48 39 f3 cmp %rsi,%rbx
401609: 75 f5 jne 401600 <__do_global_ctors+0x30>
40160b: 48 8d 0d 7e ff ff ff lea -0x82(%rip),%rcx # 401590 <__do_global_dtors>
401612: 48 83 c4 28 add $0x28,%rsp
401616: 5b pop %rbx
401617: 5e pop %rsi
401618: e9 f3 fe ff ff jmpq 401510 <atexit>
40161d: 0f 1f 00 nopl (%rax)
401620: 31 c0 xor %eax,%eax
401622: eb 02 jmp 401626 <__do_global_ctors+0x56>
401624: 89 d0 mov %edx,%eax
401626: 44 8d 40 01 lea 0x1(%rax),%r8d
40162a: 4a 83 3c c1 00 cmpq $0x0,(%rcx,%r8,8)
40162f: 4c 89 c2 mov %r8,%rdx
401632: 75 f0 jne 401624 <__do_global_ctors+0x54>
401634: eb b1 jmp 4015e7 <__do_global_ctors+0x17>
401636: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
40163d: 00 00 00
0000000000401640 <__main>:
401640: 8b 05 ea 59 00 00 mov 0x59ea(%rip),%eax # 407030 <initialized>
401646: 85 c0 test %eax,%eax
401648: 74 06 je 401650 <__main+0x10>
40164a: c3 retq
40164b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
401650: c7 05 d6 59 00 00 01 movl $0x1,0x59d6(%rip) # 407030 <initialized>
401657: 00 00 00
40165a: e9 71 ff ff ff jmpq 4015d0 <__do_global_ctors>
40165f: 90 nop
I think what you are looking for are these instructions:
40155d: c7 45 f8 05 00 00 00 movl $0x5,-0x8(%rbp)
401564: 48 8d 45 f8 lea -0x8(%rbp),%rax
401568: 48 89 c1 mov %rax,%rcx
40156b: e8 c0 16 00 00 callq 402c30 <_ZN9TestClass6xPlus2Ev>
401570: 89 45 fc mov %eax,-0x4(%rbp)
These match with the code from main:
TestClass tc1 = {5};
int variable = tc1.xPlus2();
At address 40155d the field tc1.x is initialized with the value 5.
At address 401564 the pointer to tc1 is loaded into the register %rax
At address 401568 the pointer to tc1 is copied into the register %rcx
At address 40156b is the call of the method tc1.xPlus2()
At address 401570 the result is store in variable
Your observations are mostly correct. rcx holds the this pointer to the object on which the method was called. x is stored in the first area of memory that the this pointer points to, so that is why rcx was dereferenced and the result added to. It is the responsibility of the caller to make sure that rcx is the address of the object before invoking the function. We can see main prepare rcx by setting it to an address in its stack frame. You are correct that the compiler produced inefficient code here and did not need to use the stack. Compiling with higher optimization levels -O1, -O2, or -O3 will likely fix that. These higher optimizations will probably get rid of the nops too, since they are used for function alignment. You can mostly ignore __main. It's used for libc initialization.
The do_compare function is in the libstdc++ library. It basically checks two strings and returns -1, 1, or 0 accordingly.
Here is the C++ code:
template<typename _CharT>
int
collate<_CharT>::
do_compare(const _CharT* __lo1, const _CharT* __hi1,
const _CharT* __lo2, const _CharT* __hi2) const
{
// strcoll assumes zero-terminated strings so we make a copy
// and then put a zero at the end.
const string_type __one(__lo1, __hi1);
const string_type __two(__lo2, __hi2);
const _CharT* __p = __one.c_str();
const _CharT* __pend = __one.data() + __one.length();
const _CharT* __q = __two.c_str();
const _CharT* __qend = __two.data() + __two.length();
// strcoll stops when it sees a nul character so we break
// the strings into zero-terminated substrings and pass those
// to strcoll.
for (;;)
{
const int __res = _M_compare(__p, __q);
if (__res)
return __res;
__p += char_traits<_CharT>::length(__p);
__q += char_traits<_CharT>::length(__q);
if (__p == __pend && __q == __qend)
return 0;
else if (__p == __pend)
return -1;
else if (__q == __qend)
return 1;
__p++;
__q++;
}
}
I have to put the entire assembly code of do_compare to show my problem, sorry:
0000000000101c40 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4>:
101c40: 41 57 push %r15
101c42: 41 56 push %r14
101c44: 49 89 fe mov %rdi,%r14
101c47: 48 89 f7 mov %rsi,%rdi
101c4a: 48 89 d6 mov %rdx,%rsi
101c4d: 41 55 push %r13
101c4f: 41 54 push %r12
101c51: 55 push %rbp
101c52: 4c 89 c5 mov %r8,%rbp
101c55: 53 push %rbx
101c56: 48 89 cb mov %rcx,%rbx
101c59: 48 83 ec 38 sub $0x38,%rsp
101c5d: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
101c64: 00 00
101c66: 48 89 44 24 28 mov %rax,0x28(%rsp)
101c6b: 31 c0 xor %eax,%eax
101c6d: 4c 8d 6c 24 27 lea 0x27(%rsp),%r13
101c72: 4c 89 ea mov %r13,%rdx
101c75: 4c 89 6c 24 18 mov %r13,0x18(%rsp)
101c7a: e8 f1 a2 f8 ff callq 8bf70 <_ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag#plt>
101c7f: 4c 89 ea mov %r13,%rdx
101c82: 48 89 ee mov %rbp,%rsi
101c85: 48 89 df mov %rbx,%rdi
101c88: 49 89 c7 mov %rax,%r15
101c8b: 48 89 44 24 08 mov %rax,0x8(%rsp)
101c90: e8 db a2 f8 ff callq 8bf70 <_ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag#plt>
101c95: 4d 8b 67 e8 mov -0x18(%r15),%r12
101c99: 4c 8b 68 e8 mov -0x18(%rax),%r13
101c9d: 48 89 c5 mov %rax,%rbp
101ca0: 48 89 44 24 10 mov %rax,0x10(%rsp)
101ca5: 4c 89 fb mov %r15,%rbx
101ca8: 4d 01 fc add %r15,%r12
101cab: 49 01 c5 add %rax,%r13
101cae: eb 32 jmp 101ce2 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xa2>
101cb0: 48 89 df mov %rbx,%rdi
101cb3: e8 98 87 f8 ff callq 8a450 <strlen#plt>
101cb8: 48 89 ef mov %rbp,%rdi
101cbb: 48 01 c3 add %rax,%rbx
101cbe: e8 8d 87 f8 ff callq 8a450 <strlen#plt>
101cc3: 48 01 c5 add %rax,%rbp
101cc6: 49 39 dc cmp %rbx,%r12
101cc9: 75 05 jne 101cd0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x90>
101ccb: 49 39 ed cmp %rbp,%r13
101cce: 74 27 je 101cf7 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xb7>
101cd0: 49 39 dc cmp %rbx,%r12
101cd3: 74 6b je 101d40 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x100>
101cd5: 49 39 ed cmp %rbp,%r13
101cd8: 74 76 je 101d50 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x110>
101cda: 48 83 c3 01 add $0x1,%rbx
101cde: 48 83 c5 01 add $0x1,%rbp
101ce2: 48 89 ea mov %rbp,%rdx
101ce5: 48 89 de mov %rbx,%rsi
101ce8: 4c 89 f7 mov %r14,%rdi
101ceb: e8 20 8b f8 ff callq 8a810 <_ZNKSt7collateIcE10_M_compareEPKcS2_#plt>
101cf0: 41 89 c7 mov %eax,%r15d
101cf3: 85 c0 test %eax,%eax
101cf5: 74 b9 je 101cb0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x70>
101cf7: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
101cfc: 48 8b 1d 9d 08 28 00 mov 0x28089d(%rip),%rbx # 3825a0 <_ZNSs4_Rep20_S_empty_rep_storageE##GLIBCXX_3.4-0x57e0>
101d03: 48 83 ef 18 sub $0x18,%rdi
101d07: 48 39 df cmp %rbx,%rdi
101d0a: 75 54 jne 101d60 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x120>
101d0c: 48 8b 7c 24 08 mov 0x8(%rsp),%rdi
101d11: 48 83 ef 18 sub $0x18,%rdi
101d15: 48 39 df cmp %rbx,%rdi
101d18: 75 56 jne 101d70 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x130>
101d1a: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
101d1f: 64 48 33 0c 25 28 00 xor %fs:0x28,%rcx
101d26: 00 00
101d28: 44 89 f8 mov %r15d,%eax
101d2b: 75 4f jne 101d7c <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x13c>
101d2d: 48 83 c4 38 add $0x38,%rsp
101d31: 5b pop %rbx
101d32: 5d pop %rbp
101d33: 41 5c pop %r12
101d35: 41 5d pop %r13
101d37: 41 5e pop %r14
101d39: 41 5f pop %r15
101d3b: c3 retq
101d3c: 0f 1f 40 00 nopl 0x0(%rax)
101d40: 41 bf ff ff ff ff mov $0xffffffff,%r15d
101d46: eb af jmp 101cf7 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xb7>
101d48: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
101d4f: 00
101d50: 41 bf 01 00 00 00 mov $0x1,%r15d
101d56: eb 9f jmp 101cf7 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xb7>
101d58: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
101d5f: 00
101d60: 48 8b 74 24 18 mov 0x18(%rsp),%rsi
101d65: e8 96 fe ff ff callq 101c00 <_ZNSt14codecvt_bynameIcc11__mbstate_tED0Ev##GLIBCXX_3.4+0x20>
101d6a: eb a0 jmp 101d0c <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xcc>
101d6c: 0f 1f 40 00 nopl 0x0(%rax)
101d70: 48 8b 74 24 18 mov 0x18(%rsp),%rsi
101d75: e8 86 fe ff ff callq 101c00 <_ZNSt14codecvt_bynameIcc11__mbstate_tED0Ev##GLIBCXX_3.4+0x20>
101d7a: eb 9e jmp 101d1a <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xda>
101d7c: e8 7f 95 f8 ff callq 8b300 <__stack_chk_fail#plt>
101d81: 48 89 c3 mov %rax,%rbx
101d84: 48 8b 7c 24 08 mov 0x8(%rsp),%rdi
101d89: 48 83 ef 18 sub $0x18,%rdi
101d8d: 48 3b 3d 0c 08 28 00 cmp 0x28080c(%rip),%rdi # 3825a0 <_ZNSs4_Rep20_S_empty_rep_storageE##GLIBCXX_3.4-0x57e0>
101d94: 74 0a je 101da0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x160>
101d96: 48 8b 74 24 18 mov 0x18(%rsp),%rsi
101d9b: e8 60 fe ff ff callq 101c00 <_ZNSt14codecvt_bynameIcc11__mbstate_tED0Ev##GLIBCXX_3.4+0x20>
101da0: 48 89 df mov %rbx,%rdi
101da3: e8 e8 a1 f8 ff callq 8bf90 <_Unwind_Resume#plt>
101da8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
101daf: 00
*******101db0: 53 push %rbx
101db1: 48 89 fb mov %rdi,%rbx
101db4: 48 8b 3f mov (%rdi),%rdi
101db7: 89 f0 mov %esi,%eax
101db9: 48 85 ff test %rdi,%rdi
101dbc: 74 05 je 101dc3 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x183>
101dbe: 83 fe ff cmp $0xffffffff,%esi
101dc1: 74 05 je 101dc8 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x188>
101dc3: 5b pop %rbx
101dc4: c3 retq
101dc5: 0f 1f 00 nopl (%rax)
101dc8: 48 8b 47 10 mov 0x10(%rdi),%rax
101dcc: 48 3b 47 18 cmp 0x18(%rdi),%rax
101dd0: 73 0e jae 101de0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x1a0>
101dd2: 0f b6 00 movzbl (%rax),%eax
101dd5: 5b pop %rbx
101dd6: c3 retq
101dd7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
101dde: 00 00
101de0: 48 8b 07 mov (%rdi),%rax
101de3: ff 50 48 callq *0x48(%rax)
101de6: 83 f8 ff cmp $0xffffffff,%eax
101de9: 75 d8 jne 101dc3 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x183>
101deb: 48 c7 03 00 00 00 00 movq $0x0,(%rbx)
101df2: 5b pop %rbx
101df3: c3 retq
101df4: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
101dfb: 00 00 00
101dfe: 66 90 xchg %ax,%ax
101e00: 55 push %rbp
101e01: 89 f5 mov %esi,%ebp
101e03: 53 push %rbx
101e04: 48 89 fb mov %rdi,%rbx
101e07: 48 83 ec 08 sub $0x8,%rsp
101e0b: e8 b0 88 f8 ff callq 8a6c0 <_ZNKSt5ctypeIcE13_M_widen_initEv#plt>
101e10: 48 8b 03 mov (%rbx),%rax
101e13: 48 8b 40 30 mov 0x30(%rax),%rax
101e17: 48 3b 05 7a 11 28 00 cmp 0x28117a(%rip),%rax # 382f98 <_ZNKSt5ctypeIcE8do_widenEc##GLIBCXX_3.4+0x2e2c48>
101e1e: 75 10 jne 101e30 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x1f0>
101e20: 48 83 c4 08 add $0x8,%rsp
101e24: 89 e8 mov %ebp,%eax
101e26: 5b pop %rbx
101e27: 5d pop %rbp
101e28: c3 retq
101e29: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
101e30: 48 83 c4 08 add $0x8,%rsp
101e34: 40 0f be f5 movsbl %bpl,%esi
101e38: 48 89 df mov %rbx,%rdi
101e3b: 5b pop %rbx
101e3c: 5d pop %rbp
101e3d: ff e0 jmpq *%rax
101e3f: 90 nop
It seems to me that the assembly code not only performs the C++ code logic but also adds other logic.
As an example, the function _M_extract_int in libstdc++ which coverts a char to int calls this function as the following:
callq 0x101db0
The instruction address 0x101db0 is in the middle of the assembly code. The code section from 0x101db0 to 0x101dbc seems to have nothing to do with the above C++ code. Really confused about what is going on here...
i read some old articles about the local scoped static variable initialzation order problem from
C++ scoped static initialization is not thread-safe back in 2004, and
Function Static Variables in Multi-Threaded Environments in 2006.
then I start to produce an example and check my compiler, gcc 4.4.7
int calcSomething(){}
void foo(){
static int x = calcSomething();
}
int main(){
foo();
return 0;
}
the result from objdump shows:
000000000040061a <_Z3foov>:
40061a: 55 push %rbp
40061b: 48 89 e5 mov %rsp,%rbp
40061e: b8 d0 0a 60 00 mov $0x600ad0,%eax
400623: 0f b6 00 movzbl (%rax),%eax
400626: 84 c0 test %al,%al
400628: 75 28 jne 400652 <_Z3foov+0x38>
40062a: bf d0 0a 60 00 mov $0x600ad0,%edi
40062f: e8 bc fe ff ff callq 4004f0 <__cxa_guard_acquire#plt>
400634: 85 c0 test %eax,%eax
400636: 0f 95 c0 setne %al
400639: 84 c0 test %al,%al
40063b: 74 15 je 400652 <_Z3foov+0x38>
40063d: e8 d2 ff ff ff callq 400614 <_Z13calcSomethingv>
400642: 89 05 90 04 20 00 mov %eax,0x200490(%rip) # 600ad8 <_ZZ3foovE1x>
400648: bf d0 0a 60 00 mov $0x600ad0,%edi
40064d: e8 be fe ff ff callq 400510 <__cxa_guard_release#plt>
400652: c9 leaveq
400653: c3 retq
unfortunately, my knowledge of asssmbly code is so limited that I cannot tell what compiler does here. Can anyone shed me some light, what this assembly code do? and is it still not thread-safe? I really appreciate some "pseudo code" showing what gcc is doing here.
EDIT-1:
as Jerry commented, I enabled optimization with O2, the assembly code is:
0000000000400620 <_Z3foov>:
400620: 48 83 ec 08 sub $0x8,%rsp
400624: 80 3d 85 04 20 00 00 cmpb $0x0,0x200485(%rip) # 600ab0 <_ZGVZ3foovE1x>
40062b: 74 0b je 400638 <_Z3foov+0x18>
40062d: 48 83 c4 08 add $0x8,%rsp
400631: c3 retq
400632: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
400638: bf b0 0a 60 00 mov $0x600ab0,%edi
40063d: e8 9e fe ff ff callq 4004e0 <__cxa_guard_acquire#plt>
400642: 85 c0 test %eax,%eax
400644: 74 e7 je 40062d <_Z3foov+0xd>
400646: c7 05 68 04 20 00 00 movl $0x0,0x200468(%rip) # 600ab8 <_ZZ3foovE1x>
40064d: 00 00 00
400650: bf b0 0a 60 00 mov $0x600ab0,%edi
400655: 48 83 c4 08 add $0x8,%rsp
400659: e9 a2 fe ff ff jmpq 400500 <__cxa_guard_release#plt>
40065e: 66 90 xchg %ax,%ax
Yes. In pseudocode (for the un-optimized case) it's something like:
if (flag_val() != 0) goto done;
if (guard_acquire() != 0) goto done;
x = calcSomething();
guard_release_and_set_flag();
// Note releasing the guard lock causes later
// calls to flag_val() to return non-zero.
done: return
The flag_val() is really a non-blocking check, apparently for efficiency to avoid calling the acquire primitive unless necessary. The flag must be set by guard_release as shown. The acquire seems to be the synchronized call to grab the lock. Only one thread will get a true value back and perform the initialization. After it releases the lock, the non-zero flag prevents any further touches of the lock.
Another interesting tidbit is that the guard data structure is 8 bytes away from the value of x itself in static memory.
Those familiar with the singleton pattern in languages with built-in threads e.g. Java will recognize this!
Addition
A bit more time now, so in a bit more detail:
000000000040061a <_Z3foov>:
; Prepare to access stack variables (never used in un-optimized code).
40061a: 55 push %rbp
40061b: 48 89 e5 mov %rsp,%rbp
; Test a byte 8 away from the static int x. This is apparently an "initialized" flag.
40061e: b8 d0 0a 60 00 mov $0x600ad0,%eax
400623: 0f b6 00 movzbl (%rax),%eax
400626: 84 c0 test %al,%al
; Goto the end of the function if the byte was no-zero.
400628: 75 28 jne 400652 <_Z3foov+0x38>
; Load the same byte address in di: the argument for the call to
; acquire the guard lock.
40062a: bf d0 0a 60 00 mov $0x600ad0,%edi
40062f: e8 bc fe ff ff callq 4004f0 <__cxa_guard_acquire#plt>
; Test the return value. Goto end of function if not zero (non-optimized code).
400634: 85 c0 test %eax,%eax
400636: 0f 95 c0 setne %al
400639: 84 c0 test %al,%al
40063b: 74 15 je 400652 <_Z3foov+0x38>
; Call the user's initialization function and move result into x.
40063d: e8 d2 ff ff ff callq 400614 <_Z13calcSomethingv>
400642: 89 05 90 04 20 00 mov %eax,0x200490(%rip) # 600ad8 <_ZZ3foovE1x>
; Load the guard byte's address again and call the release routine.
; This must set the flag to non-zero.
400648: bf d0 0a 60 00 mov $0x600ad0,%edi
40064d: e8 be fe ff ff callq 400510 <__cxa_guard_release#plt>
; Restore state and return.
400652: c9 leaveq
400653: c3 retq
This listing, although for the LLVM compiler rather than g++ (are you running OS X? OS X aliases g++ to LLVM), agrees with the guesswork above. The set_initialized routine is setting a flag value in guard_release.
Can't someone explain to me why the output of this program is [nan, nan]? The code is supposed to load the value of d into the high and low 64-bits of the XMM1 register and then move the contents of XMM1 into a. Because a is not initialized to a set of specific values, D initializes each element to nan. If the movupd instruction was not in the objdump, I would understand the result, but the instruction is there. Thoughts?
import std.stdio;
void main()
{
enum double d = 1.0 / cast(double)2;
double[] a = new double[2];
auto aptr = a.ptr;
asm
{
movddup XMM1, d;
movupd [aptr], XMM1;
}
writeln(a);
}
Here is the objdump of the main function:
0000000000426b88 <_Dmain>:
426b88: 55 push %rbp
426b89: 48 8b ec mov %rsp,%rbp
426b8c: 48 83 ec 50 sub $0x50,%rsp
426b90: f2 48 0f 10 05 77 81 rex.W movsd 0x28177(%rip),%xmm0
426b97: 02 00
426b99: f2 48 0f 11 45 b0 rex.W movsd %xmm0,-0x50(%rbp)
426b9f: 48 be 02 00 00 00 00 movabs $0x2,%rsi
426ba6: 00 00 00
426ba9: f2 48 0f 10 05 66 81 rex.W movsd 0x28166(%rip),%xmm0
426bb0: 02 00
426bb2: 48 8d 7d c0 lea -0x40(%rbp),%rdi
426bb6: e8 65 d1 00 00 callq 433d20 <_memsetDouble>
426bbb: f2 48 0f 10 0d 4c 81 rex.W movsd 0x2814c(%rip),%xmm1
426bc2: 02 00
426bc4: f2 48 0f 11 4d c0 rex.W movsd %xmm1,-0x40(%rbp)
426bca: f2 48 0f 10 15 3d 81 rex.W movsd 0x2813d(%rip),%xmm2
426bd1: 02 00
426bd3: f2 48 0f 11 55 c8 rex.W movsd %xmm2,-0x38(%rbp)
426bd9: 48 8d 45 c0 lea -0x40(%rbp),%rax
426bdd: 48 89 45 d0 mov %rax,-0x30(%rbp)
426be1: 48 8d 55 e0 lea -0x20(%rbp),%rdx
426be5: 48 b8 02 00 00 00 00 movabs $0x2,%rax
426bec: 00 00 00
426bef: 48 89 c1 mov %rax,%rcx
426bf2: 49 89 d0 mov %rdx,%r8
426bf5: 51 push %rcx
426bf6: 41 50 push %r8
426bf8: 48 be 02 00 00 00 00 movabs $0x2,%rsi
426bff: 00 00 00
426c02: 48 bf c0 84 65 00 00 movabs $0x6584c0,%rdi
426c09: 00 00 00
426c0c: e8 87 ce 00 00 callq 433a98 <_d_arrayliteralTX>
426c11: 48 89 45 f0 mov %rax,-0x10(%rbp)
426c15: f2 48 0f 10 05 02 81 rex.W movsd 0x28102(%rip),%xmm0
426c1c: 02 00
426c1e: f2 48 0f 11 00 rex.W movsd %xmm0,(%rax)
426c23: f2 48 0f 10 0d f4 80 rex.W movsd 0x280f4(%rip),%xmm1
426c2a: 02 00
426c2c: 48 8b 45 f0 mov -0x10(%rbp),%rax
426c30: f2 48 0f 11 48 08 rex.W movsd %xmm1,0x8(%rax)
426c36: 48 8b 55 f0 mov -0x10(%rbp),%rdx
426c3a: 48 be 02 00 00 00 00 movabs $0x2,%rsi
426c41: 00 00 00
426c44: 41 58 pop %r8
426c46: 59 pop %rcx
426c47: 48 bf 08 00 00 00 00 movabs $0x8,%rdi
426c4e: 00 00 00
426c51: e8 8e 95 00 00 callq 4301e4 <_d_arraycopy>
426c56: f2 0f 12 4d b0 movddup -0x50(%rbp),%xmm1
426c5b: 66 0f 11 4d d0 movupd %xmm1,-0x30(%rbp)
426c60: ff 75 c8 pushq -0x38(%rbp)
426c63: ff 75 c0 pushq -0x40(%rbp)
426c66: e8 09 00 00 00 callq 426c74 <_D3std5stdio16__T7writelnTG2dZ7writelnFG2dZv>
426c6b: 48 83 c4 10 add $0x10,%rsp
426c6f: 31 c0 xor %eax,%eax
426c71: c9 leaveq
426c72: c3 retq
426c73: 90 nop
I looked into it, and apparently the compiler decides that by movupd [aptr], XMM1 you really mean movupd aptr, XMM1. Loading aptr into a register beforehand (mov aptr, RAX; movupd [RAX], XMM1) will make it work.
You should probably file a bug report.