Deciphering RIP-relative LEA instructions as part of a switch statement [duplicate]

Deciphering RIP-relative LEA instructions as part of a switch statement [duplicate] - c++

This question already has answers here:
Why does this MOVSS instruction use RIP-relative addressing? [duplicate]
(1 answer)
Why are global variables in x86-64 accessed relative to the instruction pointer?
(2 answers)
How to load address of function or label into register
(1 answer)
Closed 22 days ago.
What is the compiler doing in the beginning of the switch statement (snippet section below) to come up with the address in %rax so it can notrack jmpq *%rax to the correct offset ?
Are the constants 0xe07 and 0xdfb padding ?
lea 0xe07(%rip),%rax
lea 0xdfb(%rip),%rdx
Snippet
10 [1] switch (c) {
0x5555555551ec <+ 35> 0f be 45 fc movsbl -0x4(%rbp),%eax
0x5555555551f0 <+ 39> 83 e8 2d sub $0x2d,%eax
0x5555555551f3 <+ 42> 83 f8 32 cmp $0x32,%eax
0x5555555551f6 <+ 45> 0f 87 18 01 00 00 ja 0x555555555314 <aa2i(char)+331>
0x5555555551fc <+ 51> 89 c0 mov %eax,%eax
0x5555555551fe <+ 53> 48 8d 14 85 00 00 00 00 lea 0x0(,%rax,4),%rdx
0x555555555206 <+ 61> 48 8d 05 07 0e 00 00 lea 0xe07(%rip),%rax # 0x555555556014
0x55555555520d <+ 68> 8b 04 02 mov (%rdx,%rax,1),%eax
0x555555555210 <+ 71> 48 98 cltq
0x555555555212 <+ 73> 48 8d 15 fb 0d 00 00 lea 0xdfb(%rip),%rdx # 0x555555556014
0x555555555219 <+ 80> 48 01 d0 add %rdx,%rax
0x55555555521c <+ 83> 3e ff e0 notrack jmpq *%rax
cpp code
#include <QCoreApplication>
const int ANY=20; //number representing an X (any amino acid) internally
const int GAP=21; //number representing a gap internally
// Map a one-letter amino-acid code to its internal integer index.
// Lower-case ASCII letters are folded to upper case before the lookup.
// Returns:
//   0..19  for the twenty residues, in the order listed in the comment below
//   ANY    for 'X', 'J' and 'O'
//   4/3/6  for the aliases 'U', 'B' and 'Z' respectively
//   GAP    for '-', '.' and '_'
//   -1     for any other character in the range 0..32
//   -2     for everything else
// The return type is char, so -1 and -2 are converted to char on return.
char aa2i(char c) {
//A R N D C Q E G H I L K M F P S T W Y V
// Fold lower case to upper case so the switch only needs upper-case labels.
if (c >= 'a' && c <= 'z') c += 'A' - 'a';
switch (c) {
case 'A':
return 0;
case 'R':
return 1;
case 'N':
return 2;
case 'D':
return 3;
case 'C':
return 4;
case 'Q':
return 5;
case 'E':
return 6;
case 'G':
return 7;
case 'H':
return 8;
case 'I':
return 9;
case 'L':
return 10;
case 'K':
return 11;
case 'M':
return 12;
case 'F':
return 13;
case 'P':
return 14;
case 'S':
return 15;
case 'T':
return 16;
case 'W':
return 17;
case 'Y':
return 18;
case 'V':
return 19;
case 'X':
return ANY;
case 'J':
return ANY;
case 'O':
return ANY;
case 'U':
return 4; //Selenocysteine -> Cysteine
case 'B':
return 3; //D (or N)
case 'Z':
return 6; //E (or Q)
case '-':
return GAP;
case '.':
return GAP;
case '_':
return GAP;
}
// No case matched: distinguish low-valued characters from everything else.
if (c >= 0 && c <= 32) return -1; // white space and control characters
return -2;
}
// Minimal driver: calls aa2i once with 'R' and discards the result.
int main(int argc, char *argv[])
{
aa2i('R');
}
Disassembly:
6 [1] char aa2i(char c) {
0x5555555551c9 f3 0f 1e fa endbr64
0x5555555551cd <+ 4> 55 push %rbp
0x5555555551ce <+ 5> 48 89 e5 mov %rsp,%rbp
0x5555555551d1 <+ 8> 89 f8 mov %edi,%eax
0x5555555551d3 <+ 10> 88 45 fc mov %al,-0x4(%rbp)
9 [1] if (c >= 'a' && c <= 'z') c += 'A' - 'a';
0x5555555551d6 <+ 13> 80 7d fc 60 cmpb $0x60,-0x4(%rbp)
0x5555555551da <+ 17> 7e 10 jle 0x5555555551ec <aa2i(char)+35>
0x5555555551dc <+ 19> 80 7d fc 7a cmpb $0x7a,-0x4(%rbp)
0x5555555551e0 <+ 23> 7f 0a jg 0x5555555551ec <aa2i(char)+35>
0x5555555551e2 <+ 25> 0f b6 45 fc movzbl -0x4(%rbp),%eax
0x5555555551e6 <+ 29> 83 e8 20 sub $0x20,%eax
0x5555555551e9 <+ 32> 88 45 fc mov %al,-0x4(%rbp)
10 [1] switch (c) {
0x5555555551ec <+ 35> 0f be 45 fc movsbl -0x4(%rbp),%eax
0x5555555551f0 <+ 39> 83 e8 2d sub $0x2d,%eax
0x5555555551f3 <+ 42> 83 f8 32 cmp $0x32,%eax
0x5555555551f6 <+ 45> 0f 87 18 01 00 00 ja 0x555555555314 <aa2i(char)+331>
0x5555555551fc <+ 51> 89 c0 mov %eax,%eax
0x5555555551fe <+ 53> 48 8d 14 85 00 00 00 00 lea 0x0(,%rax,4),%rdx
0x555555555206 <+ 61> 48 8d 05 07 0e 00 00 lea 0xe07(%rip),%rax # 0x555555556014
0x55555555520d <+ 68> 8b 04 02 mov (%rdx,%rax,1),%eax
0x555555555210 <+ 71> 48 98 cltq
0x555555555212 <+ 73> 48 8d 15 fb 0d 00 00 lea 0xdfb(%rip),%rdx # 0x555555556014
0x555555555219 <+ 80> 48 01 d0 add %rdx,%rax
0x55555555521c <+ 83> 3e ff e0 notrack jmpq *%rax
12 [1] return 0;
0x55555555521f <+ 86> b8 00 00 00 00 mov $0x0,%eax
0x555555555224 <+ 91> e9 03 01 00 00 jmpq 0x55555555532c <aa2i(char)+355>
14 [1] return 1;
0x555555555229 <+ 96> b8 01 00 00 00 mov $0x1,%eax
0x55555555522e <+ 101> e9 f9 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
16 [1] return 2;
0x555555555233 <+ 106> b8 02 00 00 00 mov $0x2,%eax
0x555555555238 <+ 111> e9 ef 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
18 [1] return 3;
0x55555555523d <+ 116> b8 03 00 00 00 mov $0x3,%eax
0x555555555242 <+ 121> e9 e5 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
20 [1] return 4;
0x555555555247 <+ 126> b8 04 00 00 00 mov $0x4,%eax
0x55555555524c <+ 131> e9 db 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
22 [1] return 5;
0x555555555251 <+ 136> b8 05 00 00 00 mov $0x5,%eax
0x555555555256 <+ 141> e9 d1 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
24 [1] return 6;
0x55555555525b <+ 146> b8 06 00 00 00 mov $0x6,%eax
0x555555555260 <+ 151> e9 c7 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
26 [1] return 7;
0x555555555265 <+ 156> b8 07 00 00 00 mov $0x7,%eax
0x55555555526a <+ 161> e9 bd 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
28 [1] return 8;
0x55555555526f <+ 166> b8 08 00 00 00 mov $0x8,%eax
0x555555555274 <+ 171> e9 b3 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
30 [1] return 9;
0x555555555279 <+ 176> b8 09 00 00 00 mov $0x9,%eax
0x55555555527e <+ 181> e9 a9 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
32 [1] return 10;
0x555555555283 <+ 186> b8 0a 00 00 00 mov $0xa,%eax
0x555555555288 <+ 191> e9 9f 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
34 [1] return 11;
0x55555555528d <+ 196> b8 0b 00 00 00 mov $0xb,%eax
0x555555555292 <+ 201> e9 95 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
36 [1] return 12;
0x555555555297 <+ 206> b8 0c 00 00 00 mov $0xc,%eax
0x55555555529c <+ 211> e9 8b 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
38 [1] return 13;
0x5555555552a1 <+ 216> b8 0d 00 00 00 mov $0xd,%eax
0x5555555552a6 <+ 221> e9 81 00 00 00 jmpq 0x55555555532c <aa2i(char)+355>
40 [1] return 14;
0x5555555552ab <+ 226> b8 0e 00 00 00 mov $0xe,%eax
0x5555555552b0 <+ 231> eb 7a jmp 0x55555555532c <aa2i(char)+355>
42 [1] return 15;
0x5555555552b2 <+ 233> b8 0f 00 00 00 mov $0xf,%eax
0x5555555552b7 <+ 238> eb 73 jmp 0x55555555532c <aa2i(char)+355>
44 [1] return 16;
0x5555555552b9 <+ 240> b8 10 00 00 00 mov $0x10,%eax
0x5555555552be <+ 245> eb 6c jmp 0x55555555532c <aa2i(char)+355>
46 [1] return 17;
0x5555555552c0 <+ 247> b8 11 00 00 00 mov $0x11,%eax
0x5555555552c5 <+ 252> eb 65 jmp 0x55555555532c <aa2i(char)+355>
48 [1] return 18;
0x5555555552c7 <+ 254> b8 12 00 00 00 mov $0x12,%eax
0x5555555552cc <+ 259> eb 5e jmp 0x55555555532c <aa2i(char)+355>
50 [1] return 19;
0x5555555552ce <+ 261> b8 13 00 00 00 mov $0x13,%eax
0x5555555552d3 <+ 266> eb 57 jmp 0x55555555532c <aa2i(char)+355>
52 [1] return ANY;
0x5555555552d5 <+ 268> b8 14 00 00 00 mov $0x14,%eax
0x5555555552da <+ 273> eb 50 jmp 0x55555555532c <aa2i(char)+355>
54 [1] return ANY;
0x5555555552dc <+ 275> b8 14 00 00 00 mov $0x14,%eax
0x5555555552e1 <+ 280> eb 49 jmp 0x55555555532c <aa2i(char)+355>
56 [1] return ANY;
0x5555555552e3 <+ 282> b8 14 00 00 00 mov $0x14,%eax
0x5555555552e8 <+ 287> eb 42 jmp 0x55555555532c <aa2i(char)+355>
58 [1] return 4; //Selenocystein -> Cystein
0x5555555552ea <+ 289> b8 04 00 00 00 mov $0x4,%eax
0x5555555552ef <+ 294> eb 3b jmp 0x55555555532c <aa2i(char)+355>
60 [1] return 3; //D (or N)
0x5555555552f1 <+ 296> b8 03 00 00 00 mov $0x3,%eax
0x5555555552f6 <+ 301> eb 34 jmp 0x55555555532c <aa2i(char)+355>
62 [1] return 6; //E (or Q)
0x5555555552f8 <+ 303> b8 06 00 00 00 mov $0x6,%eax
0x5555555552fd <+ 308> eb 2d jmp 0x55555555532c <aa2i(char)+355>
64 [1] return GAP;
0x5555555552ff <+ 310> b8 15 00 00 00 mov $0x15,%eax
0x555555555304 <+ 315> eb 26 jmp 0x55555555532c <aa2i(char)+355>
66 [1] return GAP;
0x555555555306 <+ 317> b8 15 00 00 00 mov $0x15,%eax
0x55555555530b <+ 322> eb 1f jmp 0x55555555532c <aa2i(char)+355>
68 [1] return GAP;
0x55555555530d <+ 324> b8 15 00 00 00 mov $0x15,%eax
0x555555555312 <+ 329> eb 18 jmp 0x55555555532c <aa2i(char)+355>
70 [1] if (c >= 0 && c <= 32) return -1; // white space and control characters
0x555555555314 <+ 331> 80 7d fc 00 cmpb $0x0,-0x4(%rbp)
0x555555555318 <+ 335> 78 0d js 0x555555555327 <aa2i(char)+350>
0x55555555531a <+ 337> 80 7d fc 20 cmpb $0x20,-0x4(%rbp)
0x55555555531e <+ 341> 7f 07 jg 0x555555555327 <aa2i(char)+350>
0x555555555320 <+ 343> b8 ff ff ff ff mov $0xffffffff,%eax
0x555555555325 <+ 348> eb 05 jmp 0x55555555532c <aa2i(char)+355>
71 [1] return -2;
0x555555555327 <+ 350> b8 fe ff ff ff mov $0xfffffffe,%eax
72 [1] }
0x55555555532c <+ 355> 5d pop %rbp
0x55555555532d <+ 356> c3 retq

This is so-called RIP-relative addressing (described in the Intel SDM, Volume 2, Section 2.2.1.6). Basically, the displacement (0xe07 or 0xdfb) is added to the address of the next instruction. Most probably this is the location where the jump table is located, and it looks like it is just after the code of the function.
In the first case, the next instruction is at 0x55555555520d, and 0x55555555520d + 0xe07 = 0x555555556014.
In the second case, the next instruction is at 0x555555555219, and 0x555555555219 + 0xdfb = 0x555555556014. Both LEAs therefore yield the same address; the displacements differ by 0xc only because the two instructions are 0xc bytes apart, so the constants are not padding.

Related

Which is better: returning tuple or passing arguments to function as references?

I wrote code with two functions, returnValues and returnValuesVoid. One returns a tuple of two values, and the other takes its arguments by reference.
#include <iostream>
#include <tuple>
// Packs the two arguments, unchanged, into a tuple and returns it by value.
std::tuple<int, int> returnValues(const int a, const int b) {
return std::tuple(a,b);
}
// Adds 100 to each argument in place through the references; returns nothing.
void returnValuesVoid(int &a,int &b) {
a += 100;
b += 100;
}
// Exercises both styles and prints the resulting values with no separators.
int main() {
// Tuple style: unpack the returned pair with a structured binding.
auto [x,y] = returnValues(10,20);
std::cout << x ;
std::cout << y ;
// Reference style: the callee modifies a and b directly.
int a = 10, b = 20;
returnValuesVoid(a, b);
std::cout << a ;
std::cout << b ;
}
I read about http://en.cppreference.com/w/cpp/language/structured_binding
which can destruct tuple to auto [x,y] variables.
Is auto [x,y] = returnValues(10,20); better than passing by reference? As far as I know it is slower, because it has to return a tuple object, whereas a reference just works on the original variables passed to the function, so there is no reason to use it except for cleaner code.
Since auto [x,y] is only available from C++17, do people use it in production? It looks cleaner than returnValuesVoid, which returns void, but does it have other advantages over passing by reference?

Look at the disassembly (compiled with GCC -O3):
It takes more instructions to implement the tuple-returning function.
0000000000000000 <returnValues(int, int)>:
0: 83 c2 64 add $0x64,%edx
3: 83 c6 64 add $0x64,%esi
6: 48 89 f8 mov %rdi,%rax
9: 89 17 mov %edx,(%rdi)
b: 89 77 04 mov %esi,0x4(%rdi)
e: c3 retq
f: 90 nop
0000000000000010 <returnValuesVoid(int&, int&)>:
10: 83 07 64 addl $0x64,(%rdi)
13: 83 06 64 addl $0x64,(%rsi)
16: c3 retq
But fewer instructions for the tuple caller:
0000000000000000 <callTuple()>:
0: 48 83 ec 18 sub $0x18,%rsp
4: ba 14 00 00 00 mov $0x14,%edx
9: be 0a 00 00 00 mov $0xa,%esi
e: 48 8d 7c 24 08 lea 0x8(%rsp),%rdi
13: e8 00 00 00 00 callq 18 <callTuple()+0x18> // call returnValues
18: 8b 74 24 0c mov 0xc(%rsp),%esi
1c: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi
23: e8 00 00 00 00 callq 28 <callTuple()+0x28> // std::cout::operator<<
28: 8b 74 24 08 mov 0x8(%rsp),%esi
2c: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi
33: e8 00 00 00 00 callq 38 <callTuple()+0x38> // std::cout::operator<<
38: 48 83 c4 18 add $0x18,%rsp
3c: c3 retq
3d: 0f 1f 00 nopl (%rax)
0000000000000040 <callRef()>:
40: 48 83 ec 18 sub $0x18,%rsp
44: 48 8d 74 24 0c lea 0xc(%rsp),%rsi
49: 48 8d 7c 24 08 lea 0x8(%rsp),%rdi
4e: c7 44 24 08 0a 00 00 movl $0xa,0x8(%rsp)
55: 00
56: c7 44 24 0c 14 00 00 movl $0x14,0xc(%rsp)
5d: 00
5e: e8 00 00 00 00 callq 63 <callRef()+0x23> // call returnValuesVoid
63: 8b 74 24 08 mov 0x8(%rsp),%esi
67: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi
6e: e8 00 00 00 00 callq 73 <callRef()+0x33> // std::cout::operator<<
73: 8b 74 24 0c mov 0xc(%rsp),%esi
77: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi
7e: e8 00 00 00 00 callq 83 <callRef()+0x43> // std::cout::operator<<
83: 48 83 c4 18 add $0x18,%rsp
87: c3 retq
I don't think there is any considerable performance difference, but the tuple version is clearer and more readable.
I also tried with the calls inlined, and there is no difference at all: both versions generate exactly the same assembly code.
0000000000000000 <callTuple()>:
0: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi
7: 48 83 ec 08 sub $0x8,%rsp
b: be 6e 00 00 00 mov $0x6e,%esi
10: e8 00 00 00 00 callq 15 <callTuple()+0x15>
15: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi
1c: be 78 00 00 00 mov $0x78,%esi
21: 48 83 c4 08 add $0x8,%rsp
25: e9 00 00 00 00 jmpq 2a <callTuple()+0x2a> // TCO, optimized way to call a function and also return
2a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
0000000000000030 <callRef()>:
30: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi
37: 48 83 ec 08 sub $0x8,%rsp
3b: be 6e 00 00 00 mov $0x6e,%esi
40: e8 00 00 00 00 callq 45 <callRef()+0x15>
45: 48 8d 3d 00 00 00 00 lea 0x0(%rip),%rdi
4c: be 78 00 00 00 mov $0x78,%esi
51: 48 83 c4 08 add $0x8,%rsp
55: e9 00 00 00 00 jmpq 5a <callRef()+0x2a> // TCO, optimized way to call a function and also return

Focus on what is more readable and which approach gives the reader a better intuition, and keep any performance concerns you may have in the background.
A function that returns a tuple (or a pair, a struct, etc.) makes it obvious that the function returns something, and that something almost always has a meaning the user can take into account.
A function that gives back its results in variables passed by reference may escape the attention of a tired reader.
So, in general, prefer to return the results in a tuple.
Mike van Dyke pointed to this link:
F.21: To return multiple "out" values, prefer returning a tuple or struct
Reason
A return value is self-documenting as an "output-only"
value. Note that C++ does have multiple return values, by convention
of using a tuple (including pair), possibly with the extra convenience
of tie at the call site.
[...]
Exception
Sometimes, we need to pass an object to a function to manipulate its state. In such cases, passing the object by reference T& is usually the right technique.

Using another compiler (VS 2017) the resulting code shows no difference, as the function calls are just optimized away.
int main() {
00007FF6A9C51E50 sub rsp,28h
auto [x,y] = returnValues(10,20);
std::cout << x ;
00007FF6A9C51E54 mov edx,0Ah
00007FF6A9C51E59 call std::basic_ostream<char,std::char_traits<char> >::operator<< (07FF6A9C51F60h)
std::cout << y ;
00007FF6A9C51E5E mov edx,14h
00007FF6A9C51E63 call std::basic_ostream<char,std::char_traits<char> >::operator<< (07FF6A9C51F60h)
int a = 10, b = 20;
returnValuesVoid(a, b);
std::cout << a ;
00007FF6A9C51E68 mov edx,6Eh
00007FF6A9C51E6D call std::basic_ostream<char,std::char_traits<char> >::operator<< (07FF6A9C51F60h)
std::cout << b ;
00007FF6A9C51E72 mov edx,78h
00007FF6A9C51E77 call std::basic_ostream<char,std::char_traits<char> >::operator<< (07FF6A9C51F60h)
}
00007FF6A9C51E7C xor eax,eax
00007FF6A9C51E7E add rsp,28h
00007FF6A9C51E82 ret
So using clearer code seems to be the obvious choice.

What Zang said is true, but it is not the whole story. I ran the code provided in the question with chrono to measure the time. I think the answer needs to be edited in light of what I observed.
For 1M iterations, the call by reference took 3 ms, while the call using std::tie combined with std::tuple took about 94 ms.
Though the difference is small in practice, the tuple version still performs slower. Hence, for performance-intensive systems, I suggest using call by reference.
My code:
#include <iostream>
#include <tuple>
#include <chrono>
// Packs the two arguments, unchanged, into a tuple and returns it by value.
std::tuple<int, int> returnValues(const int a, const int b)
{
return std::tuple<int, int>(a, b);
}
// Adds 100 to each argument in place through the references; returns nothing.
void returnValuesVoid(int &a, int &b)
{
a += 100;
b += 100;
}
// Times one million calls of each variant and prints the elapsed time of each
// loop in milliseconds, one value per line (tuple variant first).
// NOTE(review): the two loops do not perform the same work. returnValues only
// copies its arguments into a tuple, while returnValuesVoid adds 100 to each
// variable, so the two figures are not a like-for-like comparison.
int main()
{
int a = 10, b = 20;
auto begin = std::chrono::high_resolution_clock::now();
// x and y are only ever written by std::tie; they are never read afterwards.
int x, y;
for (int i = 0; i < 1000000; i++)
{
std::tie(x, y) = returnValues(a, b);
}
auto end = std::chrono::high_resolution_clock::now();
// duration_cast to milliseconds truncates, so sub-millisecond detail is lost.
std::cout << double(std::chrono::duration_cast<std::chrono::milliseconds>(end - begin).count()) << '\n';
// Reset the inputs before timing the by-reference variant.
a = 10;
b = 20;
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 1000000; i++)
{
returnValuesVoid(a, b);
}
auto stop = std::chrono::high_resolution_clock::now();
std::cout << double(std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count()) << '\n';
}

Hardware supported popcount for dynamic bitset in Boost library

How to enable the hardware supported popcount for counting set bits in the dynamic bitset from the Boost 1.64.0 library?

#include <boost/dynamic_bitset.hpp>
#include <boost/function_output_iterator.hpp>
#include <cstddef>
// Counts the set bits of p by summing a popcount over its storage blocks.
// to_block_range writes each block of the bitset to the output iterator; the
// function output iterator forwards every block to the lambda below.
std::size_t fn(boost::dynamic_bitset<> const & p)
{
std::size_t acc = 0;
boost::to_block_range(p, boost::make_function_output_iterator(
[&acc](boost::dynamic_bitset<>::block_type v)
{
// GCC/Clang builtin taking unsigned long long.
// NOTE(review): assumes block_type is no wider than unsigned long long.
acc += __builtin_popcountll(v);
}
));
return acc;
}
Compiles to (g++ -O3 -march=native -c bitset.cpp -std=c++14):
30: 48 8b 77 08 mov 0x8(%rdi),%rsi
34: 48 8b 17 mov (%rdi),%rdx
37: 48 89 f0 mov %rsi,%rax
3a: 48 29 d0 sub %rdx,%rax
3d: 48 83 f8 07 cmp $0x7,%rax
41: b8 00 00 00 00 mov $0x0,%eax
46: 7e 1d jle 65 <_Z3fn3RKN5boost14dynamic_bitsetImSaImEEE+0x35>
48: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
4f: 00
50: 31 c9 xor %ecx,%ecx
52: 48 83 c2 08 add $0x8,%rdx
56: f3 48 0f b8 4a f8 popcnt -0x8(%rdx),%rcx
5c: 48 01 c8 add %rcx,%rax
5f: 48 39 d6 cmp %rdx,%rsi
62: 75 ec jne 50 <_Z3fn3RKN5boost14dynamic_bitsetImSaImEEE+0x20>
64: c3 retq
65: c3 retq
66: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
6d: 00 00 00

clang generated code for a simple factorial function

Here is the code :
// Recursive factorial: returns 1 for n == 0, otherwise n * factorial(n - 1).
// The arithmetic is done in unsigned int, so once the true value no longer
// fits, the result wraps modulo 2^N (N = bit width of unsigned int).
unsigned int factorial(unsigned int n) {
if (n == 0) return 1;
return n * factorial(n - 1);
}
// Returns factorial(0), i.e. 1, as the process exit status.
int main() {
return factorial(0);
}
I generate both gcc and clang assemblies using
g++ -g -O2 -c factorial.cpp (resp clang++)
objdump -d -M intel -S factorial.o
Here is what I get for gcc
factorial.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z9factorialj>:
unsigned int factorial(unsigned int n)
{
if (n == 0)
0: 85 ff test edi,edi
2: b8 01 00 00 00 mov eax,0x1
7: 74 11 je 1a <_Z9factorialj+0x1a>
9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
10: 0f af c7 imul eax,edi
13: 83 ef 01 sub edi,0x1
16: 75 f8 jne 10 <_Z9factorialj+0x10>
18: f3 c3 repz ret
{
return 1;
}
return n * factorial(n - 1);
}
1a: f3 c3 repz ret
Disassembly of section .text.startup:
0000000000000000 <main>:
if (n == 0)
0: b8 01 00 00 00 mov eax,0x1
5: c3 ret
and for clang
factorial.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z9factorialj>:
unsigned int factorial(unsigned int n)
{
0: b8 01 00 00 00 mov eax,0x1
if (n == 0)
5: 85 ff test edi,edi
7: 0f 84 ba 01 00 00 je 1c7 <_Z9factorialj+0x1c7>
d: b8 01 00 00 00 mov eax,0x1
{
return 1;
}
return n * factorial(n - 1);
12: 83 ff 08 cmp edi,0x8
15: 0f 82 a5 01 00 00 jb 1c0 <_Z9factorialj+0x1c0>
1b: 41 89 f8 mov r8d,edi
1e: 41 83 e0 f8 and r8d,0xfffffff8
22: 89 fa mov edx,edi
24: 83 e2 f8 and edx,0xfffffff8
27: 0f 84 93 01 00 00 je 1c0 <_Z9factorialj+0x1c0>
2d: 8d 4f f8 lea ecx,[rdi-0x8]
30: 89 c8 mov eax,ecx
32: c1 e8 03 shr eax,0x3
35: 0f ba e1 03 bt ecx,0x3
39: 72 24 jb 5f <_Z9factorialj+0x5f>
3b: 66 0f 6e c7 movd xmm0,edi
3f: 66 0f 70 c8 00 pshufd xmm1,xmm0,0x0
44: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4c <_Z9factorialj+0x4c>
4b: 00
4c: 66 0f fe c1 paddd xmm0,xmm1
50: 66 0f fe 0d 00 00 00 paddd xmm1,XMMWORD PTR [rip+0x0] # 58 <_Z9factorialj+0x58>
57: 00
58: b9 08 00 00 00 mov ecx,0x8
5d: eb 0e jmp 6d <_Z9factorialj+0x6d>
5f: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 67 <_Z9factorialj+0x67>
66: 00
67: 31 c9 xor ecx,ecx
69: 66 0f 6f c8 movdqa xmm1,xmm0
6d: 85 c0 test eax,eax
6f: 0f 84 d4 00 00 00 je 149 <_Z9factorialj+0x149>
75: 89 d0 mov eax,edx
77: 29 c8 sub eax,ecx
79: 89 fe mov esi,edi
7b: 29 ce sub esi,ecx
7d: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] # 85 <_Z9factorialj+0x85>
84: 00
85: 66 0f 6f 1d 00 00 00 movdqa xmm3,XMMWORD PTR [rip+0x0] # 8d <_Z9factorialj+0x8d>
8c: 00
8d: 0f 1f 00 nop DWORD PTR [rax]
90: 66 0f 6e e6 movd xmm4,esi
94: 66 0f 70 e4 00 pshufd xmm4,xmm4,0x0
99: 66 0f 6f ec movdqa xmm5,xmm4
9d: 66 0f fe ea paddd xmm5,xmm2
a1: 66 0f fe e3 paddd xmm4,xmm3
a5: 66 0f 70 f5 f5 pshufd xmm6,xmm5,0xf5
aa: 66 0f f4 e8 pmuludq xmm5,xmm0
ae: 66 0f 70 ed e8 pshufd xmm5,xmm5,0xe8
b3: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
b8: 66 0f f4 c6 pmuludq xmm0,xmm6
bc: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
c1: 66 0f 62 e8 punpckldq xmm5,xmm0
c5: 66 0f 70 c4 f5 pshufd xmm0,xmm4,0xf5
ca: 66 0f f4 e1 pmuludq xmm4,xmm1
ce: 66 0f 70 e4 e8 pshufd xmm4,xmm4,0xe8
d3: 66 0f 70 c9 f5 pshufd xmm1,xmm1,0xf5
d8: 66 0f f4 c8 pmuludq xmm1,xmm0
dc: 66 0f 70 c1 e8 pshufd xmm0,xmm1,0xe8
e1: 66 0f 62 e0 punpckldq xmm4,xmm0
e5: 8d 4e f8 lea ecx,[rsi-0x8]
e8: 66 0f 6e c1 movd xmm0,ecx
ec: 66 0f 70 c8 00 pshufd xmm1,xmm0,0x0
f1: 66 0f 6f c1 movdqa xmm0,xmm1
f5: 66 0f fe c2 paddd xmm0,xmm2
f9: 66 0f fe cb paddd xmm1,xmm3
fd: 66 0f 70 f0 f5 pshufd xmm6,xmm0,0xf5
102: 66 0f f4 c5 pmuludq xmm0,xmm5
106: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
10b: 66 0f 70 ed f5 pshufd xmm5,xmm5,0xf5
110: 66 0f f4 ee pmuludq xmm5,xmm6
114: 66 0f 70 ed e8 pshufd xmm5,xmm5,0xe8
119: 66 0f 62 c5 punpckldq xmm0,xmm5
11d: 66 0f 70 e9 f5 pshufd xmm5,xmm1,0xf5
122: 66 0f f4 cc pmuludq xmm1,xmm4
126: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
12b: 66 0f 70 e4 f5 pshufd xmm4,xmm4,0xf5
130: 66 0f f4 e5 pmuludq xmm4,xmm5
134: 66 0f 70 e4 e8 pshufd xmm4,xmm4,0xe8
139: 66 0f 62 cc punpckldq xmm1,xmm4
13d: 83 c6 f0 add esi,0xfffffff0
140: 83 c0 f0 add eax,0xfffffff0
143: 0f 85 47 ff ff ff jne 90 <_Z9factorialj+0x90>
149: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
14e: 66 0f f4 c8 pmuludq xmm1,xmm0
152: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
157: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
15c: 66 0f f4 c2 pmuludq xmm0,xmm2
160: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
165: 66 0f 62 c8 punpckldq xmm1,xmm0
169: 66 0f 70 c1 4e pshufd xmm0,xmm1,0x4e
16e: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
173: 66 0f f4 c8 pmuludq xmm1,xmm0
177: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
17c: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
181: 66 0f f4 c2 pmuludq xmm0,xmm2
185: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
18a: 66 0f 62 c8 punpckldq xmm1,xmm0
18e: 66 0f 70 c1 e5 pshufd xmm0,xmm1,0xe5
193: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
198: 66 0f f4 c8 pmuludq xmm1,xmm0
19c: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
1a1: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
1a6: 66 0f f4 c2 pmuludq xmm0,xmm2
1aa: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
1af: 66 0f 62 c8 punpckldq xmm1,xmm0
1b3: 66 0f 7e c8 movd eax,xmm1
1b7: 39 fa cmp edx,edi
1b9: 74 0c je 1c7 <_Z9factorialj+0x1c7>
1bb: 44 29 c7 sub edi,r8d
1be: 66 90 xchg ax,ax
1c0: 0f af c7 imul eax,edi
if (n == 0)
1c3: ff cf dec edi
1c5: 75 f9 jne 1c0 <_Z9factorialj+0x1c0>
}
1c7: c3 ret
1c8: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
1cf: 00
00000000000001d0 <main>:
int main()
{
return factorial(0);
1d0: b8 01 00 00 00 mov eax,0x1
1d5: c3 ret
I understand they both notice they can unroll the loop and precompute the value but I don't get why clang generates all this code (and I have no idea what this can be ever doing and why there is sse stuff...)
Bonus question: while gcc precomputes the value up to factorial(7), clang computes it for any value.
Up to factorial(31) the values look fine, for factorial(32) and factorial(33) it returns 0x80000000 and for greater values it does xor eax, eax instead. What is this black magic ?

What are these seemingly-useless callq instructions in my x86 object files for?

I have some template-heavy C++ code that I want to ensure the compiler optimizes as much as possible due to the large amount of information it has at compile time. To evaluate its performance, I decided to take a look at the disassembly of the object file that it generates. Below is a snippet of what I got from objdump -dC:
0000000000000000 <bar<foo, 0u>::get(bool)>:
0: 41 57 push %r15
2: 49 89 f7 mov %rsi,%r15
5: 41 56 push %r14
7: 41 55 push %r13
9: 41 54 push %r12
b: 55 push %rbp
c: 53 push %rbx
d: 48 81 ec 68 02 00 00 sub $0x268,%rsp
14: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
19: 48 89 f7 mov %rsi,%rdi
1c: 89 54 24 1c mov %edx,0x1c(%rsp)
20: e8 00 00 00 00 callq 25 <bar<foo, 0u>::get(bool)+0x25>
25: 84 c0 test %al,%al
27: 0f 85 eb 00 00 00 jne 118 <bar<foo, 0u>::get(bool)+0x118>
2d: 48 c7 44 24 08 00 00 movq $0x0,0x8(%rsp)
34: 00 00
36: 4c 89 ff mov %r15,%rdi
39: 4d 8d b7 30 01 00 00 lea 0x130(%r15),%r14
40: e8 00 00 00 00 callq 45 <bar<foo, 0u>::get(bool)+0x45>
45: 84 c0 test %al,%al
47: 88 44 24 1b mov %al,0x1b(%rsp)
4b: 0f 85 ef 00 00 00 jne 140 <bar<foo, 0u>::get(bool)+0x140>
51: 80 7c 24 1c 00 cmpb $0x0,0x1c(%rsp)
56: 0f 85 24 03 00 00 jne 380 <bar<foo, 0u>::get(bool)+0x380>
5c: 48 8b 44 24 10 mov 0x10(%rsp),%rax
61: c6 00 00 movb $0x0,(%rax)
64: 80 7c 24 1b 00 cmpb $0x0,0x1b(%rsp)
69: 75 25 jne 90 <bar<foo, 0u>::get(bool)+0x90>
6b: 48 8b 74 24 10 mov 0x10(%rsp),%rsi
70: 4c 89 ff mov %r15,%rdi
73: e8 00 00 00 00 callq 78 <bar<foo, 0u>::get(bool)+0x78>
78: 48 8b 44 24 10 mov 0x10(%rsp),%rax
7d: 48 81 c4 68 02 00 00 add $0x268,%rsp
84: 5b pop %rbx
85: 5d pop %rbp
86: 41 5c pop %r12
88: 41 5d pop %r13
8a: 41 5e pop %r14
8c: 41 5f pop %r15
8e: c3 retq
8f: 90 nop
90: 4c 89 f7 mov %r14,%rdi
93: e8 00 00 00 00 callq 98 <bar<foo, 0u>::get(bool)+0x98>
98: 83 f8 04 cmp $0x4,%eax
9b: 74 f3 je 90 <bar<foo, 0u>::get(bool)+0x90>
9d: 85 c0 test %eax,%eax
9f: 0f 85 e4 08 00 00 jne 989 <bar<foo, 0u>::get(bool)+0x989>
a5: 49 83 87 b0 01 00 00 addq $0x1,0x1b0(%r15)
ac: 01
ad: 49 8d 9f 58 01 00 00 lea 0x158(%r15),%rbx
b4: 48 89 df mov %rbx,%rdi
b7: e8 00 00 00 00 callq bc <bar<foo, 0u>::get(bool)+0xbc>
bc: 49 8d bf 80 01 00 00 lea 0x180(%r15),%rdi
c3: e8 00 00 00 00 callq c8 <bar<foo, 0u>::get(bool)+0xc8>
c8: 48 89 df mov %rbx,%rdi
cb: e8 00 00 00 00 callq d0 <bar<foo, 0u>::get(bool)+0xd0>
d0: 4c 89 f7 mov %r14,%rdi
d3: e8 00 00 00 00 callq d8 <bar<foo, 0u>::get(bool)+0xd8>
d8: 83 f8 04 cmp $0x4,%eax
The disassembly of this particular function continues on, but one thing I noticed is the relatively large number of call instructions like this one:
20: e8 00 00 00 00 callq 25 <bar<foo, 0u>::get(bool)+0x25>
These instructions, always with the opcode e8 00 00 00 00, occur frequently throughout the generated code, and from what I can tell, are nothing more than no-ops; they all seem to just fall through to the next instruction. This begs the question, then, is there a good reason why all these instructions are generated?
I'm concerned about the instruction cache footprint of the generated code, so wasting 5 bytes many times throughout a function seems counterproductive. It seems a bit heavyweight for a nop, unless the compiler is trying to preserve some kind of memory alignment or something. I wouldn't be surprised if this were the case.
I compiled my code using g++ 4.8.5 using -O3 -fomit-frame-pointer. For what it's worth, I saw similar code generation using clang 3.7.

The 00 00 00 00 (relative) target address in e8 00 00 00 00 is intended to be filled in by the linker. It doesn't mean that the call falls through. It just means you are disassembling an object file that has not been linked yet.
Also, a call to the next instruction, if that was the end result after the link phase, would not be a no-op, because it changes the stack (a certain hint that this is not what is going on in your case).

SSE2 movddup Not Moving Values

Can someone explain to me why the output of this program is [nan, nan]? The code is supposed to load the value of d into the high and low 64 bits of the XMM1 register and then move the contents of XMM1 into a. Because a is not initialized to a set of specific values, D initializes each element to nan. If the movupd instruction were not in the objdump, I would understand the result, but the instruction is there. Thoughts?
import std.stdio;
// Tries to fill a two-element double array with the constant d using SSE
// inline assembly, then prints the array.
void main()
{
// Compile-time constant: 1.0 / 2 = 0.5.
enum double d = 1.0 / cast(double)2;
double[] a = new double[2];
// Pointer to the array's first element, used as the store address below.
auto aptr = a.ptr;
// movddup: load d into both 64-bit halves of XMM1.
// movupd: intended to store all 128 bits of XMM1 to the memory aptr points at.
asm
{
movddup XMM1, d;
movupd [aptr], XMM1;
}
writeln(a);
}
Here is the objdump of the main function:
0000000000426b88 <_Dmain>:
426b88: 55 push %rbp
426b89: 48 8b ec mov %rsp,%rbp
426b8c: 48 83 ec 50 sub $0x50,%rsp
426b90: f2 48 0f 10 05 77 81 rex.W movsd 0x28177(%rip),%xmm0
426b97: 02 00
426b99: f2 48 0f 11 45 b0 rex.W movsd %xmm0,-0x50(%rbp)
426b9f: 48 be 02 00 00 00 00 movabs $0x2,%rsi
426ba6: 00 00 00
426ba9: f2 48 0f 10 05 66 81 rex.W movsd 0x28166(%rip),%xmm0
426bb0: 02 00
426bb2: 48 8d 7d c0 lea -0x40(%rbp),%rdi
426bb6: e8 65 d1 00 00 callq 433d20 <_memsetDouble>
426bbb: f2 48 0f 10 0d 4c 81 rex.W movsd 0x2814c(%rip),%xmm1
426bc2: 02 00
426bc4: f2 48 0f 11 4d c0 rex.W movsd %xmm1,-0x40(%rbp)
426bca: f2 48 0f 10 15 3d 81 rex.W movsd 0x2813d(%rip),%xmm2
426bd1: 02 00
426bd3: f2 48 0f 11 55 c8 rex.W movsd %xmm2,-0x38(%rbp)
426bd9: 48 8d 45 c0 lea -0x40(%rbp),%rax
426bdd: 48 89 45 d0 mov %rax,-0x30(%rbp)
426be1: 48 8d 55 e0 lea -0x20(%rbp),%rdx
426be5: 48 b8 02 00 00 00 00 movabs $0x2,%rax
426bec: 00 00 00
426bef: 48 89 c1 mov %rax,%rcx
426bf2: 49 89 d0 mov %rdx,%r8
426bf5: 51 push %rcx
426bf6: 41 50 push %r8
426bf8: 48 be 02 00 00 00 00 movabs $0x2,%rsi
426bff: 00 00 00
426c02: 48 bf c0 84 65 00 00 movabs $0x6584c0,%rdi
426c09: 00 00 00
426c0c: e8 87 ce 00 00 callq 433a98 <_d_arrayliteralTX>
426c11: 48 89 45 f0 mov %rax,-0x10(%rbp)
426c15: f2 48 0f 10 05 02 81 rex.W movsd 0x28102(%rip),%xmm0
426c1c: 02 00
426c1e: f2 48 0f 11 00 rex.W movsd %xmm0,(%rax)
426c23: f2 48 0f 10 0d f4 80 rex.W movsd 0x280f4(%rip),%xmm1
426c2a: 02 00
426c2c: 48 8b 45 f0 mov -0x10(%rbp),%rax
426c30: f2 48 0f 11 48 08 rex.W movsd %xmm1,0x8(%rax)
426c36: 48 8b 55 f0 mov -0x10(%rbp),%rdx
426c3a: 48 be 02 00 00 00 00 movabs $0x2,%rsi
426c41: 00 00 00
426c44: 41 58 pop %r8
426c46: 59 pop %rcx
426c47: 48 bf 08 00 00 00 00 movabs $0x8,%rdi
426c4e: 00 00 00
426c51: e8 8e 95 00 00 callq 4301e4 <_d_arraycopy>
426c56: f2 0f 12 4d b0 movddup -0x50(%rbp),%xmm1
426c5b: 66 0f 11 4d d0 movupd %xmm1,-0x30(%rbp)
426c60: ff 75 c8 pushq -0x38(%rbp)
426c63: ff 75 c0 pushq -0x40(%rbp)
426c66: e8 09 00 00 00 callq 426c74 <_D3std5stdio16__T7writelnTG2dZ7writelnFG2dZv>
426c6b: 48 83 c4 10 add $0x10,%rsp
426c6f: 31 c0 xor %eax,%eax
426c71: c9 leaveq
426c72: c3 retq
426c73: 90 nop

I looked into it, and apparently the compiler decides that by movupd [aptr], XMM1 you really mean movupd aptr, XMM1. Loading aptr into a register beforehand (mov aptr, RAX; movupd [RAX], XMM1) will make it work.
You should probably file a bug report.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Deciphering RIP-relative LEA instructions as part of a switch statement [duplicate] - c++

Related

Which is better: returning tuple or passing arguments to function as references?

Hardware supported popcount for dynamic bitset in Boost library

clang generated code for a simple factorial function

What are these seemingly-useless callq instructions in my x86 object files for?

SSE2 movddup Not Moving Values

Categories

Resources