Hardware supported popcount for dynamic bitset in Boost library - c++

How to enable the hardware supported popcount for counting set bits in the dynamic bitset from the Boost 1.64.0 library?

#include <boost/dynamic_bitset.hpp>
#include <boost/function_output_iterator.hpp>
#include <cstddef>
// Returns the number of set bits in `p`.
// The bitset's blocks are handed to a function output iterator whose lambda
// adds the population count of each block to `acc`.
std::size_t fn(boost::dynamic_bitset<> const & p)
{
std::size_t acc = 0;
boost::to_block_range(p, boost::make_function_output_iterator(
[&acc](boost::dynamic_bitset<>::block_type v)
{
// Compiler builtin (GCC/Clang); in the -march=native listing that follows
// this snippet it appears as a single popcnt instruction.
acc += __builtin_popcountll(v);
}
));
return acc;
}
Compiles to (g++ -O3 -march=native -c bitset.cpp -std=c++14):
30: 48 8b 77 08 mov 0x8(%rdi),%rsi
34: 48 8b 17 mov (%rdi),%rdx
37: 48 89 f0 mov %rsi,%rax
3a: 48 29 d0 sub %rdx,%rax
3d: 48 83 f8 07 cmp $0x7,%rax
41: b8 00 00 00 00 mov $0x0,%eax
46: 7e 1d jle 65 <_Z3fn3RKN5boost14dynamic_bitsetImSaImEEE+0x35>
48: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
4f: 00
50: 31 c9 xor %ecx,%ecx
52: 48 83 c2 08 add $0x8,%rdx
56: f3 48 0f b8 4a f8 popcnt -0x8(%rdx),%rcx
5c: 48 01 c8 add %rcx,%rax
5f: 48 39 d6 cmp %rdx,%rsi
62: 75 ec jne 50 <_Z3fn3RKN5boost14dynamic_bitsetImSaImEEE+0x20>
64: c3 retq
65: c3 retq
66: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
6d: 00 00 00

Related

Why is g++ using jle instead of jbe for two unsigned numbers?

In a case where I get this error:
error: assuming signed overflow does not occur when simplifying conditional
I looked at the assembly and the if() uses:
d34: 48 83 fa 01 cmp $0x1,%rdx
d38: 7e 54 jle d8e <main+0x3ae>
Interestingly enough, %rdx is defined as an unsigned (std::size_t) and the number $0x1 is also defined as an unsigned (2UL in the original). So why would g++ decide to use jle instead of jbe?
Note: Just in case, I tried with if(colons > 1UL) ... and that did not help. Same error, same results in assembly.
C++ code to reproduce the error:
#include <algorithm>
#include <string>
#include <iostream>
// Minimal reproducer: counts the ':' characters in the first command-line
// argument and reports on std::cerr whether there are at least two.
// NOTE(review): argv[1] is read without checking argc, so the program must be
// run with at least one argument.
int main(int argc, char * argv[])
{
std::string const in(argv[1]);
// std::count's result is stored in an unsigned std::size_t here.
std::size_t const colons(std::count(in.begin(), in.end(), ':'));
// This unsigned comparison is the one the question is about.
if(colons >= 2UL)
{
std::cerr << "2 or more colons...\n";
}
else
{
std::cerr << "no or just one colon.\n";
}
return 0;
}
Command line used to reproduce the error:
g++ -Werror=strict-overflow -std=c++17 -O3 -o a a.cpp
To compile anyway, just don't use the -Werror=strict-overflow option.
The complete result (this is a bit of a killer since the std::count() gets overly optimized for speed):
00000000000009e0 <main>:
9e0: 41 55 push %r13
9e2: 41 54 push %r12
9e4: 55 push %rbp
9e5: 53 push %rbx
9e6: 48 83 ec 38 sub $0x38,%rsp
9ea: 4c 8b 66 08 mov 0x8(%rsi),%r12
9ee: 48 89 e3 mov %rsp,%rbx
9f1: 4c 8d 6b 10 lea 0x10(%rbx),%r13
9f5: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
9fc: 00 00
9fe: 48 89 44 24 28 mov %rax,0x28(%rsp)
a03: 31 c0 xor %eax,%eax
a05: 4d 85 e4 test %r12,%r12
a08: 4c 89 2c 24 mov %r13,(%rsp)
a0c: 0f 84 e8 03 00 00 je dfa <main+0x41a>
a12: 4c 89 e7 mov %r12,%rdi
a15: e8 16 ff ff ff callq 930 <strlen#plt>
a1a: 48 83 f8 0f cmp $0xf,%rax
a1e: 48 89 c5 mov %rax,%rbp
a21: 0f 87 7c 03 00 00 ja da3 <main+0x3c3>
a27: 48 83 f8 01 cmp $0x1,%rax
a2b: 0f 84 4f 03 00 00 je d80 <main+0x3a0>
a31: 48 85 c0 test %rax,%rax
a34: 0f 85 cc 03 00 00 jne e06 <main+0x426>
a3a: 48 8b 04 24 mov (%rsp),%rax
a3e: 48 89 6c 24 08 mov %rbp,0x8(%rsp)
a43: c6 04 28 00 movb $0x0,(%rax,%rbp,1)
a47: 48 8b 04 24 mov (%rsp),%rax
a4b: 48 8b 54 24 08 mov 0x8(%rsp),%rdx
a50: 48 8d 34 10 lea (%rax,%rdx,1),%rsi
a54: 48 39 f0 cmp %rsi,%rax
a57: 0f 84 31 03 00 00 je d8e <main+0x3ae>
a5d: 48 89 c1 mov %rax,%rcx
a60: 49 89 f1 mov %rsi,%r9
a63: 48 83 ea 01 sub $0x1,%rdx
a67: 48 f7 d9 neg %rcx
a6a: 49 29 c1 sub %rax,%r9
a6d: 41 ba 12 00 00 00 mov $0x12,%r10d
a73: 83 e1 0f and $0xf,%ecx
a76: 48 8d 78 01 lea 0x1(%rax),%rdi
a7a: 4c 8d 41 0f lea 0xf(%rcx),%r8
a7e: 49 83 f8 12 cmp $0x12,%r8
a82: 4d 0f 42 c2 cmovb %r10,%r8
a86: 4c 39 c2 cmp %r8,%rdx
a89: 0f 82 4b 03 00 00 jb dda <main+0x3fa>
a8f: 48 85 c9 test %rcx,%rcx
a92: 0f 84 52 03 00 00 je dea <main+0x40a>
a98: 45 31 d2 xor %r10d,%r10d
a9b: 80 38 3a cmpb $0x3a,(%rax)
a9e: 41 0f 94 c2 sete %r10b
aa2: 48 83 f9 01 cmp $0x1,%rcx
aa6: 0f 84 34 01 00 00 je be0 <main+0x200>
aac: 80 78 01 3a cmpb $0x3a,0x1(%rax)
ab0: 75 04 jne ab6 <main+0xd6>
ab2: 49 83 c2 01 add $0x1,%r10
ab6: 48 83 f9 02 cmp $0x2,%rcx
aba: 48 8d 78 02 lea 0x2(%rax),%rdi
abe: 0f 84 1c 01 00 00 je be0 <main+0x200>
ac4: 80 78 02 3a cmpb $0x3a,0x2(%rax)
ac8: 75 04 jne ace <main+0xee>
aca: 49 83 c2 01 add $0x1,%r10
ace: 48 83 f9 03 cmp $0x3,%rcx
ad2: 48 8d 78 03 lea 0x3(%rax),%rdi
ad6: 0f 84 04 01 00 00 je be0 <main+0x200>
adc: 80 78 03 3a cmpb $0x3a,0x3(%rax)
ae0: 75 04 jne ae6 <main+0x106>
ae2: 49 83 c2 01 add $0x1,%r10
ae6: 48 83 f9 04 cmp $0x4,%rcx
aea: 48 8d 78 04 lea 0x4(%rax),%rdi
aee: 0f 84 ec 00 00 00 je be0 <main+0x200>
af4: 80 78 04 3a cmpb $0x3a,0x4(%rax)
af8: 75 04 jne afe <main+0x11e>
afa: 49 83 c2 01 add $0x1,%r10
afe: 48 83 f9 05 cmp $0x5,%rcx
b02: 48 8d 78 05 lea 0x5(%rax),%rdi
b06: 0f 84 d4 00 00 00 je be0 <main+0x200>
b0c: 80 78 05 3a cmpb $0x3a,0x5(%rax)
b10: 75 04 jne b16 <main+0x136>
b12: 49 83 c2 01 add $0x1,%r10
b16: 48 83 f9 06 cmp $0x6,%rcx
b1a: 48 8d 78 06 lea 0x6(%rax),%rdi
b1e: 0f 84 bc 00 00 00 je be0 <main+0x200>
b24: 80 78 06 3a cmpb $0x3a,0x6(%rax)
b28: 0f 84 9a 02 00 00 je dc8 <main+0x3e8>
b2e: 48 83 f9 07 cmp $0x7,%rcx
b32: 48 8d 78 07 lea 0x7(%rax),%rdi
b36: 0f 84 a4 00 00 00 je be0 <main+0x200>
b3c: 80 78 07 3a cmpb $0x3a,0x7(%rax)
b40: 0f 84 8b 02 00 00 je dd1 <main+0x3f1>
b46: 48 83 f9 08 cmp $0x8,%rcx
b4a: 48 8d 78 08 lea 0x8(%rax),%rdi
b4e: 0f 84 8c 00 00 00 je be0 <main+0x200>
b54: 80 78 08 3a cmpb $0x3a,0x8(%rax)
b58: 75 04 jne b5e <main+0x17e>
b5a: 49 83 c2 01 add $0x1,%r10
b5e: 48 83 f9 09 cmp $0x9,%rcx
b62: 48 8d 78 09 lea 0x9(%rax),%rdi
b66: 74 78 je be0 <main+0x200>
b68: 80 78 09 3a cmpb $0x3a,0x9(%rax)
b6c: 75 04 jne b72 <main+0x192>
b6e: 49 83 c2 01 add $0x1,%r10
b72: 48 83 f9 0a cmp $0xa,%rcx
b76: 48 8d 78 0a lea 0xa(%rax),%rdi
b7a: 74 64 je be0 <main+0x200>
b7c: 80 78 0a 3a cmpb $0x3a,0xa(%rax)
b80: 75 04 jne b86 <main+0x1a6>
b82: 49 83 c2 01 add $0x1,%r10
b86: 48 83 f9 0b cmp $0xb,%rcx
b8a: 48 8d 78 0b lea 0xb(%rax),%rdi
b8e: 74 50 je be0 <main+0x200>
b90: 80 78 0b 3a cmpb $0x3a,0xb(%rax)
b94: 75 04 jne b9a <main+0x1ba>
b96: 49 83 c2 01 add $0x1,%r10
b9a: 48 83 f9 0c cmp $0xc,%rcx
b9e: 48 8d 78 0c lea 0xc(%rax),%rdi
ba2: 74 3c je be0 <main+0x200>
ba4: 80 78 0c 3a cmpb $0x3a,0xc(%rax)
ba8: 75 04 jne bae <main+0x1ce>
baa: 49 83 c2 01 add $0x1,%r10
bae: 48 83 f9 0d cmp $0xd,%rcx
bb2: 48 8d 78 0d lea 0xd(%rax),%rdi
bb6: 74 28 je be0 <main+0x200>
bb8: 80 78 0d 3a cmpb $0x3a,0xd(%rax)
bbc: 75 04 jne bc2 <main+0x1e2>
bbe: 49 83 c2 01 add $0x1,%r10
bc2: 48 83 f9 0f cmp $0xf,%rcx
bc6: 48 8d 78 0e lea 0xe(%rax),%rdi
bca: 75 14 jne be0 <main+0x200>
bcc: 80 78 0e 3a cmpb $0x3a,0xe(%rax)
bd0: 0f 84 0b 02 00 00 je de1 <main+0x401>
bd6: 48 8d 78 0f lea 0xf(%rax),%rdi
bda: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
be0: 49 29 c9 sub %rcx,%r9
be3: 66 45 0f ef c0 pxor %xmm8,%xmm8
be8: 66 0f ef e4 pxor %xmm4,%xmm4
bec: 4d 89 c8 mov %r9,%r8
bef: 66 0f ef db pxor %xmm3,%xmm3
bf3: 48 01 c8 add %rcx,%rax
bf6: 66 0f ef d2 pxor %xmm2,%xmm2
bfa: 49 c1 e8 04 shr $0x4,%r8
bfe: 66 0f 6f 35 5a 04 00 movdqa 0x45a(%rip),%xmm6 # 1060 <_IO_stdin_used+0x70>
c05: 00
c06: 31 c9 xor %ecx,%ecx
c08: 66 0f 6f 2d 60 04 00 movdqa 0x460(%rip),%xmm5 # 1070 <_IO_stdin_used+0x80>
c0f: 00
c10: 66 0f 6f cc movdqa %xmm4,%xmm1
c14: 66 44 0f 6f da movdqa %xmm2,%xmm11
c19: 66 0f 6f 00 movdqa (%rax),%xmm0
c1d: 48 83 c1 01 add $0x1,%rcx
c21: 48 83 c0 10 add $0x10,%rax
c25: 49 39 c8 cmp %rcx,%r8
c28: 66 0f 74 c6 pcmpeqb %xmm6,%xmm0
c2c: 66 0f db c5 pand %xmm5,%xmm0
c30: 66 0f 64 c8 pcmpgtb %xmm0,%xmm1
c34: 66 0f 6f f8 movdqa %xmm0,%xmm7
c38: 66 0f 60 f9 punpcklbw %xmm1,%xmm7
c3c: 66 0f 68 c1 punpckhbw %xmm1,%xmm0
c40: 66 0f 6f cb movdqa %xmm3,%xmm1
c44: 66 44 0f 6f d7 movdqa %xmm7,%xmm10
c49: 66 0f 65 cf pcmpgtw %xmm7,%xmm1
c4d: 66 44 0f 6f c8 movdqa %xmm0,%xmm9
c52: 66 44 0f 61 d1 punpcklwd %xmm1,%xmm10
c57: 66 0f 69 f9 punpckhwd %xmm1,%xmm7
c5b: 66 0f 6f cb movdqa %xmm3,%xmm1
c5f: 66 0f 65 c8 pcmpgtw %xmm0,%xmm1
c63: 66 45 0f 66 da pcmpgtd %xmm10,%xmm11
c68: 66 44 0f 61 c9 punpcklwd %xmm1,%xmm9
c6d: 66 0f 69 c1 punpckhwd %xmm1,%xmm0
c71: 66 41 0f 6f ca movdqa %xmm10,%xmm1
c76: 66 45 0f 6a d3 punpckhdq %xmm11,%xmm10
c7b: 66 41 0f 62 cb punpckldq %xmm11,%xmm1
c80: 66 41 0f d4 c8 paddq %xmm8,%xmm1
c85: 66 44 0f 6f c2 movdqa %xmm2,%xmm8
c8a: 66 41 0f d4 ca paddq %xmm10,%xmm1
c8f: 66 44 0f 6f d7 movdqa %xmm7,%xmm10
c94: 66 44 0f 66 c7 pcmpgtd %xmm7,%xmm8
c99: 66 41 0f 6a f8 punpckhdq %xmm8,%xmm7
c9e: 66 45 0f 62 d0 punpckldq %xmm8,%xmm10
ca3: 66 45 0f 6f c1 movdqa %xmm9,%xmm8
ca8: 66 41 0f d4 ca paddq %xmm10,%xmm1
cad: 66 0f d4 cf paddq %xmm7,%xmm1
cb1: 66 0f 6f fa movdqa %xmm2,%xmm7
cb5: 66 41 0f 66 f9 pcmpgtd %xmm9,%xmm7
cba: 66 44 0f 62 c7 punpckldq %xmm7,%xmm8
cbf: 66 44 0f 6a cf punpckhdq %xmm7,%xmm9
cc4: 66 0f 6f fa movdqa %xmm2,%xmm7
cc8: 66 41 0f d4 c8 paddq %xmm8,%xmm1
ccd: 66 0f 66 f8 pcmpgtd %xmm0,%xmm7
cd1: 66 44 0f 6f c0 movdqa %xmm0,%xmm8
cd6: 66 41 0f d4 c9 paddq %xmm9,%xmm1
cdb: 66 44 0f 62 c7 punpckldq %xmm7,%xmm8
ce0: 66 0f 6a c7 punpckhdq %xmm7,%xmm0
ce4: 66 41 0f d4 c8 paddq %xmm8,%xmm1
ce9: 66 0f d4 c8 paddq %xmm0,%xmm1
ced: 66 44 0f 6f c1 movdqa %xmm1,%xmm8
cf2: 0f 87 18 ff ff ff ja c10 <main+0x230>
cf8: 66 0f 73 d9 08 psrldq $0x8,%xmm1
cfd: 4c 89 c9 mov %r9,%rcx
d00: 66 41 0f d4 c8 paddq %xmm8,%xmm1
d05: 66 48 0f 7e ca movq %xmm1,%rdx
d0a: 48 83 e1 f0 and $0xfffffffffffffff0,%rcx
d0e: 48 8d 04 0f lea (%rdi,%rcx,1),%rax
d12: 4c 01 d2 add %r10,%rdx
d15: 49 39 c9 cmp %rcx,%r9
d18: 74 1a je d34 <main+0x354>
d1a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
d20: 31 c9 xor %ecx,%ecx
d22: 80 38 3a cmpb $0x3a,(%rax)
d25: 0f 94 c1 sete %cl
d28: 48 83 c0 01 add $0x1,%rax
d2c: 48 01 ca add %rcx,%rdx
d2f: 48 39 c6 cmp %rax,%rsi
d32: 75 ec jne d20 <main+0x340>
# Area of interest:
d34: 48 83 fa 01 cmp $0x1,%rdx
d38: 7e 54 jle d8e <main+0x3ae>
d3a: 48 8d 35 e7 02 00 00 lea 0x2e7(%rip),%rsi # 1028 <_IO_stdin_used+0x38>
d41: 48 8d 3d d8 12 20 00 lea 0x2012d8(%rip),%rdi # 202020 <_ZSt4cerr##GLIBCXX_3.4>
d48: e8 33 fc ff ff callq 980 <_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#plt>
d4d: 48 8b 3c 24 mov (%rsp),%rdi
d51: 48 83 c3 10 add $0x10,%rbx
d55: 48 39 df cmp %rbx,%rdi
d58: 74 05 je d5f <main+0x37f>
d5a: e8 11 fc ff ff callq 970 <_ZdlPv#plt>
d5f: 31 c0 xor %eax,%eax
d61: 48 8b 5c 24 28 mov 0x28(%rsp),%rbx
d66: 64 48 33 1c 25 28 00 xor %fs:0x28,%rbx
d6d: 00 00
d6f: 0f 85 80 00 00 00 jne df5 <main+0x415>
d75: 48 83 c4 38 add $0x38,%rsp
d79: 5b pop %rbx
d7a: 5d pop %rbp
d7b: 41 5c pop %r12
d7d: 41 5d pop %r13
d7f: c3 retq
d80: 41 0f b6 04 24 movzbl (%r12),%eax
d85: 88 44 24 10 mov %al,0x10(%rsp)
d89: e9 ac fc ff ff jmpq a3a <main+0x5a>
d8e: 48 8d 35 a8 02 00 00 lea 0x2a8(%rip),%rsi # 103d <_IO_stdin_used+0x4d>
d95: 48 8d 3d 84 12 20 00 lea 0x201284(%rip),%rdi # 202020 <_ZSt4cerr##GLIBCXX_3.4>
d9c: e8 df fb ff ff callq 980 <_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#plt>
da1: eb aa jmp d4d <main+0x36d>
da3: 48 8d 78 01 lea 0x1(%rax),%rdi
da7: e8 e4 fb ff ff callq 990 <_Znwm#plt>
dac: 48 89 6c 24 10 mov %rbp,0x10(%rsp)
db1: 48 89 04 24 mov %rax,(%rsp)
db5: 48 89 ea mov %rbp,%rdx
db8: 4c 89 e6 mov %r12,%rsi
dbb: 48 89 c7 mov %rax,%rdi
dbe: e8 8d fb ff ff callq 950 <memcpy#plt>
dc3: e9 72 fc ff ff jmpq a3a <main+0x5a>
dc8: 49 83 c2 01 add $0x1,%r10
dcc: e9 5d fd ff ff jmpq b2e <main+0x14e>
dd1: 49 83 c2 01 add $0x1,%r10
dd5: e9 6c fd ff ff jmpq b46 <main+0x166>
dda: 31 d2 xor %edx,%edx
ddc: e9 3f ff ff ff jmpq d20 <main+0x340>
de1: 49 83 c2 01 add $0x1,%r10
de5: e9 ec fd ff ff jmpq bd6 <main+0x1f6>
dea: 48 89 c7 mov %rax,%rdi
ded: 45 31 d2 xor %r10d,%r10d
df0: e9 eb fd ff ff jmpq be0 <main+0x200>
df5: e8 a6 fb ff ff callq 9a0 <__stack_chk_fail#plt>
dfa: 48 8d 3d f7 01 00 00 lea 0x1f7(%rip),%rdi # ff8 <_IO_stdin_used+0x8>
e01: e8 3a fb ff ff callq 940 <_ZSt19__throw_logic_errorPKc#plt>
e06: 4c 89 e8 mov %r13,%rax
e09: eb aa jmp db5 <main+0x3d5>
e0b: 48 8b 3c 24 mov (%rsp),%rdi
e0f: 48 83 c3 10 add $0x10,%rbx
e13: 48 89 c5 mov %rax,%rbp
e16: 48 39 df cmp %rbx,%rdi
e19: 74 05 je e20 <main+0x440>
e1b: e8 50 fb ff ff callq 970 <_ZdlPv#plt>
e20: 48 89 ef mov %rbp,%rdi
e23: e8 98 fb ff ff callq 9c0 <_Unwind_Resume#plt>
e28: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
e2f: 00
For those interested, you may fix the issue by using signed numbers as in:
#include <type_traits>
...
if(static_cast<std::make_signed_t<decltype(colons)>>(colons) >= 2LL)
...
or wrap the if() statement around #pragma like so:
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-overflow"
if(colons >= 2LL)
#pragma GCC diagnostic pop
But this is clearly not the question here.
std::count is defined as: https://github.com/gcc-mirror/gcc/blob/16e2427f50c208dfe07d07f18009969502c25dc8/libstdc%2B%2B-v3/include/bits/stl_algo.h#L4045
/**
* @brief Count the number of copies of a value in a sequence.
* @ingroup non_mutating_algorithms
* @param __first An input iterator.
* @param __last An input iterator.
* @param __value The value to be counted.
* @return The number of iterators @c i in the range @p [__first,__last)
* for which @c *i == @p __value
*
* The result has the iterator's @c difference_type (see the declared
* return type below).
*/
template<typename _InputIterator, typename _Tp>
_GLIBCXX20_CONSTEXPR
inline typename iterator_traits<_InputIterator>::difference_type
count(_InputIterator __first, _InputIterator __last, const _Tp& __value)
{
// concept requirements
__glibcxx_function_requires(_InputIteratorConcept<_InputIterator>)
__glibcxx_function_requires(_EqualOpConcept<
typename iterator_traits<_InputIterator>::value_type, _Tp>)
__glibcxx_requires_valid_range(__first, __last);
// Delegate to __count_if with the predicate that __iter_equals_val builds
// from __value.
return std::__count_if(__first, __last,
__gnu_cxx::__ops::__iter_equals_val(__value));
}
Then https://github.com/gcc-mirror/gcc/blob/16e2427f50c208dfe07d07f18009969502c25dc8/libstdc%2B%2B-v3/include/bits/stl_algobase.h#L2118 :
// Counts the iterators in [__first, __last) for which __pred returns true.
// The counter and the return value both use the iterator's difference_type.
template<typename _InputIterator, typename _Predicate>
_GLIBCXX20_CONSTEXPR
typename iterator_traits<_InputIterator>::difference_type
__count_if(_InputIterator __first, _InputIterator __last, _Predicate __pred)
{
typename iterator_traits<_InputIterator>::difference_type __n = 0;
for (; __first != __last; ++__first)
if (__pred(__first))
// One increment per matching iterator; __n is never decremented.
++__n;
return __n;
}
__n has type iterator_traits<_InputIterator>::difference_type, which here is ptrdiff_t, a signed type. Doing ++__n could in principle overflow, but signed overflow is undefined behavior, so the compiler is allowed to assume it never happens. Ergo, std::count() can't return a negative value, because that would require undefined behavior. Because the result is known to be non-negative, the signed and unsigned comparisons give the same answer, so the compiler is free to use jle.

What does assembly code of function "do_compare" exactly do?

The do_compare function is in the libstdc++ library. It basically checks two strings and returns -1, 1, or 0 accordingly.
Here is the C++ code:
// Compares the character ranges [__lo1, __hi1) and [__lo2, __hi2).
// Both ranges are copied into strings and compared one nul-terminated
// segment at a time with _M_compare. The first non-zero segment result is
// returned as is; otherwise the result is 0 when both copies are exhausted
// together, -1 when the first is exhausted first, and 1 when the second is.
template<typename _CharT>
int
collate<_CharT>::
do_compare(const _CharT* __lo1, const _CharT* __hi1,
const _CharT* __lo2, const _CharT* __hi2) const
{
// strcoll assumes zero-terminated strings so we make a copy
// and then put a zero at the end.
const string_type __one(__lo1, __hi1);
const string_type __two(__lo2, __hi2);
// __p/__q walk the copies; __pend/__qend mark one past their last character.
const _CharT* __p = __one.c_str();
const _CharT* __pend = __one.data() + __one.length();
const _CharT* __q = __two.c_str();
const _CharT* __qend = __two.data() + __two.length();
// strcoll stops when it sees a nul character so we break
// the strings into zero-terminated substrings and pass those
// to strcoll.
for (;;)
{
const int __res = _M_compare(__p, __q);
if (__res)
return __res;
// Segments compared equal: advance both cursors to the nul that ended them.
__p += char_traits<_CharT>::length(__p);
__q += char_traits<_CharT>::length(__q);
if (__p == __pend && __q == __qend)
return 0;
else if (__p == __pend)
return -1;
else if (__q == __qend)
return 1;
// Both cursors sit on an embedded nul; step over it and compare the next segment.
__p++;
__q++;
}
}
I have to put the entire assembly code of do_compare to show my problem, sorry:
0000000000101c40 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4>:
101c40: 41 57 push %r15
101c42: 41 56 push %r14
101c44: 49 89 fe mov %rdi,%r14
101c47: 48 89 f7 mov %rsi,%rdi
101c4a: 48 89 d6 mov %rdx,%rsi
101c4d: 41 55 push %r13
101c4f: 41 54 push %r12
101c51: 55 push %rbp
101c52: 4c 89 c5 mov %r8,%rbp
101c55: 53 push %rbx
101c56: 48 89 cb mov %rcx,%rbx
101c59: 48 83 ec 38 sub $0x38,%rsp
101c5d: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
101c64: 00 00
101c66: 48 89 44 24 28 mov %rax,0x28(%rsp)
101c6b: 31 c0 xor %eax,%eax
101c6d: 4c 8d 6c 24 27 lea 0x27(%rsp),%r13
101c72: 4c 89 ea mov %r13,%rdx
101c75: 4c 89 6c 24 18 mov %r13,0x18(%rsp)
101c7a: e8 f1 a2 f8 ff callq 8bf70 <_ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag#plt>
101c7f: 4c 89 ea mov %r13,%rdx
101c82: 48 89 ee mov %rbp,%rsi
101c85: 48 89 df mov %rbx,%rdi
101c88: 49 89 c7 mov %rax,%r15
101c8b: 48 89 44 24 08 mov %rax,0x8(%rsp)
101c90: e8 db a2 f8 ff callq 8bf70 <_ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag#plt>
101c95: 4d 8b 67 e8 mov -0x18(%r15),%r12
101c99: 4c 8b 68 e8 mov -0x18(%rax),%r13
101c9d: 48 89 c5 mov %rax,%rbp
101ca0: 48 89 44 24 10 mov %rax,0x10(%rsp)
101ca5: 4c 89 fb mov %r15,%rbx
101ca8: 4d 01 fc add %r15,%r12
101cab: 49 01 c5 add %rax,%r13
101cae: eb 32 jmp 101ce2 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xa2>
101cb0: 48 89 df mov %rbx,%rdi
101cb3: e8 98 87 f8 ff callq 8a450 <strlen#plt>
101cb8: 48 89 ef mov %rbp,%rdi
101cbb: 48 01 c3 add %rax,%rbx
101cbe: e8 8d 87 f8 ff callq 8a450 <strlen#plt>
101cc3: 48 01 c5 add %rax,%rbp
101cc6: 49 39 dc cmp %rbx,%r12
101cc9: 75 05 jne 101cd0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x90>
101ccb: 49 39 ed cmp %rbp,%r13
101cce: 74 27 je 101cf7 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xb7>
101cd0: 49 39 dc cmp %rbx,%r12
101cd3: 74 6b je 101d40 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x100>
101cd5: 49 39 ed cmp %rbp,%r13
101cd8: 74 76 je 101d50 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x110>
101cda: 48 83 c3 01 add $0x1,%rbx
101cde: 48 83 c5 01 add $0x1,%rbp
101ce2: 48 89 ea mov %rbp,%rdx
101ce5: 48 89 de mov %rbx,%rsi
101ce8: 4c 89 f7 mov %r14,%rdi
101ceb: e8 20 8b f8 ff callq 8a810 <_ZNKSt7collateIcE10_M_compareEPKcS2_#plt>
101cf0: 41 89 c7 mov %eax,%r15d
101cf3: 85 c0 test %eax,%eax
101cf5: 74 b9 je 101cb0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x70>
101cf7: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
101cfc: 48 8b 1d 9d 08 28 00 mov 0x28089d(%rip),%rbx # 3825a0 <_ZNSs4_Rep20_S_empty_rep_storageE##GLIBCXX_3.4-0x57e0>
101d03: 48 83 ef 18 sub $0x18,%rdi
101d07: 48 39 df cmp %rbx,%rdi
101d0a: 75 54 jne 101d60 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x120>
101d0c: 48 8b 7c 24 08 mov 0x8(%rsp),%rdi
101d11: 48 83 ef 18 sub $0x18,%rdi
101d15: 48 39 df cmp %rbx,%rdi
101d18: 75 56 jne 101d70 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x130>
101d1a: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
101d1f: 64 48 33 0c 25 28 00 xor %fs:0x28,%rcx
101d26: 00 00
101d28: 44 89 f8 mov %r15d,%eax
101d2b: 75 4f jne 101d7c <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x13c>
101d2d: 48 83 c4 38 add $0x38,%rsp
101d31: 5b pop %rbx
101d32: 5d pop %rbp
101d33: 41 5c pop %r12
101d35: 41 5d pop %r13
101d37: 41 5e pop %r14
101d39: 41 5f pop %r15
101d3b: c3 retq
101d3c: 0f 1f 40 00 nopl 0x0(%rax)
101d40: 41 bf ff ff ff ff mov $0xffffffff,%r15d
101d46: eb af jmp 101cf7 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xb7>
101d48: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
101d4f: 00
101d50: 41 bf 01 00 00 00 mov $0x1,%r15d
101d56: eb 9f jmp 101cf7 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xb7>
101d58: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
101d5f: 00
101d60: 48 8b 74 24 18 mov 0x18(%rsp),%rsi
101d65: e8 96 fe ff ff callq 101c00 <_ZNSt14codecvt_bynameIcc11__mbstate_tED0Ev##GLIBCXX_3.4+0x20>
101d6a: eb a0 jmp 101d0c <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xcc>
101d6c: 0f 1f 40 00 nopl 0x0(%rax)
101d70: 48 8b 74 24 18 mov 0x18(%rsp),%rsi
101d75: e8 86 fe ff ff callq 101c00 <_ZNSt14codecvt_bynameIcc11__mbstate_tED0Ev##GLIBCXX_3.4+0x20>
101d7a: eb 9e jmp 101d1a <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xda>
101d7c: e8 7f 95 f8 ff callq 8b300 <__stack_chk_fail#plt>
101d81: 48 89 c3 mov %rax,%rbx
101d84: 48 8b 7c 24 08 mov 0x8(%rsp),%rdi
101d89: 48 83 ef 18 sub $0x18,%rdi
101d8d: 48 3b 3d 0c 08 28 00 cmp 0x28080c(%rip),%rdi # 3825a0 <_ZNSs4_Rep20_S_empty_rep_storageE##GLIBCXX_3.4-0x57e0>
101d94: 74 0a je 101da0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x160>
101d96: 48 8b 74 24 18 mov 0x18(%rsp),%rsi
101d9b: e8 60 fe ff ff callq 101c00 <_ZNSt14codecvt_bynameIcc11__mbstate_tED0Ev##GLIBCXX_3.4+0x20>
101da0: 48 89 df mov %rbx,%rdi
101da3: e8 e8 a1 f8 ff callq 8bf90 <_Unwind_Resume#plt>
101da8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
101daf: 00
*******101db0: 53 push %rbx
101db1: 48 89 fb mov %rdi,%rbx
101db4: 48 8b 3f mov (%rdi),%rdi
101db7: 89 f0 mov %esi,%eax
101db9: 48 85 ff test %rdi,%rdi
101dbc: 74 05 je 101dc3 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x183>
101dbe: 83 fe ff cmp $0xffffffff,%esi
101dc1: 74 05 je 101dc8 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x188>
101dc3: 5b pop %rbx
101dc4: c3 retq
101dc5: 0f 1f 00 nopl (%rax)
101dc8: 48 8b 47 10 mov 0x10(%rdi),%rax
101dcc: 48 3b 47 18 cmp 0x18(%rdi),%rax
101dd0: 73 0e jae 101de0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x1a0>
101dd2: 0f b6 00 movzbl (%rax),%eax
101dd5: 5b pop %rbx
101dd6: c3 retq
101dd7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
101dde: 00 00
101de0: 48 8b 07 mov (%rdi),%rax
101de3: ff 50 48 callq *0x48(%rax)
101de6: 83 f8 ff cmp $0xffffffff,%eax
101de9: 75 d8 jne 101dc3 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x183>
101deb: 48 c7 03 00 00 00 00 movq $0x0,(%rbx)
101df2: 5b pop %rbx
101df3: c3 retq
101df4: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
101dfb: 00 00 00
101dfe: 66 90 xchg %ax,%ax
101e00: 55 push %rbp
101e01: 89 f5 mov %esi,%ebp
101e03: 53 push %rbx
101e04: 48 89 fb mov %rdi,%rbx
101e07: 48 83 ec 08 sub $0x8,%rsp
101e0b: e8 b0 88 f8 ff callq 8a6c0 <_ZNKSt5ctypeIcE13_M_widen_initEv#plt>
101e10: 48 8b 03 mov (%rbx),%rax
101e13: 48 8b 40 30 mov 0x30(%rax),%rax
101e17: 48 3b 05 7a 11 28 00 cmp 0x28117a(%rip),%rax # 382f98 <_ZNKSt5ctypeIcE8do_widenEc##GLIBCXX_3.4+0x2e2c48>
101e1e: 75 10 jne 101e30 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x1f0>
101e20: 48 83 c4 08 add $0x8,%rsp
101e24: 89 e8 mov %ebp,%eax
101e26: 5b pop %rbx
101e27: 5d pop %rbp
101e28: c3 retq
101e29: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
101e30: 48 83 c4 08 add $0x8,%rsp
101e34: 40 0f be f5 movsbl %bpl,%esi
101e38: 48 89 df mov %rbx,%rdi
101e3b: 5b pop %rbx
101e3c: 5d pop %rbp
101e3d: ff e0 jmpq *%rax
101e3f: 90 nop
It seems to me that the assembly code not only performs the C++ code logic but also adds other logic.
As an example, the function _M_extract_int in libstdc++, which converts a char to an int, calls this function as follows:
callq 0x101db0
The instruction address 0x101db0 is in the middle of the assembly code. The code section from 0x101db0 to 0x101dbc seems to have nothing to do with the above C++ code. Really confused about what is going on here...

clang generated code for a simple factorial function

Here is the code :
// Recursively computes n! in unsigned int arithmetic.
// factorial(0) is 1; otherwise the result is n * factorial(n - 1).
unsigned int factorial(unsigned int n) {
if (n == 0) return 1;
return n * factorial(n - 1);
}
// Returns factorial(0) as the process exit status.
int main() {
return factorial(0);
}
I generate both gcc and clang assemblies using
g++ -g -O2 -c factorial.cpp (resp clang++)
objdump -d -M intel -S factorial.o
Here is what I get for gcc
factorial.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z9factorialj>:
unsigned int factorial(unsigned int n)
{
if (n == 0)
0: 85 ff test edi,edi
2: b8 01 00 00 00 mov eax,0x1
7: 74 11 je 1a <_Z9factorialj+0x1a>
9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
10: 0f af c7 imul eax,edi
13: 83 ef 01 sub edi,0x1
16: 75 f8 jne 10 <_Z9factorialj+0x10>
18: f3 c3 repz ret
{
return 1;
}
return n * factorial(n - 1);
}
1a: f3 c3 repz ret
Disassembly of section .text.startup:
0000000000000000 <main>:
if (n == 0)
0: b8 01 00 00 00 mov eax,0x1
5: c3 ret
and for clang
factorial.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z9factorialj>:
unsigned int factorial(unsigned int n)
{
0: b8 01 00 00 00 mov eax,0x1
if (n == 0)
5: 85 ff test edi,edi
7: 0f 84 ba 01 00 00 je 1c7 <_Z9factorialj+0x1c7>
d: b8 01 00 00 00 mov eax,0x1
{
return 1;
}
return n * factorial(n - 1);
12: 83 ff 08 cmp edi,0x8
15: 0f 82 a5 01 00 00 jb 1c0 <_Z9factorialj+0x1c0>
1b: 41 89 f8 mov r8d,edi
1e: 41 83 e0 f8 and r8d,0xfffffff8
22: 89 fa mov edx,edi
24: 83 e2 f8 and edx,0xfffffff8
27: 0f 84 93 01 00 00 je 1c0 <_Z9factorialj+0x1c0>
2d: 8d 4f f8 lea ecx,[rdi-0x8]
30: 89 c8 mov eax,ecx
32: c1 e8 03 shr eax,0x3
35: 0f ba e1 03 bt ecx,0x3
39: 72 24 jb 5f <_Z9factorialj+0x5f>
3b: 66 0f 6e c7 movd xmm0,edi
3f: 66 0f 70 c8 00 pshufd xmm1,xmm0,0x0
44: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4c <_Z9factorialj+0x4c>
4b: 00
4c: 66 0f fe c1 paddd xmm0,xmm1
50: 66 0f fe 0d 00 00 00 paddd xmm1,XMMWORD PTR [rip+0x0] # 58 <_Z9factorialj+0x58>
57: 00
58: b9 08 00 00 00 mov ecx,0x8
5d: eb 0e jmp 6d <_Z9factorialj+0x6d>
5f: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 67 <_Z9factorialj+0x67>
66: 00
67: 31 c9 xor ecx,ecx
69: 66 0f 6f c8 movdqa xmm1,xmm0
6d: 85 c0 test eax,eax
6f: 0f 84 d4 00 00 00 je 149 <_Z9factorialj+0x149>
75: 89 d0 mov eax,edx
77: 29 c8 sub eax,ecx
79: 89 fe mov esi,edi
7b: 29 ce sub esi,ecx
7d: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] # 85 <_Z9factorialj+0x85>
84: 00
85: 66 0f 6f 1d 00 00 00 movdqa xmm3,XMMWORD PTR [rip+0x0] # 8d <_Z9factorialj+0x8d>
8c: 00
8d: 0f 1f 00 nop DWORD PTR [rax]
90: 66 0f 6e e6 movd xmm4,esi
94: 66 0f 70 e4 00 pshufd xmm4,xmm4,0x0
99: 66 0f 6f ec movdqa xmm5,xmm4
9d: 66 0f fe ea paddd xmm5,xmm2
a1: 66 0f fe e3 paddd xmm4,xmm3
a5: 66 0f 70 f5 f5 pshufd xmm6,xmm5,0xf5
aa: 66 0f f4 e8 pmuludq xmm5,xmm0
ae: 66 0f 70 ed e8 pshufd xmm5,xmm5,0xe8
b3: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
b8: 66 0f f4 c6 pmuludq xmm0,xmm6
bc: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
c1: 66 0f 62 e8 punpckldq xmm5,xmm0
c5: 66 0f 70 c4 f5 pshufd xmm0,xmm4,0xf5
ca: 66 0f f4 e1 pmuludq xmm4,xmm1
ce: 66 0f 70 e4 e8 pshufd xmm4,xmm4,0xe8
d3: 66 0f 70 c9 f5 pshufd xmm1,xmm1,0xf5
d8: 66 0f f4 c8 pmuludq xmm1,xmm0
dc: 66 0f 70 c1 e8 pshufd xmm0,xmm1,0xe8
e1: 66 0f 62 e0 punpckldq xmm4,xmm0
e5: 8d 4e f8 lea ecx,[rsi-0x8]
e8: 66 0f 6e c1 movd xmm0,ecx
ec: 66 0f 70 c8 00 pshufd xmm1,xmm0,0x0
f1: 66 0f 6f c1 movdqa xmm0,xmm1
f5: 66 0f fe c2 paddd xmm0,xmm2
f9: 66 0f fe cb paddd xmm1,xmm3
fd: 66 0f 70 f0 f5 pshufd xmm6,xmm0,0xf5
102: 66 0f f4 c5 pmuludq xmm0,xmm5
106: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
10b: 66 0f 70 ed f5 pshufd xmm5,xmm5,0xf5
110: 66 0f f4 ee pmuludq xmm5,xmm6
114: 66 0f 70 ed e8 pshufd xmm5,xmm5,0xe8
119: 66 0f 62 c5 punpckldq xmm0,xmm5
11d: 66 0f 70 e9 f5 pshufd xmm5,xmm1,0xf5
122: 66 0f f4 cc pmuludq xmm1,xmm4
126: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
12b: 66 0f 70 e4 f5 pshufd xmm4,xmm4,0xf5
130: 66 0f f4 e5 pmuludq xmm4,xmm5
134: 66 0f 70 e4 e8 pshufd xmm4,xmm4,0xe8
139: 66 0f 62 cc punpckldq xmm1,xmm4
13d: 83 c6 f0 add esi,0xfffffff0
140: 83 c0 f0 add eax,0xfffffff0
143: 0f 85 47 ff ff ff jne 90 <_Z9factorialj+0x90>
149: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
14e: 66 0f f4 c8 pmuludq xmm1,xmm0
152: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
157: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
15c: 66 0f f4 c2 pmuludq xmm0,xmm2
160: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
165: 66 0f 62 c8 punpckldq xmm1,xmm0
169: 66 0f 70 c1 4e pshufd xmm0,xmm1,0x4e
16e: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
173: 66 0f f4 c8 pmuludq xmm1,xmm0
177: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
17c: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
181: 66 0f f4 c2 pmuludq xmm0,xmm2
185: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
18a: 66 0f 62 c8 punpckldq xmm1,xmm0
18e: 66 0f 70 c1 e5 pshufd xmm0,xmm1,0xe5
193: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
198: 66 0f f4 c8 pmuludq xmm1,xmm0
19c: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
1a1: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
1a6: 66 0f f4 c2 pmuludq xmm0,xmm2
1aa: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
1af: 66 0f 62 c8 punpckldq xmm1,xmm0
1b3: 66 0f 7e c8 movd eax,xmm1
1b7: 39 fa cmp edx,edi
1b9: 74 0c je 1c7 <_Z9factorialj+0x1c7>
1bb: 44 29 c7 sub edi,r8d
1be: 66 90 xchg ax,ax
1c0: 0f af c7 imul eax,edi
if (n == 0)
1c3: ff cf dec edi
1c5: 75 f9 jne 1c0 <_Z9factorialj+0x1c0>
}
1c7: c3 ret
1c8: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
1cf: 00
00000000000001d0 <main>:
int main()
{
return factorial(0);
1d0: b8 01 00 00 00 mov eax,0x1
1d5: c3 ret
I understand that they both notice they can unroll the loop and precompute the value, but I don't get why clang generates all this code (I have no idea what it could possibly be doing, or why there is SSE stuff...)
Bonus question: while gcc precomputes the value up to factorial(7), clang computes it for any value.
Up to factorial(31) the values look fine, for factorial(32) and factorial(33) it returns 0x80000000 and for greater values it does xor eax, eax instead. What is this black magic ?

What are these seemingly-useless callq instructions in my x86 object files for?

I have some template-heavy C++ code that I want to ensure the compiler optimizes as much as possible due to the large amount of information it has at compile time. To evaluate its performance, I decided to take a look at the disassembly of the object file that it generates. Below is a snippet of what I got from objdump -dC:
0000000000000000 <bar<foo, 0u>::get(bool)>:
0: 41 57 push %r15
2: 49 89 f7 mov %rsi,%r15
5: 41 56 push %r14
7: 41 55 push %r13
9: 41 54 push %r12
b: 55 push %rbp
c: 53 push %rbx
d: 48 81 ec 68 02 00 00 sub $0x268,%rsp
14: 48 89 7c 24 10 mov %rdi,0x10(%rsp)
19: 48 89 f7 mov %rsi,%rdi
1c: 89 54 24 1c mov %edx,0x1c(%rsp)
20: e8 00 00 00 00 callq 25 <bar<foo, 0u>::get(bool)+0x25>
25: 84 c0 test %al,%al
27: 0f 85 eb 00 00 00 jne 118 <bar<foo, 0u>::get(bool)+0x118>
2d: 48 c7 44 24 08 00 00 movq $0x0,0x8(%rsp)
34: 00 00
36: 4c 89 ff mov %r15,%rdi
39: 4d 8d b7 30 01 00 00 lea 0x130(%r15),%r14
40: e8 00 00 00 00 callq 45 <bar<foo, 0u>::get(bool)+0x45>
45: 84 c0 test %al,%al
47: 88 44 24 1b mov %al,0x1b(%rsp)
4b: 0f 85 ef 00 00 00 jne 140 <bar<foo, 0u>::get(bool)+0x140>
51: 80 7c 24 1c 00 cmpb $0x0,0x1c(%rsp)
56: 0f 85 24 03 00 00 jne 380 <bar<foo, 0u>::get(bool)+0x380>
5c: 48 8b 44 24 10 mov 0x10(%rsp),%rax
61: c6 00 00 movb $0x0,(%rax)
64: 80 7c 24 1b 00 cmpb $0x0,0x1b(%rsp)
69: 75 25 jne 90 <bar<foo, 0u>::get(bool)+0x90>
6b: 48 8b 74 24 10 mov 0x10(%rsp),%rsi
70: 4c 89 ff mov %r15,%rdi
73: e8 00 00 00 00 callq 78 <bar<foo, 0u>::get(bool)+0x78>
78: 48 8b 44 24 10 mov 0x10(%rsp),%rax
7d: 48 81 c4 68 02 00 00 add $0x268,%rsp
84: 5b pop %rbx
85: 5d pop %rbp
86: 41 5c pop %r12
88: 41 5d pop %r13
8a: 41 5e pop %r14
8c: 41 5f pop %r15
8e: c3 retq
8f: 90 nop
90: 4c 89 f7 mov %r14,%rdi
93: e8 00 00 00 00 callq 98 <bar<foo, 0u>::get(bool)+0x98>
98: 83 f8 04 cmp $0x4,%eax
9b: 74 f3 je 90 <bar<foo, 0u>::get(bool)+0x90>
9d: 85 c0 test %eax,%eax
9f: 0f 85 e4 08 00 00 jne 989 <bar<foo, 0u>::get(bool)+0x989>
a5: 49 83 87 b0 01 00 00 addq $0x1,0x1b0(%r15)
ac: 01
ad: 49 8d 9f 58 01 00 00 lea 0x158(%r15),%rbx
b4: 48 89 df mov %rbx,%rdi
b7: e8 00 00 00 00 callq bc <bar<foo, 0u>::get(bool)+0xbc>
bc: 49 8d bf 80 01 00 00 lea 0x180(%r15),%rdi
c3: e8 00 00 00 00 callq c8 <bar<foo, 0u>::get(bool)+0xc8>
c8: 48 89 df mov %rbx,%rdi
cb: e8 00 00 00 00 callq d0 <bar<foo, 0u>::get(bool)+0xd0>
d0: 4c 89 f7 mov %r14,%rdi
d3: e8 00 00 00 00 callq d8 <bar<foo, 0u>::get(bool)+0xd8>
d8: 83 f8 04 cmp $0x4,%eax
The disassembly of this particular function continues on, but one thing I noticed is the relatively large number of call instructions like this one:
20: e8 00 00 00 00 callq 25 <bar<foo, 0u>::get(bool)+0x25>
These instructions, always with the opcode e8 00 00 00 00, occur frequently throughout the generated code, and from what I can tell, are nothing more than no-ops; they all seem to just fall through to the next instruction. This begs the question, then, is there a good reason why all these instructions are generated?
I'm concerned about the instruction cache footprint of the generated code, so wasting 5 bytes many times throughout a function seems counterproductive. It seems a bit heavyweight for a nop, unless the compiler is trying to preserve some kind of memory alignment or something. I wouldn't be surprised if this were the case.
I compiled my code using g++ 4.8.5 using -O3 -fomit-frame-pointer. For what it's worth, I saw similar code generation using clang 3.7.
The 00 00 00 00 (relative) target address in e8 00 00 00 00 is intended to be filled in by the linker. It doesn't mean that the call falls through. It just means you are disassembling an object file that has not been linked yet.
Also, a call to the next instruction, if that was the end result after the link phase, would not be a no-op, because it changes the stack (a certain hint that this is not what is going on in your case).

SSE load/store memory transactions

There are two ways to move data between memory and registers when using SSE intrinsics:
Intermediate pointers:
/// Adds 0.1f to the elements of `input`, four floats at a time, and writes the
/// sums to `output`. Both buffers are accessed through __m128 pointers (the
/// "intermediate pointer" form), so no explicit load/store intrinsics appear.
///
/// @param input   source floats; accessed as __m128, so it must be 16-byte aligned
/// @param output  destination floats; same alignment requirement as `input`
/// @param n       element count; only the n/4 complete groups of four are
///                processed, any remaining tail elements are left untouched
void f_sse(float *input, float *output, unsigned int n)
{
    __m128 *input_sse = reinterpret_cast<__m128*>(input);   // Input intermediate pointer
    __m128 *output_sse = reinterpret_cast<__m128*>(output); // Output intermediate pointer
    __m128 s = _mm_set1_ps(0.1f); // the addend 0.1f replicated into all four lanes
    auto loop_size = n/4;
    // The index uses loop_size's own (unsigned) type to avoid a signed/unsigned comparison.
    for(decltype(loop_size) i=0; i<loop_size; ++i)
        output_sse[i] = _mm_add_ps(input_sse[i], s);
}
Explicit fetch/store:
/// Adds 0.1f to the elements of `input`, four floats at a time, and writes the
/// sums to `output`, using explicit aligned load/store intrinsics.
///
/// @param input   source floats; must be 16-byte aligned (_mm_load_ps is an aligned load)
/// @param output  destination floats; must be 16-byte aligned (_mm_store_ps is an aligned store)
/// @param n       element count; the loop advances in steps of four while i < n,
///                so when n is not a multiple of 4 the last iteration touches
///                elements beyond index n-1
void f_sse(float *input, float *output, unsigned int n)
{
    __m128 input_sse, result;
    __m128 s = _mm_set1_ps(0.1f); // the addend 0.1f replicated into all four lanes
    // Unsigned index: matches the type of n and avoids signed-overflow UB for n > INT_MAX.
    for(unsigned int i=0; i<n; i+=4)
    {
        input_sse = _mm_load_ps(input+i);
        result = _mm_add_ps(input_sse, s);
        _mm_store_ps(output+i, result);
    }
}
What is the difference between these two approaches, and which one is better in terms of performance? The input and output pointers are aligned by _mm_malloc().
Compiled with g++ at optimization level -O3, the assembly code of the inner loops (from objdump -d) is
20: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
24: 0f 58 c1 addps %xmm1,%xmm0
27: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
2b: 48 83 c0 10 add $0x10,%rax
2f: 48 39 d0 cmp %rdx,%rax
32: 75 ec jne 20 <_Z5f_ssePfS_j+0x20>
and
10: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
14: 83 c1 04 add $0x4,%ecx
17: 0f 58 c1 addps %xmm1,%xmm0
1a: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
1e: 48 83 c0 10 add $0x10,%rax
22: 39 ca cmp %ecx,%edx
24: 77 ea ja 10 <_Z5f_ssePfS_j+0x10>
They are pretty similar. In the first, g++ manages to use only one counter (only one add instruction), so I guess it's better.
I compiled both of your samples with g++ -O2, and the main difference I found was that the value in edx (n) is used differently, which leads to slightly different code.
First function:
0000000000000000 <_Z6f_sse2PfS_j>:
0: c1 ea 02 shr $0x2,%edx # loop_size = n / 4.
3: 85 d2 test %edx,%edx
5: 74 2d je 34 <_Z6f_sse2PfS_j+0x34>
7: 83 ea 01 sub $0x1,%edx
a: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 11 <_Z6f_sse2PfS_j+0x11>
11: 48 83 c2 01 add $0x1,%rdx
15: 31 c0 xor %eax,%eax
17: 48 c1 e2 04 shl $0x4,%rdx // Adjust for loop size vs. index.
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
20: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
24: 0f 58 c1 addps %xmm1,%xmm0
27: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
2b: 48 83 c0 10 add $0x10,%rax
2f: 48 39 d0 cmp %rdx,%rax
32: 75 ec jne 20 <_Z6f_sse2PfS_j+0x20>
34: f3 c3 repz retq
Second function:
0000000000000000 <_Z5f_ssePfS_j>:
0: 85 d2 test %edx,%edx
2: 74 22 je 26 <_Z5f_ssePfS_j+0x26>
4: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # b <_Z5f_ssePfS_j+0xb>
b: 31 c0 xor %eax,%eax
d: 31 c9 xor %ecx,%ecx
f: 90 nop
10: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
14: 83 c1 04 add $0x4,%ecx
17: 0f 58 c1 addps %xmm1,%xmm0
1a: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
1e: 48 83 c0 10 add $0x10,%rax
22: 39 ca cmp %ecx,%edx
24: 77 ea ja 10 <_Z5f_ssePfS_j+0x10>
26: f3 c3 repz retq
I also looked at the code generated, and came up with this:
// Adds 0.1f to the floats in `input`, four at a time, and writes the sums to
// `output`, walking both buffers with __m128 pointers instead of an index.
//   input  - source floats; dereferenced as __m128, so it must be 16-byte aligned
//   output - destination floats; same alignment requirement
//   n      - element count; the loop runs while input_sse < &input[n], so when n
//            is not a multiple of 4 the last iteration touches elements past n-1
void f_sse2(float *input, float *output, unsigned int n)
{
// One-past-the-end of the input, expressed as a vector pointer so it can be
// compared directly against input_sse.
__m128 *end = reinterpret_cast<__m128*>(&input[n]);
__m128 *input_sse = reinterpret_cast<__m128*>(input);//Input intermediate pointer
__m128 *output_sse = reinterpret_cast<__m128*>(output);//Output intermediate pointer
// The addend 0.1f replicated into all four lanes.
__m128 s = _mm_set1_ps(0.1f);
// Each iteration handles one 4-float vector and advances both pointers by one __m128.
while(input_sse < end)
*output_sse++ = _mm_add_ps(*input_sse++, s);
}
which generates this code:
0000000000000000 <_Z6f_sse2PfS_j>:
0: 89 d2 mov %edx,%edx
2: 48 8d 04 97 lea (%rdi,%rdx,4),%rax
6: 48 39 c7 cmp %rax,%rdi
9: 73 23 jae 2e <_Z6f_sse2PfS_j+0x2e>
b: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 12 <_Z6f_sse2PfS_j+0x12>
12: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
18: 0f 28 07 movaps (%rdi),%xmm0
1b: 48 83 c7 10 add $0x10,%rdi
1f: 0f 58 c1 addps %xmm1,%xmm0
22: 0f 29 06 movaps %xmm0,(%rsi)
25: 48 83 c6 10 add $0x10,%rsi
29: 48 39 f8 cmp %rdi,%rax
2c: 77 ea ja 18 <_Z6f_sse2PfS_j+0x18>
2e: f3 c3 repz retq
Which I think may be a tiny bit more efficient, but probably not worth changing it for. But it gave me something to do for 15 minutes.