SSE load/store memory transactions

SSE load/store memory transactions - c++

There are two ways to do memory-register transfers when using SSE intrinsics:
Intermediate pointers:
/**
 * Adds 0.1f to the elements of `input`, four floats at a time, and writes the
 * sums to `output` through __m128 pointers that alias the float arrays.
 *
 * @param input  source floats; dereferenced as __m128, so it must be 16-byte aligned
 * @param output destination floats; same alignment requirement
 * @param n      element count; only the n/4 whole vectors are processed, so up
 *               to three trailing elements are left untouched
 */
void f_sse(float *input, float *output, unsigned int n)
{
    __m128 *input_sse = reinterpret_cast<__m128*>(input);   // Input intermediate pointer
    __m128 *output_sse = reinterpret_cast<__m128*>(output); // Output intermediate pointer
    const __m128 s = _mm_set1_ps(0.1f);                     // 0.1f in all four lanes
    const unsigned int loop_size = n / 4;                   // number of whole 4-float vectors
    // The index is unsigned so it has the same type as the bound it is compared with.
    for (unsigned int i = 0; i < loop_size; ++i)
        output_sse[i] = _mm_add_ps(input_sse[i], s);
}
Explicit fetch/store:
/**
 * Adds 0.1f to the elements of `input` using explicit SSE loads and stores,
 * writing the sums to `output`.
 *
 * @param input  source floats; read with _mm_load_ps, so it must be 16-byte aligned
 * @param output destination floats; written with _mm_store_ps, same requirement
 * @param n      element count; the loop steps by 4 while i < n, so n must be a
 *               multiple of 4 or the last iteration touches elements past n
 */
void f_sse(float *input, float *output, unsigned int n)
{
    const __m128 s = _mm_set1_ps(0.1f); // 0.1f in all four lanes
    // The index is unsigned to match n; a signed index stepped by 4 could overflow
    // before reaching a very large n.
    for (unsigned int i = 0; i < n; i += 4)
    {
        const __m128 input_sse = _mm_load_ps(input + i);
        const __m128 result = _mm_add_ps(input_sse, s);
        _mm_store_ps(output + i, result);
    }
}
What's the difference between the two approaches, and which one is better in terms of performance? The input and output pointers are aligned by _mm_malloc().

Compiled with g++ at optimization level -O3, the assembly for the two inner loops (from objdump -d) is
20: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
24: 0f 58 c1 addps %xmm1,%xmm0
27: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
2b: 48 83 c0 10 add $0x10,%rax
2f: 48 39 d0 cmp %rdx,%rax
32: 75 ec jne 20 <_Z5f_ssePfS_j+0x20>
and
10: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
14: 83 c1 04 add $0x4,%ecx
17: 0f 58 c1 addps %xmm1,%xmm0
1a: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
1e: 48 83 c0 10 add $0x10,%rax
22: 39 ca cmp %ecx,%edx
24: 77 ea ja 10 <_Z5f_ssePfS_j+0x10>
They are pretty similar. In the first, g++ manages to use only one counter (only one add instruction), so I guess it's better.

I compiled both of your samples with g++ -O2, and the main difference I found was that the value in edx (n) is used differently, which leads to slightly different code.
First function:
0000000000000000 <_Z6f_sse2PfS_j>:
0: c1 ea 02 shr $0x2,%edx # loop_size = n / 4.
3: 85 d2 test %edx,%edx
5: 74 2d je 34 <_Z6f_sse2PfS_j+0x34>
7: 83 ea 01 sub $0x1,%edx
a: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 11 <_Z6f_sse2PfS_j+0x11>
11: 48 83 c2 01 add $0x1,%rdx
15: 31 c0 xor %eax,%eax
17: 48 c1 e2 04 shl $0x4,%rdx // Adjust for loop size vs. index.
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
20: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
24: 0f 58 c1 addps %xmm1,%xmm0
27: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
2b: 48 83 c0 10 add $0x10,%rax
2f: 48 39 d0 cmp %rdx,%rax
32: 75 ec jne 20 <_Z6f_sse2PfS_j+0x20>
34: f3 c3 repz retq
Second function:
0000000000000000 <_Z5f_ssePfS_j>:
0: 85 d2 test %edx,%edx
2: 74 22 je 26 <_Z5f_ssePfS_j+0x26>
4: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # b <_Z5f_ssePfS_j+0xb>
b: 31 c0 xor %eax,%eax
d: 31 c9 xor %ecx,%ecx
f: 90 nop
10: 0f 28 04 07 movaps (%rdi,%rax,1),%xmm0
14: 83 c1 04 add $0x4,%ecx
17: 0f 58 c1 addps %xmm1,%xmm0
1a: 0f 29 04 06 movaps %xmm0,(%rsi,%rax,1)
1e: 48 83 c0 10 add $0x10,%rax
22: 39 ca cmp %ecx,%edx
24: 77 ea ja 10 <_Z5f_ssePfS_j+0x10>
26: f3 c3 repz retq
I also looked at the code generated, and came up with this:
/**
 * Adds 0.1f to the floats of `input` four at a time and stores the sums in
 * `output`, walking both arrays with __m128 cursors until the input cursor
 * reaches the address of input[n].
 *
 * Both pointers are dereferenced as __m128, so they must be 16-byte aligned.
 */
void f_sse2(float *input, float *output, unsigned int n)
{
    const __m128 increment = _mm_set1_ps(0.1f);
    __m128 *src = reinterpret_cast<__m128*>(input);            // read cursor
    __m128 *dst = reinterpret_cast<__m128*>(output);           // write cursor
    __m128 *const stop = reinterpret_cast<__m128*>(input + n); // one past the last input float
    for (; src < stop; ++src, ++dst)
        *dst = _mm_add_ps(*src, increment);
}
which generates this code:
0000000000000000 <_Z6f_sse2PfS_j>:
0: 89 d2 mov %edx,%edx
2: 48 8d 04 97 lea (%rdi,%rdx,4),%rax
6: 48 39 c7 cmp %rax,%rdi
9: 73 23 jae 2e <_Z6f_sse2PfS_j+0x2e>
b: 0f 28 0d 00 00 00 00 movaps 0x0(%rip),%xmm1 # 12 <_Z6f_sse2PfS_j+0x12>
12: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
18: 0f 28 07 movaps (%rdi),%xmm0
1b: 48 83 c7 10 add $0x10,%rdi
1f: 0f 58 c1 addps %xmm1,%xmm0
22: 0f 29 06 movaps %xmm0,(%rsi)
25: 48 83 c6 10 add $0x10,%rsi
29: 48 39 f8 cmp %rdi,%rax
2c: 77 ea ja 18 <_Z6f_sse2PfS_j+0x18>
2e: f3 c3 repz retq
Which I think may be a tiny bit more efficient, but probably not worth changing it for. But it gave me something to do for 15 minutes.

Related

What does assembly code of function "do_compare" exactly do?

The do_compare function is in the libstdc++ library. It basically checks two strings and returns -1, 1, or 0 accordingly.
Here is the C++ code:
// Compares the character range __lo1..__hi1 against __lo2..__hi2.
// Returns the first non-zero value produced by _M_compare; otherwise 0 when
// both copies are exhausted together, -1 when the first is exhausted earlier,
// and 1 when the second is exhausted earlier.
template<typename _CharT>
int
collate<_CharT>::
do_compare(const _CharT* __lo1, const _CharT* __hi1,
const _CharT* __lo2, const _CharT* __hi2) const
{
// strcoll assumes zero-terminated strings so we make a copy
// and then put a zero at the end.
const string_type __one(__lo1, __hi1);
const string_type __two(__lo2, __hi2);
// __p/__q are cursors into the copies; __pend/__qend are data() + length()
// of each copy, i.e. the position just past its last character.
const _CharT* __p = __one.c_str();
const _CharT* __pend = __one.data() + __one.length();
const _CharT* __q = __two.c_str();
const _CharT* __qend = __two.data() + __two.length();
// strcoll stops when it sees a nul character so we break
// the strings into zero-terminated substrings and pass those
// to strcoll.
for (;;)
{
// Compare the current zero-terminated pieces. _M_compare is not shown
// here; presumably it is the strcoll wrapper the comments refer to.
const int __res = _M_compare(__p, __q);
if (__res)
return __res;
// The pieces compared equal: advance each cursor by the length of its piece.
__p += char_traits<_CharT>::length(__p);
__q += char_traits<_CharT>::length(__q);
if (__p == __pend && __q == __qend)
return 0;
else if (__p == __pend)
return -1;
else if (__q == __qend)
return 1;
// Neither copy is finished, so both cursors stopped on an embedded nul;
// step over it and compare the next pieces.
__p++;
__q++;
}
}
I have to put the entire assembly code of do_compare to show my problem, sorry:
0000000000101c40 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4>:
101c40: 41 57 push %r15
101c42: 41 56 push %r14
101c44: 49 89 fe mov %rdi,%r14
101c47: 48 89 f7 mov %rsi,%rdi
101c4a: 48 89 d6 mov %rdx,%rsi
101c4d: 41 55 push %r13
101c4f: 41 54 push %r12
101c51: 55 push %rbp
101c52: 4c 89 c5 mov %r8,%rbp
101c55: 53 push %rbx
101c56: 48 89 cb mov %rcx,%rbx
101c59: 48 83 ec 38 sub $0x38,%rsp
101c5d: 64 48 8b 04 25 28 00 mov %fs:0x28,%rax
101c64: 00 00
101c66: 48 89 44 24 28 mov %rax,0x28(%rsp)
101c6b: 31 c0 xor %eax,%eax
101c6d: 4c 8d 6c 24 27 lea 0x27(%rsp),%r13
101c72: 4c 89 ea mov %r13,%rdx
101c75: 4c 89 6c 24 18 mov %r13,0x18(%rsp)
101c7a: e8 f1 a2 f8 ff callq 8bf70 <_ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag#plt>
101c7f: 4c 89 ea mov %r13,%rdx
101c82: 48 89 ee mov %rbp,%rsi
101c85: 48 89 df mov %rbx,%rdi
101c88: 49 89 c7 mov %rax,%r15
101c8b: 48 89 44 24 08 mov %rax,0x8(%rsp)
101c90: e8 db a2 f8 ff callq 8bf70 <_ZNSs12_S_constructIPKcEEPcT_S3_RKSaIcESt20forward_iterator_tag#plt>
101c95: 4d 8b 67 e8 mov -0x18(%r15),%r12
101c99: 4c 8b 68 e8 mov -0x18(%rax),%r13
101c9d: 48 89 c5 mov %rax,%rbp
101ca0: 48 89 44 24 10 mov %rax,0x10(%rsp)
101ca5: 4c 89 fb mov %r15,%rbx
101ca8: 4d 01 fc add %r15,%r12
101cab: 49 01 c5 add %rax,%r13
101cae: eb 32 jmp 101ce2 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xa2>
101cb0: 48 89 df mov %rbx,%rdi
101cb3: e8 98 87 f8 ff callq 8a450 <strlen#plt>
101cb8: 48 89 ef mov %rbp,%rdi
101cbb: 48 01 c3 add %rax,%rbx
101cbe: e8 8d 87 f8 ff callq 8a450 <strlen#plt>
101cc3: 48 01 c5 add %rax,%rbp
101cc6: 49 39 dc cmp %rbx,%r12
101cc9: 75 05 jne 101cd0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x90>
101ccb: 49 39 ed cmp %rbp,%r13
101cce: 74 27 je 101cf7 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xb7>
101cd0: 49 39 dc cmp %rbx,%r12
101cd3: 74 6b je 101d40 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x100>
101cd5: 49 39 ed cmp %rbp,%r13
101cd8: 74 76 je 101d50 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x110>
101cda: 48 83 c3 01 add $0x1,%rbx
101cde: 48 83 c5 01 add $0x1,%rbp
101ce2: 48 89 ea mov %rbp,%rdx
101ce5: 48 89 de mov %rbx,%rsi
101ce8: 4c 89 f7 mov %r14,%rdi
101ceb: e8 20 8b f8 ff callq 8a810 <_ZNKSt7collateIcE10_M_compareEPKcS2_#plt>
101cf0: 41 89 c7 mov %eax,%r15d
101cf3: 85 c0 test %eax,%eax
101cf5: 74 b9 je 101cb0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x70>
101cf7: 48 8b 7c 24 10 mov 0x10(%rsp),%rdi
101cfc: 48 8b 1d 9d 08 28 00 mov 0x28089d(%rip),%rbx # 3825a0 <_ZNSs4_Rep20_S_empty_rep_storageE##GLIBCXX_3.4-0x57e0>
101d03: 48 83 ef 18 sub $0x18,%rdi
101d07: 48 39 df cmp %rbx,%rdi
101d0a: 75 54 jne 101d60 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x120>
101d0c: 48 8b 7c 24 08 mov 0x8(%rsp),%rdi
101d11: 48 83 ef 18 sub $0x18,%rdi
101d15: 48 39 df cmp %rbx,%rdi
101d18: 75 56 jne 101d70 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x130>
101d1a: 48 8b 4c 24 28 mov 0x28(%rsp),%rcx
101d1f: 64 48 33 0c 25 28 00 xor %fs:0x28,%rcx
101d26: 00 00
101d28: 44 89 f8 mov %r15d,%eax
101d2b: 75 4f jne 101d7c <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x13c>
101d2d: 48 83 c4 38 add $0x38,%rsp
101d31: 5b pop %rbx
101d32: 5d pop %rbp
101d33: 41 5c pop %r12
101d35: 41 5d pop %r13
101d37: 41 5e pop %r14
101d39: 41 5f pop %r15
101d3b: c3 retq
101d3c: 0f 1f 40 00 nopl 0x0(%rax)
101d40: 41 bf ff ff ff ff mov $0xffffffff,%r15d
101d46: eb af jmp 101cf7 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xb7>
101d48: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
101d4f: 00
101d50: 41 bf 01 00 00 00 mov $0x1,%r15d
101d56: eb 9f jmp 101cf7 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xb7>
101d58: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
101d5f: 00
101d60: 48 8b 74 24 18 mov 0x18(%rsp),%rsi
101d65: e8 96 fe ff ff callq 101c00 <_ZNSt14codecvt_bynameIcc11__mbstate_tED0Ev##GLIBCXX_3.4+0x20>
101d6a: eb a0 jmp 101d0c <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xcc>
101d6c: 0f 1f 40 00 nopl 0x0(%rax)
101d70: 48 8b 74 24 18 mov 0x18(%rsp),%rsi
101d75: e8 86 fe ff ff callq 101c00 <_ZNSt14codecvt_bynameIcc11__mbstate_tED0Ev##GLIBCXX_3.4+0x20>
101d7a: eb 9e jmp 101d1a <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0xda>
101d7c: e8 7f 95 f8 ff callq 8b300 <__stack_chk_fail#plt>
101d81: 48 89 c3 mov %rax,%rbx
101d84: 48 8b 7c 24 08 mov 0x8(%rsp),%rdi
101d89: 48 83 ef 18 sub $0x18,%rdi
101d8d: 48 3b 3d 0c 08 28 00 cmp 0x28080c(%rip),%rdi # 3825a0 <_ZNSs4_Rep20_S_empty_rep_storageE##GLIBCXX_3.4-0x57e0>
101d94: 74 0a je 101da0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x160>
101d96: 48 8b 74 24 18 mov 0x18(%rsp),%rsi
101d9b: e8 60 fe ff ff callq 101c00 <_ZNSt14codecvt_bynameIcc11__mbstate_tED0Ev##GLIBCXX_3.4+0x20>
101da0: 48 89 df mov %rbx,%rdi
101da3: e8 e8 a1 f8 ff callq 8bf90 <_Unwind_Resume#plt>
101da8: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
101daf: 00
*******101db0: 53 push %rbx
101db1: 48 89 fb mov %rdi,%rbx
101db4: 48 8b 3f mov (%rdi),%rdi
101db7: 89 f0 mov %esi,%eax
101db9: 48 85 ff test %rdi,%rdi
101dbc: 74 05 je 101dc3 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x183>
101dbe: 83 fe ff cmp $0xffffffff,%esi
101dc1: 74 05 je 101dc8 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x188>
101dc3: 5b pop %rbx
101dc4: c3 retq
101dc5: 0f 1f 00 nopl (%rax)
101dc8: 48 8b 47 10 mov 0x10(%rdi),%rax
101dcc: 48 3b 47 18 cmp 0x18(%rdi),%rax
101dd0: 73 0e jae 101de0 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x1a0>
101dd2: 0f b6 00 movzbl (%rax),%eax
101dd5: 5b pop %rbx
101dd6: c3 retq
101dd7: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1)
101dde: 00 00
101de0: 48 8b 07 mov (%rdi),%rax
101de3: ff 50 48 callq *0x48(%rax)
101de6: 83 f8 ff cmp $0xffffffff,%eax
101de9: 75 d8 jne 101dc3 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x183>
101deb: 48 c7 03 00 00 00 00 movq $0x0,(%rbx)
101df2: 5b pop %rbx
101df3: c3 retq
101df4: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
101dfb: 00 00 00
101dfe: 66 90 xchg %ax,%ax
101e00: 55 push %rbp
101e01: 89 f5 mov %esi,%ebp
101e03: 53 push %rbx
101e04: 48 89 fb mov %rdi,%rbx
101e07: 48 83 ec 08 sub $0x8,%rsp
101e0b: e8 b0 88 f8 ff callq 8a6c0 <_ZNKSt5ctypeIcE13_M_widen_initEv#plt>
101e10: 48 8b 03 mov (%rbx),%rax
101e13: 48 8b 40 30 mov 0x30(%rax),%rax
101e17: 48 3b 05 7a 11 28 00 cmp 0x28117a(%rip),%rax # 382f98 <_ZNKSt5ctypeIcE8do_widenEc##GLIBCXX_3.4+0x2e2c48>
101e1e: 75 10 jne 101e30 <_ZNKSt7collateIcE10do_compareEPKcS2_S2_S2_##GLIBCXX_3.4+0x1f0>
101e20: 48 83 c4 08 add $0x8,%rsp
101e24: 89 e8 mov %ebp,%eax
101e26: 5b pop %rbx
101e27: 5d pop %rbp
101e28: c3 retq
101e29: 0f 1f 80 00 00 00 00 nopl 0x0(%rax)
101e30: 48 83 c4 08 add $0x8,%rsp
101e34: 40 0f be f5 movsbl %bpl,%esi
101e38: 48 89 df mov %rbx,%rdi
101e3b: 5b pop %rbx
101e3c: 5d pop %rbp
101e3d: ff e0 jmpq *%rax
101e3f: 90 nop
It seems to me that the assembly code not only performs the C++ code logic but also adds other logic.
As an example, the function _M_extract_int in libstdc++, which converts a char to int, calls this function as follows:
callq 0x101db0
The instruction address 0x101db0 is in the middle of the assembly code. The code section from 0x101db0 to 0x101dbc seems to have nothing to do with the above C++ code. Really confused about what is going on here...

Hardware supported popcount for dynamic bitset in Boost library

How to enable the hardware supported popcount for counting set bits in the dynamic bitset from the Boost 1.64.0 library?

#include <boost/dynamic_bitset.hpp>
#include <boost/function_output_iterator.hpp>
#include <cstddef>
// Returns the number of set bits in `p`: every storage block of the bitset is
// passed to a callback that adds its population count to a running total.
std::size_t fn(boost::dynamic_bitset<> const & p)
{
    using block_t = boost::dynamic_bitset<>::block_type;

    std::size_t total = 0;
    // Invoked once per block by to_block_range through the output iterator.
    auto count_block = [&total](block_t block)
    {
        total += __builtin_popcountll(block);
    };
    boost::to_block_range(p, boost::make_function_output_iterator(count_block));
    return total;
}
Compiles to (g++ -O3 -march=native -c bitset.cpp -std=c++14):
30: 48 8b 77 08 mov 0x8(%rdi),%rsi
34: 48 8b 17 mov (%rdi),%rdx
37: 48 89 f0 mov %rsi,%rax
3a: 48 29 d0 sub %rdx,%rax
3d: 48 83 f8 07 cmp $0x7,%rax
41: b8 00 00 00 00 mov $0x0,%eax
46: 7e 1d jle 65 <_Z3fn3RKN5boost14dynamic_bitsetImSaImEEE+0x35>
48: 0f 1f 84 00 00 00 00 nopl 0x0(%rax,%rax,1)
4f: 00
50: 31 c9 xor %ecx,%ecx
52: 48 83 c2 08 add $0x8,%rdx
56: f3 48 0f b8 4a f8 popcnt -0x8(%rdx),%rcx
5c: 48 01 c8 add %rcx,%rax
5f: 48 39 d6 cmp %rdx,%rsi
62: 75 ec jne 50 <_Z3fn3RKN5boost14dynamic_bitsetImSaImEEE+0x20>
64: c3 retq
65: c3 retq
66: 66 2e 0f 1f 84 00 00 nopw %cs:0x0(%rax,%rax,1)
6d: 00 00 00

Why is this AVX code slower?

Updated: 19 Aug. 2017, 16:49 UTC
I’m writing AVX code to multiply a vector with 4 billion components by a constant; however, I see no difference between my small (and, I hope, optimized) AVX code and the long, compiler-optimized scalar version.
Both versions take between 400 ms and 410 ms.
Can someone tell me why this is occurring?
And why does the large assembly generated by the compiler take almost the same time even though it's larger?
It's an important question, because if small computations -- like this multiplication -- see no improvement, then it makes no sense to write the code manually for an Intel Core CPU. Perhaps it would for an Intel Xeon ( with 16 components ) or for more complex computations.
I'm compiling with G++ with parameters:
g++ -O3 -mtune=native -march=native -mavx -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"src/Test AVX.d" -MT"src/Test\ AVX.d" -o "src/Test AVX.o" "../src/Test AVX.cpp"
My CPU is an Intel(R) Core(TM) i5-5200U CPU @ 2.20GHz.
There is the AVX code:
/**
 * Run AVX Code
 *
 * Multiplies the global `vector` by 5.0f in place, eight floats per
 * iteration, staging each group of eight through a small local buffer.
 * The loop advances by 8 while loop < SIZE and has no tail handling, so SIZE
 * is expected to be a multiple of 8.
 */
void AVX() {
    // The constant
    const __m256 _const = _mm256_set1_ps(5.0f);
    // A "buffer" between the vector and the YMM register.
    // _mm256_load_ps / _mm256_store_ps need a 32-byte aligned address, which a
    // plain float[8] does not guarantee, so the alignment is requested explicitly.
    alignas(32) float f_data[8];
    // The main loop
    for ( uint_fast32_t loop = 0 ; loop < SIZE ; loop = loop + 8 ) {
        // Load to buffer
        f_data[0] = vector[loop];
        f_data[1] = vector[loop+1];
        f_data[2] = vector[loop+2];
        f_data[3] = vector[loop+3];
        f_data[4] = vector[loop+4];
        f_data[5] = vector[loop+5];
        f_data[6] = vector[loop+6];
        f_data[7] = vector[loop+7];
        /*
         * I tried to use pointers instead of copying
         * the data, but the software crashed:
         *
         * float **f_data;
         * f_data = float*[8];
         *
         * f_data[0] = &vector[loop];
         * ...
         *
         */
        // Load the buffer into a YMM register
        __m256 _ymm0 = _mm256_load_ps(f_data);
        // Do the multiplication
        _ymm0 = _mm256_mul_ps(_ymm0,_const);
        // Copy the results from the register to the "buffer"
        _mm256_store_ps(f_data,_ymm0);
        // Copy from the "buffer" to the vector
        vector[loop] = f_data[0];
        vector[loop+1] = f_data[1];
        vector[loop+2] = f_data[2];
        vector[loop+3] = f_data[3];
        vector[loop+4] = f_data[4];
        vector[loop+5] = f_data[5];
        vector[loop+6] = f_data[6];
        vector[loop+7] = f_data[7];
    }
}
The AVX assembled:
0000000000400de0 <_Z3AVXv>:
400de0: 48 8b 05 b1 13 20 00 mov rax,QWORD PTR [rip+0x2013b1] # 602198 <vector>
400de7: c5 fc 28 0d 71 06 00 vmovaps ymm1,YMMWORD PTR [rip+0x671] # 401460 <_IO_stdin_used+0x40>
400dee: 00
400def: 48 8d 90 00 00 00 40 lea rdx,[rax+0x40000000]
400df6: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
400dfd: 00 00 00
400e00: c5 f4 59 00 vmulps ymm0,ymm1,YMMWORD PTR [rax]
400e04: 48 83 c0 20 add rax,0x20
400e08: c5 fc 11 40 e0 vmovups YMMWORD PTR [rax-0x20],ymm0
400e0d: 48 39 c2 cmp rdx,rax
400e10: 75 ee jne 400e00 <_Z3AVXv+0x20>
400e12: c5 f8 77 vzeroupper
400e15: c3 ret
400e16: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
400e1d: 00 00 00
The Serial Version:
/**
 * Run the compiler-optimized version: a plain scalar loop that multiplies
 * every element of the global `vector` by 5 in place.
 */
void Serial() {
    // Scale each element in turn
    for ( uint_fast32_t index = 0 ; index < SIZE ; ++index )
        vector[index] = vector[index] * 5;
}
The serial assembled:
It's larger, moves the data more times, and takes almost the same time. How is that possible?
0000000000400e80 <_Z6Serialv>:
400e80: 48 8b 35 11 13 20 00 mov rsi,QWORD PTR [rip+0x201311] # 602198 <vector>
400e87: 48 89 f0 mov rax,rsi
400e8a: 48 c1 e8 02 shr rax,0x2
400e8e: 48 f7 d8 neg rax
400e91: 83 e0 07 and eax,0x7
400e94: 0f 84 96 01 00 00 je 401030 <_Z6Serialv+0x1b0>
400e9a: c5 fa 10 05 7a 04 00 vmovss xmm0,DWORD PTR [rip+0x47a] # 40131c <_IO_stdin_used+0x1c>
400ea1: 00
400ea2: c5 fa 59 0e vmulss xmm1,xmm0,DWORD PTR [rsi]
400ea6: c5 fa 11 0e vmovss DWORD PTR [rsi],xmm1
400eaa: 48 83 f8 01 cmp rax,0x1
400eae: 0f 84 8c 01 00 00 je 401040 <_Z6Serialv+0x1c0>
400eb4: c5 fa 59 4e 04 vmulss xmm1,xmm0,DWORD PTR [rsi+0x4]
400eb9: c5 fa 11 4e 04 vmovss DWORD PTR [rsi+0x4],xmm1
400ebe: 48 83 f8 02 cmp rax,0x2
400ec2: 0f 84 89 01 00 00 je 401051 <_Z6Serialv+0x1d1>
400ec8: c5 fa 59 4e 08 vmulss xmm1,xmm0,DWORD PTR [rsi+0x8]
400ecd: c5 fa 11 4e 08 vmovss DWORD PTR [rsi+0x8],xmm1
400ed2: 48 83 f8 03 cmp rax,0x3
400ed6: 0f 84 86 01 00 00 je 401062 <_Z6Serialv+0x1e2>
400edc: c5 fa 59 4e 0c vmulss xmm1,xmm0,DWORD PTR [rsi+0xc]
400ee1: c5 fa 11 4e 0c vmovss DWORD PTR [rsi+0xc],xmm1
400ee6: 48 83 f8 04 cmp rax,0x4
400eea: 0f 84 2d 01 00 00 je 40101d <_Z6Serialv+0x19d>
400ef0: c5 fa 59 4e 10 vmulss xmm1,xmm0,DWORD PTR [rsi+0x10]
400ef5: c5 fa 11 4e 10 vmovss DWORD PTR [rsi+0x10],xmm1
400efa: 48 83 f8 05 cmp rax,0x5
400efe: 0f 84 6f 01 00 00 je 401073 <_Z6Serialv+0x1f3>
400f04: c5 fa 59 4e 14 vmulss xmm1,xmm0,DWORD PTR [rsi+0x14]
400f09: c5 fa 11 4e 14 vmovss DWORD PTR [rsi+0x14],xmm1
400f0e: 48 83 f8 06 cmp rax,0x6
400f12: 0f 84 6c 01 00 00 je 401084 <_Z6Serialv+0x204>
400f18: c5 fa 59 46 18 vmulss xmm0,xmm0,DWORD PTR [rsi+0x18]
400f1d: 41 b9 f9 ff ff 0f mov r9d,0xffffff9
400f23: 41 ba 07 00 00 00 mov r10d,0x7
400f29: c5 fa 11 46 18 vmovss DWORD PTR [rsi+0x18],xmm0
400f2e: 41 b8 00 00 00 10 mov r8d,0x10000000
400f34: c5 fc 28 0d 04 04 00 vmovaps ymm1,YMMWORD PTR [rip+0x404] # 401340 <_IO_stdin_used+0x40>
400f3b: 00
400f3c: 48 8d 0c 86 lea rcx,[rsi+rax*4]
400f40: 31 d2 xor edx,edx
400f42: 49 29 c0 sub r8,rax
400f45: 31 c0 xor eax,eax
400f47: 4c 89 c7 mov rdi,r8
400f4a: 48 c1 ef 03 shr rdi,0x3
400f4e: 66 90 xchg ax,ax
400f50: c5 f4 59 04 01 vmulps ymm0,ymm1,YMMWORD PTR [rcx+rax*1]
400f55: 48 83 c2 01 add rdx,0x1
400f59: c5 fc 29 04 01 vmovaps YMMWORD PTR [rcx+rax*1],ymm0
400f5e: 48 83 c0 20 add rax,0x20
400f62: 48 39 d7 cmp rdi,rdx
400f65: 77 e9 ja 400f50 <_Z6Serialv+0xd0>
400f67: 4c 89 c1 mov rcx,r8
400f6a: 4c 89 ca mov rdx,r9
400f6d: 48 83 e1 f8 and rcx,0xfffffffffffffff8
400f71: 49 8d 04 0a lea rax,[r10+rcx*1]
400f75: 48 29 ca sub rdx,rcx
400f78: 49 39 c8 cmp r8,rcx
400f7b: 0f 84 98 00 00 00 je 401019 <_Z6Serialv+0x199>
400f81: 48 8d 0c 86 lea rcx,[rsi+rax*4]
400f85: c5 fa 10 05 8f 03 00 vmovss xmm0,DWORD PTR [rip+0x38f] # 40131c <_IO_stdin_used+0x1c>
400f8c: 00
400f8d: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
400f91: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
400f95: 48 8d 48 01 lea rcx,[rax+0x1]
400f99: 48 83 fa 01 cmp rdx,0x1
400f9d: 74 7a je 401019 <_Z6Serialv+0x199>
400f9f: 48 8d 0c 8e lea rcx,[rsi+rcx*4]
400fa3: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
400fa7: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
400fab: 48 8d 48 02 lea rcx,[rax+0x2]
400faf: 48 83 fa 02 cmp rdx,0x2
400fb3: 74 64 je 401019 <_Z6Serialv+0x199>
400fb5: 48 8d 0c 8e lea rcx,[rsi+rcx*4]
400fb9: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
400fbd: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
400fc1: 48 8d 48 03 lea rcx,[rax+0x3]
400fc5: 48 83 fa 03 cmp rdx,0x3
400fc9: 74 4e je 401019 <_Z6Serialv+0x199>
400fcb: 48 8d 0c 8e lea rcx,[rsi+rcx*4]
400fcf: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
400fd3: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
400fd7: 48 8d 48 04 lea rcx,[rax+0x4]
400fdb: 48 83 fa 04 cmp rdx,0x4
400fdf: 74 38 je 401019 <_Z6Serialv+0x199>
400fe1: 48 8d 0c 8e lea rcx,[rsi+rcx*4]
400fe5: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
400fe9: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
400fed: 48 8d 48 05 lea rcx,[rax+0x5]
400ff1: 48 83 fa 05 cmp rdx,0x5
400ff5: 74 22 je 401019 <_Z6Serialv+0x199>
400ff7: 48 8d 0c 8e lea rcx,[rsi+rcx*4]
400ffb: 48 83 c0 06 add rax,0x6
400fff: c5 fa 59 09 vmulss xmm1,xmm0,DWORD PTR [rcx]
401003: c5 fa 11 09 vmovss DWORD PTR [rcx],xmm1
401007: 48 83 fa 06 cmp rdx,0x6
40100b: 74 0c je 401019 <_Z6Serialv+0x199>
40100d: 48 8d 04 86 lea rax,[rsi+rax*4]
401011: c5 fa 59 00 vmulss xmm0,xmm0,DWORD PTR [rax]
401015: c5 fa 11 00 vmovss DWORD PTR [rax],xmm0
401019: c5 f8 77 vzeroupper
40101c: c3 ret
40101d: 41 ba 04 00 00 00 mov r10d,0x4
401023: 41 b9 fc ff ff 0f mov r9d,0xffffffc
401029: e9 00 ff ff ff jmp 400f2e <_Z6Serialv+0xae>
40102e: 66 90 xchg ax,ax
401030: 41 b9 00 00 00 10 mov r9d,0x10000000
401036: 45 31 d2 xor r10d,r10d
401039: e9 f0 fe ff ff jmp 400f2e <_Z6Serialv+0xae>
40103e: 66 90 xchg ax,ax
401040: 41 b9 ff ff ff 0f mov r9d,0xfffffff
401046: 41 ba 01 00 00 00 mov r10d,0x1
40104c: e9 dd fe ff ff jmp 400f2e <_Z6Serialv+0xae>
401051: 41 ba 02 00 00 00 mov r10d,0x2
401057: 41 b9 fe ff ff 0f mov r9d,0xffffffe
40105d: e9 cc fe ff ff jmp 400f2e <_Z6Serialv+0xae>
401062: 41 ba 03 00 00 00 mov r10d,0x3
401068: 41 b9 fd ff ff 0f mov r9d,0xffffffd
40106e: e9 bb fe ff ff jmp 400f2e <_Z6Serialv+0xae>
401073: 41 ba 05 00 00 00 mov r10d,0x5
401079: 41 b9 fb ff ff 0f mov r9d,0xffffffb
40107f: e9 aa fe ff ff jmp 400f2e <_Z6Serialv+0xae>
401084: 41 ba 06 00 00 00 mov r10d,0x6
40108a: 41 b9 fa ff ff 0f mov r9d,0xffffffa
401090: e9 99 fe ff ff jmp 400f2e <_Z6Serialv+0xae>
401095: 90 nop
401096: 66 2e 0f 1f 84 00 00 nop WORD PTR cs:[rax+rax*1+0x0]
40109d: 00 00 00
The full code:
#include <iostream>
#include <xmmintrin.h>
#include <immintrin.h>
using namespace std;
/**
* The vector size
* 268435456 -> 32*8388608 -> 2^28
*/
#define SIZE 268435456
/**
* The vector for computations
*/
float *vector;
/**
* Run AVX Code
*/
void AVX() { ... }
/**
* Run Compiler optimized version
*/
void Serial() { ... }
/**
* Create the vector
*
* Allocates SIZE floats with new[] and stores the pointer in the global
* `vector`. The elements are not initialized here.
*/
void create() {
vector = new float[SIZE];
}
/**
 * Fill the vector with data
 * to be used for validation:
 * every one of the SIZE elements is set to 1.
 */
void fill() {
    // Write the start value into each slot
    for ( uint_fast32_t index = 0 ; index < SIZE ; ++index )
        vector[index] = 1;
}
/**
 * A validation to ensure the compiler has
 * computed all the vector data.
 *
 * Counts the elements that differ from 5, prints the index and value of the
 * first eleven of them, then prints the error and check totals.
 */
void validation() {
    unsigned long mismatches = 0;
    unsigned long inspected = 0;
    for ( unsigned long index = 0 ; index < SIZE ; ++index ) {
        ++inspected;
        // All the vector must be 5
        if ( vector[index] == 5 )
            continue;
        ++mismatches;
        // To avoid showing too many errors
        if ( mismatches < 12 )
            std::cout << index << ": " << vector[index] << std::endl;
    }
    // The result
    std::cout << "Errors: " << mismatches << "\nChecks: " << inspected << std::endl;
}
// Benchmark driver: allocates the vector with create() and runs AVX().
// The calls to fill(), Serial() and validation() are commented out.
int main() {
// Create the vector
create();
// Fill with data
//fill();
// The tests
//Serial();
AVX();
/*
* To ensure that the g++ optimization has executed the loop
*/
//validation();
}
Compiled with:
g++ -O3 -mtune=native -march=native -mavx -g3 -Wall -c -fmessage-length=0 -MMD -MP -MF"src/Test AVX.d" -MT"src/Test\ AVX.d" -o "src/Test AVX.o" "../src/Test AVX.cpp"

Multiplying by 5 is so trivial that you should do that on the fly next time you read the array, or fold it into the code that wrote this array. Loading all that data from RAM into the CPU and storing it back again just to multiply by 5.0 is not efficient.
If you can't just fold it into a different pass of your algorithm, try cache-blocking aka loop-tiling to run multiple steps of your algorithm over a part of this array that fits into cache, before moving on to the next cache-sized block.
Your scalar code auto-vectorizes to nearly the same inner loop as your manually-vectorized version. Neither one is unrolled at all.
The extra code size in gcc's version is just scalar startup / cleanup so its inner loop can use aligned loads/stores. gcc fully unrolls those loops.
Also note that your manually-vectorized code doesn't handle the case where SIZE is not a multiple of 8. (gcc does handle the cleanup at the end even then, because it doesn't know where the alignment boundary will be.)
clang usually just uses unaligned loads/stores on arrays that it can't prove at compile time are always aligned. gcc's default behaviour is maybe good for large arrays that actually are misaligned at run-time, but a total waste of I-cache and branches for cases where the data is in fact aligned at run time most of the time, or for small arrays where doing a bunch of branching and scalar iterations isn't worth it.
The inner loops are nearly the same. In your manually vectorized version, gcc managed to optimize away the element-by-element copy through f_data and emit what you would get from _mm256_loadu_ps(&vector[loop]), instead of actually copying to a local and then doing a vector load. And same for storing back into vector[], luckily for you.
# top of inner loop in the manually-vectorized version:
400e00: c5 f4 59 00 vmulps ymm0,ymm1,YMMWORD PTR [rax]
400e04: 48 83 c0 20 add rax,0x20
400e08: c5 fc 11 40 e0 vmovups YMMWORD PTR [rax-0x20],ymm0
400e0d: 48 39 c2 cmp rdx,rax
400e10: 75 ee jne 400e00 <_Z3AVXv+0x20>
gcc's inner loop uses a loop counter separate from the pointer, so it has an extra instruction, and it uses an indexed addressing mode. vmulps ymm0,ymm1,YMMWORD PTR [rcx+rax*1] can't stay micro-fused on Haswell, so it will issue as 2 fused-domain uops.
# top of gcc's inner loop:
400f50: c5 f4 59 04 01 vmulps ymm0,ymm1,YMMWORD PTR [rcx+rax*1]
400f55: 48 83 c2 01 add rdx,0x1
400f59: c5 fc 29 04 01 vmovaps YMMWORD PTR [rcx+rax*1],ymm0
400f5e: 48 83 c0 20 add rax,0x20
400f62: 48 39 d7 cmp rdi,rdx
400f65: 77 e9 ja 400f50 <_Z6Serialv+0xd0>
The extra add instruction is another extra uop. This is 6 fused-domain uops (and thus can run at best one iteration per 1.5 cycles, bottlenecked on the front-end).
Your manual version is only 4 fused-domain uops, so it can issue at 1 per clock. It can in theory run that fast if the buffer is hot in L1D cache (or maybe L2), also limited by 1 store per clock.
Of course, since you're running it over a giant buffer, you just bottleneck on memory bandwidth. The minor front-end bottleneck in the auto-vectorized version is a total non-issue. Even an SSE2 version would barely run slower.
You said something about a Xeon with 16 cores. If you want gcc to auto-parallelize as well as SIMD vectorize, you could use OpenMP. As it is, your code is purely single-threaded.

clang generated code for a simple factorial function

Here is the code :
// Recursive factorial: returns 1 for n == 0, otherwise n * factorial(n - 1).
// The arithmetic is unsigned, so large results wrap modulo 2^(bits in unsigned int).
unsigned int factorial(unsigned int n) {
    return (n == 0) ? 1 : n * factorial(n - 1);
}
// Entry point: the value of factorial(0) is returned as the exit status.
int main() {
return factorial(0);
}
I generate both gcc and clang assemblies using
g++ -g -O2 -c factorial.cpp (resp clang++)
objdump -d -M intel -S factorial.o
Here is what I get for gcc
factorial.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z9factorialj>:
unsigned int factorial(unsigned int n)
{
if (n == 0)
0: 85 ff test edi,edi
2: b8 01 00 00 00 mov eax,0x1
7: 74 11 je 1a <_Z9factorialj+0x1a>
9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
10: 0f af c7 imul eax,edi
13: 83 ef 01 sub edi,0x1
16: 75 f8 jne 10 <_Z9factorialj+0x10>
18: f3 c3 repz ret
{
return 1;
}
return n * factorial(n - 1);
}
1a: f3 c3 repz ret
Disassembly of section .text.startup:
0000000000000000 <main>:
if (n == 0)
0: b8 01 00 00 00 mov eax,0x1
5: c3 ret
and for clang
factorial.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <_Z9factorialj>:
unsigned int factorial(unsigned int n)
{
0: b8 01 00 00 00 mov eax,0x1
if (n == 0)
5: 85 ff test edi,edi
7: 0f 84 ba 01 00 00 je 1c7 <_Z9factorialj+0x1c7>
d: b8 01 00 00 00 mov eax,0x1
{
return 1;
}
return n * factorial(n - 1);
12: 83 ff 08 cmp edi,0x8
15: 0f 82 a5 01 00 00 jb 1c0 <_Z9factorialj+0x1c0>
1b: 41 89 f8 mov r8d,edi
1e: 41 83 e0 f8 and r8d,0xfffffff8
22: 89 fa mov edx,edi
24: 83 e2 f8 and edx,0xfffffff8
27: 0f 84 93 01 00 00 je 1c0 <_Z9factorialj+0x1c0>
2d: 8d 4f f8 lea ecx,[rdi-0x8]
30: 89 c8 mov eax,ecx
32: c1 e8 03 shr eax,0x3
35: 0f ba e1 03 bt ecx,0x3
39: 72 24 jb 5f <_Z9factorialj+0x5f>
3b: 66 0f 6e c7 movd xmm0,edi
3f: 66 0f 70 c8 00 pshufd xmm1,xmm0,0x0
44: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 4c <_Z9factorialj+0x4c>
4b: 00
4c: 66 0f fe c1 paddd xmm0,xmm1
50: 66 0f fe 0d 00 00 00 paddd xmm1,XMMWORD PTR [rip+0x0] # 58 <_Z9factorialj+0x58>
57: 00
58: b9 08 00 00 00 mov ecx,0x8
5d: eb 0e jmp 6d <_Z9factorialj+0x6d>
5f: 66 0f 6f 05 00 00 00 movdqa xmm0,XMMWORD PTR [rip+0x0] # 67 <_Z9factorialj+0x67>
66: 00
67: 31 c9 xor ecx,ecx
69: 66 0f 6f c8 movdqa xmm1,xmm0
6d: 85 c0 test eax,eax
6f: 0f 84 d4 00 00 00 je 149 <_Z9factorialj+0x149>
75: 89 d0 mov eax,edx
77: 29 c8 sub eax,ecx
79: 89 fe mov esi,edi
7b: 29 ce sub esi,ecx
7d: 66 0f 6f 15 00 00 00 movdqa xmm2,XMMWORD PTR [rip+0x0] # 85 <_Z9factorialj+0x85>
84: 00
85: 66 0f 6f 1d 00 00 00 movdqa xmm3,XMMWORD PTR [rip+0x0] # 8d <_Z9factorialj+0x8d>
8c: 00
8d: 0f 1f 00 nop DWORD PTR [rax]
90: 66 0f 6e e6 movd xmm4,esi
94: 66 0f 70 e4 00 pshufd xmm4,xmm4,0x0
99: 66 0f 6f ec movdqa xmm5,xmm4
9d: 66 0f fe ea paddd xmm5,xmm2
a1: 66 0f fe e3 paddd xmm4,xmm3
a5: 66 0f 70 f5 f5 pshufd xmm6,xmm5,0xf5
aa: 66 0f f4 e8 pmuludq xmm5,xmm0
ae: 66 0f 70 ed e8 pshufd xmm5,xmm5,0xe8
b3: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
b8: 66 0f f4 c6 pmuludq xmm0,xmm6
bc: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
c1: 66 0f 62 e8 punpckldq xmm5,xmm0
c5: 66 0f 70 c4 f5 pshufd xmm0,xmm4,0xf5
ca: 66 0f f4 e1 pmuludq xmm4,xmm1
ce: 66 0f 70 e4 e8 pshufd xmm4,xmm4,0xe8
d3: 66 0f 70 c9 f5 pshufd xmm1,xmm1,0xf5
d8: 66 0f f4 c8 pmuludq xmm1,xmm0
dc: 66 0f 70 c1 e8 pshufd xmm0,xmm1,0xe8
e1: 66 0f 62 e0 punpckldq xmm4,xmm0
e5: 8d 4e f8 lea ecx,[rsi-0x8]
e8: 66 0f 6e c1 movd xmm0,ecx
ec: 66 0f 70 c8 00 pshufd xmm1,xmm0,0x0
f1: 66 0f 6f c1 movdqa xmm0,xmm1
f5: 66 0f fe c2 paddd xmm0,xmm2
f9: 66 0f fe cb paddd xmm1,xmm3
fd: 66 0f 70 f0 f5 pshufd xmm6,xmm0,0xf5
102: 66 0f f4 c5 pmuludq xmm0,xmm5
106: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
10b: 66 0f 70 ed f5 pshufd xmm5,xmm5,0xf5
110: 66 0f f4 ee pmuludq xmm5,xmm6
114: 66 0f 70 ed e8 pshufd xmm5,xmm5,0xe8
119: 66 0f 62 c5 punpckldq xmm0,xmm5
11d: 66 0f 70 e9 f5 pshufd xmm5,xmm1,0xf5
122: 66 0f f4 cc pmuludq xmm1,xmm4
126: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
12b: 66 0f 70 e4 f5 pshufd xmm4,xmm4,0xf5
130: 66 0f f4 e5 pmuludq xmm4,xmm5
134: 66 0f 70 e4 e8 pshufd xmm4,xmm4,0xe8
139: 66 0f 62 cc punpckldq xmm1,xmm4
13d: 83 c6 f0 add esi,0xfffffff0
140: 83 c0 f0 add eax,0xfffffff0
143: 0f 85 47 ff ff ff jne 90 <_Z9factorialj+0x90>
149: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
14e: 66 0f f4 c8 pmuludq xmm1,xmm0
152: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
157: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
15c: 66 0f f4 c2 pmuludq xmm0,xmm2
160: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
165: 66 0f 62 c8 punpckldq xmm1,xmm0
169: 66 0f 70 c1 4e pshufd xmm0,xmm1,0x4e
16e: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
173: 66 0f f4 c8 pmuludq xmm1,xmm0
177: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
17c: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
181: 66 0f f4 c2 pmuludq xmm0,xmm2
185: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
18a: 66 0f 62 c8 punpckldq xmm1,xmm0
18e: 66 0f 70 c1 e5 pshufd xmm0,xmm1,0xe5
193: 66 0f 70 d1 f5 pshufd xmm2,xmm1,0xf5
198: 66 0f f4 c8 pmuludq xmm1,xmm0
19c: 66 0f 70 c9 e8 pshufd xmm1,xmm1,0xe8
1a1: 66 0f 70 c0 f5 pshufd xmm0,xmm0,0xf5
1a6: 66 0f f4 c2 pmuludq xmm0,xmm2
1aa: 66 0f 70 c0 e8 pshufd xmm0,xmm0,0xe8
1af: 66 0f 62 c8 punpckldq xmm1,xmm0
1b3: 66 0f 7e c8 movd eax,xmm1
1b7: 39 fa cmp edx,edi
1b9: 74 0c je 1c7 <_Z9factorialj+0x1c7>
1bb: 44 29 c7 sub edi,r8d
1be: 66 90 xchg ax,ax
1c0: 0f af c7 imul eax,edi
if (n == 0)
1c3: ff cf dec edi
1c5: 75 f9 jne 1c0 <_Z9factorialj+0x1c0>
}
1c7: c3 ret
1c8: 0f 1f 84 00 00 00 00 nop DWORD PTR [rax+rax*1+0x0]
1cf: 00
00000000000001d0 <main>:
int main()
{
return factorial(0);
1d0: b8 01 00 00 00 mov eax,0x1
1d5: c3 ret
I understand they both notice they can unroll the loop and precompute the value but I don't get why clang generates all this code (and I have no idea what this can be ever doing and why there is sse stuff...)
Bonus question: while gcc precomputes the value up to factorial(7), clang computes it for any value.
Up to factorial(31) the values look fine, for factorial(32) and factorial(33) it returns 0x80000000 and for greater values it does xor eax, eax instead. What is this black magic ?

"call" instruction that seemingly jumps into itself

I have some C++ code
#include <cstdio>
#include <boost/bind.hpp>
#include <boost/function.hpp>
// Minimal class whose only member function is the target bound in func().
class A {
public:
// Prints "aaa" with std::printf.
void do_it() { std::printf("aaa"); }
};
// Invokes the nullary callable wrapped in `f`.
void
call_it(const boost::function<void()> &f)
{
f();
}
void
func()
{
A *a = new A;
call_it(boost::bind(&A::do_it, a));
}
which gcc 4 compiles into the following assembly (from objdump):
00000030 <func()>:
30: 55 push %ebp
31: 89 e5 mov %esp,%ebp
33: 56 push %esi
34: 31 f6 xor %esi,%esi
36: 53 push %ebx
37: bb 00 00 00 00 mov $0x0,%ebx
3c: 83 ec 40 sub $0x40,%esp
3f: c7 04 24 01 00 00 00 movl $0x1,(%esp)
46: e8 fc ff ff ff call 47 <func()+0x17>
4b: 8d 55 ec lea 0xffffffec(%ebp),%edx
4e: 89 14 24 mov %edx,(%esp)
51: 89 5c 24 04 mov %ebx,0x4(%esp)
55: 89 74 24 08 mov %esi,0x8(%esp)
59: 89 44 24 0c mov %eax,0xc(%esp)
; the rest of the function is omitted
I can't understand the operand of call instruction here, why does it call into itself, but with one byte off?

The call is probably to an external function, and the address you see (FFFFFFFC) is just a placeholder for the real address, which the linker and/or loader will take care of later.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js