Explanation of std::vector<int> sum ASM - c++

I was playing around with the Compiler Explorer, and I'm struggling to understand the ASM output (x86 Clang 3.7 -O3) of a simple std::vector<int> sum function:
#include <vector>
#include <numeric>
int sum(const std::vector<int>& v)
{
return std::accumulate(v.begin(), v.end(), 0);
}
The ASM for this code is:
sum(std::vector<int, std::allocator<int> > const&): # #sum(std::vector<int, std::allocator<int> > const&)
movq (%rdi), %rsi
movq 8(%rdi), %r11
xorl %eax, %eax
cmpq %r11, %rsi
je .LBB0_13
movabsq $9223372036854775800, %rax # imm = 0x7FFFFFFFFFFFFFF8
leaq -4(%r11), %rdx
movq %rdx, %r10
subq %rsi, %r10
shrq $2, %r10
incq %r10
xorl %edi, %edi
movq %r10, %r8
andq %rax, %r8
pxor %xmm0, %xmm0
je .LBB0_2
andq %r10, %rax
leaq -8(%rax), %r9
movl %r9d, %ecx
shrl $3, %ecx
incl %ecx
xorl %edi, %edi
testb $3, %cl
je .LBB0_4
subl %esi, %edx
shrl $2, %edx
incl %edx
andl $24, %edx
addl $-8, %edx
shrl $3, %edx
incl %edx
andl $3, %edx
negq %rdx
pxor %xmm0, %xmm0
xorl %edi, %edi
pxor %xmm1, %xmm1
.LBB0_6: # %vector.body.prol
movdqu (%rsi,%rdi,4), %xmm2
movdqu 16(%rsi,%rdi,4), %xmm3
paddd %xmm2, %xmm0
paddd %xmm3, %xmm1
addq $8, %rdi
incq %rdx
jne .LBB0_6
jmp .LBB0_7
.LBB0_2:
pxor %xmm1, %xmm1
jmp .LBB0_11
.LBB0_4:
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
.LBB0_7: # %vector.body.preheader.split
leaq (%rsi,%r8,4), %rdx
cmpq $24, %r9
jb .LBB0_10
subq %rdi, %rax
leaq 112(%rsi,%rdi,4), %rsi
.LBB0_9: # %vector.body
movdqu -112(%rsi), %xmm2
movdqu -96(%rsi), %xmm3
movdqu -80(%rsi), %xmm4
movdqu -64(%rsi), %xmm5
paddd %xmm0, %xmm2
paddd %xmm1, %xmm3
paddd %xmm4, %xmm2
paddd %xmm5, %xmm3
movdqu -48(%rsi), %xmm4
movdqu -32(%rsi), %xmm5
paddd %xmm2, %xmm4
paddd %xmm3, %xmm5
movdqu -16(%rsi), %xmm0
movdqu (%rsi), %xmm1
paddd %xmm4, %xmm0
paddd %xmm5, %xmm1
subq $-128, %rsi
addq $-32, %rax
jne .LBB0_9
.LBB0_10:
movq %rdx, %rsi
movq %r8, %rdi
.LBB0_11: # %middle.block
paddd %xmm1, %xmm0
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
paddd %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
paddd %xmm1, %xmm0
movd %xmm0, %eax
cmpq %rdi, %r10
je .LBB0_13
.LBB0_12: # %.lr.ph.i
addl (%rsi), %eax
addq $4, %rsi
cmpq %rsi, %r11
jne .LBB0_12
.LBB0_13: # %int std::accumulate<__gnu_cxx::__normal_iterator<int const*, std::vector<int, std::allocator<int> > >, int>(__gnu_cxx::__normal_iterator<int const*, std::vector<int, std::allocator<int> > >, __gnu_cxx::__normal_iterator<int const*, std::vector<int, std::allocator<int> > >, int) [clone .exit]
retq
For comparison, the ASM for the same function but using std::vector<double> is:
sum(std::vector<double, std::allocator<double> > const&):
movq 8(%rdi), %rdx
movq (%rdi), %rax
pxor %xmm0, %xmm0
cmpq %rax, %rdx
je .L4
.L3:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rdx
jne .L3
rep ret
.L4:
rep ret
The ASM for std::vector<double> seems fairly trivial, while the ASM for std::vector<int> appears markedly more complex. I'm assuming there is some clever optimisation going on with std::vector<int>, but I'm at a bit of a loss to explain what's going on. Could someone enlighten me?

Short answer: the compiler has vectorised and unrolled the loop for adding integers. Compare with the vector<double> version, which has these lines:
addsd (%rax), %xmm0
addq $8, %rax
This means it's adding a single double into the sum, then moving on 8 bytes and looping. (The double version stays scalar because floating-point addition is not associative, so the compiler cannot reorder the additions into SIMD lanes without a flag such as -ffast-math; integer addition has no such restriction.)
The same code in the main loop of the vector<int> version does:
movdqu -112(%rsi), %xmm2
movdqu -96(%rsi), %xmm3
movdqu -80(%rsi), %xmm4
movdqu -64(%rsi), %xmm5
...
movdqu -48(%rsi), %xmm4
movdqu -32(%rsi), %xmm5
...
movdqu -16(%rsi), %xmm0
...
movdqu (%rsi), %xmm1
...
subq $-128, %rsi
The movdqu loads show it's reading 16 bytes (4 integers) at a time, and the subq $-128, %rsi shows it's processing 128 bytes (32 ints) per loop iteration across 8 loads. The net result of each iteration is to add the next 32 ints into the 8 int-sized lanes spread across xmm0 and xmm1.
.LBB0_11 then takes the output of the main loop (8 partial sums across xmm0 and xmm1) and reduces them to a single sum.
.LBB0_12 then finishes off any ints at the end of the vector which couldn't be consumed by the main loop (since the main loop works on 32 integers at a time).
It vectorises the adds so that it can handle 4 integers at once, which is generally faster than doing one integer at a time. It also unrolls the loop so that it can do several iterations' worth of adding per pass.
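To make that concrete, here is a rough SSE2 intrinsics sketch of the same strategy (an illustration only, not code recovered from the compiler output; the real loop unrolls deeper, 32 ints per iteration, and has extra prologue logic). It keeps two vector accumulators of 4 ints each, does the same pshufd/paddd horizontal reduction as .LBB0_11, and uses a scalar tail like .LBB0_12:
#include <emmintrin.h>  // SSE2 intrinsics
#include <cstddef>

int sum_sse2_sketch(const int* p, std::size_t n)
{
    __m128i acc0 = _mm_setzero_si128();   // 4 partial sums
    __m128i acc1 = _mm_setzero_si128();   // 4 more partial sums
    std::size_t i = 0;
    for (; i + 8 <= n; i += 8) {          // 8 ints per pass here (the real loop does 32)
        acc0 = _mm_add_epi32(acc0, _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + i)));
        acc1 = _mm_add_epi32(acc1, _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + i + 4)));
    }
    __m128i acc = _mm_add_epi32(acc0, acc1);                 // paddd %xmm1, %xmm0
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0x4E));  // pshufd $78: add the high pair onto the low pair
    acc = _mm_add_epi32(acc, _mm_shuffle_epi32(acc, 0xE5));  // pshufd $229: fold in the remaining lane
    int result = _mm_cvtsi128_si32(acc);                     // movd %xmm0, %eax
    for (; i < n; ++i)                                       // scalar tail for leftover ints
        result += p[i];
    return result;
}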
Explanation of vectorization: What does vectorization mean?
Explanation of loop unrolling: When, if ever, is loop unrolling still useful?
I've not analysed the start of the code for the integer case in detail, but broadly it is setting up the main loop: computing the trip count and running a short prologue (the vector.body.prol block) so that the remaining iterations fit the main loop's unroll factor before it starts.

Related

Why do some SSE intrinsics introduce moves back and forth?

In my code, I set a 128-bit variable to zero, but I don't quite understand why it translates into two move instructions in the assembly code.
__m128i zeros = reinterpret_cast<__m128i>(_mm_setzero_pd());
The corresponding assembly code has two moves back and forth between xmm0 and 0x40(%rsp).
00709658: 0F 57 C0 xorps %xmm0, %xmm0
0070965B: 66 0F 29 44 24 40 movapd %xmm0, 0x40(%rsp)
00709661: 66 0F 28 44 24 40 movapd 0x40(%rsp), %xmm0
My compiler is Clang 10.0, with no optimization enabled at the time I asked the question.
Here is a minimal implementation of my code.
template <int LEN>
bool SSEEncodeChunk(const char** srcp, char** dstp) {
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(*srcp));
__m128i zeros = reinterpret_cast<__m128i>(_mm_setzero_pd());
__m128i zero_bytes = _mm_cmpeq_epi8(data, zeros);
bool all_zeros = _mm_testz_si128(zero_bytes, zero_bytes);
if ((!all_zeros)) {
return false;
}
_mm_storeu_si128(reinterpret_cast<__m128i*>(*dstp), data);
*dstp += LEN;
*srcp += LEN;
return true;
}
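As an aside, the same zero vector can be produced without the reinterpret_cast; a minimal sketch, assuming SSE2 and <emmintrin.h>:
#include <emmintrin.h>

static inline __m128i make_zero()
{
    return _mm_setzero_si128();                   // integer zero vector directly
    // or, when starting from a double vector:
    // return _mm_castpd_si128(_mm_setzero_pd()); // bit-cast between vector types
}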
Update on June 28th: clang version 10.0.0-4ubuntu1, Ubuntu 20.04.
Here is my code in assembly with -O0. I also checked the options, with -fomit-frame-pointer. There is indeed more than one move back and forth between an %xmm register and offset(%rsp); my first post only showed part of them.
(lldb) disassemble --name SSEEncodeChunk
index_type_traits_test`SSEEncodeChunk<16>:
[0x709630] <+0>: subq $0x58, %rsp
[0x709634] <+4>: movq %rdi, -0x38(%rsp)
[0x709639] <+9>: movq %rsi, -0x40(%rsp)
[0x70963e] <+14>: movq -0x38(%rsp), %rax
[0x709643] <+19>: movq (%rax), %rax
[0x709646] <+22>: movq %rax, -0x28(%rsp)
[0x70964b] <+27>: movq -0x28(%rsp), %rax
[0x709650] <+32>: movups (%rax), %xmm0
[0x709653] <+35>: movaps %xmm0, -0x50(%rsp)
[0x709658] <+40>: xorps %xmm0, %xmm0
[0x70965b] <+43>: movapd %xmm0, 0x40(%rsp) ; this is the point I ask the question at the first time
[0x709661] <+49>: movapd 0x40(%rsp), %xmm0
[0x709667] <+55>: movapd %xmm0, -0x60(%rsp)
[0x70966d] <+61>: movaps -0x50(%rsp), %xmm0
[0x709672] <+66>: movaps -0x60(%rsp), %xmm1
[0x709677] <+71>: movaps %xmm0, 0x30(%rsp)
[0x70967c] <+76>: movaps %xmm1, 0x20(%rsp)
[0x709681] <+81>: movaps 0x30(%rsp), %xmm0
[0x709686] <+86>: movaps 0x20(%rsp), %xmm1
[0x70968b] <+91>: pcmpeqb %xmm1, %xmm0
[0x70968f] <+95>: movdqa %xmm0, -0x70(%rsp)
[0x709695] <+101>: movdqa -0x70(%rsp), %xmm0
[0x70969b] <+107>: movdqa -0x70(%rsp), %xmm1
[0x7096a1] <+113>: movdqa %xmm0, 0x10(%rsp)
[0x7096a7] <+119>: movdqa %xmm1, (%rsp)
[0x7096ac] <+124>: movdqa 0x10(%rsp), %xmm0
[0x7096b2] <+130>: movdqa (%rsp), %xmm1
[0x7096b7] <+135>: ptest %xmm1, %xmm0
[0x7096bc] <+140>: sete %cl
[0x7096bf] <+143>: movzbl %cl, %edx
[0x7096c2] <+146>: cmpl $0x0, %edx
[0x7096c5] <+149>: setne %cl
[0x7096c8] <+152>: andb $0x1, %cl
[0x7096cb] <+155>: movb %cl, -0x71(%rsp)
[0x7096cf] <+159>: movb -0x71(%rsp), %cl
[0x7096d3] <+163>: xorb $-0x1, %cl
[0x7096d6] <+166>: testb $0x1, %cl
[0x7096d9] <+169>: jne 0x7096e4
[0x7096df] <+175>: jmp 0x7096ee
[0x7096e4] <+180>: movb $0x0, -0x29(%rsp)
[0x7096e9] <+185>: jmp 0x70973f
[0x7096ee] <+190>: movq -0x40(%rsp), %rax
[0x7096f3] <+195>: movq (%rax), %rax
[0x7096f6] <+198>: movdqa -0x50(%rsp), %xmm0
[0x7096fc] <+204>: movq %rax, -0x8(%rsp)
[0x709701] <+209>: movdqa %xmm0, -0x20(%rsp)
[0x709707] <+215>: movdqa -0x20(%rsp), %xmm0
[0x70970d] <+221>: movq -0x8(%rsp), %rax
[0x709712] <+226>: movdqu %xmm0, (%rax)
[0x709716] <+230>: movq -0x40(%rsp), %rax
[0x70971b] <+235>: movq (%rax), %rcx
[0x70971e] <+238>: addq $0x10, %rcx
[0x709725] <+245>: movq %rcx, (%rax)
[0x709728] <+248>: movq -0x38(%rsp), %rax
[0x70972d] <+253>: movq (%rax), %rcx
[0x709730] <+256>: addq $0x10, %rcx
[0x709737] <+263>: movq %rcx, (%rax)
[0x70973a] <+266>: movb $0x1, -0x29(%rsp)
[0x70973f] <+271>: movb -0x29(%rsp), %al
[0x709743] <+275>: andb $0x1, %al
[0x709745] <+277>: movzbl %al, %eax
[0x709748] <+280>: addq $0x58, %rsp
[0x70974c] <+284>: retq
After turning on -O2, the back-and-forth moves do disappear:
0051E8D5: F3 0F 6F 08 movdqu (%rax), %xmm1
0051E8D9: 66 0F EF C0 pxor %xmm0, %xmm0
0051E8DD: 66 0F 6F D1 movdqa %xmm1, %xmm2
0051E8E1: 66 0F 74 D0 pcmpeqb %xmm0, %xmm2
0051E8E5: 66 0F 38 17 D2 ptest %xmm2, %xmm2
0051E8EA: 0F 85 97 03 00 00 jne 0x51ec87
0051E8F0: F3 0F 7F 0C 3B movdqu %xmm1, (%rbx,%rdi)
...
0051E967: 48 01 F8 addq %rdi, %rax
0051E96A: 48 83 C0 10 addq $0x10, %rax
0051E96E: 48 01 FB addq %rdi, %rbx
0051E971: 48 83 C3 10 addq $0x10, %rbx

How to efficiently add two vectors in C++

Suppose I have two vectors a and b, each stored as a vector. I want to compute a += b or a += b * k, where k is a number.
I can for sure do the following,
while (size--) {
(*a++) += (*b++) * k;
}
But what are the possible ways to easily leverage SIMD instructions such as SSE2?
The only thing you should need to do is enable auto-vectorization in your compiler.
For example, compiling your code (assuming float) with GCC 5.2.0 and -O3 produces this main loop:
L8:
movups (%rsi,%rax), %xmm1
addl $1, %r11d
mulps %xmm2, %xmm1
addps (%rdi,%rax), %xmm1
movaps %xmm1, (%rdi,%rax)
addq $16, %rax
cmpl %r11d, %r10d
ja .L8
Clang also vectorizes the loop, and additionally unrolls it four times. Unrolling may help on some processors, especially Haswell, even though there is no dependency chain. In fact, you can get GCC to unroll by adding -funroll-loops; GCC will unroll to eight independent operations in this case, unlike the case where there is a dependency chain.
One problem you may encounter is that your compiler may need to add some code to determine whether the arrays overlap, and then emit two branches: one without vectorization for when they do overlap, and one with vectorization for when they don't. GCC and Clang both do this. ICC, however, does not vectorize the loop.
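Conceptually, the runtime check that guards the vectorized branch looks something like the following sketch (a rough illustration, not the compiler's actual generated code):
// Vectorizing is safe when the destination range [a, a+size) and the
// source range [b, b+size) do not overlap.
static bool ranges_do_not_overlap(const float* a, const float* b, int size)
{
    return a + size <= b || b + size <= a;
}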
ICC 13.0.01 with -O3
..B1.4: # Preds ..B1.2 ..B1.4
movss (%rsi), %xmm1 #3.21
incl %ecx #2.5
mulss %xmm0, %xmm1 #3.28
addss (%rdi), %xmm1 #3.11
movss %xmm1, (%rdi) #3.11
movss 4(%rsi), %xmm2 #3.21
addq $8, %rsi #3.21
mulss %xmm0, %xmm2 #3.28
addss 4(%rdi), %xmm2 #3.11
movss %xmm2, 4(%rdi) #3.11
addq $8, %rdi #3.11
cmpl %eax, %ecx #2.5
jb ..B1.4 # Prob 63% #2.5
To fix this you need to tell the compiler the arrays don't overlap using the __restrict keyword.
void foo(float * __restrict a, float * __restrict b, float k, int size) {
while (size--) {
(*a++) += (*b++) * k;
}
}
In this case ICC produces two branches: one for when the arrays are 16-byte aligned and one for when they are not. Here is the aligned branch:
..B1.16: # Preds ..B1.16 ..B1.15
movaps (%rsi), %xmm2 #3.21
addl $8, %r8d #2.5
movaps 16(%rsi), %xmm3 #3.21
addq $32, %rsi #1.6
mulps %xmm1, %xmm2 #3.28
mulps %xmm1, %xmm3 #3.28
addps (%rdi), %xmm2 #3.11
addps 16(%rdi), %xmm3 #3.11
movaps %xmm2, (%rdi) #3.11
movaps %xmm3, 16(%rdi) #3.11
addq $32, %rdi #1.6
cmpl %ecx, %r8d #2.5
jb ..B1.16 # Prob 82% #2.5
ICC unrolls twice in both cases. Even though GCC and Clang produce both a vectorized and an unvectorized branch without __restrict, you may want to use __restrict anyway to remove the overhead of the code that determines which branch to use.
The last thing you can try is to tell the compiler the arrays are aligned. This works with GCC and Clang (3.6):
void foo(float * __restrict a, float * __restrict b, float k, int size) {
a = (float*)__builtin_assume_aligned (a, 32);
b = (float*)__builtin_assume_aligned (b, 32);
while (size--) {
(*a++) += (*b++) * k;
}
}
In this case GCC produces:
.L4:
movaps (%rsi,%r8), %xmm1
addl $1, %r10d
mulps %xmm2, %xmm1
addps (%rdi,%r8), %xmm1
movaps %xmm1, (%rdi,%r8)
addq $16, %r8
cmpl %r10d, %eax
ja .L4
Lastly, if your compiler supports OpenMP 4.0, you can use OpenMP like this:
void foo(float * __restrict a, float * __restrict b, float k, int size) {
#pragma omp simd aligned(a:32) aligned(b:32)
for(int i=0; i<size; i++) {
a[i] += k*b[i];
}
}
GCC produces the same code in this case as when using __builtin_assume_aligned. This should work for a more recent version of ICC (which I don't have).
I did not check MSVC. I expect it vectorizes this loop as well.
For more details about restrict and the compiler producing different branches with and without overlap and for aligned and not aligned see
sum-of-overlapping-arrays-auto-vectorization-and-restrict.
Here is one more suggestion to consider. If you know that the loop's trip count is a multiple of the SIMD width, the compiler will not have to emit cleanup code. The following code
// gcc -O3
// n = size/8
void foo(float * __restrict a, float * __restrict b, float k, int n) {
a = (float*)__builtin_assume_aligned (a, 32);
b = (float*)__builtin_assume_aligned (b, 32);
//#pragma omp simd aligned(a:32) aligned(b:32)
for(int i=0; i<n*8; i++) {
a[i] += k*b[i];
}
}
produces the simplest assembly so far.
foo(float*, float*, float, int):
sall $2, %edx
testl %edx, %edx
jle .L1
subl $4, %edx
shufps $0, %xmm0, %xmm0
shrl $2, %edx
xorl %eax, %eax
xorl %ecx, %ecx
addl $1, %edx
.L4:
movaps (%rsi,%rax), %xmm1
addl $1, %ecx
mulps %xmm0, %xmm1
addps (%rdi,%rax), %xmm1
movaps %xmm1, (%rdi,%rax)
addq $16, %rax
cmpl %edx, %ecx
jb .L4
.L1:
rep ret
I used a multiple of 8 and 32-byte alignment because then, just by adding the compiler switch -mavx, the compiler produces nice AVX vectorization.
foo(float*, float*, float, int):
sall $3, %edx
testl %edx, %edx
jle .L5
vshufps $0, %xmm0, %xmm0, %xmm0
subl $8, %edx
xorl %eax, %eax
shrl $3, %edx
xorl %ecx, %ecx
addl $1, %edx
vinsertf128 $1, %xmm0, %ymm0, %ymm0
.L4:
vmulps (%rsi,%rax), %ymm0, %ymm1
addl $1, %ecx
vaddps (%rdi,%rax), %ymm1, %ymm1
vmovaps %ymm1, (%rdi,%rax)
addq $32, %rax
cmpl %edx, %ecx
jb .L4
vzeroupper
.L5:
rep ret
I am not sure how the preamble could be made simpler, but the only improvement I see left is to remove one of the iterators and a compare. Namely, the addl $1, %ecx instruction should not be necessary, and neither should the cmpl %edx, %ecx. I'm not sure how to get GCC to fix this. I have had a problem like this with GCC before (Produce loops without cmp instruction in GCC).
The functions SAXPY (single-precision), DAXPY (double-precision), CAXPY (complex single-precision), and ZAXPY (complex double-precision) compute exactly the expression you want:
Y = a * X + Y
where a is a scalar constant, and X and Y are vectors.
These functions are provided by BLAS libraries and are optimized for all practical platforms. For CPUs the best BLAS implementations are OpenBLAS, Intel MKL (optimized for Intel x86 processors and Xeon Phi co-processors only), BLIS, and Apple Accelerate (OS X only); for NVIDIA GPUs look at cuBLAS (part of the CUDA SDK), and for any GPU, ArrayFire.
These libraries are well-optimized and deliver better performance than whatever implementation you can quickly hack up.
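For example, a minimal sketch of calling SAXPY through the CBLAS interface (assuming a CBLAS-providing library such as OpenBLAS is installed and linked):
#include <cblas.h>

void axpy(float* a, const float* b, float k, int size)
{
    // a = k * b + a, with unit stride for both arrays
    cblas_saxpy(size, k, b, 1, a, 1);
}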

Storing a constant in SSE register (GCC, C++)

Hello StackOverflow community
I have encountered the following challenge: in my C++ application I have a fairly complex (cubic) loop in which, at all depths, I perform the following:
Compute 4 float values
Multiply all 4 values by a constant
Convert the floats to integers
This code is to be run with thousands of iterations in each loop (resulting in billions of operations) and I want to make it as fast as possible, so I'm trying to utilize SSE processor instructions.
While trying to manually optimize the code, I have encountered the following obstacle: each time I get to the part that multiplies all the values by a constant, the constant has to be loaded into an XMM register. My idea was to reserve one register (and forbid the compiler from using it), load the value once, and hardcode the multiplications to use that one specific register; however, I can't find the right way to do that.
By the way, could somebody please explain to me why this code:
vmovaps .LC0(%rip), %xmm1
movl $1000000000, %eax
vmovaps .LC1(%rip), %xmm0
.p2align 4,,10
.p2align 3
.L2:
#APP
# 26 "sse.cpp" 1
.intel_syntax noprefix;
mulps %xmm1,%xmm0;
.att_syntax prefix;
# 0 "" 2
#NO_APP
subl $1, %eax
jne .L2
performs worse (real 0m1.656s vs. real 0m1.618s) than the following one:
vmovaps .LC0(%rip), %xmm1
movl $1000000000, %eax
vmovaps .LC1(%rip), %xmm0
.p2align 4,,10
.p2align 3
.L2:
vmulps %xmm0, %xmm1, %xmm1
subl $1, %eax
jne .L2
(The difference is that I use Intel syntax in my inline asm in GCC and legacy SSE instructions for compatibility [first snippet], while GCC's automatically generated version uses VEX-encoded AVX instructions [second snippet].)
One note: you need to be more specific about how you compile things, and you should probably provide a minimal example. I know this might not be the best answer because of that, but I think it's good enough. It got long, but that's because of the code listings.
The bottom line of the work below is that it should be safe to leave this to the compiler and use appropriate compiler flags. At the bottom I put an example of how to use a local register variable, but it probably won't be very useful (it gets ignored easily). You could use a global register variable, but that doesn't yield any good results and is discouraged.
My set-up is an Intel(R) Core(TM) i7-4770 CPU, gcc version 4.9.2 and clang version 3.5.0. The code below stores avx_scalar in an xmm register with -O1 and above; with no optimization flag or with -O0 it doesn't. The command used to generate the assembly was:
[clang++|g++] -march=native -S -Ox ./sse.cpp,
where x was the optimization level.
An interesting thing is that with -march=native both compilers decided to use SSE4.1 versions over legacy SSE in every case I tested, even though I used legacy SSE intrinsics in the code itself. This is good.
I also tested using smmintrin.h, which is the SSE4.1 header. Without the flag, gcc uses legacy SSE and clang fails to compile with the error "SSE4.1 instruction set not enabled". With xmmintrin.h, which is the legacy SSE header, both compilers produced AVX versions in the presence of the flag, and legacy ones when it was absent.
Test code avx.cpp:
extern "C"
{
#include <smmintrin.h>
}
const float scalar = 3.14;
const __m128 avx_scalar = _mm_set1_ps(scalar);
__m128 vector;
__m128 its_me(){
__m128 ret;
__m128 result;
for(int i = 0; i < 1000; ++i)
{
vector = _mm_set_ps(i*1,i*2,i*3,i*4);
result = _mm_mul_ps(vector, avx_scalar);
ret = _mm_add_ps(ret, result);
}
return ret;
}
Relevant part of g++ -march=native -S -O2 ./avx.cpp:
.LFB639:
.cfi_startproc
vmovaps _ZL10avx_scalar(%rip), %xmm5
xorl %edx, %edx
.p2align 4,,10
.p2align 3
.L2:
leal (%rdx,%rdx), %ecx
vxorps %xmm2, %xmm2, %xmm2
vxorps %xmm1, %xmm1, %xmm1
vxorps %xmm3, %xmm3, %xmm3
leal 0(,%rdx,4), %eax
vcvtsi2ss %ecx, %xmm3, %xmm3
vxorps %xmm4, %xmm4, %xmm4
vcvtsi2ss %eax, %xmm2, %xmm2
leal (%rcx,%rdx), %eax
vcvtsi2ss %edx, %xmm4, %xmm4
addl $1, %edx
vcvtsi2ss %eax, %xmm1, %xmm1
vunpcklps %xmm4, %xmm3, %xmm3
vunpcklps %xmm1, %xmm2, %xmm1
vmovlhps %xmm3, %xmm1, %xmm1
vmulps %xmm5, %xmm1, %xmm2
vaddps %xmm2, %xmm0, %xmm0
cmpl $1000, %edx
jne .L2
vmovaps %xmm1, vector(%rip)
ret
.cfi_endproc
And clang++ -march=native -S -O2 ./avx.cpp:
# BB#0:
xorl %eax, %eax
movl $4, %ecx
movl $2, %edx
vmovaps _ZL10avx_scalar(%rip), %xmm1
xorl %esi, %esi
# implicit-def: XMM0
.align 16, 0x90
.LBB0_1: # =>This Inner Loop Header: Depth=1
leal -2(%rdx), %r8d
leal -4(%rcx), %edi
vmovd %edi, %xmm2
vpinsrd $1, %eax, %xmm2, %xmm2
vpinsrd $2, %r8d, %xmm2, %xmm2
vpinsrd $3, %esi, %xmm2, %xmm2
vcvtdq2ps %xmm2, %xmm2
vmulps %xmm1, %xmm2, %xmm2
vaddps %xmm2, %xmm0, %xmm0
leal 1(%rsi), %r8d
leal 3(%rax), %edi
vmovd %ecx, %xmm2
vpinsrd $1, %edi, %xmm2, %xmm2
vpinsrd $2, %edx, %xmm2, %xmm2
vpinsrd $3, %r8d, %xmm2, %xmm2
vcvtdq2ps %xmm2, %xmm2
vmulps %xmm1, %xmm2, %xmm3
vaddps %xmm3, %xmm0, %xmm0
addl $2, %esi
addl $6, %eax
addl $8, %ecx
addl $4, %edx
cmpl $1000, %esi # imm = 0x3E8
jne .LBB0_1
# BB#2:
vmovaps %xmm2, vector(%rip)
retq
Just for the record, you can manually put a local variable into a register, but clang ignores it completely and gcc ignores it with -O1 and above. I encourage you to look for xmm13 in the output of g++ -march=native -S -Ox ./avx.cpp with different x values for the code below (assuming you have at least 13 xmm registers on your CPU):
extern "C"
{
#include <xmmintrin.h>
}
const float scalar = 3.14;
__m128 its_me(){
__m128 vector;
register __m128 avx_scalar asm ("xmm13") = _mm_set1_ps(scalar); // that's how you do it in gcc.
//const __m128 avx_scalar = _mm_set1_ps(scalar);
__m128 ret;
__m128 result;
for(int i = 0; i < 1000; ++i)
{
vector = _mm_set_ps(i*1,i*2,i*3,i*4);
result = _mm_mul_ps(vector, avx_scalar);
ret = _mm_add_ps(ret, result);
}
return ret;
}

Performance of C++11 modern-style loops vs old-style loops

This is the first question I'm posting here, so I hope I won't do anything wrong.
My question concerns the performance of modern-style C++11 loops (std::for_each, range-based for) vs old-style C++ loops (for (...; ...; ...)). From what I understood, it seems to me that the motto of modern C++ is "expressivity with no compromise on performance". Modern C++ style leads to safe, clean, and fast code with little to no performance penalty and, possibly, with a performance gain over old-style C++.
Now I've made a little test to assess how big this gain is concerning loops. First I wrote the following three functions:
using namespace std;
void foo(vector<double>& v)
{
for (size_t i = 0; i < v.size(); i++)
{
v[i] /= 42;
}
}
void bar(vector<double>& v)
{
for (auto& x : v)
{
x /= 42;
}
}
void wee(vector<double>& v)
{
for_each(begin(v), end(v), [] (double& x)
{
x /= 42;
});
}
Then I compared their performance by calling them this way (properly commenting/uncommenting the three lines inside main()'s loop):
vector<double> make_vector()
{
vector<double> v;
for (int i = 0; i < 30000; i++) { v.push_back(i); }
return v;
}
int main()
{
clock_t start = clock();
auto v = make_vector();
for (int i = 0; i <= 50000; i++)
{
// UNCOMMENT THE FUNCTION CALL TO BE TESTED, COMMENT THE OTHERS
foo(v);
// bar(v);
// wee(v);
}
clock_t end = clock();
cout << (end - start) << endl;
return 0;
}
Averaging over 10 executions of each version of the program obtained by commenting/uncommenting the lines in main()'s loop, and using the old-style loop as a baseline, the range-based for loop performs ~1.9x worse, and the loop based on std::for_each and lambdas performs ~2.3x worse.
I used Clang 3.2 to compile this, and I haven't tried MS VC11 (I'm working on WinXP).
Considering my expectation of getting comparable execution times, my questions are:
Did I do something obviously wrong?
If not, couldn't a 2x performance penalty be a good reason NOT to embrace modern-style loops?
I would like to remark that I do believe the clarity and safety of code written in modern C++ style pay off for a possible performance loss, but I quite disagree with the statement that there is no trade-off between clarity/safety on one side and performance on the other side.
Am I missing something?
It looks like the difference only shows up when you do not enable optimisations in your compiler.
With Clang you can enable optimisation with the -O[0-3] flags (for example, clang++ -std=c++11 -O2 main.cpp).
Mankarse is right - most likely you have not enabled optimizations.
Actually, on Clang they produce practically the same ASM code in the main loop, with only small differences in the pre/post code.
I have tested four versions: hand_loop_index, hand_loop_iterator, range_based_for and for_each_algorithm.
hand_loop_iterator, range_based_for and for_each_algorithm all produce exactly the same ASM for the full function body; the only difference is in the names of the labels.
I.e. a hand-written for loop with iterators results in exactly the same ASM code as the range-based for and std::for_each.
There are some differences between the index-based and iterator-based versions.
The main loop is almost the same in both cases. The only minor difference is that the iterator version(s) use the rdx register instead of rsi.
Index version:
.LBB0_7: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rsi), %xmm1
movupd -32(%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rsi)
movupd %xmm2, -32(%rsi)
movupd -16(%rsi), %xmm1
movupd (%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rsi)
movupd %xmm2, (%rsi)
addq $64, %rsi
addq $-8, %rdi
jne .LBB0_7
Iterator version(s):
.LBB1_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB1_6
The pre/post code for the index vs iterator versions has many differences, but it should not greatly affect total timing for large enough arrays.
LIVE DEMO on Coliru with ASM output
#include <algorithm>
#include <iterator>
#include <vector>
using namespace std;
void hand_loop_index(vector<double> &v)
{
for (size_t i = 0; i < v.size(); ++i)
{
v[i] /= 42;
}
}
void hand_loop_iterator(vector<double> &v)
{
for (auto first = begin(v), last = end(v); first!=last; ++first)
{
*first /= 42;
}
}
void range_based_for(vector<double> &v)
{
for (auto &x : v)
{
x /= 42;
}
}
void for_each_algorithm(vector<double> &v)
{
for_each(begin(v), end(v), [] (double &x)
{
x /= 42;
});
}
Result ASM:
# clang++ -std=c++1z -O3 -Wall -pedantic -pthread main.cpp -S
.text
.file "main.cpp"
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI0_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI0_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z15hand_loop_indexRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z15hand_loop_indexRSt6vectorIdSaIdEE,#function
_Z15hand_loop_indexRSt6vectorIdSaIdEE: # #_Z15hand_loop_indexRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rax
movq 8(%rdi), %rcx
subq %rax, %rcx
je .LBB0_11
# BB#1: # %.lr.ph
sarq $3, %rcx
cmpq $1, %rcx
movl $1, %edx
cmovaq %rcx, %rdx
xorl %edi, %edi
testq %rdx, %rdx
je .LBB0_10
# BB#2: # %overflow.checked
xorl %edi, %edi
movq %rdx, %r8
andq $-4, %r8
je .LBB0_9
# BB#3: # %vector.body.preheader
cmpq $1, %rcx
movl $1, %edi
cmovaq %rcx, %rdi
addq $-4, %rdi
movq %rdi, %rsi
shrq $2, %rsi
xorl %r9d, %r9d
btq $2, %rdi
jb .LBB0_5
# BB#4: # %vector.body.prol
movupd (%rax), %xmm0
movupd 16(%rax), %xmm1
movapd .LCPI0_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rax)
movupd %xmm1, 16(%rax)
movl $4, %r9d
.LBB0_5: # %vector.body.preheader.split
testq %rsi, %rsi
je .LBB0_8
# BB#6: # %vector.body.preheader.split.split
cmpq $1, %rcx
movl $1, %edi
cmovaq %rcx, %rdi
andq $-4, %rdi
subq %r9, %rdi
leaq 48(%rax,%r9,8), %rsi
movapd .LCPI0_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB0_7: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rsi), %xmm1
movupd -32(%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rsi)
movupd %xmm2, -32(%rsi)
movupd -16(%rsi), %xmm1
movupd (%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rsi)
movupd %xmm2, (%rsi)
addq $64, %rsi
addq $-8, %rdi
jne .LBB0_7
.LBB0_8:
movq %r8, %rdi
.LBB0_9: # %middle.block
cmpq %rdi, %rdx
je .LBB0_11
.align 16, 0x90
.LBB0_10: # %scalar.ph
# =>This Inner Loop Header: Depth=1
movsd (%rax,%rdi,8), %xmm0 # xmm0 = mem[0],zero
divsd .LCPI0_1(%rip), %xmm0
movsd %xmm0, (%rax,%rdi,8)
incq %rdi
cmpq %rcx, %rdi
jb .LBB0_10
.LBB0_11: # %._crit_edge
retq
.Lfunc_end0:
.size _Z15hand_loop_indexRSt6vectorIdSaIdEE, .Lfunc_end0-_Z15hand_loop_indexRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI1_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI1_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z18hand_loop_iteratorRSt6vectorIdSaIdEE,#function
_Z18hand_loop_iteratorRSt6vectorIdSaIdEE: # #_Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB1_11
# BB#1: # %.lr.ph.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB1_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB1_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI1_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB1_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB1_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI1_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB1_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB1_6
.LBB1_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB1_8: # %middle.block
cmpq %rdi, %rcx
je .LBB1_11
# BB#9:
movsd .LCPI1_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB1_10: # %.lr.ph
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB1_10
.LBB1_11: # %._crit_edge
retq
.Lfunc_end1:
.size _Z18hand_loop_iteratorRSt6vectorIdSaIdEE, .Lfunc_end1-_Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI2_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI2_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z15range_based_forRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z15range_based_forRSt6vectorIdSaIdEE,#function
_Z15range_based_forRSt6vectorIdSaIdEE: # #_Z15range_based_forRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB2_11
# BB#1: # %.lr.ph.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB2_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB2_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI2_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB2_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB2_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI2_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB2_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB2_6
.LBB2_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB2_8: # %middle.block
cmpq %rdi, %rcx
je .LBB2_11
# BB#9:
movsd .LCPI2_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB2_10: # %.lr.ph
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB2_10
.LBB2_11: # %._crit_edge
retq
.Lfunc_end2:
.size _Z15range_based_forRSt6vectorIdSaIdEE, .Lfunc_end2-_Z15range_based_forRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI3_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI3_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z18for_each_algorithmRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z18for_each_algorithmRSt6vectorIdSaIdEE,#function
_Z18for_each_algorithmRSt6vectorIdSaIdEE: # #_Z18for_each_algorithmRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB3_11
# BB#1: # %.lr.ph.i.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB3_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB3_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI3_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB3_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB3_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI3_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB3_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB3_6
.LBB3_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB3_8: # %middle.block
cmpq %rdi, %rcx
je .LBB3_11
# BB#9:
movsd .LCPI3_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB3_10: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB3_10
.LBB3_11: # %_ZSt8for_eachIN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEEZ18for_each_algorithmR5_E3$_0ET0_T_SA_S9_.exit
retq
.Lfunc_end3:
.size _Z18for_each_algorithmRSt6vectorIdSaIdEE, .Lfunc_end3-_Z18for_each_algorithmRSt6vectorIdSaIdEE
.cfi_endproc
.ident "clang version 3.7.0 (tags/RELEASE_370/final 246979)"
.section ".note.GNU-stack","",#progbits

Can modern compilers unroll `for` loops expressed using begin and end iterators

Consider the following code
vector<double> v;
// fill v
const vector<double>::iterator end = v.end();
for(vector<double>::iterator i = v.begin(); i != end; ++i) {
// do stuff
}
Are compilers like g++, clang++ and icc able to unroll loops like this? Unfortunately I do not know assembly well enough to verify from the output whether the loop gets unrolled or not. (And I only have access to g++.)
To me it seems that this would require more smartness than usual on behalf of the compiler: first to deduce that the iterator is a random access iterator, and then to figure out the number of times the loop is executed. Can compilers do this when optimization is enabled?
Thanks for your replies, and before some of you start lecturing about premature optimization, this is an exercise in curiosity.
To me it seems that this will require more smartness than usual on behalf of the compiler, first to deduce that the iterator is a random access iterator, and then figure out the number of times the loop is executed.
The STL, consisting entirely of templates, has all of its code inline. So random access iterators reduce to pointers by the time the compiler begins to apply optimizations. One of the reasons the STL was created was so that there would be less need for a programmer to outwit the compiler. You should rely on the STL to do the right thing until proven otherwise.
Of course, it is still up to you to choose the proper tool from the STL to use...
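To see why, consider what a typical vector<double> iterator boils down to (a simplified sketch, not the actual library source): a thin wrapper around a raw pointer whose operations are trivial inline forwards, so after inlining a loop over iterators is indistinguishable from a loop over double*.
struct vec_double_iter                     // simplified sketch of a vector<double> iterator
{
    double* p;
    double& operator*() const { return *p; }
    vec_double_iter& operator++() { ++p; return *this; }
    bool operator!=(vec_double_iter other) const { return p != other.p; }
};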
Edit: There was discussion about whether g++ does any loop unrolling. On the versions that I am using, loop unrolling is not part of -O, -O2, or -O3, and I get identical assembly for the latter two levels with the following code:
void foo (std::vector<int> &v) {
volatile int c = 0;
const std::vector<int>::const_iterator end = v.end();
for (std::vector<int>::iterator i = v.begin(); i != end; ++i) {
*i = c++;
}
}
Here is the corresponding -O2 assembly:
_Z3fooRSt6vectorIiSaIiEE:
.LFB435:
movq 8(%rdi), %rcx
movq (%rdi), %rax
movl $0, -4(%rsp)
cmpq %rax, %rcx
je .L4
.p2align 4,,10
.p2align 3
.L3:
movl -4(%rsp), %edx
movl %edx, (%rax)
addq $4, %rax
addl $1, %edx
cmpq %rax, %rcx
movl %edx, -4(%rsp)
jne .L3
.L4:
rep
ret
With the -funroll-loops option added, the function expands into something much, much larger. But the documentation warns about this option:
Unroll loops whose number of iterations can be determined at compile time or upon entry to the loop. -funroll-loops implies -frerun-cse-after-loop. It also turns on complete loop peeling (i.e. complete removal of loops with small constant number of iterations). This option makes code larger, and may or may not make it run faster.
As a further argument to dissuade you from unrolling loops yourself, I'll finish this answer with an illustration of applying Duff's Device to the foo function above:
void foo_duff (std::vector<int> &v) {
volatile int c = 0;
const std::vector<int>::const_iterator end = v.end();
std::vector<int>::iterator i = v.begin();
switch ((end - i) % 4) do {
case 0: *i++ = c++;
case 3: *i++ = c++;
case 2: *i++ = c++;
case 1: *i++ = c++;
} while (i != end);
}
GCC has another loop optimization flag:
-ftree-loop-optimize
Perform loop optimizations on trees. This flag is enabled by default at -O and higher.
So, the -O option enables simple loop optimizations for the innermost loops, including complete loop unrolling (peeling) for loops with a fixed number of iterations. (Thanks to doc for pointing this out to me.)
I would propose that whether or not the compiler CAN unroll the loop, with modern pipelined architectures and caches, unless your "do stuff" is trivial there is little benefit in doing so, and in many cases doing so would be a performance HIT instead of a boon. If your "do stuff" is nontrivial, unrolling the loop creates multiple copies of this nontrivial code, which takes extra time to load into the cache, significantly slowing down the first iteration through the unrolled loop. At the same time, it evicts more code from the cache, code which may have been needed for performing the "do stuff" if it makes any function calls and which would then need to be reloaded into the cache again.
The purpose of unrolling loops made a lot of sense in the days of cacheless, non-pipelined, non-branch-predicting architectures, the goal being to reduce the overhead associated with the loop logic. Nowadays, with cache-based, pipelined, branch-predicting hardware, your CPU will be pipelined well into the next loop iteration, speculatively executing the loop code again, by the time you detect the i == end exit condition, at which point the processor throws out that final speculatively executed set of results. In such an architecture, loop unrolling makes very little sense; it would further bloat the code for virtually no benefit.
The short answer is yes: it will unroll as much as it can. In your case, it obviously depends on how you define end (I assume your example is generic). Not only will most modern compilers unroll, they will also vectorize and do other optimizations that will often blow your own solutions out of the water.
So what I'm saying is don't prematurely optimize! Just kidding :)
Simple answer: generally NO! At least when it comes to complete loop unrolling.
Let's test loop unrolling on this simple, dirty-coded (for testing purposes) structure.
struct Test
{
Test(): begin(arr), end(arr + 4) {}
double * begin;
double * end;
double arr[4];
};
First let's take the counted loop and compile it without any optimizations.
double counted(double param, Test & d)
{
for (int i = 0; i < 4; i++)
param += d.arr[i];
return param;
}
Here's what gcc 4.9 produces.
counted(double, Test&):
pushq %rbp
movq %rsp, %rbp
movsd %xmm0, -24(%rbp)
movq %rdi, -32(%rbp)
movl $0, -4(%rbp)
jmp .L2
.L3:
movq -32(%rbp), %rax
movl -4(%rbp), %edx
movslq %edx, %rdx
addq $2, %rdx
movsd (%rax,%rdx,8), %xmm0
movsd -24(%rbp), %xmm1
addsd %xmm0, %xmm1
movq %xmm1, %rax
movq %rax, -24(%rbp)
addl $1, -4(%rbp)
.L2:
cmpl $3, -4(%rbp)
jle .L3
movq -24(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
popq %rbp
ret
As expected, the loop hasn't been unrolled and, since no optimizations were performed, the code is generally very verbose. Now let's turn on the -O3 flag. Here's the produced disassembly:
counted(double, Test&):
addsd 16(%rdi), %xmm0
addsd 24(%rdi), %xmm0
addsd 32(%rdi), %xmm0
addsd 40(%rdi), %xmm0
ret
Voilà, the loop has been unrolled this time.
Now let's take a look at the iterated loop. The function containing the loop looks like this:
double iterated(double param, Test & d)
{
for (double * it = d.begin; it != d.end; ++it)
param += *it;
return param;
}
Still using the -O3 flag, let's take a look at the disassembly.
iterated(double, Test&):
movq (%rdi), %rax
movq 8(%rdi), %rdx
cmpq %rdx, %rax
je .L3
.L4:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rdx, %rax
jne .L4
.L3:
rep ret
The code looks better than in the very first case, because optimizations were performed, but the loop hasn't been unrolled this time!
What about the -funroll-loops and -funroll-all-loops flags? They produce a result similar to this:
iterated(double, Test&):
movq (%rdi), %rsi
movq 8(%rdi), %rcx
cmpq %rcx, %rsi
je .L3
movq %rcx, %rdx
leaq 8(%rsi), %rax
addsd (%rsi), %xmm0
subq %rsi, %rdx
subq $8, %rdx
shrq $3, %rdx
andl $7, %edx
cmpq %rcx, %rax
je .L43
testq %rdx, %rdx
je .L4
cmpq $1, %rdx
je .L29
cmpq $2, %rdx
je .L30
cmpq $3, %rdx
je .L31
cmpq $4, %rdx
je .L32
cmpq $5, %rdx
je .L33
cmpq $6, %rdx
je .L34
addsd (%rax), %xmm0
leaq 16(%rsi), %rax
.L34:
addsd (%rax), %xmm0
addq $8, %rax
.L33:
addsd (%rax), %xmm0
addq $8, %rax
.L32:
addsd (%rax), %xmm0
addq $8, %rax
.L31:
addsd (%rax), %xmm0
addq $8, %rax
.L30:
addsd (%rax), %xmm0
addq $8, %rax
.L29:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rcx, %rax
je .L44
.L4:
addsd (%rax), %xmm0
addq $64, %rax
addsd -56(%rax), %xmm0
addsd -48(%rax), %xmm0
addsd -40(%rax), %xmm0
addsd -32(%rax), %xmm0
addsd -24(%rax), %xmm0
addsd -16(%rax), %xmm0
addsd -8(%rax), %xmm0
cmpq %rcx, %rax
jne .L4
.L3:
rep ret
.L44:
rep ret
.L43:
rep ret
Compare this with the unrolled result for the counted loop; it's clearly not the same. What we see here is that gcc divided the loop into 8-element chunks. This can increase performance in some cases, because the loop exit condition is checked once per 8 normal loop iterations. With additional flags, vectorization could also be performed. But it isn't complete loop unrolling.
The iterated loop will be unrolled, however, if the Test object is not a function argument.
double iteratedLocal(double param)
{
Test d;
for (double * it = d.begin; it != d.end; ++it)
param += *it;
return param;
}
Disassembly produced with only -O3 flag:
iteratedLocal(double):
addsd -40(%rsp), %xmm0
addsd -32(%rsp), %xmm0
addsd -24(%rsp), %xmm0
addsd -16(%rsp), %xmm0
ret
As you can see, the loop has been unrolled. This is because the compiler can now safely assume that end has a fixed value, while it couldn't predict that for a function argument.
The Test structure is statically allocated, however. Things are more complicated with dynamically allocated structures like std::vector. From my observations on a Test structure modified to resemble a dynamically allocated container, it looks like gcc tries its best to unroll loops, but in most cases the generated code is not as simple as the one above.
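For illustration only, a "dynamic" variant of Test along those lines might look like the following sketch (my own example of the kind of modification meant above, not the exact structure used):
#include <cstddef>

struct DynTest
{
    explicit DynTest(std::size_t n) : begin(new double[n]()), end(begin + n) {}
    ~DynTest() { delete[] begin; }
    double * begin;   // heap-allocated storage
    double * end;     // no longer derivable from a compile-time constant
};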
Since you ask about other compilers, here is the output from clang 3.4.1 (-O3 flag):
counted(double, Test&): # #counted(double, Test&)
addsd 16(%rdi), %xmm0
addsd 24(%rdi), %xmm0
addsd 32(%rdi), %xmm0
addsd 40(%rdi), %xmm0
ret
iterated(double, Test&): # #iterated(double, Test&)
movq (%rdi), %rax
movq 8(%rdi), %rcx
cmpq %rcx, %rax
je .LBB1_2
.LBB1_1: # %.lr.ph
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rcx
jne .LBB1_1
.LBB1_2: # %._crit_edge
ret
iteratedLocal(double): # #iteratedLocal(double)
leaq -32(%rsp), %rax
movq %rax, -48(%rsp)
leaq (%rsp), %rax
movq %rax, -40(%rsp)
xorl %eax, %eax
jmp .LBB2_1
.LBB2_2: # %._crit_edge4
movsd -24(%rsp,%rax), %xmm1
addq $8, %rax
.LBB2_1: # =>This Inner Loop Header: Depth=1
movaps %xmm0, %xmm2
cmpq $24, %rax
movaps %xmm1, %xmm0
addsd %xmm2, %xmm0
jne .LBB2_2
ret
Intel's icc 13.01 (-O3 flag)
counted(double, Test&):
addsd 16(%rdi), %xmm0 #24.5
addsd 24(%rdi), %xmm0 #24.5
addsd 32(%rdi), %xmm0 #24.5
addsd 40(%rdi), %xmm0 #24.5
ret #25.10
iterated(double, Test&):
movq (%rdi), %rdx #30.26
movq 8(%rdi), %rcx #30.41
cmpq %rcx, %rdx #30.41
je ..B3.25 # Prob 50% #30.41
subq %rdx, %rcx #30.7
movb $0, %r8b #30.7
lea 7(%rcx), %rax #30.7
sarq $2, %rax #30.7
shrq $61, %rax #30.7
lea 7(%rax,%rcx), %rcx #30.7
sarq $3, %rcx #30.7
cmpq $16, %rcx #30.7
jl ..B3.26 # Prob 10% #30.7
movq %rdx, %rdi #30.7
andq $15, %rdi #30.7
je ..B3.6 # Prob 50% #30.7
testq $7, %rdi #30.7
jne ..B3.26 # Prob 10% #30.7
movl $1, %edi #30.7
..B3.6: # Preds ..B3.5 ..B3.3
lea 16(%rdi), %rax #30.7
cmpq %rax, %rcx #30.7
jl ..B3.26 # Prob 10% #30.7
movq %rcx, %rax #30.7
xorl %esi, %esi #30.7
subq %rdi, %rax #30.7
andq $15, %rax #30.7
negq %rax #30.7
addq %rcx, %rax #30.7
testq %rdi, %rdi #30.7
jbe ..B3.11 # Prob 2% #30.7
..B3.9: # Preds ..B3.7 ..B3.9
addsd (%rdx,%rsi,8), %xmm0 #31.9
incq %rsi #30.7
cmpq %rdi, %rsi #30.7
jb ..B3.9 # Prob 82% #30.7
..B3.11: # Preds ..B3.9 ..B3.7
pxor %xmm6, %xmm6 #28.12
movaps %xmm6, %xmm7 #28.12
movaps %xmm6, %xmm5 #28.12
movsd %xmm0, %xmm7 #28.12
movaps %xmm6, %xmm4 #28.12
movaps %xmm6, %xmm3 #28.12
movaps %xmm6, %xmm2 #28.12
movaps %xmm6, %xmm1 #28.12
movaps %xmm6, %xmm0 #28.12
..B3.12: # Preds ..B3.12 ..B3.11
addpd (%rdx,%rdi,8), %xmm7 #31.9
addpd 16(%rdx,%rdi,8), %xmm6 #31.9
addpd 32(%rdx,%rdi,8), %xmm5 #31.9
addpd 48(%rdx,%rdi,8), %xmm4 #31.9
addpd 64(%rdx,%rdi,8), %xmm3 #31.9
addpd 80(%rdx,%rdi,8), %xmm2 #31.9
addpd 96(%rdx,%rdi,8), %xmm1 #31.9
addpd 112(%rdx,%rdi,8), %xmm0 #31.9
addq $16, %rdi #30.7
cmpq %rax, %rdi #30.7
jb ..B3.12 # Prob 82% #30.7
addpd %xmm6, %xmm7 #28.12
addpd %xmm4, %xmm5 #28.12
addpd %xmm2, %xmm3 #28.12
addpd %xmm0, %xmm1 #28.12
addpd %xmm5, %xmm7 #28.12
addpd %xmm1, %xmm3 #28.12
addpd %xmm3, %xmm7 #28.12
movaps %xmm7, %xmm0 #28.12
unpckhpd %xmm7, %xmm0 #28.12
addsd %xmm0, %xmm7 #28.12
movaps %xmm7, %xmm0 #28.12
..B3.14: # Preds ..B3.13 ..B3.26
lea 1(%rax), %rsi #30.7
cmpq %rsi, %rcx #30.7
jb ..B3.25 # Prob 50% #30.7
subq %rax, %rcx #30.7
cmpb $1, %r8b #30.7
jne ..B3.17 # Prob 50% #30.7
..B3.16: # Preds ..B3.17 ..B3.15
xorl %r8d, %r8d #30.7
jmp ..B3.21 # Prob 100% #30.7
..B3.17: # Preds ..B3.15
cmpq $2, %rcx #30.7
jl ..B3.16 # Prob 10% #30.7
movq %rcx, %r8 #30.7
xorl %edi, %edi #30.7
pxor %xmm1, %xmm1 #28.12
lea (%rdx,%rax,8), %rsi #31.19
andq $-2, %r8 #30.7
movsd %xmm0, %xmm1 #28.12
..B3.19: # Preds ..B3.19 ..B3.18
addpd (%rsi,%rdi,8), %xmm1 #31.9
addq $2, %rdi #30.7
cmpq %r8, %rdi #30.7
jb ..B3.19 # Prob 82% #30.7
movaps %xmm1, %xmm0 #28.12
unpckhpd %xmm1, %xmm0 #28.12
addsd %xmm0, %xmm1 #28.12
movaps %xmm1, %xmm0 #28.12
..B3.21: # Preds ..B3.20 ..B3.16
cmpq %rcx, %r8 #30.7
jae ..B3.25 # Prob 2% #30.7
lea (%rdx,%rax,8), %rax #31.19
..B3.23: # Preds ..B3.23 ..B3.22
addsd (%rax,%r8,8), %xmm0 #31.9
incq %r8 #30.7
cmpq %rcx, %r8 #30.7
jb ..B3.23 # Prob 82% #30.7
..B3.25: # Preds ..B3.23 ..B3.21 ..B3.14 ..B3.1
ret #32.14
..B3.26: # Preds ..B3.2 ..B3.6 ..B3.4 # Infreq
movb $1, %r8b #30.7
xorl %eax, %eax #30.7
jmp ..B3.14 # Prob 100% #30.7
iteratedLocal(double):
lea -8(%rsp), %rax #8.13
lea -40(%rsp), %rdx #7.11
cmpq %rax, %rdx #33.41
je ..B4.15 # Prob 50% #33.41
movq %rax, -48(%rsp) #32.12
movq %rdx, -56(%rsp) #32.12
xorl %eax, %eax #33.7
..B4.13: # Preds ..B4.11 ..B4.13
addsd -40(%rsp,%rax,8), %xmm0 #34.9
incq %rax #33.7
cmpq $4, %rax #33.7
jb ..B4.13 # Prob 82% #33.7
..B4.15: # Preds ..B4.13 ..B4.1
ret #35.14
To avoid misunderstanding: if the counted loop's condition relies on an external parameter (imagine Test had a size member), like this one:
double countedDep(double param, Test & d)
{
for (int i = 0; i < d.size; i++)
param += d.arr[i];
return param;
}
Such a loop will also not be unrolled.