Why do some SSE intrinsics introduce moves back and forth? - c++

In my code, I set a 128-bit variable to zero. But I don't quite understand why that turns into two move instructions in the assembly.
__m128i zeros = reinterpret_cast<__m128i>(_mm_setzero_pd());
The corresponding assembly moves the value back and forth between %xmm0 and 0x40(%rsp):
00709658: 0F 57 C0 xorps %xmm0, %xmm0
0070965B: 66 0F 29 44 24 40 movapd %xmm0, 0x40(%rsp)
00709661: 66 0F 28 44 24 40 movapd 0x40(%rsp), %xmm0
My compiler is Clang 10.0, and no optimization was turned on when I asked the question.
Here is a minimal version of my code.
#include <immintrin.h> // for the SSE2/SSE4.1 intrinsics used below

template <int LEN>
bool SSEEncodeChunk(const char** srcp, char** dstp) {
    __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(*srcp));
    __m128i zeros = reinterpret_cast<__m128i>(_mm_setzero_pd());
    __m128i zero_bytes = _mm_cmpeq_epi8(data, zeros);          // 0xFF in every byte position that is zero
    bool all_zeros = _mm_testz_si128(zero_bytes, zero_bytes);  // true when no zero byte was found
    if (!all_zeros) {
        return false;
    }
    _mm_storeu_si128(reinterpret_cast<__m128i*>(*dstp), data);
    *dstp += LEN;
    *srcp += LEN;
    return true;
}
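For reference, the same zero vector can also be produced without the cast; this is just an equivalent sketch, not what my build uses:
__m128i zeros_a = _mm_setzero_si128();                 // integer-domain zero, no cast needed
__m128i zeros_b = _mm_castpd_si128(_mm_setzero_pd());  // explicit cast intrinsic, same effect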
Update on June 28th: clang version 10.0.0-4ubuntu1, Ubuntu 20.04.
Here is my code in assembly at -O0. I also checked the options; -fomit-frame-pointer is on. There is indeed more than one move back and forth between %xmm registers and offset(%rsp); my first post only showed part of them.
(lldb) disassemble --name SSEEncodeChunk
index_type_traits_test`SSEEncodeChunk<16>:
[0x709630] <+0>: subq $0x58, %rsp
[0x709634] <+4>: movq %rdi, -0x38(%rsp)
[0x709639] <+9>: movq %rsi, -0x40(%rsp)
[0x70963e] <+14>: movq -0x38(%rsp), %rax
[0x709643] <+19>: movq (%rax), %rax
[0x709646] <+22>: movq %rax, -0x28(%rsp)
[0x70964b] <+27>: movq -0x28(%rsp), %rax
[0x709650] <+32>: movups (%rax), %xmm0
[0x709653] <+35>: movaps %xmm0, -0x50(%rsp)
[0x709658] <+40>: xorps %xmm0, %xmm0
[0x70965b] <+43>: movapd %xmm0, 0x40(%rsp) ; this is the instruction I asked about originally
[0x709661] <+49>: movapd 0x40(%rsp), %xmm0
[0x709667] <+55>: movapd %xmm0, -0x60(%rsp)
[0x70966d] <+61>: movaps -0x50(%rsp), %xmm0
[0x709672] <+66>: movaps -0x60(%rsp), %xmm1
[0x709677] <+71>: movaps %xmm0, 0x30(%rsp)
[0x70967c] <+76>: movaps %xmm1, 0x20(%rsp)
[0x709681] <+81>: movaps 0x30(%rsp), %xmm0
[0x709686] <+86>: movaps 0x20(%rsp), %xmm1
[0x70968b] <+91>: pcmpeqb %xmm1, %xmm0
[0x70968f] <+95>: movdqa %xmm0, -0x70(%rsp)
[0x709695] <+101>: movdqa -0x70(%rsp), %xmm0
[0x70969b] <+107>: movdqa -0x70(%rsp), %xmm1
[0x7096a1] <+113>: movdqa %xmm0, 0x10(%rsp)
[0x7096a7] <+119>: movdqa %xmm1, (%rsp)
[0x7096ac] <+124>: movdqa 0x10(%rsp), %xmm0
[0x7096b2] <+130>: movdqa (%rsp), %xmm1
[0x7096b7] <+135>: ptest %xmm1, %xmm0
[0x7096bc] <+140>: sete %cl
[0x7096bf] <+143>: movzbl %cl, %edx
[0x7096c2] <+146>: cmpl $0x0, %edx
[0x7096c5] <+149>: setne %cl
[0x7096c8] <+152>: andb $0x1, %cl
[0x7096cb] <+155>: movb %cl, -0x71(%rsp)
[0x7096cf] <+159>: movb -0x71(%rsp), %cl
[0x7096d3] <+163>: xorb $-0x1, %cl
[0x7096d6] <+166>: testb $0x1, %cl
[0x7096d9] <+169>: jne 0x7096e4
[0x7096df] <+175>: jmp 0x7096ee
[0x7096e4] <+180>: movb $0x0, -0x29(%rsp)
[0x7096e9] <+185>: jmp 0x70973f
[0x7096ee] <+190>: movq -0x40(%rsp), %rax
[0x7096f3] <+195>: movq (%rax), %rax
[0x7096f6] <+198>: movdqa -0x50(%rsp), %xmm0
[0x7096fc] <+204>: movq %rax, -0x8(%rsp)
[0x709701] <+209>: movdqa %xmm0, -0x20(%rsp)
[0x709707] <+215>: movdqa -0x20(%rsp), %xmm0
[0x70970d] <+221>: movq -0x8(%rsp), %rax
[0x709712] <+226>: movdqu %xmm0, (%rax)
[0x709716] <+230>: movq -0x40(%rsp), %rax
[0x70971b] <+235>: movq (%rax), %rcx
[0x70971e] <+238>: addq $0x10, %rcx
[0x709725] <+245>: movq %rcx, (%rax)
[0x709728] <+248>: movq -0x38(%rsp), %rax
[0x70972d] <+253>: movq (%rax), %rcx
[0x709730] <+256>: addq $0x10, %rcx
[0x709737] <+263>: movq %rcx, (%rax)
[0x70973a] <+266>: movb $0x1, -0x29(%rsp)
[0x70973f] <+271>: movb -0x29(%rsp), %al
[0x709743] <+275>: andb $0x1, %al
[0x709745] <+277>: movzbl %al, %eax
[0x709748] <+280>: addq $0x58, %rsp
[0x70974c] <+284>: retq
After turning on -O2, the back-and-forth moves do disappear:
0051E8D5: F3 0F 6F 08 movdqu (%rax), %xmm1
0051E8D9: 66 0F EF C0 pxor %xmm0, %xmm0
0051E8DD: 66 0F 6F D1 movdqa %xmm1, %xmm2
0051E8E1: 66 0F 74 D0 pcmpeqb %xmm0, %xmm2
0051E8E5: 66 0F 38 17 D2 ptest %xmm2, %xmm2
0051E8EA: 0F 85 97 03 00 00 jne 0x51ec87
0051E8F0: F3 0F 7F 0C 3B movdqu %xmm1, (%rbx,%rdi)
...
0051E967: 48 01 F8 addq %rdi, %rax
0051E96A: 48 83 C0 10 addq $0x10, %rax
0051E96E: 48 01 FB addq %rdi, %rbx
0051E971: 48 83 C3 10 addq $0x10, %rbx

Related

Explanation of std::vector<int> sum ASM

I was playing around with the Compiler Explorer, and I'm struggling to understand the ASM output (x86 Clang 3.7 -O3) of a simple std::vector<int> sum function:
#include <vector>
#include <numeric>

int sum(const std::vector<int>& v)
{
    return std::accumulate(v.begin(), v.end(), 0);
}
The ASM for this code is:
sum(std::vector<int, std::allocator<int> > const&): # #sum(std::vector<int, std::allocator<int> > const&)
movq (%rdi), %rsi
movq 8(%rdi), %r11
xorl %eax, %eax
cmpq %r11, %rsi
je .LBB0_13
movabsq $9223372036854775800, %rax # imm = 0x7FFFFFFFFFFFFFF8
leaq -4(%r11), %rdx
movq %rdx, %r10
subq %rsi, %r10
shrq $2, %r10
incq %r10
xorl %edi, %edi
movq %r10, %r8
andq %rax, %r8
pxor %xmm0, %xmm0
je .LBB0_2
andq %r10, %rax
leaq -8(%rax), %r9
movl %r9d, %ecx
shrl $3, %ecx
incl %ecx
xorl %edi, %edi
testb $3, %cl
je .LBB0_4
subl %esi, %edx
shrl $2, %edx
incl %edx
andl $24, %edx
addl $-8, %edx
shrl $3, %edx
incl %edx
andl $3, %edx
negq %rdx
pxor %xmm0, %xmm0
xorl %edi, %edi
pxor %xmm1, %xmm1
.LBB0_6: # %vector.body.prol
movdqu (%rsi,%rdi,4), %xmm2
movdqu 16(%rsi,%rdi,4), %xmm3
paddd %xmm2, %xmm0
paddd %xmm3, %xmm1
addq $8, %rdi
incq %rdx
jne .LBB0_6
jmp .LBB0_7
.LBB0_2:
pxor %xmm1, %xmm1
jmp .LBB0_11
.LBB0_4:
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
.LBB0_7: # %vector.body.preheader.split
leaq (%rsi,%r8,4), %rdx
cmpq $24, %r9
jb .LBB0_10
subq %rdi, %rax
leaq 112(%rsi,%rdi,4), %rsi
.LBB0_9: # %vector.body
movdqu -112(%rsi), %xmm2
movdqu -96(%rsi), %xmm3
movdqu -80(%rsi), %xmm4
movdqu -64(%rsi), %xmm5
paddd %xmm0, %xmm2
paddd %xmm1, %xmm3
paddd %xmm4, %xmm2
paddd %xmm5, %xmm3
movdqu -48(%rsi), %xmm4
movdqu -32(%rsi), %xmm5
paddd %xmm2, %xmm4
paddd %xmm3, %xmm5
movdqu -16(%rsi), %xmm0
movdqu (%rsi), %xmm1
paddd %xmm4, %xmm0
paddd %xmm5, %xmm1
subq $-128, %rsi
addq $-32, %rax
jne .LBB0_9
.LBB0_10:
movq %rdx, %rsi
movq %r8, %rdi
.LBB0_11: # %middle.block
paddd %xmm1, %xmm0
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
paddd %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
paddd %xmm1, %xmm0
movd %xmm0, %eax
cmpq %rdi, %r10
je .LBB0_13
.LBB0_12: # %.lr.ph.i
addl (%rsi), %eax
addq $4, %rsi
cmpq %rsi, %r11
jne .LBB0_12
.LBB0_13: # %int std::accumulate<__gnu_cxx::__normal_iterator<int const*, std::vector<int, std::allocator<int> > >, int>(__gnu_cxx::__normal_iterator<int const*, std::vector<int, std::allocator<int> > >, __gnu_cxx::__normal_iterator<int const*, std::vector<int, std::allocator<int> > >, int) [clone .exit]
retq
For comparison, the ASM for the same function but using std::vector<double> is:
sum(std::vector<double, std::allocator<double> > const&):
movq 8(%rdi), %rdx
movq (%rdi), %rax
pxor %xmm0, %xmm0
cmpq %rax, %rdx
je .L4
.L3:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rdx
jne .L3
rep ret
.L4:
rep ret
The ASM for std::vector<double> seems fairly trivial, while the ASM for std::vector<int> appears markedly more complex. I'm assuming there is some clever optimisation going on with std::vector<int>, but I'm at a bit of a loss to explain what's going on. Could someone enlighten me?
Short answer: the compiler has vectorised and unrolled the loop for adding integers. Compare the vector<double> version, which has these lines:
addsd (%rax), %xmm0
addq $8, %rax
This means it's adding a single double into the sum, then moving on 8 bytes and looping.
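In C terms, that pair of instructions is roughly the following (just a sketch of the pattern, not compiler output; first and last are placeholder pointers):
double sum = 0.0;
for (const double* p = first; p != last; ++p) {  // addq $8, %rax / cmpq / jne
    sum += *p;                                   // addsd (%rax), %xmm0
}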
The same code in the main loop of the vector<int> version does:
movdqu -112(%rsi), %xmm2
movdqu -96(%rsi), %xmm3
movdqu -80(%rsi), %xmm4
movdqu -64(%rsi), %xmm5
...
movdqu -48(%rsi), %xmm4
movdqu -32(%rsi), %xmm5
...
movdqu -16(%rsi), %xmm0
...
movdqu (%rsi), %xmm1
...
subq $-128, %rsi
The movdqu loads show it's handling 16 bytes at once (4 integers), and the subq $-128, %rsi shows it's processing 128 bytes (or 32 ints) per loop iteration, across 8 loads. The end result of each iteration is that the next 32 ints are added into the 8 32-bit slots spread across xmm0:xmm1.
LBB0_11 then takes the output from the main loop (which is 8 integers across xmm0 and xmm1) and finds the sum of those.
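For illustration, that horizontal reduction could be written with SSE2 intrinsics roughly like this (a sketch only; the function name and argument assignments are mine, not the compiler's):
#include <emmintrin.h>

// Collapse the 8 partial sums held in two XMM registers into a single int.
int horizontal_sum(__m128i acc0, __m128i acc1) {
    __m128i v = _mm_add_epi32(acc0, acc1);             // paddd %xmm1, %xmm0 -> 4 partial sums
    v = _mm_add_epi32(v, _mm_shuffle_epi32(v, 0x4E));  // pshufd $78: swap the 64-bit halves, add
    v = _mm_add_epi32(v, _mm_shuffle_epi32(v, 0xE5));  // pshufd $229: bring lane 1 down, add
    return _mm_cvtsi128_si32(v);                       // movd %xmm0, %eax
}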
LBB0_12 then finishes off any ints at the end of the vector which couldn't be consumed by the main loop (since the main loop works on 32 integers at a time).
It vectorises the adds so it can handle 4 integers at once, which is generally faster than doing one integer at a time. It also unrolls the loop so that it can do more than one iteration of adding per pass. (The double version stays scalar, presumably because reordering floating-point additions can change the result, so the compiler won't vectorise that reduction without something like -ffast-math.)
Explanation of vectorization: What does vectorization mean?
Explanation of loop unrolling: When, if ever, is loop unrolling still useful?
I've not analysed the start of the code for the integer case, but generally this is setting up the loop by aligning it to a 16 byte boundary before it starts the main loop.

Add up all elements of compile-time sized array most efficiently

I'm trying to efficiently add everything up in a compile-time sized array, using the least number of instructions. Naturally I'm using templates. I created this:
template<unsigned int startIndex, unsigned int count>
int AddCollapseArray(int theArray[])
{
    if (count == 1)
    {
        return theArray[startIndex];
    }
    else if (count == 2)
    {
        return theArray[startIndex] + theArray[startIndex + 1];
    }
    else if (count % 2 == 0)
    {
        return AddCollapseArray<startIndex, count / 2>(theArray) + AddCollapseArray<startIndex + count / 2, count / 2>(theArray);
    }
    else if (count % 2 == 1)
    {
        int newCount = count - 1;
        return AddCollapseArray<startIndex, newCount / 2>(theArray) + AddCollapseArray<startIndex + newCount / 2, newCount / 2>(theArray) + theArray[startIndex + newCount];
    }
}
This looks to me like it will get the job done most efficiently. I think the branching and all the arithmetic besides the additions will be completely optimized out. Are there any flaws with doing it this way?
Don't try to outsmart the optimizer. All this complicated template machinery just makes it harder for the optimizer to understand what you actually want to do.
For example,
int f0(int *p) {
    return AddCollapseArray<0, 10>(p);
}
int f1(int *p) {
    return std::accumulate(p+0, p+10, 0);
}
both produce exactly the same assembly with clang at -O3:
f0(int*): # #f0(int*)
movl 4(%rdi), %eax
addl (%rdi), %eax
addl 8(%rdi), %eax
addl 12(%rdi), %eax
addl 16(%rdi), %eax
addl 20(%rdi), %eax
addl 24(%rdi), %eax
addl 28(%rdi), %eax
addl 32(%rdi), %eax
addl 36(%rdi), %eax
retq
f1(int*): # #f1(int*)
movl 4(%rdi), %eax
addl (%rdi), %eax
addl 8(%rdi), %eax
addl 12(%rdi), %eax
addl 16(%rdi), %eax
addl 20(%rdi), %eax
addl 24(%rdi), %eax
addl 28(%rdi), %eax
addl 32(%rdi), %eax
addl 36(%rdi), %eax
retq
Let's say we want to do 100 elements:
int f0(int *p) {
    return AddCollapseArray<0, 100>(p);
}
int f1(int *p) {
    return std::accumulate(p+0, p+100, 0);
}
Here's what we get:
f0(int*): # #f0(int*)
pushq %rbp
pushq %rbx
pushq %rax
movq %rdi, %rbx
callq int AddCollapseArray<0u, 50u>(int*)
movl %eax, %ebp
movq %rbx, %rdi
callq int AddCollapseArray<50u, 50u>(int*)
addl %ebp, %eax
addq $8, %rsp
popq %rbx
popq %rbp
retq
f1(int*): # #f1(int*)
movdqu (%rdi), %xmm0
movdqu 16(%rdi), %xmm1
movdqu 32(%rdi), %xmm2
movdqu 48(%rdi), %xmm3
paddd %xmm0, %xmm1
paddd %xmm2, %xmm1
paddd %xmm3, %xmm1
movdqu 64(%rdi), %xmm0
paddd %xmm1, %xmm0
movdqu 80(%rdi), %xmm1
paddd %xmm0, %xmm1
movdqu 96(%rdi), %xmm0
paddd %xmm1, %xmm0
movdqu 112(%rdi), %xmm1
paddd %xmm0, %xmm1
movdqu 128(%rdi), %xmm0
paddd %xmm1, %xmm0
movdqu 144(%rdi), %xmm1
paddd %xmm0, %xmm1
movdqu 160(%rdi), %xmm0
paddd %xmm1, %xmm0
movdqu 176(%rdi), %xmm1
paddd %xmm0, %xmm1
movdqu 192(%rdi), %xmm0
paddd %xmm1, %xmm0
movdqu 208(%rdi), %xmm1
paddd %xmm0, %xmm1
movdqu 224(%rdi), %xmm0
paddd %xmm1, %xmm0
movdqu 240(%rdi), %xmm1
paddd %xmm0, %xmm1
movdqu 256(%rdi), %xmm0
paddd %xmm1, %xmm0
movdqu 272(%rdi), %xmm1
paddd %xmm0, %xmm1
movdqu 288(%rdi), %xmm0
paddd %xmm1, %xmm0
movdqu 304(%rdi), %xmm1
paddd %xmm0, %xmm1
movdqu 320(%rdi), %xmm0
paddd %xmm1, %xmm0
movdqu 336(%rdi), %xmm1
paddd %xmm0, %xmm1
movdqu 352(%rdi), %xmm0
paddd %xmm1, %xmm0
movdqu 368(%rdi), %xmm1
paddd %xmm0, %xmm1
movdqu 384(%rdi), %xmm0
paddd %xmm1, %xmm0
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
paddd %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
paddd %xmm1, %xmm0
movd %xmm0, %eax
retq
int AddCollapseArray<0u, 50u>(int*): # #int AddCollapseArray<0u, 50u>(int*)
movl 4(%rdi), %eax
addl (%rdi), %eax
addl 8(%rdi), %eax
addl 12(%rdi), %eax
addl 16(%rdi), %eax
addl 20(%rdi), %eax
addl 24(%rdi), %eax
addl 28(%rdi), %eax
addl 32(%rdi), %eax
addl 36(%rdi), %eax
addl 40(%rdi), %eax
addl 44(%rdi), %eax
addl 48(%rdi), %eax
addl 52(%rdi), %eax
addl 56(%rdi), %eax
addl 60(%rdi), %eax
addl 64(%rdi), %eax
addl 68(%rdi), %eax
addl 72(%rdi), %eax
addl 76(%rdi), %eax
addl 80(%rdi), %eax
addl 84(%rdi), %eax
addl 88(%rdi), %eax
addl 92(%rdi), %eax
addl 96(%rdi), %eax
addl 100(%rdi), %eax
addl 104(%rdi), %eax
addl 108(%rdi), %eax
addl 112(%rdi), %eax
addl 116(%rdi), %eax
addl 120(%rdi), %eax
addl 124(%rdi), %eax
addl 128(%rdi), %eax
addl 132(%rdi), %eax
addl 136(%rdi), %eax
addl 140(%rdi), %eax
addl 144(%rdi), %eax
addl 148(%rdi), %eax
addl 152(%rdi), %eax
addl 156(%rdi), %eax
addl 160(%rdi), %eax
addl 164(%rdi), %eax
addl 168(%rdi), %eax
addl 172(%rdi), %eax
addl 176(%rdi), %eax
addl 180(%rdi), %eax
addl 184(%rdi), %eax
addl 188(%rdi), %eax
addl 192(%rdi), %eax
addl 196(%rdi), %eax
retq
int AddCollapseArray<50u, 50u>(int*): # #int AddCollapseArray<50u, 50u>(int*)
movl 204(%rdi), %eax
addl 200(%rdi), %eax
addl 208(%rdi), %eax
addl 212(%rdi), %eax
addl 216(%rdi), %eax
addl 220(%rdi), %eax
addl 224(%rdi), %eax
addl 228(%rdi), %eax
addl 232(%rdi), %eax
addl 236(%rdi), %eax
addl 240(%rdi), %eax
addl 244(%rdi), %eax
addl 248(%rdi), %eax
addl 252(%rdi), %eax
addl 256(%rdi), %eax
addl 260(%rdi), %eax
addl 264(%rdi), %eax
addl 268(%rdi), %eax
addl 272(%rdi), %eax
addl 276(%rdi), %eax
addl 280(%rdi), %eax
addl 284(%rdi), %eax
addl 288(%rdi), %eax
addl 292(%rdi), %eax
addl 296(%rdi), %eax
addl 300(%rdi), %eax
addl 304(%rdi), %eax
addl 308(%rdi), %eax
addl 312(%rdi), %eax
addl 316(%rdi), %eax
addl 320(%rdi), %eax
addl 324(%rdi), %eax
addl 328(%rdi), %eax
addl 332(%rdi), %eax
addl 336(%rdi), %eax
addl 340(%rdi), %eax
addl 344(%rdi), %eax
addl 348(%rdi), %eax
addl 352(%rdi), %eax
addl 356(%rdi), %eax
addl 360(%rdi), %eax
addl 364(%rdi), %eax
addl 368(%rdi), %eax
addl 372(%rdi), %eax
addl 376(%rdi), %eax
addl 380(%rdi), %eax
addl 384(%rdi), %eax
addl 388(%rdi), %eax
addl 392(%rdi), %eax
addl 396(%rdi), %eax
retq
Not only is your function not fully inlined, it's also not vectorized. GCC produces similar results.
The important qualifier here is the meaning of "least number of instructions". If that is to be interpreted as causing the CPU to perform the fewest steps, and we further stipulate that no advanced techniques are to be employed, like SIMD, GPU programming or OMP (or other auto-parallel technologies)... just C or C++, then consider:
Assuming something like:
int a[ 10 ];
Which is filled with data at runtime, and will always contain 10 entries (0 through 9)
The std::accumulate does a nice job here, creating a tight loop in the assembler, no mess...just quick:
int r = std::accumulate( &a[ 0 ], &a[ 9 ], 0 );
Of course, some const int signifying the size of the array 'a' would be in order.
This compares curiously to:
for( int n=0; n < 10; ++n ) r += a[ n ];
The compiler very smartly emits 10 add instructions unrolled - it doesn't even bother with a loop.
Now, this means that in std::accumulate, though the loop is tight, there will be, at the minimum, two add instructions for each element (one for the sum, and one to increment the iterator). Add to that the comparison instruction and a conditional jump, and there are at least 4 instructions per item, or about 40 machine language steps of various cost in ticks.
On the other hand, the unrolled result of the for loop is just 10 machine steps, which the CPU can very likely schedule with great cache friendliness, and no jumps.
The for loop is definitely faster.
The compiler "knows" what you're trying to do, and gets to the job as well as you might think through it with the proposed code you posted.
Further, if the size of the array gets too outlandish for unrolling the loop, the compiler automatically performs the classic optimization that std::accumulate does not appear to do here for some reason... i.e., performing two additions per loop (when it does construct a loop, because of the number of elements).
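That "two additions per loop" idea looks roughly like this in C (my own sketch, not the compiler's output; compare the paired add instructions in the VC listing for the while loop further down):
int sum_two_accumulators(const int* a, int n) {
    int s0 = 0, s1 = 0;                 // two independent partial sums
    int i = 0;
    for (; i + 1 < n; i += 2) {
        s0 += a[i];
        s1 += a[i + 1];
    }
    if (i < n) s0 += a[i];              // scalar tail for an odd count
    return s0 + s1;
}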
Using VC 2012, this source:
int r = std::accumulate( &a[ 0 ], &a[ 9 ], 0 );
int z = 0;
int *ap = a;
int *ae = &a[9];
while( ap <= ae ) { z += *ap; ++ap; }
int z2 = 0;
for (int n=0; n < 10; ++n ) z2 += a[ n ];
Produces the following assembler snippets on a release build in VC2012
int r = std::accumulate( &a[ 0 ], &a[ 9 ], 0 );
00301270 33 D2 xor edx,edx
00301272 B8 D4 40 30 00 mov eax,3040D4h
00301277 EB 07 jmp wmain+10h (0301280h)
00301279 8D A4 24 00 00 00 00 lea esp,[esp]
00301280 03 10 add edx,dword ptr [eax]
00301282 83 C0 04 add eax,4
00301285 3D F8 40 30 00 cmp eax,3040F8h
0030128A 75 F4 jne wmain+10h (0301280h)
while( ap <= ae ) { z += *ap; ++ap; }
003012A0 03 08 add ecx,dword ptr [eax]
003012A2 03 70 04 add esi,dword ptr [eax+4]
003012A5 83 C0 08 add eax,8
003012A8 3D F4 40 30 00 cmp eax,3040F4h
003012AD 7E F1 jle wmain+30h (03012A0h)
003012AF 3D F8 40 30 00 cmp eax,3040F8h
003012B4 77 02 ja wmain+48h (03012B8h)
003012B6 8B 38 mov edi,dword ptr [eax]
003012B8 8D 04 0E lea eax,[esi+ecx]
003012BB 03 F8 add edi,eax
for (int n=0; n < 10; ++n ) z2 += a[ n ];
003012BD A1 D4 40 30 00 mov eax,dword ptr ds:[003040D4h]
003012C2 03 05 F8 40 30 00 add eax,dword ptr ds:[3040F8h]
003012C8 03 05 D8 40 30 00 add eax,dword ptr ds:[3040D8h]
003012CE 03 05 DC 40 30 00 add eax,dword ptr ds:[3040DCh]
003012D4 03 05 E0 40 30 00 add eax,dword ptr ds:[3040E0h]
003012DA 03 05 E4 40 30 00 add eax,dword ptr ds:[3040E4h]
003012E0 03 05 E8 40 30 00 add eax,dword ptr ds:[3040E8h]
003012E6 03 05 EC 40 30 00 add eax,dword ptr ds:[3040ECh]
003012EC 03 05 F0 40 30 00 add eax,dword ptr ds:[3040F0h]
003012F2 03 05 F4 40 30 00 add eax,dword ptr ds:[3040F4h]
Based on comments I decided to try this in Xcode 7, with drastically different results. This is its unrolling of the for loop:
.loc 1 58 36 ## /Users/jv/testclang/testcp/checkloop/checkloop/main.cpp:58:36
movq _a(%rip), %rax
Ltmp22:
##DEBUG_VALUE: do3:z2 <- EAX
movq %rax, %rcx
shrq $32, %rcx
.loc 1 58 33 is_stmt 0 ## /Users/jv/testclang/testcp/checkloop/checkloop/main.cpp:58:33
addl %eax, %ecx
.loc 1 58 36 ## /Users/jv/testclang/testcp/checkloop/checkloop/main.cpp:58:36
movq _a+8(%rip), %rax
Ltmp23:
.loc 1 58 33 ## /Users/jv/testclang/testcp/checkloop/checkloop/main.cpp:58:33
movl %eax, %edx
addl %ecx, %edx
shrq $32, %rax
addl %edx, %eax
.loc 1 58 36 ## /Users/jv/testclang/testcp/checkloop/checkloop/main.cpp:58:36
movq _a+16(%rip), %rcx
.loc 1 58 33 ## /Users/jv/testclang/testcp/checkloop/checkloop/main.cpp:58:33
movl %ecx, %edx
addl %eax, %edx
shrq $32, %rcx
addl %edx, %ecx
.loc 1 58 36 ## /Users/jv/testclang/testcp/checkloop/checkloop/main.cpp:58:36
movq _a+24(%rip), %rax
.loc 1 58 33 ## /Users/jv/testclang/testcp/checkloop/checkloop/main.cpp:58:33
movl %eax, %edx
addl %ecx, %edx
shrq $32, %rax
addl %edx, %eax
.loc 1 58 36 ## /Users/jv/testclang/testcp/checkloop/checkloop/main.cpp:58:36
movq _a+32(%rip), %rcx
.loc 1 58 33 ## /Users/jv/testclang/testcp/checkloop/checkloop/main.cpp:58:33
movl %ecx, %edx
addl %eax, %edx
shrq $32, %rcx
addl %edx, %ecx
This may not look as clean as VC's simple list, but it may run just as fast, because the setup (movq or movl) for each addition can run in parallel in the CPU while the previous entry is finishing its addition, costing little to nothing compared to VC's simple, clean-looking series of adds on memory sources.
The following is Xcode's std::accumulate. It SEEMS an init is required, but then it performs a clean series of additions, having unrolled the loop, which VC did not do.
.file 37 "/Applications/Xcode7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/../include/c++/v1" "numeric"
.loc 37 75 27 is_stmt 1 ## /Applications/Xcode7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/../include/c++/v1/numeric:75:27
movq _a(%rip), %r14
Ltmp11:
movq %r14, -48(%rbp) ## 8-byte Spill
Ltmp12:
shrq $32, %r14
movq _a+8(%rip), %rbx
movq %rbx, -56(%rbp) ## 8-byte Spill
shrq $32, %rbx
movq _a+16(%rip), %r13
movq %r13, -72(%rbp) ## 8-byte Spill
shrq $32, %r13
movq _a+24(%rip), %r15
movq %r15, %r12
shrq $32, %r12
Ltmp13:
movl _a+32(%rip), %eax
Ltmp14:
movq -48(%rbp), %rax ## 8-byte Reload
addl %eax, %r14d
movq -56(%rbp), %rax ## 8-byte Reload
addl %eax, %r14d
addl %ebx, %r14d
movq -72(%rbp), %rax ## 8-byte Reload
addl %eax, %r14d
addl %r13d, %r14d
addl %r15d, %r14d
addl %r12d, %r14d
addl -64(%rbp), %r14d ## 4-byte Folded Reload
The bottom line here is that the optimizations we rely upon from compilers differ so widely and wildly from one compiler to another that we should rely upon them, but keep watch.
LLVM is quite exemplary, and it seems to understand std::accumulate better than VC - but this short investigation can't reveal whether that is a difference in the implementation of the library or of the compiler. There could be important differences in the implementation of Xcode's std::accumulate which give the compiler more insight than VC's version of the library.
That applies more generally to algorithms, even those from <numeric>. std::accumulate is a for loop. It is likely expanded inline as a for loop based on pointers into the array, which is why VC's choice to create a loop for std::accumulate was echoed in its choice to produce a loop for the code using an int * to step through the array, while it unrolled the loop for the for loop that referenced entries by integer index. In other words, it really did no better in a straight for loop when pointers were used, and that suggests it's VC's optimizer, not the library, in this case.
This follows Stroustrup's own favorite example of the idea of information available to the compiler, comparing qsort from C and sort from C++. qsort takes a function pointer to perform the comparison, cutting the compiler off from understanding the comparison and forcing it to call a function via a pointer. The C++ sort function, on the other hand, takes a functor, which conveys more information about the comparison. That could still result in a function call, but the optimizer has the opportunity to understand the comparison sufficiently to inline it.
In VC's case, for whatever reason (we'd have to ask Microsoft), the compiler is confused when looping through the array via pointers. The information given to it is different than with the loop using an integer to index the array. It understands the latter, but not the pointers. LLVM, by contrast, understood both (and more). The difference in information is not important to LLVM, but it is to VC. Since std::accumulate is really inline code representing a for loop, and that loop is processed via pointers, it escaped VC's recognition, just as VC failed on the straight for loop based on pointers. If a specialization could be made for integer arrays, such that accumulate looped with indexes rather than pointers, VC would respond with better output, but it shouldn't have to be that way.
A poor optimizer can miss the point, and a poor implementation of the library could confuse the optimizer, which means that under the best circumstances std::accumulate can perform about as well as the for loop for a simple array of integers, producing an unrolled version of the loop creating the sum, but not always. However, there's little to get in the way of the compiler's understanding in a for loop... everything is right there, and the implementation of the library can't mess it up; it's all up to the compiler at that point. For that, VC shows its weakness.
I tried all settings on VC to try to get it to unroll std::accumulate, but so far it never did (haven't tried newer versions of VC).
It didn't take much to get Xcode to unroll the loop; LLVM seems to have deeper engineering. It may have a better implementation of the library, too.
Incidentally, the C code example I posted at the top was used in VC, which didn't recognize that the three different summations were related. LLVM in Xcode did, which meant the first time I tried it there it simply adopted the answer from std::accumulate and did nothing more for the other two. VC was really weak on that point. In order to get Xcode to perform 3 separate tests, I randomized the array before each call... otherwise Xcode realized what I was doing where VC did not.
While std::accumulate should be enough, if you want to unroll the loop manually you may do:
#include <cstddef>
#include <utility>

namespace detail
{
    template<std::size_t startIndex, std::size_t... Is>
    int Accumulate(std::index_sequence<Is...>, const int a[])
    {
        int res = 0;
        const int dummy[] = {0, ((res += a[startIndex + Is]), 0)...};
        static_cast<void>(dummy); // Remove warning for unused variable
        return res;
    }
}

template<std::size_t startIndex, std::size_t count>
int AddCollapseArray(const int a[])
{
    return detail::Accumulate<startIndex>(std::make_index_sequence<count>{}, a);
}
or, in C++17, with a fold expression:
namespace detail
{
    template<std::size_t startIndex, std::size_t... Is>
    int Accumulate(std::index_sequence<Is...>, const int a[])
    {
        return (a[startIndex + Is] + ...);
    }
}
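A hypothetical usage of either version, reusing the AddCollapseArray wrapper above (the data and indices are only for illustration):
int data[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
int s = AddCollapseArray<2, 5>(data); // sums data[2]..data[6]: 2 + 3 + 4 + 5 + 6 == 20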

Optimization of naive matrix multiplication (ICC vs GCC)

The code below uses a very straightforward approach to calculate the matrix product a * b and store the result in c. The code was compiled with -O3 on both GCC 4.4.6 (with -mtune=native) and Intel Compiler 13.0.1, and the GCC build is significantly slower (by more than a factor of two for the sample data used).
I'm curious about the cause of these differences, but unfortunately I'm not familiar enough with the assembly output to understand what's going on here. From a glance it looks as though ICC is doing a better job at vectorizing the computation, but I can't decipher much more than that. (This is mainly for learning purposes since there's no way I would use this in production!)
void __attribute__ ((noinline)) mm( // Line 3
    int n,
    double*__restrict__ c,
    double*__restrict__ a,
    double*__restrict__ b
) {
    int i, j, k;
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            c[i + n * j] = 0; // Line 12
            for (k = 0; k < n; k++) {
                c[i + n * j] += a[i + n * k] * b[k + n * j]; // Line 14
            }
        }
    }
}
Here is the output from GCC:
_Z2mmiPdS_S_:
.LFB0:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
pushq %r14 #
.cfi_def_cfa_offset 16
.cfi_offset 14, -16
testl %edi, %edi # n
movq %rcx, %r14 # b, b
pushq %r13 #
.cfi_def_cfa_offset 24
.cfi_offset 13, -24
pushq %r12 #
.cfi_def_cfa_offset 32
.cfi_offset 12, -32
pushq %rbp #
.cfi_def_cfa_offset 40
.cfi_offset 6, -40
pushq %rbx #
.cfi_def_cfa_offset 48
.cfi_offset 3, -48
jle .L6 #,
leal -1(%rdi), %eax #, tmp96
movslq %edi, %r11 # n, n
movq %rdx, %rbx # a, ivtmp.54
xorl %r12d, %r12d # ivtmp.67
salq $3, %r11 #, D.2193
xorl %ebp, %ebp # prephitmp.37
leaq 8(,%rax,8), %r13 #, D.2208
.L3:
leaq (%rsi,%r12), %r10 #, ivtmp.61
movq %r14, %rcx # b, ivtmp.63
xorl %edx, %edx # j
.p2align 4,,10
.p2align 3
.L5:
movq $0, (%r10) #,* ivtmp.61
movq %rbp, -8(%rsp) # prephitmp.37,
movq %rcx, %r9 # ivtmp.63, ivtmp.70
movsd -8(%rsp), %xmm1 #, prephitmp.37
movq %rbx, %r8 # ivtmp.54, ivtmp.69
xorl %eax, %eax # k
.p2align 4,,10
.p2align 3
.L4:
movsd (%r8), %xmm0 #* ivtmp.69, tmp99
addl $1, %eax #, k
addq %r11, %r8 # D.2193, ivtmp.69
mulsd (%r9), %xmm0 #* ivtmp.70, tmp99
addq $8, %r9 #, ivtmp.70
cmpl %edi, %eax # n, k
addsd %xmm0, %xmm1 # tmp99, prephitmp.37
movsd %xmm1, (%r10) # prephitmp.37,* ivtmp.61
jne .L4 #,
addl $1, %edx #, j
addq %r11, %r10 # D.2193, ivtmp.61
addq %r11, %rcx # D.2193, ivtmp.63
cmpl %edi, %edx # n, j
jne .L5 #,
addq $8, %r12 #, ivtmp.67
addq $8, %rbx #, ivtmp.54
cmpq %r13, %r12 # D.2208, ivtmp.67
jne .L3 #,
.L6:
popq %rbx #
.cfi_def_cfa_offset 40
popq %rbp #
.cfi_def_cfa_offset 32
popq %r12 #
.cfi_def_cfa_offset 24
popq %r13 #
.cfi_def_cfa_offset 16
popq %r14 #
.cfi_def_cfa_offset 8
ret
.cfi_endproc
And here is the output from ICC:
# -- Begin _Z2mmiPdS_S_
# mark_begin;
.align 16,0x90
.globl _Z2mmiPdS_S_
_Z2mmiPdS_S_:
# parameter 1: %edi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %rcx
..B1.1: # Preds ..B1.0
..___tag_value__Z2mmiPdS_S_.1: #8.3
pushq %r12 #8.3
..___tag_value__Z2mmiPdS_S_.3: #
pushq %r13 #8.3
..___tag_value__Z2mmiPdS_S_.5: #
pushq %r14 #8.3
..___tag_value__Z2mmiPdS_S_.7: #
pushq %r15 #8.3
..___tag_value__Z2mmiPdS_S_.9: #
pushq %rbx #8.3
..___tag_value__Z2mmiPdS_S_.11: #
pushq %rbp #8.3
..___tag_value__Z2mmiPdS_S_.13: #
subq $72, %rsp #8.3
..___tag_value__Z2mmiPdS_S_.15: #
movq %rsi, %r9 #
movslq %edi, %rax #
xorl %r10d, %r10d #11.9
testl %edi, %edi #11.25
jle ..B1.7 # Prob 10% #11.25
# LOE rax rdx rcx rbx rbp rsi r9 r12 r13 r14 r15 edi r10d
..B1.2: # Preds ..B1.1
movl %edi, %r11d #10.5
lea (,%rax,8), %r8 #
andl $-4, %r11d #10.5
movq %rax, %r14 #12.28
movslq %r11d, %r11 #10.5
movl %edi, %r12d #12.28
movq %rsi, 8(%rsp) #12.28
movq %r8, %rbp #12.28
movq %rdx, 32(%rsp) #12.28
movq %r9, %r13 #12.28
movq %rcx, (%rsp) #12.28
movl %r10d, %r15d #12.28
pxor %xmm0, %xmm0 #12.28
movq %r11, %rbx #12.28
# LOE rbx rbp r13 r14 r12d r15d
..B1.3: # Preds ..B1.5 ..B1.48 ..B1.45 ..B1.2
cmpl $12, %r12d #10.5
jle ..B1.38 # Prob 0% #10.5
# LOE rbx rbp r13 r14 r12d r15d
..B1.4: # Preds ..B1.3
movq %r13, %rdi #12.13
xorl %esi, %esi #12.13
movq %rbp, %rdx #12.13
call _intel_fast_memset #12.13
# LOE rbx rbp r13 r14 r12d r15d
..B1.5: # Preds ..B1.4
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
# LOE rbx rbp r13 r14 r12d r15d
..B1.6: # Preds ..B1.48 ..B1.45 ..B1.5 # Infreq
movl %r12d, %edi #
movq %r14, %rax #
movq 8(%rsp), %rsi #
testl %edi, %edi #11.25
movq 32(%rsp), %rdx #
movq (%rsp), %rcx #
# LOE rax rdx rcx rbx rbp rsi r12 r13 r14 r15 edi
..B1.7: # Preds ..B1.1 ..B1.6 # Infreq
movl $0, %r9d #11.9
movl $0, %r8d #
jle ..B1.33 # Prob 10% #11.25
# LOE rax rdx rcx rbx rbp rsi r8 r12 r13 r14 r15 edi r9d
..B1.8: # Preds ..B1.7 # Infreq
movq %rdx, 32(%rsp) #
# LOE rax rcx rsi r8 edi r9d
..B1.9: # Preds ..B1.31 ..B1.8 # Infreq
xorl %r12d, %r12d #
lea (%rsi,%r8,8), %r13 #14.17
movq %r13, %r15 #10.5
xorl %ebx, %ebx #13.13
andq $15, %r15 #10.5
xorl %r10d, %r10d #
movl %r15d, %r14d #10.5
lea (%rcx,%r8,8), %rbp #14.48
andl $7, %r14d #10.5
xorl %r11d, %r11d #
movl %r14d, 48(%rsp) #
xorl %edx, %edx #
movl %r15d, 56(%rsp) #
movq %r13, 40(%rsp) #
movq %r8, 16(%rsp) #
movl %r9d, 24(%rsp) #
movq %rsi, 8(%rsp) #
movq %rcx, (%rsp) #
movq 32(%rsp), %r14 #
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.10: # Preds ..B1.30 ..B1.9 # Infreq
cmpq $8, %rax #10.5
jl ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.11: # Preds ..B1.10 # Infreq
movl 56(%rsp), %r9d #10.5
testl %r9d, %r9d #10.5
je ..B1.14 # Prob 50% #10.5
# LOE rax rdx rbp r9 r10 r12 r14 ebx edi r11d
..B1.12: # Preds ..B1.11 # Infreq
cmpl $0, 48(%rsp) #10.5
jne ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.13: # Preds ..B1.12 # Infreq
movl $1, %r9d #10.5
# LOE rax rdx rbp r9 r10 r12 r14 ebx edi r11d
..B1.14: # Preds ..B1.13 ..B1.11 # Infreq
movl %r9d, %r13d #10.5
lea 8(%r13), %rcx #10.5
cmpq %rcx, %rax #10.5
jl ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r9 r10 r12 r13 r14 ebx edi r11d
..B1.15: # Preds ..B1.14 # Infreq
movl %edi, %r15d #10.5
xorl %ecx, %ecx #10.5
subl %r9d, %r15d #10.5
movslq %r11d, %r8 #14.33
andl $7, %r15d #10.5
negl %r15d #10.5
addl %edi, %r15d #10.5
movslq %r15d, %r15 #10.5
testq %r13, %r13 #10.5
lea (%r14,%r8,8), %rsi #14.33
jbe ..B1.35 # Prob 0% #10.5
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d
..B1.16: # Preds ..B1.15 # Infreq
movsd (%r10,%rbp), %xmm0 #14.48
movq 40(%rsp), %r14 #14.48
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.17: # Preds ..B1.17 ..B1.16 # Infreq
movsd (%rsi,%rcx,8), %xmm1 #14.33
mulsd %xmm0, %xmm1 #14.48
addsd (%r14,%rcx,8), %xmm1 #14.17
movsd %xmm1, (%r14,%rcx,8) #14.17
incq %rcx #10.5
cmpq %r13, %rcx #10.5
jb ..B1.17 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.18: # Preds ..B1.17 # Infreq
movq 32(%rsp), %r14 #
# LOE rax rdx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.19: # Preds ..B1.18 ..B1.35 # Infreq
addq %r9, %r8 #14.33
lea (%r14,%r8,8), %rcx #14.33
testq $15, %rcx #10.5
je ..B1.23 # Prob 60% #10.5
# LOE rax rdx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.20: # Preds ..B1.19 # Infreq
movq 40(%rsp), %rcx #14.48
unpcklpd %xmm0, %xmm0 #14.48
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.21: # Preds ..B1.21 ..B1.20 # Infreq
movsd (%rsi,%r13,8), %xmm1 #14.33
movsd 16(%rsi,%r13,8), %xmm2 #14.33
movsd 32(%rsi,%r13,8), %xmm3 #14.33
movsd 48(%rsi,%r13,8), %xmm4 #14.33
movhpd 8(%rsi,%r13,8), %xmm1 #14.33
movhpd 24(%rsi,%r13,8), %xmm2 #14.33
movhpd 40(%rsi,%r13,8), %xmm3 #14.33
movhpd 56(%rsi,%r13,8), %xmm4 #14.33
mulpd %xmm0, %xmm1 #14.48
mulpd %xmm0, %xmm2 #14.48
mulpd %xmm0, %xmm3 #14.48
mulpd %xmm0, %xmm4 #14.48
addpd (%rcx,%r13,8), %xmm1 #14.17
addpd 16(%rcx,%r13,8), %xmm2 #14.17
addpd 32(%rcx,%r13,8), %xmm3 #14.17
addpd 48(%rcx,%r13,8), %xmm4 #14.17
movaps %xmm1, (%rcx,%r13,8) #14.17
movaps %xmm2, 16(%rcx,%r13,8) #14.17
movaps %xmm3, 32(%rcx,%r13,8) #14.17
movaps %xmm4, 48(%rcx,%r13,8) #14.17
addq $8, %r13 #10.5
cmpq %r15, %r13 #10.5
jb ..B1.21 # Prob 82% #10.5
jmp ..B1.26 # Prob 100% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.23: # Preds ..B1.19 # Infreq
movq 40(%rsp), %rcx #14.48
unpcklpd %xmm0, %xmm0 #14.48
.align 16,0x90
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.24: # Preds ..B1.24 ..B1.23 # Infreq
movaps (%rsi,%r13,8), %xmm1 #14.33
movaps 16(%rsi,%r13,8), %xmm2 #14.33
movaps 32(%rsi,%r13,8), %xmm3 #14.33
movaps 48(%rsi,%r13,8), %xmm4 #14.33
mulpd %xmm0, %xmm1 #14.48
mulpd %xmm0, %xmm2 #14.48
mulpd %xmm0, %xmm3 #14.48
mulpd %xmm0, %xmm4 #14.48
addpd (%rcx,%r13,8), %xmm1 #14.17
addpd 16(%rcx,%r13,8), %xmm2 #14.17
addpd 32(%rcx,%r13,8), %xmm3 #14.17
addpd 48(%rcx,%r13,8), %xmm4 #14.17
movaps %xmm1, (%rcx,%r13,8) #14.17
movaps %xmm2, 16(%rcx,%r13,8) #14.17
movaps %xmm3, 32(%rcx,%r13,8) #14.17
movaps %xmm4, 48(%rcx,%r13,8) #14.17
addq $8, %r13 #10.5
cmpq %r15, %r13 #10.5
jb ..B1.24 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.26: # Preds ..B1.24 ..B1.21 ..B1.34 # Infreq
cmpq %rax, %r15 #10.5
jae ..B1.30 # Prob 0% #10.5
# LOE rax rdx rbp r10 r12 r14 r15 ebx edi r11d
..B1.27: # Preds ..B1.26 # Infreq
movsd (%rbp,%r12,8), %xmm0 #14.48
lea (%r14,%rdx,8), %rcx #14.33
movq 40(%rsp), %rsi #14.48
# LOE rax rdx rcx rbp rsi r10 r12 r14 r15 ebx edi r11d xmm0
..B1.28: # Preds ..B1.28 ..B1.27 # Infreq
movsd (%rcx,%r15,8), %xmm1 #14.33
mulsd %xmm0, %xmm1 #14.48
addsd (%rsi,%r15,8), %xmm1 #14.17
movsd %xmm1, (%rsi,%r15,8) #14.17
incq %r15 #10.5
cmpq %rax, %r15 #10.5
jb ..B1.28 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r14 r15 ebx edi r11d xmm0
..B1.30: # Preds ..B1.28 ..B1.26 # Infreq
incl %ebx #13.13
addq %rax, %rdx #13.13
addl %edi, %r11d #13.13
addq $8, %r10 #13.13
incq %r12 #13.13
cmpl %edi, %ebx #13.13
jb ..B1.10 # Prob 82% #13.13
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.31: # Preds ..B1.30 # Infreq
movl 24(%rsp), %r9d #
incl %r9d #11.9
movq 16(%rsp), %r8 #
addq %rax, %r8 #11.9
movq 8(%rsp), %rsi #
cmpl %edi, %r9d #11.9
movq (%rsp), %rcx #
jb ..B1.9 # Prob 82% #11.9
# LOE rax rcx rsi r8 edi r9d
..B1.33: # Preds ..B1.31 ..B1.7 # Infreq
addq $72, %rsp #18.1
..___tag_value__Z2mmiPdS_S_.16: #
popq %rbp #18.1
..___tag_value__Z2mmiPdS_S_.18: #
popq %rbx #18.1
..___tag_value__Z2mmiPdS_S_.20: #
popq %r15 #18.1
..___tag_value__Z2mmiPdS_S_.22: #
popq %r14 #18.1
..___tag_value__Z2mmiPdS_S_.24: #
popq %r13 #18.1
..___tag_value__Z2mmiPdS_S_.26: #
popq %r12 #18.1
..___tag_value__Z2mmiPdS_S_.28: #
ret #18.1
..___tag_value__Z2mmiPdS_S_.29: #
# LOE
..B1.34: # Preds ..B1.10 ..B1.14 ..B1.12 # Infreq
xorl %r15d, %r15d #10.5
jmp ..B1.26 # Prob 100% #10.5
# LOE rax rdx rbp r10 r12 r14 r15 ebx edi r11d
..B1.35: # Preds ..B1.15 # Infreq
movsd (%rbp,%r12,8), %xmm0 #14.48
jmp ..B1.19 # Prob 100% #14.48
# LOE rax rdx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.38: # Preds ..B1.3 # Infreq
cmpq $4, %r14 #10.5
jl ..B1.47 # Prob 10% #10.5
# LOE rbx rbp r13 r14 r12d r15d
..B1.39: # Preds ..B1.38 # Infreq
xorl %esi, %esi #10.5
movq %rbx, %rdx #10.5
movq %r13, %rcx #
xorl %eax, %eax #
pxor %xmm0, %xmm0 #
# LOE rax rdx rcx rbx rbp rsi r13 r14 r12d r15d xmm0
..B1.40: # Preds ..B1.40 ..B1.39 # Infreq
addq $4, %rsi #10.5
movq %rax, (%rcx) #12.13
movhpd %xmm0, 8(%rcx) #12.13
movq %rax, 16(%rcx) #12.13
movhpd %xmm0, 24(%rcx) #12.13
addq $32, %rcx #10.5
cmpq %rbx, %rsi #10.5
jb ..B1.40 # Prob 82% #10.5
# LOE rax rdx rcx rbx rbp rsi r13 r14 r12d r15d xmm0
..B1.42: # Preds ..B1.40 ..B1.47 # Infreq
cmpq %r14, %rdx #10.5
jae ..B1.48 # Prob 0% #10.5
# LOE rdx rbx rbp r13 r14 r12d r15d
..B1.43: # Preds ..B1.42 # Infreq
xorl %ecx, %ecx #
# LOE rdx rcx rbx rbp r13 r14 r12d r15d
..B1.44: # Preds ..B1.44 ..B1.43 # Infreq
movq %rcx, (%r13,%rdx,8) #12.13
incq %rdx #10.5
cmpq %r14, %rdx #10.5
jb ..B1.44 # Prob 82% #10.5
# LOE rdx rcx rbx rbp r13 r14 r12d r15d
..B1.45: # Preds ..B1.44 # Infreq
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
jmp ..B1.6 # Prob 100% #11.9
# LOE rbx rbp r13 r14 r12d r15d
..B1.47: # Preds ..B1.38 # Infreq
xorl %edx, %edx #10.5
jmp ..B1.42 # Prob 100% #10.5
# LOE rdx rbx rbp r13 r14 r12d r15d
..B1.48: # Preds ..B1.42 # Infreq
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
jmp ..B1.6 # Prob 100% #11.9
.align 16,0x90
..___tag_value__Z2mmiPdS_S_.36: #
# LOE rbx rbp r13 r14 r12d r15d
# mark_end;
.type _Z2mmiPdS_S_,#function
.size _Z2mmiPdS_S_,.-_Z2mmiPdS_S_
.data
# -- End _Z2mmiPdS_S_
Edit: With the help of Olaf Dietsche, it looks like the code below can run much faster with GCC 4.8.2, though still a bit (~30%) slower than Intel. The main difference is that the initialization is done ahead of time (this by itself makes no difference) and the loop ordering has been interchanged (this makes a major difference for GCC).
memset(c, 0, n * n * sizeof(double)); // zero all n*n doubles (the byte count needs the element size)
for (j = 0; j < n; j++) {
    for (k = 0; k < n; k++) {
        for (i = 0; i < n; i++) {
            c[i + n * j] += a[i + n * k] * b[k + n * j]; // Line 14
        }
    }
}
Your code seems to be wrong or not suitable for vectorization.
When I modify your code according to this blog post Performance – GCC & auto-vectorization
int i, j, k;
for (i = 0; i < n; i++) {
    for (j = 0; j < n; j++) {
        for (k = 0; k < n; k++) {
            c[n * i + k] += a[n * i + j] * b[n * j + k]; // Line 14
        }
    }
}
and compile it with
gcc-4.8 -O3 -S a.c
it uses at least some SIMD instructions
.L8:
movsd (%rcx), %xmm1
addl $1, %r8d
movsd (%rdx,%rsi), %xmm2
unpcklpd %xmm1, %xmm1
movhpd 8(%rdx,%rsi), %xmm2
movsd (%rax,%rsi), %xmm0
mulpd %xmm2, %xmm1
movhpd 8(%rax,%rsi), %xmm0
addpd %xmm1, %xmm0
movlpd %xmm0, (%rax,%rsi)
movhpd %xmm0, 8(%rax,%rsi)
addq $16, %rsi
cmpl %r8d, %ebx
ja .L8
cmpl %edi, %r15d
je .L9
although not as much as ICC does.
Update:
Adding -funroll-loops enlarges the generated assembly code substantially to about the length of your posted ICC assembly.
Looks like the Intel compiler is using SIMD instructions (mulpd, addpd, movaps, etc.) -- it's able to perform more than one operation (e.g. both a = b*c and d = e*f) per instruction, whereas the GCC code takes two to do the same. I'm not sure if it's possible to enable these operations automatically in GCC, but you can hand-write them with some work.
It seems like the flags -msse, -msse2, -msse3 to GCC cause it to attempt to do some SIMD optimization of its own.
I'm not sure if ICC really produced faster code in this case because I didn't run any actual benchmarks. But you can tell GCC to unroll the loops with -funroll-loops. The output will be longer, will contain lots of xmm's and will look faster.
Neither icc nor gcc necessarily optimizes the degree of unrolling. To match them, you would use e.g.
gcc -funroll-loops --param max-unroll-times=4
icc -unroll4
as gcc tends to unroll more than optimum for CPUs of the last 8 years (if allowed to do so) while icc is more conservative.
Also glossed over above is that icc -O3 encourages the compiler to optimize loop nesting, and may even engage the special -opt-matmul facility.
The original form implies a dot-product reduction in the inner loop, for which gcc might require both -ffast-math and a more modern choice of -march= in order to optimize. icc is much more aggressive about riffling a dot product (splitting it into multiple partial sums) if it can't avoid the reduction by switching the loop nest.
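Riffling the reduction means splitting the inner dot product into several independent partial sums so the serial addsd dependency chain is broken. A minimal sketch of the idea (my own illustration, not either compiler's output):
double dot_riffled(const double* a, const double* b, int n) {
    double s0 = 0, s1 = 0, s2 = 0, s3 = 0;   // four independent accumulators
    int k = 0;
    for (; k + 3 < n; k += 4) {
        s0 += a[k]     * b[k];
        s1 += a[k + 1] * b[k + 1];
        s2 += a[k + 2] * b[k + 2];
        s3 += a[k + 3] * b[k + 3];
    }
    for (; k < n; ++k) s0 += a[k] * b[k];    // scalar tail
    return (s0 + s1) + (s2 + s3);            // reassociated sum: this is the kind of
                                             // reordering -ffast-math lets the compiler do
}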

Performance of C++11 modern-style loops vs old-style loops

This is the first question I'm posting here, so I hope I won't do anything wrong.
My question concerns the performance of modern-style C++11 loops (std::for_each, range-based for) vs old-style C++ loops (for (...; ...; ...)). From what I understood, it seems to me that the motto of modern C++ is "expressivity with no compromise on performance". Modern C++ style leads to safe, clean, and fast code with little to no performance penalty and, possibly, with a performance gain over old-style C++.
Now I've made a little test to assess how big this gain is concerning loops. First I wrote the following three functions:
#include <algorithm>
#include <vector>

using namespace std;

void foo(vector<double>& v)
{
    for (size_t i = 0; i < v.size(); i++)
    {
        v[i] /= 42;
    }
}

void bar(vector<double>& v)
{
    for (auto& x : v)
    {
        x /= 42;
    }
}

void wee(vector<double>& v)
{
    for_each(begin(v), end(v), [] (double& x)
    {
        x /= 42;
    });
}
Then I compared their performance by calling them this way (properly commenting/uncommenting the three lines inside main()'s loop):
#include <ctime>
#include <iostream>

vector<double> make_vector()
{
    vector<double> v;
    for (int i = 0; i < 30000; i++) { v.push_back(i); }
    return v;
}

int main()
{
    clock_t start = clock(); // clock() returns clock_t
    auto v = make_vector();
    for (int i = 0; i <= 50000; i++)
    {
        // UNCOMMENT THE FUNCTION CALL TO BE TESTED, COMMENT THE OTHERS
        foo(v);
        // bar(v);
        // wee(v);
    }
    clock_t end = clock();
    cout << (end - start) << endl;
    return 0;
}
Averaging over 10 executions of each version of the program obtained by commenting/uncommenting the lines in main()'s loop, and using the old-style loop as a baseline, the range-based for loop performs ~1.9x worse, and the loop based on std::for_each and lambdas performs ~2.3x worse.
I used Clang 3.2 to compile this, and I haven't tried MS VC11 (I'm working on WinXP).
Considering my expectation of getting comparable execution times, my questions are:
Did I do something obviously wrong?
If not, couldn't a 2x performance penalty be a good reason NOT to embrace modern-style loops?
I would like to remark, that I do believe that the clarity and safety of code written in modern C++ style pay off for a possible performance loss, but I quite disagree with the statement that there is no trade-off between clarity/safety on one side and performance on the other side.
Am I missing something?
It looks like the difference only shows up when you do not enable optimisations in your compiler.
With Clang you can enable optimisation with the -O[0-3] flag.
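For example (the file name is only a placeholder):
clang++ -std=c++11 -O3 main.cpp -o bench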
Mankarse is right - most likely you have not enabled optimizations.
Actually, with Clang they produce practically the same ASM in the main loop, with only small differences in the pre/post code.
I have tested four versions: hand_loop_index, hand_loop_iterator, range_based_for, and for_each_algorithm.
hand_loop_iterator, range_based_for and for_each_algorithm all produce exactly the same ASM for the full function body; the only difference is in the names of the labels.
I.e. a hand-written for loop with iterators results in exactly the same ASM code as the range-based for and std::for_each.
There are some differences between the index-based and iterator-based versions.
The main loop is almost the same in both cases. The only minor difference is that the iterator version(s) use the rdx register instead of rsi.
Index version:
.LBB0_7: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rsi), %xmm1
movupd -32(%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rsi)
movupd %xmm2, -32(%rsi)
movupd -16(%rsi), %xmm1
movupd (%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rsi)
movupd %xmm2, (%rsi)
addq $64, %rsi
addq $-8, %rdi
jne .LBB0_7
Iterator version(s):
.LBB1_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB1_6
The pre/post code for the index vs iterator versions has many differences, but it should not greatly affect the total timing for large enough arrays.
LIVE DEMO on Coliru with ASM output
#include <algorithm>
#include <iterator>
#include <vector>

using namespace std;

void hand_loop_index(vector<double> &v)
{
    for (size_t i = 0; i < v.size(); ++i)
    {
        v[i] /= 42;
    }
}

void hand_loop_iterator(vector<double> &v)
{
    for (auto first = begin(v), last = end(v); first != last; ++first)
    {
        *first /= 42;
    }
}

void range_based_for(vector<double> &v)
{
    for (auto &x : v)
    {
        x /= 42;
    }
}

void for_each_algorithm(vector<double> &v)
{
    for_each(begin(v), end(v), [] (double &x)
    {
        x /= 42;
    });
}
Result ASM:
# clang++ -std=c++1z -O3 -Wall -pedantic -pthread main.cpp -S
.text
.file "main.cpp"
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI0_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI0_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z15hand_loop_indexRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z15hand_loop_indexRSt6vectorIdSaIdEE,#function
_Z15hand_loop_indexRSt6vectorIdSaIdEE: # #_Z15hand_loop_indexRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rax
movq 8(%rdi), %rcx
subq %rax, %rcx
je .LBB0_11
# BB#1: # %.lr.ph
sarq $3, %rcx
cmpq $1, %rcx
movl $1, %edx
cmovaq %rcx, %rdx
xorl %edi, %edi
testq %rdx, %rdx
je .LBB0_10
# BB#2: # %overflow.checked
xorl %edi, %edi
movq %rdx, %r8
andq $-4, %r8
je .LBB0_9
# BB#3: # %vector.body.preheader
cmpq $1, %rcx
movl $1, %edi
cmovaq %rcx, %rdi
addq $-4, %rdi
movq %rdi, %rsi
shrq $2, %rsi
xorl %r9d, %r9d
btq $2, %rdi
jb .LBB0_5
# BB#4: # %vector.body.prol
movupd (%rax), %xmm0
movupd 16(%rax), %xmm1
movapd .LCPI0_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rax)
movupd %xmm1, 16(%rax)
movl $4, %r9d
.LBB0_5: # %vector.body.preheader.split
testq %rsi, %rsi
je .LBB0_8
# BB#6: # %vector.body.preheader.split.split
cmpq $1, %rcx
movl $1, %edi
cmovaq %rcx, %rdi
andq $-4, %rdi
subq %r9, %rdi
leaq 48(%rax,%r9,8), %rsi
movapd .LCPI0_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB0_7: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rsi), %xmm1
movupd -32(%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rsi)
movupd %xmm2, -32(%rsi)
movupd -16(%rsi), %xmm1
movupd (%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rsi)
movupd %xmm2, (%rsi)
addq $64, %rsi
addq $-8, %rdi
jne .LBB0_7
.LBB0_8:
movq %r8, %rdi
.LBB0_9: # %middle.block
cmpq %rdi, %rdx
je .LBB0_11
.align 16, 0x90
.LBB0_10: # %scalar.ph
# =>This Inner Loop Header: Depth=1
movsd (%rax,%rdi,8), %xmm0 # xmm0 = mem[0],zero
divsd .LCPI0_1(%rip), %xmm0
movsd %xmm0, (%rax,%rdi,8)
incq %rdi
cmpq %rcx, %rdi
jb .LBB0_10
.LBB0_11: # %._crit_edge
retq
.Lfunc_end0:
.size _Z15hand_loop_indexRSt6vectorIdSaIdEE, .Lfunc_end0-_Z15hand_loop_indexRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI1_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI1_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z18hand_loop_iteratorRSt6vectorIdSaIdEE,#function
_Z18hand_loop_iteratorRSt6vectorIdSaIdEE: # #_Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB1_11
# BB#1: # %.lr.ph.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB1_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB1_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI1_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB1_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB1_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI1_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB1_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB1_6
.LBB1_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB1_8: # %middle.block
cmpq %rdi, %rcx
je .LBB1_11
# BB#9:
movsd .LCPI1_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB1_10: # %.lr.ph
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB1_10
.LBB1_11: # %._crit_edge
retq
.Lfunc_end1:
.size _Z18hand_loop_iteratorRSt6vectorIdSaIdEE, .Lfunc_end1-_Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI2_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI2_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z15range_based_forRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z15range_based_forRSt6vectorIdSaIdEE,#function
_Z15range_based_forRSt6vectorIdSaIdEE: # #_Z15range_based_forRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB2_11
# BB#1: # %.lr.ph.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB2_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB2_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI2_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB2_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB2_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI2_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB2_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB2_6
.LBB2_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB2_8: # %middle.block
cmpq %rdi, %rcx
je .LBB2_11
# BB#9:
movsd .LCPI2_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB2_10: # %.lr.ph
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB2_10
.LBB2_11: # %._crit_edge
retq
.Lfunc_end2:
.size _Z15range_based_forRSt6vectorIdSaIdEE, .Lfunc_end2-_Z15range_based_forRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI3_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI3_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z18for_each_algorithmRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z18for_each_algorithmRSt6vectorIdSaIdEE,#function
_Z18for_each_algorithmRSt6vectorIdSaIdEE: # #_Z18for_each_algorithmRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB3_11
# BB#1: # %.lr.ph.i.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB3_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB3_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI3_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB3_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB3_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI3_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB3_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB3_6
.LBB3_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB3_8: # %middle.block
cmpq %rdi, %rcx
je .LBB3_11
# BB#9:
movsd .LCPI3_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB3_10: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB3_10
.LBB3_11: # %_ZSt8for_eachIN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEEZ18for_each_algorithmR5_E3$_0ET0_T_SA_S9_.exit
retq
.Lfunc_end3:
.size _Z18for_each_algorithmRSt6vectorIdSaIdEE, .Lfunc_end3-_Z18for_each_algorithmRSt6vectorIdSaIdEE
.cfi_endproc
.ident "clang version 3.7.0 (tags/RELEASE_370/final 246979)"
.section ".note.GNU-stack","",#progbits

Can modern compilers unroll `for` loops expressed using begin and end iterators

Consider the following code:
vector<double> v;
// fill v
const vector<double>::iterator end = v.end();
for (vector<double>::iterator i = v.begin(); i != end; ++i) {
    // do stuff
}
Are compilers like g++, clang++, and icc able to unroll loops like this? Unfortunately I do not know assembly well enough to verify from the output whether the loop gets unrolled or not (and I only have access to g++).
To me it seems that this will require more smartness than usual on behalf of the compiler, first to deduce that the iterator is a random access iterator, and then figure out the number of times the loop is executed. Can compilers do this when optimization is enabled ?
Thanks for your replies, and before some of you start lecturing about premature optimization, this is an exercise in curiosity.
To me it seems that this will require more smartness than usual on behalf of the compiler, first to deduce that the iterator is a random access iterator, and then figure out the number of times the loop is executed.
The STL, being composed entirely of templates, has all of its code inline. So random access iterators reduce to pointers by the time the compiler begins to apply optimizations. One of the reasons the STL was created was to reduce the need for a programmer to outwit the compiler. You should rely on the STL to do the right thing until proven otherwise.
Of course, it is still up to you to choose the proper tool from the STL to use...
Edit: There was discussion about whether g++ does any loop unrolling. On the versions that I am using, loop unrolling is not part of -O, -O2, or -O3, and I get identical assembly for the latter two levels with the following code:
void foo (std::vector<int> &v) {
    volatile int c = 0;
    const std::vector<int>::const_iterator end = v.end();
    for (std::vector<int>::iterator i = v.begin(); i != end; ++i) {
        *i = c++;
    }
}
With the corresponding -O2 assembly:
_Z3fooRSt6vectorIiSaIiEE:
.LFB435:
movq 8(%rdi), %rcx
movq (%rdi), %rax
movl $0, -4(%rsp)
cmpq %rax, %rcx
je .L4
.p2align 4,,10
.p2align 3
.L3:
movl -4(%rsp), %edx
movl %edx, (%rax)
addq $4, %rax
addl $1, %edx
cmpq %rax, %rcx
movl %edx, -4(%rsp)
jne .L3
.L4:
rep
ret
With the -funroll-loops option added, the function expands into something much much larger. But, the documentation warns about this option:
Unroll loops whose number of iterations can be determined at compile time or upon entry to the loop. -funroll-loops implies -frerun-cse-after-loop. It also turns on complete loop peeling (i.e. complete removal of loops with small constant number of iterations). This option makes code larger, and may or may not make it run faster.
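For reference, the complete loop peeling mentioned there applies to loops whose trip count is a small compile-time constant, for example (my own minimal example, not from the GCC documentation):
// A loop with a small constant trip count; with complete peeling enabled
// the compiler may replace it with four straight-line stores and no branch.
void fill_four(int *p) {
    for (int i = 0; i < 4; ++i)
        p[i] = i;
}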
As a further argument to dissuade you from unrolling loops yourself, I'll finish this answer with an illustration of applying Duff's Device to the foo function above (note that, because the do-while body always runs at least once, this version misbehaves if the vector is empty):
void foo_duff (std::vector<int> &v) {
    volatile int c = 0;
    const std::vector<int>::const_iterator end = v.end();
    std::vector<int>::iterator i = v.begin();
    switch ((end - i) % 4) do {
    case 0: *i++ = c++;
    case 3: *i++ = c++;
    case 2: *i++ = c++;
    case 1: *i++ = c++;
    } while (i != end);
}
GCC has another loop optimization flag:
-ftree-loop-optimize
Perform loop optimizations on trees. This flag is enabled by default at -O and higher.
So, the -O option enables simple loop optimizations for the innermost loops, including complete loop unrolling (peeling) for loops with a fixed number of iterations. (Thanks to doc for pointing this out to me.)
I would propose that, whether or not the compiler CAN unroll the loop, on modern pipelined, cached architectures there is little benefit in doing so unless your "do stuff" is trivial, and in many cases unrolling would be a performance HIT instead of a boon.
If your "do stuff" is nontrivial, unrolling the loop creates multiple copies of that nontrivial code, which take extra time to load into the cache and significantly slow down the first pass through the unrolled loop. At the same time it evicts more code from the cache, code which may have been needed to perform the "do stuff" if it makes any function calls and which would then need to be reloaded into the cache again.
Unrolling loops made a lot of sense on older cacheless, non-pipelined, non-branch-predicting architectures, where the goal was to reduce the overhead of the loop logic. Nowadays, with cache-based, pipelined, branch-predicting hardware, your CPU will be pipelined well into the next loop iteration, speculatively executing the loop body again, by the time you detect the i == end exit condition, at which point the processor throws out that final speculatively executed set of results. In such an architecture, loop unrolling makes very little sense; it would further bloat the code for virtually no benefit.
The short answer is yes. It will unroll as much as it can. In your case, it obviously depends on how you define end (I assume your example is generic). Not only will most modern compilers unroll, they will also vectorize and apply other optimizations that will often blow your own hand-rolled solutions out of the water.
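As a concrete illustration (my own example, not the questioner's code), a loop like the following is typically both vectorized and partially unrolled by g++, clang++, and icc at -O2/-O3, although the exact output depends on the compiler version and target:
#include <vector>

// Same shape as the loop in the question: iterator with a hoisted end.
void scale(std::vector<double> &v) {
    const std::vector<double>::iterator end = v.end();
    for (std::vector<double>::iterator i = v.begin(); i != end; ++i)
        *i *= 2.0;
}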
So what I'm saying is don't prematurely optimize! Just kidding :)
Simple answer: generally NO! At least when it comes to complete loop unrolling.
Let's test loop unrolling on this simple, dirty-coded (for testing purposes) structure.
struct Test
{
    Test(): begin(arr), end(arr + 4) {}
    double * begin;
    double * end;
    double arr[4];
};
First, let's take a counted loop and compile it without any optimizations.
double counted(double param, Test & d)
{
    for (int i = 0; i < 4; i++)
        param += d.arr[i];
    return param;
}
Here's what gcc 4.9 produces.
counted(double, Test&):
pushq %rbp
movq %rsp, %rbp
movsd %xmm0, -24(%rbp)
movq %rdi, -32(%rbp)
movl $0, -4(%rbp)
jmp .L2
.L3:
movq -32(%rbp), %rax
movl -4(%rbp), %edx
movslq %edx, %rdx
addq $2, %rdx
movsd (%rax,%rdx,8), %xmm0
movsd -24(%rbp), %xmm1
addsd %xmm0, %xmm1
movq %xmm1, %rax
movq %rax, -24(%rbp)
addl $1, -4(%rbp)
.L2:
cmpl $3, -4(%rbp)
jle .L3
movq -24(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
popq %rbp
ret
As expected, the loop hasn't been unrolled and, since no optimizations were performed, the code is generally very verbose. Now let's turn on the -O3 flag. The produced disassembly:
counted(double, Test&):
addsd 16(%rdi), %xmm0
addsd 24(%rdi), %xmm0
addsd 32(%rdi), %xmm0
addsd 40(%rdi), %xmm0
ret
Voilà, the loop has been unrolled this time.
Now let's take a look at an iterated loop. The function containing the loop looks like this:
double iterated(double param, Test & d)
{
    for (double * it = d.begin; it != d.end; ++it)
        param += *it;
    return param;
}
Still using the -O3 flag, let's take a look at the disassembly:
iterated(double, Test&):
movq (%rdi), %rax
movq 8(%rdi), %rdx
cmpq %rdx, %rax
je .L3
.L4:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rdx, %rax
jne .L4
.L3:
rep ret
The code looks better than in the very first case because optimizations were performed, but the loop hasn't been unrolled this time!
What about the -funroll-loops and -funroll-all-loops flags? They produce a result similar to this:
iterated(double, Test&):
movq (%rdi), %rsi
movq 8(%rdi), %rcx
cmpq %rcx, %rsi
je .L3
movq %rcx, %rdx
leaq 8(%rsi), %rax
addsd (%rsi), %xmm0
subq %rsi, %rdx
subq $8, %rdx
shrq $3, %rdx
andl $7, %edx
cmpq %rcx, %rax
je .L43
testq %rdx, %rdx
je .L4
cmpq $1, %rdx
je .L29
cmpq $2, %rdx
je .L30
cmpq $3, %rdx
je .L31
cmpq $4, %rdx
je .L32
cmpq $5, %rdx
je .L33
cmpq $6, %rdx
je .L34
addsd (%rax), %xmm0
leaq 16(%rsi), %rax
.L34:
addsd (%rax), %xmm0
addq $8, %rax
.L33:
addsd (%rax), %xmm0
addq $8, %rax
.L32:
addsd (%rax), %xmm0
addq $8, %rax
.L31:
addsd (%rax), %xmm0
addq $8, %rax
.L30:
addsd (%rax), %xmm0
addq $8, %rax
.L29:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rcx, %rax
je .L44
.L4:
addsd (%rax), %xmm0
addq $64, %rax
addsd -56(%rax), %xmm0
addsd -48(%rax), %xmm0
addsd -40(%rax), %xmm0
addsd -32(%rax), %xmm0
addsd -24(%rax), %xmm0
addsd -16(%rax), %xmm0
addsd -8(%rax), %xmm0
cmpq %rcx, %rax
jne .L4
.L3:
rep ret
.L44:
rep ret
.L43:
rep ret
Compare this with the unrolled result for the counted loop: it's clearly not the same. What we see here is that gcc divided the loop into 8-element chunks. This can increase performance in some cases, because the loop exit condition is now checked once per 8 original iterations. With additional flags, vectorization could also be performed. But it isn't complete loop unrolling.
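Roughly speaking, and only as a sketch of the shape of the transformation (not the compiler's exact code), the generated code corresponds to something like this in C++, reusing the Test structure defined above:
// Sketch of the 8-element chunking: a prologue consumes (n % 8) elements
// one at a time, then the main loop handles 8 doubles per iteration, so
// the exit test runs once per 8 elements.
double iterated_unrolled(double param, Test & d)
{
    double * it = d.begin;
    long n = d.end - d.begin;
    long prologue = n % 8;
    for (long i = 0; i < prologue; ++i)
        param += *it++;
    for (long i = prologue; i < n; i += 8) {
        param += it[0]; param += it[1]; param += it[2]; param += it[3];
        param += it[4]; param += it[5]; param += it[6]; param += it[7];
        it += 8;
    }
    return param;
}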
The iterated loop will be unrolled, however, if the Test object is not a function argument.
double iteratedLocal(double param)
{
    Test d;
    for (double * it = d.begin; it != d.end; ++it)
        param += *it;
    return param;
}
Disassembly produced with only -O3 flag:
iteratedLocal(double):
addsd -40(%rsp), %xmm0
addsd -32(%rsp), %xmm0
addsd -24(%rsp), %xmm0
addsd -16(%rsp), %xmm0
ret
As you can see, the loop has been unrolled. This is because the compiler can now safely assume that end has a fixed value, which it couldn't predict for a function argument.
The Test structure is statically allocated, however. Things are more complicated with dynamically allocated structures like std::vector. From my observations on a Test structure modified so that it resembles a dynamically allocated container, it looks like gcc tries its best to unroll loops, but in most cases the generated code is not as simple as the one above.
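For reference, a variant along those lines (my own sketch, not necessarily the exact structure used for those observations) could look like this:
// A "dynamically allocated" flavour of Test: the four doubles now live on
// the heap, so begin/end no longer point into the object itself.
// (Copy control omitted for brevity.)
struct TestDynamic
{
    TestDynamic() : begin(new double[4]()), end(begin + 4) {}
    ~TestDynamic() { delete[] begin; }
    double * begin;
    double * end;
};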
Since you ask about other compilers, here's the output from clang 3.4.1 (-O3 flag):
counted(double, Test&): # #counted(double, Test&)
addsd 16(%rdi), %xmm0
addsd 24(%rdi), %xmm0
addsd 32(%rdi), %xmm0
addsd 40(%rdi), %xmm0
ret
iterated(double, Test&): # #iterated(double, Test&)
movq (%rdi), %rax
movq 8(%rdi), %rcx
cmpq %rcx, %rax
je .LBB1_2
.LBB1_1: # %.lr.ph
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rcx
jne .LBB1_1
.LBB1_2: # %._crit_edge
ret
iteratedLocal(double): # #iteratedLocal(double)
leaq -32(%rsp), %rax
movq %rax, -48(%rsp)
leaq (%rsp), %rax
movq %rax, -40(%rsp)
xorl %eax, %eax
jmp .LBB2_1
.LBB2_2: # %._crit_edge4
movsd -24(%rsp,%rax), %xmm1
addq $8, %rax
.LBB2_1: # =>This Inner Loop Header: Depth=1
movaps %xmm0, %xmm2
cmpq $24, %rax
movaps %xmm1, %xmm0
addsd %xmm2, %xmm0
jne .LBB2_2
ret
Intel's icc 13.01 (-O3 flag)
counted(double, Test&):
addsd 16(%rdi), %xmm0 #24.5
addsd 24(%rdi), %xmm0 #24.5
addsd 32(%rdi), %xmm0 #24.5
addsd 40(%rdi), %xmm0 #24.5
ret #25.10
iterated(double, Test&):
movq (%rdi), %rdx #30.26
movq 8(%rdi), %rcx #30.41
cmpq %rcx, %rdx #30.41
je ..B3.25 # Prob 50% #30.41
subq %rdx, %rcx #30.7
movb $0, %r8b #30.7
lea 7(%rcx), %rax #30.7
sarq $2, %rax #30.7
shrq $61, %rax #30.7
lea 7(%rax,%rcx), %rcx #30.7
sarq $3, %rcx #30.7
cmpq $16, %rcx #30.7
jl ..B3.26 # Prob 10% #30.7
movq %rdx, %rdi #30.7
andq $15, %rdi #30.7
je ..B3.6 # Prob 50% #30.7
testq $7, %rdi #30.7
jne ..B3.26 # Prob 10% #30.7
movl $1, %edi #30.7
..B3.6: # Preds ..B3.5 ..B3.3
lea 16(%rdi), %rax #30.7
cmpq %rax, %rcx #30.7
jl ..B3.26 # Prob 10% #30.7
movq %rcx, %rax #30.7
xorl %esi, %esi #30.7
subq %rdi, %rax #30.7
andq $15, %rax #30.7
negq %rax #30.7
addq %rcx, %rax #30.7
testq %rdi, %rdi #30.7
jbe ..B3.11 # Prob 2% #30.7
..B3.9: # Preds ..B3.7 ..B3.9
addsd (%rdx,%rsi,8), %xmm0 #31.9
incq %rsi #30.7
cmpq %rdi, %rsi #30.7
jb ..B3.9 # Prob 82% #30.7
..B3.11: # Preds ..B3.9 ..B3.7
pxor %xmm6, %xmm6 #28.12
movaps %xmm6, %xmm7 #28.12
movaps %xmm6, %xmm5 #28.12
movsd %xmm0, %xmm7 #28.12
movaps %xmm6, %xmm4 #28.12
movaps %xmm6, %xmm3 #28.12
movaps %xmm6, %xmm2 #28.12
movaps %xmm6, %xmm1 #28.12
movaps %xmm6, %xmm0 #28.12
..B3.12: # Preds ..B3.12 ..B3.11
addpd (%rdx,%rdi,8), %xmm7 #31.9
addpd 16(%rdx,%rdi,8), %xmm6 #31.9
addpd 32(%rdx,%rdi,8), %xmm5 #31.9
addpd 48(%rdx,%rdi,8), %xmm4 #31.9
addpd 64(%rdx,%rdi,8), %xmm3 #31.9
addpd 80(%rdx,%rdi,8), %xmm2 #31.9
addpd 96(%rdx,%rdi,8), %xmm1 #31.9
addpd 112(%rdx,%rdi,8), %xmm0 #31.9
addq $16, %rdi #30.7
cmpq %rax, %rdi #30.7
jb ..B3.12 # Prob 82% #30.7
addpd %xmm6, %xmm7 #28.12
addpd %xmm4, %xmm5 #28.12
addpd %xmm2, %xmm3 #28.12
addpd %xmm0, %xmm1 #28.12
addpd %xmm5, %xmm7 #28.12
addpd %xmm1, %xmm3 #28.12
addpd %xmm3, %xmm7 #28.12
movaps %xmm7, %xmm0 #28.12
unpckhpd %xmm7, %xmm0 #28.12
addsd %xmm0, %xmm7 #28.12
movaps %xmm7, %xmm0 #28.12
..B3.14: # Preds ..B3.13 ..B3.26
lea 1(%rax), %rsi #30.7
cmpq %rsi, %rcx #30.7
jb ..B3.25 # Prob 50% #30.7
subq %rax, %rcx #30.7
cmpb $1, %r8b #30.7
jne ..B3.17 # Prob 50% #30.7
..B3.16: # Preds ..B3.17 ..B3.15
xorl %r8d, %r8d #30.7
jmp ..B3.21 # Prob 100% #30.7
..B3.17: # Preds ..B3.15
cmpq $2, %rcx #30.7
jl ..B3.16 # Prob 10% #30.7
movq %rcx, %r8 #30.7
xorl %edi, %edi #30.7
pxor %xmm1, %xmm1 #28.12
lea (%rdx,%rax,8), %rsi #31.19
andq $-2, %r8 #30.7
movsd %xmm0, %xmm1 #28.12
..B3.19: # Preds ..B3.19 ..B3.18
addpd (%rsi,%rdi,8), %xmm1 #31.9
addq $2, %rdi #30.7
cmpq %r8, %rdi #30.7
jb ..B3.19 # Prob 82% #30.7
movaps %xmm1, %xmm0 #28.12
unpckhpd %xmm1, %xmm0 #28.12
addsd %xmm0, %xmm1 #28.12
movaps %xmm1, %xmm0 #28.12
..B3.21: # Preds ..B3.20 ..B3.16
cmpq %rcx, %r8 #30.7
jae ..B3.25 # Prob 2% #30.7
lea (%rdx,%rax,8), %rax #31.19
..B3.23: # Preds ..B3.23 ..B3.22
addsd (%rax,%r8,8), %xmm0 #31.9
incq %r8 #30.7
cmpq %rcx, %r8 #30.7
jb ..B3.23 # Prob 82% #30.7
..B3.25: # Preds ..B3.23 ..B3.21 ..B3.14 ..B3.1
ret #32.14
..B3.26: # Preds ..B3.2 ..B3.6 ..B3.4 # Infreq
movb $1, %r8b #30.7
xorl %eax, %eax #30.7
jmp ..B3.14 # Prob 100% #30.7
iteratedLocal(double):
lea -8(%rsp), %rax #8.13
lea -40(%rsp), %rdx #7.11
cmpq %rax, %rdx #33.41
je ..B4.15 # Prob 50% #33.41
movq %rax, -48(%rsp) #32.12
movq %rdx, -56(%rsp) #32.12
xorl %eax, %eax #33.7
..B4.13: # Preds ..B4.11 ..B4.13
addsd -40(%rsp,%rax,8), %xmm0 #34.9
incq %rax #33.7
cmpq $4, %rax #33.7
jb ..B4.13 # Prob 82% #33.7
..B4.15: # Preds ..B4.13 ..B4.1
ret #35.14
To avoid misunderstandings: if the counted loop's condition relied on an external, runtime value, like this one does,
double countedDep(double param, Test & d)
{
    for (int i = 0; i < d.size; i++)
        param += d.arr[i];
    return param;
}
then such a loop will also not be unrolled.
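Note that this last example assumes Test carries the element count; the structure shown earlier would need an extra member for countedDep to compile, e.g. (my addition, not part of the original structure):
struct Test
{
    Test(): begin(arr), end(arr + 4), size(4) {}
    double * begin;
    double * end;
    int size;      // added so that countedDep's d.size compiles
    double arr[4];
};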