Optimization of naive matrix multiplication (ICC vs GCC)

Optimization of naive matrix multiplication (ICC vs GCC) - c++

The code below uses a very straightforward approach to calculate the matrix product a * b and store the result in c. The code was compiled with -O3 on both GCC 4.4.6 (with -mtune=native) and Intel Compiler 13.0.1 and the speed on GCC is significantly worse (over a factor of two for the sample data used).
I'm curious about the cause of these differences, but unfortunately I'm not familiar enough with the assembly output to understand what's going on here. From a glance it looks as though ICC is doing a better job at vectorizing the computation, but I can't decipher much more than that. (This is mainly for learning purposes since there's no way I would use this in production!)
void __attribute__ ((noinline)) mm( // Line 3
int n,
double*__restrict__ c,
double*__restrict__ a,
double*__restrict__ b
) {
int i, j, k;
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
c[i + n * j] = 0; // Line 12
for (k = 0; k < n; k++) {
c[i + n * j] += a[i + n * k] * b[k + n * j]; // Line 14
}
}
}
}
Here is the output from GCC:
_Z2mmiPdS_S_:
.LFB0:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
pushq %r14 #
.cfi_def_cfa_offset 16
.cfi_offset 14, -16
testl %edi, %edi # n
movq %rcx, %r14 # b, b
pushq %r13 #
.cfi_def_cfa_offset 24
.cfi_offset 13, -24
pushq %r12 #
.cfi_def_cfa_offset 32
.cfi_offset 12, -32
pushq %rbp #
.cfi_def_cfa_offset 40
.cfi_offset 6, -40
pushq %rbx #
.cfi_def_cfa_offset 48
.cfi_offset 3, -48
jle .L6 #,
leal -1(%rdi), %eax #, tmp96
movslq %edi, %r11 # n, n
movq %rdx, %rbx # a, ivtmp.54
xorl %r12d, %r12d # ivtmp.67
salq $3, %r11 #, D.2193
xorl %ebp, %ebp # prephitmp.37
leaq 8(,%rax,8), %r13 #, D.2208
.L3:
leaq (%rsi,%r12), %r10 #, ivtmp.61
movq %r14, %rcx # b, ivtmp.63
xorl %edx, %edx # j
.p2align 4,,10
.p2align 3
.L5:
movq $0, (%r10) #,* ivtmp.61
movq %rbp, -8(%rsp) # prephitmp.37,
movq %rcx, %r9 # ivtmp.63, ivtmp.70
movsd -8(%rsp), %xmm1 #, prephitmp.37
movq %rbx, %r8 # ivtmp.54, ivtmp.69
xorl %eax, %eax # k
.p2align 4,,10
.p2align 3
.L4:
movsd (%r8), %xmm0 #* ivtmp.69, tmp99
addl $1, %eax #, k
addq %r11, %r8 # D.2193, ivtmp.69
mulsd (%r9), %xmm0 #* ivtmp.70, tmp99
addq $8, %r9 #, ivtmp.70
cmpl %edi, %eax # n, k
addsd %xmm0, %xmm1 # tmp99, prephitmp.37
movsd %xmm1, (%r10) # prephitmp.37,* ivtmp.61
jne .L4 #,
addl $1, %edx #, j
addq %r11, %r10 # D.2193, ivtmp.61
addq %r11, %rcx # D.2193, ivtmp.63
cmpl %edi, %edx # n, j
jne .L5 #,
addq $8, %r12 #, ivtmp.67
addq $8, %rbx #, ivtmp.54
cmpq %r13, %r12 # D.2208, ivtmp.67
jne .L3 #,
.L6:
popq %rbx #
.cfi_def_cfa_offset 40
popq %rbp #
.cfi_def_cfa_offset 32
popq %r12 #
.cfi_def_cfa_offset 24
popq %r13 #
.cfi_def_cfa_offset 16
popq %r14 #
.cfi_def_cfa_offset 8
ret
.cfi_endproc
And here is the output from ICC:
# -- Begin _Z2mmiPdS_S_
# mark_begin;
.align 16,0x90
.globl _Z2mmiPdS_S_
_Z2mmiPdS_S_:
# parameter 1: %edi
# parameter 2: %rsi
# parameter 3: %rdx
# parameter 4: %rcx
..B1.1: # Preds ..B1.0
..___tag_value__Z2mmiPdS_S_.1: #8.3
pushq %r12 #8.3
..___tag_value__Z2mmiPdS_S_.3: #
pushq %r13 #8.3
..___tag_value__Z2mmiPdS_S_.5: #
pushq %r14 #8.3
..___tag_value__Z2mmiPdS_S_.7: #
pushq %r15 #8.3
..___tag_value__Z2mmiPdS_S_.9: #
pushq %rbx #8.3
..___tag_value__Z2mmiPdS_S_.11: #
pushq %rbp #8.3
..___tag_value__Z2mmiPdS_S_.13: #
subq $72, %rsp #8.3
..___tag_value__Z2mmiPdS_S_.15: #
movq %rsi, %r9 #
movslq %edi, %rax #
xorl %r10d, %r10d #11.9
testl %edi, %edi #11.25
jle ..B1.7 # Prob 10% #11.25
# LOE rax rdx rcx rbx rbp rsi r9 r12 r13 r14 r15 edi r10d
..B1.2: # Preds ..B1.1
movl %edi, %r11d #10.5
lea (,%rax,8), %r8 #
andl $-4, %r11d #10.5
movq %rax, %r14 #12.28
movslq %r11d, %r11 #10.5
movl %edi, %r12d #12.28
movq %rsi, 8(%rsp) #12.28
movq %r8, %rbp #12.28
movq %rdx, 32(%rsp) #12.28
movq %r9, %r13 #12.28
movq %rcx, (%rsp) #12.28
movl %r10d, %r15d #12.28
pxor %xmm0, %xmm0 #12.28
movq %r11, %rbx #12.28
# LOE rbx rbp r13 r14 r12d r15d
..B1.3: # Preds ..B1.5 ..B1.48 ..B1.45 ..B1.2
cmpl $12, %r12d #10.5
jle ..B1.38 # Prob 0% #10.5
# LOE rbx rbp r13 r14 r12d r15d
..B1.4: # Preds ..B1.3
movq %r13, %rdi #12.13
xorl %esi, %esi #12.13
movq %rbp, %rdx #12.13
call _intel_fast_memset #12.13
# LOE rbx rbp r13 r14 r12d r15d
..B1.5: # Preds ..B1.4
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
# LOE rbx rbp r13 r14 r12d r15d
..B1.6: # Preds ..B1.48 ..B1.45 ..B1.5 # Infreq
movl %r12d, %edi #
movq %r14, %rax #
movq 8(%rsp), %rsi #
testl %edi, %edi #11.25
movq 32(%rsp), %rdx #
movq (%rsp), %rcx #
# LOE rax rdx rcx rbx rbp rsi r12 r13 r14 r15 edi
..B1.7: # Preds ..B1.1 ..B1.6 # Infreq
movl $0, %r9d #11.9
movl $0, %r8d #
jle ..B1.33 # Prob 10% #11.25
# LOE rax rdx rcx rbx rbp rsi r8 r12 r13 r14 r15 edi r9d
..B1.8: # Preds ..B1.7 # Infreq
movq %rdx, 32(%rsp) #
# LOE rax rcx rsi r8 edi r9d
..B1.9: # Preds ..B1.31 ..B1.8 # Infreq
xorl %r12d, %r12d #
lea (%rsi,%r8,8), %r13 #14.17
movq %r13, %r15 #10.5
xorl %ebx, %ebx #13.13
andq $15, %r15 #10.5
xorl %r10d, %r10d #
movl %r15d, %r14d #10.5
lea (%rcx,%r8,8), %rbp #14.48
andl $7, %r14d #10.5
xorl %r11d, %r11d #
movl %r14d, 48(%rsp) #
xorl %edx, %edx #
movl %r15d, 56(%rsp) #
movq %r13, 40(%rsp) #
movq %r8, 16(%rsp) #
movl %r9d, 24(%rsp) #
movq %rsi, 8(%rsp) #
movq %rcx, (%rsp) #
movq 32(%rsp), %r14 #
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.10: # Preds ..B1.30 ..B1.9 # Infreq
cmpq $8, %rax #10.5
jl ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.11: # Preds ..B1.10 # Infreq
movl 56(%rsp), %r9d #10.5
testl %r9d, %r9d #10.5
je ..B1.14 # Prob 50% #10.5
# LOE rax rdx rbp r9 r10 r12 r14 ebx edi r11d
..B1.12: # Preds ..B1.11 # Infreq
cmpl $0, 48(%rsp) #10.5
jne ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.13: # Preds ..B1.12 # Infreq
movl $1, %r9d #10.5
# LOE rax rdx rbp r9 r10 r12 r14 ebx edi r11d
..B1.14: # Preds ..B1.13 ..B1.11 # Infreq
movl %r9d, %r13d #10.5
lea 8(%r13), %rcx #10.5
cmpq %rcx, %rax #10.5
jl ..B1.34 # Prob 10% #10.5
# LOE rax rdx rbp r9 r10 r12 r13 r14 ebx edi r11d
..B1.15: # Preds ..B1.14 # Infreq
movl %edi, %r15d #10.5
xorl %ecx, %ecx #10.5
subl %r9d, %r15d #10.5
movslq %r11d, %r8 #14.33
andl $7, %r15d #10.5
negl %r15d #10.5
addl %edi, %r15d #10.5
movslq %r15d, %r15 #10.5
testq %r13, %r13 #10.5
lea (%r14,%r8,8), %rsi #14.33
jbe ..B1.35 # Prob 0% #10.5
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d
..B1.16: # Preds ..B1.15 # Infreq
movsd (%r10,%rbp), %xmm0 #14.48
movq 40(%rsp), %r14 #14.48
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.17: # Preds ..B1.17 ..B1.16 # Infreq
movsd (%rsi,%rcx,8), %xmm1 #14.33
mulsd %xmm0, %xmm1 #14.48
addsd (%r14,%rcx,8), %xmm1 #14.17
movsd %xmm1, (%r14,%rcx,8) #14.17
incq %rcx #10.5
cmpq %r13, %rcx #10.5
jb ..B1.17 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.18: # Preds ..B1.17 # Infreq
movq 32(%rsp), %r14 #
# LOE rax rdx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.19: # Preds ..B1.18 ..B1.35 # Infreq
addq %r9, %r8 #14.33
lea (%r14,%r8,8), %rcx #14.33
testq $15, %rcx #10.5
je ..B1.23 # Prob 60% #10.5
# LOE rax rdx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.20: # Preds ..B1.19 # Infreq
movq 40(%rsp), %rcx #14.48
unpcklpd %xmm0, %xmm0 #14.48
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.21: # Preds ..B1.21 ..B1.20 # Infreq
movsd (%rsi,%r13,8), %xmm1 #14.33
movsd 16(%rsi,%r13,8), %xmm2 #14.33
movsd 32(%rsi,%r13,8), %xmm3 #14.33
movsd 48(%rsi,%r13,8), %xmm4 #14.33
movhpd 8(%rsi,%r13,8), %xmm1 #14.33
movhpd 24(%rsi,%r13,8), %xmm2 #14.33
movhpd 40(%rsi,%r13,8), %xmm3 #14.33
movhpd 56(%rsi,%r13,8), %xmm4 #14.33
mulpd %xmm0, %xmm1 #14.48
mulpd %xmm0, %xmm2 #14.48
mulpd %xmm0, %xmm3 #14.48
mulpd %xmm0, %xmm4 #14.48
addpd (%rcx,%r13,8), %xmm1 #14.17
addpd 16(%rcx,%r13,8), %xmm2 #14.17
addpd 32(%rcx,%r13,8), %xmm3 #14.17
addpd 48(%rcx,%r13,8), %xmm4 #14.17
movaps %xmm1, (%rcx,%r13,8) #14.17
movaps %xmm2, 16(%rcx,%r13,8) #14.17
movaps %xmm3, 32(%rcx,%r13,8) #14.17
movaps %xmm4, 48(%rcx,%r13,8) #14.17
addq $8, %r13 #10.5
cmpq %r15, %r13 #10.5
jb ..B1.21 # Prob 82% #10.5
jmp ..B1.26 # Prob 100% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.23: # Preds ..B1.19 # Infreq
movq 40(%rsp), %rcx #14.48
unpcklpd %xmm0, %xmm0 #14.48
.align 16,0x90
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.24: # Preds ..B1.24 ..B1.23 # Infreq
movaps (%rsi,%r13,8), %xmm1 #14.33
movaps 16(%rsi,%r13,8), %xmm2 #14.33
movaps 32(%rsi,%r13,8), %xmm3 #14.33
movaps 48(%rsi,%r13,8), %xmm4 #14.33
mulpd %xmm0, %xmm1 #14.48
mulpd %xmm0, %xmm2 #14.48
mulpd %xmm0, %xmm3 #14.48
mulpd %xmm0, %xmm4 #14.48
addpd (%rcx,%r13,8), %xmm1 #14.17
addpd 16(%rcx,%r13,8), %xmm2 #14.17
addpd 32(%rcx,%r13,8), %xmm3 #14.17
addpd 48(%rcx,%r13,8), %xmm4 #14.17
movaps %xmm1, (%rcx,%r13,8) #14.17
movaps %xmm2, 16(%rcx,%r13,8) #14.17
movaps %xmm3, 32(%rcx,%r13,8) #14.17
movaps %xmm4, 48(%rcx,%r13,8) #14.17
addq $8, %r13 #10.5
cmpq %r15, %r13 #10.5
jb ..B1.24 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.26: # Preds ..B1.24 ..B1.21 ..B1.34 # Infreq
cmpq %rax, %r15 #10.5
jae ..B1.30 # Prob 0% #10.5
# LOE rax rdx rbp r10 r12 r14 r15 ebx edi r11d
..B1.27: # Preds ..B1.26 # Infreq
movsd (%rbp,%r12,8), %xmm0 #14.48
lea (%r14,%rdx,8), %rcx #14.33
movq 40(%rsp), %rsi #14.48
# LOE rax rdx rcx rbp rsi r10 r12 r14 r15 ebx edi r11d xmm0
..B1.28: # Preds ..B1.28 ..B1.27 # Infreq
movsd (%rcx,%r15,8), %xmm1 #14.33
mulsd %xmm0, %xmm1 #14.48
addsd (%rsi,%r15,8), %xmm1 #14.17
movsd %xmm1, (%rsi,%r15,8) #14.17
incq %r15 #10.5
cmpq %rax, %r15 #10.5
jb ..B1.28 # Prob 82% #10.5
# LOE rax rdx rcx rbp rsi r10 r12 r14 r15 ebx edi r11d xmm0
..B1.30: # Preds ..B1.28 ..B1.26 # Infreq
incl %ebx #13.13
addq %rax, %rdx #13.13
addl %edi, %r11d #13.13
addq $8, %r10 #13.13
incq %r12 #13.13
cmpl %edi, %ebx #13.13
jb ..B1.10 # Prob 82% #13.13
# LOE rax rdx rbp r10 r12 r14 ebx edi r11d
..B1.31: # Preds ..B1.30 # Infreq
movl 24(%rsp), %r9d #
incl %r9d #11.9
movq 16(%rsp), %r8 #
addq %rax, %r8 #11.9
movq 8(%rsp), %rsi #
cmpl %edi, %r9d #11.9
movq (%rsp), %rcx #
jb ..B1.9 # Prob 82% #11.9
# LOE rax rcx rsi r8 edi r9d
..B1.33: # Preds ..B1.31 ..B1.7 # Infreq
addq $72, %rsp #18.1
..___tag_value__Z2mmiPdS_S_.16: #
popq %rbp #18.1
..___tag_value__Z2mmiPdS_S_.18: #
popq %rbx #18.1
..___tag_value__Z2mmiPdS_S_.20: #
popq %r15 #18.1
..___tag_value__Z2mmiPdS_S_.22: #
popq %r14 #18.1
..___tag_value__Z2mmiPdS_S_.24: #
popq %r13 #18.1
..___tag_value__Z2mmiPdS_S_.26: #
popq %r12 #18.1
..___tag_value__Z2mmiPdS_S_.28: #
ret #18.1
..___tag_value__Z2mmiPdS_S_.29: #
# LOE
..B1.34: # Preds ..B1.10 ..B1.14 ..B1.12 # Infreq
xorl %r15d, %r15d #10.5
jmp ..B1.26 # Prob 100% #10.5
# LOE rax rdx rbp r10 r12 r14 r15 ebx edi r11d
..B1.35: # Preds ..B1.15 # Infreq
movsd (%rbp,%r12,8), %xmm0 #14.48
jmp ..B1.19 # Prob 100% #14.48
# LOE rax rdx rbp rsi r8 r9 r10 r12 r13 r14 r15 ebx edi r11d xmm0
..B1.38: # Preds ..B1.3 # Infreq
cmpq $4, %r14 #10.5
jl ..B1.47 # Prob 10% #10.5
# LOE rbx rbp r13 r14 r12d r15d
..B1.39: # Preds ..B1.38 # Infreq
xorl %esi, %esi #10.5
movq %rbx, %rdx #10.5
movq %r13, %rcx #
xorl %eax, %eax #
pxor %xmm0, %xmm0 #
# LOE rax rdx rcx rbx rbp rsi r13 r14 r12d r15d xmm0
..B1.40: # Preds ..B1.40 ..B1.39 # Infreq
addq $4, %rsi #10.5
movq %rax, (%rcx) #12.13
movhpd %xmm0, 8(%rcx) #12.13
movq %rax, 16(%rcx) #12.13
movhpd %xmm0, 24(%rcx) #12.13
addq $32, %rcx #10.5
cmpq %rbx, %rsi #10.5
jb ..B1.40 # Prob 82% #10.5
# LOE rax rdx rcx rbx rbp rsi r13 r14 r12d r15d xmm0
..B1.42: # Preds ..B1.40 ..B1.47 # Infreq
cmpq %r14, %rdx #10.5
jae ..B1.48 # Prob 0% #10.5
# LOE rdx rbx rbp r13 r14 r12d r15d
..B1.43: # Preds ..B1.42 # Infreq
xorl %ecx, %ecx #
# LOE rdx rcx rbx rbp r13 r14 r12d r15d
..B1.44: # Preds ..B1.44 ..B1.43 # Infreq
movq %rcx, (%r13,%rdx,8) #12.13
incq %rdx #10.5
cmpq %r14, %rdx #10.5
jb ..B1.44 # Prob 82% #10.5
# LOE rdx rcx rbx rbp r13 r14 r12d r15d
..B1.45: # Preds ..B1.44 # Infreq
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
jmp ..B1.6 # Prob 100% #11.9
# LOE rbx rbp r13 r14 r12d r15d
..B1.47: # Preds ..B1.38 # Infreq
xorl %edx, %edx #10.5
jmp ..B1.42 # Prob 100% #10.5
# LOE rdx rbx rbp r13 r14 r12d r15d
..B1.48: # Preds ..B1.42 # Infreq
incl %r15d #11.9
lea (%r13,%r14,8), %r13 #11.9
cmpl %r12d, %r15d #11.9
jb ..B1.3 # Prob 82% #11.9
jmp ..B1.6 # Prob 100% #11.9
.align 16,0x90
..___tag_value__Z2mmiPdS_S_.36: #
# LOE rbx rbp r13 r14 r12d r15d
# mark_end;
.type _Z2mmiPdS_S_,#function
.size _Z2mmiPdS_S_,.-_Z2mmiPdS_S_
.data
# -- End _Z2mmiPdS_S_
Edit: With the help of Olaf Dietsche, it looks like the code below can run much faster with GCC 4.8.2, though still a bit (~30%) slower than Intel. The main difference is that the initialization is done ahead of time (this by itself makes no difference) and the loop ordering has been interchanged (this makes a major difference for GCC).
memset(c, 0, n * n);
for (j = 0; j < n; j++) {
for (k = 0; k < n; k++) {
for (i = 0; i < n; i++) {
c[i + n * j] += a[i + n * k] * b[k + n * j]; // Line 14
}
}
}

Your code seems to be wrong or not suitable for vectorization.
When I modify your code according to this blog post Performance – GCC & auto-vectorization
int i, j, k;
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
for (k = 0; k < n; k++) {
c[n * i + k] += a[n * i + j] * b[n * j + k]; // Line 14
}
}
}
and compile it with
gcc-4.8 -O3 -S a.c
it uses at least some SIMD instructions
.L8:
movsd (%rcx), %xmm1
addl $1, %r8d
movsd (%rdx,%rsi), %xmm2
unpcklpd %xmm1, %xmm1
movhpd 8(%rdx,%rsi), %xmm2
movsd (%rax,%rsi), %xmm0
mulpd %xmm2, %xmm1
movhpd 8(%rax,%rsi), %xmm0
addpd %xmm1, %xmm0
movlpd %xmm0, (%rax,%rsi)
movhpd %xmm0, 8(%rax,%rsi)
addq $16, %rsi
cmpl %r8d, %ebx
ja .L8
cmpl %edi, %r15d
je .L9
although not as much as ICC does.
Update:
Adding -funroll-loops enlarges the generated assembly code substantially to about the length of your posted ICC assembly.

Looks like the Intel compiler is using SIMD instructions (mulpd, addpd, movaps etc) -- it's able to perform more than one operation (i.e. both a = b*c and d = e*f) in a single clock cycle, whereas the GCC code would take two to do the same.. I'm not sure if it's possible to enable these operations automatically in GCC, but you can hand-write them in with some work.
It seems like the flags -msse, -msse2, -msse3 to GCC cause it to attempt to do some SIMD optimization of its own.

I'm not sure if ICC really produced faster code in this case because I didn't run any actual benchmarks. But you can tell GCC to unroll the loops with -funroll-loops. The output will be longer, will contain lots of xmm's and will look faster.

Neither icc nor gcc necessarily optimizes the degree of unrolling. To match them, you would use e.g.
gcc -funroll-loops --param max-unroll-times=4
icc -unroll4
as gcc tends to unroll more than optimum for CPUs of the last 8 years (if allowed to do so) while icc is more conservative.
Also glossed over above is that icc -O3 encourages the compiler to optimize loop nesting, and may even engage the special -opt-matmul facility.
The original form implies a dot product reduction inner loop, for which gcc might require both -ffast-math and a more modern choice for -march= in order to optimize. icc is much more aggressive about riffling a dot product (batching into multiple sums), if it can't avoid it by switching loop nest.

Related

Why some of sse intrinsics introduce move back and forth?

In my code, I set a 128-bit variable to zero. But I don't quite understand why it translates to two move instructions in assembly code?
__m128i zeros = reinterpret_cast<__m128i>(_mm_setzero_pd());
Corresponding assembly code has two move back and forth from xmm0 to 0x40(%rsp).
00709658: 0F 57 C0 xorps %xmm0, %xmm0
0070965B: 66 0F 29 44 24 40 movapd %xmm0, 0x40(%rsp)
00709661: 66 0F 28 44 24 40 movapd 0x40(%rsp), %xmm0
My compiler is Clang 10.0 and no optimization turned on when I ask the question.
Here is minimal implementation of my code.
template <int LEN>
bool SSEEncodeChunk(const char** srcp, char** dstp) {
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(*srcp));
__m128i zeros = reinterpret_cast<__m128i>(_mm_setzero_pd());
__m128i zero_bytes = _mm_cmpeq_epi8(data, zeros);
bool all_zeros = _mm_testz_si128(zero_bytes, zero_bytes);
if ((!all_zeros)) {
return false;
}
_mm_storeu_si128(reinterpret_cast<__m128i*>(*dstp), data);
*dstp += LEN;
*srcp += LEN;
return true;
}
Update in June 28th, clang version 10.0.0-4ubuntu1, ubuntu2004.
Here is my code in assembly with O0. I also check the options, with -fomit-frame-pointer. There are indeed more than one moving back and forth from %xmm, offset(%rsp). My first post only take part of them out.
(lldb) disassemble --name SSEEncodeChunk
index_type_traits_test`SSEEncodeChunk<16>:
[0x709630] <+0>: subq $0x58, %rsp
[0x709634] <+4>: movq %rdi, -0x38(%rsp)
[0x709639] <+9>: movq %rsi, -0x40(%rsp)
[0x70963e] <+14>: movq -0x38(%rsp), %rax
[0x709643] <+19>: movq (%rax), %rax
[0x709646] <+22>: movq %rax, -0x28(%rsp)
[0x70964b] <+27>: movq -0x28(%rsp), %rax
[0x709650] <+32>: movups (%rax), %xmm0
[0x709653] <+35>: movaps %xmm0, -0x50(%rsp)
[0x709658] <+40>: xorps %xmm0, %xmm0
[0x70965b] <+43>: movapd %xmm0, 0x40(%rsp) ; this is the point I ask the question at the first time
[0x709661] <+49>: movapd 0x40(%rsp), %xmm0
[0x709667] <+55>: movapd %xmm0, -0x60(%rsp)
[0x70966d] <+61>: movaps -0x50(%rsp), %xmm0
[0x709672] <+66>: movaps -0x60(%rsp), %xmm1
[0x709677] <+71>: movaps %xmm0, 0x30(%rsp)
[0x70967c] <+76>: movaps %xmm1, 0x20(%rsp)
[0x709681] <+81>: movaps 0x30(%rsp), %xmm0
[0x709686] <+86>: movaps 0x20(%rsp), %xmm1
[0x70968b] <+91>: pcmpeqb %xmm1, %xmm0
[0x70968f] <+95>: movdqa %xmm0, -0x70(%rsp)
[0x709695] <+101>: movdqa -0x70(%rsp), %xmm0
[0x70969b] <+107>: movdqa -0x70(%rsp), %xmm1
[0x7096a1] <+113>: movdqa %xmm0, 0x10(%rsp)
[0x7096a7] <+119>: movdqa %xmm1, (%rsp)
[0x7096ac] <+124>: movdqa 0x10(%rsp), %xmm0
[0x7096b2] <+130>: movdqa (%rsp), %xmm1
[0x7096b7] <+135>: ptest %xmm1, %xmm0
[0x7096bc] <+140>: sete %cl
[0x7096bf] <+143>: movzbl %cl, %edx
[0x7096c2] <+146>: cmpl $0x0, %edx
[0x7096c5] <+149>: setne %cl
[0x7096c8] <+152>: andb $0x1, %cl
[0x7096cb] <+155>: movb %cl, -0x71(%rsp)
[0x7096cf] <+159>: movb -0x71(%rsp), %cl
[0x7096d3] <+163>: xorb $-0x1, %cl
[0x7096d6] <+166>: testb $0x1, %cl
[0x7096d9] <+169>: jne 0x7096e4
[0x7096df] <+175>: jmp 0x7096ee
[0x7096e4] <+180>: movb $0x0, -0x29(%rsp)
[0x7096e9] <+185>: jmp 0x70973f
[0x7096ee] <+190>: movq -0x40(%rsp), %rax
[0x7096f3] <+195>: movq (%rax), %rax
[0x7096f6] <+198>: movdqa -0x50(%rsp), %xmm0
[0x7096fc] <+204>: movq %rax, -0x8(%rsp)
[0x709701] <+209>: movdqa %xmm0, -0x20(%rsp)
[0x709707] <+215>: movdqa -0x20(%rsp), %xmm0
[0x70970d] <+221>: movq -0x8(%rsp), %rax
[0x709712] <+226>: movdqu %xmm0, (%rax)
[0x709716] <+230>: movq -0x40(%rsp), %rax
[0x70971b] <+235>: movq (%rax), %rcx
[0x70971e] <+238>: addq $0x10, %rcx
[0x709725] <+245>: movq %rcx, (%rax)
[0x709728] <+248>: movq -0x38(%rsp), %rax
[0x70972d] <+253>: movq (%rax), %rcx
[0x709730] <+256>: addq $0x10, %rcx
[0x709737] <+263>: movq %rcx, (%rax)
[0x70973a] <+266>: movb $0x1, -0x29(%rsp)
[0x70973f] <+271>: movb -0x29(%rsp), %al
[0x709743] <+275>: andb $0x1, %al
[0x709745] <+277>: movzbl %al, %eax
[0x709748] <+280>: addq $0x58, %rsp
[0x70974c] <+284>: retq
After turned on -O2, the back and forth move do disapper
0051E8D5: F3 0F 6F 08 movdqu (%rax), %xmm1
0051E8D9: 66 0F EF C0 pxor %xmm0, %xmm0
0051E8DD: 66 0F 6F D1 movdqa %xmm1, %xmm2
0051E8E1: 66 0F 74 D0 pcmpeqb %xmm0, %xmm2
0051E8E5: 66 0F 38 17 D2 ptest %xmm2, %xmm2
0051E8EA: 0F 85 97 03 00 00 jne 0x51ec87
0051E8F0: F3 0F 7F 0C 3B movdqu %xmm1, (%rbx,%rdi)
...
0051E967: 48 01 F8 addq %rdi, %rax
0051E96A: 48 83 C0 10 addq $0x10, %rax
0051E96E: 48 01 FB addq %rdi, %rbx
0051E971: 48 83 C3 10 addq $0x10, %rbx

Explanation of std::vector<int> sum ASM

I was playing around with the Compiler Explorer, and I'm struggling to understand the ASM output (x86 Clang 3.7 -O3) of a simple std::vector<int> sum function:
#include <vector>
#include <numeric>
int sum(const std::vector<int>& v)
{
return std::accumulate(v.begin(), v.end(), 0);
}
The ASM for this code is:
sum(std::vector<int, std::allocator<int> > const&): # #sum(std::vector<int, std::allocator<int> > const&)
movq (%rdi), %rsi
movq 8(%rdi), %r11
xorl %eax, %eax
cmpq %r11, %rsi
je .LBB0_13
movabsq $9223372036854775800, %rax # imm = 0x7FFFFFFFFFFFFFF8
leaq -4(%r11), %rdx
movq %rdx, %r10
subq %rsi, %r10
shrq $2, %r10
incq %r10
xorl %edi, %edi
movq %r10, %r8
andq %rax, %r8
pxor %xmm0, %xmm0
je .LBB0_2
andq %r10, %rax
leaq -8(%rax), %r9
movl %r9d, %ecx
shrl $3, %ecx
incl %ecx
xorl %edi, %edi
testb $3, %cl
je .LBB0_4
subl %esi, %edx
shrl $2, %edx
incl %edx
andl $24, %edx
addl $-8, %edx
shrl $3, %edx
incl %edx
andl $3, %edx
negq %rdx
pxor %xmm0, %xmm0
xorl %edi, %edi
pxor %xmm1, %xmm1
.LBB0_6: # %vector.body.prol
movdqu (%rsi,%rdi,4), %xmm2
movdqu 16(%rsi,%rdi,4), %xmm3
paddd %xmm2, %xmm0
paddd %xmm3, %xmm1
addq $8, %rdi
incq %rdx
jne .LBB0_6
jmp .LBB0_7
.LBB0_2:
pxor %xmm1, %xmm1
jmp .LBB0_11
.LBB0_4:
pxor %xmm0, %xmm0
pxor %xmm1, %xmm1
.LBB0_7: # %vector.body.preheader.split
leaq (%rsi,%r8,4), %rdx
cmpq $24, %r9
jb .LBB0_10
subq %rdi, %rax
leaq 112(%rsi,%rdi,4), %rsi
.LBB0_9: # %vector.body
movdqu -112(%rsi), %xmm2
movdqu -96(%rsi), %xmm3
movdqu -80(%rsi), %xmm4
movdqu -64(%rsi), %xmm5
paddd %xmm0, %xmm2
paddd %xmm1, %xmm3
paddd %xmm4, %xmm2
paddd %xmm5, %xmm3
movdqu -48(%rsi), %xmm4
movdqu -32(%rsi), %xmm5
paddd %xmm2, %xmm4
paddd %xmm3, %xmm5
movdqu -16(%rsi), %xmm0
movdqu (%rsi), %xmm1
paddd %xmm4, %xmm0
paddd %xmm5, %xmm1
subq $-128, %rsi
addq $-32, %rax
jne .LBB0_9
.LBB0_10:
movq %rdx, %rsi
movq %r8, %rdi
.LBB0_11: # %middle.block
paddd %xmm1, %xmm0
pshufd $78, %xmm0, %xmm1 # xmm1 = xmm0[2,3,0,1]
paddd %xmm0, %xmm1
pshufd $229, %xmm1, %xmm0 # xmm0 = xmm1[1,1,2,3]
paddd %xmm1, %xmm0
movd %xmm0, %eax
cmpq %rdi, %r10
je .LBB0_13
.LBB0_12: # %.lr.ph.i
addl (%rsi), %eax
addq $4, %rsi
cmpq %rsi, %r11
jne .LBB0_12
.LBB0_13: # %int std::accumulate<__gnu_cxx::__normal_iterator<int const*, std::vector<int, std::allocator<int> > >, int>(__gnu_cxx::__normal_iterator<int const*, std::vector<int, std::allocator<int> > >, __gnu_cxx::__normal_iterator<int const*, std::vector<int, std::allocator<int> > >, int) [clone .exit]
req
For comparison, the ASM for the same function but using std::vector<double> is:
sum(std::vector<double, std::allocator<double> > const&):
movq 8(%rdi), %rdx
movq (%rdi), %rax
pxor %xmm0, %xmm0
cmpq %rax, %rdx
je .L4
.L3:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rdx
jne .L3
rep ret
.L4:
rep ret
The ASM for std::vector<double> seems fairly trivial, while the ASM for std::vector<int> appears markedly more complex. I'm assuming there is some clever optimisation going on with std::vector<int>, but I'm at a bit of a loss to explain what's going on. Could someone enlighten me?

Short answer - the compiler has vectorised and unrolled the loop for adding integers. Compare the vector<double> version which has these lines:
addsd (%rax), %xmm0
addq $8, %rax
This means its adding a single double into the sum and then moving on 8 bytes and looping.
The same code in the main loop of the vector<int> version does:
movdqu -112(%rsi), %xmm2
movdqu -96(%rsi), %xmm3
movdqu -80(%rsi), %xmm4
movdqu -64(%rsi), %xmm5
...
movdqu -48(%rsi), %xmm4
movdqu -32(%rsi), %xmm5
...
movdqu -16(%rsi), %xmm0
...
movdqu (%rsi), %xmm1
...
subq $-128, %rsi
The movdq shows its doing 16 bytes at once (4 integers) and the subq $-128, %rsi shows its doing 128 bytes (or 32 ints) in a single loop across 8 loads. The end result of each iteration of the loop adds the next 32 ints into one of the 8 slots in xmm0:xmm1
LBB0_11 then takes the output from the main loop (which is 8 integers across xmm0 and xmm1) and finds the sum of those.
LBB0_12 then finishes off any ints at the end of the vector which couldn't be consumed by the main loop (as the main loop works on 32 integers at the same time)
It vectorises the adds so it can handle 4 integers at once which is generally faster than doing one integer at a time. It also unrolls the loop so that it can do more than 1 iteration of adding per loop.
Explanation of vectorization: What does vectorization mean?
Explanation of loop unrolling: When, if ever, is loop unrolling still useful?
I've not analysed the start of the code for the integer case, but generally this is setting up the loop by aligning it to a 16 byte boundary before it starts the main loop.

openCL glGetProgramInfo causing Core Foundation crash

I am writing a C++ command line program in XCode (6.4) on OSX Yosemite (10.10.4). I am using Apple's openCL framework and am trying to save my openCL binaries to disk.
First I create my program from source and build as follows:
cl_int err;
cl_program result = clCreateProgramWithSource(context, numFiles, (const char **)sourceStrings, NULL, &err);
ErrorManager::CheckError(err, "Failed to create a compute program");
err = clBuildProgram(result, deviceCount, devices, NULL, NULL, NULL);
ErrorManager::CheckError(err, "Failed to build program");
The above code works fine and I can launch my kernels without any error.
Then I try to access the binary sizes...
size_t *programBinarySizes = new size_t[deviceCount];
err = clGetProgramInfo(result, CL_PROGRAM_BINARY_SIZES, sizeof(size_t) * deviceCount, programBinarySizes, NULL);
However this call to clGetProgramInfo causes Xcode to throw an EXC_BREAKPOINT that has the following output:
CoreFoundation`__CFTypeCollectionRetain:
0x7fff8e2c2480 <+0>: pushq %rbp
0x7fff8e2c2481 <+1>: movq %rsp, %rbp
0x7fff8e2c2484 <+4>: pushq %r14
0x7fff8e2c2486 <+6>: pushq %rbx
0x7fff8e2c2487 <+7>: movq %rsi, %rbx
0x7fff8e2c248a <+10>: testq %rbx, %rbx
0x7fff8e2c248d <+13>: je 0x7fff8e2c25ae ; <+302>
0x7fff8e2c2493 <+19>: cmpb $0x0, -0x17f41f28(%rip) ; __CFDeallocateZombies
0x7fff8e2c249a <+26>: je 0x7fff8e2c257c ; <+252>
0x7fff8e2c24a0 <+32>: testq %rdi, %rdi
0x7fff8e2c24a3 <+35>: je 0x7fff8e2c24b5 ; <+53>
0x7fff8e2c24a5 <+37>: leaq -0x17f917cc(%rip), %rax ; kCFAllocatorSystemDefault
0x7fff8e2c24ac <+44>: cmpq %rdi, (%rax)
0x7fff8e2c24af <+47>: jne 0x7fff8e2c257c ; <+252>
0x7fff8e2c24b5 <+53>: testb $0x1, %bl
0x7fff8e2c24b8 <+56>: je 0x7fff8e2c24e4 ; <+100>
0x7fff8e2c24ba <+58>: movl %ebx, %eax
0x7fff8e2c24bc <+60>: shrl %eax
0x7fff8e2c24be <+62>: andl $0x7, %eax
0x7fff8e2c24c1 <+65>: cmpl $0x6, %eax
0x7fff8e2c24c4 <+68>: ja 0x7fff8e2c259c ; <+284>
0x7fff8e2c24ca <+74>: leaq 0xef(%rip), %rcx ; <+320>
0x7fff8e2c24d1 <+81>: movslq (%rcx,%rax,4), %rax
0x7fff8e2c24d5 <+85>: addq %rcx, %rax
0x7fff8e2c24d8 <+88>: jmpq *%rax
0x7fff8e2c24da <+90>: callq 0x7fff8e2df7b0 ; CFStringGetTypeID
0x7fff8e2c24df <+95>: jmp 0x7fff8e2c2594 ; <+276>
0x7fff8e2c24e4 <+100>: movl 0x8(%rbx), %eax
0x7fff8e2c24e7 <+103>: shrl $0x8, %eax
0x7fff8e2c24ea <+106>: andl $0x3ff, %eax
0x7fff8e2c24ef <+111>: movq (%rbx), %rcx
0x7fff8e2c24f2 <+114>: testq %rcx, %rcx
0x7fff8e2c24f5 <+117>: je 0x7fff8e2c2522 ; <+162>
0x7fff8e2c24f7 <+119>: cmpq -0x17f41f96(%rip), %rcx ; __CFConstantStringClassReferencePtr
0x7fff8e2c24fe <+126>: je 0x7fff8e2c2522 ; <+162>
0x7fff8e2c2500 <+128>: leaq -0x17f43fa7(%rip), %rdx ; __CFRuntimeObjCClassTable
0x7fff8e2c2507 <+135>: movq (%rdx,%rax,8), %r14
0x7fff8e2c250b <+139>: cmpq %r14, %rcx
0x7fff8e2c250e <+142>: je 0x7fff8e2c2522 ; <+162>
0x7fff8e2c2510 <+144>: testb $0x1, %cl
0x7fff8e2c2513 <+147>: je 0x7fff8e2c2594 ; <+276>
0x7fff8e2c2515 <+149>: movq %rbx, %rdi
0x7fff8e2c2518 <+152>: callq 0x7fff8e481b6a ; symbol stub for: object_getClass
0x7fff8e2c251d <+157>: cmpq %r14, %rax
0x7fff8e2c2520 <+160>: jne 0x7fff8e2c2594 ; <+276>
0x7fff8e2c2522 <+162>: callq 0x7fff8e481aaa ; symbol stub for: objc_collectableZone
0x7fff8e2c2527 <+167>: movq %rax, %rdi
0x7fff8e2c252a <+170>: movq %rbx, %rsi
0x7fff8e2c252d <+173>: callq 0x7fff8e481594 ; symbol stub for: auto_zone_is_valid_pointer
0x7fff8e2c2532 <+178>: testl %eax, %eax
0x7fff8e2c2534 <+180>: je 0x7fff8e2c255b ; <+219>
0x7fff8e2c2536 <+182>: movl 0x8(%rbx), %eax
0x7fff8e2c2539 <+185>: shrl $0x8, %eax
0x7fff8e2c253c <+188>: andl $0x3ff, %eax
0x7fff8e2c2541 <+193>: leaq -0x17f45ff8(%rip), %rcx ; __CFRuntimeClassTable
0x7fff8e2c2548 <+200>: movq (%rcx,%rax,8), %rax
0x7fff8e2c254c <+204>: testb $0x4, (%rax)
0x7fff8e2c254f <+207>: je 0x7fff8e2c2594 ; <+276>
0x7fff8e2c2551 <+209>: movq %rbx, %rdi
0x7fff8e2c2554 <+212>: callq 0x7fff8e2bf500 ; CFRetain
0x7fff8e2c2559 <+217>: jmp 0x7fff8e2c2594 ; <+276>
0x7fff8e2c255b <+219>: cmpl $0x0, 0xc(%rbx)
0x7fff8e2c255f <+223>: je 0x7fff8e2c2594 ; <+276>
0x7fff8e2c2561 <+225>: leaq -0x17f72248(%rip), %rsi ; #"storing a non-GC object %p in a GC collection, break on CFCollection_non_gc_storage_error to debug."
0x7fff8e2c2568 <+232>: movl $0x4, %edi
0x7fff8e2c256d <+237>: xorl %eax, %eax
0x7fff8e2c256f <+239>: movq %rbx, %rdx
0x7fff8e2c2572 <+242>: callq 0x7fff8e3b61e0 ; CFLog
0x7fff8e2c2577 <+247>: callq 0x7fff8e3c3290 ; CFCollection_non_gc_storage_error
0x7fff8e2c257c <+252>: movq %rbx, %rdi
0x7fff8e2c257f <+255>: popq %rbx
0x7fff8e2c2580 <+256>: popq %r14
0x7fff8e2c2582 <+258>: popq %rbp
0x7fff8e2c2583 <+259>: jmp 0x7fff8e2bf500 ; CFRetain
0x7fff8e2c2588 <+264>: callq 0x7fff8e2df7f0 ; CFNumberGetTypeID
0x7fff8e2c258d <+269>: jmp 0x7fff8e2c2594 ; <+276>
0x7fff8e2c258f <+271>: callq 0x7fff8e2df850 ; CFDateGetTypeID
0x7fff8e2c2594 <+276>: movq %rbx, %rax
0x7fff8e2c2597 <+279>: popq %rbx
0x7fff8e2c2598 <+280>: popq %r14
0x7fff8e2c259a <+282>: popq %rbp
0x7fff8e2c259b <+283>: retq
0x7fff8e2c259c <+284>: int3
0x7fff8e2c259d <+285>: callq 0x7fff8e482020 ; symbol stub for: getpid
0x7fff8e2c25a2 <+290>: movl $0x9, %esi
0x7fff8e2c25a7 <+295>: movl %eax, %edi
0x7fff8e2c25a9 <+297>: callq 0x7fff8e482080 ; symbol stub for: kill
0x7fff8e2c25ae <+302>: leaq 0x36325e(%rip), %rax ; "*** __CFTypeCollectionRetain() called with NULL; likely a collection has been corrupted ***"
0x7fff8e2c25b5 <+309>: movq %rax, -0x17f460c4(%rip) ; gCRAnnotations + 8
0x7fff8e2c25bc <+316>: int3
-> 0x7fff8e2c25bd <+317>: jmp 0x7fff8e2c259d ; <+285>
0x7fff8e2c25bf <+319>: nop
This exception doesn't get thrown if I call clGetProgramInfo with a param name other than CL_PROGRAM_BINARY_SIZES.
I cannot figure out the reason why I'm getting this exception. It's puzzling because I'm writing a C++ program, not an ObjC or Swift one, so I don't see why I would be getting any errors related to Core Foundation or retain counts.
My only guess is maybe there is some build settings option that I have set incorrectly that is causing my program to think this is an objc environment but that seems unlikely.
Any help would be appreciated

Performance of C++11 modern-style loops vs old-style loops

This is the first question I'm posting here, so I hope I won't do anything wrong.
My question concerns the performance of modern-style C++11 loops (std::for_each, range-based for) vs old-style C++ loops (for (...; ...; ...)). From what I understood, it seems to me that the motto of modern C++ is "expressivity with no compromise on performance". Modern C++ style leads to safe, clean, and fast code with little to no performance penalty and, possibly, with a performance gain over old-style C++.
Now I've made a little test to assess how big this gain is concerning loops. First I wrote the following three functions:
using namespace std;
void foo(vector<double>& v)
{
for (size_t i = 0; i < v.size(); i++)
{
v[i] /= 42;
}
}
void bar(vector<double>& v)
{
for (auto& x : v)
{
x /= 42;
}
}
void wee(vector<double>& v)
{
for_each(begin(v), end(v), [] (double& x)
{
x /= 42;
});
}
Then I compared their performance by calling them this way (properly commenting/uncommenting the three lines inside main()'s loop:
vector<double> make_vector()
{
vector<double> v;
for (int i = 0; i < 30000; i++) { v.push_back(i); }
return v;
}
int main()
{
time_t start = clock();
auto v = make_vector();
for (int i = 0; i <= 50000; i++)
{
// UNCOMMENT THE FUNCTION CALL TO BE TESTED, COMMENT THE OTHERS
foo(v);
// bar(v);
// wee(v);
}
time_t end = clock();
cout << (end - start) << endl;
return 0;
}
Averaging over 10 executions of each version of the program obtained by commenting/uncommenting the lines in main()'s loop, and using the old-style loop as a baseline, the range-based for loop performs ~1.9x worse, and the loop based on std::for_each and lambdas performs ~2.3x worse.
I used Clang 3.2 to compile this, and I haven't tried MS VC11 (I'm working on WinXP).
Considering my expectation of getting comparable execution times, my questions are:
Did I do something obviously wrong?
If not, couldn't a 2x performance penalty be a good reason NOT to embrace modern-style loops?
I would like to remark, that I do believe that the clarity and safety of code written in modern C++ style pay off for a possible performance loss, but I quite disagree with the statement that there is no trade-off between clarity/safety on one side and performance on the other side.
Am I missing something?

It looks like the difference only shows up when you do not enable optimisations in your compiler.
With Clang you can enable optimisation with the -O[0-3] flag.

Mankarse is right - most likely you have not enabled optimizations.
Actually on Clang they produce practically same result ASM code in main loop, and small difference in pre/post code.
I have tested four versions: hand_loop_index, hand_loop_iterator, range_based_for, for_each_algorithm
hand_loop_iterator, range_based_for and for_each_algorithm - all three do produce exactly same result ASM for full function body, only difference is in names of labels.
I.e. hand written for loop with iterators results in exactly same ASM code as range-based-for and std::for_each.
There are some differences between loop with index and loop with iterator versions.
Main loop in both cases is almost same. The only minor differece is that for iterators version(s) rdx register is used instead of rsi.
Index version:
.LBB0_7: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rsi), %xmm1
movupd -32(%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rsi)
movupd %xmm2, -32(%rsi)
movupd -16(%rsi), %xmm1
movupd (%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rsi)
movupd %xmm2, (%rsi)
addq $64, %rsi
addq $-8, %rdi
jne .LBB0_7
Iterator version(s):
.LBB1_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB1_6
Pre/post code for index vs iterator versions has many differences, but it should not affect greatly total result timing for large enough arrays.
LIVE DEMO on Coliru with ASM output
#include <algorithm>
#include <iterator>
#include <vector>
using namespace std;
void hand_loop_index(vector<double> &v)
{
for (size_t i = 0; i < v.size(); ++i)
{
v[i] /= 42;
}
}
void hand_loop_iterator(vector<double> &v)
{
for (auto first = begin(v), last = end(v); first!=last; ++first)
{
*first /= 42;
}
}
void range_based_for(vector<double> &v)
{
for (auto &x : v)
{
x /= 42;
}
}
void for_each_algorithm(vector<double> &v)
{
for_each(begin(v), end(v), [] (double &x)
{
x /= 42;
});
}
Result ASM:
# clang++ -std=c++1z -O3 -Wall -pedantic -pthread main.cpp -S
.text
.file "main.cpp"
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI0_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI0_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z15hand_loop_indexRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z15hand_loop_indexRSt6vectorIdSaIdEE,#function
_Z15hand_loop_indexRSt6vectorIdSaIdEE: # #_Z15hand_loop_indexRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rax
movq 8(%rdi), %rcx
subq %rax, %rcx
je .LBB0_11
# BB#1: # %.lr.ph
sarq $3, %rcx
cmpq $1, %rcx
movl $1, %edx
cmovaq %rcx, %rdx
xorl %edi, %edi
testq %rdx, %rdx
je .LBB0_10
# BB#2: # %overflow.checked
xorl %edi, %edi
movq %rdx, %r8
andq $-4, %r8
je .LBB0_9
# BB#3: # %vector.body.preheader
cmpq $1, %rcx
movl $1, %edi
cmovaq %rcx, %rdi
addq $-4, %rdi
movq %rdi, %rsi
shrq $2, %rsi
xorl %r9d, %r9d
btq $2, %rdi
jb .LBB0_5
# BB#4: # %vector.body.prol
movupd (%rax), %xmm0
movupd 16(%rax), %xmm1
movapd .LCPI0_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rax)
movupd %xmm1, 16(%rax)
movl $4, %r9d
.LBB0_5: # %vector.body.preheader.split
testq %rsi, %rsi
je .LBB0_8
# BB#6: # %vector.body.preheader.split.split
cmpq $1, %rcx
movl $1, %edi
cmovaq %rcx, %rdi
andq $-4, %rdi
subq %r9, %rdi
leaq 48(%rax,%r9,8), %rsi
movapd .LCPI0_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB0_7: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rsi), %xmm1
movupd -32(%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rsi)
movupd %xmm2, -32(%rsi)
movupd -16(%rsi), %xmm1
movupd (%rsi), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rsi)
movupd %xmm2, (%rsi)
addq $64, %rsi
addq $-8, %rdi
jne .LBB0_7
.LBB0_8:
movq %r8, %rdi
.LBB0_9: # %middle.block
cmpq %rdi, %rdx
je .LBB0_11
.align 16, 0x90
.LBB0_10: # %scalar.ph
# =>This Inner Loop Header: Depth=1
movsd (%rax,%rdi,8), %xmm0 # xmm0 = mem[0],zero
divsd .LCPI0_1(%rip), %xmm0
movsd %xmm0, (%rax,%rdi,8)
incq %rdi
cmpq %rcx, %rdi
jb .LBB0_10
.LBB0_11: # %._crit_edge
retq
.Lfunc_end0:
.size _Z15hand_loop_indexRSt6vectorIdSaIdEE, .Lfunc_end0-_Z15hand_loop_indexRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI1_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI1_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z18hand_loop_iteratorRSt6vectorIdSaIdEE,#function
_Z18hand_loop_iteratorRSt6vectorIdSaIdEE: # #_Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB1_11
# BB#1: # %.lr.ph.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB1_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB1_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI1_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB1_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB1_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI1_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB1_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB1_6
.LBB1_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB1_8: # %middle.block
cmpq %rdi, %rcx
je .LBB1_11
# BB#9:
movsd .LCPI1_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB1_10: # %.lr.ph
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB1_10
.LBB1_11: # %._crit_edge
retq
.Lfunc_end1:
.size _Z18hand_loop_iteratorRSt6vectorIdSaIdEE, .Lfunc_end1-_Z18hand_loop_iteratorRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI2_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI2_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z15range_based_forRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z15range_based_forRSt6vectorIdSaIdEE,#function
_Z15range_based_forRSt6vectorIdSaIdEE: # #_Z15range_based_forRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB2_11
# BB#1: # %.lr.ph.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB2_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB2_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI2_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB2_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB2_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI2_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB2_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB2_6
.LBB2_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB2_8: # %middle.block
cmpq %rdi, %rcx
je .LBB2_11
# BB#9:
movsd .LCPI2_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB2_10: # %.lr.ph
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB2_10
.LBB2_11: # %._crit_edge
retq
.Lfunc_end2:
.size _Z15range_based_forRSt6vectorIdSaIdEE, .Lfunc_end2-_Z15range_based_forRSt6vectorIdSaIdEE
.cfi_endproc
.section .rodata.cst16,"aM",#progbits,16
.align 16
.LCPI3_0:
.quad 4631107791820423168 # double 4.200000e+01
.quad 4631107791820423168 # double 4.200000e+01
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LCPI3_1:
.quad 4631107791820423168 # double 42
.text
.globl _Z18for_each_algorithmRSt6vectorIdSaIdEE
.align 16, 0x90
.type _Z18for_each_algorithmRSt6vectorIdSaIdEE,#function
_Z18for_each_algorithmRSt6vectorIdSaIdEE: # #_Z18for_each_algorithmRSt6vectorIdSaIdEE
.cfi_startproc
# BB#0:
movq (%rdi), %rdx
movq 8(%rdi), %rax
cmpq %rax, %rdx
je .LBB3_11
# BB#1: # %.lr.ph.i.preheader
movabsq $4611686018427387900, %rsi # imm = 0x3FFFFFFFFFFFFFFC
leaq -8(%rax), %rcx
subq %rdx, %rcx
shrq $3, %rcx
incq %rcx
xorl %edi, %edi
movq %rcx, %r9
andq %rsi, %r9
je .LBB3_8
# BB#2: # %vector.body.preheader
andq %rcx, %rsi
leaq -4(%rsi), %rdi
movq %rdi, %r11
shrq $2, %r11
xorl %r10d, %r10d
btq $2, %rdi
jb .LBB3_4
# BB#3: # %vector.body.prol
movupd (%rdx), %xmm0
movupd 16(%rdx), %xmm1
movapd .LCPI3_0(%rip), %xmm2 # xmm2 = [4.200000e+01,4.200000e+01]
divpd %xmm2, %xmm0
divpd %xmm2, %xmm1
movupd %xmm0, (%rdx)
movupd %xmm1, 16(%rdx)
movl $4, %r10d
.LBB3_4: # %vector.body.preheader.split
leaq (%rdx,%r9,8), %r8
testq %r11, %r11
je .LBB3_7
# BB#5: # %vector.body.preheader.split.split
subq %r10, %rsi
leaq 48(%rdx,%r10,8), %rdx
movapd .LCPI3_0(%rip), %xmm0 # xmm0 = [4.200000e+01,4.200000e+01]
.align 16, 0x90
.LBB3_6: # %vector.body
# =>This Inner Loop Header: Depth=1
movupd -48(%rdx), %xmm1
movupd -32(%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -48(%rdx)
movupd %xmm2, -32(%rdx)
movupd -16(%rdx), %xmm1
movupd (%rdx), %xmm2
divpd %xmm0, %xmm1
divpd %xmm0, %xmm2
movupd %xmm1, -16(%rdx)
movupd %xmm2, (%rdx)
addq $64, %rdx
addq $-8, %rsi
jne .LBB3_6
.LBB3_7:
movq %r8, %rdx
movq %r9, %rdi
.LBB3_8: # %middle.block
cmpq %rdi, %rcx
je .LBB3_11
# BB#9:
movsd .LCPI3_1(%rip), %xmm0 # xmm0 = mem[0],zero
.align 16, 0x90
.LBB3_10: # %.lr.ph.i
# =>This Inner Loop Header: Depth=1
movsd (%rdx), %xmm1 # xmm1 = mem[0],zero
divsd %xmm0, %xmm1
movsd %xmm1, (%rdx)
addq $8, %rdx
cmpq %rdx, %rax
jne .LBB3_10
.LBB3_11: # %_ZSt8for_eachIN9__gnu_cxx17__normal_iteratorIPdSt6vectorIdSaIdEEEEZ18for_each_algorithmR5_E3$_0ET0_T_SA_S9_.exit
retq
.Lfunc_end3:
.size _Z18for_each_algorithmRSt6vectorIdSaIdEE, .Lfunc_end3-_Z18for_each_algorithmRSt6vectorIdSaIdEE
.cfi_endproc
.ident "clang version 3.7.0 (tags/RELEASE_370/final 246979)"
.section ".note.GNU-stack","",#progbits

Can modern compilers unroll `for` loops expressed using begin and end iterators

Consider the following code
vector<double> v;
// fill v
const vector<double>::iterator end =v.end();
for(vector<double>::iterator i = v.bgin(); i != end; ++i) {
// do stuff
}
Are compilers like g++, clang++, icc able to unroll loops like this. Unfortunately I do not know assembly to be able verify from the output whether the loop gets unrolled or not. (and I only have access to g++.)
To me it seems that this will require more smartness than usual on behalf of the compiler, first to deduce that the iterator is a random access iterator, and then figure out the number of times the loop is executed. Can compilers do this when optimization is enabled ?
Thanks for your replies, and before some of you start lecturing about premature optimization, this is an excercise in curiosity.

To me it seems that this will require more smartness than usual on behalf of the compiler, first to deduce that the iterator is a random access iterator, and then figure out the number of times the loop is executed.
The STL, being comprised entirely of templates, has all the code inline. So, random access iterators reduce to pointers already when the compiler begins to apply optimizations. One of the reasons the STL was created was so that there would be less need for a programmer to outwit the compiler. You should rely on the STL to do the right thing until proven otherwise.
Of course, it is still up to you to choose the proper tool from the STL to use...
Edit: There was discussion about whether g++ does any loop unrolling. On the versions that I am using, loop unrolling is not part of -O, -O2, or -O3, and I get identical assembly for the latter two levels with the following code:
void foo (std::vector<int> &v) {
volatile int c = 0;
const std::vector<int>::const_iterator end = v.end();
for (std::vector<int>::iterator i = v.begin(); i != end; ++i) {
*i = c++;
}
}
With the corresponding assembly -O2 assembly:
_Z3fooRSt6vectorIiSaIiEE:
.LFB435:
movq 8(%rdi), %rcx
movq (%rdi), %rax
movl $0, -4(%rsp)
cmpq %rax, %rcx
je .L4
.p2align 4,,10
.p2align 3
.L3:
movl -4(%rsp), %edx
movl %edx, (%rax)
addq $4, %rax
addl $1, %edx
cmpq %rax, %rcx
movl %edx, -4(%rsp)
jne .L3
.L4:
rep
ret
With the -funroll-loops option added, the function expands into something much much larger. But, the documentation warns about this option:
Unroll loops whose number of iterations can be determined at compile time or upon entry to the loop. -funroll-loops implies -frerun-cse-after-loop. It also turns on complete loop peeling (i.e. complete removal of loops with small constant number of iterations). This option makes code larger, and may or may not make it run faster.
As a further argument to dissuade you from unrolling loops yourself, I'll finish this answer with an illustration of applying Duff's Device to the foo function above:
void foo_duff (std::vector<int> &v) {
volatile int c = 0;
const std::vector<int>::const_iterator end = v.end();
std::vector<int>::iterator i = v.begin();
switch ((end - i) % 4) do {
case 0: *i++ = c++;
case 3: *i++ = c++;
case 2: *i++ = c++;
case 1: *i++ = c++;
} while (i != end);
}
GCC has another loop optimization flag:
-ftree-loop-optimize
Perform loop optimizations on trees. This flag is enabled by default at -O and higher.
So, the -O option enables simple loop optimizations for the innermost loops, including complete loop unrolling (peeling) for loops with a fixed number of iterations. (Thanks to doc for pointing this out to me.)

I would propose that whether or not the compiler CAN unroll the loop, with modern pipelined architectures and caches, unless your "do stuff" is trivial, there is little benefit in doing so, and in many cases doing so would be a performance HIT instead of a boon. If your "do stuff" is nontrivial, unrolling the loop will create multiple copies of this nontrivial code, which will take extra time to load into the cache, significantly slowing down the first iteration through the unrolled loop. At the same time, it will evict more code from the cache, which may have been necessary for performing the "do stuff" if it makes any function calls, which would then need to be reloaded into the cache again. The purpose for unrolling loops made a lot of sense before cacheless pipelined non-branch-predictive architectures, with the goal being to reduce the overhead associated with the loop logic. Nowadays with cache-based pipelined branch-predictive hardware, your cpu will be pipelined well into the next loop iteration, speculatively executing the loop code again, by the time you detect the i==end exit condition, at which point the processor will throw out that final speculatively-executed set of results. In such an architecture, loop unrolling makes very little sense. It would further bloat code for virtually no benefit.

The short answer is yes. It will unroll as much as it can. In your case, it depends how you define end obviously (I assume your example is generic). Not only will most modern compilers unroll, but they will also vectorize and do other optimizations that will often blow your own solutions out of the water.
So what I'm saying is don't prematurely optimize! Just kidding :)

Simple answer: generally NO! At least when it comes to complete loop unrolling.
Let's test loop unrolling on this simple, dirty-coded (for testing purposes) structure.
struct Test
{
Test(): begin(arr), end(arr + 4) {}
double * begin;
double * end;
double arr[4];
};
First let's take counted loop and compile it without any optimizations.
double counted(double param, Test & d)
{
for (int i = 0; i < 4; i++)
param += d.arr[i];
return param;
}
Here's what gcc 4.9 produces.
counted(double, Test&):
pushq %rbp
movq %rsp, %rbp
movsd %xmm0, -24(%rbp)
movq %rdi, -32(%rbp)
movl $0, -4(%rbp)
jmp .L2
.L3:
movq -32(%rbp), %rax
movl -4(%rbp), %edx
movslq %edx, %rdx
addq $2, %rdx
movsd (%rax,%rdx,8), %xmm0
movsd -24(%rbp), %xmm1
addsd %xmm0, %xmm1
movq %xmm1, %rax
movq %rax, -24(%rbp)
addl $1, -4(%rbp)
.L2:
cmpl $3, -4(%rbp)
jle .L3
movq -24(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
popq %rbp
ret
As expected loop hasn't been unrolled and, since no optimizations were performed, code is generally very verbose. Now let's turn on -O3 flag. Produced disassembly:
counted(double, Test&):
addsd 16(%rdi), %xmm0
addsd 24(%rdi), %xmm0
addsd 32(%rdi), %xmm0
addsd 40(%rdi), %xmm0
ret
Voila, loop has been unrolled this time.
Now let's take a look at iterated loop. Function containing the loop will look like this.
double iterated(double param, Test & d)
{
for (double * it = d.begin; it != d.end; ++it)
param += *it;
return param;
}
Still using -O3 flag, let's take a look at disassembly.
iterated(double, Test&):
movq (%rdi), %rax
movq 8(%rdi), %rdx
cmpq %rdx, %rax
je .L3
.L4:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rdx, %rax
jne .L4
.L3:
rep ret
Code looks better than in the very first case, because optimizations were performed, but loop hasn't been unrolled this time!
What about funroll-loops and funroll-all-loops flags? They will produce result similar to this
iterated(double, Test&):
movq (%rdi), %rsi
movq 8(%rdi), %rcx
cmpq %rcx, %rsi
je .L3
movq %rcx, %rdx
leaq 8(%rsi), %rax
addsd (%rsi), %xmm0
subq %rsi, %rdx
subq $8, %rdx
shrq $3, %rdx
andl $7, %edx
cmpq %rcx, %rax
je .L43
testq %rdx, %rdx
je .L4
cmpq $1, %rdx
je .L29
cmpq $2, %rdx
je .L30
cmpq $3, %rdx
je .L31
cmpq $4, %rdx
je .L32
cmpq $5, %rdx
je .L33
cmpq $6, %rdx
je .L34
addsd (%rax), %xmm0
leaq 16(%rsi), %rax
.L34:
addsd (%rax), %xmm0
addq $8, %rax
.L33:
addsd (%rax), %xmm0
addq $8, %rax
.L32:
addsd (%rax), %xmm0
addq $8, %rax
.L31:
addsd (%rax), %xmm0
addq $8, %rax
.L30:
addsd (%rax), %xmm0
addq $8, %rax
.L29:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rcx, %rax
je .L44
.L4:
addsd (%rax), %xmm0
addq $64, %rax
addsd -56(%rax), %xmm0
addsd -48(%rax), %xmm0
addsd -40(%rax), %xmm0
addsd -32(%rax), %xmm0
addsd -24(%rax), %xmm0
addsd -16(%rax), %xmm0
addsd -8(%rax), %xmm0
cmpq %rcx, %rax
jne .L4
.L3:
rep ret
.L44:
rep ret
.L43:
rep ret
Compare results with unrolled loop for counted loop. It's clearly not the same. What we see here is that gcc divided the loop into 8 element chunks. This can increase performance in some cases, because loop exit condition is checked once per 8 normal loop iterations. With additional flags vectorization could be also performed. But it isn't complete loop unrolling.
Iterated loop will be unrolled however if Test object is not a function argument.
double iteratedLocal(double param)
{
Test d;
for (double * it = d.begin; it != d.end; ++it)
param += *it;
return param;
}
Disassembly produced with only -O3 flag:
iteratedLocal(double):
addsd -40(%rsp), %xmm0
addsd -32(%rsp), %xmm0
addsd -24(%rsp), %xmm0
addsd -16(%rsp), %xmm0
ret
As you can see loop has been unrolled. This is because compiler can now safely assume that end has fixed value, while it couldn't predict that for function argument.
Test structure is statically allocated however. Things are more complicated with dynamically allocated structures like std::vector. From my observations on modified Test structure, so that it ressembles dynamically allocated container, it looks like gcc tries its best to unroll loops, but in most cases generated code is not as simple as one above.
As you ask for other compilers, here's output from clang 3.4.1 (-O3 flag)
counted(double, Test&): # #counted(double, Test&)
addsd 16(%rdi), %xmm0
addsd 24(%rdi), %xmm0
addsd 32(%rdi), %xmm0
addsd 40(%rdi), %xmm0
ret
iterated(double, Test&): # #iterated(double, Test&)
movq (%rdi), %rax
movq 8(%rdi), %rcx
cmpq %rcx, %rax
je .LBB1_2
.LBB1_1: # %.lr.ph
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rcx
jne .LBB1_1
.LBB1_2: # %._crit_edge
ret
iteratedLocal(double): # #iteratedLocal(double)
leaq -32(%rsp), %rax
movq %rax, -48(%rsp)
leaq (%rsp), %rax
movq %rax, -40(%rsp)
xorl %eax, %eax
jmp .LBB2_1
.LBB2_2: # %._crit_edge4
movsd -24(%rsp,%rax), %xmm1
addq $8, %rax
.LBB2_1: # =>This Inner Loop Header: Depth=1
movaps %xmm0, %xmm2
cmpq $24, %rax
movaps %xmm1, %xmm0
addsd %xmm2, %xmm0
jne .LBB2_2
ret
Intel's icc 13.01 (-O3 flag)
counted(double, Test&):
addsd 16(%rdi), %xmm0 #24.5
addsd 24(%rdi), %xmm0 #24.5
addsd 32(%rdi), %xmm0 #24.5
addsd 40(%rdi), %xmm0 #24.5
ret #25.10
iterated(double, Test&):
movq (%rdi), %rdx #30.26
movq 8(%rdi), %rcx #30.41
cmpq %rcx, %rdx #30.41
je ..B3.25 # Prob 50% #30.41
subq %rdx, %rcx #30.7
movb $0, %r8b #30.7
lea 7(%rcx), %rax #30.7
sarq $2, %rax #30.7
shrq $61, %rax #30.7
lea 7(%rax,%rcx), %rcx #30.7
sarq $3, %rcx #30.7
cmpq $16, %rcx #30.7
jl ..B3.26 # Prob 10% #30.7
movq %rdx, %rdi #30.7
andq $15, %rdi #30.7
je ..B3.6 # Prob 50% #30.7
testq $7, %rdi #30.7
jne ..B3.26 # Prob 10% #30.7
movl $1, %edi #30.7
..B3.6: # Preds ..B3.5 ..B3.3
lea 16(%rdi), %rax #30.7
cmpq %rax, %rcx #30.7
jl ..B3.26 # Prob 10% #30.7
movq %rcx, %rax #30.7
xorl %esi, %esi #30.7
subq %rdi, %rax #30.7
andq $15, %rax #30.7
negq %rax #30.7
addq %rcx, %rax #30.7
testq %rdi, %rdi #30.7
jbe ..B3.11 # Prob 2% #30.7
..B3.9: # Preds ..B3.7 ..B3.9
addsd (%rdx,%rsi,8), %xmm0 #31.9
incq %rsi #30.7
cmpq %rdi, %rsi #30.7
jb ..B3.9 # Prob 82% #30.7
..B3.11: # Preds ..B3.9 ..B3.7
pxor %xmm6, %xmm6 #28.12
movaps %xmm6, %xmm7 #28.12
movaps %xmm6, %xmm5 #28.12
movsd %xmm0, %xmm7 #28.12
movaps %xmm6, %xmm4 #28.12
movaps %xmm6, %xmm3 #28.12
movaps %xmm6, %xmm2 #28.12
movaps %xmm6, %xmm1 #28.12
movaps %xmm6, %xmm0 #28.12
..B3.12: # Preds ..B3.12 ..B3.11
addpd (%rdx,%rdi,8), %xmm7 #31.9
addpd 16(%rdx,%rdi,8), %xmm6 #31.9
addpd 32(%rdx,%rdi,8), %xmm5 #31.9
addpd 48(%rdx,%rdi,8), %xmm4 #31.9
addpd 64(%rdx,%rdi,8), %xmm3 #31.9
addpd 80(%rdx,%rdi,8), %xmm2 #31.9
addpd 96(%rdx,%rdi,8), %xmm1 #31.9
addpd 112(%rdx,%rdi,8), %xmm0 #31.9
addq $16, %rdi #30.7
cmpq %rax, %rdi #30.7
jb ..B3.12 # Prob 82% #30.7
addpd %xmm6, %xmm7 #28.12
addpd %xmm4, %xmm5 #28.12
addpd %xmm2, %xmm3 #28.12
addpd %xmm0, %xmm1 #28.12
addpd %xmm5, %xmm7 #28.12
addpd %xmm1, %xmm3 #28.12
addpd %xmm3, %xmm7 #28.12
movaps %xmm7, %xmm0 #28.12
unpckhpd %xmm7, %xmm0 #28.12
addsd %xmm0, %xmm7 #28.12
movaps %xmm7, %xmm0 #28.12
..B3.14: # Preds ..B3.13 ..B3.26
lea 1(%rax), %rsi #30.7
cmpq %rsi, %rcx #30.7
jb ..B3.25 # Prob 50% #30.7
subq %rax, %rcx #30.7
cmpb $1, %r8b #30.7
jne ..B3.17 # Prob 50% #30.7
..B3.16: # Preds ..B3.17 ..B3.15
xorl %r8d, %r8d #30.7
jmp ..B3.21 # Prob 100% #30.7
..B3.17: # Preds ..B3.15
cmpq $2, %rcx #30.7
jl ..B3.16 # Prob 10% #30.7
movq %rcx, %r8 #30.7
xorl %edi, %edi #30.7
pxor %xmm1, %xmm1 #28.12
lea (%rdx,%rax,8), %rsi #31.19
andq $-2, %r8 #30.7
movsd %xmm0, %xmm1 #28.12
..B3.19: # Preds ..B3.19 ..B3.18
addpd (%rsi,%rdi,8), %xmm1 #31.9
addq $2, %rdi #30.7
cmpq %r8, %rdi #30.7
jb ..B3.19 # Prob 82% #30.7
movaps %xmm1, %xmm0 #28.12
unpckhpd %xmm1, %xmm0 #28.12
addsd %xmm0, %xmm1 #28.12
movaps %xmm1, %xmm0 #28.12
..B3.21: # Preds ..B3.20 ..B3.16
cmpq %rcx, %r8 #30.7
jae ..B3.25 # Prob 2% #30.7
lea (%rdx,%rax,8), %rax #31.19
..B3.23: # Preds ..B3.23 ..B3.22
addsd (%rax,%r8,8), %xmm0 #31.9
incq %r8 #30.7
cmpq %rcx, %r8 #30.7
jb ..B3.23 # Prob 82% #30.7
..B3.25: # Preds ..B3.23 ..B3.21 ..B3.14 ..B3.1
ret #32.14
..B3.26: # Preds ..B3.2 ..B3.6 ..B3.4 # Infreq
movb $1, %r8b #30.7
xorl %eax, %eax #30.7
jmp ..B3.14 # Prob 100% #30.7
iteratedLocal(double):
lea -8(%rsp), %rax #8.13
lea -40(%rsp), %rdx #7.11
cmpq %rax, %rdx #33.41
je ..B4.15 # Prob 50% #33.41
movq %rax, -48(%rsp) #32.12
movq %rdx, -56(%rsp) #32.12
xorl %eax, %eax #33.7
..B4.13: # Preds ..B4.11 ..B4.13
addsd -40(%rsp,%rax,8), %xmm0 #34.9
incq %rax #33.7
cmpq $4, %rax #33.7
jb ..B4.13 # Prob 82% #33.7
..B4.15: # Preds ..B4.13 ..B4.1
ret #35.14
To avoid misunderstandings. If counted loop condition would rely on external parameter like this one.
double countedDep(double param, Test & d)
{
for (int i = 0; i < d.size; i++)
param += d.arr[i];
return param;
}
Such loop also will not be unrolled.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Optimization of naive matrix multiplication (ICC vs GCC) - c++

I'm not sure if ICC really produced faster code in this case because I didn't run any actual benchmarks. But you can tell GCC to unroll the loops with -funroll-loops. The output will be longer, will contain lots of xmm's and will look faster.

Related

Why some of sse intrinsics introduce move back and forth?

Explanation of std::vector<int> sum ASM

openCL glGetProgramInfo causing Core Foundation crash

Performance of C++11 modern-style loops vs old-style loops

Can modern compilers unroll `for` loops expressed using begin and end iterators

Categories

Resources