Performance difference with custom iterator

Performance difference with custom iterator - c++

I was trying to implement an iterator over the pixels of a 3D image. The image is stored in a single vector, and I do index mapping to go from (x, y, z) to the vector index. The idea is to be able to do a range-based for on the image.
Of course, iterating over the pixels by using the vector iterators is really fast (0.6s in my case), but I don't have access to the indexes.
A triple for loop (z, y, x to be in the memory order) is as fast on VS2015, but way slower (2.4s) with gcc or clang (gcc 5.2, clang 3.9 as well as older versions).
I wrote an iterator that returns a tuple (x, y, z, value), and it's also around 2.4s, even with VS2015. The iterator contains a reference to the image and the three fields x, y, and z; the value of the pixel is obtained by using the same pixel_at(x, y, z) as the triple loop.
So, I have 2 questions:
Why is VS2015 so much faster than GCC and Clang? (answered in edit)
How can I achieve the faster time while still having access to x, y, z?
Here's the increment of the x/y/z iterator:
_x++;
if (_x == _im.width())
{
_x = 0;
_y++;
if (_y == _im.height())
{
_y = 0;
_z++;
}
}
The loops that are measured are the following:
Triple for-loop:
for (int k = 0; k < im.depth(); ++k)
for (int j = 0; j < im.height(); ++j)
for (int i = 0; i < im.width(); ++i)
sum += im.pixel_at(i, j, k);
Iterate over the vector:
// Walk the underlying pixel container directly; no coordinates are available here.
for (unsigned char value : im.pixels())
{
    sum += value;
}
Iterate with the custom iterator:
// Each element is a tuple; element 3 is the one accumulated into sum.
for (const auto& entry : im.indexes())
{
    sum += std::get<3>(entry);
}
The iterator implemented with a single index gives me the good performance (0.6s), but then I don't have access to x, y, z, and computing them on the fly is too slow (6s). Having x, y, z and the index in the iterator doesn't help either (2.4s).
The benchmark is 100 passes over the image, summing the values of the pixels, which are randomly initialized. For the cases where I have the indexes too, I made sure that the values of the indexes are read (for instance, by summing the z coordinates) so that the compiler doesn't optimize them away. It doesn't change anything, except for the case where I compute the indexes on the fly, where the computation was otherwise being optimized away.
The whole code can be found here: https://github.com/nitnelave/3DIterator
Thanks!
EDIT: After enabling link-time optimization (-flto), the triple for loop is as fast (0.19s) on g++ as the vector iterators (0.18s), but the custom iterator is still 6 times slower (1.2s). That's already much better than the 3.2s I was getting for both the for loop and the iterator, but we're not quite there yet.
EDIT: I isolated the loops inside functions which I forbade gcc to inline, to isolate the assembly code, and here it is:
Triple for loop:
push %r15
push %r14
push %r13
push %r12
push %rbp
push %rdi
push %rsi
push %rbx
sub $0x78,%rsp
mov 0x8(%rcx),%eax
pxor %xmm4,%xmm4
mov %rcx,%r12
movl $0x64,0x5c(%rsp)
xor %r13d,%r13d
xor %r14d,%r14d
mov %eax,0x58(%rsp)
mov 0x58(%rsp),%edx
test %edx,%edx
jle 10040163a <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2fa>
mov 0x4(%r12),%eax
movl $0x0,0x54(%rsp)
movl $0x0,0x2c(%rsp)
mov %eax,0x48(%rsp)
nopl 0x0(%rax,%rax,1)
nopw %cs:0x0(%rax,%rax,1)
mov 0x48(%rsp),%eax
test %eax,%eax
jle 10040161f <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2df>
movslq (%r12),%rax
mov 0x54(%rsp),%r10d
mov 0x2c(%rsp),%ebx
mov %rax,%rcx
mov %rax,0x40(%rsp)
add %rax,%rax
mov %rax,0x38(%rsp)
lea -0x1(%rcx),%eax
imul %ecx,%r10d
imul %eax,%ebx
mov %eax,0x50(%rsp)
movslq %r10d,%rsi
lea (%rsi,%rsi,1),%r9
mov %ebx,0x4c(%rsp)
xor %ebx,%ebx
xchg %ax,%ax
nopw %cs:0x0(%rax,%rax,1)
test %ecx,%ecx
jle 100401605 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2c5>
mov 0x10(%r12),%rdx
lea (%rdx,%r9,1),%r8
mov %r8,%rax
and $0xf,%eax
shr %rax
neg %rax
and $0x7,%eax
cmp %ecx,%eax
cmova %ecx,%eax
cmp $0xa,%ecx
cmovbe %ecx,%eax
test %eax,%eax
je 100401690 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x350>
movswl (%r8),%r8d
add %r8d,%r14d
cmp $0x1,%eax
je 100401720 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x3e0>
movswl 0x2(%rdx,%r9,1),%r8d
add %r8d,%r14d
cmp $0x2,%eax
je 100401710 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x3d0>
movswl 0x4(%rdx,%r9,1),%r8d
add %r8d,%r14d
cmp $0x3,%eax
je 100401700 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x3c0>
movswl 0x6(%rdx,%r9,1),%r8d
add %r8d,%r14d
cmp $0x4,%eax
je 1004016f0 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x3b0>
movswl 0x8(%rdx,%r9,1),%r8d
add %r8d,%r14d
cmp $0x5,%eax
je 1004016e0 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x3a0>
movswl 0xa(%rdx,%r9,1),%r8d
add %r8d,%r14d
cmp $0x6,%eax
je 1004016d0 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x390>
movswl 0xc(%rdx,%r9,1),%r8d
add %r8d,%r14d
cmp $0x7,%eax
je 1004016c0 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x380>
movswl 0xe(%rdx,%r9,1),%r8d
add %r8d,%r14d
cmp $0x8,%eax
je 1004016b0 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x370>
movswl 0x10(%rdx,%r9,1),%r8d
add %r8d,%r14d
cmp $0xa,%eax
jne 1004016a0 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x360>
movswl 0x12(%rdx,%r9,1),%r8d
add %r8d,%r14d
mov $0xa,%r8d
cmp %ecx,%eax
je 1004015fb <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2bb>
mov %eax,%r15d
mov %ecx,%edi
sub %eax,%edi
mov %r15,0x30(%rsp)
mov 0x50(%rsp),%r15d
lea -0x8(%rdi),%r11d
sub %eax,%r15d
shr $0x3,%r11d
add $0x1,%r11d
cmp $0x6,%r15d
lea 0x0(,%r11,8),%ebp
jbe 100401573 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x233>
mov 0x30(%rsp),%r15
pxor %xmm0,%xmm0
xor %eax,%eax
add %rsi,%r15
lea (%rdx,%r15,2),%r15
movdqa (%r15),%xmm1
movdqa %xmm4,%xmm3
add $0x1,%eax
add $0x10,%r15
pcmpgtw %xmm1,%xmm3
movdqa %xmm1,%xmm2
cmp %eax,%r11d
punpcklwd %xmm3,%xmm2
punpckhwd %xmm3,%xmm1
paddd %xmm2,%xmm0
paddd %xmm1,%xmm0
ja 10040151a <_Z17bench_triple_loopRK7Image3D.constprop.5+0x1da>
movdqa %xmm0,%xmm1
add %ebp,%r8d
psrldq $0x8,%xmm1
paddd %xmm1,%xmm0
movdqa %xmm0,%xmm1
psrldq $0x4,%xmm1
paddd %xmm1,%xmm0
movd %xmm0,%eax
add %eax,%r14d
cmp %ebp,%edi
je 1004015fb <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2bb>
lea (%r10,%r8,1),%eax
cltq
movswl (%rdx,%rax,2),%eax
add %eax,%r14d
lea 0x1(%r8),%eax
cmp %eax,%ecx
jle 1004015fb <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2bb>
add %r10d,%eax
cltq
movswl (%rdx,%rax,2),%eax
add %eax,%r14d
lea 0x2(%r8),%eax
cmp %eax,%ecx
jle 1004015fb <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2bb>
add %r10d,%eax
cltq
movswl (%rdx,%rax,2),%eax
add %eax,%r14d
lea 0x3(%r8),%eax
cmp %eax,%ecx
jle 1004015fb <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2bb>
add %r10d,%eax
cltq
movswl (%rdx,%rax,2),%eax
add %eax,%r14d
lea 0x4(%r8),%eax
cmp %eax,%ecx
jle 1004015fb <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2bb>
add %r10d,%eax
cltq
movswl (%rdx,%rax,2),%eax
add %eax,%r14d
lea 0x5(%r8),%eax
cmp %eax,%ecx
jle 1004015fb <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2bb>
add %r10d,%eax
add $0x6,%r8d
cltq
movswl (%rdx,%rax,2),%eax
add %eax,%r14d
cmp %r8d,%ecx
jle 1004015fb <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2bb>
add %r10d,%r8d
movslq %r8d,%r8
movswl (%rdx,%r8,2),%eax
add %eax,%r14d
add 0x2c(%rsp),%r13d
add 0x4c(%rsp),%r13d
add $0x1,%ebx
add 0x38(%rsp),%r9
add 0x40(%rsp),%rsi
add %ecx,%r10d
cmp %ebx,0x48(%rsp)
jne 1004013f0 <_Z17bench_triple_loopRK7Image3D.constprop.5+0xb0>
addl $0x1,0x2c(%rsp)
mov 0x48(%rsp),%ebx
mov 0x2c(%rsp),%eax
add %ebx,0x54(%rsp)
cmp 0x58(%rsp),%eax
jne 1004013a0 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x60>
subl $0x1,0x5c(%rsp)
jne 10040136c <_Z17bench_triple_loopRK7Image3D.constprop.5+0x2c>
mov 0x2a64(%rip),%rcx # 1004040b0 <__fu0__ZSt4cout>
mov %r13d,%eax
mov %r14d,%r13d
mov %eax,%edx
callq 100401828 <_ZNSolsEi>
lea 0x6f(%rsp),%rdx
mov $0x1,%r8d
mov %rax,%rcx
movb $0xa,0x6f(%rsp)
callq 100401800 <_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l>
mov %r13d,%eax
add $0x78,%rsp
pop %rbx
pop %rsi
pop %rdi
pop %rbp
pop %r12
pop %r13
pop %r14
pop %r15
retq
nop
nopw %cs:0x0(%rax,%rax,1)
xor %r8d,%r8d
jmpq 1004014da <_Z17bench_triple_loopRK7Image3D.constprop.5+0x19a>
nopl 0x0(%rax,%rax,1)
mov $0x9,%r8d
jmpq 1004014d2 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x192>
nopl 0x0(%rax,%rax,1)
mov $0x8,%r8d
jmpq 1004014d2 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x192>
nopl 0x0(%rax,%rax,1)
mov $0x7,%r8d
jmpq 1004014d2 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x192>
nopl 0x0(%rax,%rax,1)
mov $0x6,%r8d
jmpq 1004014d2 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x192>
nopl 0x0(%rax,%rax,1)
mov $0x5,%r8d
jmpq 1004014d2 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x192>
nopl 0x0(%rax,%rax,1)
mov $0x4,%r8d
jmpq 1004014d2 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x192>
nopl 0x0(%rax,%rax,1)
mov $0x3,%r8d
jmpq 1004014d2 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x192>
nopl 0x0(%rax,%rax,1)
mov $0x2,%r8d
jmpq 1004014d2 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x192>
nopl 0x0(%rax,%rax,1)
mov $0x1,%r8d
jmpq 1004014d2 <_Z17bench_triple_loopRK7Image3D.constprop.5+0x192>
nopl 0x0(%rax,%rax,1)
Iteration over the vector:
push %r14
push %r13
push %r12
push %rbp
push %rdi
push %rsi
push %rbx
mov 0x10(%rcx),%r8
mov 0x18(%rcx),%r10
mov $0x64,%ebx
pxor %xmm4,%xmm4
lea 0x2(%r8),%r12
mov %r10,%rdi
mov %r8,%rbp
and $0xf,%ebp
sub %r12,%rdi
shr %rbp
shr %rdi
neg %rbp
lea 0x1(%rdi),%r11
and $0x7,%ebp
cmp %r11,%rbp
cmova %r11,%rbp
xor %eax,%eax
xchg %ax,%ax
nopw %cs:0x0(%rax,%rax,1)
cmp %r8,%r10
je 1004012dc <_Z10bench_iterRK7Image3D.constprop.4+0x1fc>
cmp $0xa,%r11
mov %r11,%rcx
ja 1004012f0 <_Z10bench_iterRK7Image3D.constprop.4+0x210>
movswl (%r8),%edx
add %edx,%eax
cmp $0x1,%rcx
je 100401300 <_Z10bench_iterRK7Image3D.constprop.4+0x220>
movswl 0x2(%r8),%edx
add %edx,%eax
cmp $0x2,%rcx
lea 0x4(%r8),%rdx
je 1004011ed <_Z10bench_iterRK7Image3D.constprop.4+0x10d>
movswl 0x4(%r8),%edx
add %edx,%eax
cmp $0x3,%rcx
lea 0x6(%r8),%rdx
je 1004011ed <_Z10bench_iterRK7Image3D.constprop.4+0x10d>
movswl 0x6(%r8),%edx
add %edx,%eax
cmp $0x4,%rcx
lea 0x8(%r8),%rdx
je 1004011ed <_Z10bench_iterRK7Image3D.constprop.4+0x10d>
movswl 0x8(%r8),%edx
add %edx,%eax
cmp $0x5,%rcx
lea 0xa(%r8),%rdx
je 1004011ed <_Z10bench_iterRK7Image3D.constprop.4+0x10d>
movswl 0xa(%r8),%edx
add %edx,%eax
cmp $0x6,%rcx
lea 0xc(%r8),%rdx
je 1004011ed <_Z10bench_iterRK7Image3D.constprop.4+0x10d>
movswl 0xc(%r8),%edx
add %edx,%eax
cmp $0x7,%rcx
lea 0xe(%r8),%rdx
je 1004011ed <_Z10bench_iterRK7Image3D.constprop.4+0x10d>
movswl 0xe(%r8),%edx
add %edx,%eax
cmp $0x8,%rcx
lea 0x10(%r8),%rdx
je 1004011ed <_Z10bench_iterRK7Image3D.constprop.4+0x10d>
movswl 0x10(%r8),%edx
add %edx,%eax
cmp $0xa,%rcx
lea 0x12(%r8),%rdx
jne 1004011ed <_Z10bench_iterRK7Image3D.constprop.4+0x10d>
movswl 0x12(%r8),%edx
add %edx,%eax
lea 0x14(%r8),%rdx
cmp %rcx,%r11
je 1004012dc <_Z10bench_iterRK7Image3D.constprop.4+0x1fc>
mov %r11,%rsi
mov %rdi,%r14
sub %rcx,%rsi
sub %rcx,%r14
lea -0x8(%rsi),%r9
shr $0x3,%r9
add $0x1,%r9
cmp $0x6,%r14
lea 0x0(,%r9,8),%r13
jbe 10040127d <_Z10bench_iterRK7Image3D.constprop.4+0x19d>
pxor %xmm0,%xmm0
lea (%r8,%rcx,2),%r14
xor %ecx,%ecx
movdqa (%r14),%xmm1
add $0x1,%rcx
movdqa %xmm4,%xmm2
add $0x10,%r14
movdqa %xmm1,%xmm3
cmp %rcx,%r9
pcmpgtw %xmm1,%xmm2
punpcklwd %xmm2,%xmm3
punpckhwd %xmm2,%xmm1
paddd %xmm3,%xmm0
paddd %xmm1,%xmm0
ja 100401226 <_Z10bench_iterRK7Image3D.constprop.4+0x146>
movdqa %xmm0,%xmm1
lea (%rdx,%r13,2),%rdx
psrldq $0x8,%xmm1
paddd %xmm1,%xmm0
movdqa %xmm0,%xmm1
psrldq $0x4,%xmm1
paddd %xmm1,%xmm0
movd %xmm0,%ecx
add %ecx,%eax
cmp %r13,%rsi
je 1004012dc <_Z10bench_iterRK7Image3D.constprop.4+0x1fc>
movswl (%rdx),%ecx
add %ecx,%eax
lea 0x2(%rdx),%rcx
cmp %rcx,%r10
je 1004012dc <_Z10bench_iterRK7Image3D.constprop.4+0x1fc>
movswl 0x2(%rdx),%ecx
add %ecx,%eax
lea 0x4(%rdx),%rcx
cmp %rcx,%r10
je 1004012dc <_Z10bench_iterRK7Image3D.constprop.4+0x1fc>
movswl 0x4(%rdx),%ecx
add %ecx,%eax
lea 0x6(%rdx),%rcx
cmp %rcx,%r10
je 1004012dc <_Z10bench_iterRK7Image3D.constprop.4+0x1fc>
movswl 0x6(%rdx),%ecx
add %ecx,%eax
lea 0x8(%rdx),%rcx
cmp %rcx,%r10
je 1004012dc <_Z10bench_iterRK7Image3D.constprop.4+0x1fc>
movswl 0x8(%rdx),%ecx
add %ecx,%eax
lea 0xa(%rdx),%rcx
cmp %rcx,%r10
je 1004012dc <_Z10bench_iterRK7Image3D.constprop.4+0x1fc>
movswl 0xa(%rdx),%ecx
add %ecx,%eax
lea 0xc(%rdx),%rcx
cmp %rcx,%r10
je 1004012dc <_Z10bench_iterRK7Image3D.constprop.4+0x1fc>
movswl 0xc(%rdx),%edx
add %edx,%eax
sub $0x1,%ebx
jne 100401130 <_Z10bench_iterRK7Image3D.constprop.4+0x50>
pop %rbx
pop %rsi
pop %rdi
pop %rbp
pop %r12
pop %r13
pop %r14
retq
test %rbp,%rbp
jne 100401308 <_Z10bench_iterRK7Image3D.constprop.4+0x228>
xor %ecx,%ecx
mov %r8,%rdx
jmpq 1004011f6 <_Z10bench_iterRK7Image3D.constprop.4+0x116>
nop
mov %r12,%rdx
jmpq 1004011ed <_Z10bench_iterRK7Image3D.constprop.4+0x10d>
mov %rbp,%rcx
jmpq 100401146 <_Z10bench_iterRK7Image3D.constprop.4+0x66>
Custom iterator:
push %r14
push %rbp
push %rdi
push %rsi
push %rbx
sub $0x30,%rsp
mov (%rcx),%r10d
mov $0x64,%ebp
xor %ebx,%ebx
mov %rcx,%rdi
mov 0x4(%rcx),%ecx
xor %edx,%edx
mov 0x8(%rdi),%esi
nop
xor %r9d,%r9d
xor %r11d,%r11d
xor %r8d,%r8d
nopl 0x0(%rax)
cmp %r9d,%esi
je 1004017b0 <_Z16bench_index_iterRK7Image3D.constprop.0+0x80>
mov %ecx,%eax
mov 0x10(%rdi),%r14
add %r9d,%edx
imul %r9d,%eax
add %r11d,%eax
imul %r10d,%eax
add %r8d,%eax
add $0x1,%r8d
cltq
movswl (%r14,%rax,2),%eax
add %eax,%ebx
cmp %r10d,%r8d
jne 100401760 <_Z16bench_index_iterRK7Image3D.constprop.0+0x30>
add $0x1,%r11d
xor %r8d,%r8d
cmp %r11d,%ecx
jne 100401760 <_Z16bench_index_iterRK7Image3D.constprop.0+0x30>
add $0x1,%r9d
xor %r11d,%r11d
cmp %r9d,%esi
jne 100401765 <_Z16bench_index_iterRK7Image3D.constprop.0+0x35>
nopw %cs:0x0(%rax,%rax,1)
test %r11d,%r11d
jne 100401765 <_Z16bench_index_iterRK7Image3D.constprop.0+0x35>
test %r8d,%r8d
jne 100401765 <_Z16bench_index_iterRK7Image3D.constprop.0+0x35>
sub $0x1,%ebp
jne 100401750 <_Z16bench_index_iterRK7Image3D.constprop.0+0x20>
mov 0x28ea(%rip),%rcx # 1004040b0 <__fu0__ZSt4cout>
callq 100401828 <_ZNSolsEi>
lea 0x2f(%rsp),%rdx
mov $0x1,%r8d
mov %rax,%rcx
movb $0xa,0x2f(%rsp)
callq 100401800 <_ZSt16__ostream_insertIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_PKS3_l>
mov %ebx,%eax
add $0x30,%rsp
pop %rbx
pop %rsi
pop %rdi
pop %rbp
pop %r14
retq
So it seems that there is loop unrolling for the first 2 benches, but not for the custom iterator.
EDIT: I also tried using the boost::asio::coroutines, but it's slower than my solution.

Why is it slower
As you show nicely in the disassembly, the compiler generates vastly different code for the different loops. The main disadvantage of the machine code generated for the custom iterator is that it does not use SIMD instructions.
Compiler optimizations are incredibly complex. You could spend days just looking at the intermediate optimization steps. Also the performance varies between different compilers (and compiler versions). In my tests clang was the only compiler that generated fast SIMD code for your iterator.
You already have one thing perfectly right: Measure. If performance of that part matters for you, you have to measure it.
How to speed it up
The basic idea is to help the compiler with the loop, for instance with the end condition. The compiler needs to be able to prove that it doesn't change. You can try to keep a local const copy of the width/height of the image. Unfortunately, I haven't been able to achieve a significant speedup on your example code.
Another issue is the end iterator. You need to check three conditions in each iteration, even though only one (z) matters. That brings us to alternative loop end conditions such as sentinels.
range-v3
With range-v3 one could potentially implement more efficient and elegant loops. I made a quick attempt at your code:
#pragma once
#include "Image3D.h"
#include <range/v3/all.hpp>
/// View over every pixel of a 3D image, visited with x advancing fastest,
/// then y, then z.
///
/// ranges::view_facade drives the traversal through the private get() / done() /
/// next() hooks below, which it reaches through the range_access friend declaration.
///
/// @tparam Image  Image type. The code below relies on Image::data_t,
///                pixel_at(x, y, z), width(), height() and depth().
template <typename Image>
class range_3d : public ranges::view_facade<range_3d<Image>>
{
    using data_t = typename Image::data_t;
    friend ranges::range_access;

    // Current position inside the image.
    int x_ = 0;
    int y_ = 0;
    int z_ = 0;

    // Note: This cannot be a reference, because ranges needs a default constructor
    // If it doesn't get one, the compiler errors will haunt you in your dreams.
    // A default-constructed range holds no image (nullptr) and reports itself as
    // exhausted instead of reading through an uninitialized pointer.
    Image* im_ = nullptr;

    /// Returns the pixel at the current (x, y, z) position.
    data_t const& get() const
    {
        return im_->pixel_at(x_, y_, z_);
    }

    /// True once z has reached the image depth, or when no image is attached.
    bool done() const
    {
        return im_ == nullptr || z_ == im_->depth();
    }

    /// Advances one pixel: x wraps at width() and carries into y,
    /// y wraps at height() and carries into z.
    void next()
    {
        ++x_;
        if (x_ == im_->width())
        {
            x_ = 0;
            ++y_;
            if (y_ == im_->height())
            {
                y_ = 0;
                ++z_;
            }
        }
    }

public:
    /// Creates an empty range that is immediately done.
    range_3d() = default;

    /// Creates a range over @p im; the image must outlive the range,
    /// since only its address is stored.
    explicit range_3d(Image& im)
        : im_(&im)
    {
    }
};
// Usage: wrap an existing image in the view, then visit every pixel in order.
range_3d<Image3D> r(im);
// Each pixel value is handed to the lambda, which accumulates it into sum.
ranges::for_each(r, [&sum](unsigned char c) {
sum += c;
});
Unfortunately it still doesn't provide perfect performance, but at least the code is much nicer (as long as you don't get a compiler error).
Context matters
You have to keep in mind that the performance of the iterators may be very different when you actually make use of x/y/z. If your compiler cannot apply the "sum up the vector" optimization, the iterator will probably show performance closer to that of the other loops.
On-demand x/y/z computation
I still think you might want to consider not keeping the separate loop variables. Maybe you could even optimize for images with power-of-two sizes, or block the loop somehow, so that you can compute x/y/z very efficiently from the index using bit operations.
Zero-overhead abstractions
One of the properties I like most about C++ is that it allows you to create nice abstractions with absolutely no runtime overhead. It pains me that I / the compilers fail in this case. I feel this answer lacks a happy ending, so I'd encourage everyone to attempt this as well.

Related

What is the disassembler telling me?

My application is crashing, so I started it in debug mode, where I got a message that a write access violation is happening.
Additionally, Qt Creator pointed me to the disassembler, showing the following:
ntdll!RtlWaitOnAddress:
0x7fff9dfd3350 sub rsp,38h
0x7fff9dfd3354 <+ 4> mov eax,dword ptr [ntdll!NlsAnsiCodePage+0x1360 (00007fff`9e0d8b2c)]
0x7fff9dfd335a <+ 10> mov dword ptr [rsp+20h],eax
0x7fff9dfd335e <+ 14> call ntdll!RtlWaitOnAddress+0x254 (00007fff`9dfd35a4)
0x7fff9dfd3363 <+ 19> add rsp,38h
0x7fff9dfd3367 <+ 23> ret
0x7fff9dfd3368 <+ 24> int 3
0x7fff9dfd3369 <+ 25> int 3
0x7fff9dfd336a <+ 26> int 3
0x7fff9dfd336b <+ 27> int 3
0x7fff9dfd336c <+ 28> int 3
0x7fff9dfd336d <+ 29> int 3
0x7fff9dfd336e <+ 30> int 3
0x7fff9dfd336f <+ 31> int 3
0x7fff9dfd3370 <+ 32> mov qword ptr [rsp+18h],rbx
0x7fff9dfd3375 <+ 37> push rbp
0x7fff9dfd3376 <+ 38> push rsi
0x7fff9dfd3377 <+ 39> push rdi
0x7fff9dfd3378 <+ 40> push r12
0x7fff9dfd337a <+ 42> push r13
0x7fff9dfd337c <+ 44> push r14
0x7fff9dfd337e <+ 46> push r15
0x7fff9dfd3380 <+ 48> sub rsp,0A0h
0x7fff9dfd3387 <+ 55> mov rax,qword ptr [ntdll!KiUserInvertedFunctionTable+0x3010 (00007fff`9e0f2510)]
0x7fff9dfd338e <+ 62> xor rax,rsp
0x7fff9dfd3391 <+ 65> mov qword ptr [rsp+90h],rax
0x7fff9dfd3399 <+ 73> mov r13,qword ptr gs:[30h]
0x7fff9dfd33a2 <+ 82> lea rax,[ntdll!RtlNtdllName+0x470c8 (00007fff`9e0d44f8)]
0x7fff9dfd33a9 <+ 89> xor bpl,bpl
0x7fff9dfd33ac <+ 92> mov dword ptr [rsp+48h],edx
0x7fff9dfd33b0 <+ 96> xor esi,esi
0x7fff9dfd33b2 <+ 98> mov byte ptr [rsp+40h],bpl
0x7fff9dfd33b7 <+ 103> mov qword ptr [rsp+50h],r13
0x7fff9dfd33bc <+ 108> mov rbx,rcx
0x7fff9dfd33bf <+ 111> mov r12d,esi
0x7fff9dfd33c2 <+ 114> cmp rcx,rax
0x7fff9dfd33c5 <+ 117> je ntdll!RtlWaitOnAddress+0x1b4 (00007fff`9dfd3504)
0x7fff9dfd33cb <+ 123> cmp byte ptr [ntdll!NlsAnsiCodePage+0x2d3c (00007fff`9e0da508)],sil
0x7fff9dfd33d2 <+ 130> jne ntdll!RtlWaitOnAddress+0x230 (00007fff`9dfd3580)
0x7fff9dfd33d8 <+ 136> cmp dword ptr [ntdll!NlsAnsiCodePage+0x2d5c (00007fff`9e0da528)],esi
0x7fff9dfd33de <+ 142> jne ntdll!RtlWaitOnAddress+0x1fe (00007fff`9dfd354e)
0x7fff9dfd33e4 <+ 148> cmp byte ptr [ntdll!NlsAnsiCodePage+0x282c (00007fff`9e0d9ff8)],sil
0x7fff9dfd33eb <+ 155> lea rdi,[ntdll!NlsAnsiCodePage+0x2834 (00007fff`9e0da000)]
0x7fff9dfd33f2 <+ 162> cmovne rdi,rsi
0x7fff9dfd33f6 <+ 166> cmp qword ptr [rbx+18h],rsi
0x7fff9dfd33fa <+ 170> je ntdll!RtlWaitOnAddress+0x182 (00007fff`9dfd34d2)
0x7fff9dfd3400 <+ 176> mov rax,qword ptr [rbx]
0x7fff9dfd3403 <+ 179> cmp rax,0FFFFFFFFFFFFFFFFh
0x7fff9dfd3407 <+ 183> je ntdll!RtlWaitOnAddress+0x18f (00007fff`9dfd34df)
0x7fff9dfd340d <+ 189> mov r15d,esi
0x7fff9dfd3410 <+ 192> cmp rax,0FFFFFFFFFFFFFFFFh
0x7fff9dfd3414 <+ 196> je ntdll!RtlWaitOnAddress+0xc9 (00007fff`9dfd3419)
0x7fff9dfd3416 <+ 198> inc dword ptr [rax+24h]
0x7fff9dfd3419 <+ 201> mov r14,qword ptr [rbx+18h]
0x7fff9dfd341d <+ 205> lea r13,[ntdll!RtlNtdllName+0x470c8 (00007fff`9e0d44f8)]
0x7fff9dfd3424 <+ 212> mov ebp,1722h
0x7fff9dfd3429 <+ 217> call ntdll!RtlGetCurrentServiceSessionId (00007fff`9df94850)
0x7fff9dfd342e <+ 222> test eax,eax
0x7fff9dfd3430 <+ 224> jne ntdll!memset+0x1a8f4 (00007fff`9e02e6f4)
0x7fff9dfd3436 <+ 230> mov ecx,offset SharedUserData+0x382 (00000000`7ffe0382)
0x7fff9dfd343b <+ 235> cmp byte ptr [rcx],0
0x7fff9dfd343e <+ 238> jne ntdll!memset+0x1a910 (00007fff`9e02e710)
0x7fff9dfd3444 <+ 244> cmp r14,0FFFFFFFFFFFFFFFFh
0x7fff9dfd3448 <+ 248> jne ntdll!memset+0x1a9a8 (00007fff`9e02e7a8)
0x7fff9dfd344e <+ 254> mov r9,rdi
0x7fff9dfd3451 <+ 257> mov dword ptr [rsp+20h],0
0x7fff9dfd3459 <+ 265> mov r8d,4
0x7fff9dfd345f <+ 271> lea rdx,[rsp+48h]
0x7fff9dfd3464 <+ 276> lea rcx,[rbx+8]
0x7fff9dfd3468 <+ 280> call ntdll!RtlWaitOnAddress+0x254 (00007fff`9dfd35a4)
0x7fff9dfd346d <+ 285> cmp eax,102h
0x7fff9dfd3472 <+ 290> je ntdll!memset+0x1a9bb (00007fff`9e02e7bb)
0x7fff9dfd3478 <+ 296> mov ecx,dword ptr [rbx+8]
0x7fff9dfd347b <+ 299> mov dword ptr [rsp+48h],ecx
0x7fff9dfd347f <+ 303> test cl,2
0x7fff9dfd3482 <+ 306> jne ntdll!RtlWaitOnAddress+0xfe (00007fff`9dfd344e)
0x7fff9dfd3484 <+ 308> cmp eax,102h
0x7fff9dfd3489 <+ 313> je ntdll!memset+0x1a9bb (00007fff`9e02e7bb)
0x7fff9dfd348f <+ 319> movzx ebp,byte ptr [rsp+40h]
0x7fff9dfd3494 <+ 324> mov r13,qword ptr [rsp+50h]
0x7fff9dfd3499 <+ 329> test eax,eax
0x7fff9dfd349b <+ 331> js ntdll!memset+0x1ab19 (00007fff`9e02e919)
0x7fff9dfd34a1 <+ 337> test bpl,bpl
0x7fff9dfd34a4 <+ 340> jne ntdll!RtlWaitOnAddress+0x1c9 (00007fff`9dfd3519)
0x7fff9dfd34a6 <+ 342> mov rcx,qword ptr [rsp+90h]
0x7fff9dfd34ae <+ 350> xor rcx,rsp
0x7fff9dfd34b1 <+ 353> call ntdll!RtlRetrieveNtUserPfn+0x110 (00007fff`9dffc230)
0x7fff9dfd34b6 <+ 358> mov rbx,qword ptr [rsp+0F0h]
0x7fff9dfd34be <+ 366> add rsp,0A0h
0x7fff9dfd34c5 <+ 373> pop r15
0x7fff9dfd34c7 <+ 375> pop r14
0x7fff9dfd34c9 <+ 377> pop r13
0x7fff9dfd34cb <+ 379> pop r12
0x7fff9dfd34cd <+ 381> pop rdi
0x7fff9dfd34ce <+ 382> pop rsi
0x7fff9dfd34cf <+ 383> pop rbp
0x7fff9dfd34d0 <+ 384> ret
0x7fff9dfd34d1 <+ 385> int 3
0x7fff9dfd34d2 <+ 386> mov rcx,rbx
0x7fff9dfd34d5 <+ 389> call ntdll!RtlWaitOnAddress+0x6e0 (00007fff`9dfd3a30)
0x7fff9dfd34da <+ 394> jmp ntdll!RtlWaitOnAddress+0xb0 (00007fff`9dfd3400)
0x7fff9dfd34df <+ 399> mov ecx,dword ptr [rbx+20h]
0x7fff9dfd34e2 <+ 402> mov rax,0FFFFFFFFFFFFFFFFh
0x7fff9dfd34e9 <+ 409> bt rcx,18h
0x7fff9dfd34ee <+ 414> jb ntdll!RtlWaitOnAddress+0xbd (00007fff`9dfd340d)
0x7fff9dfd34f4 <+ 420> mov rcx,rbx
0x7fff9dfd34f7 <+ 423> call ntdll!LdrGetDllPath+0x3f0 (00007fff`9df81840)
0x7fff9dfd34fc <+ 428> mov rax,qword ptr [rbx]
0x7fff9dfd34ff <+ 431> jmp ntdll!RtlWaitOnAddress+0xbd (00007fff`9dfd340d)
0x7fff9dfd3504 <+ 436> mov byte ptr [rsp+40h],1
0x7fff9dfd3509 <+ 441> mov dword ptr [r13+1760h],1
0x7fff9dfd3514 <+ 452> jmp ntdll!RtlWaitOnAddress+0x7b (00007fff`9dfd33cb)
The debugger marked the following line as the current instruction:
`0x7fff9dfd3416 <+ 198> inc dword ptr [rax+24h]`
How should I deal with this?
I'm in a Qt5 project using Microsoft compiler.
In the application output terminal I see:
QCoreApplication::postEvent: Unexpected null receiver

Generating fast assembly for complex arithmetic in g++4.4.7

I have a very simple function:
/// Computes dd[i] = aa[i]*uu + bb[i]*vv + cc[i] for every i in [0, nn).
///
/// @param aa  First input array, scaled by @p uu.
/// @param bb  Second input array, scaled by @p vv.
/// @param cc  Third input array, added unscaled.
/// @param dd  Output array; receives nn results.
/// @param uu  Complex factor applied to each element of @p aa.
/// @param vv  Complex factor applied to each element of @p bb.
/// @param nn  Number of elements to process.
///
/// The __restrict__ qualifiers promise the compiler that the four arrays do not
/// alias each other; noinline prevents the function from being inlined into callers.
__attribute__((noinline))
void benchmark(cfloat* __restrict__ aa, cfloat* __restrict__ bb, cfloat* __restrict__ cc, cfloat* __restrict__ dd, cfloat uu, cfloat vv, size_t nn) {
    // The index uses the same unsigned type as nn, avoiding a signed/unsigned
    // comparison and the POSIX-only ssize_t.
    for (size_t ii = 0; ii < nn; ++ii) {
        dd[ii] = (
            aa[ii]*uu +
            bb[ii]*vv +
            cc[ii]
        );
    }
}
That generates very different assembly with g++4.4.7 depending on how I define my cfloat object.
First iteration, if I define my cfloat thusly:
/// Minimal complex number stored as two separate float fields.
struct cfloat {
    /// Builds a value from its real and imaginary components.
    cfloat(float real, float imag) : re(real), im(imag) {}
    float re;
    float im;
};

/// Component-wise complex addition.
cfloat operator +(cfloat a, cfloat b) {
    const float sum_re = a.re + b.re;
    const float sum_im = a.im + b.im;
    return cfloat(sum_re, sum_im);
}

/// Complex multiplication: (a.re + i*a.im) * (b.re + i*b.im).
cfloat operator *(cfloat a, cfloat b) {
    const float prod_re = a.re*b.re - a.im*b.im;
    const float prod_im = a.re*b.im + a.im*b.re;
    return cfloat(prod_re, prod_im);
}
generates this assembly for the benchmark function (compiled with g++ testcx.cc -O3 -o testcx):
0x00000000004006a0 <+0>: push %r15
0x00000000004006a2 <+2>: test %r8,%r8
0x00000000004006a5 <+5>: push %r14
0x00000000004006a7 <+7>: push %r13
0x00000000004006a9 <+9>: push %r12
0x00000000004006ab <+11>: push %rbp
0x00000000004006ac <+12>: push %rbx
0x00000000004006ad <+13>: movq %xmm0,-0x28(%rsp)
0x00000000004006b3 <+19>: mov %rdi,-0x38(%rsp)
0x00000000004006b8 <+24>: mov -0x28(%rsp),%rax
0x00000000004006bd <+29>: movq %xmm1,-0x28(%rsp)
0x00000000004006c3 <+35>: mov -0x28(%rsp),%r9
0x00000000004006c8 <+40>: je 0x4008a0 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+512>
0x00000000004006ce <+46>: mov %r9,%r15
0x00000000004006d1 <+49>: mov %rax,%r14
0x00000000004006d4 <+52>: xor %r11d,%r11d
0x00000000004006d7 <+55>: shr $0x20,%r15
0x00000000004006db <+59>: shr $0x20,%r14
0x00000000004006df <+63>: xor %r10d,%r10d
0x00000000004006e2 <+66>: mov %r15d,-0x2c(%rsp)
0x00000000004006e7 <+71>: xor %ebp,%ebp
0x00000000004006e9 <+73>: xor %ebx,%ebx
0x00000000004006eb <+75>: movss -0x2c(%rsp),%xmm6
0x00000000004006f1 <+81>: mov %r9d,-0x2c(%rsp)
0x00000000004006f6 <+86>: movss -0x2c(%rsp),%xmm5
0x00000000004006fc <+92>: mov %r14d,-0x2c(%rsp)
0x0000000000400701 <+97>: movss -0x2c(%rsp),%xmm4
0x0000000000400707 <+103>: mov %eax,-0x2c(%rsp)
0x000000000040070b <+107>: xor %r13d,%r13d
0x000000000040070e <+110>: xor %r12d,%r12d
0x0000000000400711 <+113>: movabs $0xffffffff00000000,%r9
0x000000000040071b <+123>: movss -0x2c(%rsp),%xmm3
0x0000000000400721 <+129>: nopl 0x0(%rax)
0x0000000000400728 <+136>: lea 0x0(,%r13,8),%rax
0x0000000000400730 <+144>: movaps %xmm6,%xmm1
0x0000000000400733 <+147>: movaps %xmm5,%xmm7
0x0000000000400736 <+150>: and $0xffffffff,%ebp
0x0000000000400739 <+153>: lea (%rsi,%rax,1),%r15
0x000000000040073d <+157>: lea (%rdx,%rax,1),%r14
0x0000000000400741 <+161>: add -0x38(%rsp),%rax
0x0000000000400746 <+166>: and $0xffffffff,%ebx
0x0000000000400749 <+169>: add $0x1,%r12
0x000000000040074d <+173>: movss (%r15),%xmm0
0x0000000000400752 <+178>: movss 0x4(%r15),%xmm2
0x0000000000400758 <+184>: mulss %xmm0,%xmm1
0x000000000040075c <+188>: mulss %xmm2,%xmm7
0x0000000000400760 <+192>: mulss %xmm5,%xmm0
0x0000000000400764 <+196>: mulss %xmm6,%xmm2
0x0000000000400768 <+200>: addss %xmm7,%xmm1
0x000000000040076c <+204>: movaps %xmm3,%xmm7
0x000000000040076f <+207>: subss %xmm2,%xmm0
0x0000000000400773 <+211>: movd %xmm1,-0x30(%rsp)
0x0000000000400779 <+217>: mov -0x30(%rsp),%edi
0x000000000040077d <+221>: movaps %xmm4,%xmm1
0x0000000000400780 <+224>: movd %xmm0,-0x30(%rsp)
0x0000000000400786 <+230>: mov %edi,%r15d
0x0000000000400789 <+233>: mov -0x30(%rsp),%edi
0x000000000040078d <+237>: movss (%rax),%xmm0
0x0000000000400791 <+241>: shl $0x20,%r15
0x0000000000400795 <+245>: movss 0x4(%rax),%xmm2
0x000000000040079a <+250>: mulss %xmm0,%xmm1
0x000000000040079e <+254>: or %r15,%rbp
0x00000000004007a1 <+257>: mulss %xmm2,%xmm7
0x00000000004007a5 <+261>: mov %edi,%r15d
0x00000000004007a8 <+264>: and %r9,%rbp
0x00000000004007ab <+267>: mulss %xmm3,%xmm0
0x00000000004007af <+271>: or %r15,%rbp
0x00000000004007b2 <+274>: mulss %xmm4,%xmm2
0x00000000004007b6 <+278>: addss %xmm7,%xmm1
0x00000000004007ba <+282>: subss %xmm2,%xmm0
0x00000000004007be <+286>: movd %xmm1,-0x30(%rsp)
0x00000000004007c4 <+292>: mov -0x30(%rsp),%edi
0x00000000004007c8 <+296>: movd %xmm0,-0x30(%rsp)
0x00000000004007ce <+302>: mov %edi,%eax
0x00000000004007d0 <+304>: mov -0x30(%rsp),%edi
0x00000000004007d4 <+308>: shl $0x20,%rax
0x00000000004007d8 <+312>: or %rax,%rbx
0x00000000004007db <+315>: and %r9,%rbx
0x00000000004007de <+318>: mov %edi,%eax
0x00000000004007e0 <+320>: or %rax,%rbx
0x00000000004007e3 <+323>: mov %r10,%rax
0x00000000004007e6 <+326>: mov %rbx,%rdi
0x00000000004007e9 <+329>: and $0xffffffff,%eax
0x00000000004007ec <+332>: shr $0x20,%rdi
0x00000000004007f0 <+336>: mov %edi,-0x20(%rsp)
0x00000000004007f4 <+340>: mov %rbp,%rdi
0x00000000004007f7 <+343>: shr $0x20,%rdi
0x00000000004007fb <+347>: movss -0x20(%rsp),%xmm0
0x0000000000400801 <+353>: mov %edi,-0x10(%rsp)
0x0000000000400805 <+357>: addss -0x10(%rsp),%xmm0
0x000000000040080b <+363>: mov %ebp,-0x10(%rsp)
0x000000000040080f <+367>: movss %xmm0,-0x20(%rsp)
0x0000000000400815 <+373>: mov -0x20(%rsp),%r10d
0x000000000040081a <+378>: mov %ebx,-0x20(%rsp)
0x000000000040081e <+382>: movss -0x20(%rsp),%xmm0
0x0000000000400824 <+388>: addss -0x10(%rsp),%xmm0
0x000000000040082a <+394>: shl $0x20,%r10
0x000000000040082e <+398>: or %rax,%r10
0x0000000000400831 <+401>: and %r9,%r10
0x0000000000400834 <+404>: movss %xmm0,-0x20(%rsp)
0x000000000040083a <+410>: mov -0x20(%rsp),%eax
0x000000000040083e <+414>: or %rax,%r10
0x0000000000400841 <+417>: mov %r11,%rax
0x0000000000400844 <+420>: mov %r10,%rdi
0x0000000000400847 <+423>: and $0xffffffff,%eax
0x000000000040084a <+426>: shr $0x20,%rdi
0x000000000040084e <+430>: mov %edi,-0x20(%rsp)
0x0000000000400852 <+434>: movss -0x20(%rsp),%xmm0
0x0000000000400858 <+440>: addss 0x4(%r14),%xmm0
0x000000000040085e <+446>: movss %xmm0,-0x20(%rsp)
0x0000000000400864 <+452>: mov -0x20(%rsp),%r11d
0x0000000000400869 <+457>: mov %r10d,-0x20(%rsp)
0x000000000040086e <+462>: movss -0x20(%rsp),%xmm0
0x0000000000400874 <+468>: addss (%r14),%xmm0
0x0000000000400879 <+473>: shl $0x20,%r11
0x000000000040087d <+477>: or %rax,%r11
0x0000000000400880 <+480>: and %r9,%r11
0x0000000000400883 <+483>: movss %xmm0,-0x20(%rsp)
0x0000000000400889 <+489>: mov -0x20(%rsp),%eax
0x000000000040088d <+493>: or %rax,%r11
0x0000000000400890 <+496>: cmp %r8,%r12
0x0000000000400893 <+499>: mov %r11,(%rcx,%r13,8)
0x0000000000400897 <+503>: mov %r12,%r13
0x000000000040089a <+506>: jne 0x400728 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+136>
0x00000000004008a0 <+512>: pop %rbx
0x00000000004008a1 <+513>: pop %rbp
0x00000000004008a2 <+514>: pop %r12
0x00000000004008a4 <+516>: pop %r13
0x00000000004008a6 <+518>: pop %r14
0x00000000004008a8 <+520>: pop %r15
0x00000000004008aa <+522>: retq
Which is about 133 instructions.
If I define the cfloat like this, with an array as the state:
/// Minimal complex number stored as a two-element array:
/// ri[0] is the real part, ri[1] the imaginary part.
struct cfloat {
    /// Builds a value from its real and imaginary components.
    cfloat(float real, float imag) {
        ri[0] = real;
        ri[1] = imag;
    }
    float ri[2];
};

/// Component-wise complex addition.
cfloat operator +(cfloat a, cfloat b) {
    const float sum_re = a.ri[0] + b.ri[0];
    const float sum_im = a.ri[1] + b.ri[1];
    return cfloat(sum_re, sum_im);
}

/// Complex multiplication of a and b.
cfloat operator *(cfloat a, cfloat b) {
    const float prod_re = a.ri[0]*b.ri[0] - a.ri[1]*b.ri[1];
    const float prod_im = a.ri[0]*b.ri[1] + a.ri[1]*b.ri[0];
    return cfloat(prod_re, prod_im);
}
It generates this assembly:
Dump of assembler code for function _Z9benchmarkP6cfloatS0_S0_S0_S_S_m:
0x00000000004006a0 <+0>: push %rbx
0x00000000004006a1 <+1>: movq %xmm0,-0x8(%rsp)
0x00000000004006a7 <+7>: mov -0x8(%rsp),%r9
0x00000000004006ac <+12>: movq %xmm1,-0x8(%rsp)
0x00000000004006b2 <+18>: mov -0x8(%rsp),%rax
0x00000000004006b7 <+23>: mov %r9d,-0xc(%rsp)
0x00000000004006bc <+28>: shr $0x20,%r9
0x00000000004006c0 <+32>: movss -0xc(%rsp),%xmm9
0x00000000004006c7 <+39>: mov %r9d,-0xc(%rsp)
0x00000000004006cc <+44>: movss -0xc(%rsp),%xmm8
0x00000000004006d3 <+51>: mov %eax,-0xc(%rsp)
0x00000000004006d7 <+55>: shr $0x20,%rax
0x00000000004006db <+59>: movss -0xc(%rsp),%xmm7
0x00000000004006e1 <+65>: test %r8,%r8
0x00000000004006e4 <+68>: mov %eax,-0xc(%rsp)
0x00000000004006e8 <+72>: movss -0xc(%rsp),%xmm6
0x00000000004006ee <+78>: je 0x400796 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+246>
0x00000000004006f4 <+84>: xor %eax,%eax
0x00000000004006f6 <+86>: xor %r9d,%r9d
0x00000000004006f9 <+89>: nopl 0x0(%rax)
0x0000000000400700 <+96>: shl $0x3,%rax
0x0000000000400704 <+100>: movaps %xmm7,%xmm0
0x0000000000400707 <+103>: lea (%rsi,%rax,1),%rbx
0x000000000040070b <+107>: movaps %xmm6,%xmm3
0x000000000040070e <+110>: lea (%rcx,%rax,1),%r10
0x0000000000400712 <+114>: lea (%rdx,%rax,1),%r11
0x0000000000400716 <+118>: lea (%rdi,%rax,1),%rax
0x000000000040071a <+122>: movss (%rbx),%xmm1
0x000000000040071e <+126>: add $0x1,%r9
0x0000000000400722 <+130>: movss 0x4(%rbx),%xmm5
0x0000000000400727 <+135>: mulss %xmm1,%xmm0
0x000000000040072b <+139>: mulss %xmm5,%xmm3
0x000000000040072f <+143>: movss (%rax),%xmm2
0x0000000000400733 <+147>: movaps %xmm8,%xmm10
0x0000000000400737 <+151>: mulss %xmm6,%xmm1
0x000000000040073b <+155>: movss 0x4(%rax),%xmm4
0x0000000000400740 <+160>: mulss %xmm7,%xmm5
0x0000000000400744 <+164>: mulss %xmm4,%xmm10
0x0000000000400749 <+169>: cmp %r8,%r9
0x000000000040074c <+172>: mov %r9,%rax
0x000000000040074f <+175>: subss %xmm3,%xmm0
0x0000000000400753 <+179>: movaps %xmm2,%xmm3
0x0000000000400756 <+182>: mulss %xmm9,%xmm4
0x000000000040075b <+187>: mulss %xmm9,%xmm3
0x0000000000400760 <+192>: addss %xmm5,%xmm1
0x0000000000400764 <+196>: mulss %xmm8,%xmm2
0x0000000000400769 <+201>: subss %xmm10,%xmm3
0x000000000040076e <+206>: addss %xmm4,%xmm2
0x0000000000400772 <+210>: addss %xmm3,%xmm0
0x0000000000400776 <+214>: addss %xmm2,%xmm1
0x000000000040077a <+218>: addss (%r11),%xmm0
0x000000000040077f <+223>: addss 0x4(%r11),%xmm1
0x0000000000400785 <+229>: movss %xmm0,(%r10)
0x000000000040078a <+234>: movss %xmm1,0x4(%r10)
0x0000000000400790 <+240>: jne 0x400700 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+96>
0x0000000000400796 <+246>: pop %rbx
0x0000000000400797 <+247>: retq
End of assembler dump.
Which is about 59 instructions. And, my benchmarks show, the first iteration is about 3x slower than the second.
I would prefer the separate real/imaginary fields, not least because having them as an array seems to break the vectorizer in Intel's compiler for some reason.
Is there any way I can convince gcc that these two classes are equivalent?

So I don't believe this, but if I specify an explicit copy constructor, the problem resolves itself:
// Single-precision complex value kept as two separate float fields.
struct cfloat {
// Builds a value from its real and imaginary parts.
cfloat(float re, float im) : re(re), im(im) {}
// User-provided copy constructor. It performs the same member-wise copy a
// compiler-generated one would; per the surrounding text, merely declaring it
// is what changes the code gcc emits for benchmark().
// NOTE(review): a user-provided copy constructor makes the type non-trivially
// copyable, so by-value cfloat arguments are presumably passed by address
// (the listing that follows reads them through %r8/%r9) -- confirm against the ABI.
cfloat(const cfloat& o) : re(o.re), im(o.im) {}
float re,im; // real part, imaginary part
};
Now generates the same assembly:
Dump of assembler code for function benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long):
0x0000000000400600 <+0>: mov 0x8(%rsp),%r10
0x0000000000400605 <+5>: test %r10,%r10
0x0000000000400608 <+8>: je 0x4006aa <benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long)+170>
0x000000000040060e <+14>: xor %eax,%eax
0x0000000000400610 <+16>: movss (%r9),%xmm8
0x0000000000400615 <+21>: movss 0x4(%r9),%xmm9
0x000000000040061b <+27>: movaps %xmm8,%xmm0
0x000000000040061f <+31>: movaps %xmm9,%xmm3
0x0000000000400623 <+35>: movss (%rsi,%rax,8),%xmm1
0x0000000000400628 <+40>: movss 0x4(%rsi,%rax,8),%xmm7
0x000000000040062e <+46>: mulss %xmm1,%xmm0
0x0000000000400632 <+50>: mulss %xmm7,%xmm3
0x0000000000400636 <+54>: movss (%r8),%xmm5
0x000000000040063b <+59>: movss 0x4(%r8),%xmm6
0x0000000000400641 <+65>: mulss %xmm9,%xmm1
0x0000000000400646 <+70>: movaps %xmm6,%xmm10
0x000000000040064a <+74>: mulss %xmm8,%xmm7
0x000000000040064f <+79>: movss (%rdi,%rax,8),%xmm2
0x0000000000400654 <+84>: subss %xmm3,%xmm0
0x0000000000400658 <+88>: movaps %xmm5,%xmm3
0x000000000040065b <+91>: movss 0x4(%rdi,%rax,8),%xmm4
0x0000000000400661 <+97>: mulss %xmm2,%xmm3
0x0000000000400665 <+101>: addss %xmm7,%xmm1
0x0000000000400669 <+105>: mulss %xmm4,%xmm10
0x000000000040066e <+110>: mulss %xmm6,%xmm2
0x0000000000400672 <+114>: mulss %xmm5,%xmm4
0x0000000000400676 <+118>: subss %xmm10,%xmm3
0x000000000040067b <+123>: addss %xmm4,%xmm2
0x000000000040067f <+127>: addss %xmm3,%xmm0
0x0000000000400683 <+131>: addss %xmm2,%xmm1
0x0000000000400687 <+135>: addss (%rdx,%rax,8),%xmm0
0x000000000040068c <+140>: addss 0x4(%rdx,%rax,8),%xmm1
0x0000000000400692 <+146>: movss %xmm0,(%rcx,%rax,8)
0x0000000000400697 <+151>: movss %xmm1,0x4(%rcx,%rax,8)
0x000000000040069d <+157>: add $0x1,%rax
0x00000000004006a1 <+161>: cmp %rax,%r10
0x00000000004006a4 <+164>: ja 0x400610 <benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long)+16>
0x00000000004006aa <+170>: repz retq
End of assembler dump.
Find me that in the spec.

You mentioned that you target Red Hat Enterprise Linux, and (in your deleted post) that newer compiler versions generate better code. You could use Developer Toolset to get a newer compiler, creating applications which are compatible with the rest of the operating system:
https://www.softwarecollections.org/en/scls/rhscl/devtoolset-6/
https://developers.redhat.com/products/developertoolset/overview/

gcc -O0 outperforming -O3 on matrix sizes that are powers of 2 (matrix transpositions)

(For testing purposes) I have written a simple function to calculate the transpose of an n×n matrix:
// Transposes a square matrix in place.
//   _n  number of rows and of columns
//   _A  the _n*_n elements, element (r, c) stored at _A[r*_n + c]
// Each element above the diagonal is exchanged with its mirror image below it.
void transpose(const size_t _n, double* _A) {
// NOTE(review): `uint` is not a standard C++ type (presumably the unsigned int
// typedef from <sys/types.h>) -- confirm. The -O0 listing below keeps i and j in
// 32-bit DWORD slots while _n is a 64-bit QWORD.
for(uint i=0; i < _n; ++i) {
// j starts at i+1, so only the strict upper triangle is visited and every
// off-diagonal pair is swapped exactly once.
for(uint j=i+1; j < _n; ++j) {
double tmp = _A[i*_n+j];
_A[i*_n+j] = _A[j*_n+i];
_A[j*_n+i] = tmp;
}
}
}
When using optimization levels O3 or Ofast I expected the compiler to unroll some loops which would lead to higher performance especially when the matrix size is a multiple of 2 (i.e., the double loop body can be performed each iteration) or similar. Instead what I measured was the exact opposite. Powers of 2 actually show a significant spike in execution time.
These spikes are also at regular intervals of 64, more pronounced at intervals of 128 and so on. Each spike extends to the neighboring matrix sizes like in the following table
size n time(us)
1020 2649
1021 2815
1022 3100
1023 5428
1024 15791
1025 6778
1026 3106
1027 2847
1028 2660
1029 3038
1030 2613
I compiled with a gcc version 4.8.2 but the same thing happens with a clang 3.5 so this might be some generic thing?
So my question basically is: Why is there this periodic increase in execution time? Is it some generic thing coming with any of the optimization options (as it happens with clang and gcc alike)? If so which optimization option is causing this?
And how can this be so significant that even the O0 version of the program outperforms the O3 version at multiples of 512?
EDIT: Note the magnitude of the spikes in this (logarithmic) plot. Transposing a 1024x1024 matrix with optimization actually takes as much time as transposing a 1300x1300 matrix without optimization. If this is a cache-fault / page-fault problem, then someone needs to explain to me why the memory layout is so significantly different for the optimized version of the program that it fails for powers of two, only to recover high performance for slightly larger matrices. Shouldn't cache-faults create more of a step-like pattern? Why do the execution times go down again at all? (And why should optimization create cache-faults that weren't there before?)
EDIT: the following should be the assembler codes that gcc produced
no optimization (O0):
_Z9transposemRPd:
.LFB0:
.cfi_startproc
push rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
mov rbp, rsp
.cfi_def_cfa_register 6
mov QWORD PTR [rbp-24], rdi
mov QWORD PTR [rbp-32], rsi
mov DWORD PTR [rbp-4], 0
jmp .L2
.L5:
mov eax, DWORD PTR [rbp-4]
add eax, 1
mov DWORD PTR [rbp-8], eax
jmp .L3
.L4:
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-4]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-8]
add rax, rcx
sal rax, 3
add rax, rdx
mov rax, QWORD PTR [rax]
mov QWORD PTR [rbp-16], rax
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-4]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-8]
add rax, rcx
sal rax, 3
add rdx, rax
mov rax, QWORD PTR [rbp-32]
mov rcx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
imul rax, QWORD PTR [rbp-24]
mov rsi, rax
mov eax, DWORD PTR [rbp-4]
add rax, rsi
sal rax, 3
add rax, rcx
mov rax, QWORD PTR [rax]
mov QWORD PTR [rdx], rax
mov rax, QWORD PTR [rbp-32]
mov rdx, QWORD PTR [rax]
mov eax, DWORD PTR [rbp-8]
imul rax, QWORD PTR [rbp-24]
mov rcx, rax
mov eax, DWORD PTR [rbp-4]
add rax, rcx
sal rax, 3
add rdx, rax
mov rax, QWORD PTR [rbp-16]
mov QWORD PTR [rdx], rax
add DWORD PTR [rbp-8], 1
.L3:
mov eax, DWORD PTR [rbp-8]
cmp rax, QWORD PTR [rbp-24]
jb .L4
add DWORD PTR [rbp-4], 1
.L2:
mov eax, DWORD PTR [rbp-4]
cmp rax, QWORD PTR [rbp-24]
jb .L5
pop rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size _Z9transposemRPd, .-_Z9transposemRPd
.ident "GCC: (Debian 4.8.2-15) 4.8.2"
.section .note.GNU-stack,"",@progbits
with optimization (O3)
_Z9transposemRPd:
.LFB0:
.cfi_startproc
push rbx
.cfi_def_cfa_offset 16
.cfi_offset 3, -16
xor r11d, r11d
xor ebx, ebx
.L2:
cmp r11, rdi
mov r9, r11
jae .L10
.p2align 4,,10
.p2align 3
.L7:
add ebx, 1
mov r11d, ebx
cmp rdi, r11
mov rax, r11
jbe .L2
mov r10, r9
mov r8, QWORD PTR [rsi]
mov edx, ebx
imul r10, rdi
.p2align 4,,10
.p2align 3
.L6:
lea rcx, [rax+r10]
add edx, 1
imul rax, rdi
lea rcx, [r8+rcx*8]
movsd xmm0, QWORD PTR [rcx]
add rax, r9
lea rax, [r8+rax*8]
movsd xmm1, QWORD PTR [rax]
movsd QWORD PTR [rcx], xmm1
movsd QWORD PTR [rax], xmm0
mov eax, edx
cmp rdi, rax
ja .L6
cmp r11, rdi
mov r9, r11
jb .L7
.L10:
pop rbx
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE0:
.size _Z9transposemRPd, .-_Z9transposemRPd
.ident "GCC: (Debian 4.8.2-15) 4.8.2"
.section .note.GNU-stack,"",@progbits

The periodic increase in execution time must be due to the cache being only N-way set-associative instead of fully associative. You are witnessing collisions in the cache-line selection scheme: addresses that map to the same cache set evict one another.
The fastest L1 cache has a smaller number of cache lines than the next level L2. In each level each cache line can be filled only from a limited set of sources.
Typical HW implementations of cache line selection algorithms will just use few bits from the memory address to determine in which cache slot the data should be written -- in HW bit shifts are free.
This causes a competition between memory ranges e.g. between addresses 0x300010 and 0x341010.
In a fully sequential algorithm this doesn't matter -- N is large enough for practically all algorithms of the form:
for (i=0;i<1000;i++) a[i] += b[i] * c[i] + d[i];
But when the number of the inputs (or outputs) gets larger, which happens internally when the algorithm is optimized, having one input in the cache forces another input out of the cache.
// one possible method of optimization with 2 outputs and 6 inputs
// with two unrelated execution paths -- should be faster, but maybe it isn't
for (i=0;i<500;i++) {
a[i] += b[i] * c[i] + d[i];
a[i+500] += b[i+500] * c[i+500] + d[i+500];
}
A graph in "Example 5: Cache Associativity" illustrates a 512-byte offset between matrix rows being the global worst-case dimension for that particular system. When this is known, a working mitigation is to over-allocate the matrix horizontally to some other dimension, e.g. char matrix[512][512 + 64].

The improvement in performance is likely related to CPU/RAM caching.
When the data is not a power of 2, a cache line load (like 16, 32, or 64 words) transfers more than the data that is required tying up the bus—uselessly as it turns out. For a data set which is a power of 2, all of the pre-fetched data is used.
I bet if you were to disable L1 and L2 caching, the performance would be completely smooth and predictable. But it would be much slower. Caching really helps performance!

Comment with code: In the -O3 case, with
#include <cstddef>  // size_t
#include <cstdlib>
#include <utility>  // std::swap -- declared here since C++11, not guaranteed via <cstdlib>

// Transposes the n-by-n matrix `a` in place.
//   n  number of rows and of columns
//   a  the n*n elements, element (r, c) stored at a[r * n + c]
// Only the strict upper triangle is walked (j starts at i + 1), so each
// off-diagonal pair is exchanged exactly once and the diagonal is left alone.
// n == 0 and n == 1 leave the storage untouched.
extern void transpose(const size_t n, double* a)
{
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = i + 1; j < n; ++j) {
            std::swap(a[i * n + j], a[j * n + i]); // or your expanded version.
        }
    }
}
compiling with
$ g++ --version
g++ (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1
...
$ g++ -g1 -std=c++11 -Wall -o test.S -S test.cpp -O3
I get
_Z9transposemPd:
.LFB68:
.cfi_startproc
.LBB2:
testq %rdi, %rdi
je .L1
leaq 8(,%rdi,8), %r10
xorl %r8d, %r8d
.LBB3:
addq $1, %r8
leaq -8(%r10), %rcx
cmpq %rdi, %r8
leaq (%rsi,%rcx), %r9
je .L1
.p2align 4,,10
.p2align 3
.L10:
movq %r9, %rdx
movq %r8, %rax
.p2align 4,,10
.p2align 3
.L5:
.LBB4:
movsd (%rdx), %xmm1
movsd (%rsi,%rax,8), %xmm0
movsd %xmm1, (%rsi,%rax,8)
.LBE4:
addq $1, %rax
.LBB5:
movsd %xmm0, (%rdx)
addq %rcx, %rdx
.LBE5:
cmpq %rdi, %rax
jne .L5
addq $1, %r8
addq %r10, %r9
addq %rcx, %rsi
cmpq %rdi, %r8
jne .L10
.L1:
rep ret
.LBE3:
.LBE2:
.cfi_endproc
And something quite different if I add -m32.
(Note: it makes no difference to the assembly whether I use std::swap or your variant)
In order to understand what is causing the spikes, though, you probably want to visualize the memory operations going on.

To add to others: g++ -std=c++11 -march=core2 -O3 -c -S - gcc version 4.8.2 (MacPorts gcc48 4.8.2_0) - x86_64-apple-darwin13.0.0 :
__Z9transposemPd:
LFB0:
testq %rdi, %rdi
je L1
leaq 8(,%rdi,8), %r10
xorl %r8d, %r8d
leaq -8(%r10), %rcx
addq $1, %r8
leaq (%rsi,%rcx), %r9
cmpq %rdi, %r8
je L1
.align 4,0x90
L10:
movq %r9, %rdx
movq %r8, %rax
.align 4,0x90
L5:
movsd (%rdx), %xmm0
movsd (%rsi,%rax,8), %xmm1
movsd %xmm0, (%rsi,%rax,8)
addq $1, %rax
movsd %xmm1, (%rdx)
addq %rcx, %rdx
cmpq %rdi, %rax
jne L5
addq $1, %r8
addq %r10, %r9
addq %rcx, %rsi
cmpq %rdi, %r8
jne L10
L1:
rep; ret
Basically the same as @ksfone's code, for:
#include <cstddef>
// In-place transpose of a square matrix; this is the source the listing above
// was generated from.
//   _n  number of rows and of columns
//   _A  the _n*_n elements, element (r, c) stored at _A[r*_n + c]
void transpose(const size_t _n, double* _A) {
// Both indices are size_t, the same type as _n.
for(size_t i=0; i < _n; ++i) {
// Start at i+1: visit only elements above the diagonal so that each pair is
// swapped once.
for(size_t j=i+1; j < _n; ++j) {
double tmp = _A[i*_n+j];
_A[i*_n+j] = _A[j*_n+i];
_A[j*_n+i] = tmp;
}
}
}
Apart from the Mach-O 'as' differences (extra underscore, align and DWARF locations), it's the same. But very different from the OP's assembly output. A much 'tighter' inner loop.

How to know if loop has optimization potential

My code spends a considerable amount of time in the following loop. I used gcc-4.8 with -O3 -march=native to compile the code. Since I am an absolute newbie in optimization, how do I know if the compiler did all it could? I am running on an AMD FX(tm)-6200.
// Element-wise product: Apsi[cell] = diag()[cell] * psi[cell] for every cell.
// All three pointers are __restrict__-qualified, i.e. the code promises the
// compiler that the arrays they address do not overlap.
float* __restrict__ ApsiPtr = Apsi.begin();
const float* const __restrict__ psiPtr = psi.begin();
const float* const __restrict__ diagPtr = diag().begin();
// NOTE(review): `label` is declared elsewhere -- presumably OpenFOAM's integer
// index type, given the Foam::lduMatrix::diag call in the dump below; confirm.
// `register` is only a hint; it was deprecated in C++11 and removed in C++17.
register const label nCells = diag().size();
for (register label cell=0; cell<nCells; cell++) {
ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
}
ddd dumps me the following assembler code.
Dump of assembler code from 0x7ffff1d32010 to 0x7ffff1d32110:¬
=> mov (%rsp),%rdi¬
callq 0x7ffff1ba5ca0 <_ZNK4Foam9lduMatrix4diagEv@plt>¬
mov 0x8(%rax),%edx¬
test %edx,%edx¬
mov %edx,%esi¬
mov %edx,0x34(%rsp)¬
jle 0x7ffff1d3251f ¬
mov 0x38(%rsp),%r10¬
lea 0x10(%rbx),%r8¬
lea 0x10(%rbp),%rax¬
lea 0x10(%r10),%rdi¬
cmp %rdi,%rbx¬
setae %r9b¬
cmp %r8,%r10¬
setae %r11b¬
or %r11d,%r9d¬
cmp %rax,%rbx¬
setae %dl¬
cmp %r8,%rbp¬
setae %cl¬
or %ecx,%edx¬
test %dl,%r9b¬
je 0x7ffff1d32728 ¬
cmp $0xc,%esi¬
jbe 0x7ffff1d32728 ¬
shr %esi¬
lea 0x40(%r10),%rax¬
mov %rbx,%rdx¬
lea -0x5(%rsi),%r8d¬
lea (%rsi,%rsi,1),%r9d¬
mov %esi,0x38(%rsp)¬
shr $0x2,%r8d¬
mov %r9d,0x4c(%rsp)¬
mov %rbp,%rsi¬
shl $0x6,%r8¬
mov $0x0,%r9d¬
lea 0x80(%r10,%r8,1),%r11¬
mov %r11,%rcx¬
sub %rax,%rcx¬
and $0x40,%ecx¬
movupd -0x40(%rax),%xmm0¬
movupd 0x0(%rbp),%xmm1¬
prefetcht0 0x1e0(%r10)¬
prefetcht0 0x1e0(%rbp)¬
prefetchw 0x1e0(%rbx)¬
mov $0x4,%r9d¬
mulpd %xmm1,%xmm0¬
lea 0x40(%rbp),%rsi¬
lea 0x40(%rbx),%rdx¬
mov %r10,0x40(%rsp)¬
movlpd %xmm0,(%rbx)¬
movhpd %xmm0,0x8(%rbx)¬
movupd -0x30(%rax),%xmm2¬
movupd 0x10(%rbp),%xmm3¬
mulpd %xmm3,%xmm2¬
movlpd %xmm2,0x10(%rbx)¬
movhpd %xmm2,0x18(%rbx)¬
movupd -0x20(%rax),%xmm4¬
movupd 0x20(%rbp),%xmm5¬
End of assembler dump.¬

How do I make GCC instantiate a class instance with non-trivial const/dest?

I am implementing a profiler. I want to use the Constructor/Destructor idiom to keep track of when I enter/exit a function.
A rough outline of my code is as follows:
// Scope-based timer: constructing an instance starts a measurement, and
// destroying it stops the measurement and logs the result.
// NOTE(review): no access specifier is given, so both members are private as
// written; the post calls this a rough outline -- presumably the real class
// declares them public.
class Profile
{
Profile(void); //Start timing
~Profile(void); //Stop timer and log
};
//...
// Game constructor; the intent is to time its whole body with a scoped Profile.
Game::Game(void) : m_Quit(false)
{
// NOTE(review): with the empty parentheses this line declares a function named
// `p` that returns Profile; it does not define a Profile object, so no
// constructor or destructor call is generated here.
Profile p();
InitalizeModules();
//...
}
However, when I run it, the Constructor and destructor are not being called. Even when I disassemble, there are no references to Profile::Profile(). I understood that the standard specifies that an instance with a non-trivial constructor cannot be optimized out by the compiler.
There are no optimization flags on the command line of either the compiler or the linker.
I also tried specifying __attribute__((used)), but to no avail.
Here is the disassembly:
(gdb) disassemble Ztk::Game::Game
Dump of assembler code for function Ztk::Game::Game():
0x00000000004cd798 <+0>: push %rbp
0x00000000004cd799 <+1>: mov %rsp,%rbp
0x00000000004cd79c <+4>: push %r12
0x00000000004cd79e <+6>: push %rbx
0x00000000004cd79f <+7>: sub $0x30,%rsp
0x00000000004cd7a3 <+11>: mov %rdi,-0x38(%rbp)
0x00000000004cd7a7 <+15>: mov -0x38(%rbp),%rax
0x00000000004cd7ab <+19>: mov %rax,%rdi
0x00000000004cd7ae <+22>: callq 0x4cdc6a <Ztk::Highlander<Ztk::Game, int>::Highlander()>
/** CALL SHOULD BE HERE **/
0x00000000004cd7b3 <+27>: mov -0x38(%rbp),%rax
0x00000000004cd7b7 <+31>: movb $0x0,(%rax)
0x00000000004cd7ba <+34>: callq 0x4e59f0 <Ztk::InitializeModules()>
Indeed there is code generated and linked into the executable
(gdb) disassemble Ztk::Profile::Profile(void)
Dump of assembler code for function Ztk::Profile::Profile():
0x0000000000536668 <+0>: push %rbp
0x0000000000536669 <+1>: mov %rsp,%rbp
0x000000000053666c <+4>: sub $0x20,%rsp
0x0000000000536670 <+8>: mov %rdi,-0x18(%rbp)
0x0000000000536674 <+12>: mov 0x8(%rbp),%rax
0x0000000000536678 <+16>: mov %rax,-0x8(%rbp)
0x000000000053667c <+20>: mov -0x8(%rbp),%rax
0x0000000000536680 <+24>: mov %rax,%rsi
0x0000000000536683 <+27>: mov $0x802440,%edi
0x0000000000536688 <+32>: callq 0x5363ca <Ztk::Profiler::FindNode(void*)>

Profile p();
What you've done here is declare a function named p that takes no arguments and returns an object of type Profile; no object is created, so no constructor or destructor runs. What you want is this:
Profile p;

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Performance difference with custom iterator - c++

Related

What is the disassembler telling me?

Generating fast assembly for complex arithmetic in g++4.4.7

gcc -O0 outperforming -O3 on matrix sizes that are powers of 2 (matrix transpositions)

How to know if loop has optimization potential

How do I make GCC instantiate a class instance with non-trivial const/dest?

Categories

Resources