How to know if loop has optimization potential

How to know if loop has optimization potential - c++

My code spends a considerable amount of time in the following loop. I used gcc-4.8 with -O3 -march=native to compile the code. Since I am an absolute newbie in optimization, how do I know if the compiler did all it could? I am running on a AMD FX(tm)-6200
float* __restrict__ ApsiPtr = Apsi.begin();
const float* const __restrict__ psiPtr = psi.begin();
const float* const __restrict__ diagPtr = diag().begin();
register const label nCells = diag().size();
for (register label cell=0; cell<nCells; cell++) {
ApsiPtr[cell] = diagPtr[cell]*psiPtr[cell];
}
ddd dumps me the following assembler code.
Dump of assembler code from 0x7ffff1d32010 to 0x7ffff1d32110:¬
=> mov (%rsp),%rdi¬
callq 0x7ffff1ba5ca0 <_ZNK4Foam9lduMatrix4diagEv#plt>¬
mov 0x8(%rax),%edx¬
test %edx,%edx¬
mov %edx,%esi¬
mov %edx,0x34(%rsp)¬
jle 0x7ffff1d3251f ¬
mov 0x38(%rsp),%r10¬
lea 0x10(%rbx),%r8¬
lea 0x10(%rbp),%rax¬
lea 0x10(%r10),%rdi¬
cmp %rdi,%rbx¬
setae %r9b¬
cmp %r8,%r10¬
setae %r11b¬
or %r11d,%r9d¬
cmp %rax,%rbx¬
setae %dl¬
cmp %r8,%rbp¬
setae %cl¬
or %ecx,%edx¬
test %dl,%r9b¬
je 0x7ffff1d32728 ¬
cmp $0xc,%esi¬
jbe 0x7ffff1d32728 ¬
shr %esi¬
lea 0x40(%r10),%rax¬
mov %rbx,%rdx¬
lea -0x5(%rsi),%r8d¬
lea (%rsi,%rsi,1),%r9d¬
mov %esi,0x38(%rsp)¬
shr $0x2,%r8d¬
mov %r9d,0x4c(%rsp)¬
mov %rbp,%rsi¬
shl $0x6,%r8¬
mov $0x0,%r9d¬
lea 0x80(%r10,%r8,1),%r11¬
mov %r11,%rcx¬
sub %rax,%rcx¬
and $0x40,%ecx¬
movupd -0x40(%rax),%xmm0¬
movupd 0x0(%rbp),%xmm1¬
prefetcht0 0x1e0(%r10)¬
prefetcht0 0x1e0(%rbp)¬
prefetchw 0x1e0(%rbx)¬
mov $0x4,%r9d¬
mulpd %xmm1,%xmm0¬
lea 0x40(%rbp),%rsi¬
lea 0x40(%rbx),%rdx¬
mov %r10,0x40(%rsp)¬
movlpd %xmm0,(%rbx)¬
movhpd %xmm0,0x8(%rbx)¬
movupd -0x30(%rax),%xmm2¬
movupd 0x10(%rbp),%xmm3¬
mulpd %xmm3,%xmm2¬
movlpd %xmm2,0x10(%rbx)¬
movhpd %xmm2,0x18(%rbx)¬
movupd -0x20(%rax),%xmm4¬
movupd 0x20(%rbp),%xmm5¬
End of assembler dump.¬

Related

vector bool compiler xor specialization?

I was thinking again about implementing the quadratic sieve for fun, which requires Guassian elimination over a binary field, that is the operations required are 1. swapping rows and 2. XORing rows.
My ideas were either to maintain a bit array using a vector of 64-bit ints and bit twiddling, or use vector<bool>, which is probably space-optimized on my system. The bit array must be able to be dynamically sized, so std::bitset won't work. The advantage of maintaining my own ints is that I can XOR 64 bits at a time which is a neat trick. I wanted to see what a compiler would do for a loop that XOR'd bool vectors: (I wasn't able to use ^=, see operator |= on std::vector<bool>)
void xor_vector(std::vector<bool>& a, std::vector<bool>& b) {
for (std::size_t i=0; i<a.size(); ++i)
a[i] = a[i] ^ b[i];
}
I have a very basic understanding of x86 but it looks like the compiler isn't actually XORing words together? Is there a way to get the compiler to XOR entire words at a time?
https://godbolt.org/z/PbGdv3sKT
xor_vector(std::vector<bool, std::allocator<bool> >&, std::vector<bool, std::allocator<bool> >&):
mov r8, QWORD PTR [rdi]
mov rax, QWORD PTR [rdi+16]
mov edx, DWORD PTR [rdi+24]
sub rax, r8
lea rdi, [rdx+rax*8]
test rdi, rdi
je .L11
push rbp
mov r10d, 1
push rbx
mov r9, QWORD PTR [rsi]
xor esi, esi
jmp .L7
.L16:
mov rdx, r10
sal rdx, cl
mov rcx, QWORD PTR [r11]
mov rbp, rdx
test rdx, rcx
setne bl
and rbp, QWORD PTR [rax]
setne bpl
.L4:
mov rax, rdx
not rdx
or rax, rcx
and rdx, rcx
cmp bpl, bl
cmovne rdx, rax
add rsi, 1
mov QWORD PTR [r11], rdx
cmp rsi, rdi
je .L15
.L7:
test rsi, rsi
lea rax, [rsi+63]
mov rdx, rsi
cmovns rax, rsi
sar rdx, 63
shr rdx, 58
sar rax, 6
lea rcx, [rsi+rdx]
sal rax, 3
and ecx, 63
lea r11, [r8+rax]
add rax, r9
sub rcx, rdx
jns .L16
add rcx, 64
mov rdx, r10
sal rdx, cl
mov rcx, QWORD PTR [r11-8]
mov rbp, rdx
test rcx, rdx
setne bl
and rbp, QWORD PTR [rax-8]
setne bpl
sub r11, 8
jmp .L4
.L15:
pop rbx
pop rbp
ret
.L11:
ret
My question is similar to bitwise operations on vector<bool> but the answers are dated and don't seem to answer my question.
Update: I tested with a 256 bit sized bitset too. Still I don't see XORing whole machine words.
void xor_vector(std::bitset<256>& a, std::bitset<256>& b) {
for (std::size_t i=0; i<a.size(); ++i)
a[i] = a[i] ^ b[i];
}
https://godbolt.org/z/jKEf89E1j
xor_vector(std::bitset<256ul>&, std::bitset<256ul>&):
push rbx
mov r8, rdi
mov r11, rsi
xor edx, edx
mov ebx, 1
.L4:
mov rsi, rdx
mov rcx, rdx
mov rax, rbx
shr rsi, 6
and ecx, 63
sal rax, cl
mov rdi, QWORD PTR [r8+rsi*8]
mov rcx, rax
and rcx, QWORD PTR [r11+rsi*8]
mov rcx, rax
setne r10b
test rax, rdi
not rax
setne r9b
or rcx, rdi
and rax, rdi
cmp r10b, r9b
cmovne rax, rcx
add rdx, 1
mov QWORD PTR [r8+rsi*8], rax
cmp rdx, 256
jne .L4
pop rbx
ret

Different assembly when rangifying a simple algorithm

When I was preparing supplementary info for this question, I noticed that “rangified” implementations of a very simple algorithm resulted in important differences (to my eyes) in the resulting assembly, compared with “legacy” implementations.
I expanded the tests a bit, with the following results (GCC 9.1 -O3):
Case 1. Simple for loop (https://godbolt.org/z/rAVaT2)
#include <vector>
void foo(std::vector<double> &u, std::vector<double> const &v)
{
for (std::size_t i = 0u; i < u.size(); ++i)
u[i] += v[i];
}
mov rdx, QWORD PTR [rdi]
mov rdi, QWORD PTR [rdi+8]
sub rdi, rdx
sar rdi, 3
je .L1
mov rcx, QWORD PTR [rsi]
lea rax, [rcx+15]
sub rax, rdx
cmp rax, 30
jbe .L7
lea rax, [rdi-1]
cmp rax, 1
jbe .L7
mov rsi, rdi
xor eax, eax
shr rsi
sal rsi, 4
.L4:
movupd xmm0, XMMWORD PTR [rcx+rax]
movupd xmm1, XMMWORD PTR [rdx+rax]
addpd xmm0, xmm1
movups XMMWORD PTR [rdx+rax], xmm0
add rax, 16
cmp rsi, rax
jne .L4
mov rsi, rdi
and rsi, -2
and edi, 1
je .L1
lea rax, [rdx+rsi*8]
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rcx+rsi*8]
movsd QWORD PTR [rax], xmm0
ret
.L7:
xor eax, eax
.L3:
movsd xmm0, QWORD PTR [rdx+rax*8]
addsd xmm0, QWORD PTR [rcx+rax*8]
movsd QWORD PTR [rdx+rax*8], xmm0
add rax, 1
cmp rdi, rax
jne .L3
.L1:
ret
Case 2. std::transform (https://godbolt.org/z/2iZaqo)
#include <algorithm>
#include <vector>
void foo(std::vector<double> &u, std::vector<double> const &v)
{
std::transform(std::begin(u), std::end(u),
std::begin(v),
std::begin(u),
std::plus());
}
mov rdx, QWORD PTR [rdi]
mov rax, QWORD PTR [rdi+8]
mov rsi, QWORD PTR [rsi]
cmp rax, rdx
je .L1
sub rax, 8
lea rcx, [rsi+15]
sub rax, rdx
sub rcx, rdx
shr rax, 3
cmp rcx, 30
jbe .L7
movabs rcx, 2305843009213693950
test rax, rcx
je .L7
lea rcx, [rax+1]
xor eax, eax
mov rdi, rcx
shr rdi
sal rdi, 4
.L4:
movupd xmm0, XMMWORD PTR [rdx+rax]
movupd xmm1, XMMWORD PTR [rsi+rax]
addpd xmm0, xmm1
movups XMMWORD PTR [rdx+rax], xmm0
add rax, 16
cmp rax, rdi
jne .L4
mov rdi, rcx
and rdi, -2
lea rax, [0+rdi*8]
add rdx, rax
add rsi, rax
cmp rcx, rdi
je .L1
movsd xmm0, QWORD PTR [rdx]
addsd xmm0, QWORD PTR [rsi]
movsd QWORD PTR [rdx], xmm0
ret
.L7:
xor ecx, ecx
.L3:
movsd xmm0, QWORD PTR [rdx+rcx*8]
addsd xmm0, QWORD PTR [rsi+rcx*8]
mov rdi, rcx
movsd QWORD PTR [rdx+rcx*8], xmm0
add rcx, 1
cmp rax, rdi
jne .L3
.L1:
ret
Case 3. Range-v3 view::zip (https://godbolt.org/z/0BEkfT)
#define RANGES_ASSERT(...) ((void)0)
#include <algorithm>
#include <range/v3/view/zip.hpp>
#include <vector>
void foo(std::vector<double> &u, std::vector<double> const &v)
{
auto w = ranges::view::zip(u, v);
std::for_each(std::begin(w), std::end(w),
[](auto &&x) { std::get<0u>(x) += std::get<1u>(x); });
}
mov rdx, QWORD PTR [rsi]
mov rsi, QWORD PTR [rsi+8]
mov rax, QWORD PTR [rdi]
mov rcx, QWORD PTR [rdi+8]
cmp rdx, rsi
je .L1
cmp rax, rcx
je .L1
.L3:
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rdx]
add rax, 8
add rdx, 8
movsd QWORD PTR [rax-8], xmm0
cmp rax, rcx
je .L1
cmp rdx, rsi
jne .L3
.L1:
ret
Case 4. cmcstl2 ranges::transform (https://godbolt.org/z/MjYO1G)
#include <experimental/ranges/algorithm>
#include <vector>
namespace std
{
namespace ranges = experimental::ranges;
}
void foo(std::vector<double> &u,s td::vector<double> const &v)
{
std::ranges::transform(std::ranges::begin(u), std::ranges::end(u),
std::ranges::begin(v), std::ranges::end(v),
std::ranges::begin(u),
std::plus());
}
mov r8, QWORD PTR [rsi+8]
mov rdx, QWORD PTR [rsi]
mov rax, QWORD PTR [rdi]
mov rcx, QWORD PTR [rdi+8]
cmp rdx, r8
je .L1
cmp rcx, rax
jne .L3
jmp .L1
.L16:
cmp rdx, r8
je .L1
.L3:
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rdx]
add rax, 8
add rdx, 8
movsd QWORD PTR [rax-8], xmm0
cmp rax, rcx
jne .L16
.L1:
ret
I can’t read assembly, but I seem to understand that the assemblies of Case 1 and Case 2 are almost equivalent and involve packed sums, whilst the assembly of the ranges versions (Cases 3 and 4) is much terser, but not vectorized.
I would really love to understand what those differences mean. Do my interpretation of the assembly make any sense? What are the additional instructions in the non-ranges versions? Why are there those differences?

Understanding what clang is doing in assembly, decrementing for a loop that is incrementing

Consider the following code, in C++:
#include <cstdlib>
std::size_t count(std::size_t n)
{
std::size_t i = 0;
while (i < n) {
asm volatile("": : :"memory");
++i;
}
return i;
}
int main(int argc, char* argv[])
{
return count(argc > 1 ? std::atoll(argv[1]) : 1);
}
It is just a loop that is incrementing its value, and returns it at the end. The asm volatile prevents the loop from being optimized away. We compile it under g++ 8.1 and clang++ 5.0 with the arguments -Wall -Wextra -std=c++11 -g -O3.
Now, if we look at what compiler explorer is producing, we have, for g++:
count(unsigned long):
mov rax, rdi
test rdi, rdi
je .L2
xor edx, edx
.L3:
add rdx, 1
cmp rax, rdx
jne .L3
.L2:
ret
main:
mov eax, 1
xor edx, edx
cmp edi, 1
jg .L25
.L21:
add rdx, 1
cmp rdx, rax
jb .L21
mov eax, edx
ret
.L25:
push rcx
mov rdi, QWORD PTR [rsi+8]
mov edx, 10
xor esi, esi
call strtoll
mov rdx, rax
test rax, rax
je .L11
xor edx, edx
.L12:
add rdx, 1
cmp rdx, rax
jb .L12
.L11:
mov eax, edx
pop rdx
ret
and for clang++:
count(unsigned long): # #count(unsigned long)
test rdi, rdi
je .LBB0_1
mov rax, rdi
.LBB0_3: # =>This Inner Loop Header: Depth=1
dec rax
jne .LBB0_3
mov rax, rdi
ret
.LBB0_1:
xor edi, edi
mov rax, rdi
ret
main: # #main
push rbx
cmp edi, 2
jl .LBB1_1
mov rdi, qword ptr [rsi + 8]
xor ebx, ebx
xor esi, esi
mov edx, 10
call strtoll
test rax, rax
jne .LBB1_3
mov eax, ebx
pop rbx
ret
.LBB1_1:
mov eax, 1
.LBB1_3:
mov rcx, rax
.LBB1_4: # =>This Inner Loop Header: Depth=1
dec rcx
jne .LBB1_4
mov rbx, rax
mov eax, ebx
pop rbx
ret
Understanding the code generated by g++, is not that complicated, the loop being:
.L3:
add rdx, 1
cmp rax, rdx
jne .L3
every iteration increments rdx, and compares it to rax that stores the size of the loop.
Now, I have no idea of what clang++ is doing. Apparently it uses dec, which is weird to me, and I don't even understand where the actual loop is. My question is the following: what is clang doing?
(I am looking for comments about the clang assembly code to describe what is done at each step and how it actually works).

The effect of the function is to return n, either by counting up to n and returning the result, or by simply returning the passed-in value of n. The clang code does the latter. The counting loop is here:
mov rax, rdi
.LBB0_3: # =>This Inner Loop Header: Depth=1
dec rax
jne .LBB0_3
mov rax, rdi
ret
It begins by copying the value of n into rax. It decrements the value in rax, and if the result is not 0, it jumps back to .LBB0_3. If the value is 0 it falls through to the next instruction, which copies the original value of n into rax and returns.
There is no i stored, but the code does the loop the prescribed number of times, and returns the value that i would have had, namely, n.

Generating fast assembly for complex arithmetic in g++4.4.7

I have a very simple function:
__attribute__((noinline))
void benchmark(cfloat* __restrict__ aa, cfloat* __restrict__ bb, cfloat* __restrict__ cc, cfloat* __restrict__ dd, cfloat uu, cfloat vv, size_t nn) {
for (ssize_t ii=0; ii < nn; ii++) {
dd[ii] = (
aa[ii]*uu +
bb[ii]*vv +
cc[ii]
);
}
}
That generates very different assembly with g++4.4.7 depending on how I define my cfloat object.
First iteration, if I define my cfloat thusly:
struct cfloat {
cfloat(float re, float im) : re(re), im(im) {}
float re,im;
};
cfloat operator +(cfloat a, cfloat b) {
return cfloat(a.re+b.re, a.im+b.im);
}
cfloat operator *(cfloat a, cfloat b) {
return cfloat(a.re*b.re-a.im*b.im, a.re*b.im+a.im*b.re);
}
generates this assembly for the benchmark function (compiled with g++ testcx.cc -O3 -o testcx:
0x00000000004006a0 <+0>: push %r15
0x00000000004006a2 <+2>: test %r8,%r8
0x00000000004006a5 <+5>: push %r14
0x00000000004006a7 <+7>: push %r13
0x00000000004006a9 <+9>: push %r12
0x00000000004006ab <+11>: push %rbp
0x00000000004006ac <+12>: push %rbx
0x00000000004006ad <+13>: movq %xmm0,-0x28(%rsp)
0x00000000004006b3 <+19>: mov %rdi,-0x38(%rsp)
0x00000000004006b8 <+24>: mov -0x28(%rsp),%rax
0x00000000004006bd <+29>: movq %xmm1,-0x28(%rsp)
0x00000000004006c3 <+35>: mov -0x28(%rsp),%r9
0x00000000004006c8 <+40>: je 0x4008a0 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+512>
0x00000000004006ce <+46>: mov %r9,%r15
0x00000000004006d1 <+49>: mov %rax,%r14
0x00000000004006d4 <+52>: xor %r11d,%r11d
0x00000000004006d7 <+55>: shr $0x20,%r15
0x00000000004006db <+59>: shr $0x20,%r14
0x00000000004006df <+63>: xor %r10d,%r10d
0x00000000004006e2 <+66>: mov %r15d,-0x2c(%rsp)
0x00000000004006e7 <+71>: xor %ebp,%ebp
0x00000000004006e9 <+73>: xor %ebx,%ebx
0x00000000004006eb <+75>: movss -0x2c(%rsp),%xmm6
0x00000000004006f1 <+81>: mov %r9d,-0x2c(%rsp)
0x00000000004006f6 <+86>: movss -0x2c(%rsp),%xmm5
0x00000000004006fc <+92>: mov %r14d,-0x2c(%rsp)
0x0000000000400701 <+97>: movss -0x2c(%rsp),%xmm4
0x0000000000400707 <+103>: mov %eax,-0x2c(%rsp)
0x000000000040070b <+107>: xor %r13d,%r13d
0x000000000040070e <+110>: xor %r12d,%r12d
0x0000000000400711 <+113>: movabs $0xffffffff00000000,%r9
0x000000000040071b <+123>: movss -0x2c(%rsp),%xmm3
0x0000000000400721 <+129>: nopl 0x0(%rax)
0x0000000000400728 <+136>: lea 0x0(,%r13,8),%rax
0x0000000000400730 <+144>: movaps %xmm6,%xmm1
0x0000000000400733 <+147>: movaps %xmm5,%xmm7
0x0000000000400736 <+150>: and $0xffffffff,%ebp
0x0000000000400739 <+153>: lea (%rsi,%rax,1),%r15
0x000000000040073d <+157>: lea (%rdx,%rax,1),%r14
0x0000000000400741 <+161>: add -0x38(%rsp),%rax
0x0000000000400746 <+166>: and $0xffffffff,%ebx
0x0000000000400749 <+169>: add $0x1,%r12
0x000000000040074d <+173>: movss (%r15),%xmm0
0x0000000000400752 <+178>: movss 0x4(%r15),%xmm2
0x0000000000400758 <+184>: mulss %xmm0,%xmm1
0x000000000040075c <+188>: mulss %xmm2,%xmm7
0x0000000000400760 <+192>: mulss %xmm5,%xmm0
0x0000000000400764 <+196>: mulss %xmm6,%xmm2
0x0000000000400768 <+200>: addss %xmm7,%xmm1
0x000000000040076c <+204>: movaps %xmm3,%xmm7
0x000000000040076f <+207>: subss %xmm2,%xmm0
0x0000000000400773 <+211>: movd %xmm1,-0x30(%rsp)
0x0000000000400779 <+217>: mov -0x30(%rsp),%edi
0x000000000040077d <+221>: movaps %xmm4,%xmm1
0x0000000000400780 <+224>: movd %xmm0,-0x30(%rsp)
0x0000000000400786 <+230>: mov %edi,%r15d
0x0000000000400789 <+233>: mov -0x30(%rsp),%edi
0x000000000040078d <+237>: movss (%rax),%xmm0
0x0000000000400791 <+241>: shl $0x20,%r15
0x0000000000400795 <+245>: movss 0x4(%rax),%xmm2
0x000000000040079a <+250>: mulss %xmm0,%xmm1
0x000000000040079e <+254>: or %r15,%rbp
0x00000000004007a1 <+257>: mulss %xmm2,%xmm7
0x00000000004007a5 <+261>: mov %edi,%r15d
0x00000000004007a8 <+264>: and %r9,%rbp
0x00000000004007ab <+267>: mulss %xmm3,%xmm0
0x00000000004007af <+271>: or %r15,%rbp
0x00000000004007b2 <+274>: mulss %xmm4,%xmm2
0x00000000004007b6 <+278>: addss %xmm7,%xmm1
0x00000000004007ba <+282>: subss %xmm2,%xmm0
0x00000000004007be <+286>: movd %xmm1,-0x30(%rsp)
0x00000000004007c4 <+292>: mov -0x30(%rsp),%edi
0x00000000004007c8 <+296>: movd %xmm0,-0x30(%rsp)
0x00000000004007ce <+302>: mov %edi,%eax
0x00000000004007d0 <+304>: mov -0x30(%rsp),%edi
0x00000000004007d4 <+308>: shl $0x20,%rax
0x00000000004007d8 <+312>: or %rax,%rbx
0x00000000004007db <+315>: and %r9,%rbx
0x00000000004007de <+318>: mov %edi,%eax
0x00000000004007e0 <+320>: or %rax,%rbx
0x00000000004007e3 <+323>: mov %r10,%rax
0x00000000004007e6 <+326>: mov %rbx,%rdi
0x00000000004007e9 <+329>: and $0xffffffff,%eax
0x00000000004007ec <+332>: shr $0x20,%rdi
0x00000000004007f0 <+336>: mov %edi,-0x20(%rsp)
0x00000000004007f4 <+340>: mov %rbp,%rdi
0x00000000004007f7 <+343>: shr $0x20,%rdi
0x00000000004007fb <+347>: movss -0x20(%rsp),%xmm0
0x0000000000400801 <+353>: mov %edi,-0x10(%rsp)
0x0000000000400805 <+357>: addss -0x10(%rsp),%xmm0
0x000000000040080b <+363>: mov %ebp,-0x10(%rsp)
0x000000000040080f <+367>: movss %xmm0,-0x20(%rsp)
0x0000000000400815 <+373>: mov -0x20(%rsp),%r10d
0x000000000040081a <+378>: mov %ebx,-0x20(%rsp)
0x000000000040081e <+382>: movss -0x20(%rsp),%xmm0
0x0000000000400824 <+388>: addss -0x10(%rsp),%xmm0
0x000000000040082a <+394>: shl $0x20,%r10
0x000000000040082e <+398>: or %rax,%r10
0x0000000000400831 <+401>: and %r9,%r10
0x0000000000400834 <+404>: movss %xmm0,-0x20(%rsp)
0x000000000040083a <+410>: mov -0x20(%rsp),%eax
0x000000000040083e <+414>: or %rax,%r10
0x0000000000400841 <+417>: mov %r11,%rax
0x0000000000400844 <+420>: mov %r10,%rdi
0x0000000000400847 <+423>: and $0xffffffff,%eax
0x000000000040084a <+426>: shr $0x20,%rdi
0x000000000040084e <+430>: mov %edi,-0x20(%rsp)
0x0000000000400852 <+434>: movss -0x20(%rsp),%xmm0
0x0000000000400858 <+440>: addss 0x4(%r14),%xmm0
0x000000000040085e <+446>: movss %xmm0,-0x20(%rsp)
0x0000000000400864 <+452>: mov -0x20(%rsp),%r11d
0x0000000000400869 <+457>: mov %r10d,-0x20(%rsp)
0x000000000040086e <+462>: movss -0x20(%rsp),%xmm0
0x0000000000400874 <+468>: addss (%r14),%xmm0
0x0000000000400879 <+473>: shl $0x20,%r11
0x000000000040087d <+477>: or %rax,%r11
0x0000000000400880 <+480>: and %r9,%r11
0x0000000000400883 <+483>: movss %xmm0,-0x20(%rsp)
0x0000000000400889 <+489>: mov -0x20(%rsp),%eax
0x000000000040088d <+493>: or %rax,%r11
0x0000000000400890 <+496>: cmp %r8,%r12
0x0000000000400893 <+499>: mov %r11,(%rcx,%r13,8)
0x0000000000400897 <+503>: mov %r12,%r13
0x000000000040089a <+506>: jne 0x400728 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+136>
0x00000000004008a0 <+512>: pop %rbx
0x00000000004008a1 <+513>: pop %rbp
0x00000000004008a2 <+514>: pop %r12
0x00000000004008a4 <+516>: pop %r13
0x00000000004008a6 <+518>: pop %r14
0x00000000004008a8 <+520>: pop %r15
0x00000000004008aa <+522>: retq
Which is about 133 instructions.
If I define the cfloat like this, with an array as the state:
struct cfloat {
cfloat(float re, float im) { ri[0] = re; ri[1] = im; }
float ri[2];
};
cfloat operator +(cfloat a, cfloat b) {
return cfloat(a.ri[0]+b.ri[0], a.ri[1]+b.ri[1]);
}
cfloat operator *(cfloat a, cfloat b) {
return cfloat(a.ri[0]*b.ri[0]-a.ri[1]*b.ri[1], a.ri[0]*b.ri[1]+a.ri[1]*b.ri[0]);
}
It generates this assembly:
Dump of assembler code for function _Z9benchmarkP6cfloatS0_S0_S0_S_S_m:
0x00000000004006a0 <+0>: push %rbx
0x00000000004006a1 <+1>: movq %xmm0,-0x8(%rsp)
0x00000000004006a7 <+7>: mov -0x8(%rsp),%r9
0x00000000004006ac <+12>: movq %xmm1,-0x8(%rsp)
0x00000000004006b2 <+18>: mov -0x8(%rsp),%rax
0x00000000004006b7 <+23>: mov %r9d,-0xc(%rsp)
0x00000000004006bc <+28>: shr $0x20,%r9
0x00000000004006c0 <+32>: movss -0xc(%rsp),%xmm9
0x00000000004006c7 <+39>: mov %r9d,-0xc(%rsp)
0x00000000004006cc <+44>: movss -0xc(%rsp),%xmm8
0x00000000004006d3 <+51>: mov %eax,-0xc(%rsp)
0x00000000004006d7 <+55>: shr $0x20,%rax
0x00000000004006db <+59>: movss -0xc(%rsp),%xmm7
0x00000000004006e1 <+65>: test %r8,%r8
0x00000000004006e4 <+68>: mov %eax,-0xc(%rsp)
0x00000000004006e8 <+72>: movss -0xc(%rsp),%xmm6
0x00000000004006ee <+78>: je 0x400796 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+246>
0x00000000004006f4 <+84>: xor %eax,%eax
0x00000000004006f6 <+86>: xor %r9d,%r9d
0x00000000004006f9 <+89>: nopl 0x0(%rax)
0x0000000000400700 <+96>: shl $0x3,%rax
0x0000000000400704 <+100>: movaps %xmm7,%xmm0
0x0000000000400707 <+103>: lea (%rsi,%rax,1),%rbx
0x000000000040070b <+107>: movaps %xmm6,%xmm3
0x000000000040070e <+110>: lea (%rcx,%rax,1),%r10
0x0000000000400712 <+114>: lea (%rdx,%rax,1),%r11
0x0000000000400716 <+118>: lea (%rdi,%rax,1),%rax
0x000000000040071a <+122>: movss (%rbx),%xmm1
0x000000000040071e <+126>: add $0x1,%r9
0x0000000000400722 <+130>: movss 0x4(%rbx),%xmm5
0x0000000000400727 <+135>: mulss %xmm1,%xmm0
0x000000000040072b <+139>: mulss %xmm5,%xmm3
0x000000000040072f <+143>: movss (%rax),%xmm2
0x0000000000400733 <+147>: movaps %xmm8,%xmm10
0x0000000000400737 <+151>: mulss %xmm6,%xmm1
0x000000000040073b <+155>: movss 0x4(%rax),%xmm4
0x0000000000400740 <+160>: mulss %xmm7,%xmm5
0x0000000000400744 <+164>: mulss %xmm4,%xmm10
0x0000000000400749 <+169>: cmp %r8,%r9
0x000000000040074c <+172>: mov %r9,%rax
0x000000000040074f <+175>: subss %xmm3,%xmm0
0x0000000000400753 <+179>: movaps %xmm2,%xmm3
0x0000000000400756 <+182>: mulss %xmm9,%xmm4
0x000000000040075b <+187>: mulss %xmm9,%xmm3
0x0000000000400760 <+192>: addss %xmm5,%xmm1
0x0000000000400764 <+196>: mulss %xmm8,%xmm2
0x0000000000400769 <+201>: subss %xmm10,%xmm3
0x000000000040076e <+206>: addss %xmm4,%xmm2
0x0000000000400772 <+210>: addss %xmm3,%xmm0
0x0000000000400776 <+214>: addss %xmm2,%xmm1
0x000000000040077a <+218>: addss (%r11),%xmm0
0x000000000040077f <+223>: addss 0x4(%r11),%xmm1
0x0000000000400785 <+229>: movss %xmm0,(%r10)
0x000000000040078a <+234>: movss %xmm1,0x4(%r10)
0x0000000000400790 <+240>: jne 0x400700 <_Z9benchmarkP6cfloatS0_S0_S0_S_S_m+96>
0x0000000000400796 <+246>: pop %rbx
0x0000000000400797 <+247>: retq
End of assembler dump.
Which is about 59 instructions. And, my benchmarks show, the first iteration is about 3x slower than the second.
I would prefer the separate real/imaginary fields, not least because having them as an array seems to break the vectorizer in Intel's compiler for some reason.
Is there any way I can convince gcc that these two classes are equivalent?

So I don't believe this, but if I specify an explicit copy constructor, the problem resolves itself:
struct cfloat {
cfloat(float re, float im) : re(re), im(im) {}
cfloat(const cfloat& o) : re(o.re), im(o.im) {}
float re,im;
};
Now generates the same assembly:
Dump of assembler code for function benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long):
0x0000000000400600 <+0>: mov 0x8(%rsp),%r10
0x0000000000400605 <+5>: test %r10,%r10
0x0000000000400608 <+8>: je 0x4006aa <benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long)+170>
0x000000000040060e <+14>: xor %eax,%eax
0x0000000000400610 <+16>: movss (%r9),%xmm8
0x0000000000400615 <+21>: movss 0x4(%r9),%xmm9
0x000000000040061b <+27>: movaps %xmm8,%xmm0
0x000000000040061f <+31>: movaps %xmm9,%xmm3
0x0000000000400623 <+35>: movss (%rsi,%rax,8),%xmm1
0x0000000000400628 <+40>: movss 0x4(%rsi,%rax,8),%xmm7
0x000000000040062e <+46>: mulss %xmm1,%xmm0
0x0000000000400632 <+50>: mulss %xmm7,%xmm3
0x0000000000400636 <+54>: movss (%r8),%xmm5
0x000000000040063b <+59>: movss 0x4(%r8),%xmm6
0x0000000000400641 <+65>: mulss %xmm9,%xmm1
0x0000000000400646 <+70>: movaps %xmm6,%xmm10
0x000000000040064a <+74>: mulss %xmm8,%xmm7
0x000000000040064f <+79>: movss (%rdi,%rax,8),%xmm2
0x0000000000400654 <+84>: subss %xmm3,%xmm0
0x0000000000400658 <+88>: movaps %xmm5,%xmm3
0x000000000040065b <+91>: movss 0x4(%rdi,%rax,8),%xmm4
0x0000000000400661 <+97>: mulss %xmm2,%xmm3
0x0000000000400665 <+101>: addss %xmm7,%xmm1
0x0000000000400669 <+105>: mulss %xmm4,%xmm10
0x000000000040066e <+110>: mulss %xmm6,%xmm2
0x0000000000400672 <+114>: mulss %xmm5,%xmm4
0x0000000000400676 <+118>: subss %xmm10,%xmm3
0x000000000040067b <+123>: addss %xmm4,%xmm2
0x000000000040067f <+127>: addss %xmm3,%xmm0
0x0000000000400683 <+131>: addss %xmm2,%xmm1
0x0000000000400687 <+135>: addss (%rdx,%rax,8),%xmm0
0x000000000040068c <+140>: addss 0x4(%rdx,%rax,8),%xmm1
0x0000000000400692 <+146>: movss %xmm0,(%rcx,%rax,8)
0x0000000000400697 <+151>: movss %xmm1,0x4(%rcx,%rax,8)
0x000000000040069d <+157>: add $0x1,%rax
0x00000000004006a1 <+161>: cmp %rax,%r10
0x00000000004006a4 <+164>: ja 0x400610 <benchmark(cfloat*, cfloat*, cfloat*, cfloat*, cfloat, cfloat, unsigned long)+16>
0x00000000004006aa <+170>: repz retq
End of assembler dump.
Find me that in the spec.

You mentioned that you target Red Hat Enterprise Linux, and (in your deleted post) that newer compiler versions generate better code. You could use Developer Toolset to get a newer compiler, creating applications which are compatible with the rest of the operating system:
https://www.softwarecollections.org/en/scls/rhscl/devtoolset-6/
https://developers.redhat.com/products/developertoolset/overview/

Plain C++ Code 10 times faster than inline assembler. Why?

These two code snippets do the same thing: Adding two float arrays together and storing the result back into them.
Inline Assembler:
void vecAdd_SSE(float* v1, float* v2) {
_asm {
mov esi, v1
mov edi, v2
movups xmm0, [esi]
movups xmm1, [edi]
addps xmm0, xmm1
movups [esi], xmm0
movups [edi], xmm0
}
}
Plain C++ Code:
void vecAdd_Std(float* v1, float* v2) {
v1[0] = v1[0]+ v2[0];
v1[1] = v1[1]+ v2[1];
v1[2] = v1[2]+ v2[2];
v1[3] = v1[3]+ v2[3];
v2[0] = v1[0];
v2[1] = v1[1];
v2[2] = v1[2];
v2[3] = v1[3];
}
Disassembly for C++ Code (Disassembly made in Debug mode because i cannot view the Disassembly in Release mode for some reason):
void vecAdd_Std(float* v1, float* v2) {
push ebp
mov ebp,esp
sub esp,0C0h
push ebx
push esi
push edi
lea edi,[ebp-0C0h]
mov ecx,30h
mov eax,0CCCCCCCCh
rep stos dword ptr es:[edi]
v1[0] = v1[0]+ v2[0];
mov eax,4
imul ecx,eax,0
mov edx,4
imul eax,edx,0
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+ecx]
addss xmm0,dword ptr [esi+eax]
mov eax,4
imul ecx,eax,0
mov edx,dword ptr [v1]
movss dword ptr [edx+ecx],xmm0
v1[1] = v1[1]+ v2[1];
mov eax,4
shl eax,0
v1[1] = v1[1]+ v2[1];
mov ecx,4
shl ecx,0
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+eax]
addss xmm0,dword ptr [esi+ecx]
mov eax,4
shl eax,0
mov ecx,dword ptr [v1]
movss dword ptr [ecx+eax],xmm0
v1[2] = v1[2]+ v2[2];
mov eax,4
shl eax,1
mov ecx,4
shl ecx,1
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+eax]
addss xmm0,dword ptr [esi+ecx]
mov eax,4
shl eax,1
mov ecx,dword ptr [v1]
movss dword ptr [ecx+eax],xmm0
v1[3] = v1[3]+ v2[3];
mov eax,4
imul ecx,eax,3
mov edx,4
imul eax,edx,3
mov edx,dword ptr [v1]
mov esi,dword ptr [v2]
movss xmm0,dword ptr [edx+ecx]
addss xmm0,dword ptr [esi+eax]
mov eax,4
imul ecx,eax,3
mov edx,dword ptr [v1]
movss dword ptr [edx+ecx],xmm0
v2[0] = v1[0];
mov eax,4
imul ecx,eax,0
mov edx,4
imul eax,edx,0
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov ecx,dword ptr [esi+ecx]
mov dword ptr [edx+eax],ecx
v2[1] = v1[1];
mov eax,4
shl eax,0
mov ecx,4
shl ecx,0
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov eax,dword ptr [esi+eax]
mov dword ptr [edx+ecx],eax
v2[2] = v1[2];
mov eax,4
shl eax,1
mov ecx,4
shl ecx,1
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov eax,dword ptr [esi+eax]
mov dword ptr [edx+ecx],eax
v2[3] = v1[3];
mov eax,4
imul ecx,eax,3
mov edx,4
imul eax,edx,3
mov edx,dword ptr [v2]
mov esi,dword ptr [v1]
mov ecx,dword ptr [esi+ecx]
mov dword ptr [edx+eax],ecx
}
Now I made a time measurement on those to functions and noticed that the inline assembler code takes approximately 10 times longer (in Release mode).
Does anybody know why?

On my machine (VS2015 64-bit mode), the compiler inlines vecAdd_Std and produces
00007FF625921C8F vmovups xmm1,xmmword ptr [__xmm#4100000040c000004080000040000000 (07FF625929D60h)]
00007FF625921C97 vmovups xmm4,xmm1
00007FF625921C9B vcvtss2sd xmm1,xmm1,xmm4
Test code
int main() {
float x[4] = {1.0, 2.0, 3.0, 4.0};
float y[4] = {1.0, 2.0, 3.0, 4.0};
vecAdd_Std(x, y);
std::cout << x[0];
}

You aren't really calling a function that executes one SSE instruction, are you? There's non-trivial overhead involved in setting up the xmm registers, and you're copying the values from memory to the registers and back, which will take far longer than the actual calculation.
I wouldn't be at all surprised to find that the compiler inlines the C++ version of the function, but doesn't (can't, really) do the same for functions that contain inline assembly.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

How to know if loop has optimization potential - c++

Related

vector bool compiler xor specialization?

Different assembly when rangifying a simple algorithm

Understanding what clang is doing in assembly, decrementing for a loop that is incrementing

Generating fast assembly for complex arithmetic in g++4.4.7

Plain C++ Code 10 times faster than inline assembler. Why?

Categories

Resources