Differences in custom and std fetch_add on floats - c++

This is an attempt at implementing fetch_add on floats without C++20.
void fetch_add(volatile float* x, float y)
{
bool success = false;
auto xi = (volatile std::int32_t*)x;
while(!success)
{
union {
std::int32_t sumint;
float sum;
};
auto tmp = __atomic_load_n(xi, __ATOMIC_RELAXED);
sumint = tmp;
sum += y;
success = __atomic_compare_exchange_n(xi, &tmp, sumint, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
}
To my great confusion, when I compare the assembly from gcc10.1 -O2 -std=c++2a for x86-64, they differ.
fetch_add(float volatile*, float):
.L2:
mov eax, DWORD PTR [rdi]
movd xmm1, eax
addss xmm1, xmm0
movd edx, xmm1
lock cmpxchg DWORD PTR [rdi], edx
jne .L2
ret
fetch_add_std(std::atomic<float>&, float):
mov eax, DWORD PTR [rdi]
movaps xmm1, xmm0
movd xmm0, eax
mov DWORD PTR [rsp-4], eax
addss xmm0, xmm1
.L9:
mov eax, DWORD PTR [rsp-4]
movd edx, xmm0
lock cmpxchg DWORD PTR [rdi], edx
je .L6
mov DWORD PTR [rsp-4], eax
movss xmm0, DWORD PTR [rsp-4]
addss xmm0, xmm1
jmp .L9
.L6:
ret
My ability to read assembly is near non-existent, but the custom version looks correct to me, which implies it is either incorrect, inefficient or somehow the standard library is rather broken. I don't quite believe the third case, which leads me to ask, is the custom version incorrect or inefficient?
After some comments, a second version without reloading after cmpxchg is written. They do still differ.

Related

Loop unroll issue with Visual Studio compiler

I have some simple setup, where I noticed that VS compiler seems not smart enough to unroll loop, but other compilers like clang or gcc do so. Do I miss some optimization flag for VS?
#include <cstddef>
struct A
{
double data[4];
double *begin() { return data; }
double *end() { return data + 4; }
double const *begin() const { return data; }
double const *end() const { return data + 4; }
};
double sum_index(A const &a) {
double ret = 0;
for(std::size_t i = 0; i < 4; ++i)
{
ret += a.data[i];
}
return ret;
}
double sum_iter(A const &a) {
double ret = 0;
for(auto const &v : a)
{
ret += v;
}
return ret;
}
I used https://godbolt.org/ compiler explorer to generate assembler code.
gcc 11.2 with -O3:
sum_index(A const&):
pxor xmm0, xmm0
addsd xmm0, QWORD PTR [rdi]
addsd xmm0, QWORD PTR [rdi+8]
addsd xmm0, QWORD PTR [rdi+16]
addsd xmm0, QWORD PTR [rdi+24]
ret
sum_iter(A const&):
movsd xmm1, QWORD PTR [rdi]
addsd xmm1, QWORD PTR .LC0[rip]
movsd xmm0, QWORD PTR [rdi+8]
addsd xmm1, xmm0
movupd xmm0, XMMWORD PTR [rdi+16]
addsd xmm1, xmm0
unpckhpd xmm0, xmm0
addsd xmm0, xmm1
ret
.LC0:
.long 0
.long 0
clang 13.0.1 with -O3:
sum_index(A const&): # #sum_index(A const&)
xorpd xmm0, xmm0
addsd xmm0, qword ptr [rdi]
addsd xmm0, qword ptr [rdi + 8]
addsd xmm0, qword ptr [rdi + 16]
addsd xmm0, qword ptr [rdi + 24]
ret
sum_iter(A const&): # #sum_iter(A const&)
xorpd xmm0, xmm0
addsd xmm0, qword ptr [rdi]
addsd xmm0, qword ptr [rdi + 8]
addsd xmm0, qword ptr [rdi + 16]
addsd xmm0, qword ptr [rdi + 24]
ret
MSVC 19.30 with /O2 (there is no /O3?):
this$ = 8
double const * A::begin(void)const PROC ; A::begin, COMDAT
mov rax, rcx
ret 0
double const * A::begin(void)const ENDP ; A::begin
this$ = 8
double const * A::end(void)const PROC ; A::end, COMDAT
lea rax, QWORD PTR [rcx+32]
ret 0
double const * A::end(void)const ENDP ; A::end
a$ = 8
double sum_index(A const &) PROC ; sum_index, COMDAT
movsd xmm0, QWORD PTR [rcx]
xorps xmm1, xmm1
addsd xmm0, xmm1
addsd xmm0, QWORD PTR [rcx+8]
addsd xmm0, QWORD PTR [rcx+16]
addsd xmm0, QWORD PTR [rcx+24]
ret 0
double sum_index(A const &) ENDP ; sum_index
a$ = 8
double sum_iter(A const &) PROC ; sum_iter, COMDAT
lea rax, QWORD PTR [rcx+32]
xorps xmm0, xmm0
cmp rcx, rax
je SHORT $LN12#sum_iter
npad 4
$LL8#sum_iter:
addsd xmm0, QWORD PTR [rcx]
add rcx, 8
cmp rcx, rax
jne SHORT $LL8#sum_iter
$LN12#sum_iter:
ret 0
double sum_iter(A const &) ENDP ; sum_iter
Obviously there is problem with unrolling the loop for MSVC. Is there some additional optimization flag I have to set?
Thanks for help!

Loop unrolling and SSE -- clang vs gcc

Disclaimer: full code can be found here.
16 byte alignment
Given a fairly simple type to support proper SSE alignment
struct alignas(16) simd_pack
{
std::int32_t data[4];
};
and a function that adds two arrays together
void add_packed(simd_pack* lhs_and_result, simd_pack* rhs, std::size_t size)
{
for (std::size_t i = 0; i < size; i++)
for (std::size_t j = 0; j < 4; j++)
lhs_and_result[i].data[j] += rhs[i].data[j];
}
compile the code with clang and gcc using -O3.
Clang produces the following assembly:
add_packed(simd_pack*, simd_pack*, unsigned long): # #add_packed(simd_pack*, simd_pack*, unsigned long)
test rdx, rdx
je .LBB0_3
mov eax, 12
.LBB0_2: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rsi + rax - 12]
add dword ptr [rdi + rax - 12], ecx
mov ecx, dword ptr [rsi + rax - 8]
add dword ptr [rdi + rax - 8], ecx
mov ecx, dword ptr [rsi + rax - 4]
add dword ptr [rdi + rax - 4], ecx
mov ecx, dword ptr [rsi + rax]
add dword ptr [rdi + rax], ecx
add rax, 16
add rdx, -1
jne .LBB0_2
.LBB0_3:
ret
I'm not very literate in assembly but to me it looks like clang is simply unrolling the inner for loop. If we take a look at gcc we get:
add_packed(simd_pack*, simd_pack*, unsigned long):
test rdx, rdx
je .L1
sal rdx, 4
xor eax, eax
.L3:
movdqa xmm0, XMMWORD PTR [rdi+rax]
paddd xmm0, XMMWORD PTR [rsi+rax]
movaps XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rdx
jne .L3
.L1:
ret
which is what I expect.
64 byte alignment
The difference gets even bigger (obviously) if we go to 64 byte alignment (which usually is a cache line if I'm not mistaken)
struct alignas(64) cache_line
{
std::int32_t data[16];
};
void add_cache_line(cache_line* lhs_and_result, cache_line* rhs, std::size_t size)
{
for (std::size_t i = 0; i < size; i++)
for (std::size_t j = 0; j < 16; j++)
lhs_and_result[i].data[j] += rhs[i].data[j];
}
Clang keeps simply unrolling:
add_cache_line(cache_line*, cache_line*, unsigned long): # #add_cache_line(cache_line*, cache_line*, unsigned long)
test rdx, rdx
je .LBB1_3
mov eax, 60
.LBB1_2: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rsi + rax - 60]
add dword ptr [rdi + rax - 60], ecx
mov ecx, dword ptr [rsi + rax - 56]
add dword ptr [rdi + rax - 56], ecx
mov ecx, dword ptr [rsi + rax - 52]
add dword ptr [rdi + rax - 52], ecx
mov ecx, dword ptr [rsi + rax - 48]
add dword ptr [rdi + rax - 48], ecx
mov ecx, dword ptr [rsi + rax - 44]
add dword ptr [rdi + rax - 44], ecx
mov ecx, dword ptr [rsi + rax - 40]
add dword ptr [rdi + rax - 40], ecx
mov ecx, dword ptr [rsi + rax - 36]
add dword ptr [rdi + rax - 36], ecx
mov ecx, dword ptr [rsi + rax - 32]
add dword ptr [rdi + rax - 32], ecx
mov ecx, dword ptr [rsi + rax - 28]
add dword ptr [rdi + rax - 28], ecx
mov ecx, dword ptr [rsi + rax - 24]
add dword ptr [rdi + rax - 24], ecx
mov ecx, dword ptr [rsi + rax - 20]
add dword ptr [rdi + rax - 20], ecx
mov ecx, dword ptr [rsi + rax - 16]
add dword ptr [rdi + rax - 16], ecx
mov ecx, dword ptr [rsi + rax - 12]
add dword ptr [rdi + rax - 12], ecx
mov ecx, dword ptr [rsi + rax - 8]
add dword ptr [rdi + rax - 8], ecx
mov ecx, dword ptr [rsi + rax - 4]
add dword ptr [rdi + rax - 4], ecx
mov ecx, dword ptr [rsi + rax]
add dword ptr [rdi + rax], ecx
add rax, 64
add rdx, -1
jne .LBB1_2
.LBB1_3:
ret
while gcc uses SSE and also unrolls that:
add_cache_line(cache_line*, cache_line*, unsigned long):
mov rcx, rdx
test rdx, rdx
je .L9
sal rcx, 6
mov rax, rdi
mov rdx, rsi
add rcx, rdi
.L11:
movdqa xmm2, XMMWORD PTR [rdx+16]
movdqa xmm3, XMMWORD PTR [rax]
add rax, 64
add rdx, 64
movdqa xmm1, XMMWORD PTR [rdx-32]
movdqa xmm0, XMMWORD PTR [rdx-16]
paddd xmm3, XMMWORD PTR [rdx-64]
paddd xmm2, XMMWORD PTR [rax-48]
paddd xmm1, XMMWORD PTR [rax-32]
paddd xmm0, XMMWORD PTR [rax-16]
movaps XMMWORD PTR [rax-64], xmm3
movaps XMMWORD PTR [rax-48], xmm2
movaps XMMWORD PTR [rax-32], xmm1
movaps XMMWORD PTR [rax-16], xmm0
cmp rax, rcx
jne .L11
.L9:
ret
No alignment
It's getting interesting if we use plain 32 bit integer arrays with no alignment at all. We use the exact same compiler flags.
void add_unaligned(std::int32_t* lhs_and_result, std::int32_t* rhs, std::size_t size)
{
for (std::size_t i = 0; i < size; i++)
lhs_and_result[i] += rhs[i];
}
Clang
Clang's assembly exploaded a fair bit by adding some branches:
add_unaligned(int*, int*, unsigned long): # #add_unaligned(int*, int*, unsigned long)
test rdx, rdx
je .LBB2_16
cmp rdx, 7
jbe .LBB2_2
lea rax, [rsi + 4*rdx]
cmp rax, rdi
jbe .LBB2_9
lea rax, [rdi + 4*rdx]
cmp rax, rsi
jbe .LBB2_9
.LBB2_2:
xor r10d, r10d
.LBB2_3:
mov r8, r10
not r8
add r8, rdx
mov rcx, rdx
and rcx, 3
je .LBB2_5
.LBB2_4: # =>This Inner Loop Header: Depth=1
mov eax, dword ptr [rsi + 4*r10]
add dword ptr [rdi + 4*r10], eax
add r10, 1
add rcx, -1
jne .LBB2_4
.LBB2_5:
cmp r8, 3
jb .LBB2_16
.LBB2_6: # =>This Inner Loop Header: Depth=1
mov eax, dword ptr [rsi + 4*r10]
add dword ptr [rdi + 4*r10], eax
mov eax, dword ptr [rsi + 4*r10 + 4]
add dword ptr [rdi + 4*r10 + 4], eax
mov eax, dword ptr [rsi + 4*r10 + 8]
add dword ptr [rdi + 4*r10 + 8], eax
mov eax, dword ptr [rsi + 4*r10 + 12]
add dword ptr [rdi + 4*r10 + 12], eax
add r10, 4
cmp rdx, r10
jne .LBB2_6
jmp .LBB2_16
.LBB2_9:
mov r10, rdx
and r10, -8
lea rax, [r10 - 8]
mov r9, rax
shr r9, 3
add r9, 1
mov r8d, r9d
and r8d, 1
test rax, rax
je .LBB2_10
sub r9, r8
xor ecx, ecx
.LBB2_12: # =>This Inner Loop Header: Depth=1
movdqu xmm0, xmmword ptr [rsi + 4*rcx]
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 16]
movdqu xmm2, xmmword ptr [rdi + 4*rcx]
paddd xmm2, xmm0
movdqu xmm0, xmmword ptr [rdi + 4*rcx + 16]
paddd xmm0, xmm1
movdqu xmm1, xmmword ptr [rdi + 4*rcx + 32]
movdqu xmm3, xmmword ptr [rdi + 4*rcx + 48]
movdqu xmmword ptr [rdi + 4*rcx], xmm2
movdqu xmmword ptr [rdi + 4*rcx + 16], xmm0
movdqu xmm0, xmmword ptr [rsi + 4*rcx + 32]
paddd xmm0, xmm1
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 48]
paddd xmm1, xmm3
movdqu xmmword ptr [rdi + 4*rcx + 32], xmm0
movdqu xmmword ptr [rdi + 4*rcx + 48], xmm1
add rcx, 16
add r9, -2
jne .LBB2_12
test r8, r8
je .LBB2_15
.LBB2_14:
movdqu xmm0, xmmword ptr [rsi + 4*rcx]
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 16]
movdqu xmm2, xmmword ptr [rdi + 4*rcx]
paddd xmm2, xmm0
movdqu xmm0, xmmword ptr [rdi + 4*rcx + 16]
paddd xmm0, xmm1
movdqu xmmword ptr [rdi + 4*rcx], xmm2
movdqu xmmword ptr [rdi + 4*rcx + 16], xmm0
.LBB2_15:
cmp r10, rdx
jne .LBB2_3
.LBB2_16:
ret
.LBB2_10:
xor ecx, ecx
test r8, r8
jne .LBB2_14
jmp .LBB2_15
What is happening at .LBB2_4 and .LBB2_6? It looks like it's unrolling a loop again but I'm not sure what happens there (mainly because of the registers used).
In .LBB2_12 it even unrolls the SSE part. I think it's only unrolled two-fold though because it needs two SIMD registers to load each operand because they are unaligned now. .LBB2_14 contains the SSE part without the unrolling.
How is the control flow here? I'm assuming it should be:
keep using the unrolled SSE part until the remaining data is too small to fill all the registers (xmm0..3)
switch to the single stage SSE part and do it once if we have enough data remaining to fill xmm0 (4 integers in our case)
process the remaining data (3 operations at max, otherwise it would be SSE suitable again)
The order of the labels and the jump instructions are confusing, is that (approx.) what happens here?
GCC
Gcc's assembly is a bit easier to read:
add_unaligned(int*, int*, unsigned long):
test rdx, rdx
je .L16
lea rcx, [rsi+4]
mov rax, rdi
sub rax, rcx
cmp rax, 8
jbe .L22
lea rax, [rdx-1]
cmp rax, 2
jbe .L22
mov rcx, rdx
xor eax, eax
shr rcx, 2
sal rcx, 4
.L19:
movdqu xmm0, XMMWORD PTR [rdi+rax]
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddd xmm0, xmm1
movups XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rcx
jne .L19
mov rax, rdx
and rax, -4
test dl, 3
je .L16
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
lea rcx, [rax+1]
cmp rdx, rcx
jbe .L16
add rax, 2
mov r8d, DWORD PTR [rsi+rcx*4]
add DWORD PTR [rdi+rcx*4], r8d
cmp rdx, rax
jbe .L16
mov edx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], edx
ret
.L22:
xor eax, eax
.L18:
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
add rax, 1
cmp rdx, rax
jne .L18
.L16:
ret
I assume the control flow is similar to clang
keep using the single stage SSE part until the remaining data is too small to fill xmm0 and xmm1
process the remaining data (3 operations at max, otherwise it would be SSE suitable again)
It looks like exactly this is happening in .L19 but what is .L18 doing then?
Summary
Here is the full code, including assembly. My question are:
Why is clang unrolling the functions that use aligned data instead of using SSE or a combination of both (like gcc)?
What are .LBB2_4 and .LBB2_6 in clang's assembly doing?
Are my assumptions about the control flow of the function with the unaligned data correct?
What is .L18 in gcc's assembly doing?

Different assembly when rangifying a simple algorithm

When I was preparing supplementary info for this question, I noticed that “rangified” implementations of a very simple algorithm resulted in important differences (to my eyes) in the resulting assembly, compared with “legacy” implementations.
I expanded the tests a bit, with the following results (GCC 9.1 -O3):
Case 1. Simple for loop (https://godbolt.org/z/rAVaT2)
#include <vector>
void foo(std::vector<double> &u, std::vector<double> const &v)
{
for (std::size_t i = 0u; i < u.size(); ++i)
u[i] += v[i];
}
mov rdx, QWORD PTR [rdi]
mov rdi, QWORD PTR [rdi+8]
sub rdi, rdx
sar rdi, 3
je .L1
mov rcx, QWORD PTR [rsi]
lea rax, [rcx+15]
sub rax, rdx
cmp rax, 30
jbe .L7
lea rax, [rdi-1]
cmp rax, 1
jbe .L7
mov rsi, rdi
xor eax, eax
shr rsi
sal rsi, 4
.L4:
movupd xmm0, XMMWORD PTR [rcx+rax]
movupd xmm1, XMMWORD PTR [rdx+rax]
addpd xmm0, xmm1
movups XMMWORD PTR [rdx+rax], xmm0
add rax, 16
cmp rsi, rax
jne .L4
mov rsi, rdi
and rsi, -2
and edi, 1
je .L1
lea rax, [rdx+rsi*8]
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rcx+rsi*8]
movsd QWORD PTR [rax], xmm0
ret
.L7:
xor eax, eax
.L3:
movsd xmm0, QWORD PTR [rdx+rax*8]
addsd xmm0, QWORD PTR [rcx+rax*8]
movsd QWORD PTR [rdx+rax*8], xmm0
add rax, 1
cmp rdi, rax
jne .L3
.L1:
ret
Case 2. std::transform (https://godbolt.org/z/2iZaqo)
#include <algorithm>
#include <vector>
void foo(std::vector<double> &u, std::vector<double> const &v)
{
std::transform(std::begin(u), std::end(u),
std::begin(v),
std::begin(u),
std::plus());
}
mov rdx, QWORD PTR [rdi]
mov rax, QWORD PTR [rdi+8]
mov rsi, QWORD PTR [rsi]
cmp rax, rdx
je .L1
sub rax, 8
lea rcx, [rsi+15]
sub rax, rdx
sub rcx, rdx
shr rax, 3
cmp rcx, 30
jbe .L7
movabs rcx, 2305843009213693950
test rax, rcx
je .L7
lea rcx, [rax+1]
xor eax, eax
mov rdi, rcx
shr rdi
sal rdi, 4
.L4:
movupd xmm0, XMMWORD PTR [rdx+rax]
movupd xmm1, XMMWORD PTR [rsi+rax]
addpd xmm0, xmm1
movups XMMWORD PTR [rdx+rax], xmm0
add rax, 16
cmp rax, rdi
jne .L4
mov rdi, rcx
and rdi, -2
lea rax, [0+rdi*8]
add rdx, rax
add rsi, rax
cmp rcx, rdi
je .L1
movsd xmm0, QWORD PTR [rdx]
addsd xmm0, QWORD PTR [rsi]
movsd QWORD PTR [rdx], xmm0
ret
.L7:
xor ecx, ecx
.L3:
movsd xmm0, QWORD PTR [rdx+rcx*8]
addsd xmm0, QWORD PTR [rsi+rcx*8]
mov rdi, rcx
movsd QWORD PTR [rdx+rcx*8], xmm0
add rcx, 1
cmp rax, rdi
jne .L3
.L1:
ret
Case 3. Range-v3 view::zip (https://godbolt.org/z/0BEkfT)
#define RANGES_ASSERT(...) ((void)0)
#include <algorithm>
#include <range/v3/view/zip.hpp>
#include <vector>
void foo(std::vector<double> &u, std::vector<double> const &v)
{
auto w = ranges::view::zip(u, v);
std::for_each(std::begin(w), std::end(w),
[](auto &&x) { std::get<0u>(x) += std::get<1u>(x); });
}
mov rdx, QWORD PTR [rsi]
mov rsi, QWORD PTR [rsi+8]
mov rax, QWORD PTR [rdi]
mov rcx, QWORD PTR [rdi+8]
cmp rdx, rsi
je .L1
cmp rax, rcx
je .L1
.L3:
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rdx]
add rax, 8
add rdx, 8
movsd QWORD PTR [rax-8], xmm0
cmp rax, rcx
je .L1
cmp rdx, rsi
jne .L3
.L1:
ret
Case 4. cmcstl2 ranges::transform (https://godbolt.org/z/MjYO1G)
#include <experimental/ranges/algorithm>
#include <vector>
namespace std
{
namespace ranges = experimental::ranges;
}
void foo(std::vector<double> &u,s td::vector<double> const &v)
{
std::ranges::transform(std::ranges::begin(u), std::ranges::end(u),
std::ranges::begin(v), std::ranges::end(v),
std::ranges::begin(u),
std::plus());
}
mov r8, QWORD PTR [rsi+8]
mov rdx, QWORD PTR [rsi]
mov rax, QWORD PTR [rdi]
mov rcx, QWORD PTR [rdi+8]
cmp rdx, r8
je .L1
cmp rcx, rax
jne .L3
jmp .L1
.L16:
cmp rdx, r8
je .L1
.L3:
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rdx]
add rax, 8
add rdx, 8
movsd QWORD PTR [rax-8], xmm0
cmp rax, rcx
jne .L16
.L1:
ret
I can’t read assembly, but I seem to understand that the assemblies of Case 1 and Case 2 are almost equivalent and involve packed sums, whilst the assembly of the ranges versions (Cases 3 and 4) is much terser, but not vectorized.
I would really love to understand what those differences mean. Do my interpretation of the assembly make any sense? What are the additional instructions in the non-ranges versions? Why are there those differences?

GCC std::sin vectorization bug?

The next code (with -O3 -ffast-math):
#include <cmath>
float a[4];
void sin1() {
for(unsigned i = 0; i < 4; i++) a[i] = sinf(a[i]);
}
Compiles vectorized version of sinf (_ZGVbN4v_sinf):
sin1():
sub rsp, 8
movaps xmm0, XMMWORD PTR a[rip]
call _ZGVbN4v_sinf
movaps XMMWORD PTR a[rip], xmm0
add rsp, 8
ret
But when i use c++ version of sinf (std::sin) no vectorization occurrs:
void sin2() {
for(unsigned i = 0; i < 4; i++) a[i] = std::sin(a[i]);
}
sin2():
sub rsp, 8
movss xmm0, DWORD PTR a[rip]
call sinf
movss DWORD PTR a[rip], xmm0
movss xmm0, DWORD PTR a[rip+4]
call sinf
movss DWORD PTR a[rip+4], xmm0
movss xmm0, DWORD PTR a[rip+8]
call sinf
movss DWORD PTR a[rip+8], xmm0
movss xmm0, DWORD PTR a[rip+12]
call sinf
movss DWORD PTR a[rip+12], xmm0
add rsp, 8
ret
Compiler Explorer Code

Getting GCC/Clang to use CMOV

I have a simple tagged union of values. The values can either be int64_ts or doubles. I am performing addition on the these unions with the caveat that if both arguments represent int64_t values then the result should also have an int64_t value.
Here is the code:
#include<stdint.h>
union Value {
int64_t a;
double b;
};
enum Type { DOUBLE, LONG };
// Value + type.
struct TaggedValue {
Type type;
Value value;
};
void add(const TaggedValue& arg1, const TaggedValue& arg2, TaggedValue* out) {
const Type type1 = arg1.type;
const Type type2 = arg2.type;
// If both args are longs then write a long to the output.
if (type1 == LONG && type2 == LONG) {
out->value.a = arg1.value.a + arg2.value.a;
out->type = LONG;
} else {
// Convert argument to a double and add it.
double op1 = type1 == LONG ? (double)arg1.value.a : arg1.value.b; // Why isn't CMOV used?
double op2 = type2 == LONG ? (double)arg2.value.a : arg2.value.b; // Why isn't CMOV used?
out->value.b = op1 + op2;
out->type = DOUBLE;
}
}
The output of gcc at -O2 is here: http://goo.gl/uTve18
Attached here in case the link doesn't work.
add(TaggedValue const&, TaggedValue const&, TaggedValue*):
cmp DWORD PTR [rdi], 1
sete al
cmp DWORD PTR [rsi], 1
sete cl
je .L17
test al, al
jne .L18
.L4:
test cl, cl
movsd xmm1, QWORD PTR [rdi+8]
jne .L19
.L6:
movsd xmm0, QWORD PTR [rsi+8]
mov DWORD PTR [rdx], 0
addsd xmm0, xmm1
movsd QWORD PTR [rdx+8], xmm0
ret
.L17:
test al, al
je .L4
mov rax, QWORD PTR [rdi+8]
add rax, QWORD PTR [rsi+8]
mov DWORD PTR [rdx], 1
mov QWORD PTR [rdx+8], rax
ret
.L18:
cvtsi2sd xmm1, QWORD PTR [rdi+8]
jmp .L6
.L19:
cvtsi2sd xmm0, QWORD PTR [rsi+8]
addsd xmm0, xmm1
mov DWORD PTR [rdx], 0
movsd QWORD PTR [rdx+8], xmm0
ret
It produced code with a lot of branches. I know that the input data is pretty random i.e it has a random combination of int64_ts and doubles. I'd like to have at least the conversion to a double done with an equivalent of a CMOV instruction. Is there any way I can coax gcc to produce that code? I'd ideally like to run some benchmark on real data to see how the code with a lot of branches does vs one with fewer branches but more expensive CMOV instructions. It might turn out that the code generated by default by GCC works better but I'd like to confirm that. I could inline the assembly myself but I'd prefer not to.
The interactive compiler link is a good way to check the assembly. Any suggestions?