One writer and multiple readers - 256bit - AVX - atomic [duplicate] - c++

This question already has answers here:
Why does clang produce inefficient asm with -O0 (for this simple floating point sum)?
(1 answer)
SSE instructions: which CPUs can do atomic 16B memory operations?
(7 answers)
Largest data type which can be fetch-ANDed atomically?
(2 answers)
Closed 2 years ago.
I would like to write 256 bits of data on one core and read it on another. There will be only one writer, but there can be multiple readers.
I was thinking of implementing it with AVX. The reads and writes should be atomic, since each is a single instruction (vmovdqa), and if the data is aligned to a cache line, cache coherency should move it atomically between cores.
I looked at the generated assembly, but I can see 2 writes and 2 reads. Why isn't there just one of each? Would this solution for atomic reads/writes work, given these assumptions?
#include <immintrin.h>
#include <cstdint>

struct Data {
    int64_t a[4];
};

struct DataHolder {
    void set_data(Data* in) {
        _mm256_store_si256(reinterpret_cast<__m256i *>(&data_), *reinterpret_cast<__m256i *>(in));
    }
    void get_data(Data* out) {
        _mm256_store_si256(reinterpret_cast<__m256i *>(out), *reinterpret_cast<__m256i *>(&data_));
    }
    alignas(64) Data data_;
    char padding[64 - sizeof(Data)];
};

int main() {
    Data a, b;
    DataHolder ab;
    ab.set_data(&a);
    ab.get_data(&b);
}
DataHolder::set_data(Data*):
push rbp
mov rbp, rsp
and rsp, -32
mov QWORD PTR [rsp-72], rdi
mov QWORD PTR [rsp-80], rsi
mov rax, QWORD PTR [rsp-80]
vmovdqa ymm0, YMMWORD PTR [rax]
mov rax, QWORD PTR [rsp-72]
mov QWORD PTR [rsp-8], rax
vmovdqa YMMWORD PTR [rsp-64], ymm0
mov rax, QWORD PTR [rsp-8]
vmovdqa ymm0, YMMWORD PTR [rsp-64]
vmovdqa YMMWORD PTR [rax], ymm0
nop
nop
leave
ret
DataHolder::get_data(Data*):
push rbp
mov rbp, rsp
and rsp, -32
mov QWORD PTR [rsp-72], rdi
mov QWORD PTR [rsp-80], rsi
mov rax, QWORD PTR [rsp-72]
vmovdqa ymm0, YMMWORD PTR [rax]
mov rax, QWORD PTR [rsp-80]
mov QWORD PTR [rsp-8], rax
vmovdqa YMMWORD PTR [rsp-64], ymm0
mov rax, QWORD PTR [rsp-8]
vmovdqa ymm0, YMMWORD PTR [rsp-64]
vmovdqa YMMWORD PTR [rax], ymm0
nop
nop
leave
ret
main:
push rbp
mov rbp, rsp
and rsp, -64
add rsp, -128
lea rdx, [rsp+96]
mov rax, rsp
mov rsi, rdx
mov rdi, rax
call DataHolder::set_data(Data*)
lea rdx, [rsp+64]
mov rax, rsp
mov rsi, rdx
mov rdi, rax
call DataHolder::get_data(Data*)
mov eax, 0
leave
ret
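For comparison, here is a minimal sketch of the same copy written directly against the intrinsics (assuming GCC or Clang with optimization enabled, e.g. -O2 -mavx). The extra loads and stores in the listing above are the stack spills that unoptimized (-O0) code generation typically produces; with optimization, each direction reduces to a single vmovdqa load plus a single vmovdqa store.
#include <immintrin.h>

// Minimal sketch: one aligned 32-byte load and one aligned 32-byte store.
void copy256(const __m256i* src, __m256i* dst) {
    __m256i v = _mm256_load_si256(src);  // vmovdqa load
    _mm256_store_si256(dst, v);          // vmovdqa store
}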

Related

vector bool compiler xor specialization?

I was thinking again about implementing the quadratic sieve for fun, which requires Gaussian elimination over a binary field; the operations required are 1. swapping rows and 2. XORing rows.
My ideas were either to maintain a bit array as a vector of 64-bit ints with bit twiddling, or to use vector<bool>, which is probably space-optimized on my system. The bit array must be dynamically sizable, so std::bitset won't work. The advantage of maintaining my own ints is that I can XOR 64 bits at a time, which is a neat trick. I wanted to see what a compiler would do for a loop that XORs bool vectors (I wasn't able to use ^=, see "operator |= on std::vector<bool>"):
#include <vector>

void xor_vector(std::vector<bool>& a, std::vector<bool>& b) {
    for (std::size_t i=0; i<a.size(); ++i)
        a[i] = a[i] ^ b[i];
}
I have only a very basic understanding of x86, but it looks like the compiler isn't actually XORing whole words together. Is there a way to get the compiler to XOR entire words at a time?
https://godbolt.org/z/PbGdv3sKT
xor_vector(std::vector<bool, std::allocator<bool> >&, std::vector<bool, std::allocator<bool> >&):
mov r8, QWORD PTR [rdi]
mov rax, QWORD PTR [rdi+16]
mov edx, DWORD PTR [rdi+24]
sub rax, r8
lea rdi, [rdx+rax*8]
test rdi, rdi
je .L11
push rbp
mov r10d, 1
push rbx
mov r9, QWORD PTR [rsi]
xor esi, esi
jmp .L7
.L16:
mov rdx, r10
sal rdx, cl
mov rcx, QWORD PTR [r11]
mov rbp, rdx
test rdx, rcx
setne bl
and rbp, QWORD PTR [rax]
setne bpl
.L4:
mov rax, rdx
not rdx
or rax, rcx
and rdx, rcx
cmp bpl, bl
cmovne rdx, rax
add rsi, 1
mov QWORD PTR [r11], rdx
cmp rsi, rdi
je .L15
.L7:
test rsi, rsi
lea rax, [rsi+63]
mov rdx, rsi
cmovns rax, rsi
sar rdx, 63
shr rdx, 58
sar rax, 6
lea rcx, [rsi+rdx]
sal rax, 3
and ecx, 63
lea r11, [r8+rax]
add rax, r9
sub rcx, rdx
jns .L16
add rcx, 64
mov rdx, r10
sal rdx, cl
mov rcx, QWORD PTR [r11-8]
mov rbp, rdx
test rcx, rdx
setne bl
and rbp, QWORD PTR [rax-8]
setne bpl
sub r11, 8
jmp .L4
.L15:
pop rbx
pop rbp
ret
.L11:
ret
My question is similar to "bitwise operations on vector<bool>", but the answers there are dated and don't seem to answer my question.
Update: I tested with a 256-bit std::bitset too, and I still don't see it XORing whole machine words.
#include <bitset>

void xor_vector(std::bitset<256>& a, std::bitset<256>& b) {
    for (std::size_t i=0; i<a.size(); ++i)
        a[i] = a[i] ^ b[i];
}
https://godbolt.org/z/jKEf89E1j
xor_vector(std::bitset<256ul>&, std::bitset<256ul>&):
push rbx
mov r8, rdi
mov r11, rsi
xor edx, edx
mov ebx, 1
.L4:
mov rsi, rdx
mov rcx, rdx
mov rax, rbx
shr rsi, 6
and ecx, 63
sal rax, cl
mov rdi, QWORD PTR [r8+rsi*8]
mov rcx, rax
and rcx, QWORD PTR [r11+rsi*8]
mov rcx, rax
setne r10b
test rax, rdi
not rax
setne r9b
or rcx, rdi
and rax, rdi
cmp r10b, r9b
cmovne rax, rcx
add rdx, 1
mov QWORD PTR [r8+rsi*8], rax
cmp rdx, 256
jne .L4
pop rbx
ret
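For comparison, a minimal sketch of the "maintain your own 64-bit words" idea from the question (assuming both rows have the same number of words): XORing whole machine words is trivially vectorizable by the compiler.
#include <cstdint>
#include <vector>

// One 64-bit XOR per word; assumes a.size() == b.size().
void xor_rows(std::vector<std::uint64_t>& a, const std::vector<std::uint64_t>& b) {
    for (std::size_t i = 0; i < a.size(); ++i)
        a[i] ^= b[i];
}
For the fixed-size case, std::bitset also provides operator^=, so the whole-row XOR can be written as a ^= b; which is implemented word-wise in practice.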

Modulus in Assembly x64 linux question C++ [duplicate]

This question already has answers here:
Why does GCC use multiplication by a strange number in implementing integer division?
(5 answers)
Divide Signed Integer By 2 compiles to complex assembly output, not just a shift
(1 answer)
Closed 1 year ago.
I have these functions in C++
int f1(int a)
{
    int x = a / 2;
}

int f2(int a)
{
    int y = a % 2;
}

int f3(int a)
{
    int z = a % 7;
}

int f4(int a, int b)
{
    int xy = a % b;
}
I looked at their assembly code but couldn't understand what it is doing. I couldn't even find a good reference or a worked example for it. Here is the assembly:
f1(int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
mov eax, DWORD PTR [rbp-20]
mov edx, eax
shr edx, 31
add eax, edx
sar eax
mov DWORD PTR [rbp-4], eax
nop
pop rbp
ret
f2(int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
mov eax, DWORD PTR [rbp-20]
cdq
shr edx, 31
add eax, edx
and eax, 1
sub eax, edx
mov DWORD PTR [rbp-4], eax
nop
pop rbp
ret
f3(int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
mov eax, DWORD PTR [rbp-20]
movsx rdx, eax
imul rdx, rdx, -1840700269
shr rdx, 32
add edx, eax
sar edx, 2
mov esi, eax
sar esi, 31
mov ecx, edx
sub ecx, esi
mov edx, ecx
sal edx, 3
sub edx, ecx
sub eax, edx
mov DWORD PTR [rbp-4], eax
nop
pop rbp
ret
f4(int, int):
push rbp
mov rbp, rsp
mov DWORD PTR [rbp-20], edi
mov DWORD PTR [rbp-24], esi
mov eax, DWORD PTR [rbp-20]
cdq
idiv DWORD PTR [rbp-24]
mov DWORD PTR [rbp-4], edx
nop
pop rbp
ret
Can you please explain, perhaps with an example, what steps the assembly follows to compute the results in these cases, and why they work correctly instead of using a normal divide?
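For reference, here is a minimal C++ sketch of what f3's instruction sequence computes (it assumes arithmetic right shifts for signed values, as on x86): division by the constant 7 is replaced by a fixed-point multiply with a "magic" constant, and the remainder is then recovered as a - (a / 7) * 7.
#include <cstdint>

// a % 7 without idiv: compute q = a / 7 via a multiply, then a - q * 7.
int mod7(int a) {
    std::int64_t prod = static_cast<std::int64_t>(a) * -1840700269LL;  // imul rdx, rdx, -1840700269 (0x92492493)
    std::int32_t hi = static_cast<std::int32_t>(prod >> 32);           // high 32 bits of the product (shr rdx, 32)
    std::int32_t q  = ((hi + a) >> 2) - (a >> 31);                     // add edx, eax ; sar edx, 2 ; sign correction
    return a - q * 7;                                                  // sal 3 / sub gives q*7; the final sub is the remainder
}
f1 and f2 are the same idea in miniature: the shr edx, 31 / add / sar (or and) sequence biases negative inputs before shifting or masking, so the result rounds toward zero the way C division and remainder require.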

Why aren't clang++ and g++ de-duplicating these instructions?

Consider the following function:
#include <string>

std::string get_value(const bool b)
{
    if (b) {
        return "Hello";
    }
    else {
        return "World";
    }
}
g++ 11.0.1 20210312 compiles this (as C++17 and with maximum optimization) into:
get_value[abi:cxx11](bool):
lea rdx, [rdi+16]
mov rax, rdi
mov QWORD PTR [rdi], rdx
test sil, sil
je .L2
mov DWORD PTR [rdi+16], 1819043144
mov BYTE PTR [rdx+4], 111
mov QWORD PTR [rax+8], 5
mov BYTE PTR [rax+21], 0
ret
.L2:
mov DWORD PTR [rdi+16], 1819438935
mov BYTE PTR [rdx+4], 100
mov QWORD PTR [rax+8], 5
mov BYTE PTR [rax+21], 0
ret
Why does it not move the two replicated mov instructions up before the jump, or even before the test, reducing the code size by two instructions?
The same thing happens with clang++ and libc++, except it only has one relevant instruction to move up.
(See this also on GodBolt)
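For what it's worth, a sketch of an equivalent single-return formulation; whether the compiler hoists or merges the common stores is still left to its own heuristics.
#include <string>

// Same five-byte literals, but a single return expression in the source.
std::string get_value2(const bool b)
{
    return b ? "Hello" : "World";
}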

Loop unrolling and SSE -- clang vs gcc

Disclaimer: full code can be found here.
16 byte alignment
Given a fairly simple type to support proper SSE alignment
struct alignas(16) simd_pack
{
    std::int32_t data[4];
};
and a function that adds two arrays together
void add_packed(simd_pack* lhs_and_result, simd_pack* rhs, std::size_t size)
{
    for (std::size_t i = 0; i < size; i++)
        for (std::size_t j = 0; j < 4; j++)
            lhs_and_result[i].data[j] += rhs[i].data[j];
}
we compile the code with clang and gcc using -O3.
Clang produces the following assembly:
add_packed(simd_pack*, simd_pack*, unsigned long): # #add_packed(simd_pack*, simd_pack*, unsigned long)
test rdx, rdx
je .LBB0_3
mov eax, 12
.LBB0_2: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rsi + rax - 12]
add dword ptr [rdi + rax - 12], ecx
mov ecx, dword ptr [rsi + rax - 8]
add dword ptr [rdi + rax - 8], ecx
mov ecx, dword ptr [rsi + rax - 4]
add dword ptr [rdi + rax - 4], ecx
mov ecx, dword ptr [rsi + rax]
add dword ptr [rdi + rax], ecx
add rax, 16
add rdx, -1
jne .LBB0_2
.LBB0_3:
ret
I'm not very literate in assembly but to me it looks like clang is simply unrolling the inner for loop. If we take a look at gcc we get:
add_packed(simd_pack*, simd_pack*, unsigned long):
test rdx, rdx
je .L1
sal rdx, 4
xor eax, eax
.L3:
movdqa xmm0, XMMWORD PTR [rdi+rax]
paddd xmm0, XMMWORD PTR [rsi+rax]
movaps XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rdx
jne .L3
.L1:
ret
which is what I expect.
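As a point of comparison, the aligned add can also be spelled with SSE intrinsics directly, which takes the vectorization decision out of the compiler's cost model entirely; a minimal sketch:
#include <cstddef>
#include <cstdint>
#include <emmintrin.h>

struct alignas(16) simd_pack
{
    std::int32_t data[4];
};

// One aligned 16-byte load per operand, one paddd, one aligned 16-byte store.
void add_packed_sse(simd_pack* lhs_and_result, const simd_pack* rhs, std::size_t size)
{
    for (std::size_t i = 0; i < size; i++) {
        __m128i a = _mm_load_si128(reinterpret_cast<const __m128i*>(lhs_and_result[i].data));
        __m128i b = _mm_load_si128(reinterpret_cast<const __m128i*>(rhs[i].data));
        _mm_store_si128(reinterpret_cast<__m128i*>(lhs_and_result[i].data),
                        _mm_add_epi32(a, b));
    }
}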
64 byte alignment
The difference gets even bigger (obviously) if we go to 64 byte alignment (which usually is a cache line if I'm not mistaken)
struct alignas(64) cache_line
{
    std::int32_t data[16];
};

void add_cache_line(cache_line* lhs_and_result, cache_line* rhs, std::size_t size)
{
    for (std::size_t i = 0; i < size; i++)
        for (std::size_t j = 0; j < 16; j++)
            lhs_and_result[i].data[j] += rhs[i].data[j];
}
Clang keeps simply unrolling:
add_cache_line(cache_line*, cache_line*, unsigned long): # #add_cache_line(cache_line*, cache_line*, unsigned long)
test rdx, rdx
je .LBB1_3
mov eax, 60
.LBB1_2: # =>This Inner Loop Header: Depth=1
mov ecx, dword ptr [rsi + rax - 60]
add dword ptr [rdi + rax - 60], ecx
mov ecx, dword ptr [rsi + rax - 56]
add dword ptr [rdi + rax - 56], ecx
mov ecx, dword ptr [rsi + rax - 52]
add dword ptr [rdi + rax - 52], ecx
mov ecx, dword ptr [rsi + rax - 48]
add dword ptr [rdi + rax - 48], ecx
mov ecx, dword ptr [rsi + rax - 44]
add dword ptr [rdi + rax - 44], ecx
mov ecx, dword ptr [rsi + rax - 40]
add dword ptr [rdi + rax - 40], ecx
mov ecx, dword ptr [rsi + rax - 36]
add dword ptr [rdi + rax - 36], ecx
mov ecx, dword ptr [rsi + rax - 32]
add dword ptr [rdi + rax - 32], ecx
mov ecx, dword ptr [rsi + rax - 28]
add dword ptr [rdi + rax - 28], ecx
mov ecx, dword ptr [rsi + rax - 24]
add dword ptr [rdi + rax - 24], ecx
mov ecx, dword ptr [rsi + rax - 20]
add dword ptr [rdi + rax - 20], ecx
mov ecx, dword ptr [rsi + rax - 16]
add dword ptr [rdi + rax - 16], ecx
mov ecx, dword ptr [rsi + rax - 12]
add dword ptr [rdi + rax - 12], ecx
mov ecx, dword ptr [rsi + rax - 8]
add dword ptr [rdi + rax - 8], ecx
mov ecx, dword ptr [rsi + rax - 4]
add dword ptr [rdi + rax - 4], ecx
mov ecx, dword ptr [rsi + rax]
add dword ptr [rdi + rax], ecx
add rax, 64
add rdx, -1
jne .LBB1_2
.LBB1_3:
ret
while gcc uses SSE and also unrolls that:
add_cache_line(cache_line*, cache_line*, unsigned long):
mov rcx, rdx
test rdx, rdx
je .L9
sal rcx, 6
mov rax, rdi
mov rdx, rsi
add rcx, rdi
.L11:
movdqa xmm2, XMMWORD PTR [rdx+16]
movdqa xmm3, XMMWORD PTR [rax]
add rax, 64
add rdx, 64
movdqa xmm1, XMMWORD PTR [rdx-32]
movdqa xmm0, XMMWORD PTR [rdx-16]
paddd xmm3, XMMWORD PTR [rdx-64]
paddd xmm2, XMMWORD PTR [rax-48]
paddd xmm1, XMMWORD PTR [rax-32]
paddd xmm0, XMMWORD PTR [rax-16]
movaps XMMWORD PTR [rax-64], xmm3
movaps XMMWORD PTR [rax-48], xmm2
movaps XMMWORD PTR [rax-32], xmm1
movaps XMMWORD PTR [rax-16], xmm0
cmp rax, rcx
jne .L11
.L9:
ret
No alignment
It gets more interesting if we use plain 32-bit integer arrays with no alignment at all, using the exact same compiler flags.
void add_unaligned(std::int32_t* lhs_and_result, std::int32_t* rhs, std::size_t size)
{
    for (std::size_t i = 0; i < size; i++)
        lhs_and_result[i] += rhs[i];
}
Clang
Clang's assembly exploded a fair bit, adding several branches:
add_unaligned(int*, int*, unsigned long): # #add_unaligned(int*, int*, unsigned long)
test rdx, rdx
je .LBB2_16
cmp rdx, 7
jbe .LBB2_2
lea rax, [rsi + 4*rdx]
cmp rax, rdi
jbe .LBB2_9
lea rax, [rdi + 4*rdx]
cmp rax, rsi
jbe .LBB2_9
.LBB2_2:
xor r10d, r10d
.LBB2_3:
mov r8, r10
not r8
add r8, rdx
mov rcx, rdx
and rcx, 3
je .LBB2_5
.LBB2_4: # =>This Inner Loop Header: Depth=1
mov eax, dword ptr [rsi + 4*r10]
add dword ptr [rdi + 4*r10], eax
add r10, 1
add rcx, -1
jne .LBB2_4
.LBB2_5:
cmp r8, 3
jb .LBB2_16
.LBB2_6: # =>This Inner Loop Header: Depth=1
mov eax, dword ptr [rsi + 4*r10]
add dword ptr [rdi + 4*r10], eax
mov eax, dword ptr [rsi + 4*r10 + 4]
add dword ptr [rdi + 4*r10 + 4], eax
mov eax, dword ptr [rsi + 4*r10 + 8]
add dword ptr [rdi + 4*r10 + 8], eax
mov eax, dword ptr [rsi + 4*r10 + 12]
add dword ptr [rdi + 4*r10 + 12], eax
add r10, 4
cmp rdx, r10
jne .LBB2_6
jmp .LBB2_16
.LBB2_9:
mov r10, rdx
and r10, -8
lea rax, [r10 - 8]
mov r9, rax
shr r9, 3
add r9, 1
mov r8d, r9d
and r8d, 1
test rax, rax
je .LBB2_10
sub r9, r8
xor ecx, ecx
.LBB2_12: # =>This Inner Loop Header: Depth=1
movdqu xmm0, xmmword ptr [rsi + 4*rcx]
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 16]
movdqu xmm2, xmmword ptr [rdi + 4*rcx]
paddd xmm2, xmm0
movdqu xmm0, xmmword ptr [rdi + 4*rcx + 16]
paddd xmm0, xmm1
movdqu xmm1, xmmword ptr [rdi + 4*rcx + 32]
movdqu xmm3, xmmword ptr [rdi + 4*rcx + 48]
movdqu xmmword ptr [rdi + 4*rcx], xmm2
movdqu xmmword ptr [rdi + 4*rcx + 16], xmm0
movdqu xmm0, xmmword ptr [rsi + 4*rcx + 32]
paddd xmm0, xmm1
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 48]
paddd xmm1, xmm3
movdqu xmmword ptr [rdi + 4*rcx + 32], xmm0
movdqu xmmword ptr [rdi + 4*rcx + 48], xmm1
add rcx, 16
add r9, -2
jne .LBB2_12
test r8, r8
je .LBB2_15
.LBB2_14:
movdqu xmm0, xmmword ptr [rsi + 4*rcx]
movdqu xmm1, xmmword ptr [rsi + 4*rcx + 16]
movdqu xmm2, xmmword ptr [rdi + 4*rcx]
paddd xmm2, xmm0
movdqu xmm0, xmmword ptr [rdi + 4*rcx + 16]
paddd xmm0, xmm1
movdqu xmmword ptr [rdi + 4*rcx], xmm2
movdqu xmmword ptr [rdi + 4*rcx + 16], xmm0
.LBB2_15:
cmp r10, rdx
jne .LBB2_3
.LBB2_16:
ret
.LBB2_10:
xor ecx, ecx
test r8, r8
jne .LBB2_14
jmp .LBB2_15
What is happening at .LBB2_4 and .LBB2_6? It looks like it is unrolling a loop again, but I'm not sure what happens there (mainly because of the registers used).
In .LBB2_12 it even unrolls the SSE part. I think it's only unrolled two-fold, though, because it needs two SIMD registers to load each operand now that the data is unaligned. .LBB2_14 contains the SSE part without the unrolling.
How is the control flow here? I'm assuming it should be:
1. keep using the unrolled SSE part until the remaining data is too small to fill all the registers (xmm0..3)
2. switch to the single-stage SSE part and do it once if we have enough data remaining to fill xmm0 (4 integers in our case)
3. process the remaining data (3 operations at most, otherwise it would be SSE-suitable again)
The order of the labels and the jump instructions is confusing; is that (approximately) what happens here?
GCC
GCC's assembly is a bit easier to read:
add_unaligned(int*, int*, unsigned long):
test rdx, rdx
je .L16
lea rcx, [rsi+4]
mov rax, rdi
sub rax, rcx
cmp rax, 8
jbe .L22
lea rax, [rdx-1]
cmp rax, 2
jbe .L22
mov rcx, rdx
xor eax, eax
shr rcx, 2
sal rcx, 4
.L19:
movdqu xmm0, XMMWORD PTR [rdi+rax]
movdqu xmm1, XMMWORD PTR [rsi+rax]
paddd xmm0, xmm1
movups XMMWORD PTR [rdi+rax], xmm0
add rax, 16
cmp rax, rcx
jne .L19
mov rax, rdx
and rax, -4
test dl, 3
je .L16
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
lea rcx, [rax+1]
cmp rdx, rcx
jbe .L16
add rax, 2
mov r8d, DWORD PTR [rsi+rcx*4]
add DWORD PTR [rdi+rcx*4], r8d
cmp rdx, rax
jbe .L16
mov edx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], edx
ret
.L22:
xor eax, eax
.L18:
mov ecx, DWORD PTR [rsi+rax*4]
add DWORD PTR [rdi+rax*4], ecx
add rax, 1
cmp rdx, rax
jne .L18
.L16:
ret
I assume the control flow is similar to clang's:
1. keep using the single-stage SSE part until the remaining data is too small to fill xmm0 and xmm1
2. process the remaining data (3 operations at most, otherwise it would be SSE-suitable again)
It looks like exactly this is happening in .L19, but what is .L18 doing then?
Summary
Here is the full code, including assembly. My questions are:
1. Why does clang unroll the functions that use aligned data instead of using SSE, or a combination of both (like gcc)?
2. What are .LBB2_4 and .LBB2_6 in clang's assembly doing?
3. Are my assumptions about the control flow of the function with the unaligned data correct?
4. What is .L18 in gcc's assembly doing?
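One related experiment, as a sketch (assuming the two arrays never overlap): qualifying the pointers with __restrict promises the compiler that the unaligned arrays cannot alias, which appears to be what the pointer comparisons near the top of both listings are checking at run time.
#include <cstddef>
#include <cstdint>

// __restrict is a GCC/Clang extension; with it the compiler can vectorize
// without emitting runtime overlap checks.
void add_unaligned_restrict(std::int32_t* __restrict lhs_and_result,
                            const std::int32_t* __restrict rhs,
                            std::size_t size)
{
    for (std::size_t i = 0; i < size; i++)
        lhs_and_result[i] += rhs[i];
}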

Different assembly when rangifying a simple algorithm

When I was preparing supplementary info for this question, I noticed that “rangified” implementations of a very simple algorithm resulted in important differences (to my eyes) in the resulting assembly, compared with “legacy” implementations.
I expanded the tests a bit, with the following results (GCC 9.1 -O3):
Case 1. Simple for loop (https://godbolt.org/z/rAVaT2)
#include <vector>

void foo(std::vector<double> &u, std::vector<double> const &v)
{
    for (std::size_t i = 0u; i < u.size(); ++i)
        u[i] += v[i];
}
mov rdx, QWORD PTR [rdi]
mov rdi, QWORD PTR [rdi+8]
sub rdi, rdx
sar rdi, 3
je .L1
mov rcx, QWORD PTR [rsi]
lea rax, [rcx+15]
sub rax, rdx
cmp rax, 30
jbe .L7
lea rax, [rdi-1]
cmp rax, 1
jbe .L7
mov rsi, rdi
xor eax, eax
shr rsi
sal rsi, 4
.L4:
movupd xmm0, XMMWORD PTR [rcx+rax]
movupd xmm1, XMMWORD PTR [rdx+rax]
addpd xmm0, xmm1
movups XMMWORD PTR [rdx+rax], xmm0
add rax, 16
cmp rsi, rax
jne .L4
mov rsi, rdi
and rsi, -2
and edi, 1
je .L1
lea rax, [rdx+rsi*8]
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rcx+rsi*8]
movsd QWORD PTR [rax], xmm0
ret
.L7:
xor eax, eax
.L3:
movsd xmm0, QWORD PTR [rdx+rax*8]
addsd xmm0, QWORD PTR [rcx+rax*8]
movsd QWORD PTR [rdx+rax*8], xmm0
add rax, 1
cmp rdi, rax
jne .L3
.L1:
ret
Case 2. std::transform (https://godbolt.org/z/2iZaqo)
#include <algorithm>
#include <vector>

void foo(std::vector<double> &u, std::vector<double> const &v)
{
    std::transform(std::begin(u), std::end(u),
                   std::begin(v),
                   std::begin(u),
                   std::plus());
}
mov rdx, QWORD PTR [rdi]
mov rax, QWORD PTR [rdi+8]
mov rsi, QWORD PTR [rsi]
cmp rax, rdx
je .L1
sub rax, 8
lea rcx, [rsi+15]
sub rax, rdx
sub rcx, rdx
shr rax, 3
cmp rcx, 30
jbe .L7
movabs rcx, 2305843009213693950
test rax, rcx
je .L7
lea rcx, [rax+1]
xor eax, eax
mov rdi, rcx
shr rdi
sal rdi, 4
.L4:
movupd xmm0, XMMWORD PTR [rdx+rax]
movupd xmm1, XMMWORD PTR [rsi+rax]
addpd xmm0, xmm1
movups XMMWORD PTR [rdx+rax], xmm0
add rax, 16
cmp rax, rdi
jne .L4
mov rdi, rcx
and rdi, -2
lea rax, [0+rdi*8]
add rdx, rax
add rsi, rax
cmp rcx, rdi
je .L1
movsd xmm0, QWORD PTR [rdx]
addsd xmm0, QWORD PTR [rsi]
movsd QWORD PTR [rdx], xmm0
ret
.L7:
xor ecx, ecx
.L3:
movsd xmm0, QWORD PTR [rdx+rcx*8]
addsd xmm0, QWORD PTR [rsi+rcx*8]
mov rdi, rcx
movsd QWORD PTR [rdx+rcx*8], xmm0
add rcx, 1
cmp rax, rdi
jne .L3
.L1:
ret
Case 3. Range-v3 view::zip (https://godbolt.org/z/0BEkfT)
#define RANGES_ASSERT(...) ((void)0)
#include <algorithm>
#include <range/v3/view/zip.hpp>
#include <vector>

void foo(std::vector<double> &u, std::vector<double> const &v)
{
    auto w = ranges::view::zip(u, v);
    std::for_each(std::begin(w), std::end(w),
                  [](auto &&x) { std::get<0u>(x) += std::get<1u>(x); });
}
mov rdx, QWORD PTR [rsi]
mov rsi, QWORD PTR [rsi+8]
mov rax, QWORD PTR [rdi]
mov rcx, QWORD PTR [rdi+8]
cmp rdx, rsi
je .L1
cmp rax, rcx
je .L1
.L3:
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rdx]
add rax, 8
add rdx, 8
movsd QWORD PTR [rax-8], xmm0
cmp rax, rcx
je .L1
cmp rdx, rsi
jne .L3
.L1:
ret
Case 4. cmcstl2 ranges::transform (https://godbolt.org/z/MjYO1G)
#include <experimental/ranges/algorithm>
#include <vector>

namespace std
{
    namespace ranges = experimental::ranges;
}

void foo(std::vector<double> &u, std::vector<double> const &v)
{
    std::ranges::transform(std::ranges::begin(u), std::ranges::end(u),
                           std::ranges::begin(v), std::ranges::end(v),
                           std::ranges::begin(u),
                           std::plus());
}
mov r8, QWORD PTR [rsi+8]
mov rdx, QWORD PTR [rsi]
mov rax, QWORD PTR [rdi]
mov rcx, QWORD PTR [rdi+8]
cmp rdx, r8
je .L1
cmp rcx, rax
jne .L3
jmp .L1
.L16:
cmp rdx, r8
je .L1
.L3:
movsd xmm0, QWORD PTR [rax]
addsd xmm0, QWORD PTR [rdx]
add rax, 8
add rdx, 8
movsd QWORD PTR [rax-8], xmm0
cmp rax, rcx
jne .L16
.L1:
ret
I can't read assembly, but I seem to understand that the output for Case 1 and Case 2 is almost equivalent and involves packed sums, whilst the assembly of the ranges versions (Cases 3 and 4) is much terser but not vectorized.
I would really love to understand what those differences mean. Does my interpretation of the assembly make any sense? What are the additional instructions in the non-ranges versions doing? Why are there these differences?
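As an illustration of the structural difference: the loops in Cases 3 and 4 appear to carry two independent end conditions through the loop (one per range), while Cases 1 and 2 reduce to a single trip count. A sketch with one explicit bound (assuming truncating to the shorter range is acceptable) looks like this:
#include <algorithm>
#include <cstddef>
#include <vector>

// Single trip count, as in Cases 1 and 2.
void foo_single_bound(std::vector<double> &u, std::vector<double> const &v)
{
    std::size_t n = std::min(u.size(), v.size());
    for (std::size_t i = 0; i < n; ++i)
        u[i] += v[i];
}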