gcc assembly when passing by reference and by value - c++

I have a simple function that computes the element-wise product of two double arrays:
#include <stdlib.h>
#include <emmintrin.h>
struct S {
    double *x;
    double *y;
    double *z;
};

void f(S& s, size_t n) {
    for (int i = 0; i < n; i += 2) {
        __m128d xs = _mm_load_pd(&s.x[i]);
        __m128d ys = _mm_load_pd(&s.y[i]);
        _mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys));
    }
    return;
}
int main(void) {
    S s;
    size_t size = 4;
    posix_memalign((void **)&s.x, 16, sizeof(double) * size);
    posix_memalign((void **)&s.y, 16, sizeof(double) * size);
    posix_memalign((void **)&s.z, 16, sizeof(double) * size);
    f(s, size);
    return 0;
}
Note that the first argument of function f is passed in by reference.
Let's look at the resulting assembly of f() (I removed some irrelevant
pieces, inserted comments and put some labels):
$ g++ -O3 -S asmtest.cpp
.globl _Z1fR1Sm
_Z1fR1Sm:
xorl %eax, %eax
testq %rsi, %rsi
je .L1
.L5:
movq (%rdi), %r8 # array x (1)
movq 8(%rdi), %rcx # array y (2)
movq 16(%rdi), %rdx # array z (3)
movapd (%r8,%rax,8), %xmm0 # load x[i]
mulpd (%rcx,%rax,8), %xmm0 # multiply x[i]*y[i]
movaps %xmm0, (%rdx,%rax,8) # store to z[i]
addq $2, %rax # and loop
cmpq %rax, %rsi
ja .L5
Notice that the addresses of arrays x, y, and z are loaded into general-purpose
registers on each iteration; see statements (1), (2), (3). Why doesn't gcc move
these instructions outside the loop?
Now make a local copy (not a deep copy) of the structure:
void __attribute__((noinline)) f(S& args, size_t n) {
    S s = args;
    for (int i = 0; i < n; i += 2) {
        __m128d xs = _mm_load_pd(&s.x[i]);
        __m128d ys = _mm_load_pd(&s.y[i]);
        _mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys));
    }
    return;
}
Assembly:
_Z1fR1Sm:
.LFB525:
.cfi_startproc
xorl %eax, %eax
testq %rsi, %rsi
movq (%rdi), %r8 # (1)
movq 8(%rdi), %rcx # (2)
movq 16(%rdi), %rdx # (3)
je .L1
.L5:
movapd (%r8,%rax,8), %xmm0
mulpd (%rcx,%rax,8), %xmm0
movaps %xmm0, (%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rsi
ja .L5
.L1:
rep ret
Notice that unlike in the previous code,
loads (1), (2), (3) are now outside the loop.
I would appreciate an explanation of why these two assembly
listings differ. Is memory aliasing relevant here?
Thanks.
$ gcc --version
gcc (Debian 5.2.1-21) 5.2.1 20151003

Yes, gcc is reloading s.x, s.y, and s.z on each iteration of the loop because it does not know whether &s.z[i] for some i aliases part of the S object passed by reference to f(S&, size_t).
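Nothing in f's signature rules out a caller whose z pointer aims back into the S object itself, in which case the store in the loop would overwrite s.x and s.y. A contrived sketch of such a caller, purely to illustrate what the compiler has to allow for (it reuses S and f as defined above):
alignas(16) S s;                       // align the struct itself so the 16-byte store can target it
size_t size = 4;
posix_memalign((void **)&s.x, 16, sizeof(double) * size);
posix_memalign((void **)&s.y, 16, sizeof(double) * size);
s.z = reinterpret_cast<double *>(&s);  // stores through s.z now land on top of s.x and s.y
f(s, 2);
Because gcc declares __m128d with __may_alias__, the intrinsic store may overlap objects of any type, including the pointer members, so the compiler re-reads them from the struct after every store. Copying the struct into a local, as in the second version, removes that possibility.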
With gcc 5.2.0, applying __restrict__ to S::z and the s reference parameter to f(), i.e.:
struct S {
    double *x;
    double *y;
    double *__restrict__ z;
};

void f(S& __restrict__ s, size_t n) {
    for (int i = 0; i < n; i += 2) {
        __m128d xs = _mm_load_pd(&s.x[i]);
        __m128d ys = _mm_load_pd(&s.y[i]);
        _mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys));
    }
    return;
}
... causes gcc to generate:
__Z1fR1Sm:
LFB518:
testq %rsi, %rsi
je L1
movq (%rdi), %r8
xorl %eax, %eax
movq 8(%rdi), %rcx
movq 16(%rdi), %rdx
.align 4,0x90
L4:
movapd (%r8,%rax,8), %xmm0
mulpd (%rcx,%rax,8), %xmm0
movaps %xmm0, (%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rsi
ja L4
L1:
ret
With Apple Clang 700.1.76, only __restrict__ on the s reference is needed:
__Z1fR1Sm: ## #_Z1fR1Sm
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
testq %rsi, %rsi
je LBB0_3
## BB#1: ## %.lr.ph
movq (%rdi), %rax
movq 8(%rdi), %rcx
movq 16(%rdi), %rdx
xorl %edi, %edi
.align 4, 0x90
LBB0_2: ## =>This Inner Loop Header: Depth=1
movapd (%rax,%rdi,8), %xmm0
mulpd (%rcx,%rdi,8), %xmm0
movapd %xmm0, (%rdx,%rdi,8)
addq $2, %rdi
cmpq %rsi, %rdi
jb LBB0_2
LBB0_3: ## %._crit_edge
popq %rbp
retq
.cfi_endproc
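For reference, the source that produced the Clang listing above keeps the plain struct and restrict-qualifies only the reference parameter; this is a reconstruction from the description rather than the answerer's exact file:
struct S {
    double *x;
    double *y;
    double *z;
};

void f(S& __restrict__ s, size_t n) {
    for (int i = 0; i < n; i += 2) {
        __m128d xs = _mm_load_pd(&s.x[i]);
        __m128d ys = _mm_load_pd(&s.y[i]);
        _mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys));
    }
}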

Related

Expression template code not optimized fully

I have the following linear algebra function call (vector-vector addition) in C++.
int m = 4;
blasfeo_dvec one, two, three;
blasfeo_allocate_dvec(m, &one);
blasfeo_allocate_dvec(m, &two);
blasfeo_allocate_dvec(m, &three);
// initialize vectors ... (omitted)
blasfeo_daxpy(m, 1.0, &one, 0, &two, 0, &three, 0);
Using expression templates (ETs), we can wrap it as follows:
three = one + two;
where the vector struct looks like
struct blasfeo_dvec {
    int m;        // length
    int pm;       // packed length
    double *pa;   // pointer to a pm array of doubles, the first is aligned to cache line size
    int memsize;  // size of needed memory

    void operator=(const vec_expression_sum<blasfeo_dvec, blasfeo_dvec> expr) {
        blasfeo_daxpy(m, 1.0, (blasfeo_dvec *) &expr.vec_a, 0, (blasfeo_dvec *) &expr.vec_b, 0, this, 0);
    }
};
The cast to non-const is necessary because blasfeo_daxpy takes non-const pointers. The ET code is simply
template<typename Ta, typename Tb>
struct vec_expression_sum {
    const Ta vec_a;
    const Tb vec_b;
    vec_expression_sum(const Ta va, const Tb vb) : vec_a {va}, vec_b {vb} {}
};

template<typename Ta, typename Tb>
auto operator+(const Ta a, const Tb b) {
    return vec_expression_sum<Ta, Tb>(a, b);
}
The 'native' call, i.e. blasfeo_daxpy(...) generates the following assembly:
; allocation and initialization omitted ...
movl $0, (%rsp)
movl $4, %edi
xorl %edx, %edx
xorl %r8d, %r8d
movsd LCPI0_0(%rip), %xmm0 ## xmm0 = mem[0],zero
movq %r14, %rsi
movq %rbx, %rcx
movq %r15, %r9
callq _blasfeo_daxpy
...
which is exactly what you would expect. The ET code is quite a bit longer:
; allocation :
leaq -120(%rbp), %rbx
movl $4, %edi
movq %rbx, %rsi
callq _blasfeo_allocate_dvec
leaq -96(%rbp), %r15
movl $4, %edi
movq %r15, %rsi
callq _blasfeo_allocate_dvec
leaq -192(%rbp), %r14
movl $4, %edi
movq %r14, %rsi
callq _blasfeo_allocate_dvec
; initialization code omitted
; operator+ :
movq -104(%rbp), %rax
movq %rax, -56(%rbp)
movq -120(%rbp), %rax
movq -112(%rbp), %rcx
movq %rcx, -64(%rbp)
movq %rax, -72(%rbp)
; vec_expression_sum :
movq -80(%rbp), %rax
movq %rax, -32(%rbp)
movq -96(%rbp), %rax
movq -88(%rbp), %rcx
movq %rcx, -40(%rbp)
movq %rax, -48(%rbp)
movq -32(%rbp), %rax
movq %rax, -128(%rbp)
movq -40(%rbp), %rax
movq %rax, -136(%rbp)
movq -48(%rbp), %rax
movq %rax, -144(%rbp)
movq -56(%rbp), %rax
movq %rax, -152(%rbp)
movq -72(%rbp), %rax
movq -64(%rbp), %rcx
movq %rcx, -160(%rbp)
movq %rax, -168(%rbp)
leaq -144(%rbp), %rcx
; blasfeo_daxpy :
movl -192(%rbp), %edi
movl $0, (%rsp)
leaq -168(%rbp), %rsi
xorl %edx, %edx
xorl %r8d, %r8d
movsd LCPI0_0(%rip), %xmm0 ## xmm0 = mem[0],zero
movq %r14, %r9
callq _blasfeo_daxpy
...
It involves quite a bit of copying, namely the fields of blasfeo_dvec. I (naively, maybe) hoped that the ET code would generate the exact same code as the native call, given that everything is fixed at compile time and const, but it doesn't.
The question is: why the extra loads? And is there a way of getting fully 'optimized' code? (edit: I use Apple LLVM version 8.1.0 (clang-802.0.42) with -std=c++14 -O3)
Note: I read and understood this and this post on a similar topic, but they unfortunately do not contain an answer to my question.
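One plausible source of the extra moves is that vec_expression_sum stores its operands by value, so every field of both blasfeo_dvec structs is copied when the expression object is built. A sketch of the usual expression-template remedy, holding references instead (safe in three = one + two; because the operands outlive the assignment, but not if the expression object is stored for later):
template<typename Ta, typename Tb>
struct vec_expression_sum {
    const Ta& vec_a;   // references: building the expression no longer copies the structs
    const Tb& vec_b;
    vec_expression_sum(const Ta& va, const Tb& vb) : vec_a {va}, vec_b {vb} {}
};

template<typename Ta, typename Tb>
auto operator+(const Ta& a, const Tb& b) {
    return vec_expression_sum<Ta, Tb>(a, b);
}
operator= would then also take the expression by const reference. Whether this collapses to exactly the native call still depends on the inliner, so treat it as a direction to try rather than a guaranteed fix.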

mmap Mac: Segmentation fault

The following on my Mac succeeds:
int main() {
    int* addr = (int*) mmap(0, 100, 1 | 2, 2 | 4096, -1, 0);
    *addr = 25;
    return 0;
}
However, the code below is otherwise identical but fails with a segmentation fault when I try to write to *addr:
int main() {
    int* addr = (int*) syscall(SYS_mmap, 0, 100, 1 | 2, 2 | 4096, -1, 0);
    *addr = 25;
    return 0;
}
That is, syscall successfully returns a memory address, but when I try writing to it, it fails.
I compile it like this:
g++ ./c++/mmap.cc -o ./mmap && ./mmap
If I run both versions with dtruss:
g++ ./c++/mmap.cc -o ./mmap && sudo dtruss ./mmap
then both versions succeed and I see an identical mmap call for both:
mmap(0x0, 0x64, 0x3, 0x1002, 0xFFFFFFFF, 0x0) = 0xXXXXXXX 0
Why does the syscall version give me segmentation fault, what am I missing?
P.S. If I do something similar on Linux it works fine.
So, as I understand it, the mmap function on Mac does not execute syscall(SYS_mmap, ...). What does it do then? Can anyone please give me some links to where I can see the implementation?
EDIT:
It looks like syscall on Mac returns only the first 4 bytes. Is there a 64-bit syscall version?
DISASSEMBLED:
mmap version:
_main:
0000000100000cf0 pushq %rbp
0000000100000cf1 movq %rsp, %rbp
0000000100000cf4 subq $0x30, %rsp
0000000100000cf8 xorl %eax, %eax
0000000100000cfa movl %eax, %ecx
0000000100000cfc movl $0x64, %eax
0000000100000d01 movl %eax, %esi
0000000100000d03 movl $0x3, %edx
0000000100000d08 movl $0x1002, %eax
0000000100000d0d movl $0xffffffff, %r8d
0000000100000d13 movl $0x0, -0x14(%rbp)
0000000100000d1a movq %rcx, %rdi
0000000100000d1d movq %rcx, -0x28(%rbp)
0000000100000d21 movl %eax, %ecx
0000000100000d23 movq -0x28(%rbp), %r9
0000000100000d27 callq 0x100000ed6 ## symbol stub for: _mmap
0000000100000d2c movq 0x2cd(%rip), %rdi ## literal pool symbol address: __ZNSt3__14coutE
0000000100000d33 movq %rax, -0x20(%rbp)
0000000100000d37 movq -0x20(%rbp), %rax
0000000100000d3b movq %rax, %rsi
syscall version:
_main:
0000000100000cf0 pushq %rbp
0000000100000cf1 movq %rsp, %rbp
0000000100000cf4 subq $0x30, %rsp
0000000100000cf8 movl $0xc5, %edi
0000000100000cfd xorl %esi, %esi
0000000100000cff movl $0x64, %edx
0000000100000d04 movl $0x3, %ecx
0000000100000d09 movl $0x1002, %r8d
0000000100000d0f movl $0xffffffff, %r9d
0000000100000d15 movl $0x0, -0x14(%rbp)
0000000100000d1c movl $0x0, (%rsp)
0000000100000d23 movb $0x0, %al
0000000100000d25 callq 0x100000ed6 ## symbol stub for: _syscall
0000000100000d2a movq 0x2cf(%rip), %rdi ## literal pool symbol address: __ZNSt3__14coutE
0000000100000d31 movslq %eax, %r10
0000000100000d34 movq %r10, -0x20(%rbp)
0000000100000d38 movq -0x20(%rbp), %r10
0000000100000d3c movq %r10, %rsi
Apparently Mac does not have a 64-bit syscall function; here is a simple implementation:
#include <sys/types.h>

#define CARRY_FLAG_BIT 1

inline int64_t syscall6(int64_t num, int64_t arg1, int64_t arg2, int64_t arg3,
                        int64_t arg4, int64_t arg5, int64_t arg6) {
    int64_t result;
    int64_t flags;
    __asm__ __volatile__ (
        "movq %6, %%r10;\n"
        "movq %7, %%r8;\n"
        "movq %8, %%r9;\n"
        "syscall;\n"
        "movq %%r11, %1;\n"
        : "=a" (result), "=r" (flags)
        : "a" (num), "D" (arg1), "S" (arg2), "d" (arg3), "r" (arg4), "r" (arg5), "r" (arg6)
        : "%r10", "%r8", "%r9", "%rcx", "%r11"
    );
    return (flags & CARRY_FLAG_BIT) ? -result : result;
}
And you use it on Mac by shifting system call numbers by 0x2000000:
int* addr = (int*) syscall6(0x2000000 + SYS_mmap, 0, 100, 1 | 2, 2 | 4096, -1, 0);
You can find more here.
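Putting it together, a minimal sketch of the failing test rewritten on top of syscall6 (assuming the definition above; SYS_mmap comes from <sys/syscall.h>, and 1 | 2 and 2 | 4096 are the same PROT_READ | PROT_WRITE and MAP_PRIVATE | MAP_ANON values used in the question):
#include <cstdint>
#include <sys/syscall.h>   // SYS_mmap

// ... syscall6() as defined above ...

int main() {
    // the full 64-bit return value is preserved, unlike with the int-returning syscall()
    int* addr = (int*) syscall6(0x2000000 + SYS_mmap, 0, 100, 1 | 2, 2 | 4096, -1, 0);
    *addr = 25;
    return 0;
}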

Vectorization of sin and cos

I was playing around with Compiler Explorer and ran into an anomaly (I think). If I want to make the compiler vectorize a sin calculation using libmvec, I would write:
#include <cmath>

#define NN 512

typedef float T;
typedef T __attribute__((aligned(NN))) AT;

inline T s(const T x)
{
    return sinf(x);
}

void func(AT* __restrict x, AT* __restrict y, int length)
{
    if (length & NN-1) __builtin_unreachable();
    for (int i = 0; i < length; i++)
    {
        y[i] = s(x[i]);
    }
}
compile with gcc 6.2 and -O3 -march=native -ffast-math and get
func(float*, float*, int):
testl %edx, %edx
jle .L10
leaq 8(%rsp), %r10
andq $-32, %rsp
pushq -8(%r10)
pushq %rbp
movq %rsp, %rbp
pushq %r14
xorl %r14d, %r14d
pushq %r13
leal -8(%rdx), %r13d
pushq %r12
shrl $3, %r13d
movq %rsi, %r12
pushq %r10
addl $1, %r13d
pushq %rbx
movq %rdi, %rbx
subq $8, %rsp
.L4:
vmovaps (%rbx), %ymm0
addl $1, %r14d
addq $32, %r12
addq $32, %rbx
call _ZGVcN8v_sinf // YAY! Vectorized trig!
vmovaps %ymm0, -32(%r12)
cmpl %r13d, %r14d
jb .L4
vzeroupper
addq $8, %rsp
popq %rbx
popq %r10
popq %r12
popq %r13
popq %r14
popq %rbp
leaq -8(%r10), %rsp
.L10:
ret
But when I add a cosine to the function, there is no vectorization:
#include <cmath>

#define NN 512

typedef float T;
typedef T __attribute__((aligned(NN))) AT;

inline T f(const T x)
{
    return cosf(x)+sinf(x);
}

void func(AT* __restrict x, AT* __restrict y, int length)
{
    if (length & NN-1) __builtin_unreachable();
    for (int i = 0; i < length; i++)
    {
        y[i] = f(x[i]);
    }
}
which gives:
func(float*, float*, int):
testl %edx, %edx
jle .L10
pushq %r12
leal -1(%rdx), %eax
pushq %rbp
leaq 4(%rdi,%rax,4), %r12
movq %rsi, %rbp
pushq %rbx
movq %rdi, %rbx
subq $16, %rsp
.L4:
vmovss (%rbx), %xmm0
leaq 8(%rsp), %rsi
addq $4, %rbx
addq $4, %rbp
leaq 12(%rsp), %rdi
call sincosf // No vectorization
vmovss 12(%rsp), %xmm0
vaddss 8(%rsp), %xmm0, %xmm0
vmovss %xmm0, -4(%rbp)
cmpq %rbx, %r12
jne .L4
addq $16, %rsp
popq %rbx
popq %rbp
popq %r12
.L10:
ret
I see two good alternatives: either call a vectorized version of sincosf, or call the vectorized sin and cos sequentially. I tried adding -fno-builtin-sincos to no avail. -fopt-info-vec-missed complains about complex float, of which there is none.
Is this a known issue with gcc? Either way, is there a way I can convince gcc to vectorize the latter example?
(As an aside, is there any way to get gcc < 6 to vectorize trigonometric functions automatically?)
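One workaround along the lines of the second alternative (calling the vectorized sin and cos sequentially) is to split the computation into two loops, so gcc can map each call to its own libmvec routine instead of fusing them into scalar sincosf. A sketch, at the cost of streaming over y twice, and not verified on every gcc version:
void func(AT* __restrict x, AT* __restrict y, int length)
{
    if (length & NN-1) __builtin_unreachable();
    for (int i = 0; i < length; i++)   // should vectorize to _ZGVcN8v_sinf
        y[i] = sinf(x[i]);
    for (int i = 0; i < length; i++)   // should vectorize to _ZGVcN8v_cosf
        y[i] += cosf(x[i]);
}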

In C/C++ arithmetic operation inside of for statement arguments

Let's say I have this code:
int v;
setV(&v);
for (int i = 0; i < v - 5; i++) {
    // Do stuff here, but don't use v.
}
Will the operation v - 5 be run every time or will a modern compiler be smart enough to store it once and never run it again?
What if I did this:
int v;
setV(&v);
const int cv = v;
for (int i = 0; i < cv - 5; i++) {
    // Do stuff here. Changing cv is actually impossible.
}
Would the second style make a difference?
Edit:
This was an interesting question for an unexpected reason. It's really a question of the compiler having to guard against the obscure case of unintended aliasing of v. If the compiler can prove that this cannot happen (version 2), we get better code.
The lesson here is to be more concerned with eliminating aliasing than trying to do the optimiser's job for it.
Making the copy cv actually presented the biggest optimisation (elision of redundant memory fetches), even though at a first glance it would appear to be (slightly) less efficient.
original answer and demo:
Let's see:
given:
extern void setV(int*);
extern void do_something(int i);

void test1()
{
    int v;
    setV(&v);
    for (int i = 0; i < v - 5; i++) {
        // Do stuff here, but don't use v.
        do_something(i);
    }
}

void test2()
{
    int v;
    setV(&v);
    const int cv = v;
    for (int i = 0; i < cv - 5; i++) {
        // Do stuff here. Changing cv is actually impossible.
        do_something(i);
    }
}
compile on gcc5.3 with -x c++ -std=c++14 -O2 -Wall
gives:
test1():
pushq %rbx
subq $16, %rsp
leaq 12(%rsp), %rdi
call setV(int*)
cmpl $5, 12(%rsp)
jle .L1
xorl %ebx, %ebx
.L5:
movl %ebx, %edi
addl $1, %ebx
call do_something(int)
movl 12(%rsp), %eax
subl $5, %eax
cmpl %ebx, %eax
jg .L5
.L1:
addq $16, %rsp
popq %rbx
ret
test2():
pushq %rbp
pushq %rbx
subq $24, %rsp
leaq 12(%rsp), %rdi
call setV(int*)
movl 12(%rsp), %eax
cmpl $5, %eax
jle .L8
leal -5(%rax), %ebp
xorl %ebx, %ebx
.L12:
movl %ebx, %edi
addl $1, %ebx
call do_something(int)
cmpl %ebp, %ebx
jne .L12
.L8:
addq $24, %rsp
popq %rbx
popq %rbp
ret
The second form is better on this compiler.
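To see why the reload in test1 is forced rather than merely missed, consider definitions of setV and do_something (illustrative only; the static pointer saved is invented for the example) under which v really does change during the loop:
static int* saved;

void setV(int* p) { *p = 100; saved = p; }   // remembers where v lives

void do_something(int i) {
    if (i == 3) *saved -= 50;                // changes v mid-loop, so v - 5 must be re-evaluated
}
test1 has to stay correct for callers like this, so the compiler re-reads v after every call. In test2 the bound is cv - 5, and cv is a local copy that do_something cannot reach, so it is computed once before the loop.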

Segmentation fault with array of __m256i when using clang/g++

I'm attempting to generate arrays of __m256i's to reuse in another computation. When I attempt to do that (even with a minimal testcase), I get a segmentation fault - but only if the code is compiled with g++ or clang. If I compile the code with the Intel compiler (version 16.0), no segmentation fault occurs. Here is a test case I created:
int main() {
    __m256i *table = new __m256i[10000];
    __m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
    table[99] = zeroes;
}
When compiling the above with clang 3.6 and g++ 4.8, a segmentation fault occurs.
Here's the assembly generated by the Intel compiler (from https://gcc.godbolt.org/, icc 13.0):
pushq %rbx #3.12
movq %rsp, %rbx #3.12
andq $-32, %rsp #3.12
pushq %rbp #3.12
pushq %rbp #3.12
movq 8(%rbx), %rbp #3.12
movq %rbp, 8(%rsp) #3.12
movq %rsp, %rbp #3.12
subq $112, %rsp #3.12
movl $3200, %eax #4.38
vzeroupper #4.38
movq %rax, %rdi #4.38
call operator new[](unsigned long) #4.38
movq %rax, -112(%rbp) #4.38
movq -112(%rbp), %rax #4.38
movq %rax, -104(%rbp) #4.20
vxorps %ymm0, %ymm0, %ymm0 #5.22
vmovdqu %ymm0, -80(%rbp) #5.22
vmovdqu -80(%rbp), %ymm0 #5.22
vmovdqu %ymm0, -48(%rbp) #5.20
movl $3168, %eax #6.17
addq -104(%rbp), %rax #6.5
vmovdqu -48(%rbp), %ymm0 #6.17
vmovdqu %ymm0, (%rax) #6.5
movl $0, %eax #7.1
vzeroupper #7.1
leave #7.1
movq %rbx, %rsp #7.1
popq %rbx #7.1
ret #7.1
And here's from clang 3.7:
pushq %rbp
movq %rsp, %rbp
andq $-32, %rsp
subq $192, %rsp
xorl %eax, %eax
movl $3200, %ecx # imm = 0xC80
movl %ecx, %edi
movl %eax, 28(%rsp) # 4-byte Spill
callq operator new[](unsigned long)
movq %rax, 88(%rsp)
movq $0, 168(%rsp)
movq $0, 160(%rsp)
movq $0, 152(%rsp)
movq $0, 144(%rsp)
vmovq 168(%rsp), %xmm0 # xmm0 = mem[0],zero
vmovq 160(%rsp), %xmm1 # xmm1 = mem[0],zero
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovq 152(%rsp), %xmm1 # xmm1 = mem[0],zero
vpslldq $8, %xmm1, %xmm1 # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
vmovaps %xmm1, %xmm2
vinserti128 $1, %xmm0, %ymm2, %ymm2
vmovaps %ymm2, 96(%rsp)
vmovaps %ymm2, 32(%rsp)
movq 88(%rsp), %rax
vmovaps %ymm2, 3168(%rax)
movl 28(%rsp), %eax # 4-byte Reload
movq %rbp, %rsp
popq %rbp
vzeroupper
retq
Am I running into a compiler bug in clang/g++? Or am I simply doing something wrong?
I have said many times before that implicit SIMD loads/stores are a bad idea. Stop using them. Use explicit loads/stores like this:
int64_t* table = new int64_t[4*10000];
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
_mm256_storeu_si256((__m256i*)&table[4*99], zeroes);
or, since this is POD, use the cross-compiler/OS function _mm_malloc:
int64_t* table = (int64_t*)_mm_malloc(sizeof(int64_t)*4*10000, 32);
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
_mm256_store_si256((__m256i*)&table[4*99], zeroes);
You can use _mm256_setzero_si256() instead of _mm256_set_epi64x(0, 0, 0, 0) (note that _mm256_set_epi64x does not work in 32-bit mode on some versions of MSVC), but GCC and Clang are smart enough to know they are the same thing.
Since you're using intrinsics, which are not part of the C/C++ specification, some rules such as strict aliasing may be overlooked.
I guess the problem has to do with wrong memory alignment: vmovaps requires the memory location to start at a 32-byte boundary, while vmovdqu does not. operator new[] typically returns only 16-byte-aligned memory, so table[99] is not guaranteed to sit on a 32-byte boundary. That's why the Intel version works whereas the clang/g++ code crashes. I don't know if this is a compiler bug, but you probably want aligned memory anyway.
The following code should work, although it's more C than C++.
#include <malloc.h>      // memalign (glibc)
#include <immintrin.h>

int main() {
    __m256i *table = (__m256i*) memalign(32, 10000 * sizeof(__m256i));
    __m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
    table[99] = zeroes;
}
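If you prefer to avoid memalign (it is not available everywhere, e.g. on macOS), a sketch using C11/C++17 aligned_alloc does the same job, assuming your toolchain provides it; the requested size must be a multiple of the alignment, which 10000 * sizeof(__m256i) is:
#include <cstdlib>         // std::aligned_alloc (C++17)
#include <immintrin.h>

int main() {
    __m256i *table = static_cast<__m256i*>(
        std::aligned_alloc(32, 10000 * sizeof(__m256i)));   // 32-byte-aligned storage
    __m256i zeroes = _mm256_setzero_si256();
    _mm256_store_si256(&table[99], zeroes);                  // aligned store is now safe
    std::free(table);
}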