Expression template code not optimized fully - C++

I have the following linear algebra function call (vector-vector addition) in C++.
int m = 4;
blasfeo_dvec one, two, three;
blasfeo_allocate_dvec(m, &one);
blasfeo_allocate_dvec(m, &two);
blasfeo_allocate_dvec(m, &three);
// initialize vectors ... (omitted)
blasfeo_daxpy(m, 1.0, &one, 0, &two, 0, &three, 0);
Using expression templates (ETs), we can wrap it as follows:
three = one + two;
where the vector struct looks like
struct blasfeo_dvec {
    int m;         // length
    int pm;        // packed length
    double *pa;    // pointer to a pm array of doubles, the first is aligned to cache line size
    int memsize;   // size of needed memory

    void operator=(const vec_expression_sum<blasfeo_dvec, blasfeo_dvec> expr) {
        blasfeo_daxpy(m, 1.0, (blasfeo_dvec *) &expr.vec_a, 0, (blasfeo_dvec *) &expr.vec_b, 0, this, 0);
    }
};
The cast to non-const is necessary because blasfeo_daxpy takes non-const pointers. The ET code is simply
template<typename Ta, typename Tb>
struct vec_expression_sum {
    const Ta vec_a;
    const Tb vec_b;

    vec_expression_sum(const Ta va, const Tb vb) : vec_a {va}, vec_b {vb} {}
};

template<typename Ta, typename Tb>
auto operator+(const Ta a, const Tb b) {
    return vec_expression_sum<Ta, Tb>(a, b);
}
The 'native' call, i.e. blasfeo_daxpy(...), generates the following assembly:
; allocation and initialization omitted ...
movl $0, (%rsp)
movl $4, %edi
xorl %edx, %edx
xorl %r8d, %r8d
movsd LCPI0_0(%rip), %xmm0 ## xmm0 = mem[0],zero
movq %r14, %rsi
movq %rbx, %rcx
movq %r15, %r9
callq _blasfeo_daxpy
...
which is exactly what you would expect. The ET code is quite a bit longer:
; allocation :
leaq -120(%rbp), %rbx
movl $4, %edi
movq %rbx, %rsi
callq _blasfeo_allocate_dvec
leaq -96(%rbp), %r15
movl $4, %edi
movq %r15, %rsi
callq _blasfeo_allocate_dvec
leaq -192(%rbp), %r14
movl $4, %edi
movq %r14, %rsi
callq _blasfeo_allocate_dvec
; initialization code omitted
; operator+ :
movq -104(%rbp), %rax
movq %rax, -56(%rbp)
movq -120(%rbp), %rax
movq -112(%rbp), %rcx
movq %rcx, -64(%rbp)
movq %rax, -72(%rbp)
; vec_expression_sum :
movq -80(%rbp), %rax
movq %rax, -32(%rbp)
movq -96(%rbp), %rax
movq -88(%rbp), %rcx
movq %rcx, -40(%rbp)
movq %rax, -48(%rbp)
movq -32(%rbp), %rax
movq %rax, -128(%rbp)
movq -40(%rbp), %rax
movq %rax, -136(%rbp)
movq -48(%rbp), %rax
movq %rax, -144(%rbp)
movq -56(%rbp), %rax
movq %rax, -152(%rbp)
movq -72(%rbp), %rax
movq -64(%rbp), %rcx
movq %rcx, -160(%rbp)
movq %rax, -168(%rbp)
leaq -144(%rbp), %rcx
; blasfeo_daxpy :
movl -192(%rbp), %edi
movl $0, (%rsp)
leaq -168(%rbp), %rsi
xorl %edx, %edx
xorl %r8d, %r8d
movsd LCPI0_0(%rip), %xmm0 ## xmm0 = mem[0],zero
movq %r14, %r9
callq _blasfeo_daxpy
...
It involves quite a bit of copying, namely of the fields of blasfeo_dvec. I (naively, maybe) hoped that the ET code would generate exactly the same code as the native call, given that everything is fixed at compile time and const, but it doesn't.
The question is: why the extra loads? And is there a way of getting fully 'optimized' code? (edit: I use Apple LLVM version 8.1.0 (clang-802.0.42) with -std=c++14 -O3)
Note: I read and understood this and this post on a similar topic, but they unfortunately do not contain an answer to my question.
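One thing worth noting about the code above (a sketch of an alternative on my part, not a verified fix for this exact compiler version): vec_expression_sum stores its operands by value, so building the expression object copies both blasfeo_dvec structs. Holding references instead avoids those member-wise copies, at the usual cost of having to be careful that the referenced vectors outlive the expression object:

// Sketch only: members are references, so no blasfeo_dvec fields are copied
// when the expression object is built. The operator= in blasfeo_dvec would
// then take a const vec_expression_sum_ref<...>& and forward vec_a / vec_b
// directly to blasfeo_daxpy.
template<typename Ta, typename Tb>
struct vec_expression_sum_ref {
    const Ta &vec_a;
    const Tb &vec_b;

    vec_expression_sum_ref(const Ta &va, const Tb &vb) : vec_a {va}, vec_b {vb} {}
};

template<typename Ta, typename Tb>
auto operator+(const Ta &a, const Tb &b) {
    return vec_expression_sum_ref<Ta, Tb>(a, b);
}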

Move semantics appear to add additional overhead in the simplest possible* example

* I believe this is the simplest possible example, but if I'm incorrect please let me know.
https://godbolt.org/z/neaTse
I'm attempting to learn and understand move semantics and some of their intricacies, but I've hit a bit of a snag. When attempting to compare the following 2 code snippets, the code using move semantics ends up with 8 additional lines of assembly and 2 additional moves (15 for move, 13 for no-move).
Move:
#include <utility>
template<class T>
void swap(T& a, T& b)
{
    T tmp(std::move(a));
    a = std::move(b);
    b = std::move(tmp);
}

int main(){
    int a, b;
    swap(a, b);
}
No-Move:
template<class T>
void swap(T& a, T& b)
{
    T tmp(a);
    a = b;
    b = tmp;
}

int main(){
    int a, b;
    swap(a, b);
}
Move generated assembly:
main:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
leaq -8(%rbp), %rdx
leaq -4(%rbp), %rax
movq %rdx, %rsi
movq %rax, %rdi
call void swap<int>(int&, int&)
movl $0, %eax
leave
ret
void swap<int>(int&, int&):
pushq %rbp
movq %rsp, %rbp
subq $32, %rsp
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq -24(%rbp), %rax
movq %rax, %rdi
call std::remove_reference<int&>::type&& std::move<int&>(int&)
movl (%rax), %eax
movl %eax, -4(%rbp)
movq -32(%rbp), %rax
movq %rax, %rdi
call std::remove_reference<int&>::type&& std::move<int&>(int&)
movl (%rax), %edx
movq -24(%rbp), %rax
movl %edx, (%rax)
leaq -4(%rbp), %rax
movq %rax, %rdi
call std::remove_reference<int&>::type&& std::move<int&>(int&)
movl (%rax), %edx
movq -32(%rbp), %rax
movl %edx, (%rax)
nop
leave
ret
No-move generated assembly:
main:
pushq %rbp
movq %rsp, %rbp
subq $16, %rsp
leaq -8(%rbp), %rdx
leaq -4(%rbp), %rax
movq %rdx, %rsi
movq %rax, %rdi
call void swap<int>(int&, int&)
movl $0, %eax
leave
ret
void swap<int>(int&, int&):
pushq %rbp
movq %rsp, %rbp
movq %rdi, -24(%rbp)
movq %rsi, -32(%rbp)
movq -24(%rbp), %rax
movl (%rax), %eax
movl %eax, -4(%rbp)
movq -32(%rbp), %rax
movl (%rax), %edx
movq -24(%rbp), %rax
movl %edx, (%rax)
movq -32(%rbp), %rax
movl -4(%rbp), %edx
movl %edx, (%rax)
nop
popq %rbp
ret
I think the way I've internalized or abstracted move semantics for myself is that they "enable 'newly available' optimizations through the removal of costly temporary copies".
Have I just internalized this incorrectly?
-Or-
Is this just failing because I'm using a primitive type?
-Or-
Have I just missed the mark entirely?
OK, the problem is inlining and effective optimization; to get around it, I've annotated the functions with __attribute__((noinline)).
I've made a class to handle the move:
class mover {
public:
    int *ptr { nullptr };

    // default constructor: allocates the resource
    __attribute__((noinline)) mover() : ptr(new int(42)) { }

    // copy constructor: allocates a new int and copies the value
    __attribute__((noinline)) mover(mover & other) {
        delete ptr;
        ptr = new int(*other.ptr);
    }

    // move assignment: steals the pointer, no allocation
    __attribute__((noinline)) mover& operator= ( mover && other) {
        delete ptr;
        ptr = other.ptr;
        other.ptr = nullptr;
        return *this;
    }

    // copy assignment: allocates a new int and copies the value
    __attribute__((noinline)) mover& operator= ( mover & other) {
        delete ptr;
        ptr = new int(*other.ptr);
        return *this;
    }

    // move constructor: steals the pointer, no allocation
    __attribute__((noinline)) mover(mover && other) {
        ptr = other.ptr;
        other.ptr = nullptr;
    }

    __attribute__((noinline)) ~mover() {
        delete ptr;
    }
};
It deliberately doesn't use smart pointers, so you can see exactly what goes on.
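For reference, the benchmark is the swap template from the question instantiated with this class; a minimal driver (a reconstruction of mine, the post doesn't show one) would look like:

#include <utility>

template<class T>
void swap(T& a, T& b)
{
    T tmp(std::move(a));
    a = std::move(b);
    b = std::move(tmp);
}

int main()
{
    mover a, b;   // mover as defined above; each default-constructs an int
    swap(a, b);   // instantiates swap<mover>, whose assembly is shown below
}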
The move swap now looks like this, calling the correct constructors and operators:
void swap<mover>(mover&, mover&):
pushq %r12
movq %rdi, %r12
pushq %rbp
movq %rsi, %rbp
movq %rdi, %rsi
subq $24, %rsp
leaq 8(%rsp), %rdi
call mover::mover(mover&&)
movq %rbp, %rsi
movq %r12, %rdi
call mover::operator=(mover&&) [clone .isra.0]
leaq 8(%rsp), %rsi
movq %rbp, %rdi
call mover::operator=(mover&&) [clone .isra.0]
leaq 8(%rsp), %rdi
call mover::~mover() [complete object destructor]
addq $24, %rsp
popq %rbp
popq %r12
ret
And the copy swap looks like this, calling the copy constructor and copy assignment:
void swap<mover>(mover&, mover&):
pushq %r12
movq %rdi, %r12
pushq %rbp
movq %rsi, %rbp
movq %rdi, %rsi
subq $24, %rsp
leaq 8(%rsp), %rdi
call mover::mover(mover&)
movq %rbp, %rsi
movq %r12, %rdi
call mover::operator=(mover&) [clone .isra.0]
leaq 8(%rsp), %rsi
movq %rbp, %rdi
call mover::operator=(mover&) [clone .isra.0]
leaq 8(%rsp), %rdi
call mover::~mover() [complete object destructor]
addq $24, %rsp
popq %rbp
popq %r12
ret
The biggest effect comes from the different move constructor
mover::mover(mover&&):
movq (%rsi), %rax
movq $0, (%rsi)
movq %rax, (%rdi)
ret
and copy constructor
mover::mover(mover&):
pushq %rbp
movq %rsi, %rbp
pushq %rbx
movq %rdi, %rbx
subq $8, %rsp
movq $0, (%rdi)
movl $4, %edi
call operator new(unsigned long) // <---- new
movq 0(%rbp), %rdx
movq %rax, (%rbx)
movl (%rdx), %edx
movl %edx, (%rax)
addq $8, %rsp
popq %rbx
popq %rbp
ret
Note the call to operator new in the latter.

What do these instructions in the disassembly phase indicate?

Hello, as I run C++ code in the CLion IDE debugger, after main() returns the debugger steps into a file called "disassembly", and it contains what looks like assembly code. What are those instructions? What do they do? Should I care? As I'm new to C++, I'm familiarizing myself with the language, the IDE, and anything else of relevance.
start:
nop
movl %eax, %edi
callq 0x2e82e ; symbol stub for: exit
hlt
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
exit:
jmpq *0x268c241c(%rip)
exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movl %edi, %ebx
cmpl $0xad, %edi
jne 0x5a404 ; <+41>
leaq 0x2683a31e(%rip), %rcx
movq (%rcx), %rax
testq %rax, %rax
je 0x5a404 ; <+41>
xorl %eax, %eax
xchgq %rax, (%rcx)
testq %rax, %rax
jne 0x5a427 ; <+76>
xorl %eax, %eax
callq 0x8017c ; symbol stub for: _tlv_exit
xorl %edi, %edi
callq 0x5a196 ; __cxa_finalize
movq 0x268354f7(%rip), %rax
testq %rax, %rax
je 0x5a420 ; <+69>
callq *%rax
movl %ebx, %edi
callq 0x8000e ; symbol stub for: __exit
callq *%rax
ud2
There is also this
_tlv_exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movq 0x268db5e9(%rip), %rdi
callq 0x2e92a ; symbol stub for: pthread_getspecific
testq %rax, %rax
je 0x18e20 ; <+54>
movq %rax, %rbx
movq 0x268db5d5(%rip), %rdi
xorl %esi, %esi
callq 0x2e942 ; symbol stub for: pthread_setspecific
movq %rbx, %rdi
addq $0x8, %rsp
popq %rbx
popq %rbp
jmp 0x1983e ; tlv_finalize_list
addq $0x8, %rsp
popq %rbx
popq %rbp
retq
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
start:
nop
movl %eax, %edi
callq 0x2e82e ; symbol stub for: exit
hlt
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
nop
exit:
jmpq *0x268c241c(%rip)
pthread_getspecific:
jmpq *0x268c2470(%rip)
__cxa_finalize_ranges:
pushq %rbp
movq %rsp, %rbp
pushq %r15
pushq %r14
pushq %r13
pushq %r12
pushq %rbx
subq $0x18, %rsp
movl %esi, -0x2c(%rbp)
movq %rdi, -0x38(%rbp)
leaq 0x26834d24(%rip), %rdi
callq 0x804d6 ; symbol stub for: pthread_mutex_lock
movq 0x26834ca0(%rip), %r13
testq %r13, %r13
je 0x5a17c ; <+383>
movl -0x2c(%rbp), %ebx
addq $0x8, -0x38(%rbp)
movslq 0x8(%r13), %r15
testq %r15, %r15
jle 0x5a16f ; <+370>
decq %r15
movq %r15, %r14
shlq $0x5, %r14
movl 0x10(%r13,%r14), %r12d
testl %r12d, %r12d
je 0x5a03d ; <+64>
cmpl $0x0, -0x2c(%rbp)
je 0x5a102 ; <+261>
cmpl $0x1, %r12d
je 0x5a0a4 ; <+167>
cmpl $0x3, %r12d
je 0x5a0d1 ; <+212>
cmpl $0x2, %r12d
jne 0x5a102 ; <+261>
movq 0x28(%r13,%r14), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a096 ; <+153>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a085 ; <+136>
jmp 0x5a03d ; <+64>
movq 0x18(%r13,%r14), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a0c0 ; <+195>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a0af ; <+178>
jmp 0x5a03d ; <+64>
movq 0x18(%r13,%r14), %rax
movq 0x10(%rax), %rax
movq -0x38(%rbp), %rcx
xorl %edx, %edx
movq -0x8(%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a0f1 ; <+244>
addq (%rcx), %rsi
cmpq %rax, %rsi
ja 0x5a102 ; <+261>
incq %rdx
addq $0x10, %rcx
cmpq %rbx, %rdx
jb 0x5a0e0 ; <+227>
jmp 0x5a03d ; <+64>
leaq 0x10(%r13,%r14), %rax
movl $0x0, (%rax)
movb $0x0, 0x26834b94(%rip)
leaq 0x26834c25(%rip), %rdi
callq 0x804e2 ; symbol stub for: pthread_mutex_unlock
cmpl $0x1, %r12d
je 0x5a13e ; <+321>
cmpl $0x3, %r12d
je 0x5a145 ; <+328>
cmpl $0x2, %r12d
jne 0x5a14d ; <+336>
movq 0x20(%r13,%r14), %rdi
callq *0x18(%r13,%r14)
jmp 0x5a14d ; <+336>
callq *0x18(%r13,%r14)
jmp 0x5a14d ; <+336>
movq 0x18(%r13,%r14), %rdi
callq *0x10(%rdi)
leaq 0x26834bec(%rip), %rdi
callq 0x804d6 ; symbol stub for: pthread_mutex_lock
cmpb $0x0, 0x26834b48(%rip)
je 0x5a03d ; <+64>
movq 0x26834b5b(%rip), %r13
jmp 0x5a173 ; <+374>
movq (%r13), %r13
testq %r13, %r13
jne 0x5a039 ; <+60>
leaq 0x26834bbd(%rip), %rdi
addq $0x18, %rsp
popq %rbx
popq %r12
popq %r13
popq %r14
popq %r15
popq %rbp
jmp 0x804e2 ; symbol stub for: pthread_mutex_unlock
__cxa_finalize:
testq %rdi, %rdi
je 0x5a1c5 ; <+47>
pushq %rbp
movq %rsp, %rbp
subq $0x10, %rsp
leaq -0x10(%rbp), %rax
movq %rdi, (%rax)
movq $0x1, 0x8(%rax)
movq %rax, %rdi
movl $0x1, %esi
callq 0x59ffd ; __cxa_finalize_ranges
addq $0x10, %rsp
popq %rbp
retq
xorl %edi, %edi
xorl %esi, %esi
jmp 0x59ffd ; __cxa_finalize_ranges
exit:
pushq %rbp
movq %rsp, %rbp
pushq %rbx
pushq %rax
movl %edi, %ebx
cmpl $0xad, %edi
jne 0x5a404 ; <+41>
leaq 0x2683a31e(%rip), %rcx
movq (%rcx), %rax
testq %rax, %rax
je 0x5a404 ; <+41>
xorl %eax, %eax
xchgq %rax, (%rcx)
testq %rax, %rax
jne 0x5a427 ; <+76>
xorl %eax, %eax
callq 0x8017c ; symbol stub for: _tlv_exit
xorl %edi, %edi
callq 0x5a196 ; __cxa_finalize
movq 0x268354f7(%rip), %rax
testq %rax, %rax
je 0x5a420 ; <+69>
callq *%rax
movl %ebx, %edi
callq 0x8000e ; symbol stub for: __exit
callq *%rax
ud2
_tlv_exit:
jmpq *0x2680cbd6(%rip)
pthread_getspecific:
movq %gs:(,%rdi,8), %rax
retq
Assembly output is just a dump of the executable code the compiler generated, but in a human-readable form¹. It is not actually used by the compiler; it's an artifact of the compilation process, kept for reference.
Remember, a compiled executable can be converted back into assembly code at any time (tools like IDA Pro and Ghidra excel at doing this on any executable), but the compiler can add in contextual information that would otherwise be lost in the final compilation phase, in the form of comments or useful labels for things.
The compiler also often emits debug hints for your compiled executable so that a stack trace can be mapped back to your original source code. These artifacts are much more useful, as they allow you to step through C++ code instead of assembly code. If you ever have to debug in a library you don't have the source for, you'll be stuck stepping through an assembly view of the executable code.
¹ Presuming you can read assembly code.
The code you posted is support code from your libc runtime. The runtime is responsible for, among other things:
implementing atexit hooks;
setting up your IO streams (cin, cout);
running constructors of any global static variables.
This answer has a more complete overview. You can search for articles about libc_start_main and related functions to learn more.
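For instance, two of those responsibilities are easy to see from a tiny program (my own illustration, not from the linked answer): the runtime runs the constructor of the global before main() starts, and runs the atexit hook during exit() after main() returns.

#include <cstdio>
#include <cstdlib>

struct Global {
    Global()  { std::puts("constructed before main()"); }
    ~Global() { std::puts("destroyed after main() returns"); }
};

Global g;                          // constructed by the runtime's startup code

void hook() { std::puts("atexit hook, run by exit()"); }

int main()
{
    std::atexit(hook);             // registered hook, invoked on the exit path shown above
    std::puts("main()");
    return 0;                      // returning from main() calls exit(), which runs the hooks
}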

64 bits architecture optimization

I'm testing a function that calculates the XOR of two char buffers. In order to increase the speed, I'm comparing the speed of doing it with an int pointer (32 bits) and a long long int pointer (64 bits). I use the version with a char pointer for reference. Of course, I'm testing on a 64-bit machine.
But I'm not getting the results I expected. I'm trying the 3 functions at the end. When I compare "XOR_Diff_Char" with "XOR_Diff_Int", I get a speedup of around 3x, because the "_Int" function iterates 4 times less in the main "for". But when I compare "XOR_Diff_Int" with "XOR_Diff_QWORD", the improvement is around 5-10%, much less than I expected, because the main "for" iterates 2x less in "_QWORD" than in "_Int". I tried (in order to compare speeds) compiling with different flags, between -O0 and -O3, but I found no differences.
I use the g++ 4.9.2-10 compiler under 64-bit Debian. Do I have to pass another flag? Am I assuming something that's wrong? Is the compiler so good that it doesn't matter whether you use 32 or 64 bits?
/////////////////////////////////
int XOR_Diff_Int(char *pBuffIn1, char *pBuffIn2, char *pBuffOut, unsigned int sizeBuff)
{
    int i = 0;
    /* Check errors ... */
    int *pBuff1 = (int*)pBuffIn1;
    int *pBuff2 = (int*)pBuffIn2;
    int *pOut = (int*)pBuffOut;
    unsigned int sizeInt = (sizeBuff/sizeof(int));
    unsigned int modInt = sizeBuff-(sizeBuff%sizeof(int));

    for (i = 0; i < sizeInt; i++, pBuff1++, pBuff2++, pOut++)
        *pOut = *pBuff1 ^ *pBuff2;

    // If size is not sizeof(int) multiple
    for (i = modInt; i < sizeBuff; i++)
        pBuffOut[i] = pBuffIn1[i] ^ pBuffIn2[i];

    return sizeBuff;
}

/////////////////////////////////
int XOR_Diff_Char(char *pBuffIn1, char *pBuffIn2, char *pBuffOut, unsigned int sizeBuff)
{
    int i = 0;
    /* Check errors ... */
    for (i = 0; i < sizeBuff; i++)
        pBuffOut[i] = pBuffIn1[i] ^ pBuffIn2[i];

    return 1;
}

/////////////////////////////////
int XOR_Diff_QWORD(char *pBuffIn1, char *pBuffIn2, char *pBuffOut, unsigned int sizeBuff)
{
    int i = 0;
    /* Check errors ... */
    long long int *pBuff1 = (long long int*)pBuffIn1;
    long long int *pBuff2 = (long long int*)pBuffIn2;
    long long int *pOut = (long long int*)pBuffOut;
    unsigned int sizeLong = (sizeBuff/sizeof(long long int));
    unsigned int modLong = sizeBuff-(sizeBuff%sizeof(long long int));

    for (i = 0; i < sizeLong; i++, pBuff1++, pBuff2++, pOut++)
        *pOut = *pBuff1 ^ *pBuff2;

    // If size is not sizeof(long long int) multiple
    for (i = modLong; i < sizeBuff; i++)
        pBuffOut[i] = pBuffIn1[i] ^ pBuffIn2[i];

    return 1;
}
EDIT:
I was using the gcov utility, and I can see that the _QWORD function executes half the number of iterations of the _Int one, so the speed should be double (despite function-call overhead and so on). So I understand even less why the speed is similar in both cases. For testing, I'm just using something as simple as
gettimeofday(&t1, NULL);
count = XOR_Diff_Int(pDataIn, prevData, pOut, SIZE);
gettimeofday(&t2, NULL);
changing "_Int" for "_QWORD" and recompiling for both types of test.
EDIT 2:
I don't know much about assembler, but I compared both functions (the main "for") and got this:
// 64bits XOR
movq (%rsi,%r8,8), %r9
xorq (%rdi,%r8,8), %r9
movq %r9, (%rdx,%r8,8)
addq $1, %r8
cmpl %r8d, %ecx
ja .L8
// 32bits XOR
movl (%rsi,%r8,4), %r9d
xorl (%rdi,%r8,4), %r9d
movl %r9d, (%rdx,%r8,4)
addq $1, %r8
cmpl %r8d, %ecx
jg .L8
So I understand that the 64-bit case is faster because it uses 8-byte instructions. I think this is not an "instructions" problem, but something to do with the operating system or similar. At the moment I have no more ideas about this.
It seems that what you've tried to do is outsmart the compiler. The compiler won.
Given the following simple function:
void f(const char* lhs, const char* rhs, char* out, size_t sz)
{
    for (size_t i = 0; i < sz; ++i)
        out[i] = lhs[i] ^ rhs[i];
}
and compiling with GCC with -O3 -Wall, the compiler spits out nearly 300 lines of assembler:
f(char const*, char const*, char*, unsigned long):
testq %rcx, %rcx
je .L38
leaq 16(%rdi), %rax
leaq 16(%rdx), %r9
cmpq %rax, %rdx
setnb %r8b
cmpq %r9, %rdi
setnb %al
orl %eax, %r8d
leaq 16(%rsi), %rax
cmpq %rax, %rdx
setnb %r10b
cmpq %r9, %rsi
setnb %al
orl %r10d, %eax
testb %al, %r8b
je .L3
cmpq $19, %rcx
jbe .L3
movq %rdi, %r8
pushq %r13
pushq %r12
negq %r8
pushq %rbp
pushq %rbx
andl $15, %r8d
cmpq %rcx, %r8
cmova %rcx, %r8
xorl %eax, %eax
testq %r8, %r8
je .L4
movzbl (%rdi), %eax
xorb (%rsi), %al
cmpq $1, %r8
movb %al, (%rdx)
je .L15
movzbl 1(%rdi), %eax
xorb 1(%rsi), %al
cmpq $2, %r8
movb %al, 1(%rdx)
je .L16
movzbl 2(%rdi), %eax
xorb 2(%rsi), %al
cmpq $3, %r8
movb %al, 2(%rdx)
je .L17
movzbl 3(%rdi), %eax
xorb 3(%rsi), %al
cmpq $4, %r8
movb %al, 3(%rdx)
je .L18
movzbl 4(%rdi), %eax
xorb 4(%rsi), %al
cmpq $5, %r8
movb %al, 4(%rdx)
je .L19
movzbl 5(%rdi), %eax
xorb 5(%rsi), %al
cmpq $6, %r8
movb %al, 5(%rdx)
je .L20
movzbl 6(%rdi), %eax
xorb 6(%rsi), %al
cmpq $7, %r8
movb %al, 6(%rdx)
je .L21
movzbl 7(%rdi), %eax
xorb 7(%rsi), %al
cmpq $8, %r8
movb %al, 7(%rdx)
je .L22
movzbl 8(%rdi), %eax
xorb 8(%rsi), %al
cmpq $9, %r8
movb %al, 8(%rdx)
je .L23
movzbl 9(%rdi), %eax
xorb 9(%rsi), %al
cmpq $10, %r8
movb %al, 9(%rdx)
je .L24
movzbl 10(%rdi), %eax
xorb 10(%rsi), %al
cmpq $11, %r8
movb %al, 10(%rdx)
je .L25
movzbl 11(%rdi), %eax
xorb 11(%rsi), %al
cmpq $12, %r8
movb %al, 11(%rdx)
je .L26
movzbl 12(%rdi), %eax
xorb 12(%rsi), %al
cmpq $13, %r8
movb %al, 12(%rdx)
je .L27
movzbl 13(%rdi), %eax
xorb 13(%rsi), %al
cmpq $14, %r8
movb %al, 13(%rdx)
je .L28
movzbl 14(%rdi), %eax
xorb 14(%rsi), %al
movb %al, 14(%rdx)
movl $15, %eax
.L4:
movq %rcx, %r11
leaq -1(%rcx), %r10
subq %r8, %r11
leaq -16(%r11), %r9
subq %r8, %r10
shrq $4, %r9
addq $1, %r9
movq %r9, %rbx
salq $4, %rbx
cmpq $14, %r10
jbe .L6
leaq (%rdi,%r8), %r13
leaq (%rsi,%r8), %r12
xorl %r10d, %r10d
addq %rdx, %r8
xorl %ebp, %ebp
.L8:
movdqu (%r12,%r10), %xmm0
addq $1, %rbp
pxor 0(%r13,%r10), %xmm0
movups %xmm0, (%r8,%r10)
addq $16, %r10
cmpq %r9, %rbp
jb .L8
addq %rbx, %rax
cmpq %rbx, %r11
je .L1
.L6:
movzbl (%rsi,%rax), %r8d
xorb (%rdi,%rax), %r8b
movb %r8b, (%rdx,%rax)
leaq 1(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 1(%rdi,%rax), %r8d
xorb 1(%rsi,%rax), %r8b
movb %r8b, 1(%rdx,%rax)
leaq 2(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 2(%rdi,%rax), %r8d
xorb 2(%rsi,%rax), %r8b
movb %r8b, 2(%rdx,%rax)
leaq 3(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 3(%rdi,%rax), %r8d
xorb 3(%rsi,%rax), %r8b
movb %r8b, 3(%rdx,%rax)
leaq 4(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 4(%rdi,%rax), %r8d
xorb 4(%rsi,%rax), %r8b
movb %r8b, 4(%rdx,%rax)
leaq 5(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 5(%rdi,%rax), %r8d
xorb 5(%rsi,%rax), %r8b
movb %r8b, 5(%rdx,%rax)
leaq 6(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 6(%rdi,%rax), %r8d
xorb 6(%rsi,%rax), %r8b
movb %r8b, 6(%rdx,%rax)
leaq 7(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 7(%rdi,%rax), %r8d
xorb 7(%rsi,%rax), %r8b
movb %r8b, 7(%rdx,%rax)
leaq 8(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 8(%rdi,%rax), %r8d
xorb 8(%rsi,%rax), %r8b
movb %r8b, 8(%rdx,%rax)
leaq 9(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 9(%rdi,%rax), %r8d
xorb 9(%rsi,%rax), %r8b
movb %r8b, 9(%rdx,%rax)
leaq 10(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 10(%rdi,%rax), %r8d
xorb 10(%rsi,%rax), %r8b
movb %r8b, 10(%rdx,%rax)
leaq 11(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 11(%rdi,%rax), %r8d
xorb 11(%rsi,%rax), %r8b
movb %r8b, 11(%rdx,%rax)
leaq 12(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 12(%rdi,%rax), %r8d
xorb 12(%rsi,%rax), %r8b
movb %r8b, 12(%rdx,%rax)
leaq 13(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 13(%rdi,%rax), %r8d
xorb 13(%rsi,%rax), %r8b
movb %r8b, 13(%rdx,%rax)
leaq 14(%rax), %r8
cmpq %r8, %rcx
jbe .L1
movzbl 14(%rdi,%rax), %ecx
xorb 14(%rsi,%rax), %cl
movb %cl, 14(%rdx,%rax)
.L1:
popq %rbx
popq %rbp
popq %r12
popq %r13
.L38:
rep ret
.L3:
xorl %eax, %eax
.L13:
movzbl (%rdi,%rax), %r8d
xorb (%rsi,%rax), %r8b
movb %r8b, (%rdx,%rax)
addq $1, %rax
cmpq %rax, %rcx
jne .L13
rep ret
.L28:
movl $14, %eax
jmp .L4
.L15:
movl $1, %eax
jmp .L4
.L16:
movl $2, %eax
jmp .L4
.L17:
movl $3, %eax
jmp .L4
.L18:
movl $4, %eax
jmp .L4
.L19:
movl $5, %eax
jmp .L4
.L20:
movl $6, %eax
jmp .L4
.L21:
movl $7, %eax
jmp .L4
.L22:
movl $8, %eax
jmp .L4
.L23:
movl $9, %eax
jmp .L4
.L24:
movl $10, %eax
jmp .L4
.L25:
movl $11, %eax
jmp .L4
.L26:
movl $12, %eax
jmp .L4
.L27:
movl $13, %eax
jmp .L4
It does better if we add -march=native -mtune=native.
The compiler has done its own striding, and done a much better job at it than it can with the variants you are producing.
void f(const char* lhs, const char* rhs, char* out, size_t sz)
{
    const int* ilhs = (const int*)lhs;
    const int* irhs = (const int*)rhs;
    int* iout = (int*)out;
    const size_t isz = (sz / sizeof(*ilhs));
    const size_t imod = (isz * sizeof(*ilhs));

    for (size_t i = 0; i < isz; ++i)
        *(iout++) = *(ilhs++) ^ *(irhs++);

    for (size_t i = imod; i < sz; ++i)
        out[i] = lhs[i] ^ rhs[i];
}
This produces almost 400 lines of assembler.
f(char const*, char const*, char*, unsigned long):
movq %rcx, %r8
pushq %r15
pushq %r14
shrq $2, %r8
pushq %r13
pushq %r12
testq %r8, %r8
pushq %rbp
leaq 0(,%r8,4), %rax
pushq %rbx
je .L11
leaq 16(%rsi), %r9
leaq 16(%rdx), %r10
cmpq %r9, %rdx
setnb %r11b
cmpq %r10, %rsi
setnb %r9b
orl %r11d, %r9d
cmpq $8, %r8
seta %r11b
testb %r11b, %r9b
je .L4
leaq 16(%rdi), %r9
cmpq %r9, %rdx
setnb %r11b
cmpq %r10, %rdi
setnb %r9b
orb %r9b, %r11b
je .L4
movq %rdi, %r9
andl $15, %r9d
shrq $2, %r9
negq %r9
andl $3, %r9d
cmpq %r8, %r9
cmova %r8, %r9
testq %r9, %r9
je .L25
movl (%rdi), %r10d
xorl (%rsi), %r10d
cmpq $1, %r9
leaq 4(%rdx), %r13
leaq 4(%rdi), %rbp
leaq 4(%rsi), %rbx
movl %r10d, (%rdx)
movl $1, %r10d
je .L5
movl 4(%rdi), %r10d
xorl 4(%rsi), %r10d
cmpq $2, %r9
leaq 8(%rdx), %r13
leaq 8(%rdi), %rbp
leaq 8(%rsi), %rbx
movl %r10d, 4(%rdx)
movl $2, %r10d
je .L5
movl 8(%rdi), %r10d
xorl 8(%rsi), %r10d
leaq 12(%rdx), %r13
leaq 12(%rdi), %rbp
leaq 12(%rsi), %rbx
movl %r10d, 8(%rdx)
movl $3, %r10d
.L5:
movq %r8, %r15
movq %rax, -16(%rsp)
subq %r9, %r15
salq $2, %r9
leaq -4(%r15), %r11
leaq (%rsi,%r9), %r12
movq %r15, -24(%rsp)
leaq (%rdi,%r9), %r15
addq %rdx, %r9
shrq $2, %r11
movq %r12, -40(%rsp)
movq %r9, -32(%rsp)
addq $1, %r11
xorl %r9d, %r9d
xorl %r12d, %r12d
leaq 0(,%r11,4), %r14
.L8:
movq -40(%rsp), %rax
addq $1, %r12
movdqu (%rax,%r9), %xmm0
movq -32(%rsp), %rax
pxor (%r15,%r9), %xmm0
movups %xmm0, (%rax,%r9)
addq $16, %r9
cmpq %r11, %r12
jb .L8
leaq 0(,%r14,4), %r9
addq %r14, %r10
movq -16(%rsp), %rax
addq %r9, %rbp
addq %r9, %rbx
addq %r9, %r13
cmpq %r14, -24(%rsp)
je .L11
movl 0(%rbp), %r9d
xorl (%rbx), %r9d
movl %r9d, 0(%r13)
leaq 1(%r10), %r9
cmpq %r9, %r8
jbe .L11
movl 4(%rbp), %r9d
xorl 4(%rbx), %r9d
addq $2, %r10
cmpq %r10, %r8
movl %r9d, 4(%r13)
jbe .L11
movl 8(%rbp), %r9d
xorl 8(%rbx), %r9d
movl %r9d, 8(%r13)
.L11:
cmpq %rax, %rcx
jbe .L1
leaq 16(%rax), %r9
leaq (%rsi,%rax), %rbx
movq %rcx, %r11
leaq (%rdx,%rax), %rbp
subq %rax, %r11
leaq (%rdi,%rax), %r10
leaq (%rdx,%r9), %r12
leaq (%rdi,%r9), %r13
cmpq %rbx, %r12
setbe %bl
addq %rsi, %r9
cmpq %r9, %rbp
setnb %r9b
orl %r9d, %ebx
cmpq %r12, %r10
setnb %r12b
cmpq %r13, %rbp
setnb %r9b
orl %r12d, %r9d
testb %r9b, %bl
je .L24
cmpq $19, %r11
jbe .L24
negq %r10
movq %rax, %r9
andl $15, %r10d
cmpq %r11, %r10
cmova %r11, %r10
testq %r10, %r10
je .L15
movzbl (%rdi,%r8,4), %r9d
xorb (%rsi,%r8,4), %r9b
cmpq $1, %r10
movb %r9b, (%rdx,%r8,4)
leaq 1(%rax), %r9
je .L15
movzbl 1(%rdi,%rax), %r8d
leaq 2(%rax), %r9
xorb 1(%rsi,%rax), %r8b
cmpq $2, %r10
movb %r8b, 1(%rdx,%rax)
je .L15
movzbl 2(%rdi,%rax), %r8d
leaq 3(%rax), %r9
xorb 2(%rsi,%rax), %r8b
cmpq $3, %r10
movb %r8b, 2(%rdx,%rax)
je .L15
movzbl 3(%rdi,%rax), %r8d
leaq 4(%rax), %r9
xorb 3(%rsi,%rax), %r8b
cmpq $4, %r10
movb %r8b, 3(%rdx,%rax)
je .L15
movzbl 4(%rdi,%rax), %r8d
leaq 5(%rax), %r9
xorb 4(%rsi,%rax), %r8b
cmpq $5, %r10
movb %r8b, 4(%rdx,%rax)
je .L15
movzbl 5(%rdi,%rax), %r8d
leaq 6(%rax), %r9
xorb 5(%rsi,%rax), %r8b
cmpq $6, %r10
movb %r8b, 5(%rdx,%rax)
je .L15
movzbl 6(%rdi,%rax), %r8d
leaq 7(%rax), %r9
xorb 6(%rsi,%rax), %r8b
cmpq $7, %r10
movb %r8b, 6(%rdx,%rax)
je .L15
movzbl 7(%rdi,%rax), %r8d
leaq 8(%rax), %r9
xorb 7(%rsi,%rax), %r8b
cmpq $8, %r10
movb %r8b, 7(%rdx,%rax)
je .L15
movzbl 8(%rdi,%rax), %r8d
leaq 9(%rax), %r9
xorb 8(%rsi,%rax), %r8b
cmpq $9, %r10
movb %r8b, 8(%rdx,%rax)
je .L15
movzbl 9(%rdi,%rax), %r8d
leaq 10(%rax), %r9
xorb 9(%rsi,%rax), %r8b
cmpq $10, %r10
movb %r8b, 9(%rdx,%rax)
je .L15
movzbl 10(%rdi,%rax), %r8d
leaq 11(%rax), %r9
xorb 10(%rsi,%rax), %r8b
cmpq $11, %r10
movb %r8b, 10(%rdx,%rax)
je .L15
movzbl 11(%rdi,%rax), %r8d
leaq 12(%rax), %r9
xorb 11(%rsi,%rax), %r8b
cmpq $12, %r10
movb %r8b, 11(%rdx,%rax)
je .L15
movzbl 12(%rdi,%rax), %r8d
leaq 13(%rax), %r9
xorb 12(%rsi,%rax), %r8b
cmpq $13, %r10
movb %r8b, 12(%rdx,%rax)
je .L15
movzbl 13(%rdi,%rax), %r8d
leaq 14(%rax), %r9
xorb 13(%rsi,%rax), %r8b
cmpq $14, %r10
movb %r8b, 13(%rdx,%rax)
je .L15
movzbl 14(%rdi,%rax), %r8d
leaq 15(%rax), %r9
xorb 14(%rsi,%rax), %r8b
movb %r8b, 14(%rdx,%rax)
.L15:
movq %r11, %rbp
leaq -1(%r11), %r8
subq %r10, %rbp
leaq -16(%rbp), %rbx
subq %r10, %r8
shrq $4, %rbx
addq $1, %rbx
movq %rbx, %r12
salq $4, %r12
cmpq $14, %r8
jbe .L17
addq %r10, %rax
xorl %r8d, %r8d
xorl %r10d, %r10d
leaq (%rdi,%rax), %r13
leaq (%rsi,%rax), %r11
addq %rdx, %rax
.L19:
movdqu (%r11,%r8), %xmm0
addq $1, %r10
pxor 0(%r13,%r8), %xmm0
movups %xmm0, (%rax,%r8)
addq $16, %r8
cmpq %rbx, %r10
jb .L19
addq %r12, %r9
cmpq %r12, %rbp
je .L1
.L17:
movzbl (%rdi,%r9), %eax
xorb (%rsi,%r9), %al
movb %al, (%rdx,%r9)
leaq 1(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 1(%rdi,%r9), %eax
xorb 1(%rsi,%r9), %al
movb %al, 1(%rdx,%r9)
leaq 2(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 2(%rdi,%r9), %eax
xorb 2(%rsi,%r9), %al
movb %al, 2(%rdx,%r9)
leaq 3(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 3(%rdi,%r9), %eax
xorb 3(%rsi,%r9), %al
movb %al, 3(%rdx,%r9)
leaq 4(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 4(%rdi,%r9), %eax
xorb 4(%rsi,%r9), %al
movb %al, 4(%rdx,%r9)
leaq 5(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 5(%rdi,%r9), %eax
xorb 5(%rsi,%r9), %al
movb %al, 5(%rdx,%r9)
leaq 6(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 6(%rdi,%r9), %eax
xorb 6(%rsi,%r9), %al
movb %al, 6(%rdx,%r9)
leaq 7(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 7(%rdi,%r9), %eax
xorb 7(%rsi,%r9), %al
movb %al, 7(%rdx,%r9)
leaq 8(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 8(%rdi,%r9), %eax
xorb 8(%rsi,%r9), %al
movb %al, 8(%rdx,%r9)
leaq 9(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 9(%rdi,%r9), %eax
xorb 9(%rsi,%r9), %al
movb %al, 9(%rdx,%r9)
leaq 10(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 10(%rdi,%r9), %eax
xorb 10(%rsi,%r9), %al
movb %al, 10(%rdx,%r9)
leaq 11(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 11(%rdi,%r9), %eax
xorb 11(%rsi,%r9), %al
movb %al, 11(%rdx,%r9)
leaq 12(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 12(%rdi,%r9), %eax
xorb 12(%rsi,%r9), %al
movb %al, 12(%rdx,%r9)
leaq 13(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 13(%rdi,%r9), %eax
xorb 13(%rsi,%r9), %al
movb %al, 13(%rdx,%r9)
leaq 14(%r9), %rax
cmpq %rax, %rcx
jbe .L1
movzbl 14(%rdi,%r9), %eax
xorb 14(%rsi,%r9), %al
movb %al, 14(%rdx,%r9)
.L1:
popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
ret
.L24:
movzbl (%rdi,%rax), %r8d
xorb (%rsi,%rax), %r8b
movb %r8b, (%rdx,%rax)
addq $1, %rax
cmpq %rax, %rcx
jne .L24
jmp .L1
.L25:
movq %rdx, %r13
movq %rsi, %rbx
movq %rdi, %rbp
xorl %r10d, %r10d
jmp .L5
.L4:
xorl %r9d, %r9d
.L13:
movl (%rdi,%r9,4), %r10d
xorl (%rsi,%r9,4), %r10d
movl %r10d, (%rdx,%r9,4)
addq $1, %r9
cmpq %r9, %r8
jne .L13
jmp .L11
In the compiler's version of the simple function, there's an immediate and simple test for sz being zero:
f(char const*, char const*, char*, unsigned long):
testq %rcx, %rcx
je .L38
In your version, the compiler hasn't recognized that you're making an attempt at striding, and the code has to walk through a number of steps to get there:
f(char const*, char const*, char*, unsigned long):
movq %rcx, %r8
pushq %r15
pushq %r14
shrq $2, %r8
pushq %r13
pushq %r12
testq %r8, %r8
pushq %rbp
leaq 0(,%r8,4), %rax
pushq %rbx
je .L11
...
.L11:
cmpq %rax, %rcx
jbe .L1
...
.L1:
popq %rbx
popq %rbp
popq %r12
popq %r13
popq %r14
popq %r15
ret
We also have quite a lot of register spill here keeping track of all these variables.
Let's compare a couple of early blocks of the code:
Compiler:
leaq 16(%rdi), %rax
leaq 16(%rdx), %r9
cmpq %rax, %rdx
setnb %r8b
cmpq %r9, %rdi
setnb %al
orl %eax, %r8d
leaq 16(%rsi), %rax
cmpq %rax, %rdx
setnb %r10b
cmpq %r9, %rsi
setnb %al
orl %r10d, %eax
testb %al, %r8b
je .L3
cmpq $19, %rcx
jbe .L3
movq %rdi, %r8
pushq %r13
pushq %r12
negq %r8
pushq %rbp
pushq %rbx
andl $15, %r8d
cmpq %rcx, %r8
cmova %rcx, %r8
xorl %eax, %eax
testq %r8, %r8
je .L4
Yours:
leaq 16(%rsi), %r9
leaq 16(%rdx), %r10
cmpq %r9, %rdx
setnb %r11b
cmpq %r10, %rsi
setnb %r9b
orl %r11d, %r9d
cmpq $8, %r8
seta %r11b
testb %r11b, %r9b
je .L4
leaq 16(%rdi), %r9
cmpq %r9, %rdx
setnb %r11b
cmpq %r10, %rdi
setnb %r9b
orb %r9b, %r11b
je .L4
movq %rdi, %r9
andl $15, %r9d
shrq $2, %r9
negq %r9
andl $3, %r9d
cmpq %r8, %r9
cmova %r8, %r9
testq %r9, %r9
je .L25
We can see here that the compiler has to emit more instructions for each operation than it produced on its own for the original version.
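As a closing aside (my own sketch, not part of the original answer): a good chunk of the prologue in both versions is runtime overlap checking, because the compiler cannot assume the char buffers don't alias. Promising non-aliasing with the GCC/Clang __restrict extension should let it drop those checks and go straight to the vectorized loop; I haven't verified the exact codegen on this gcc version.

#include <cstddef>

// Sketch: same simple byte loop, but with a promise that the buffers
// cannot overlap, which removes the need for the runtime aliasing checks
// seen in the prologues above. __restrict is a GCC/Clang extension in C++.
void f(const char* __restrict lhs, const char* __restrict rhs,
       char* __restrict out, std::size_t sz)
{
    for (std::size_t i = 0; i < sz; ++i)
        out[i] = lhs[i] ^ rhs[i];
}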

Vectorization of sin and cos

I was playing around with Compiler Explorer and ran into an anomaly (I think). If I want to make the compiler vectorize a sin calculation using libmvec, I would write:
#include <cmath>

#define NN 512

typedef float T;
typedef T __attribute__((aligned(NN))) AT;

inline T s(const T x)
{
    return sinf(x);
}

void func(AT* __restrict x, AT* __restrict y, int length)
{
    if (length & NN-1) __builtin_unreachable();
    for (int i = 0; i < length; i++)
    {
        y[i] = s(x[i]);
    }
}
compile with gcc 6.2 and -O3 -march=native -ffast-math and get
func(float*, float*, int):
testl %edx, %edx
jle .L10
leaq 8(%rsp), %r10
andq $-32, %rsp
pushq -8(%r10)
pushq %rbp
movq %rsp, %rbp
pushq %r14
xorl %r14d, %r14d
pushq %r13
leal -8(%rdx), %r13d
pushq %r12
shrl $3, %r13d
movq %rsi, %r12
pushq %r10
addl $1, %r13d
pushq %rbx
movq %rdi, %rbx
subq $8, %rsp
.L4:
vmovaps (%rbx), %ymm0
addl $1, %r14d
addq $32, %r12
addq $32, %rbx
call _ZGVcN8v_sinf // YAY! Vectorized trig!
vmovaps %ymm0, -32(%r12)
cmpl %r13d, %r14d
jb .L4
vzeroupper
addq $8, %rsp
popq %rbx
popq %r10
popq %r12
popq %r13
popq %r14
popq %rbp
leaq -8(%r10), %rsp
.L10:
ret
But when I add a cosine to the function, there is no vectorization:
#include <cmath>

#define NN 512

typedef float T;
typedef T __attribute__((aligned(NN))) AT;

inline T f(const T x)
{
    return cosf(x)+sinf(x);
}

void func(AT* __restrict x, AT* __restrict y, int length)
{
    if (length & NN-1) __builtin_unreachable();
    for (int i = 0; i < length; i++)
    {
        y[i] = f(x[i]);
    }
}
which gives:
func(float*, float*, int):
testl %edx, %edx
jle .L10
pushq %r12
leal -1(%rdx), %eax
pushq %rbp
leaq 4(%rdi,%rax,4), %r12
movq %rsi, %rbp
pushq %rbx
movq %rdi, %rbx
subq $16, %rsp
.L4:
vmovss (%rbx), %xmm0
leaq 8(%rsp), %rsi
addq $4, %rbx
addq $4, %rbp
leaq 12(%rsp), %rdi
call sincosf // No vectorization
vmovss 12(%rsp), %xmm0
vaddss 8(%rsp), %xmm0, %xmm0
vmovss %xmm0, -4(%rbp)
cmpq %rbx, %r12
jne .L4
addq $16, %rsp
popq %rbx
popq %rbp
popq %r12
.L10:
ret
I see two good alternatives: either call a vectorized version of sincosf, or call the vectorized sin and cos sequentially. I tried adding -fno-builtin-sincos to no avail. -fopt-info-vec-missed complains about complex float, of which there is none.
Is this a known issue with gcc? Either way, is there a way I can convince gcc to vectorize the latter example?
(As an aside, is there any way to get gcc < 6 to vectorize trigonometric functions automatically?)
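One workaround in the spirit of the second alternative (calling the vectorized sin and cos sequentially) is to split the loop in two. This is a sketch of mine using the same NN/AT definitions as in the question; I have not verified the exact codegen it produces on gcc 6.2.

#include <cmath>

#define NN 512
typedef float T;
typedef T __attribute__((aligned(NN))) AT;   // same definitions as above

// Sketch: two separate loops, each matching the pattern gcc already
// vectorizes via libmvec, instead of one loop that gets lowered to sincosf.
void func_split(AT* __restrict x, AT* __restrict y, int length)
{
    if (length & NN-1) __builtin_unreachable();
    for (int i = 0; i < length; i++)
        y[i] = sinf(x[i]);      // first pass: sin
    for (int i = 0; i < length; i++)
        y[i] += cosf(x[i]);     // second pass: add cos
}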

Optimizations in copying a range

While reading the sources of the GNU C++ standard library, I found some code for copying (or moving, if possible) a range of iterators (file stl_algobase.h), which uses template specialization for some optimizations. A comment corresponding to it says:
All of these auxiliary structs serve two purposes. (1) Replace calls to copy with memmove whenever possible. (Memmove, not memcpy, because the input and output ranges are permitted to overlap.) (2) If we're using random access iterators, then write the loop as a for loop with an explicit count.
The specialization using the second optimization looks like this:
template<>
struct __copy_move<false, false, random_access_iterator_tag>
{
    template<typename _II, typename _OI>
    static _OI
    __copy_m(_II __first, _II __last, _OI __result)
    {
        typedef typename iterator_traits<_II>::difference_type _Distance;
        for(_Distance __n = __last - __first; __n > 0; --__n)
        {
            *__result = *__first;
            ++__first;
            ++__result;
        }
        return __result;
    }
};
So, I have two questions concerning this:
How can memmove increase the speed of copying? Is it implemented somehow more effectively than a simple loop?
How can using an explicit counter in the for loop affect performance?
Some clarification: I would like to see some optimization examples actually used by compilers, not elaboration on the possibility of such optimizations.
Edit: the first question is quite nicely answered here.
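(As a small illustration of point (1), my own example rather than anything from that answer: shifting elements left within one buffer is exactly the overlapping case where memcpy would be undefined but memmove is fine.)

#include <cstddef>
#include <cstring>

// Shift elements one slot to the left inside the same buffer: the source
// range [buf+1, buf+n) overlaps the destination range [buf, buf+n-1),
// so memmove is required; memcpy would be undefined behaviour here.
void shift_left(int* buf, std::size_t n)
{
    std::memmove(buf, buf + 1, (n - 1) * sizeof(int));
}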
Answering the second question, the explicit count does indeed lead to more opportunities for loop unrolling, though even with pointers iterating through a fixed size array, gcc does not perform aggressive unrolling unless asked to do so with -funroll-loops. The other gain comes from a potentially simpler end-of-loop comparison test for non-trivial iterators.
On a Core i7-4770, I benchmarked the time spent performing a copy of a maximally-aligned 2048-long integer array with a while loop and explicit count copy implementation. (Times in microseconds, includes call overhead; minimum of 200 samples of a timing loop with warm-up.)
                                        while    count
gcc -O3                                 0.179    0.178
gcc -O3 -march=native                   0.097    0.095
gcc -O3 -march=native -funroll-loops    0.066    0.066
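The two benchmarked routines presumably looked something like this (a reconstruction from the signatures in the assembly labels below, not the answer's actual source; the parameter order is inferred from the operand usage).

// Reconstruction only: copies from the first array into the second.
void array_copy_while(int (&from)[2048], int (&to)[2048])
{
    const int* first = from;
    const int* last  = from + 2048;
    int*       out   = to;
    while (first != last)          // end-of-range test, as in a generic copy loop
        *out++ = *first++;
}

void array_copy_count(int (&from)[2048], int (&to)[2048])
{
    const int* first = from;
    int*       out   = to;
    for (int n = 2048; n > 0; --n) // explicit count, as in __copy_m above
        *out++ = *first++;
}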
In each case, the generated code is very similar; the while version does a bit more work at the end, handling checks that there aren't any entries left to copy that didn't fill out a whole 128-bit (SSE) or 256-bit (AVX) register, but these are pretty much taken care of by the branch predictor. The gcc -O3 assembly for each is as follows (leaving out assembler directives). The while version:
array_copy_while(int (&) [2048], int (&) [2048]):
leaq 8192(%rdi), %rax
leaq 4(%rdi), %rdx
movq %rax, %rcx
subq %rdx, %rcx
movq %rcx, %rdx
shrq $2, %rdx
leaq 1(%rdx), %r8
cmpq $8, %r8
jbe .L11
leaq 16(%rsi), %rdx
cmpq %rdx, %rdi
leaq 16(%rdi), %rdx
setae %cl
cmpq %rdx, %rsi
setae %dl
orb %dl, %cl
je .L11
movq %r8, %r9
xorl %edx, %edx
xorl %ecx, %ecx
shrq $2, %r9
leaq 0(,%r9,4), %r10
.L9:
movdqa (%rdi,%rdx), %xmm0
addq $1, %rcx
movdqa %xmm0, (%rsi,%rdx)
addq $16, %rdx
cmpq %rcx, %r9
ja .L9
leaq 0(,%r10,4), %rdx
addq %rdx, %rdi
addq %rdx, %rsi
cmpq %r10, %r8
je .L1
movl (%rdi), %edx
movl %edx, (%rsi)
leaq 4(%rdi), %rdx
cmpq %rdx, %rax
je .L1
movl 4(%rdi), %edx
movl %edx, 4(%rsi)
leaq 8(%rdi), %rdx
cmpq %rdx, %rax
je .L20
movl 8(%rdi), %eax
movl %eax, 8(%rsi)
ret
.L11:
movl (%rdi), %edx
addq $4, %rdi
addq $4, %rsi
movl %edx, -4(%rsi)
cmpq %rdi, %rax
jne .L11
.L1:
rep ret
.L20:
rep ret
count version:
array_copy_count(int (&) [2048], int (&) [2048]):
leaq 16(%rsi), %rax
movl $2048, %ecx
cmpq %rax, %rdi
leaq 16(%rdi), %rax
setae %dl
cmpq %rax, %rsi
setae %al
orb %al, %dl
je .L23
movw $512, %cx
xorl %eax, %eax
xorl %edx, %edx
.L29:
movdqa (%rdi,%rax), %xmm0
addq $1, %rdx
movdqa %xmm0, (%rsi,%rax)
addq $16, %rax
cmpq %rdx, %rcx
ja .L29
rep ret
.L23:
xorl %eax, %eax
.L31:
movl (%rdi,%rax,4), %edx
movl %edx, (%rsi,%rax,4)
addq $1, %rax
cmpq %rax, %rcx
jne .L31
rep ret
When the iterators are more complicated, however, the difference becomes more pronounced. Consider a hypothetical container that stores values in a series of fixed-size allocated buffers. An iterator comprises a pointer to the chain of blocks, a block index and a block offset. Comparing two iterators potentially requires two comparisons. Incrementing the iterator requires checking whether we cross a block boundary.
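A sketch of the kind of iterator being described (an illustration of mine, not the container actually benchmarked below):

#include <cstddef>

// Illustration only: a segmented iterator whose increment must test for a
// block boundary and whose inequality test may need two comparisons.
struct chunk_iterator {
    int**       blocks;                     // pointer to the chain of blocks
    std::size_t block;                      // block index
    std::size_t offset;                     // offset within the block
    static const std::size_t block_size = 512;

    int& operator*() const { return blocks[block][offset]; }

    chunk_iterator& operator++() {
        if (++offset == block_size) {       // did we cross a block boundary?
            ++block;
            offset = 0;
        }
        return *this;
    }

    bool operator!=(const chunk_iterator& other) const {
        return block != other.block || offset != other.offset;  // two comparisons
    }
};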
I made such a container, and performed the same benchmark for copying a 2000-long container of int, with a block size of 512 ints.
                                        while    count
gcc -O3                                 1.560    2.818
gcc -O3 -march=native                   1.660    2.854
gcc -O3 -march=native -funroll-loops    1.432    2.858
That looks weird! Oh wait, it's because gcc 4.8 has a misoptimisation, where it uses conditional moves instead of nice, branch-predictor friendly comparisons. (gcc bug 56309).
Let's try icc on a different machine (Xeon E5-2670).
                  while    count
icc -O3           3.952    3.704
icc -O3 -xHost    3.898    3.624
This is closer to what we'd expect, a small but significant improvement from the simpler loop condition. On a different architecture, the gain is more pronounced. clang targeting a PowerA2 at 1.6GHz:
              while     count
bgclang -O3   36.528    31.623
I'll omit the assembly, as it's quite long!