How does the stack change step by step in this C++ program?

I have the following C++ program where a function returns a reference to a local variable. Can you please show me, step by step, exactly what happens to the stack?
#include<stdio.h>
double& init_pi()
{
double pi = 3.14;
return pi;
}
double circumference(double r, double& pi)
{
printf("%lf\n", pi);
return 2*r*pi;
}
int main()
{
printf("%lf\n,", circumference(2, init_pi()));
return 0;
}
Thank you for the answers.

Contrary to popular belief, the C++ standard never mentions the concept of a stack (other than the std::stack class template, which is not what you mean here).
The standard talks in terms of functions, flow of control, local objects, heap objects and static objects.
It is entirely possible to write a C++ compiler for an architecture that does not have a stack (the old TMS 9900 series of chips, for which I wrote code when I was a teenager, springs to mind).
Your question might be better put as:
How does the stack change step by step in this c++ program, when compiled with X compiler, with Y options for Z architecture?
For which the answer lies only in your debugger or in the assembler listing (for gcc, compile with the -S option).
In truth, if you compile this program with optimisations on, there will be no stack movement at all. The entire flow will be inlined.
For example, gcc 5.3 with -O2 produces the following code (see below).
Note that because you introduced undefined behaviour by returning a reference to a local variable, the compiler is permitted to do anything it likes. In this case it decided that your program does nothing. main simply returns zero.
assembler output:
init_pi():
xorl %eax, %eax
ret
.LC1:
.string "%lf\n"
circumference(double, double&):
pushq %rbx
movl $1, %eax
movq %rdi, %rbx
subq $16, %rsp
movsd %xmm0, 8(%rsp)
movsd (%rdi), %xmm0
movl $.LC1, %edi
call printf
movsd 8(%rsp), %xmm1
movsd (%rbx), %xmm0
addq $16, %rsp
addsd %xmm1, %xmm1
popq %rbx
mulsd %xmm1, %xmm0
ret
main:
movsd 0, %xmm0
ud2
compiler warning:
/tmp/gcc-explorer-compiler11636-75-1libuwy/example.cpp: In function 'double& init_pi()':
5 : warning: reference to local variable 'pi' returned [-Wreturn-local-addr]
double pi = 3.14;
^
Compiled ok
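(The corrected source isn't shown in the answer; judging from the circumference(double, double) signature in the listing below, the fix presumably returns and passes pi by value, roughly like this:)
double init_pi()
{
double pi = 3.14;
return pi;    // returned by value: no dangling reference
}
double circumference(double r, double pi)   // pi taken by value as well
{
printf("%lf\n", pi);
return 2*r*pi;
}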
If we fix the warning and subsequent error, we get this:
init_pi():
movsd .LC0(%rip), %xmm0
ret
.LC2:
.string "%lf\n"
circumference(double, double):
subq $24, %rsp
movl $.LC2, %edi
movl $1, %eax
movsd %xmm0, 8(%rsp)
movapd %xmm1, %xmm0
movsd %xmm1, (%rsp)
call printf
movsd 8(%rsp), %xmm2
movsd (%rsp), %xmm1
addq $24, %rsp
addsd %xmm2, %xmm2
movapd %xmm2, %xmm0
mulsd %xmm1, %xmm0
ret
.LC5:
.string "%lf\n,"
main:
subq $8, %rsp
movl $.LC2, %edi
movl $1, %eax
movsd .LC0(%rip), %xmm0
call printf
movsd .LC4(%rip), %xmm0
movl $.LC5, %edi
movl $1, %eax
call printf
xorl %eax, %eax
addq $8, %rsp
ret
.LC0:
.long 1374389535
.long 1074339512
.LC4:
.long 1374389535
.long 1076436664
Again, you will see that main has been completely inlined. There is no stack use whatsoever (other than during the calls to printf).

Related

Segmentation fault with array of __m256i when using clang/g++

I'm attempting to generate arrays of __m256i's to reuse in another computation. When I attempt to do that (even with a minimal testcase), I get a segmentation fault - but only if the code is compiled with g++ or clang. If I compile the code with the Intel compiler (version 16.0), no segmentation fault occurs. Here is a test case I created:
#include <immintrin.h>
int main() {
__m256i *table = new __m256i[10000];
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
table[99] = zeroes;
}
When compiling the above with clang 3.6 and g++ 4.8, a segmentation fault occurs.
Here's the assembly generated by the Intel compiler (from https://gcc.godbolt.org/, icc 13.0):
pushq %rbx #3.12
movq %rsp, %rbx #3.12
andq $-32, %rsp #3.12
pushq %rbp #3.12
pushq %rbp #3.12
movq 8(%rbx), %rbp #3.12
movq %rbp, 8(%rsp) #3.12
movq %rsp, %rbp #3.12
subq $112, %rsp #3.12
movl $3200, %eax #4.38
vzeroupper #4.38
movq %rax, %rdi #4.38
call operator new[](unsigned long) #4.38
movq %rax, -112(%rbp) #4.38
movq -112(%rbp), %rax #4.38
movq %rax, -104(%rbp) #4.20
vxorps %ymm0, %ymm0, %ymm0 #5.22
vmovdqu %ymm0, -80(%rbp) #5.22
vmovdqu -80(%rbp), %ymm0 #5.22
vmovdqu %ymm0, -48(%rbp) #5.20
movl $3168, %eax #6.17
addq -104(%rbp), %rax #6.5
vmovdqu -48(%rbp), %ymm0 #6.17
vmovdqu %ymm0, (%rax) #6.5
movl $0, %eax #7.1
vzeroupper #7.1
leave #7.1
movq %rbx, %rsp #7.1
popq %rbx #7.1
ret #7.1
And here's from clang 3.7:
pushq %rbp
movq %rsp, %rbp
andq $-32, %rsp
subq $192, %rsp
xorl %eax, %eax
movl $3200, %ecx # imm = 0xC80
movl %ecx, %edi
movl %eax, 28(%rsp) # 4-byte Spill
callq operator new[](unsigned long)
movq %rax, 88(%rsp)
movq $0, 168(%rsp)
movq $0, 160(%rsp)
movq $0, 152(%rsp)
movq $0, 144(%rsp)
vmovq 168(%rsp), %xmm0 # xmm0 = mem[0],zero
vmovq 160(%rsp), %xmm1 # xmm1 = mem[0],zero
vpunpcklqdq %xmm0, %xmm1, %xmm0 # xmm0 = xmm1[0],xmm0[0]
vmovq 152(%rsp), %xmm1 # xmm1 = mem[0],zero
vpslldq $8, %xmm1, %xmm1 # xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
vmovaps %xmm1, %xmm2
vinserti128 $1, %xmm0, %ymm2, %ymm2
vmovaps %ymm2, 96(%rsp)
vmovaps %ymm2, 32(%rsp)
movq 88(%rsp), %rax
vmovaps %ymm2, 3168(%rax)
movl 28(%rsp), %eax # 4-byte Reload
movq %rbp, %rsp
popq %rbp
vzeroupper
retq
Am I running into a compiler bug in clang/g++? Or am I simply doing something wrong?
I have said many times before that implicit SIMD loads/stores are a bad idea. Stop using them. Use explicit loads/stores like this
int64_t* table = new int64_t[4*10000];
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
_mm256_storeu_si256((__m256i*)&table[4*99], zeroes);
Or, since this is POD, use the cross-compiler/OS function _mm_malloc:
int64_t* table = (int64_t*)_mm_malloc(sizeof(int64_t)*4*10000, 32);
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
_mm256_store_si256((__m256i*)&table[4*99], zeroes);
You can use _mm256_setzero_si256() instead of _mm256_set_epi64x(0, 0, 0, 0) (note that _mm256_set_epi64x does not work in 32-bit mode on some versions of MSVC), but GCC and Clang are smart enough to know they are the same thing.
Since you're using intrinsics, which are not part of the C/C++ specification, some rules such as strict aliasing may be overlooked.
I guess the problem has to do with wrong memory alignment. vmovaps requires the memory location to start at a 32-byte boundary and vmovdqu does not. That's why the Intel version works whereas the clang/g++ code crashes. I don't know if this is a compiler bug, but you may want alignment anyway.
The following code should work, although it's more C than C++.
#include <malloc.h>     // for memalign
#include <immintrin.h>
int main() {
__m256i *table = (__m256i*) memalign( 32, 10000 * sizeof(__m256i) );
__m256i zeroes = _mm256_set_epi64x(0, 0, 0, 0);
table[99] = zeroes;
}
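(A side note, not part of the original answers: with a C++17 compiler, plain new is required to honour the alignment of over-aligned types such as __m256i, so the original snippet becomes valid as written when built with -std=c++17 or later. A minimal sketch:)
#include <immintrin.h>
int main() {
__m256i *table = new __m256i[10000];   // 32-byte aligned under C++17 aligned new
table[99] = _mm256_setzero_si256();    // the aligned store is now safe
delete[] table;
}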

How to efficiently add two vectors in C++

Suppose I have two vectors a and b, each stored as a vector. I want to compute a += b or a += b * k, where k is a number.
I can for sure do the following,
while (size--) {
(*a++) += (*b++) * k;
}
But what are the possible ways to easily leverage SIMD instructions such as SSE2?
The only thing you should need is to enable auto-vectorization with your compiler.
For example, compiling your code (assuming float) with GCC (5.2.0) -O3 produces this main loop
L8:
movups (%rsi,%rax), %xmm1
addl $1, %r11d
mulps %xmm2, %xmm1
addps (%rdi,%rax), %xmm1
movaps %xmm1, (%rdi,%rax)
addq $16, %rax
cmpl %r11d, %r10d
ja .L8
Clang also vectorizes the loop, and additionally unrolls it four times. Unrolling may help on some processors, especially Haswell, even though there is no dependency chain. In fact, you can get GCC to unroll by adding -funroll-loops. GCC will unroll to eight independent operations in this case, unlike in the case where there is a dependency chain.
One problem you may encounter is that your compiler may need to add some code to determine whether the arrays overlap, and make two branches: one without vectorization for when they do overlap and one with vectorization for when they don't. GCC and Clang both do this, but ICC does not vectorize the loop.
ICC 13.0.01 with -O3
..B1.4: # Preds ..B1.2 ..B1.4
movss (%rsi), %xmm1 #3.21
incl %ecx #2.5
mulss %xmm0, %xmm1 #3.28
addss (%rdi), %xmm1 #3.11
movss %xmm1, (%rdi) #3.11
movss 4(%rsi), %xmm2 #3.21
addq $8, %rsi #3.21
mulss %xmm0, %xmm2 #3.28
addss 4(%rdi), %xmm2 #3.11
movss %xmm2, 4(%rdi) #3.11
addq $8, %rdi #3.11
cmpl %eax, %ecx #2.5
jb ..B1.4 # Prob 63% #2.5
To fix this you need to tell the compiler the arrays don't overlap using the __restrict keyword.
void foo(float * __restrict a, float * __restrict b, float k, int size) {
while (size--) {
(*a++) += (*b++) * k;
}
}
In this case ICC produces two branches: one for when the arrays are 16-byte aligned and one for when they are not. Here is the aligned branch:
..B1.16: # Preds ..B1.16 ..B1.15
movaps (%rsi), %xmm2 #3.21
addl $8, %r8d #2.5
movaps 16(%rsi), %xmm3 #3.21
addq $32, %rsi #1.6
mulps %xmm1, %xmm2 #3.28
mulps %xmm1, %xmm3 #3.28
addps (%rdi), %xmm2 #3.11
addps 16(%rdi), %xmm3 #3.11
movaps %xmm2, (%rdi) #3.11
movaps %xmm3, 16(%rdi) #3.11
addq $32, %rdi #1.6
cmpl %ecx, %r8d #2.5
jb ..B1.16 # Prob 82% #2.5
ICC unrolls twice in both cases. Even though GCC and Clang produce a vectorized and an unvectorized branch without __restrict, you may want to use __restrict anyway to remove the overhead of the code that determines which branch to use.
The last thing you can try is to tell the compiler the arrays are aligned. This works with GCC and Clang (3.6):
void foo(float * __restrict a, float * __restrict b, float k, int size) {
a = (float*)__builtin_assume_aligned (a, 32);
b = (float*)__builtin_assume_aligned (b, 32);
while (size--) {
(*a++) += (*b++) * k;
}
}
In this case GCC produces:
.L4:
movaps (%rsi,%r8), %xmm1
addl $1, %r10d
mulps %xmm2, %xmm1
addps (%rdi,%r8), %xmm1
movaps %xmm1, (%rdi,%r8)
addq $16, %r8
cmpl %r10d, %eax
ja .L4
Lastly, if your compiler supports OpenMP 4.0, you can use OpenMP like this:
void foo(float * __restrict a, float * __restrict b, float k, int size) {
#pragma omp simd aligned(a:32) aligned(b:32)
for(int i=0; i<size; i++) {
a[i] += k*b[i];
}
}
GCC produces the same code in this case as when using __builtin_assume_aligned. This should work for a more recent version of ICC (which I don't have).
I did not check MSVC. I expect it vectorizes this loop as well.
For more details about restrict and the compiler producing different branches for the overlapping/non-overlapping and aligned/unaligned cases, see sum-of-overlapping-arrays-auto-vectorization-and-restrict.
Here is one more suggestion to consider. If you know that the range of the loop is a multiple of the SIMD width, the compiler will not have to use cleanup code. The following code
// gcc -O3
// n = size/8
void foo(float * __restrict a, float * __restrict b, float k, int n) {
a = (float*)__builtin_assume_aligned (a, 32);
b = (float*)__builtin_assume_aligned (b, 32);
//#pragma omp simd aligned(a:32) aligned(b:32)
for(int i=0; i<n*8; i++) {
a[i] += k*b[i];
}
}
produces the simplest assembly so far.
foo(float*, float*, float, int):
sall $2, %edx
testl %edx, %edx
jle .L1
subl $4, %edx
shufps $0, %xmm0, %xmm0
shrl $2, %edx
xorl %eax, %eax
xorl %ecx, %ecx
addl $1, %edx
.L4:
movaps (%rsi,%rax), %xmm1
addl $1, %ecx
mulps %xmm0, %xmm1
addps (%rdi,%rax), %xmm1
movaps %xmm1, (%rdi,%rax)
addq $16, %rax
cmpl %edx, %ecx
jb .L4
.L1:
rep ret
I used a multiple of 8 and 32-byte alignment because then, just by using the compiler switch -mavx, the compiler produces nice AVX vectorization.
foo(float*, float*, float, int):
sall $3, %edx
testl %edx, %edx
jle .L5
vshufps $0, %xmm0, %xmm0, %xmm0
subl $8, %edx
xorl %eax, %eax
shrl $3, %edx
xorl %ecx, %ecx
addl $1, %edx
vinsertf128 $1, %xmm0, %ymm0, %ymm0
.L4:
vmulps (%rsi,%rax), %ymm0, %ymm1
addl $1, %ecx
vaddps (%rdi,%rax), %ymm1, %ymm1
vmovaps %ymm1, (%rdi,%rax)
addq $32, %rax
cmpl %edx, %ecx
jb .L4
vzeroupper
.L5:
rep ret
I am not sure how the preamble could be made simpler, but the only improvement I see left is to remove one of the iterators and a compare. Namely, the addl $1, %ecx instruction should not be necessary, and neither should the cmpl %edx, %ecx. I'm not sure how to get GCC to fix this. I had a problem like this before with GCC (see Produce loops without cmp instruction in GCC).
The functions SAXPY (single-precision), DAXPY (double-precision), CAXPY (complex single-precision), and ZAXPY (complex double-precision) compute exactly the expression you want:
Y = a * X + Y
where a is a scalar constant, and X and Y are vectors.
These functions are provided by BLAS libraries and optimized for all practical platforms: for CPUs the best BLAS implementations are OpenBLAS, Intel MKL (optimized for Intel x86 processors and Xeon Phi co-processors only), BLIS, and Apple Accelerate (OS X only); for nVidia GPUs look at cuBLAS (part of the CUDA SDK), and for any GPU, ArrayFire.
These libraries are well-optimized and deliver better performance than whatever implementation you can quickly hack up.
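(For illustration, assuming the CBLAS interface shipped with e.g. OpenBLAS and linking against it, the a += k*b update maps onto a single SAXPY call:)
#include <cblas.h>
void axpy(float *a, const float *b, float k, int size) {
// y := alpha*x + y over contiguous arrays (stride 1); here that is a += k*b
cblas_saxpy(size, k, b, 1, a, 1);
}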

Optimizations in copying a range

While reading the sources of the GNU C++ standard library, I found some code for copying (or moving, if possible) a range of iterators (file stl_algobase.h), which uses template specialization for some optimizations. The corresponding comment says:
All of these auxiliary structs serve two purposes. (1) Replace calls to copy with memmove whenever possible. (Memmove, not memcpy, because the input and output ranges are permitted to overlap.) (2) If we're using random access iterators, then write the loop as a for loop with an explicit count.
The specialization using the second optimization looks like this:
template<>
struct __copy_move<false, false, random_access_iterator_tag>
{
template<typename _II, typename _OI>
static _OI
__copy_m(_II __first, _II __last, _OI __result)
{
typedef typename iterator_traits<_II>::difference_type _Distance;
for(_Distance __n = __last - __first; __n > 0; --__n)
{
*__result = *__first;
++__first;
++__result;
}
return __result;
}
};
So, I have two questions concerning this:
How can memmove increase the speed of copying? Is it implemented somehow more effectively than a simple loop?
How can using an explicit counter in the for loop affect performance?
Some clarification: I would like to see some optimization examples actually used by compilers, not elaboration on the possibility of those.
Edit: the first question is quite nicely answered here.
Answering the second question, the explicit count does indeed lead to more opportunities for loop unrolling, though even with pointers iterating through a fixed size array, gcc does not perform aggressive unrolling unless asked to do so with -funroll-loops. The other gain comes from a potentially simpler end-of-loop comparison test for non-trivial iterators.
On a Core i7-4770, I benchmarked the time spent performing a copy of a maximally-aligned 2048-long integer array with a while loop and explicit count copy implementation. (Times in microseconds, includes call overhead; minimum of 200 samples of a timing loop with warm-up.)
                                      while   count
gcc -O3                               0.179   0.178
gcc -O3 -march=native                 0.097   0.095
gcc -O3 -march=native -funroll-loops  0.066   0.066
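(The C++ source of the two benchmarked variants isn't shown in the answer; the following is a plausible reconstruction, with the names and reference-to-array signatures taken from the assembly labels below.)
void array_copy_while(int (&src)[2048], int (&dst)[2048])
{
const int* first = src;
const int* const last = src + 2048;
int* out = dst;
while (first != last)            // iterator-style end test
*out++ = *first++;
}
void array_copy_count(int (&src)[2048], int (&dst)[2048])
{
const int* first = src;
int* out = dst;
for (int n = 2048; n > 0; --n)   // explicit count, as in the __copy_m specialization
*out++ = *first++;
}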
In each case, the generated code is very similar; the while version does a bit more work at the end, handling checks that there aren't any entries left to copy that didn't fill out a whole 128-bit (SSE) or 256-bit (AVX) register, but these are pretty much taken care of by the branch predictor. The gcc -O3 assembly for each is as follows (leaving out assembler directives). The while version:
array_copy_while(int (&) [2048], int (&) [2048]):
leaq 8192(%rdi), %rax
leaq 4(%rdi), %rdx
movq %rax, %rcx
subq %rdx, %rcx
movq %rcx, %rdx
shrq $2, %rdx
leaq 1(%rdx), %r8
cmpq $8, %r8
jbe .L11
leaq 16(%rsi), %rdx
cmpq %rdx, %rdi
leaq 16(%rdi), %rdx
setae %cl
cmpq %rdx, %rsi
setae %dl
orb %dl, %cl
je .L11
movq %r8, %r9
xorl %edx, %edx
xorl %ecx, %ecx
shrq $2, %r9
leaq 0(,%r9,4), %r10
.L9:
movdqa (%rdi,%rdx), %xmm0
addq $1, %rcx
movdqa %xmm0, (%rsi,%rdx)
addq $16, %rdx
cmpq %rcx, %r9
ja .L9
leaq 0(,%r10,4), %rdx
addq %rdx, %rdi
addq %rdx, %rsi
cmpq %r10, %r8
je .L1
movl (%rdi), %edx
movl %edx, (%rsi)
leaq 4(%rdi), %rdx
cmpq %rdx, %rax
je .L1
movl 4(%rdi), %edx
movl %edx, 4(%rsi)
leaq 8(%rdi), %rdx
cmpq %rdx, %rax
je .L20
movl 8(%rdi), %eax
movl %eax, 8(%rsi)
ret
.L11:
movl (%rdi), %edx
addq $4, %rdi
addq $4, %rsi
movl %edx, -4(%rsi)
cmpq %rdi, %rax
jne .L11
.L1:
rep ret
.L20:
rep ret
count version:
array_copy_count(int (&) [2048], int (&) [2048]):
leaq 16(%rsi), %rax
movl $2048, %ecx
cmpq %rax, %rdi
leaq 16(%rdi), %rax
setae %dl
cmpq %rax, %rsi
setae %al
orb %al, %dl
je .L23
movw $512, %cx
xorl %eax, %eax
xorl %edx, %edx
.L29:
movdqa (%rdi,%rax), %xmm0
addq $1, %rdx
movdqa %xmm0, (%rsi,%rax)
addq $16, %rax
cmpq %rdx, %rcx
ja .L29
rep ret
.L23:
xorl %eax, %eax
.L31:
movl (%rdi,%rax,4), %edx
movl %edx, (%rsi,%rax,4)
addq $1, %rax
cmpq %rax, %rcx
jne .L31
rep ret
When the iterators are more complicated, however, the difference becomes more pronounced. Consider a hypothetical container that stores values in a series of fixed-size allocated buffers. An iterator comprises a pointer to the chain of blocks, a block index and a block offset. Comparing two iterators potentially requires two comparisons. Incrementing the iterator requires checking whether we cross a block boundary.
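(A hypothetical sketch, not the actual container used in the benchmark, of what such an iterator's hot operations might look like:)
struct chunked_iterator
{
static const int block_size = 512;
int** blocks;   // pointer to the chain of fixed-size blocks
int block;      // index of the current block
int offset;     // offset within the current block
int& operator*() const { return blocks[block][offset]; }
chunked_iterator& operator++()
{
if (++offset == block_size) {   // did we cross a block boundary?
++block;
offset = 0;
}
return *this;
}
friend bool operator==(const chunked_iterator& a, const chunked_iterator& b)
{
return a.block == b.block && a.offset == b.offset;   // potentially two comparisons
}
friend bool operator!=(const chunked_iterator& a, const chunked_iterator& b)
{
return !(a == b);
}
};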
I made such a container, and performed the same benchmark for copying a 2000-long container of int, with a block size of 512 ints.
                                      while   count
gcc -O3                               1.560   2.818
gcc -O3 -march=native                 1.660   2.854
gcc -O3 -march=native -funroll-loops  1.432   2.858
That looks weird! Oh wait, it's because gcc 4.8 has a misoptimisation, where it uses conditional moves instead of nice, branch-predictor friendly comparisons. (gcc bug 56309).
Let's try icc on a different machine (Xeon E5-2670).
                 while   count
icc -O3          3.952   3.704
icc -O3 -xHost   3.898   3.624
This is closer to what we'd expect: a small but significant improvement from the simpler loop condition. On a different architecture, the gain is more pronounced. Here is clang targeting a PowerA2 at 1.6 GHz:
                 while    count
bgclang -O3      36.528   31.623
I'll omit the assembly, as it's quite long!

Storing a constant in SSE register (GCC, C++)

Hello StackOverflow community
I have encountered the following challenge: in my C++ application I have a quite complex (cubic) loop in which, at all depths, I perform the following:
Compute 4 float values
Multiply all 4 values by a constant
Convert the floats to integers
This code is to be run with thousands of iterations in each loop (resulting in billions of operations) and I want to make it as fast as possible, so I'm trying to utilize SSE processor instructions.
While trying to manually optimize the code, I have encountered the following obstacle: each time I get to the part that multiplies all values by a constant, the constant has to be loaded into an XMM register. My idea was to reserve one register (and forbid the compiler from using it), load the value once, and hardcode the multiplications with that one specific register; however, I can't find the right way to do that.
By the way, could somebody please explain to me, why does this code:
vmovaps .LC0(%rip), %xmm1
movl $1000000000, %eax
vmovaps .LC1(%rip), %xmm0
.p2align 4,,10
.p2align 3
.L2:
#APP
# 26 "sse.cpp" 1
.intel_syntax noprefix;
mulps %xmm1,%xmm0;
.att_syntax prefix;
# 0 "" 2
#NO_APP
subl $1, %eax
jne .L2
performs worse (real 0m1.656s vs real 0m1.618s) than the following one:
vmovaps .LC0(%rip), %xmm1
movl $1000000000, %eax
vmovaps .LC1(%rip), %xmm0
.p2align 4,,10
.p2align 3
.L2:
vmulps %xmm0, %xmm1, %xmm1
subl $1, %eax
jne .L2
(The difference is that I use Intel syntax in my inline asm in gcc [first snippet] and legacy SSE instructions for compatibility, while the version gcc generated automatically uses AVX instructions [second snippet].)
One note: you need to be more specific about how you compile things, and probably provide a minimal example. I know this might not be the best answer because of that, but I think it's good enough. It got long, but that's because of the code listings.
The bottom line of the work below is that it should be safe to leave this to the compiler and use appropriate compiler flags. At the bottom I put an example of how to use a local register variable, but it probably won't be very useful (it gets ignored easily). You could use a global register variable, but it doesn't yield any good results and is discouraged.
My set-up is an Intel(R) Core(TM) i7-4770 CPU, gcc version 4.9.2 and clang version 3.5.0. The code below does store avx_scalar in an xmm register with -O1 and above. With no optimization flag or with -O0, it doesn't. The code to generate assembly was:
[clang++|g++] -march=native -S -Ox ./sse.cpp,
where x was the optimization level.
The interesting thing is that with -march=native both compilers decided to use SSE4.1 versions over legacy SSE in every case I tested, even though I used legacy SSE intrinsics in the code itself. This is good.
I also tested using smmintrin.h, which is the SSE4.1 header. Without the flag, gcc uses legacy SSE and clang fails to compile with the error "SSE4.1 instruction set not enabled". With xmmintrin.h, which is the legacy SSE header, both compilers produced AVX versions in the presence of the flag, and legacy ones when it was absent.
Test code avx.cpp:
extern "C"
{
#include <smmintrin.h>
}
const float scalar = 3.14;
const __m128 avx_scalar = _mm_set1_ps(scalar);
__m128 vector;
__m128 its_me(){
__m128 ret;
__m128 result;
for(int i = 0; i < 1000; ++i)
{
vector = _mm_set_ps(i*1,i*2,i*3,i*4);
result = _mm_mul_ps(vector, avx_scalar);
ret = _mm_add_ps(ret, result);
}
return ret;
}
Relevant part of g++ -march=native -S -O2 ./avx.cpp:
.LFB639:
.cfi_startproc
vmovaps _ZL10avx_scalar(%rip), %xmm5
xorl %edx, %edx
.p2align 4,,10
.p2align 3
.L2:
leal (%rdx,%rdx), %ecx
vxorps %xmm2, %xmm2, %xmm2
vxorps %xmm1, %xmm1, %xmm1
vxorps %xmm3, %xmm3, %xmm3
leal 0(,%rdx,4), %eax
vcvtsi2ss %ecx, %xmm3, %xmm3
vxorps %xmm4, %xmm4, %xmm4
vcvtsi2ss %eax, %xmm2, %xmm2
leal (%rcx,%rdx), %eax
vcvtsi2ss %edx, %xmm4, %xmm4
addl $1, %edx
vcvtsi2ss %eax, %xmm1, %xmm1
vunpcklps %xmm4, %xmm3, %xmm3
vunpcklps %xmm1, %xmm2, %xmm1
vmovlhps %xmm3, %xmm1, %xmm1
vmulps %xmm5, %xmm1, %xmm2
vaddps %xmm2, %xmm0, %xmm0
cmpl $1000, %edx
jne .L2
vmovaps %xmm1, vector(%rip)
ret
.cfi_endproc
And clang++ -march=native -S -O2 ./avx.cpp:
# BB#0:
xorl %eax, %eax
movl $4, %ecx
movl $2, %edx
vmovaps _ZL10avx_scalar(%rip), %xmm1
xorl %esi, %esi
# implicit-def: XMM0
.align 16, 0x90
.LBB0_1: # =>This Inner Loop Header: Depth=1
leal -2(%rdx), %r8d
leal -4(%rcx), %edi
vmovd %edi, %xmm2
vpinsrd $1, %eax, %xmm2, %xmm2
vpinsrd $2, %r8d, %xmm2, %xmm2
vpinsrd $3, %esi, %xmm2, %xmm2
vcvtdq2ps %xmm2, %xmm2
vmulps %xmm1, %xmm2, %xmm2
vaddps %xmm2, %xmm0, %xmm0
leal 1(%rsi), %r8d
leal 3(%rax), %edi
vmovd %ecx, %xmm2
vpinsrd $1, %edi, %xmm2, %xmm2
vpinsrd $2, %edx, %xmm2, %xmm2
vpinsrd $3, %r8d, %xmm2, %xmm2
vcvtdq2ps %xmm2, %xmm2
vmulps %xmm1, %xmm2, %xmm3
vaddps %xmm3, %xmm0, %xmm0
addl $2, %esi
addl $6, %eax
addl $8, %ecx
addl $4, %edx
cmpl $1000, %esi # imm = 0x3E8
jne .LBB0_1
# BB#2:
vmovaps %xmm2, vector(%rip)
retq
Just for the record, you can manually put a local variable into a register, but clang ignores it completely and gcc ignores it at -O1 and above. I encourage you to look for xmm13 in the output from g++ -march=native -S -Ox ./avx.cpp with different x values for the code below (assuming you have at least 13 xmm registers on your cpu):
extern "C"
{
#include <xmmintrin.h>
}
const float scalar = 3.14;
__m128 its_me(){
__m128 vector;
register __m128 avx_scalar asm ("xmm13") = _mm_set1_ps(scalar); // that's how you do it in gcc.
//const __m128 avx_scalar = _mm_set1_ps(scalar);
__m128 ret;
__m128 result;
for(int i = 0; i < 1000; ++i)
{
vector = _mm_set_ps(i*1,i*2,i*3,i*4);
result = _mm_mul_ps(vector, avx_scalar);
ret = _mm_add_ps(ret, result);
}
return ret;
}

Can modern compilers unroll `for` loops expressed using begin and end iterators

Consider the following code
vector<double> v;
// fill v
const vector<double>::iterator end = v.end();
for(vector<double>::iterator i = v.begin(); i != end; ++i) {
// do stuff
}
Are compilers like g++, clang++, and icc able to unroll loops like this? Unfortunately I do not know assembly well enough to verify from the output whether the loop gets unrolled or not (and I only have access to g++).
To me it seems that this will require more smartness than usual on behalf of the compiler, first to deduce that the iterator is a random access iterator, and then figure out the number of times the loop is executed. Can compilers do this when optimization is enabled ?
Thanks for your replies, and before some of you start lecturing about premature optimization, this is an exercise in curiosity.
To me it seems that this will require more smartness than usual on behalf of the compiler, first to deduce that the iterator is a random access iterator, and then figure out the number of times the loop is executed.
The STL, being composed entirely of templates, has all of its code inline. So, random-access iterators already reduce to pointers by the time the compiler begins to apply optimizations. One of the reasons the STL was created was so that there would be less need for a programmer to outwit the compiler. You should rely on the STL to do the right thing until proven otherwise.
Of course, it is still up to you to choose the proper tool from the STL to use...
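(For illustration only, not from the original answer: for std::vector, the iterator loop in the question is, after inlining, essentially the pointer loop below, which is what the optimizer actually sees.)
#include <vector>
void scale(std::vector<double>& v)
{
double* p = v.data();
double* const last = p + v.size();
for (; p != last; ++p)
*p *= 2.0;   // "do stuff"
}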
Edit: There was discussion about whether g++ does any loop unrolling. On the versions that I am using, loop unrolling is not part of -O, -O2, or -O3, and I get identical assembly for the latter two levels with the following code:
void foo (std::vector<int> &v) {
volatile int c = 0;
const std::vector<int>::const_iterator end = v.end();
for (std::vector<int>::iterator i = v.begin(); i != end; ++i) {
*i = c++;
}
}
With the corresponding -O2 assembly:
_Z3fooRSt6vectorIiSaIiEE:
.LFB435:
movq 8(%rdi), %rcx
movq (%rdi), %rax
movl $0, -4(%rsp)
cmpq %rax, %rcx
je .L4
.p2align 4,,10
.p2align 3
.L3:
movl -4(%rsp), %edx
movl %edx, (%rax)
addq $4, %rax
addl $1, %edx
cmpq %rax, %rcx
movl %edx, -4(%rsp)
jne .L3
.L4:
rep
ret
With the -funroll-loops option added, the function expands into something much, much larger. But the documentation warns about this option:
Unroll loops whose number of iterations can be determined at compile time or upon entry to the loop. -funroll-loops implies -frerun-cse-after-loop. It also turns on complete loop peeling (i.e. complete removal of loops with small constant number of iterations). This option makes code larger, and may or may not make it run faster.
As a further argument to dissuade you from unrolling loops yourself, I'll finish this answer with an illustration of applying Duff's Device to the foo function above:
void foo_duff (std::vector<int> &v) {
volatile int c = 0;
const std::vector<int>::const_iterator end = v.end();
std::vector<int>::iterator i = v.begin();
switch ((end - i) % 4) do {
case 0: *i++ = c++;
case 3: *i++ = c++;
case 2: *i++ = c++;
case 1: *i++ = c++;
} while (i != end);
}
GCC has another loop optimization flag:
-ftree-loop-optimize
Perform loop optimizations on trees. This flag is enabled by default at -O and higher.
So, the -O option enables simple loop optimizations for the innermost loops, including complete loop unrolling (peeling) for loops with a fixed number of iterations. (Thanks to doc for pointing this out to me.)
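(A hypothetical example of the kind of loop this applies to: the trip count is a small compile-time constant, so complete peeling can turn it into straight-line code.)
int sum4(const int* a)
{
int s = 0;
for (int i = 0; i < 4; ++i)   // trip count known at compile time
s += a[i];
return s;                     // after peeling: a[0] + a[1] + a[2] + a[3]
}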
I would propose that whether or not the compiler CAN unroll the loop, with modern pipelined architectures and caches, unless your "do stuff" is trivial there is little benefit in doing so, and in many cases doing so would be a performance HIT instead of a boon. If your "do stuff" is nontrivial, unrolling the loop will create multiple copies of this nontrivial code, which will take extra time to load into the cache, significantly slowing down the first iteration through the unrolled loop. At the same time, it will evict more code from the cache, code which may have been necessary for performing the "do stuff" if it makes any function calls and which would then need to be reloaded into the cache again.
The purpose of unrolling loops made a lot of sense on older cacheless, non-pipelined, non-branch-predicting architectures, the goal being to reduce the overhead associated with the loop logic. Nowadays, with cache-based, pipelined, branch-predicting hardware, your CPU will be pipelined well into the next loop iteration, speculatively executing the loop code again, by the time you detect the i == end exit condition, at which point the processor will throw out that final speculatively-executed set of results. In such an architecture, loop unrolling makes very little sense. It would further bloat code for virtually no benefit.
The short answer is yes. It will unroll as much as it can. In your case, it obviously depends on how you define end (I assume your example is generic). Not only will most modern compilers unroll, but they will also vectorize and do other optimizations that will often blow your own solutions out of the water.
So what I'm saying is don't prematurely optimize! Just kidding :)
Simple answer: generally NO! At least when it comes to complete loop unrolling.
Let's test loop unrolling on this simple, dirty-coded (for testing purposes) structure.
struct Test
{
Test(): begin(arr), end(arr + 4) {}
double * begin;
double * end;
double arr[4];
};
First let's take counted loop and compile it without any optimizations.
double counted(double param, Test & d)
{
for (int i = 0; i < 4; i++)
param += d.arr[i];
return param;
}
Here's what gcc 4.9 produces.
counted(double, Test&):
pushq %rbp
movq %rsp, %rbp
movsd %xmm0, -24(%rbp)
movq %rdi, -32(%rbp)
movl $0, -4(%rbp)
jmp .L2
.L3:
movq -32(%rbp), %rax
movl -4(%rbp), %edx
movslq %edx, %rdx
addq $2, %rdx
movsd (%rax,%rdx,8), %xmm0
movsd -24(%rbp), %xmm1
addsd %xmm0, %xmm1
movq %xmm1, %rax
movq %rax, -24(%rbp)
addl $1, -4(%rbp)
.L2:
cmpl $3, -4(%rbp)
jle .L3
movq -24(%rbp), %rax
movq %rax, -40(%rbp)
movsd -40(%rbp), %xmm0
popq %rbp
ret
As expected, the loop hasn't been unrolled and, since no optimizations were performed, the code is generally very verbose. Now let's turn on the -O3 flag. Produced disassembly:
counted(double, Test&):
addsd 16(%rdi), %xmm0
addsd 24(%rdi), %xmm0
addsd 32(%rdi), %xmm0
addsd 40(%rdi), %xmm0
ret
Voilà, the loop has been unrolled this time.
Now let's take a look at the iterated loop. The function containing the loop looks like this.
double iterated(double param, Test & d)
{
for (double * it = d.begin; it != d.end; ++it)
param += *it;
return param;
}
Still using the -O3 flag, let's take a look at the disassembly.
iterated(double, Test&):
movq (%rdi), %rax
movq 8(%rdi), %rdx
cmpq %rdx, %rax
je .L3
.L4:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rdx, %rax
jne .L4
.L3:
rep ret
The code looks better than in the very first case, because optimizations were performed, but the loop hasn't been unrolled this time!
What about the -funroll-loops and -funroll-all-loops flags? They produce a result similar to this:
iterated(double, Test&):
movq (%rdi), %rsi
movq 8(%rdi), %rcx
cmpq %rcx, %rsi
je .L3
movq %rcx, %rdx
leaq 8(%rsi), %rax
addsd (%rsi), %xmm0
subq %rsi, %rdx
subq $8, %rdx
shrq $3, %rdx
andl $7, %edx
cmpq %rcx, %rax
je .L43
testq %rdx, %rdx
je .L4
cmpq $1, %rdx
je .L29
cmpq $2, %rdx
je .L30
cmpq $3, %rdx
je .L31
cmpq $4, %rdx
je .L32
cmpq $5, %rdx
je .L33
cmpq $6, %rdx
je .L34
addsd (%rax), %xmm0
leaq 16(%rsi), %rax
.L34:
addsd (%rax), %xmm0
addq $8, %rax
.L33:
addsd (%rax), %xmm0
addq $8, %rax
.L32:
addsd (%rax), %xmm0
addq $8, %rax
.L31:
addsd (%rax), %xmm0
addq $8, %rax
.L30:
addsd (%rax), %xmm0
addq $8, %rax
.L29:
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rcx, %rax
je .L44
.L4:
addsd (%rax), %xmm0
addq $64, %rax
addsd -56(%rax), %xmm0
addsd -48(%rax), %xmm0
addsd -40(%rax), %xmm0
addsd -32(%rax), %xmm0
addsd -24(%rax), %xmm0
addsd -16(%rax), %xmm0
addsd -8(%rax), %xmm0
cmpq %rcx, %rax
jne .L4
.L3:
rep ret
.L44:
rep ret
.L43:
rep ret
Compare these results with the unrolled counted loop; it's clearly not the same. What we see here is that gcc divided the loop into 8-element chunks. This can increase performance in some cases, because the loop exit condition is checked once per 8 normal loop iterations. With additional flags, vectorization could also be performed. But it isn't complete loop unrolling.
The iterated loop will be unrolled, however, if the Test object is not a function argument.
double iteratedLocal(double param)
{
Test d;
for (double * it = d.begin; it != d.end; ++it)
param += *it;
return param;
}
Disassembly produced with only -O3 flag:
iteratedLocal(double):
addsd -40(%rsp), %xmm0
addsd -32(%rsp), %xmm0
addsd -24(%rsp), %xmm0
addsd -16(%rsp), %xmm0
ret
As you can see, the loop has been unrolled. This is because the compiler can now safely assume that end has a fixed value, while it couldn't predict that for a function argument.
The Test structure is statically allocated, however. Things are more complicated with dynamically allocated structures like std::vector. From my observations on a modified Test structure, changed so that it resembles a dynamically allocated container, it looks like gcc tries its best to unroll loops, but in most cases the generated code is not as simple as the one above.
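(A hypothetical sketch of such a modification, not the author's exact code: the same structure, but with the buffer allocated on the heap, which is closer to what std::vector does.)
struct DynTest
{
DynTest() : arr(new double[4]()), begin(arr), end(arr + 4) {}
~DynTest() { delete[] arr; }
double * arr;    // heap-allocated buffer, like a std::vector's storage
double * begin;
double * end;
};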
Since you asked about other compilers, here's the output from clang 3.4.1 (-O3 flag):
counted(double, Test&): # #counted(double, Test&)
addsd 16(%rdi), %xmm0
addsd 24(%rdi), %xmm0
addsd 32(%rdi), %xmm0
addsd 40(%rdi), %xmm0
ret
iterated(double, Test&): # #iterated(double, Test&)
movq (%rdi), %rax
movq 8(%rdi), %rcx
cmpq %rcx, %rax
je .LBB1_2
.LBB1_1: # %.lr.ph
addsd (%rax), %xmm0
addq $8, %rax
cmpq %rax, %rcx
jne .LBB1_1
.LBB1_2: # %._crit_edge
ret
iteratedLocal(double): # #iteratedLocal(double)
leaq -32(%rsp), %rax
movq %rax, -48(%rsp)
leaq (%rsp), %rax
movq %rax, -40(%rsp)
xorl %eax, %eax
jmp .LBB2_1
.LBB2_2: # %._crit_edge4
movsd -24(%rsp,%rax), %xmm1
addq $8, %rax
.LBB2_1: # =>This Inner Loop Header: Depth=1
movaps %xmm0, %xmm2
cmpq $24, %rax
movaps %xmm1, %xmm0
addsd %xmm2, %xmm0
jne .LBB2_2
ret
Intel's icc 13.01 (-O3 flag)
counted(double, Test&):
addsd 16(%rdi), %xmm0 #24.5
addsd 24(%rdi), %xmm0 #24.5
addsd 32(%rdi), %xmm0 #24.5
addsd 40(%rdi), %xmm0 #24.5
ret #25.10
iterated(double, Test&):
movq (%rdi), %rdx #30.26
movq 8(%rdi), %rcx #30.41
cmpq %rcx, %rdx #30.41
je ..B3.25 # Prob 50% #30.41
subq %rdx, %rcx #30.7
movb $0, %r8b #30.7
lea 7(%rcx), %rax #30.7
sarq $2, %rax #30.7
shrq $61, %rax #30.7
lea 7(%rax,%rcx), %rcx #30.7
sarq $3, %rcx #30.7
cmpq $16, %rcx #30.7
jl ..B3.26 # Prob 10% #30.7
movq %rdx, %rdi #30.7
andq $15, %rdi #30.7
je ..B3.6 # Prob 50% #30.7
testq $7, %rdi #30.7
jne ..B3.26 # Prob 10% #30.7
movl $1, %edi #30.7
..B3.6: # Preds ..B3.5 ..B3.3
lea 16(%rdi), %rax #30.7
cmpq %rax, %rcx #30.7
jl ..B3.26 # Prob 10% #30.7
movq %rcx, %rax #30.7
xorl %esi, %esi #30.7
subq %rdi, %rax #30.7
andq $15, %rax #30.7
negq %rax #30.7
addq %rcx, %rax #30.7
testq %rdi, %rdi #30.7
jbe ..B3.11 # Prob 2% #30.7
..B3.9: # Preds ..B3.7 ..B3.9
addsd (%rdx,%rsi,8), %xmm0 #31.9
incq %rsi #30.7
cmpq %rdi, %rsi #30.7
jb ..B3.9 # Prob 82% #30.7
..B3.11: # Preds ..B3.9 ..B3.7
pxor %xmm6, %xmm6 #28.12
movaps %xmm6, %xmm7 #28.12
movaps %xmm6, %xmm5 #28.12
movsd %xmm0, %xmm7 #28.12
movaps %xmm6, %xmm4 #28.12
movaps %xmm6, %xmm3 #28.12
movaps %xmm6, %xmm2 #28.12
movaps %xmm6, %xmm1 #28.12
movaps %xmm6, %xmm0 #28.12
..B3.12: # Preds ..B3.12 ..B3.11
addpd (%rdx,%rdi,8), %xmm7 #31.9
addpd 16(%rdx,%rdi,8), %xmm6 #31.9
addpd 32(%rdx,%rdi,8), %xmm5 #31.9
addpd 48(%rdx,%rdi,8), %xmm4 #31.9
addpd 64(%rdx,%rdi,8), %xmm3 #31.9
addpd 80(%rdx,%rdi,8), %xmm2 #31.9
addpd 96(%rdx,%rdi,8), %xmm1 #31.9
addpd 112(%rdx,%rdi,8), %xmm0 #31.9
addq $16, %rdi #30.7
cmpq %rax, %rdi #30.7
jb ..B3.12 # Prob 82% #30.7
addpd %xmm6, %xmm7 #28.12
addpd %xmm4, %xmm5 #28.12
addpd %xmm2, %xmm3 #28.12
addpd %xmm0, %xmm1 #28.12
addpd %xmm5, %xmm7 #28.12
addpd %xmm1, %xmm3 #28.12
addpd %xmm3, %xmm7 #28.12
movaps %xmm7, %xmm0 #28.12
unpckhpd %xmm7, %xmm0 #28.12
addsd %xmm0, %xmm7 #28.12
movaps %xmm7, %xmm0 #28.12
..B3.14: # Preds ..B3.13 ..B3.26
lea 1(%rax), %rsi #30.7
cmpq %rsi, %rcx #30.7
jb ..B3.25 # Prob 50% #30.7
subq %rax, %rcx #30.7
cmpb $1, %r8b #30.7
jne ..B3.17 # Prob 50% #30.7
..B3.16: # Preds ..B3.17 ..B3.15
xorl %r8d, %r8d #30.7
jmp ..B3.21 # Prob 100% #30.7
..B3.17: # Preds ..B3.15
cmpq $2, %rcx #30.7
jl ..B3.16 # Prob 10% #30.7
movq %rcx, %r8 #30.7
xorl %edi, %edi #30.7
pxor %xmm1, %xmm1 #28.12
lea (%rdx,%rax,8), %rsi #31.19
andq $-2, %r8 #30.7
movsd %xmm0, %xmm1 #28.12
..B3.19: # Preds ..B3.19 ..B3.18
addpd (%rsi,%rdi,8), %xmm1 #31.9
addq $2, %rdi #30.7
cmpq %r8, %rdi #30.7
jb ..B3.19 # Prob 82% #30.7
movaps %xmm1, %xmm0 #28.12
unpckhpd %xmm1, %xmm0 #28.12
addsd %xmm0, %xmm1 #28.12
movaps %xmm1, %xmm0 #28.12
..B3.21: # Preds ..B3.20 ..B3.16
cmpq %rcx, %r8 #30.7
jae ..B3.25 # Prob 2% #30.7
lea (%rdx,%rax,8), %rax #31.19
..B3.23: # Preds ..B3.23 ..B3.22
addsd (%rax,%r8,8), %xmm0 #31.9
incq %r8 #30.7
cmpq %rcx, %r8 #30.7
jb ..B3.23 # Prob 82% #30.7
..B3.25: # Preds ..B3.23 ..B3.21 ..B3.14 ..B3.1
ret #32.14
..B3.26: # Preds ..B3.2 ..B3.6 ..B3.4 # Infreq
movb $1, %r8b #30.7
xorl %eax, %eax #30.7
jmp ..B3.14 # Prob 100% #30.7
iteratedLocal(double):
lea -8(%rsp), %rax #8.13
lea -40(%rsp), %rdx #7.11
cmpq %rax, %rdx #33.41
je ..B4.15 # Prob 50% #33.41
movq %rax, -48(%rsp) #32.12
movq %rdx, -56(%rsp) #32.12
xorl %eax, %eax #33.7
..B4.13: # Preds ..B4.11 ..B4.13
addsd -40(%rsp,%rax,8), %xmm0 #34.9
incq %rax #33.7
cmpq $4, %rax #33.7
jb ..B4.13 # Prob 82% #33.7
..B4.15: # Preds ..B4.13 ..B4.1
ret #35.14
To avoid misunderstanding: if the counted loop's condition relied on an external parameter, like this one:
double countedDep(double param, Test & d)
{
for (int i = 0; i < d.size; i++)
param += d.arr[i];
return param;
}
then such a loop will also not be unrolled.