Can the compiler optimize loops when the loop bounds (a and b in the following example) are not known at compile time?
Unoptimized:
int* arr = new int[a*b];
for (int i = 0; i < a; ++i){
for (int j = 0; j < b; ++j){
arr[i*b+j] *= 8;
}
}
// delete[] arr when done.
More optimized (assuming a and b are large):
int c = a*b;
int* arr = new int[c];
for (int i = 0; i < c; ++i){
arr[i] *= 8;
}
// delete[] arr when done.
If you treat the array as linear space, gcc (and presumably others) will optimise even without knowing the extents at compile time.
This code:
void by8(int* arr, int a, int b)
{
auto extent = a * b;
for (int i = 0; i < extent; ++i)
{
arr[i] *= 8;
}
}
compiles to this (notice how the inner part of the loop is vectorised)
by8(int*, int, int):
imull %esi, %edx
testl %edx, %edx
jle .L23
movq %rdi, %rax
andl $31, %eax
shrq $2, %rax
negq %rax
andl $7, %eax
cmpl %edx, %eax
cmova %edx, %eax
cmpl $8, %edx
jg .L26
movl %edx, %eax
.L3:
sall $3, (%rdi)
cmpl $1, %eax
je .L15
sall $3, 4(%rdi)
cmpl $2, %eax
je .L16
sall $3, 8(%rdi)
cmpl $3, %eax
je .L17
sall $3, 12(%rdi)
cmpl $4, %eax
je .L18
sall $3, 16(%rdi)
cmpl $5, %eax
je .L19
sall $3, 20(%rdi)
cmpl $6, %eax
je .L20
sall $3, 24(%rdi)
cmpl $7, %eax
je .L21
sall $3, 28(%rdi)
movl $8, %ecx
.L5:
cmpl %eax, %edx
je .L27
.L4:
leal -1(%rdx), %r8d
movl %edx, %r9d
movl %eax, %r10d
subl %eax, %r9d
subl %eax, %r8d
leal -8(%r9), %esi
shrl $3, %esi
addl $1, %esi
leal 0(,%rsi,8), %r11d
cmpl $6, %r8d
jbe .L7
leaq (%rdi,%r10,4), %r10
xorl %eax, %eax
xorl %r8d, %r8d
.L9:
vmovdqa (%r10,%rax), %ymm0
addl $1, %r8d
vpslld $3, %ymm0, %ymm0
vmovdqa %ymm0, (%r10,%rax)
addq $32, %rax
cmpl %r8d, %esi
ja .L9
addl %r11d, %ecx
cmpl %r11d, %r9d
je .L22
vzeroupper
.L7:
movslq %ecx, %rax
sall $3, (%rdi,%rax,4)
leal 1(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 2(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 3(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 4(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 5(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
addl $6, %ecx
sall $3, (%rdi,%rax,4)
cmpl %ecx, %edx
jle .L28
movslq %ecx, %rcx
sall $3, (%rdi,%rcx,4)
ret
.L22:
vzeroupper
.L23:
ret
.L27:
ret
.L26:
testl %eax, %eax
jne .L3
xorl %ecx, %ecx
jmp .L4
.L28:
ret
.L21:
movl $7, %ecx
jmp .L5
.L15:
movl $1, %ecx
jmp .L5
.L16:
movl $2, %ecx
jmp .L5
.L17:
movl $3, %ecx
jmp .L5
.L18:
movl $4, %ecx
jmp .L5
.L19:
movl $5, %ecx
jmp .L5
.L20:
movl $6, %ecx
jmp .L5
Compiler: gcc 5.4, with command-line options: -std=c++14 -O3 -march=native
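For completeness, a hypothetical call site (mine, not part of the answer) shows that nothing special is needed to hand the a-by-b array to by8 as linear space:
void scale_all(int a, int b)
{
    int* arr = new int[a * b];   // one contiguous block of a*b ints
    // ... initialize arr ...
    by8(arr, a, b);              // by8 walks it as a single flat range
    delete[] arr;
}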
Yes, it probably can, given that the sizes are constant and do not change inside the loop, as is the case here. See Optimize "for" loop for more.
FYI, in your first example, this:
arr[j*a+b] *= 8;
should be this:
arr[j*a+i] *= 8;
Modern compilers can definitely swap the order of the two loops to prevent unneeded cache misses, turning this:
for (i = 0; i < a; ++i){
for(j = 0; j < b; ++j){
arr[j*a+i] *= 8;
}
}
to this:
for(j = 0; j < b; ++j){
for (i = 0; i < a; ++i){
arr[j*a+i] *= 8;
}
}
After these optimizations, the two examples (compared to your manual optimization) shouldn't measurably differ in performance.
If you are using the Visual Studio compiler, you can pass the /Qvec-report command-line argument; it will tell you which loops are and are not being vectorized, and give you reason codes explaining why they are not.
Vectorization of loops (unlike unrolling) is where the compiler uses SIMD (SSE, SSE2, AVX) instructions to break the loop into a series of operations that are performed in parallel.
https://msdn.microsoft.com/en-us/library/jj658585.aspx
GCC and Clang may have similar capabilities.
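As a rough sketch of how this looks in practice (the flag spellings below are from memory, so treat them as assumptions and check your compiler's documentation), a loop with independent iterations like this one is a typical candidate the vectorizer will report on:
// Illustrative only: a loop the auto-vectorizer can turn into SIMD operations.
//   MSVC : cl /O2 /Qvec-report:2 scale.cpp
//   GCC  : g++ -O3 -fopt-info-vec scale.cpp
//   Clang: clang++ -O3 -Rpass=loop-vectorize scale.cpp
void scale(float* a, int n)
{
    for (int i = 0; i < n; ++i)
        a[i] *= 8.0f;   // iterations are independent, so several can run per SIMD instruction
}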
You can always unroll a for loop, even if you don't know the number of iterations at compile time, with a trick called Duff's device.
Also see the explanation here on Stack Overflow: How does Duff's device work?
You interleave a switch with a do/while loop and let the loop body process, say, 4 items at once. If you'd like to process 6 items, you can then cheat by jumping in two items before the end of the unrolled body, so that 2 + 4 = 6 items get processed:
int n = 6;
int it = n / 4;
int check = 0;
switch (n % 4) {
case 0: do { check += 1;
case 3: check += 1;
case 2: check += 1;
case 1: check += 1;
} while (it--);
}
printf("processed %i items\n", check);
I'm trying to build a simple lexical analyzer (lexer). The part I'm working on now is the tokenizer. I'm writing a function that detects separators (whitespace, tabs, newlines (CR, LF)) in the input sequence. So the question is: which code is more correct?
The code with switch-case statement:
bool isWhitespace(wchar_t &symbol) {
switch (symbol) {
case L' ':
case L'\t':
case L'\r':
case L'\n':
return true;
default:
return false;
}
}
Or the code with an if (.. || .. || ..) statement:
bool isWhitespace(wchar_t &symbol) {
if (symbol == L' ' ||
symbol == L'\t' ||
symbol == L'\r' ||
symbol == L'\n') {
return true;
}
return false;
}
And which one would be faster?
UPD
Well, here are the generated assembly code and the speed-test results:
For switch-case:
__Z12isWhitespaceRw:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %eax
movzwl (%eax), %eax
movzwl %ax, %eax
cmpl $13, %eax
je L18
cmpl $13, %eax
jg L19
subl $9, %eax
cmpl $1, %eax
ja L17
jmp L18
L19:
cmpl $32, %eax
jne L17
L18:
movl $1, %eax
jmp L20
L17:
movl $0, %eax
L20:
popl %ebp
ret
And for if:
__Z12isWhitespaceRw:
pushl %ebp
movl %esp, %ebp
movl 8(%ebp), %eax
movzwl (%eax), %eax
cmpw $32, %ax
je L22
movl 8(%ebp), %eax
movzwl (%eax), %eax
cmpw $9, %ax
je L22
movl 8(%ebp), %eax
movzwl (%eax), %eax
cmpw $13, %ax
je L22
movl 8(%ebp), %eax
movzwl (%eax), %eax
cmpw $10, %ax
jne L23
L22:
movl $1, %eax
jmp L24
L23:
movl $0, %eax
L24:
popl %ebp
ret
I'm not an assembler mega-expert, but it looks like the switch-case code has more jumps and fewer compare operations.
And the speed measurements:
Run 1:
Switch-case: %time = 7.7, self = 0.01
If: %time = 46.2, self = 0.06
Run 2:
Switch-case: %time = 34.6, self = 0.03
If: %time = 34.6, self = 0.03
UPD #2
Yes, Unicode. Isn't it obvious from looking at the code?!
Both are correct but the switch/case version will produce warnings with some compilers.
Both are likely to create the same (or similar) machine code. So there is no significant performance difference.
The second version makes it clearer what is happening. This is not a typical use case for switch/case, but that's just my opinion.
The order of comparisons does have an effect on performance. You should start with the most frequent case, then the second most frequent, and so on. This reduces the average number of comparisons.
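For example, if ordinary spaces dominate your input, a version along these lines (my sketch, not code from the question) answers most calls after a single comparison:
bool isWhitespace(wchar_t symbol) {
    if (symbol == L' ')   // the most frequent separator is checked first
        return true;
    return symbol == L'\t' || symbol == L'\r' || symbol == L'\n';
}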
I have a question regarding GCC's optimization flags and how they work.
I have a very long piece of code that works entirely on local arrays and variables. At the end of the code, I copy the contents of the local array to a global array. Here is an extremely stripped-down example of my code:
uint8_t globalArray[16]={0};
void func()
{
unsigned char localArray[16]={0};
for (int r=0; r<1000000; r++)
{
**manipulate localArray with a lot of calculations**
}
memcpy(&globalArray,localArray,16);
}
Here's the approximate runtime of the code in three different scenarios:
Without "-O3" optimization: 3.203s
With "-O3" optimization: 1.457s
With "-O3" optimization and without the final memcpy(&globalArray,localArray,16); statement: 0.015s
Without copying the local array into the global array, the code runs almost 100 times faster. I know that the global array is stored in memory and the local array is stored in registers. My question is:
Why does just copying 16 elements of a local array to a global array cause the code to run 100 times slower? I have searched this forum and online and cannot find a definite answer to this particular scenario of mine.
Is there any way that I can extract the contents of the local variable without the speed loss?
Thank you in advance to anyone that can help me with this problem.
Without the memcpy, your compiler will likely see that localArray is never read from, so it doesn't need to do any of the calculations in the loop body.
Take this code as an example:
uint8_t globalArray[16]={0};
void func()
{
unsigned char localArray[16]={0};
for (int r=0; r<1000000; r++)
{
localArray[r%16] = r;
}
memcpy(&globalArray,localArray,16);
}
Clang 3.7.1 with -O3 outputs this assembly:
func(): # #func()
# BB#0:
xorps %xmm0, %xmm0
movaps %xmm0, -24(%rsp)
#DEBUG_VALUE: r <- 0
xorl %eax, %eax
.LBB0_1: # =>This Inner Loop Header: Depth=1
#DEBUG_VALUE: r <- 0
movl %eax, %ecx
sarl $31, %ecx
shrl $28, %ecx
leal (%rcx,%rax), %ecx
andl $-16, %ecx
movl %eax, %edx
subl %ecx, %edx
movslq %edx, %rcx
movb %al, -24(%rsp,%rcx)
leal 1(%rax), %ecx
#DEBUG_VALUE: r <- ECX
movl %ecx, %edx
sarl $31, %edx
shrl $28, %edx
leal 1(%rax,%rdx), %edx
andl $-16, %edx
negl %edx
leal 1(%rax,%rdx), %edx
movslq %edx, %rdx
movb %cl, -24(%rsp,%rdx)
leal 2(%rax), %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $28, %edx
leal 2(%rax,%rdx), %edx
andl $-16, %edx
negl %edx
leal 2(%rax,%rdx), %edx
movslq %edx, %rdx
movb %cl, -24(%rsp,%rdx)
leal 3(%rax), %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $28, %edx
leal 3(%rax,%rdx), %edx
andl $-16, %edx
negl %edx
leal 3(%rax,%rdx), %edx
movslq %edx, %rdx
movb %cl, -24(%rsp,%rdx)
leal 4(%rax), %ecx
movl %ecx, %edx
sarl $31, %edx
shrl $28, %edx
leal 4(%rax,%rdx), %edx
andl $-16, %edx
negl %edx
leal 4(%rax,%rdx), %edx
movslq %edx, %rdx
movb %cl, -24(%rsp,%rdx)
addl $5, %eax
cmpl $1000000, %eax # imm = 0xF4240
jne .LBB0_1
# BB#2:
movaps -24(%rsp), %xmm0
movaps %xmm0, globalArray(%rip)
retq
For the same code without the memcpy, it outputs this:
func(): # #func()
# BB#0:
#DEBUG_VALUE: r <- 0
retq
Even if you know nothing about assembly, it's easy to see that the latter does nothing at all.
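The same principle can be seen on a much smaller example (illustrative, not from the question): any computation whose result is never observed is dead and can be removed entirely.
// 'x' is never read after the loop, so the whole loop is dead code.
void wasted_work()
{
    int x = 0;
    for (int i = 0; i < 1000; ++i)
        x += i;   // result never used
}   // an optimizing compiler typically reduces this function to a bare ret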
I've been told many times that recursion is slow due to function calls, but in this code, it seems much faster than the iterative solution. At best, I typically expect a compiler to optimize recursion into iteration (which looking at the assembly, did seem to happen).
#include <iostream>
bool isDivisable(int x, int y)
{
for (int i = y; i != 1; --i)
if (x % i != 0)
return false;
return true;
}
bool isDivisableRec(int x, int y)
{
if (y == 1)
return true;
return x % y == 0 && isDivisableRec(x, y-1);
}
int findSmallest()
{
int x = 20;
for (; !isDivisable(x,20); ++x);
return x;
}
int main()
{
std::cout << findSmallest() << std::endl;
}
Assembly here: https://gist.github.com/PatrickAupperle/2b56e16e9e5a6a9b251e
I'd love to know what is going on here. I'm sure it is some tricky compiler optimization that I can be amazed to learn about.
Edit: I just realized I forgot to mention that if I use the recursive version, it runs in about .25 seconds; the iterative, about .6.
Edit 2: I am compiling with -O3 using
$ g++ --version
g++ (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4
Though I'm not really sure why that matters.
Edit 3:
Better benchmarking:
Source: http://gist.github.com/PatrickAupperle/ee8241ac51417437d012
Output: http://gist.github.com/PatrickAupperle/5870136a5552b83fd0f1
Running with 100 iterations shows very similar results
Edit 4:
At Roman's suggestion, I added -fno-inline-functions -fno-inline-small-functions to the compilation flags. The effect is extremely bizarre to me. The code runs about 15x faster, but the ratio between the recursive version and the iterative version remains similar.
https://gist.github.com/PatrickAupperle/3a87eb53a9f11c1f0bec
Using this code I also see a large timing difference (in favor of the recursive version) with GCC 4.9.3 in Cygwin. I get
13.411 seconds for iterative
4.29101 seconds for recursive
Looking at the assembly code it generated with -O3, I see two things:
The compiler replaced the tail recursion in isDivisableRec with a loop and then unrolled that loop: each iteration of the loop in the machine code covers two levels of the original recursion.
_Z14isDivisableRecii:
.LFB1467:
.seh_endprologue
movl %edx, %r8d
.L15:
cmpl $1, %r8d
je .L18
movl %ecx, %eax ; First unrolled divisibility check
cltd
idivl %r8d
testl %edx, %edx
je .L20
.L19:
xorl %eax, %eax
ret
.p2align 4,,10
.L20:
leal -1(%r8), %r9d
cmpl $1, %r9d
jne .L21
.p2align 4,,10
.L18:
movl $1, %eax
ret
.p2align 4,,10
.L21:
movl %ecx, %eax ; Second unrolled divisibility check
cltd
idivl %r9d
testl %edx, %edx
jne .L19
subl $2, %r8d
jmp .L15
.seh_endproc
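In C++ terms, the listing above behaves roughly like this hand-unrolled loop (my sketch of what the machine code does, not code from the answer):
// Tail recursion rewritten as a loop, unrolled so each pass checks two divisors.
bool isDivisableRecLowered(int x, int y)
{
    while (y != 1)
    {
        if (x % y != 0)          // first unrolled divisibility check
            return false;
        if (y - 1 == 1)
            return true;
        if (x % (y - 1) != 0)    // second unrolled divisibility check
            return false;
        y -= 2;
    }
    return true;
}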
The compiler inlined several iterations of isDivisableRec by lifting them into findSmallestRec. Since the value of the y parameter of isDivisableRec is hardcoded as 20, the compiler managed to replace the iterations for 20, 19, ..., 15 with some "magical" code inlined directly into findSmallestRec. The actual call to isDivisableRec happens only for a y parameter value of 14 (if it happens at all).
Here's the inlined code in findSmallestRec:
movl $20, %ebx
movl $1717986919, %esi ; Magic constants
movl $1808407283, %edi ; for divisibility tests
movl $954437177, %ebp ;
movl $2021161081, %r12d ;
movl $-2004318071, %r13d ;
jmp .L28
.p2align 4,,10
.L29: ; The main cycle
addl $1, %ebx
.L28:
movl %ebx, %eax ; Divisibility by 20 test
movl %ebx, %ecx
imull %esi
sarl $31, %ecx
sarl $3, %edx
subl %ecx, %edx
leal (%rdx,%rdx,4), %eax
sall $2, %eax
cmpl %eax, %ebx
jne .L29
movl %ebx, %eax ; Divisibility by 19 test
imull %edi
sarl $3, %edx
subl %ecx, %edx
leal (%rdx,%rdx,8), %eax
leal (%rdx,%rax,2), %eax
cmpl %eax, %ebx
jne .L29
movl %ebx, %eax ; Divisibility by 18 test
imull %ebp
sarl $2, %edx
subl %ecx, %edx
leal (%rdx,%rdx,8), %eax
addl %eax, %eax
cmpl %eax, %ebx
jne .L29
movl %ebx, %eax ; Divisibility by 17 test
imull %r12d
sarl $3, %edx
subl %ecx, %edx
movl %edx, %eax
sall $4, %eax
addl %eax, %edx
cmpl %edx, %ebx
jne .L29
testb $15, %bl ; Divisibility by 16 test
jne .L29
movl %ebx, %eax ; Divisibility by 15 test
imull %r13d
leal (%rdx,%rbx), %eax
sarl $3, %eax
subl %ecx, %eax
movl %eax, %edx
sall $4, %edx
subl %eax, %edx
cmpl %edx, %ebx
jne .L29
movl $14, %edx
movl %ebx, %ecx
call _Z14isDivisableRecii ; call isDivisableRecii(x, 14)
...
The above blocks of machine instructions before each jne .L29 jump are the divisibility tests for 20, 19, ..., 15 lifted directly into findSmallestRec. Apparently, they are more efficient than the tests used inside isDivisableRec for a run-time value of y. As you can see, the divisibility-by-16 test is implemented simply as testb $15, %bl. Because of this, non-divisibility of x by the higher values of y is caught early by this highly optimized code.
None of this happens for isDivisable and findSmallest: they are basically translated literally. Even the loop is not unrolled.
I believe it is the second optimization that accounts for most of the difference. The compiler used highly optimized methods of checking divisibility for the higher y values, which happen to be known at compile time.
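The key point is that a remainder by a compile-time constant never needs an actual division instruction. A minimal sketch of the effect (my example, not code from the answer):
// With a constant divisor the compiler strength-reduces the modulo:
bool div16(int x) { return x % 16 == 0; }  // power of two: a simple test of the low bits, like testb $15 above
bool div20(int x) { return x % 20 == 0; }  // otherwise: an imul by a "magic" reciprocal plus shifts and a compare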
If you replace the second argument of isDivisableRec with an "unpredictable" run-time value of 20 (instead of the hard-coded compile-time constant 20), it should disable this optimization and bring the timings in line; one way to do this is sketched after the timings below. I just tried this and ended up with
12.9 seconds for iterative
13.26 seconds for recursive
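One simple way to make the value opaque to the optimizer (a sketch; the exact change behind the timings above isn't shown here) is to launder the constant through a volatile:
// Reading the bound through a volatile blocks constant propagation, so the
// compiler can no longer specialize the divisibility tests for y = 20..15.
volatile int bound = 20;

int findSmallestRecOpaque()   // same shape as the question's findSmallest, but recursive
{
    int x = 20;
    for (; !isDivisableRec(x, bound); ++x);
    return x;
}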
I have written a simple Fibonacci function as an exercise in C++ (using Visual Studio) to test tail recursion and to see how it works.
This is the code:
int fib_tail(int n, int res, int next) {
if (n == 0) {
return res;
}
return fib_tail(n - 1, next, res + next);
}
int main()
{
fib_tail(10,0,1); //Tail Recursion works
}
When I compiled in Release mode I saw that the optimized assembly uses a JMP instruction instead of a call. So my conclusion was: tail recursion works. See image below:
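In C++ terms, the jump-based code the compiler produced behaves like a plain loop along these lines (my own sketch to illustrate the transformation, not the actual disassembly):
// What tail-call optimization effectively does to fib_tail: the recursive
// call becomes a jump back to the top, i.e. an ordinary loop.
int fib_tail_as_loop(int n, int res, int next) {
    while (n != 0) {
        int sum = res + next;
        res = next;
        next = sum;
        --n;
    }
    return res;
}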
I wanted to do some performance tests by increasing the input variable n in my Fibonacci function. I opted to change the variable type used in the function from int to unsigned long long, and then passed a big number like 10e+08.
This is now the new function:
typedef unsigned long long ULONG64;
ULONG64 fib_tail(ULONG64 n, ULONG64 res, ULONG64 next) {
if (n == 0) {
return res;
}
return fib_tail(n - 1, next, res + next);
}
int main()
{
fib_tail(10e+9,0,1); //Tail recursion does not work
}
When I ran the code above I got a stack overflow exception, which made me think that tail recursion was not working. I looked at the assembly and in fact I found this:
As you can see, now there is a call instruction, whereas I was expecting only a simple JMP. I don't understand why using an 8-byte variable disables tail recursion. Why doesn't the compiler perform the optimization in such a case?
This is one of those questions you'd have to ask the guys who do compiler optimisation for MS; there is really no technical reason why ANY return type should prevent tail recursion from being compiled as a jump. There may be OTHER reasons, such as "the code is too complex to understand" or some such.
clang 3.7 as of a couple of weeks back clearly figures it out:
_Z8fib_tailyyy: # #_Z8fib_tailyyy
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
pushl %eax
movl 36(%esp), %ecx
movl 32(%esp), %esi
movl 28(%esp), %edi
movl 24(%esp), %ebx
movl %ebx, %eax
orl %edi, %eax
je .LBB0_1
movl 44(%esp), %ebp
movl 40(%esp), %eax
movl %eax, (%esp) # 4-byte Spill
.LBB0_3: # %if.end
movl %ebp, %edx
movl (%esp), %eax # 4-byte Reload
addl $-1, %ebx
adcl $-1, %edi
addl %eax, %esi
adcl %edx, %ecx
movl %ebx, %ebp
orl %edi, %ebp
movl %esi, (%esp) # 4-byte Spill
movl %ecx, %ebp
movl %eax, %esi
movl %edx, %ecx
jne .LBB0_3
jmp .LBB0_4
.LBB0_1:
movl %esi, %eax
movl %ecx, %edx
.LBB0_4: # %return
addl $4, %esp
popl %esi
popl %edi
popl %ebx
popl %ebp
retl
main: # #main
subl $28, %esp
movl $0, 20(%esp)
movl $1, 16(%esp)
movl $0, 12(%esp)
movl $0, 8(%esp)
movl $2, 4(%esp)
movl $1410065408, (%esp) # imm = 0x540BE400
calll _Z8fib_tailyyy
movl %edx, f+4
movl %eax, f
xorl %eax, %eax
addl $28, %esp
retl
The same applies to gcc 4.9.2 if you give it -O2 (but not -O1, which was all clang needed).
(And of course also in 64-bit mode)
I had thought one compare must be faster than two. But after my test, I found that in debug mode the short compare is a bit faster, and in release mode the char compare is faster. I want to know the real reason.
Following are the test code and test results. I wrote two simple functions: func1() uses two char compares, and func2() uses one short compare. The main function returns the accumulated value so that compiler optimization cannot discard my test code. My compiler is GCC 4.7.2, and the CPU is an Intel® Xeon® CPU E5-2430 0 @ 2.20GHz (VM).
inline int func1(unsigned char word[2])
{
if (word[0] == 0xff && word[1] == 0xff)
return 1;
return 0;
}
inline int func2(unsigned char word[2])
{
if (*(unsigned short*)word == 0xffff)
return 1;
return 0;
}
int main()
{
int n_ret = 0;
for (int j = 0; j < 10000; ++j)
for (int i = 0; i < 70000; ++i)
n_ret += func2((unsigned char*)&i);
return n_ret;
}
Debug mode:
func1 func2
real 0m3.621s 0m3.586s
user 0m3.614s 0m3.579s
sys 0m0.001s 0m0.000s
Release mode:
func1 func2
real 0m0.833s 0m0.880s
user 0m0.831s 0m0.878s
sys 0m0.000s 0m0.002s
func1 edition's assembly code:
.cfi_startproc
movl $10000, %esi
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L6:
movl $1, %edx
xorl %ecx, %ecx
.p2align 4,,10
.p2align 3
.L8:
movl %edx, -24(%rsp)
addl $1, %edx
addl %ecx, %eax
cmpl $70001, %edx
je .L3
xorl %ecx, %ecx
cmpb $-1, -24(%rsp)
jne .L8
xorl %ecx, %ecx
cmpb $-1, -23(%rsp)
sete %cl
jmp .L8
.p2align 4,,10
.p2align 3
.L3:
subl $1, %esi
jne .L6
rep
ret
.cfi_endproc
func2 edition's assembly code:
.cfi_startproc
movl $10000, %esi
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L4:
movl $1, %edx
xorl %ecx, %ecx
jmp .L3
.p2align 4,,10
.p2align 3
.L7:
movzwl -24(%rsp), %ecx
.L3:
cmpw $-1, %cx
movl %edx, -24(%rsp)
sete %cl
addl $1, %edx
movzbl %cl, %ecx
addl %ecx, %eax
cmpl $70001, %edx
jne .L7
subl $1, %esi
jne .L4
rep
ret
.cfi_endproc
In GCC 4.6.3 the code is different for the first and second pieces of code, and the runtime for the func1 option is noticeably slower if you run it for long enough. Unfortunately, with your very short runtime, the two appear similar in time.
Increasing the outer loop by a factor of 10 means it takes about 6 seconds for func2, and 10 seconds for func1. This is using gcc -std=c99 -O3 to compile the code.
The main difference, I expect, is from the extra branch introduced with the && statement. And the extra xorl %ecx, %ecx doesn't help much (I get the same, although my code looks subtly different when it comes to label names).
Edit: I did try to come up with a branchless solution using a bitwise AND instead of a branch, but the compiler refuses to inline the function, so it takes 30 seconds instead of 10.
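For reference, the kind of branchless variant I mean looks roughly like this (a sketch of the idea, not the exact code I benchmarked):
// Bitwise & evaluates both comparisons unconditionally, avoiding the
// short-circuit branch that && introduces.
inline int func1_branchless(unsigned char word[2])
{
    return (word[0] == 0xff) & (word[1] == 0xff);
}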
Benchmarks run on:
AMD Phenom(tm) II X4 965, running at 3.4 GHz.