Here is some C++ code:
#include <iostream>
int main(){
int a = 4;
std::cout << (a + 1) << '\n';
return 0;
and here is the corresponding assembly code for the main function, as produced by g++:
.globl main
.type main, #function
.cfi_personality 0x0,__gxx_personality_v0
pushl %ebp
.cfi_def_cfa_offset 8
movl %esp, %ebp
.cfi_offset 5, -8
.cfi_def_cfa_register 5
andl $-16, %esp
subl $32, %esp
movl $4, 28(%esp) # int a = 4;
jmp .L2
movl 28(%esp), %eax # std::cout << (a + 1) << '\n';
addl $1, %eax
movl %eax, 4(%esp)
movl $_ZSt4cout, (%esp)
call _ZNSolsEi
movl $10, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
cmpl $0, 28(%esp)
setne %al
subl $1, 28(%esp) # a = a - 1
testb %al, %al
jne .L3
movl $0, %eax
.size main, .-main
What are the setne and testb instructions used for in the following fragment?
cmpl $0, 28(%esp)
setne %al
subl $1, 28(%esp) # a = a - 1
testb %al, %al
jne .L3
Couldn't the while loop simply check whether a is non-zero and then jump?
The while condition is formally the equivalent of:
while ( a -- != 0 )
(Omitting the comparison is a legal obfuscation.)
The compiler is generating code to compare a with 0, save the
result in register al, then decrement a, and then test the saved
result to decide whether to jump back into the loop body.
Because a-- means
return tmpval;
so compiler needs to save the previous value of a.
In this program, the body part of while will be executed when a = 0 (after a--, so it will print 1).
It's been a long while since I did assembler, but I would assume it's some optimisation to keep the pipelines busy / optimise register use.
Here is my question:
I wrote a piece of assembly code. It reads a file, converts the content to uppercase, and writes the output to a new file.
I compile and link the assembly code with:
as -gstabs read-files.s -o read-files.o
ld read-files.o -o read-files
And a test like "./read-files input-file output-file" works well.
But what if I want to debug this piece of code with gdb? I tried, but:
when I set the breakpoint and args of target code in gdb with:
(gdb) b *_start+1
(gdb) run test-file TEST-FILE
It ends with a segmentation fault immediately.
Can I really debug this code the way I described above? Thanks.
The assembly code is here:
.section .data
.equ SYS_OPEN, 5
.equ SYS_WRITE, 4
.equ SYS_READ, 3
.equ SYS_CLOSE, 6
.equ SYS_EXIT, 1
.equ O_RDONLY, 0
.equ STDIN, 0
.equ STDOUT, 1
.equ STDERR, 2
.equ LINUX_SYSCALL, 0x80
.equ END_OF_FILE, 0
.section .bss
.equ BUFFER_SIZE, 500
.section .text
.equ ST_FD_IN, -4
.equ ST_FD_OUT, -8
.equ ST_ARGC, 0
.equ ST_ARGV_0, 4
.equ ST_ARGV_1, 8
.equ ST_ARGV_2, 12
.globl _start
movl %esp, %ebp
subl $ST_SIZE_RESERVE, %esp
movl $SYS_OPEN, %eax
movl ST_ARGV_1(%ebp), %ebx
movl $O_RDONLY, %ecx
movl $0666, %edx
movl %eax, ST_FD_IN(%ebp)
movl $SYS_OPEN, %eax
movl ST_ARGV_2(%ebp), %ebx
movl $0666, %edx
movl %eax, ST_FD_OUT(%ebp)
movl $SYS_READ, %eax
movl ST_FD_IN(%ebp), %ebx
movl $BUFFER_DATA, %ecx
movl $BUFFER_SIZE, %edx
cmpl $END_OF_FILE, %eax
jle end_loop
pushl %eax
call convert_to_upper
popl %eax
addl $4, %esp
movl %eax, %edx
movl $SYS_WRITE, %eax
movl ST_FD_OUT(%ebp), %ebx
movl $BUFFER_DATA, %ecx
jmp read_loop_begin
movl $SYS_CLOSE, %eax
movl ST_FD_OUT(%ebp), %ebx
movl $SYS_CLOSE, %eax
movl ST_FD_IN(%ebp), %ebx
movl $SYS_EXIT, %eax
movl $0, %ebx
.equ LOWERCASE_A, 'a'
.equ LOWERCASE_Z, 'z'
.equ UPPER_CONVERSION, 'A' - 'a'
.equ ST_BUFFER, 12
pushl %ebp
movl %esp, %ebp
movl ST_BUFFER(%ebp), %eax
movl ST_BUFFER_LEN(%ebp), %ebx
movl $0, %edi
cmpl $0, %ebx
je end_convert_loop
movb (%eax, %edi, 1), %cl
cmpb $LOWERCASE_A, %cl
jl next_byte
cmpb $LOWERCASE_Z, %cl
jg next_byte
movb %cl, (%eax, %edi, 1)
incl %edi
cmpl %edi, %ebx
jne convert_loop
movl %ebp, %esp
popl %ebp
I've been told many times that recursion is slow due to function calls, but in this code, it seems much faster than the iterative solution. At best, I typically expect a compiler to optimize recursion into iteration (which looking at the assembly, did seem to happen).
#include <iostream>
bool isDivisable(int x, int y)
for (int i = y; i != 1; --i)
if (x % i != 0)
return false;
return true;
bool isDivisableRec(int x, int y)
if (y == 1)
return true;
return x % y == 0 && isDivisableRec(x, y-1);
int findSmallest()
int x = 20;
for (; !isDivisable(x,20); ++x);
return x;
int main()
std::cout << findSmallest() << std::endl;
Assembly here:
I'd love to know what is going on here. I'm sure it is some tricky compiler optimization that I can be amazed to learn about.
Edit: I just realized I forgot to mention that if I use the recursive version, it runs in about .25 seconds, the iterative, about .6.
Edit 2: I am compiling with -O3 using
$ g++ --version
g++ (Ubuntu 4.8.4-2ubuntu1~14.04) 4.8.4
Though I'm not really sure why that matters.
Edit 3:
Better benchmarking:
Running with 100 iterations shows very similar results
Edit 4:
At Roman's suggestion, I added -fno-inline-functions -fno-inline-small-functions to the compilation flags. The effect is extremely bizarre to me. The code runs about 15x faster, but the ratio between the recursive version and the iterative version remains similar.
Using this code I also see large timing difference (in favor of the recursive version) with GCC 4.9.3 in Cygwin. I get
13.411 seconds for iterative
4.29101 seconds for recursive
Looking at the assembly code it generated with -O3, I see two things
The compiler replaced tail recursion in isDivisableRec with a cycle and then unrolled the cycle: each iteration of the cycle in the machine code covers two levels of the original recursion.
movl %edx, %r8d
cmpl $1, %r8d
je .L18
movl %ecx, %eax ; First unrolled divisibility check
idivl %r8d
testl %edx, %edx
je .L20
xorl %eax, %eax
.p2align 4,,10
leal -1(%r8), %r9d
cmpl $1, %r9d
jne .L21
.p2align 4,,10
movl $1, %eax
.p2align 4,,10
movl %ecx, %eax ; Second unrolled divisibility check
idivl %r9d
testl %edx, %edx
jne .L19
subl $2, %r8d
jmp .L15
The compiler inlined several iterations of isDivisableRec by lifting them into findSmallestRec. Since the value of y parameter of isDivisableRec is hardcoded as 20 the compiler managed to replace the iterations for 20, 19...15 with some "magical" code inlined directly into findSmallestRec. The actual call to isDivisableRec happens only for y parameter value of 14 (if it happens at all).
Here's the inlined code in findSmallestRec
movl $20, %ebx
movl $1717986919, %esi ; Magic constants
movl $1808407283, %edi ; for divisibility tests
movl $954437177, %ebp ;
movl $2021161081, %r12d ;
movl $-2004318071, %r13d ;
jmp .L28
.p2align 4,,10
.L29: ; The main cycle
addl $1, %ebx
movl %ebx, %eax ; Divisibility by 20 test
movl %ebx, %ecx
imull %esi
sarl $31, %ecx
sarl $3, %edx
subl %ecx, %edx
leal (%rdx,%rdx,4), %eax
sall $2, %eax
cmpl %eax, %ebx
jne .L29
movl %ebx, %eax ; Divisibility by 19 test
imull %edi
sarl $3, %edx
subl %ecx, %edx
leal (%rdx,%rdx,8), %eax
leal (%rdx,%rax,2), %eax
cmpl %eax, %ebx
jne .L29
movl %ebx, %eax ; Divisibility by 18 test
imull %ebp
sarl $2, %edx
subl %ecx, %edx
leal (%rdx,%rdx,8), %eax
addl %eax, %eax
cmpl %eax, %ebx
jne .L29
movl %ebx, %eax ; Divisibility by 17 test
imull %r12d
sarl $3, %edx
subl %ecx, %edx
movl %edx, %eax
sall $4, %eax
addl %eax, %edx
cmpl %edx, %ebx
jne .L29
testb $15, %bl ; Divisibility by 16 test
jne .L29
movl %ebx, %eax ; Divisibility by 15 test
imull %r13d
leal (%rdx,%rbx), %eax
sarl $3, %eax
subl %ecx, %eax
movl %eax, %edx
sall $4, %edx
subl %eax, %edx
cmpl %edx, %ebx
jne .L29
movl $14, %edx
movl %ebx, %ecx
call _Z14isDivisableRecii ; call isDivisableRecii(x, 14)
The above blocks of machine instructions before each jne .L29 jump are divisibility tests for 20, 19...15 lifted directly into findSmallestRec. Apparently, they are more efficient than the tests used inside isDivisableRec for a run-time value of y. As you can see, the divisibility by 16 test is implemented simply as testb $15, %bl. Because of this, non-divisibility of x by high values of y is caught early by the above highly optimized code.
None of this happens for isDivisable and findSmallest - they are basically translated literally. Even the cycle is not unrolled.
I believe it is the second optimization that accounts for most of the difference. The compiler used highly optimized methods of checking divisibility for higher y values, which happen to be known at compile time.
If you replace the second argument of isDivisableRec with an "unpredictable" run-time value of 20 (instead of hard-coded compile-time constant 20), it should disable this optimization and bring the timings in line. I just tried this and ended up with
12.9 seconds for iterative
13.26 seconds for recursive
I have written a simple Fibonacci function as an exercise in C++ (using Visual Studio) to test Tail Recursion and to see how it works.
this is the code:
int fib_tail(int n, int res, int next) {
if (n == 0) {
return res;
return fib_tail(n - 1, next, res + next);
int main()
fib_tail(10,0,1); //Tail Recursion works
When I compiled in Release mode, I saw that the optimized assembly used a JMP instruction instead of a CALL. So my conclusion was: tail recursion works. See image below:
I wanted to do some performance tests by increasing the input variable n in my Fibonacci function. I then opted to change the type of the variables used in the function from int to unsigned long long. Then I passed a big number such as 10e+9.
This is now the new function:
typedef unsigned long long ULONG64;
ULONG64 fib_tail(ULONG64 n, ULONG64 res, ULONG64 next) {
if (n == 0) {
return res;
return fib_tail(n - 1, next, res + next);
int main()
fib_tail(10e+9,0,1); //Tail recursion does not work
When I ran the code above I got a stack overflow exception, which made me think that tail recursion was not working. I looked at the assembly and in fact I found this:
As you can see, there is now a call instruction, whereas I was expecting only a simple JMP. I don't understand why using an 8-byte variable disables tail recursion. Why doesn't the compiler perform the optimization in this case?
This is one of those questions that you'd have to ask the guys that do compiler optimisation for MS - there is really no technical reason why ANY return type should prevent tail-recursion from being a jump as such - there may be OTHER reasons such as "the code is too complex to understand" or some such.
clang 3.7 as of a couple of weeks back clearly figures it out:
_Z8fib_tailyyy: # #_Z8fib_tailyyy
pushl %ebp
pushl %ebx
pushl %edi
pushl %esi
pushl %eax
movl 36(%esp), %ecx
movl 32(%esp), %esi
movl 28(%esp), %edi
movl 24(%esp), %ebx
movl %ebx, %eax
orl %edi, %eax
je .LBB0_1
movl 44(%esp), %ebp
movl 40(%esp), %eax
movl %eax, (%esp) # 4-byte Spill
.LBB0_3: # %if.end
movl %ebp, %edx
movl (%esp), %eax # 4-byte Reload
addl $-1, %ebx
adcl $-1, %edi
addl %eax, %esi
adcl %edx, %ecx
movl %ebx, %ebp
orl %edi, %ebp
movl %esi, (%esp) # 4-byte Spill
movl %ecx, %ebp
movl %eax, %esi
movl %edx, %ecx
jne .LBB0_3
jmp .LBB0_4
movl %esi, %eax
movl %ecx, %edx
.LBB0_4: # %return
addl $4, %esp
popl %esi
popl %edi
popl %ebx
popl %ebp
main: # #main
subl $28, %esp
movl $0, 20(%esp)
movl $1, 16(%esp)
movl $0, 12(%esp)
movl $0, 8(%esp)
movl $2, 4(%esp)
movl $1410065408, (%esp) # imm = 0x540BE400
calll _Z8fib_tailyyy
movl %edx, f+4
movl %eax, f
xorl %eax, %eax
addl $28, %esp
Same applies to gcc 4.9.2 if you give it -O2 (but not in -O1 which was all clang needed)
(And of course also in 64-bit mode)
I had thought that one compare must be faster than two. But in my test I found that in debug mode the short compare is a bit faster, while in release mode the char compare is faster. I want to know the real reason.
Following is the test code and the test result. I wrote two simple functions: func1() uses two char compares, and func2() uses one short compare. The main function returns the accumulated result so that the compiler does not optimize my test code away. My compiler is GCC 4.7.2, CPU Intel® Xeon® CPU E5-2430 0 @ 2.20GHz (VM).
inline int func1(unsigned char word[2])
if (word[0] == 0xff && word[1] == 0xff)
return 1;
return 0;
inline int func2(unsigned char word[2])
if (*(unsigned short*)word == 0xffff)
return 1;
return 0;
int main()
int n_ret = 0;
for (int j = 0; j < 10000; ++j)
for (int i = 0; i < 70000; ++i)
n_ret += func2((unsigned char*)&i);
return n_ret;
Debug mode:
func1 func2
real 0m3.621s 0m3.586s
user 0m3.614s 0m3.579s
sys 0m0.001s 0m0.000s
Release mode:
func1 func2
real 0m0.833s 0m0.880s
user 0m0.831s 0m0.878s
sys 0m0.000s 0m0.002s
func1 edition's assembly code:
movl $10000, %esi
xorl %eax, %eax
.p2align 4,,10
.p2align 3
movl $1, %edx
xorl %ecx, %ecx
.p2align 4,,10
.p2align 3
movl %edx, -24(%rsp)
addl $1, %edx
addl %ecx, %eax
cmpl $70001, %edx
je .L3
xorl %ecx, %ecx
cmpb $-1, -24(%rsp)
jne .L8
xorl %ecx, %ecx
cmpb $-1, -23(%rsp)
sete %cl
jmp .L8
.p2align 4,,10
.p2align 3
subl $1, %esi
jne .L6
func2 edition's assembly code:
movl $10000, %esi
xorl %eax, %eax
.p2align 4,,10
.p2align 3
movl $1, %edx
xorl %ecx, %ecx
jmp .L3
.p2align 4,,10
.p2align 3
movzwl -24(%rsp), %ecx
cmpw $-1, %cx
movl %edx, -24(%rsp)
sete %cl
addl $1, %edx
movzbl %cl, %ecx
addl %ecx, %eax
cmpl $70001, %edx
jne .L7
subl $1, %esi
jne .L4
In GCC 4.6.3 the code is different for the first and second pieces of code, and the runtime for the func1 option is noticeably slower if you run it for long enough. Unfortunately, with your very short runtime, the two appear similar in time.
Increasing the outer loop by a factor of 10 means it takes about 6 seconds for func2, and 10 seconds for func1. This is using gcc -std=c99 -O3 to compile the code.
The main difference, I expect, is from the extra branch introduced with the && statement. And the extra xorl %ecx, %ecx doesn't help much (I get the same, although my code looks subtly different when it comes to label names).
Edit: I did try to come up with a branchless solution using and instead of a branch, but the compiler refuses to inline the function, so it takes 30 seconds instead of 10.
Benchmarks run on:
AMD Phenom(tm) II X4 965
Runs at 3.4 GHz.
Closed 9 years ago.
#include <stdint.h>
#include <iostream>
using namespace std;
uint32_t k[] = {0, 1, 17};
template <typename T>
bool f(T *data, int i) {
return data[0] < (T)(1 << k[i]);
int main() {
uint8_t v = 0;
cout << f(&v, 2) << endl;
cout << (0 < (uint8_t)(1 << 17)) << endl;
return 0;
g++ a.cpp && ./a.out
Why am I getting these results?
It looks like gcc reverses the shift and applies it to the other side, and I guess this is a bug.
In C (instead of C++) the same thing happens, and C translated to asm is easier to read, so I'm using C here; also I reduced the test cases (dropping templates and the k array).
foo() is the original buggy f() function, foo1() is what foo() behaves like with gcc but shouldn't, and bar() shows what foo() should look like apart from the pointer read.
I'm on 64-bit, but 32-bit is the same apart from the parameter handling and finding k.
#include <stdint.h>
#include <stdio.h>
uint32_t k = 17;
char foo(uint8_t *data) {
return *data < (uint8_t)(1<<k);
with gcc -O3 -S: (gcc version 4.7.2 (Debian 4.7.2-5))
movzbl (%rdi), %eax
movl k(%rip), %ecx
shrb %cl, %al
testb %al, %al
sete %al
char foo1(uint8_t *data) {
return (((uint32_t)*data) >> k) < 1;
movzbl (%rdi), %eax
movl k(%rip), %ecx
shrl %cl, %eax
testl %eax, %eax
sete %al
char bar(uint8_t data) {
return data < (uint8_t)(1<<k);
movl k(%rip), %ecx
movl $1, %eax
sall %cl, %eax
cmpb %al, %dil
setb %al
int main() {
uint8_t v = 0;
printf("All should be 0: %i %i %i\n", foo(&v), foo1(&v), bar(v));
return 0;
If your int is 16 bits long, you're running into undefined behavior and either result is "OK".
Shifting N-bit integers by N or more bit positions left or right results in undefined behavior.
Since this happens with 32-bit ints, this is a bug in the compiler.
Here are some more data points:
Basically, it looks like gcc rewrites the following comparisons (even when the -O flag is off and -g is on); each expression below is followed by the form gcc turns it into:
[variable] < (type-cast)(1 << [variable2])
((type-cast)[variable] >> [variable2]) == 0
[variable] >= (type-cast)(1 << [variable2])
((type-cast)[variable] >> [variable2]) != 0
where [variable] needs to be an array access.
I guess the advantage here is that it doesn't have to load the literal 1 into a register, which saves 1 register.
So here are the data points:
changing 1 to a number > 1 forces it to implement the correct version.
changing any of the variables to a literal forces it to implement the correct version
changing [variable] to a non array access forces it to implement the correct version
[variable] > (type-cast)(1 << [variable2]) implements the correct version.
I suspect this is all trying to save a register. When [variable] is an array access, it needs to also keep an index. Someone probably thought this is so clever, until it's wrong.
Using code from the bug report
#include <stdio.h>
int main(void)
int a, s = 8;
unsigned char data[1] = {0};
a = data[0] < (unsigned char) (1 << s);
printf("%d\n", a);
return 0;
compiled with gcc -O2 -S
.globl main
.type main, #function
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
pushl %ecx
subl $8, %esp
pushl $1 ***** seems it already precomputed the result to be 1
pushl $.LC0
pushl $1
call __printf_chk
xorl %eax, %eax
movl -4(%ebp), %ecx
leal -4(%ecx), %esp
compile with just gcc -S
.globl main
.type main, #function
leal 4(%esp), %ecx
andl $-16, %esp
pushl -4(%ecx)
pushl %ebp
movl %esp, %ebp
pushl %ebx
pushl %ecx
subl $16, %esp
movl $8, -12(%ebp)
movb $0, -17(%ebp)
movb -17(%ebp), %dl
movl -12(%ebp), %eax
movb %dl, %bl
movb %al, %cl
shrb %cl, %bl ****** (unsigned char)data[0] >> s => %bl
movb %bl, %al %bl => %al
testb %al, %al %al = 0?
sete %dl
movl $0, %eax
movb %dl, %al
movl %eax, -16(%ebp)
movl $.LC0, %eax
subl $8, %esp
pushl -16(%ebp)
pushl %eax
call printf
addl $16, %esp
movl $0, %eax
leal -8(%ebp), %esp
addl $0, %esp
popl %ecx
popl %ebx
popl %ebp
leal -4(%ecx), %esp
I guess the next step is to dig through gcc's source code.
I'm pretty sure we're talking about undefined behaviour here - converting a "large" integer to a smaller type, when the value doesn't fit in the new type, is undefined as far as I know. 131072 definitely doesn't fit in a uint8_t.
Although looking at the code generated, I'd say that it's probably not quite right, since it does "sete" rather than "setb"??? That does seem very suspicious to me.
If I turn the expression around:
return (T)(1<<k[i]) > data[0];
then it uses a "seta" instruction, which is what I'd expect. I'll do a bit more digging - but something seems a bit wrong.