I would like to know how to implement these lines of code in x86 MASM assembly:
if (x >= 1 && x <= 100) {
printsomething1();
} else if (x >= 101 && x <= 200) {
printsomething2();
} else {
printsomething3();
}
I'd break it into contiguous ranges (assuming x is unsigned), like:
x is 0, do printsomething3()
x is 1 to 100, do printsomething1()
x is 101 to 200, do printsomething2()
x is 201 or higher, do printsomething3()
Then work from lowest to highest, like:
;eax = x;
cmp eax,0
je .printsomething3
cmp eax,100
jbe .printsomething1
cmp eax,200
jbe .printsomething2
jmp .printsomething3
If the only difference is the string they print (and not the code they use to print it) I'd go one step further:
mov esi,something3 ;esi = address of string if x is 0
cmp eax,0
je .print
mov esi,something1 ;esi = address of string if x is 1 to 100
cmp eax,100
jbe .print
mov esi,something2 ;esi = address of string if x is 101 to 200
cmp eax,200
jbe .print
mov esi,something3 ;esi = address of string if x is 201 or higher
jmp .print
If you have access to a decent C compiler, you can compile it into assembly language. For gcc use the -S flag:
gcc test.c -S
This creates the file test.s which contains the assembly language output which can be assembled and linked if needed.
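If you then want an executable, the generated file can be assembled and linked with the same GCC driver:
gcc test.s -o test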
For example, to make your code compile successfully, I rewrote it slightly to this:
#include <stdio.h>
#include <stdlib.h>
void printsomething (int y)
{
printf ("something %d", y);
}
void func (int x)
{
if (x >= 1 && x <= 100)
printsomething(1);
else
if (x >= 101 && x <= 200)
printsomething(2);
else
printsomething(3);
}
int main (int argc, char **argv)
{
int x = 0;
if (argc > 1)
x = atoi (argv [1]);
return 0;
}
It compiles into this assembler:
.file "s.c"
.text
.section .rodata
.LC0:
.string "something %d"
.text
.globl printsomething
.type printsomething, #function
printsomething:
.LFB5:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl %edi, -4(%rbp)
movl -4(%rbp), %eax
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
nop
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE5:
.size printsomething, .-printsomething
.globl func
.type func, #function
func:
.LFB6:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl %edi, -4(%rbp)
cmpl $0, -4(%rbp)
jle .L3
cmpl $100, -4(%rbp)
jg .L3
movl $1, %edi
call printsomething
jmp .L4
.L3:
cmpl $100, -4(%rbp)
jle .L5
cmpl $200, -4(%rbp)
jg .L5
movl $2, %edi
call printsomething
jmp .L4
.L5:
movl $3, %edi
call printsomething
.L4:
nop
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE6:
.size func, .-func
.globl main
.type main, #function
main:
.LFB7:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movl %edi, -20(%rbp)
movq %rsi, -32(%rbp)
movl $0, -4(%rbp)
cmpl $1, -20(%rbp)
jle .L7
movq -32(%rbp), %rax
addq $8, %rax
movq (%rax), %rax
movq %rax, %rdi
call atoi
movl %eax, -4(%rbp)
.L7:
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE7:
.size main, .-main
.ident "GCC: (GNU) 7.3.1 20180712 (Red Hat 7.3.1-6)"
.section .note.GNU-stack,"",#progbits
Examine the func: part of it and you'll see how it sets up the comparisons with 1, 100, 101, etc.
The compare function is a function that takes two arguments a and b and returns an integer describing their order. If a is smaller than b, the result is some negative integer. If a is bigger than b, the result is some positive integer. Otherwise, a and b are equal, and the result is zero.
This function is often used to parameterize sorting and searching algorithms from standard libraries.
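For example, the C standard library's qsort takes exactly such a comparator; this is a minimal sketch (the void-pointer wrapper and names are mine):
#include <stdio.h>
#include <stdlib.h>

/* adapt an int comparator to qsort's (const void *, const void *) signature */
static int compare_int_qsort(const void *pa, const void *pb)
{
    int a = *(const int *)pa;
    int b = *(const int *)pb;
    if (a < b) return -1;
    if (a > b) return 1;
    return 0;
}

int main(void)
{
    int values[] = { 42, -7, 0, 13, -7 };
    qsort(values, 5, sizeof values[0], compare_int_qsort);
    for (int i = 0; i < 5; i++)
        printf("%d ", values[i]);   /* prints: -7 -7 0 13 42 */
    printf("\n");
    return 0;
}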
Implementing the compare function for characters is quite easy; you simply subtract the arguments:
int compare_char(char a, char b)
{
return a - b;
}
This works because the difference between two characters is generally assumed to fit into an integer. (Note that this assumption does not hold for systems where sizeof(char) == sizeof(int).)
This trick cannot work to compare integers, because the difference between two integers generally does not fit into an integer. For example, INT_MAX - (-1) = INT_MIN suggests that INT_MAX is smaller than -1 (technically, the overflow leads to undefined behavior, but let's assume modulo arithmetic).
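To make the failure concrete, here is a small hedged demonstration (the function name is mine; the wraparound it relies on is, strictly speaking, undefined behavior):
#include <limits.h>
#include <stdio.h>

/* the naive subtraction comparator, shown only to illustrate the overflow problem */
static int compare_int_naive(int a, int b)
{
    return a - b;   /* signed overflow: undefined behavior, typically wraps */
}

int main(void)
{
    /* on a typical two's-complement machine this prints a negative number,
       wrongly suggesting that INT_MAX is smaller than -1 */
    printf("%d\n", compare_int_naive(INT_MAX, -1));
    return 0;
}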
So how can we implement the compare function efficiently for integers? Here is my first attempt:
int compare_int(int a, int b)
{
int temp;
int result;
__asm__ __volatile__ (
"cmp %3, %2 \n\t"
"mov $0, %1 \n\t"
"mov $1, %0 \n\t"
"cmovg %0, %1 \n\t"
"mov $-1, %0 \n\t"
"cmovl %0, %1 \n\t"
: "=r"(temp), "=r"(result)
: "r"(a), "r"(b)
: "cc");
return result;
}
Can it be done in less than 6 instructions? Is there a less straightforward way that is more efficient?
This one has no branches, and doesn't suffer from overflow or underflow:
return (a > b) - (a < b);
With gcc -O2 -S, this compiles down to the following six instructions:
xorl %eax, %eax
cmpl %esi, %edi
setl %dl
setg %al
movzbl %dl, %edx
subl %edx, %eax
Here's some code to benchmark various compare implementations:
#include <stdio.h>
#include <stdlib.h>
#define COUNT 1024
#define LOOPS 500
#define COMPARE compare2
#define USE_RAND 1
int arr[COUNT];
int compare1 (int a, int b)
{
if (a < b) return -1;
if (a > b) return 1;
return 0;
}
int compare2 (int a, int b)
{
return (a > b) - (a < b);
}
int compare3 (int a, int b)
{
return (a < b) ? -1 : (a > b);
}
int compare4 (int a, int b)
{
__asm__ __volatile__ (
"sub %1, %0 \n\t"
"jno 1f \n\t"
"cmc \n\t"
"rcr %0 \n\t"
"1: "
: "+r"(a)
: "r"(b)
: "cc");
return a;
}
int main ()
{
for (int i = 0; i < COUNT; i++) {
#if USE_RAND
arr[i] = rand();
#else
for (int b = 0; b < sizeof(arr[i]); b++) {
*((unsigned char *)&arr[i] + b) = rand();
}
#endif
}
int sum = 0;
for (int l = 0; l < LOOPS; l++) {
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < COUNT; j++) {
sum += COMPARE(arr[i], arr[j]);
}
}
}
printf("%d=0\n", sum);
return 0;
}
The results on my 64-bit system, compiled with gcc -std=c99 -O2, for positive integers (USE_RAND=1):
compare1: 0m1.118s
compare2: 0m0.756s
compare3: 0m1.101s
compare4: 0m0.561s
Out of C-only solutions, the one I suggested was the fastest. user315052's solution was slower despite compiling to only 5 instructions. The slowdown is likely because, despite having one less instruction, there is a conditional instruction (cmovge).
Overall, FredOverflow's 4-instruction assembly implementation was the fastest when used with positive integers. However, this benchmark only exercised integers in the range [0, RAND_MAX], so the 4-instruction test is biased: it handles overflow on a separate path, that path is never taken in this test, and the speed may therefore be due to successful branch prediction.
With a full range of integers (USE_RAND=0), the 4-instruction solution is in fact very slow (others are the same):
compare4: 0m1.897s
The following has always proven to be fairly efficient for me:
return (a < b) ? -1 : (a > b);
With gcc -O2 -S, this compiles down to the following five instructions:
xorl %edx, %edx
cmpl %esi, %edi
movl $-1, %eax
setg %dl
cmovge %edx, %eax
As a follow-up to Ambroz Bizjak's excellent companion answer, I was not convinced that his program tested the same assembly code that was posted above. And when I studied the compiler output more closely, I noticed that the compiler was not generating the same instructions as were posted in either of our answers. So I took his test program, hand-modified the assembly output to match what we posted, and compared the resulting times. It seems the two versions perform roughly identically.
./opt_cmp_branchless: 0m1.070s
./opt_cmp_branch: 0m1.037s
I am posting the assembly of each program in full so that others may attempt the same experiment, and confirm or contradict my observation.
The following is the version with the cmovge instruction ((a < b) ? -1 : (a > b)):
.file "cmp.c"
.text
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string "%d=0\n"
.text
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB20:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $arr.2789, %ebx
subq $8, %rsp
.cfi_def_cfa_offset 32
.L9:
leaq 4(%rbx), %rbp
.L10:
call rand
movb %al, (%rbx)
addq $1, %rbx
cmpq %rbx, %rbp
jne .L10
cmpq $arr.2789+4096, %rbp
jne .L9
xorl %r8d, %r8d
xorl %esi, %esi
orl $-1, %edi
.L12:
xorl %ebp, %ebp
.p2align 4,,10
.p2align 3
.L18:
movl arr.2789(%rbp), %ecx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L15:
movl arr.2789(%rax), %edx
xorl %ebx, %ebx
cmpl %ecx, %edx
movl $-1, %edx
setg %bl
cmovge %ebx, %edx
addq $4, %rax
addl %edx, %esi
cmpq $4096, %rax
jne .L15
addq $4, %rbp
cmpq $4096, %rbp
jne .L18
addl $1, %r8d
cmpl $500, %r8d
jne .L12
movl $.LC0, %edi
xorl %eax, %eax
call printf
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE20:
.size main, .-main
.local arr.2789
.comm arr.2789,4096,32
.section .note.GNU-stack,"",#progbits
The version below uses the branchless method ((a > b) - (a < b)):
.file "cmp.c"
.text
.section .rodata.str1.1,"aMS",#progbits,1
.LC0:
.string "%d=0\n"
.text
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB20:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $arr.2789, %ebx
subq $8, %rsp
.cfi_def_cfa_offset 32
.L9:
leaq 4(%rbx), %rbp
.L10:
call rand
movb %al, (%rbx)
addq $1, %rbx
cmpq %rbx, %rbp
jne .L10
cmpq $arr.2789+4096, %rbp
jne .L9
xorl %r8d, %r8d
xorl %esi, %esi
.L19:
movl %ebp, %ebx
xorl %edi, %edi
.p2align 4,,10
.p2align 3
.L24:
movl %ebp, %ecx
xorl %eax, %eax
jmp .L22
.p2align 4,,10
.p2align 3
.L20:
movl arr.2789(%rax), %ecx
.L22:
xorl %edx, %edx
cmpl %ebx, %ecx
setg %cl
setl %dl
movzbl %cl, %ecx
subl %ecx, %edx
addl %edx, %esi
addq $4, %rax
cmpq $4096, %rax
jne .L20
addq $4, %rdi
cmpq $4096, %rdi
je .L21
movl arr.2789(%rdi), %ebx
jmp .L24
.L21:
addl $1, %r8d
cmpl $500, %r8d
jne .L19
movl $.LC0, %edi
xorl %eax, %eax
call printf
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE20:
.size main, .-main
.local arr.2789
.comm arr.2789,4096,32
.section .note.GNU-stack,"",#progbits
Okay, I managed to get it down to four instructions :) The basic idea is as follows:
Half the time, the difference is small enough to fit into an integer; in that case, just return the difference. Otherwise, shift the number one bit to the right. The crucial question is which bit to shift into the MSB then.
Let's look at two extreme examples, using 8 bits instead of 32 bits for the sake of simplicity:
10000000 INT_MIN
01111111 INT_MAX
---------
000000001 difference
00000000 shifted
01111111 INT_MAX
10000000 INT_MIN
---------
111111111 difference
11111111 shifted
Shifting the carry bit in would yield 0 for the first case (although INT_MIN is not equal to INT_MAX) and some negative number for the second case (although INT_MAX is not smaller than INT_MIN).
But if we flip the carry bit before doing the shift, we get sensible numbers:
10000000 INT_MIN
01111111 INT_MAX
---------
000000001 difference
100000001 carry flipped
10000000 shifted
01111111 INT_MAX
10000000 INT_MIN
---------
111111111 difference
011111111 carry flipped
01111111 shifted
I'm sure there's a deep mathematical reason why it makes sense to flip the carry bit, but I don't see it yet.
int compare_int(int a, int b)
{
__asm__ __volatile__ (
"sub %1, %0 \n\t"
"jno 1f \n\t"
"cmc \n\t"
"rcr %0 \n\t"
"1: "
: "+r"(a)
: "r"(b)
: "cc");
return a;
}
I have tested the code with one million random inputs plus every combination of INT_MIN, -INT_MAX, INT_MIN/2, -1, 0, 1, INT_MAX/2, INT_MAX/2+1, INT_MAX. All tests passed. Can you prove me wrong?
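For anyone who wants to repeat the check, here is a hedged sketch of an edge-case test harness (it assumes the compare_int above is defined in the same file, and it only compares signs, since compare_int does not return a normalized -1/0/1):
#include <limits.h>
#include <stdio.h>

int compare_int(int a, int b);   /* the inline-asm version from above, assumed present */

/* reference comparator and sign normalization */
static int ref_cmp(int a, int b) { return (a > b) - (a < b); }
static int sgn(int x)            { return (x > 0) - (x < 0); }

int main(void)
{
    int vals[] = { INT_MIN, -INT_MAX, INT_MIN / 2, -1, 0, 1,
                   INT_MAX / 2, INT_MAX / 2 + 1, INT_MAX };
    int n = sizeof vals / sizeof vals[0];
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            if (sgn(compare_int(vals[i], vals[j])) != ref_cmp(vals[i], vals[j]))
                printf("mismatch for %d vs %d\n", vals[i], vals[j]);
    return 0;
}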
For what it's worth I put together an SSE2 implementation. vec_compare1 uses the same approach as compare2 but requires just three SSE2 arithmetic instructions:
#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>
#define COUNT 1024
#define LOOPS 500
#define COMPARE vec_compare1
#define USE_RAND 1
int arr[COUNT] __attribute__ ((aligned(16)));
typedef __m128i vSInt32;
vSInt32 vec_compare1 (vSInt32 va, vSInt32 vb)
{
vSInt32 vcmp1 = _mm_cmpgt_epi32(va, vb);
vSInt32 vcmp2 = _mm_cmpgt_epi32(vb, va);
return _mm_sub_epi32(vcmp2, vcmp1);
}
int main ()
{
for (int i = 0; i < COUNT; i++) {
#if USE_RAND
arr[i] = rand();
#else
for (int b = 0; b < sizeof(arr[i]); b++) {
*((unsigned char *)&arr[i] + b) = rand();
}
#endif
}
vSInt32 vsum = _mm_set1_epi32(0);
for (int l = 0; l < LOOPS; l++) {
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < COUNT; j+=4) {
vSInt32 v1 = _mm_loadu_si128((vSInt32 const *)&arr[i]);
vSInt32 v2 = _mm_load_si128((vSInt32 const *)&arr[j]);
vSInt32 v = COMPARE(v1, v2);
vsum = _mm_add_epi32(vsum, v);
}
}
}
printf("vsum = %vd\n", vsum);
return 0;
}
Time for this is 0.137s.
Time for compare2 with the same CPU and compiler is 0.674s.
So the SSE2 implementation is around 4x faster, as might be expected (since it's 4-wide SIMD).
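One caveat: the %vd format in the printf above is a vector-printing extension that not every compiler and C library supports. If yours doesn't, a hedged SSE2-only alternative is to sum the four lanes of vsum first (helper name is mine):
#include <emmintrin.h>

/* horizontal sum of the four 32-bit lanes of an __m128i (the vSInt32 above), SSE2 only */
static int hsum_epi32(__m128i v)
{
    v = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)));
    v = _mm_add_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(2, 3, 0, 1)));
    return _mm_cvtsi128_si32(v);
}
The final line of the benchmark could then be printf("vsum = %d\n", hsum_epi32(vsum)); instead of the %vd version.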
This code has no branches and uses 5 instructions. It may outperform other branchless alternatives on recent Intel processors, where cmov* instructions are quite expensive. The disadvantage is the non-symmetrical return value (INT_MIN+1, 0, 1).
int compare_int (int a, int b)
{
int res;
__asm__ __volatile__ (
"xor %0, %0 \n\t"
"cmpl %2, %1 \n\t"
"setl %b0 \n\t"
"rorl $1, %0 \n\t"
"setnz %b0 \n\t"
: "=q"(res)
: "r"(a)
, "r"(b)
: "cc"
);
return res;
}
This variant does not need initialization, so it uses only 4 instructions:
int compare_int (int a, int b)
{
__asm__ __volatile__ (
"subl %1, %0 \n\t"
"setl %b0 \n\t"
"rorl $1, %0 \n\t"
"setnz %b0 \n\t"
: "+q"(a)
: "r"(b)
: "cc"
);
return a;
}
Maybe you can use the following idea (in pseudo-code; I didn't write the asm code because I am not comfortable with the syntax):
Subtract the numbers (result = a - b)
If no overflow, done (jo instruction and branch prediction should work very well here)
If there was overflow, use any robust method (return (a < b) ? -1 : (a > b))
Edit: for additional simplicity: if there was overflow, flip the sign of the result, instead of step 3.
You could consider promoting the integers to 64-bit values.
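A minimal sketch of that last suggestion, assuming long long is wider than int (as it is on the common ILP32/LP64 models; the function name is mine):
int compare_int_wide(int a, int b)
{
    /* the widened difference cannot overflow, so its sign is trustworthy */
    long long d = (long long)a - (long long)b;
    return (d > 0) - (d < 0);
}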
I had thought that one compare must be faster than two. But after my test, I found that in debug mode the short compare is a bit faster, and in release mode the char compare is faster. And I want to know the real reason.
Following is the test code and the test result. I wrote two simple functions, func1() using two char compares and func2() using one short compare. The main function returns the temporary return value so that compiler optimization doesn't throw away my test code. My compiler is GCC 4.7.2; the CPU is an Intel® Xeon® E5-2430 0 @ 2.20GHz (VM).
inline int func1(unsigned char word[2])
{
if (word[0] == 0xff && word[1] == 0xff)
return 1;
return 0;
}
inline int func2(unsigned char word[2])
{
if (*(unsigned short*)word == 0xffff)
return 1;
return 0;
}
int main()
{
int n_ret = 0;
for (int j = 0; j < 10000; ++j)
for (int i = 0; i < 70000; ++i)
n_ret += func2((unsigned char*)&i);
return n_ret;
}
Debug mode:
func1 func2
real 0m3.621s 0m3.586s
user 0m3.614s 0m3.579s
sys 0m0.001s 0m0.000s
Release mode:
func1 func2
real 0m0.833s 0m0.880s
user 0m0.831s 0m0.878s
sys 0m0.000s 0m0.002s
func1 edition's assembly code:
.cfi_startproc
movl $10000, %esi
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L6:
movl $1, %edx
xorl %ecx, %ecx
.p2align 4,,10
.p2align 3
.L8:
movl %edx, -24(%rsp)
addl $1, %edx
addl %ecx, %eax
cmpl $70001, %edx
je .L3
xorl %ecx, %ecx
cmpb $-1, -24(%rsp)
jne .L8
xorl %ecx, %ecx
cmpb $-1, -23(%rsp)
sete %cl
jmp .L8
.p2align 4,,10
.p2align 3
.L3:
subl $1, %esi
jne .L6
rep
ret
.cfi_endproc
func2 edition's assembly code:
.cfi_startproc
movl $10000, %esi
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L4:
movl $1, %edx
xorl %ecx, %ecx
jmp .L3
.p2align 4,,10
.p2align 3
.L7:
movzwl -24(%rsp), %ecx
.L3:
cmpw $-1, %cx
movl %edx, -24(%rsp)
sete %cl
addl $1, %edx
movzbl %cl, %ecx
addl %ecx, %eax
cmpl $70001, %edx
jne .L7
subl $1, %esi
jne .L4
rep
ret
.cfi_endproc
In GCC 4.6.3 the code is different for the first and second pieces of code, and the runtime for the func1 option is noticeably slower if you run it for long enough. Unfortunately, with your very short runtime, the two appear similar in time.
Increasing the outer loop by a factor of 10 means it takes about 6 seconds for func2, and 10 seconds for func1. This is using gcc -std=c99 -O3 to compile the code.
The main difference, I expect, is from the extra branch introduced with the && operator. And the extra xorl %ecx, %ecx doesn't help much (I get the same, although my code looks subtly different when it comes to label names).
Edit: I did try to come up with a branchless solution using a bitwise and instead of a branch, but the compiler refuses to inline the function, so it takes 30 seconds instead of 10.
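For reference, a hedged sketch of that branchless idea (the function name is mine; & evaluates both byte comparisons unconditionally and so avoids the second conditional branch):
/* both comparisons are evaluated, then combined with a bitwise & */
static inline int func1_branchless(unsigned char word[2])
{
    return (word[0] == 0xff) & (word[1] == 0xff);
}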
Benchmarks run on an AMD Phenom(tm) II X4 965 at 3.4 GHz.
There is this code in C++:
#include <iostream>
int main(){
int a = 4;
while(a--){
std::cout << (a + 1) << '\n';
}
return 0;
}
and the corresponding main function in the assembly produced by g++:
.globl main
.type main, #function
main:
.LFB957:
.cfi_startproc
.cfi_personality 0x0,__gxx_personality_v0
pushl %ebp
.cfi_def_cfa_offset 8
movl %esp, %ebp
.cfi_offset 5, -8
.cfi_def_cfa_register 5
andl $-16, %esp
subl $32, %esp
movl $4, 28(%esp) # int a = 4;
jmp .L2
.L3:
movl 28(%esp), %eax # std::cout << (a + 1) << '\n';
addl $1, %eax
movl %eax, 4(%esp)
movl $_ZSt4cout, (%esp)
call _ZNSolsEi
movl $10, 4(%esp)
movl %eax, (%esp)
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_c
.L2:
cmpl $0, 28(%esp)
setne %al
subl $1, 28(%esp) # a = a - 1
testb %al, %al
jne .L3
movl $0, %eax
leave
ret
.cfi_endproc
.LFE957:
.size main, .-main
What are the setne and testb instructions in the following fragment used for?
.L2:
cmpl $0, 28(%esp)
setne %al
subl $1, 28(%esp) # a = a - 1
testb %al, %al
jne .L3
Couldn't it just check in the while loop whether a is not zero and jump?
The while condition is formally the equivalent of:
while ( a -- != 0 )
(Omitting the comparison is a legal obfuscation.)
The compiler is generating code to compare a with 0, save the result in register al, then decrement a, and then test the saved result.
Because a-- means
tmpval=a;
a=a-1;
return tmpval;
so the compiler needs to save the previous value of a.
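At the source level, the generated code corresponds roughly to this hand-expanded loop (a sketch only; the two ostream calls of the original are abbreviated to a printf):
#include <stdio.h>

int main(void)
{
    int a = 4;
    for (;;) {
        int tmpval = a;        /* cmpl $0, 28(%esp) / setne %al : remember whether a was non-zero */
        a = a - 1;             /* subl $1, 28(%esp) */
        if (tmpval == 0)       /* testb %al, %al / jne .L3 */
            break;
        printf("%d\n", a + 1); /* stands in for the ostream calls of the original */
    }
    return 0;
}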
In this program, the body of the while loop is still executed on the iteration where a-- leaves a at 0, so it will print 1.
It's been a long while since I did assembler, but I would assume it's some optimisation to keep the pipelines busy / optimise register use.
After reading a question related to the performance of sin/cos (Why is std::sin() and std::cos() slower than sin() and cos()?), I made some tests with the code from it and found a weird thing: if I call sin/cos with a float value, it is much slower than with a double when compiled with optimization.
#include <cmath>
#include <cstdio>
const int N = 4000;
float cosine[N][N];
float sine[N][N];
int main() {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
float ang = i*j*2*M_PI/N;
cosine[i][j] = cos(ang);
sine[i][j] = sin(ang);
}
}
}
With the above code I get:
With -O0: 2.402s
With -O1: 9.004s
With -O2: 9.013s
With -O3: 9.001s
Now if I change
float ang = i*j*2*M_PI/N;
To
double ang = i*j*2*M_PI/N;
I get:
With -O0: 2.362s
With -O1: 1.188s
With -O2: 1.197s
With -O3: 1.197s
How can the first test be that much faster without optimizations?
I'm using g++ (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2, 64 bits.
EDIT: Changed the title to better describe the problem.
EDIT: Added assembly code
Assembly for first test with O0:
.file "main.cpp"
.globl cosine
.bss
.align 32
.type cosine, #object
.size cosine, 64000000
cosine:
.zero 64000000
.globl sine
.align 32
.type sine, #object
.size sine, 64000000
sine:
.zero 64000000
.text
.globl main
.type main, #function
main:
.LFB87:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
movq %rsp, %rbp
.cfi_offset 6, -16
.cfi_def_cfa_register 6
subq $16, %rsp
movl $0, -4(%rbp)
jmp .L2
.L5:
movl $0, -8(%rbp)
jmp .L3
.L4:
movl -4(%rbp), %eax
imull -8(%rbp), %eax
addl %eax, %eax
cvtsi2sd %eax, %xmm0
movsd .LC0(%rip), %xmm1
mulsd %xmm1, %xmm0
movsd .LC1(%rip), %xmm1
divsd %xmm1, %xmm0
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
movss %xmm0, -12(%rbp)
movss -12(%rbp), %xmm0
cvtps2pd %xmm0, %xmm0
call cos
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
movl -8(%rbp), %eax
cltq
movl -4(%rbp), %edx
movslq %edx, %rdx
imulq $4000, %rdx, %rdx
leaq (%rdx,%rax), %rax
movss %xmm0, cosine(,%rax,4)
movss -12(%rbp), %xmm0
cvtps2pd %xmm0, %xmm0
call sin
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
movl -8(%rbp), %eax
cltq
movl -4(%rbp), %edx
movslq %edx, %rdx
imulq $4000, %rdx, %rdx
leaq (%rdx,%rax), %rax
movss %xmm0, sine(,%rax,4)
addl $1, -8(%rbp)
.L3:
cmpl $3999, -8(%rbp)
setle %al
testb %al, %al
jne .L4
addl $1, -4(%rbp)
.L2:
cmpl $3999, -4(%rbp)
setle %al
testb %al, %al
jne .L5
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE87:
.size main, .-main
.section .rodata
.align 4
.type _ZL1N, #object
.size _ZL1N, 4
_ZL1N:
.long 4000
.align 8
.LC0:
.long 1413754136
.long 1074340347
.align 8
.LC1:
.long 0
.long 1085227008
.ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
.section .note.GNU-stack,"",#progbits
Assembly for first test with O3:
.file "main.cpp"
.text
.p2align 4,,15
.globl main
.type main, #function
main:
.LFB121:
.cfi_startproc
pushq %r15
.cfi_def_cfa_offset 16
xorl %r15d, %r15d
.cfi_offset 15, -16
pushq %r14
.cfi_def_cfa_offset 24
movl $cosine+16000, %r14d
.cfi_offset 14, -24
pushq %r13
.cfi_def_cfa_offset 32
xorl %r13d, %r13d
.cfi_offset 13, -32
pushq %r12
.cfi_def_cfa_offset 40
pushq %rbp
.cfi_def_cfa_offset 48
pushq %rbx
.cfi_def_cfa_offset 56
subq $24, %rsp
.cfi_def_cfa_offset 80
.p2align 4,,10
.p2align 3
.L2:
movslq %r15d, %rbp
.cfi_offset 3, -56
.cfi_offset 6, -48
.cfi_offset 12, -40
movl %r13d, %r12d
movl $0x3f800000, %edx
imulq $16000, %rbp, %rbp
xorl %eax, %eax
leaq cosine(%rbp), %rbx
addq $sine, %rbp
jmp .L5
.p2align 4,,10
.p2align 3
.L3:
movl %r12d, %eax
leaq 8(%rsp), %rsi
leaq 12(%rsp), %rdi
subl %r13d, %eax
cvtsi2sd %eax, %xmm0
mulsd .LC2(%rip), %xmm0
divsd .LC3(%rip), %xmm0
unpcklpd %xmm0, %xmm0
cvtpd2ps %xmm0, %xmm0
call sincosf
movl 8(%rsp), %edx
movl 12(%rsp), %eax
.L5:
movl %edx, (%rbx)
addq $4, %rbx
movl %eax, 0(%rbp)
addl %r13d, %r12d
addq $4, %rbp
cmpq %r14, %rbx
jne .L3
addl $1, %r15d
addl $2, %r13d
leaq 16000(%rbx), %r14
cmpl $4000, %r15d
jne .L2
addq $24, %rsp
.cfi_def_cfa_offset 56
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 48
popq %rbp
.cfi_def_cfa_offset 40
popq %r12
.cfi_def_cfa_offset 32
popq %r13
.cfi_def_cfa_offset 24
popq %r14
.cfi_def_cfa_offset 16
popq %r15
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE121:
.size main, .-main
.globl cosine
.bss
.align 32
.type cosine, #object
.size cosine, 64000000
cosine:
.zero 64000000
.globl sine
.align 32
.type sine, #object
.size sine, 64000000
sine:
.zero 64000000
.section .rodata.cst8,"aM",#progbits,8
.align 8
.LC2:
.long 1413754136
.long 1074340347
.align 8
.LC3:
.long 0
.long 1085227008
.ident "GCC: (Ubuntu/Linaro 4.5.2-8ubuntu4) 4.5.2"
.section .note.GNU-stack,"",#progbits
Here's a possibility:
In C, cos is double precision and cosf is single precision. In C++, std::cos has overloads for both double and single.
You aren't calling std::cos. If <cmath> doesn't also overload ::cos (as far as I know, it is not required to), then you are just calling the C double precision function. If this is the case, then you're suffering the cost of converting between float, double, and back.
Now, some standard libraries implement cos(float x) as (float)cos((double)x), so even if you are calling the float function it might still be doing conversions behind the scenes.
This shouldn't account for a 9x performance difference, though.
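Still, one hedged way to check how much the conversions cost is to call the single-precision math functions explicitly, so the loop never round-trips through double (in C++ you could instead write std::cos/std::sin, which pick the float overloads):
#include <math.h>

#define N 4000
float cosine[N][N];
float sine[N][N];

int main(void) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float ang = (float)(i * j * 2 * M_PI / N);
            cosine[i][j] = cosf(ang);   /* single-precision cosine, no conversion to double */
            sine[i][j]   = sinf(ang);
        }
    }
    return 0;
}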
AFAIK it's because computers work at double precision natively. Using float requires conversions.