Tool for simplifying/optimizing logic? [closed] - c++

Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
Questions asking us to recommend or find a tool, library or favorite off-site resource are off-topic for Stack Overflow as they tend to attract opinionated answers and spam. Instead, describe the problem and what has been done so far to solve it.
Closed 9 years ago.
Usually I would let the compiler do its magic when optimizing complicated logical expressions; however, in this case the compiler I have to use is not very good at this (basically, all it can do is replace things like /64 with bit shifts and %512 with a bitwise AND).
Is there any tool available that can analyze expressions and produce optimized versions of them (i.e. the same way good optimizing compilers do)?
e.g. I would like to optimize the following:
int w = 2 - z/2;
int y0 = y + (((v % 512) / 64) / 4) * 8 + ((v / 512) / mb)*16;
int x0 = x + (((v % 512) / 64) % 4) * 8 * (w - 1) + ((v / 512) % mb)*8 * w;
int i = x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2*z - 3) * (z/2);
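For illustration, here is a hand strength-reduced version of those expressions. This is only a sketch and assumes v and z are non-negative (and mb positive), because signed division and remainder by powers of two map cleanly onto shifts and masks only for non-negative values:
// Hypothetical manual rewrite; valid only for v >= 0, z >= 0, mb > 0.
int w  = 2 - (z >> 1);
int t  = (v >> 6) & 7;            // (v % 512) / 64
int hi = v >> 9;                  // v / 512
int y0 = y + (t >> 2) * 8 + (hi / mb) * 16;
int x0 = x + (t & 3) * 8 * (w - 1) + (hi % mb) * 8 * w;
int i  = x0 * (w ^ 3) * 2 + y0 * mb * 32 + (2*z - 3) * (z >> 1);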

Here's a test:
typedef int MyInt; // or unsigned int
MyInt get(MyInt x, MyInt y, MyInt z, MyInt v, MyInt mb)
{
MyInt w = 2 - z/2;
MyInt y0 = y + (((v % 512) / 64) / 4) * 8 + ((v / 512) / mb)*16;
MyInt x0 = x + (((v % 512) / 64) % 4) * 8 * (w - 1) + ((v / 512) % mb)*8 * w;
MyInt i = x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2*z - 3) * (z/2);
return i;
}
I compiled with GCC 4.7.0 with -O3.
With int:
.LFB0:
movl %ecx, %eax
movq %r12, -24(%rsp)
.LCFI0:
movl %edx, %r12d
sarl $31, %eax
shrl $31, %r12d
movq %r13, -16(%rsp)
shrl $23, %eax
addl %edx, %r12d
movq %rbx, -40(%rsp)
leal (%rcx,%rax), %r9d
movl %r12d, %r11d
movq %r14, -8(%rsp)
sarl %r11d
movq %rbp, -32(%rsp)
.LCFI1:
movl %edx, %ebp
andl $511, %r9d
negl %r11d
subl %eax, %r9d
leal 511(%rcx), %eax
testl %ecx, %ecx
leal 2(%r11), %r13d
leal 63(%r9), %ebx
cmovns %ecx, %eax
sarl $9, %eax
movl %r13d, %r14d
xorl $3, %r14d
movl %eax, %edx
testl %r9d, %r9d
cmovns %r9d, %ebx
sarl $31, %edx
addl $1, %r11d
idivl %r8d
movl %ebx, %r10d
sarl $31, %ebx
shrl $30, %ebx
sarl $6, %r10d
addl %ebx, %r10d
andl $3, %r10d
subl %ebx, %r10d
movq -40(%rsp), %rbx
sall $3, %r10d
sall $3, %edx
imull %r11d, %r10d
imull %r13d, %edx
movq -16(%rsp), %r13
addl %edi, %r10d
addl %edx, %r10d
leal 255(%r9), %edx
imull %r10d, %r14d
testl %r9d, %r9d
cmovs %edx, %r9d
sall $4, %eax
sarl %r12d
sarl $8, %r9d
leal (%rsi,%r9,8), %ecx
addl %eax, %ecx
leal -3(%rbp,%rbp), %eax
movq -32(%rsp), %rbp
imull %r8d, %ecx
imull %r12d, %eax
movq -24(%rsp), %r12
sall $4, %ecx
addl %r14d, %ecx
movq -8(%rsp), %r14
leal (%rax,%rcx,2), %eax
ret
With unsigned int:
.LFB0:
movl %ecx, %eax
movq %rbp, -16(%rsp)
movl %edx, %r11d
.LCFI0:
movl %edx, %ebp
shrl $9, %eax
xorl %edx, %edx
divl %r8d
movq %r12, -8(%rsp)
.LCFI1:
movl %ecx, %r12d
shrl %r11d
andl $511, %r12d
movq %rbx, -24(%rsp)
.LCFI2:
movl $2, %r10d
movl %r12d, %r9d
movl $1, %ebx
subl %r11d, %r10d
shrl $6, %r9d
subl %r11d, %ebx
shrl $8, %r12d
andl $3, %r9d
sall $4, %r8d
imull %ebx, %r9d
leal (%r12,%rax,2), %eax
movq -24(%rsp), %rbx
imull %r10d, %edx
xorl $3, %r10d
movq -8(%rsp), %r12
leal (%rsi,%rax,8), %eax
addl %edx, %r9d
leal (%rdi,%r9,8), %edi
imull %eax, %r8d
leal -3(%rbp,%rbp), %eax
movq -16(%rsp), %rbp
imull %r10d, %edi
imull %r11d, %eax
addl %edi, %r8d
leal (%rax,%r8,2), %eax
ret
"Optimizing" further by folding constants manually has (predictably) no further effect.

When I want optimizations, I tend to check what Clang generates as LLVM IR. It's more readable (I find) than pure assembly.
int foo(int v, int mb, int x, int y, int z) {
int w = 2 - z/2;
// When you have specific constraints, tell the optimizer about it !
if (w < 0 || w > 2) { return 0; }
int y0 = y + (((v % 512) / 64) / 4) * 8 + ((v / 512) / mb)*16;
int x0 = x + (((v % 512) / 64) % 4) * 8 * (w - 1) + ((v / 512) % mb)*8 * w;
int i = x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2*z - 3) * (z/2);
return i;
}
Is transformed into:
define i32 @foo(i32 %v, i32 %mb, i32 %x, i32 %y, i32 %z) nounwind uwtable readnone {
%1 = sdiv i32 %z, 2
%2 = sub nsw i32 2, %1
%3 = icmp slt i32 %2, 0
%4 = icmp slt i32 %z, -1
%or.cond = or i1 %3, %4
br i1 %or.cond, label %31, label %5
; <label>:5 ; preds = %0
%6 = srem i32 %v, 512
%7 = sdiv i32 %6, 64
%8 = sdiv i32 %6, 256
%9 = shl i32 %8, 3
%10 = sdiv i32 %v, 512
%11 = sdiv i32 %10, %mb
%12 = shl i32 %11, 4
%13 = add i32 %9, %y
%14 = add i32 %13, %12
%15 = srem i32 %7, 4
%16 = add nsw i32 %2, -1
%17 = mul i32 %16, %15
%18 = srem i32 %10, %mb
%19 = mul i32 %2, %18
%tmp = add i32 %19, %17
%tmp2 = shl i32 %tmp, 3
%20 = add nsw i32 %tmp2, %x
%21 = shl i32 %2, 1
%22 = xor i32 %21, 6
%23 = mul i32 %22, %20
%24 = shl i32 %mb, 5
%25 = mul i32 %24, %14
%26 = shl i32 %z, 1
%27 = add nsw i32 %26, -3
%28 = mul nsw i32 %1, %27
%29 = add i32 %25, %28
%30 = add i32 %29, %23
br label %31
; <label>:31 ; preds = %5, %0
%.0 = phi i32 [ %30, %5 ], [ 0, %0 ]
ret i32 %.0
}
I do not know whether it is optimal, but it certainly is relatively readable.
It would be great if you could indicate all your constraints on the input (all five of them if necessary) because the optimizer might be able to use them.
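For reference, IR like the listing above can be produced with an invocation along these lines (exact flags can differ between Clang versions, and the file name is just a placeholder):
clang -O3 -S -emit-llvm foo.c -o foo.ll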

Related

Debugging an assembly program with GDB

Here is my question:
I wrote a piece of assembly code. It reads a file, converts the content to uppercase, and writes the output to a new file.
I compile and link the assembly code with:
as -gstabs read-files.s -o read-files.o
ld read-files.o -o read-files
And a test like "./read-files input-file output-file" works well.
But what if I want to debug this piece of code with gdb? I tried:
when I set the breakpoint and the arguments of the target program in gdb with:
(gdb) b *_start+1
(gdb) run test-file TEST-FILE
it ends with a segmentation fault immediately.
Can I really debug this code the way I described above? Thanks.
And the assembly code is here:
.section .data
.equ SYS_OPEN, 5
.equ SYS_WRITE, 4
.equ SYS_READ, 3
.equ SYS_CLOSE, 6
.equ SYS_EXIT, 1
.equ O_RDONLY, 0
.equ O_CREAT_WRONLY_TRUNC, 03101
.equ STDIN, 0
.equ STDOUT, 1
.equ STDERR, 2
.equ LINUX_SYSCALL, 0x80
.equ END_OF_FILE, 0
.equ NUMBER_ARGUMENTS, 2
.section .bss
.equ BUFFER_SIZE, 500
.lcomm BUFFER_DATA, BUFFER_SIZE
.section .text
.equ ST_SIZE_RESERVE, 8
.equ ST_FD_IN, -4
.equ ST_FD_OUT, -8
.equ ST_ARGC, 0
.equ ST_ARGV_0, 4
.equ ST_ARGV_1, 8
.equ ST_ARGV_2, 12
.globl _start
_start:
movl %esp, %ebp
subl $ST_SIZE_RESERVE, %esp
open_files:
open_fd_in:
movl $SYS_OPEN, %eax
movl ST_ARGV_1(%ebp), %ebx
movl $O_RDONLY, %ecx
movl $0666, %edx
int $LINUX_SYSCALL
store_fd_in:
movl %eax, ST_FD_IN(%ebp)
open_fd_out:
movl $SYS_OPEN, %eax
movl ST_ARGV_2(%ebp), %ebx
movl $O_CREAT_WRONLY_TRUNC, %ecx
movl $0666, %edx
int $LINUX_SYSCALL
store_fd_out:
movl %eax, ST_FD_OUT(%ebp)
read_loop_begin:
movl $SYS_READ, %eax
movl ST_FD_IN(%ebp), %ebx
movl $BUFFER_DATA, %ecx
movl $BUFFER_SIZE, %edx
int $LINUX_SYSCALL
cmpl $END_OF_FILE, %eax
jle end_loop
continue_read_loop:
pushl $BUFFER_DATA
pushl %eax
call convert_to_upper
popl %eax
addl $4, %esp
movl %eax, %edx
movl $SYS_WRITE, %eax
movl ST_FD_OUT(%ebp), %ebx
movl $BUFFER_DATA, %ecx
int $LINUX_SYSCALL
jmp read_loop_begin
end_loop:
movl $SYS_CLOSE, %eax
movl ST_FD_OUT(%ebp), %ebx
int $LINUX_SYSCALL
movl $SYS_CLOSE, %eax
movl ST_FD_IN(%ebp), %ebx
int $LINUX_SYSCALL
movl $SYS_EXIT, %eax
movl $0, %ebx
int $LINUX_SYSCALL
.equ LOWERCASE_A, 'a'
.equ LOWERCASE_Z, 'z'
.equ UPPER_CONVERSION, 'A' - 'a'
.equ ST_BUFFER_LEN, 8
.equ ST_BUFFER, 12
convert_to_upper:
pushl %ebp
movl %esp, %ebp
movl ST_BUFFER(%ebp), %eax
movl ST_BUFFER_LEN(%ebp), %ebx
movl $0, %edi
cmpl $0, %ebx
je end_convert_loop
convert_loop:
movb (%eax, %edi, 1), %cl
cmpb $LOWERCASE_A, %cl
jl next_byte
cmpb $LOWERCASE_Z, %cl
jg next_byte
addb $UPPER_CONVERSION, %cl
movb %cl, (%eax, %edi, 1)
next_byte:
incl %edi
cmpl %edi, %ebx
jne convert_loop
end_convert_loop:
movl %ebp, %esp
popl %ebp
ret
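For what it is worth, a minimal GDB session for a program like this could look as follows. This is only a sketch: the 32-bit flags are needed only on a 64-bit host, the file names come from the question, and the breakpoint placement is illustrative rather than a diagnosis of the crash:
as --32 -gstabs read-files.s -o read-files.o
ld -m elf_i386 read-files.o -o read-files
gdb --args ./read-files test-file TEST-FILE
(gdb) break *_start        # simplest: break at the symbol itself rather than *_start+1
(gdb) run
(gdb) stepi                # single-step one instruction at a time
(gdb) info registers eax ebx ecx edx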

mmap Mac: Segmentation fault

The following on my Mac succeeds:
#include <sys/mman.h>   // mmap

int main() {
int* addr = (int*) mmap(0, 100, 1 | 2, 2 | 4096, -1, 0);
*addr = 25;
return 0;
}
However, the code below is identical but fails with a segmentation fault when I try to write to *addr:
#include <sys/mman.h>
#include <sys/syscall.h>   // SYS_mmap
#include <unistd.h>        // syscall

int main() {
int* addr = (int*) syscall(SYS_mmap, 0, 100, 1 | 2, 2 | 4096, -1, 0);
*addr = 25;
return 0;
}
I.e. syscall successfully returns a memory address, but writing to it fails.
I compile it like this:
g++ ./c++/mmap.cc -o ./mmap && ./mmap
If I run both versions with dtruss:
g++ ./c++/mmap.cc -o ./mmap && sudo dtruss ./mmap
then both version succeed and I see identical mmap call for both:
mmap(0x0, 0x64, 0x3, 0x1002, 0xFFFFFFFF, 0x0) = 0xXXXXXXX 0
Why does the syscall version give me segmentation fault, what am I missing?
P.S. If I do something similar on Linux it works fine.
So, as I understand it, the mmap function on Mac does not execute syscall(SYS_mmap, ...). What does it do then? Can anyone please give me some links where I can see the implementation?
EDIT:
It looks like syscall on Mac returns only the first 4 bytes. Is there a 64-bit syscall version?
DISASSEMBLED:
mmap version:
_main:
0000000100000cf0 pushq %rbp
0000000100000cf1 movq %rsp, %rbp
0000000100000cf4 subq $0x30, %rsp
0000000100000cf8 xorl %eax, %eax
0000000100000cfa movl %eax, %ecx
0000000100000cfc movl $0x64, %eax
0000000100000d01 movl %eax, %esi
0000000100000d03 movl $0x3, %edx
0000000100000d08 movl $0x1002, %eax
0000000100000d0d movl $0xffffffff, %r8d
0000000100000d13 movl $0x0, -0x14(%rbp)
0000000100000d1a movq %rcx, %rdi
0000000100000d1d movq %rcx, -0x28(%rbp)
0000000100000d21 movl %eax, %ecx
0000000100000d23 movq -0x28(%rbp), %r9
0000000100000d27 callq 0x100000ed6 ## symbol stub for: _mmap
0000000100000d2c movq 0x2cd(%rip), %rdi ## literal pool symbol address: __ZNSt3__14coutE
0000000100000d33 movq %rax, -0x20(%rbp)
0000000100000d37 movq -0x20(%rbp), %rax
0000000100000d3b movq %rax, %rsi
syscall version:
_main:
0000000100000cf0 pushq %rbp
0000000100000cf1 movq %rsp, %rbp
0000000100000cf4 subq $0x30, %rsp
0000000100000cf8 movl $0xc5, %edi
0000000100000cfd xorl %esi, %esi
0000000100000cff movl $0x64, %edx
0000000100000d04 movl $0x3, %ecx
0000000100000d09 movl $0x1002, %r8d
0000000100000d0f movl $0xffffffff, %r9d
0000000100000d15 movl $0x0, -0x14(%rbp)
0000000100000d1c movl $0x0, (%rsp)
0000000100000d23 movb $0x0, %al
0000000100000d25 callq 0x100000ed6 ## symbol stub for: _syscall
0000000100000d2a movq 0x2cf(%rip), %rdi ## literal pool symbol address: __ZNSt3__14coutE
0000000100000d31 movslq %eax, %r10
0000000100000d34 movq %r10, -0x20(%rbp)
0000000100000d38 movq -0x20(%rbp), %r10
0000000100000d3c movq %r10, %rsi
Apparently Mac does not have a 64-bit syscall function; here is a simple implementation:
#include <sys/types.h>
#define CARRY_FLAG_BIT 1
inline int64_t syscall6(int64_t num, int64_t arg1, int64_t arg2, int64_t arg3, int64_t arg4, int64_t arg5, int64_t arg6) {
int64_t result;
int64_t flags;
__asm__ __volatile__ (
"movq %6, %%r10;\n"
"movq %7, %%r8;\n"
"movq %8, %%r9;\n"
"syscall;\n"
"movq %%r11, %1;\n"
: "=a" (result), "=r" (flags)
: "a" (num), "D" (arg1), "S" (arg2), "d" (arg3), "r" (arg4), "r" (arg5), "r" (arg6)
: "%r10", "%r8", "%r9", "%rcx", "%r11"
);
return (flags & CARRY_FLAG_BIT) ? -result : result;
}
And you use it on Mac by offsetting the system call numbers by 0x2000000:
int* addr = (int*) syscall6(0x2000000 + SYS_mmap, 0, 100, 1 | 2, 2 | 4096, -1, 0);
You can find more here.
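Putting the pieces together, a hedged sketch of the caller side might look like this. It assumes the syscall6 helper above is in scope, uses the standard macOS PROT_*/MAP_* constants instead of the raw numbers from the question, and relies on syscall6 returning a negative value on error:
#include <stdint.h>
#include <sys/mman.h>      // PROT_READ, PROT_WRITE, MAP_PRIVATE, MAP_ANON
#include <sys/syscall.h>   // SYS_mmap
#include <stdio.h>

int main(void) {
    int64_t ret = syscall6(0x2000000 + SYS_mmap, 0, 100,
                           PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
    if (ret < 0) {           // syscall6 negates the result when the carry flag is set
        return 1;
    }
    int *addr = (int *)ret;
    *addr = 25;
    printf("%d\n", *addr);
    return 0;
}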

conversion of size_t difference into int [duplicate]

The compare function is a function that takes two arguments a and b and returns an integer describing their order. If a is smaller than b, the result is some negative integer. If a is bigger than b, the result is some positive integer. Otherwise, a and b are equal, and the result is zero.
This function is often used to parameterize sorting and searching algorithms from standard libraries.
Implementing the compare function for characters is quite easy; you simply subtract the arguments:
int compare_char(char a, char b)
{
return a - b;
}
This works because the difference between two characters is generally assumed to fit into an integer. (Note that this assumption does not hold for systems where sizeof(char) == sizeof(int).)
This trick cannot work to compare integers, because the difference between two integers generally does not fit into an integer. For example, INT_MAX - (-1) = INT_MIN suggests that INT_MAX is smaller than -1 (technically, the overflow leads to undefined behavior, but let's assume modulo arithmetic).
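A quick illustration of that claim (the subtraction is done in unsigned arithmetic so the wrap-around is well defined; converting the result back to int is implementation-defined but yields INT_MIN on typical two's-complement systems):
#include <limits.h>
#include <stdio.h>

int main(void)
{
    int a = INT_MAX, b = -1;
    int diff = (int)((unsigned)a - (unsigned)b);   // wraps to INT_MIN on typical systems
    printf("%d - (%d) wraps to %d\n", a, b, diff);
    return 0;
}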
So how can we implement the compare function efficiently for integers? Here is my first attempt:
int compare_int(int a, int b)
{
int temp;
int result;
__asm__ __volatile__ (
"cmp %3, %2 \n\t"
"mov $0, %1 \n\t"
"mov $1, %0 \n\t"
"cmovg %0, %1 \n\t"
"mov $-1, %0 \n\t"
"cmovl %0, %1 \n\t"
: "=r"(temp), "=r"(result)
: "r"(a), "r"(b)
: "cc");
return result;
}
Can it be done in less than 6 instructions? Is there a less straightforward way that is more efficient?
This one has no branches, and doesn't suffer from overflow or underflow:
return (a > b) - (a < b);
With gcc -O2 -S, this compiles down to the following six instructions:
xorl %eax, %eax
cmpl %esi, %edi
setl %dl
setg %al
movzbl %dl, %edx
subl %edx, %eax
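As a usage note, this is also the form that drops straight into a standard-library comparator, the context mentioned at the top of the question. The wrapper below is my own sketch, not part of any answer:
#include <stdlib.h>

/* qsort comparator built on the branchless idiom (a > b) - (a < b). */
static int cmp_int(const void *pa, const void *pb)
{
    int a = *(const int *)pa;
    int b = *(const int *)pb;
    return (a > b) - (a < b);
}

/* usage: qsort(values, count, sizeof values[0], cmp_int); */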
Here's some code to benchmark various compare implementations:
#include <stdio.h>
#include <stdlib.h>
#define COUNT 1024
#define LOOPS 500
#define COMPARE compare2
#define USE_RAND 1
int arr[COUNT];
int compare1 (int a, int b)
{
if (a < b) return -1;
if (a > b) return 1;
return 0;
}
int compare2 (int a, int b)
{
return (a > b) - (a < b);
}
int compare3 (int a, int b)
{
return (a < b) ? -1 : (a > b);
}
int compare4 (int a, int b)
{
__asm__ __volatile__ (
"sub %1, %0 \n\t"
"jno 1f \n\t"
"cmc \n\t"
"rcr %0 \n\t"
"1: "
: "+r"(a)
: "r"(b)
: "cc");
return a;
}
int main ()
{
for (int i = 0; i < COUNT; i++) {
#if USE_RAND
arr[i] = rand();
#else
for (int b = 0; b < sizeof(arr[i]); b++) {
*((unsigned char *)&arr[i] + b) = rand();
}
#endif
}
int sum = 0;
for (int l = 0; l < LOOPS; l++) {
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < COUNT; j++) {
sum += COMPARE(arr[i], arr[j]);
}
}
}
printf("%d=0\n", sum);
return 0;
}
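The timings below can presumably be reproduced with an invocation along these lines (my assumption, not stated in the answer); switch implementations by editing the COMPARE and USE_RAND defines:
gcc -std=c99 -O2 cmp.c -o cmp
time ./cmp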
The results on my 64-bit system, compiled with gcc -std=c99 -O2, for positive integers (USE_RAND=1):
compare1: 0m1.118s
compare2: 0m0.756s
compare3: 0m1.101s
compare4: 0m0.561s
Out of C-only solutions, the one I suggested was the fastest. user315052's solution was slower despite compiling to only 5 instructions. The slowdown is likely because, despite having one less instruction, there is a conditional instruction (cmovge).
Overall, FredOverflow's 4-instruction assembly implementation was the fastest when used with positive integers. However, this code only benchmarked integers in the range [0, RAND_MAX], so the 4-instruction test is biased, because it handles overflows separately and these don't occur in the test; the speed may be due to successful branch prediction.
With a full range of integers (USE_RAND=0), the 4-instruction solution is in fact very slow (others are the same):
compare4: 0m1.897s
The following has always proven to be fairly efficient for me:
return (a < b) ? -1 : (a > b);
With gcc -O2 -S, this compiles down to the following five instructions:
xorl %edx, %edx
cmpl %esi, %edi
movl $-1, %eax
setg %dl
cmovge %edx, %eax
As a follow-up to Ambroz Bizjak's excellent companion answer, I was not convinced that his program tested the same assembly code that was posted above. And when I studied the compiler output more closely, I noticed that the compiler was not generating the same instructions as were posted in either of our answers. So I took his test program, hand-modified the assembly output to match what we posted, and compared the resulting times. It seems the two versions perform roughly identically.
./opt_cmp_branchless: 0m1.070s
./opt_cmp_branch: 0m1.037s
I am posting the assembly of each program in full so that others may attempt the same experiment, and confirm or contradict my observation.
The following is the version with the cmovge instruction ((a < b) ? -1 : (a > b)):
.file "cmp.c"
.text
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "%d=0\n"
.text
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB20:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $arr.2789, %ebx
subq $8, %rsp
.cfi_def_cfa_offset 32
.L9:
leaq 4(%rbx), %rbp
.L10:
call rand
movb %al, (%rbx)
addq $1, %rbx
cmpq %rbx, %rbp
jne .L10
cmpq $arr.2789+4096, %rbp
jne .L9
xorl %r8d, %r8d
xorl %esi, %esi
orl $-1, %edi
.L12:
xorl %ebp, %ebp
.p2align 4,,10
.p2align 3
.L18:
movl arr.2789(%rbp), %ecx
xorl %eax, %eax
.p2align 4,,10
.p2align 3
.L15:
movl arr.2789(%rax), %edx
xorl %ebx, %ebx
cmpl %ecx, %edx
movl $-1, %edx
setg %bl
cmovge %ebx, %edx
addq $4, %rax
addl %edx, %esi
cmpq $4096, %rax
jne .L15
addq $4, %rbp
cmpq $4096, %rbp
jne .L18
addl $1, %r8d
cmpl $500, %r8d
jne .L12
movl $.LC0, %edi
xorl %eax, %eax
call printf
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE20:
.size main, .-main
.local arr.2789
.comm arr.2789,4096,32
.section .note.GNU-stack,"",@progbits
The version below uses the branchless method ((a > b) - (a < b)):
.file "cmp.c"
.text
.section .rodata.str1.1,"aMS",@progbits,1
.LC0:
.string "%d=0\n"
.text
.p2align 4,,15
.globl main
.type main, @function
main:
.LFB20:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
pushq %rbx
.cfi_def_cfa_offset 24
.cfi_offset 3, -24
movl $arr.2789, %ebx
subq $8, %rsp
.cfi_def_cfa_offset 32
.L9:
leaq 4(%rbx), %rbp
.L10:
call rand
movb %al, (%rbx)
addq $1, %rbx
cmpq %rbx, %rbp
jne .L10
cmpq $arr.2789+4096, %rbp
jne .L9
xorl %r8d, %r8d
xorl %esi, %esi
.L19:
movl %ebp, %ebx
xorl %edi, %edi
.p2align 4,,10
.p2align 3
.L24:
movl %ebp, %ecx
xorl %eax, %eax
jmp .L22
.p2align 4,,10
.p2align 3
.L20:
movl arr.2789(%rax), %ecx
.L22:
xorl %edx, %edx
cmpl %ebx, %ecx
setg %cl
setl %dl
movzbl %cl, %ecx
subl %ecx, %edx
addl %edx, %esi
addq $4, %rax
cmpq $4096, %rax
jne .L20
addq $4, %rdi
cmpq $4096, %rdi
je .L21
movl arr.2789(%rdi), %ebx
jmp .L24
.L21:
addl $1, %r8d
cmpl $500, %r8d
jne .L19
movl $.LC0, %edi
xorl %eax, %eax
call printf
addq $8, %rsp
.cfi_def_cfa_offset 24
xorl %eax, %eax
popq %rbx
.cfi_def_cfa_offset 16
popq %rbp
.cfi_def_cfa_offset 8
ret
.cfi_endproc
.LFE20:
.size main, .-main
.local arr.2789
.comm arr.2789,4096,32
.section .note.GNU-stack,"",@progbits
Okay, I managed to get it down to four instructions :) The basic idea is as follows:
Half the time, the difference is small enough to fit into an integer. In that case, just return the difference. Otherwise, shift the difference one bit to the right. The crucial question is which bit to shift into the MSB.
Let's look at two extreme examples, using 8 bits instead of 32 bits for the sake of simplicity:
10000000 INT_MIN
01111111 INT_MAX
---------
000000001 difference
00000000 shifted
01111111 INT_MAX
10000000 INT_MIN
---------
111111111 difference
11111111 shifted
Shifting the carry bit in would yield 0 for the first case (although INT_MIN is not equal to INT_MAX) and some negative number for the second case (although INT_MAX is not smaller than INT_MIN).
But if we flip the carry bit before doing the shift, we get sensible numbers:
10000000 INT_MIN
01111111 INT_MAX
---------
000000001 difference
100000001 carry flipped
10000000 shifted
01111111 INT_MAX
10000000 INT_MIN
---------
111111111 difference
011111111 carry flipped
01111111 shifted
I'm sure there's a deep mathematical reason why it makes sense to flip the carry bit, but I don't see it yet.
int compare_int(int a, int b)
{
__asm__ __volatile__ (
"sub %1, %0 \n\t"
"jno 1f \n\t"
"cmc \n\t"
"rcr %0 \n\t"
"1: "
: "+r"(a)
: "r"(b)
: "cc");
return a;
}
I have tested the code with one million random inputs plus every combination of INT_MIN, -INT_MAX, INT_MIN/2, -1, 0, 1, INT_MAX/2, INT_MAX/2+1, INT_MAX. All tests passed. Can you prove me wrong?
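For those who want to repeat the experiment, a minimal test harness along those lines might look like this. It is a sketch of my own: it assumes the compare_int routine above is in scope, and it only checks the sign of the result, since the magnitude is not specified:
#include <limits.h>
#include <stdio.h>

static int sign(long long v) { return (v > 0) - (v < 0); }

int main(void)
{
    const int edge[] = { INT_MIN, -INT_MAX, INT_MIN / 2, -1, 0, 1,
                         INT_MAX / 2, INT_MAX / 2 + 1, INT_MAX };
    const int n = (int)(sizeof edge / sizeof edge[0]);
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            if (sign(compare_int(edge[i], edge[j])) !=
                sign((long long)edge[i] - (long long)edge[j])) {
                printf("mismatch: %d vs %d\n", edge[i], edge[j]);
                return 1;
            }
    puts("all edge cases passed");
    return 0;
}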
For what it's worth I put together an SSE2 implementation. vec_compare1 uses the same approach as compare2 but requires just three SSE2 arithmetic instructions:
#include <stdio.h>
#include <stdlib.h>
#include <emmintrin.h>
#define COUNT 1024
#define LOOPS 500
#define COMPARE vec_compare1
#define USE_RAND 1
int arr[COUNT] __attribute__ ((aligned(16)));
typedef __m128i vSInt32;
vSInt32 vec_compare1 (vSInt32 va, vSInt32 vb)
{
vSInt32 vcmp1 = _mm_cmpgt_epi32(va, vb);
vSInt32 vcmp2 = _mm_cmpgt_epi32(vb, va);
return _mm_sub_epi32(vcmp2, vcmp1);
}
int main ()
{
for (int i = 0; i < COUNT; i++) {
#if USE_RAND
arr[i] = rand();
#else
for (int b = 0; b < sizeof(arr[i]); b++) {
*((unsigned char *)&arr[i] + b) = rand();
}
#endif
}
vSInt32 vsum = _mm_set1_epi32(0);
for (int l = 0; l < LOOPS; l++) {
for (int i = 0; i < COUNT; i++) {
for (int j = 0; j < COUNT; j+=4) {
vSInt32 v1 = _mm_loadu_si128((vSInt32 *)&arr[i]);
vSInt32 v2 = _mm_load_si128((vSInt32 *)&arr[j]);
vSInt32 v = COMPARE(v1, v2);
vsum = _mm_add_epi32(vsum, v);
}
}
}
printf("vsum = %vd\n", vsum);
return 0;
}
Time for this is 0.137s.
Time for compare2 with the same CPU and compiler is 0.674s.
So the SSE2 implementation is around 4x faster, as might be expected (since it's 4-wide SIMD).
This code has no branches and uses 5 instructions. It may outperform other branchless alternatives on recent Intel processors, where cmov* instructions are quite expensive. The disadvantage is the non-symmetrical return value (INT_MIN+1, 0, 1).
int compare_int (int a, int b)
{
int res;
__asm__ __volatile__ (
"xor %0, %0 \n\t"
"cmpl %2, %1 \n\t"
"setl %b0 \n\t"
"rorl $1, %0 \n\t"
"setnz %b0 \n\t"
: "=q"(res)
: "r"(a)
, "r"(b)
: "cc"
);
return res;
}
This variant does not need initialization, so it uses only 4 instructions:
int compare_int (int a, int b)
{
__asm__ __volatile__ (
"subl %1, %0 \n\t"
"setl %b0 \n\t"
"rorl $1, %0 \n\t"
"setnz %b0 \n\t"
: "+q"(a)
: "r"(b)
: "cc"
);
return a;
}
Maybe you can use the following idea (in pseudo-code; I didn't write asm code because I am not comfortable with the syntax):
Subtract the numbers (result = a - b)
If no overflow, done (jo instruction and branch prediction should work very well here)
If there was overflow, use any robust method (return (a < b) ? -1 : (a > b))
Edit: for additional simplicity: if there was overflow, flip the sign of the result, instead of step 3.
You could consider promoting the integers to 64-bit values.
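A minimal sketch of that last idea (assuming long long is wider than int, as on typical platforms, so the subtraction cannot overflow):
int compare_int_wide(int a, int b)
{
    long long d = (long long)a - (long long)b;   // exact: both operands fit in 64 bits
    return (d > 0) - (d < 0);
}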

Can Compiler Optimize Loop with Variable Length?

Can the compiler optimize the loops if the upper bounds of the loops (a and b in the following example) are not known at compile time?
Unoptimized:
int* arr = new int[a*b];
for (i = 0; i < a; ++i){
for(j = 0; j < b; ++j){
arr[i*b+j] *= 8;
}
}
//delete arr after done.
More Optimized: (assuming a and b are large...)
int c = a*b;
int* arr = new int[c];
for (i = 0; i < c; ++i){
arr[i] *= 8;
}
//delete arr after done.
If you treat the array as linear space, gcc (and presumably others) will optimise even without knowing the extents at compile time.
This code:
void by8(int* arr, int a, int b)
{
auto extent = a * b;
for (int i = 0; i < extent; ++i)
{
arr[i] *= 8;
}
}
compiles to this (notice how the inner part of the loop is vectorised)
by8(int*, int, int):
imull %esi, %edx
testl %edx, %edx
jle .L23
movq %rdi, %rax
andl $31, %eax
shrq $2, %rax
negq %rax
andl $7, %eax
cmpl %edx, %eax
cmova %edx, %eax
cmpl $8, %edx
jg .L26
movl %edx, %eax
.L3:
sall $3, (%rdi)
cmpl $1, %eax
je .L15
sall $3, 4(%rdi)
cmpl $2, %eax
je .L16
sall $3, 8(%rdi)
cmpl $3, %eax
je .L17
sall $3, 12(%rdi)
cmpl $4, %eax
je .L18
sall $3, 16(%rdi)
cmpl $5, %eax
je .L19
sall $3, 20(%rdi)
cmpl $6, %eax
je .L20
sall $3, 24(%rdi)
cmpl $7, %eax
je .L21
sall $3, 28(%rdi)
movl $8, %ecx
.L5:
cmpl %eax, %edx
je .L27
.L4:
leal -1(%rdx), %r8d
movl %edx, %r9d
movl %eax, %r10d
subl %eax, %r9d
subl %eax, %r8d
leal -8(%r9), %esi
shrl $3, %esi
addl $1, %esi
leal 0(,%rsi,8), %r11d
cmpl $6, %r8d
jbe .L7
leaq (%rdi,%r10,4), %r10
xorl %eax, %eax
xorl %r8d, %r8d
.L9:
vmovdqa (%r10,%rax), %ymm0
addl $1, %r8d
vpslld $3, %ymm0, %ymm0
vmovdqa %ymm0, (%r10,%rax)
addq $32, %rax
cmpl %r8d, %esi
ja .L9
addl %r11d, %ecx
cmpl %r11d, %r9d
je .L22
vzeroupper
.L7:
movslq %ecx, %rax
sall $3, (%rdi,%rax,4)
leal 1(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 2(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 3(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 4(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
sall $3, (%rdi,%rax,4)
leal 5(%rcx), %eax
cmpl %eax, %edx
jle .L23
cltq
addl $6, %ecx
sall $3, (%rdi,%rax,4)
cmpl %ecx, %edx
jle .L28
movslq %ecx, %rcx
sall $3, (%rdi,%rcx,4)
ret
.L22:
vzeroupper
.L23:
ret
.L27:
ret
.L26:
testl %eax, %eax
jne .L3
xorl %ecx, %ecx
jmp .L4
.L28:
ret
.L21:
movl $7, %ecx
jmp .L5
.L15:
movl $1, %ecx
jmp .L5
.L16:
movl $2, %ecx
jmp .L5
.L17:
movl $3, %ecx
jmp .L5
.L18:
movl $4, %ecx
jmp .L5
.L19:
movl $5, %ecx
jmp .L5
.L20:
movl $6, %ecx
jmp .L5
Compiler: gcc 5.4 with command-line options -std=c++14 -O3 -march=native
Yes, it probably can, given that the sizes are constant and do not change inside the loop, as is the case here. Read Optimize "for" loop for more.
FYI, in your first example, this:
arr[j*a+b] *= 8;
should be this:
arr[j*a+i] *= 8;
Modern compilers can definitely change the order of the two loops, to prevent unneeded cache misses, from:
for (i = 0; i < a; ++i){
for(j = 0; j < b; ++j){
arr[j*a+i] *= 8;
}
}
to this:
for(j = 0; j < b; ++j){
for (i = 0; i < a; ++i){
arr[j*a+i] *= 8;
}
}
After these optimizations, the two examples (compared to your manual optimization) shouldn't measurably differ in performance.
If you are using the Visual Studio compiler, you can use the /Qvec-report command-line argument; it will tell you which loops are and are not being vectorized, and give you reason codes explaining why they are not.
Vectorization of loops (unlike unrolling) is where the compiler uses SIMD (SSE, SSE2, AVX) instructions to break the loop into a series of operations that are performed in parallel.
https://msdn.microsoft.com/en-us/library/jj658585.aspx
GCC and Clang may have similar capabilities.
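For example, hedged equivalents on those compilers are their optimization-report options (flag names may vary between versions, and the file name is just a placeholder):
g++ -O3 -fopt-info-vec-missed loop.cpp
    # GCC: report loops that were not vectorized, with the reason
clang++ -O3 -Rpass=loop-vectorize -Rpass-missed=loop-vectorize loop.cpp
    # Clang: emit remarks for vectorized and non-vectorized loops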
You can always unroll a for loop, even if you don't know the number of iterations, with a trick called Duff's device.
Also see the explanation here on Stack Overflow: How does Duff's device work?
You can have an interleaved switch and while loop, and let the while loop process, say, 4 items at once. If you'd like to process 6 items, you can cheat by jumping into the middle of the loop body, so that the first partial pass processes 2 items and the full pass processes 4 more, 2 + 4 = 6 items in total:
int n = 6;
int it = n / 4;
int check = 0;
switch (n % 4) {
case 0: do { check += 1;
case 3: check += 1;
case 2: check += 1;
case 1: check += 1;
} while (it--);
}
printf("processed %i items\n", check);

gcc assembly when passing by reference and by value

I have a simple function that computes a product of
two double arrays:
#include <stdlib.h>
#include <emmintrin.h>
struct S {
double *x;
double *y;
double *z;
};
void f(S& s, size_t n) {
for (int i = 0; i < n; i += 2) {
__m128d xs = _mm_load_pd(&s.x[i]);
__m128d ys = _mm_load_pd(&s.y[i]);
_mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys) );
}
return;
}
int main(void) {
S s;
size_t size = 4;
posix_memalign((void **)&s.x, 16, sizeof(double) * size);
posix_memalign((void **)&s.y, 16, sizeof(double) * size);
posix_memalign((void **)&s.z, 16, sizeof(double) * size);
f(s, size);
return 0;
}
Note that the first argument of function f is passed in by reference.
Let's look at the resulting assembly of f() (I removed some irrelevant
pieces, inserted comments and put some labels):
$ g++ -O3 -S asmtest.cpp
.globl _Z1fR1Sm
_Z1fR1Sm:
xorl %eax, %eax
testq %rsi, %rsi
je .L1
.L5:
movq (%rdi), %r8 # array x (1)
movq 8(%rdi), %rcx # array y (2)
movq 16(%rdi), %rdx # array z (3)
movapd (%r8,%rax,8), %xmm0 # load x[0]
mulpd (%rcx,%rax,8), %xmm0 # multiply x[0]*y[0]
movaps %xmm0, (%rdx,%rax,8) # store to z
addq $2, %rax # and loop
cmpq %rax, %rsi
ja .L5
Notice that the addresses of the arrays x, y, z are loaded into general-purpose
registers on each iteration, see statements (1), (2), (3). Why doesn't gcc move
these instructions outside the loop?
Now make a local copy (not a deep copy) of the structure:
void __attribute__((noinline)) f(S& args, size_t n) {
S s = args;
for (int i = 0; i < n; i += 2) {
__m128d xs = _mm_load_pd(&s.x[i]);
__m128d ys = _mm_load_pd(&s.y[i]);
_mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys) );
}
return;
}
Assembly:
_Z1fR1Sm:
.LFB525:
.cfi_startproc
xorl %eax, %eax
testq %rsi, %rsi
movq (%rdi), %r8 # (1)
movq 8(%rdi), %rcx # (2)
movq 16(%rdi), %rdx # (3)
je .L1
.L5:
movapd (%r8,%rax,8), %xmm0
mulpd (%rcx,%rax,8), %xmm0
movaps %xmm0, (%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rsi
ja .L5
.L1:
rep ret
Notice that unlike in the previous code,
loads (1), (2), (3) are now outside the loop.
I would appreciate an explanation of why these two assembly listings are different. Is memory aliasing relevant here?
Thanks.
$ gcc --version
gcc (Debian 5.2.1-21) 5.2.1 20151003
Yes, gcc is reloading s.x and s.y with each iteration of the loop because gcc does not know if &s.z[i] for some i aliases part of the S object passed by reference to f(S&, size_t).
With gcc 5.2.0, applying __restrict__ to S::z and the s reference parameter to f(), i.e.:
struct S {
double *x;
double *y;
double *__restrict__ z;
};
void f(S&__restrict__ s, size_t n) {
for (int i = 0; i < n; i += 2) {
__m128d xs = _mm_load_pd(&s.x[i]);
__m128d ys = _mm_load_pd(&s.y[i]);
_mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys));
}
return;
}
...causes gcc to generate:
__Z1fR1Sm:
LFB518:
testq %rsi, %rsi
je L1
movq (%rdi), %r8
xorl %eax, %eax
movq 8(%rdi), %rcx
movq 16(%rdi), %rdx
.align 4,0x90
L4:
movapd (%r8,%rax,8), %xmm0
mulpd (%rcx,%rax,8), %xmm0
movaps %xmm0, (%rdx,%rax,8)
addq $2, %rax
cmpq %rax, %rsi
ja L4
L1:
ret
With Apple Clang 700.1.76, only __restrict__ on the s reference is needed:
__Z1fR1Sm: ## #_Z1fR1Sm
.cfi_startproc
## BB#0:
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
testq %rsi, %rsi
je LBB0_3
## BB#1: ## %.lr.ph
movq (%rdi), %rax
movq 8(%rdi), %rcx
movq 16(%rdi), %rdx
xorl %edi, %edi
.align 4, 0x90
LBB0_2: ## =>This Inner Loop Header: Depth=1
movapd (%rax,%rdi,8), %xmm0
mulpd (%rcx,%rdi,8), %xmm0
movapd %xmm0, (%rdx,%rdi,8)
addq $2, %rdi
cmpq %rsi, %rdi
jb LBB0_2
LBB0_3: ## %._crit_edge
popq %rbp
retq
.cfi_endproc