I'm writing a function pass in LLVM that generates an IR file. The problem is that the assembled code does not seem to behave as I expect. Since I'm pretty new to LLVM, I'd like to know whether I have misunderstood the LLVM IR semantics or whether this is incorrect behavior of llc.
The LLVM IR is:
; Prints the pointer argument, reads an i32 through it, and restarts the body
; (via indirectbr) when the byte reached through @dirty reads as 1.
define void @fff(i32*) #0 {
  %2 = alloca i32*, align 8
  %3 = alloca i32, align 4
  ; The pointer stored in @dirty is loaded once, in the entry block.
  %4 = load i8*, i8** @dirty
  br label %5
; <label>:5: ; preds = %1
  ; The incoming argument %0 is stored on every pass through this block,
  ; including passes reached from Restart.
  store i32* %0, i32** %2, align 8
  %6 = load i32*, i32** %2, align 8
  %7 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str.4, i32 0, i32 0), i32* %6)
  %8 = load i32*, i32** %2, align 8
  %9 = load i32, i32* %8, align 4
  %readDirty = load atomic i8, i8* %4 acquire, align 8
  %10 = icmp eq i8 %readDirty, 1
  br i1 %10, label %Restart, label %11, !prof !3
; <label>:11: ; preds = %5
  store i32 %9, i32* %3, align 4
  ret void
Restart: ; preds = %5
  ; EDIT: the bug was here. The destination list must include label %5 as a
  ; possible destination block; it is left empty below, as originally posted.
  indirectbr i8* blockaddress(@fff, %5), []
}
This corresponds (roughly) to the following C code:
// Global flag pointer. NOTE(review): initialised from the integer 1 as posted; presumably a placeholder for a real address — confirm.
char *dirty=1;
// Rough C equivalent of the IR function. ATTR is a placeholder whose definition is not shown here.
void fff(int *head) ATTR{
restart:
printf("head = %p\n", head);
// Read through head; r is not used again in this sketch.
int r = *head;
// Go back to restart while the byte that dirty points at is non-zero.
if(*dirty)
goto restart; //But using indirect branch
}
Next I assemble, link and run using:
llc -filetype=obj simpleOut.ll -o out.o
gcc -o exe out.o
./exe
If I call the function with address 0x7ffeea51d7a8, it prints:
head = 0x7ffeea51d7a8
head = 0x2e889e825bf4005c
Segmentation fault: 11
The x86_64 assembly code is:
;head reside in rcx
100000d60: 55 pushq %rbp
100000d61: 48 89 e5 movq %rsp, %rbp
100000d64: 53 pushq %rbx
100000d65: 48 83 ec 18 subq $24, %rsp
100000d69: 48 89 f9 movq %rdi, %rcx
100000d6c: 48 8d 3d dd 02 00 00 leaq 733(%rip), %rdi
100000d73: ff 17 callq *(%rdi)
100000d75: 48 8b 18 movq (%rax), %rbx
100000d78: 48 8d 3d c0 01 00 00 leaq 448(%rip), %rdi
100000d7f: 48 89 4d f0 movq %rcx, -16(%rbp)
100000d83: 48 8b 75 f0 movq -16(%rbp), %rsi
100000d87: b0 00 movb $0, %al
100000d89: e8 62 01 00 00 callq 354 ;call to printf, corrupt rcx
100000d8e: 48 8b 45 f0 movq -16(%rbp), %rax
100000d92: 8b 00 movl (%rax), %eax
100000d94: 80 3b 01 cmpb $1, (%rbx)
100000d97: 74 0a je 10 <_fff+0x43>
100000d99: 89 45 ec movl %eax, -20(%rbp)
100000d9c: 48 83 c4 18 addq $24, %rsp
100000da0: 5b popq %rbx
100000da1: 5d popq %rbp
100000da2: c3 retq
100000da3: 48 8d 05 ce ff ff ff leaq -50(%rip), %rax
100000daa: ff e0 jmpq *%rax ;jumps to 100000d78
100000dac: 0f 1f 40 00 nopl (%rax)
The problem seems to be that the LLVM statement store i32* %0, i32** %2, align 8 translates to movq %rcx, -16(%rbp) even after the restart, where the register rcx was already corrupted by printf function.
If this seems like a bug I'll file a bug report with LLVM. Just wanted to check that I don't misunderstand the LLVM IR.
llc version is 5.0.0, installed via homebrew. gcc (used for linking) is clang-900.0.39.2.
Thanks
According to the documentation, indirectbr instruction should be supplied with the list of all possible destination blocks. Omitting a BB that is being jumped to produces undefined behavior.
Related
I am trying some cpp binary disassembling. I wrote this utterly simple code:
#include <iostream>
// Writes a newline followed by a running counter to std::cout, one million times.
int main() {
// The outer i is shadowed by the loop variable below and is never read.
int i=0; int i2=0;
// i2 is incremented before each write, so the first value printed is 1.
for(int i=0; i<1000000; i++) {i2++; std::cout << "\n" << i2;}
return 0;
}
I then compiled it with g++ using something like:
g++ .cpp -o .cpp.bin
I then ran a:
objdump -d .cpp.bin
Here's what I extracted:
;1lim.cpp.bin: file format elf64-x86-64
;Disassembly of section .init:
_init:
endbr64
sub $0x8,%rsp
mov 0x2fd9(%rip),%rax
test %rax,%rax
je 1016 <_init+0x16>
call *%rax
add $0x8,%rsp
ret
;Disassembly of section .plt:
.plt:
push 0x2f7a(%rip)
bnd jmp *0x2f7b(%rip)
nopl (%rax)
endbr64
push $0x0
bnd jmp 1020 <_init+0x20>
nop
endbr64
push $0x1
bnd jmp 1020 <_init+0x20>
nop
endbr64
push $0x2
bnd jmp 1020 <_init+0x20>
nop
endbr64
push $0x3
bnd jmp 1020 <_init+0x20>
nop
;Disassembly of section .plt.got:
__cxa_finalize#plt:
endbr64
bnd jmp *0x2f55(%rip)
nopl 0x0(%rax,%rax,1)
;Disassembly of section .plt.sec:
__cxa_atexit#plt:
endbr64
bnd jmp *0x2f25(%rip)
nopl 0x0(%rax,%rax,1)
_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#plt:
endbr64
bnd jmp *0x2f1d(%rip)
nopl 0x0(%rax,%rax,1)
_ZNSt8ios_base4InitC1Ev#plt:
endbr64
bnd jmp *0x2f15(%rip)
nopl 0x0(%rax,%rax,1)
_ZNSolsEi#plt:
endbr64
bnd jmp *0x2f0d(%rip)
nopl 0x0(%rax,%rax,1)
;Disassembly of section .text:
_start:
endbr64
xor %ebp,%ebp
mov %rdx,%r9
pop %rsi
mov %rsp,%rdx
and $0xfffffffffffffff0,%rsp
push %rax
push %rsp
xor %r8d,%r8d
xor %ecx,%ecx
lea 0xca(%rip),%rdi
call *0x2ef3(%rip)
hlt
cs nopw 0x0(%rax,%rax,1)
deregister_tm_clones:
lea 0x2f19(%rip),%rdi
lea 0x2f12(%rip),%rax
cmp %rdi,%rax
je 1118 <deregister_tm_clones+0x28>
mov 0x2ed6(%rip),%rax
test %rax,%rax
je 1118 <deregister_tm_clones+0x28>
jmp *%rax
nopl 0x0(%rax)
ret
nopl 0x0(%rax)
register_tm_clones:
lea 0x2ee9(%rip),%rdi
lea 0x2ee2(%rip),%rsi
sub %rdi,%rsi
mov %rsi,%rax
shr $0x3f,%rsi
sar $0x3,%rax
add %rax,%rsi
sar %rsi
je 1158 <register_tm_clones+0x38>
mov 0x2ea5(%rip),%rax
test %rax,%rax
je 1158 <register_tm_clones+0x38>
jmp *%rax
nopw 0x0(%rax,%rax,1)
ret
nopl 0x0(%rax)
__do_global_dtors_aux:
endbr64
cmpb $0x0,0x2fe5(%rip)
jne 1198 <__do_global_dtors_aux+0x38>
push %rbp
cmpq $0x0,0x2e5a(%rip)
mov %rsp,%rbp
je 1187 <__do_global_dtors_aux+0x27>
mov 0x2e86(%rip),%rdi
call 1070 <__cxa_finalize#plt>
call 10f0 <deregister_tm_clones>
movb $0x1,0x2fbd(%rip)
pop %rbp
ret
nopl (%rax)
ret
nopl 0x0(%rax)
frame_dummy:
endbr64
jmp 1120 <register_tm_clones>
main:
endbr64
push %rbp
mov %rsp,%rbp
sub $0x10,%rsp
movl $0x0,-0x4(%rbp)
movl $0x0,-0xc(%rbp)
movl $0x0,-0x8(%rbp)
jmp 11fd <main+0x54>
addl $0x1,-0xc(%rbp)
lea 0xe2d(%rip),%rax
mov %rax,%rsi
lea 0x2e5f(%rip),%rax
mov %rax,%rdi
call 1090 <_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#plt>
mov %rax,%rdx
mov -0xc(%rbp),%eax
mov %eax,%esi
mov %rdx,%rdi
call 10b0 <_ZNSolsEi#plt>
addl $0x1,-0x8(%rbp)
cmpl $0xf423f,-0x8(%rbp)
jle 11cc <main+0x23>
mov $0x0,%eax
leave
ret
_Z41__static_initialization_and_destruction_0ii:
endbr64
push %rbp
mov %rsp,%rbp
sub $0x10,%rsp
mov %edi,-0x4(%rbp)
mov %esi,-0x8(%rbp)
cmpl $0x1,-0x4(%rbp)
jne 1260 <_Z41__static_initialization_and_destruction_0ii+0x53>
cmpl $0xffff,-0x8(%rbp)
jne 1260 <_Z41__static_initialization_and_destruction_0ii+0x53>
lea 0x2f1c(%rip),%rax
mov %rax,%rdi
call 10a0 <_ZNSt8ios_base4InitC1Ev#plt>
lea 0x2dc4(%rip),%rax
mov %rax,%rdx
lea 0x2f03(%rip),%rax
mov %rax,%rsi
mov 0x2da0(%rip),%rax
mov %rax,%rdi
call 1080 <__cxa_atexit#plt>
nop
leave
ret
_GLOBAL__sub_I_main:
endbr64
push %rbp
mov %rsp,%rbp
mov $0xffff,%esi
mov $0x1,%edi
call 120d <_Z41__static_initialization_and_destruction_0ii>
pop %rbp
ret
;Disassembly of section .fini:
_fini:
endbr64
sub $0x8,%rsp
add $0x8,%rsp
ret
I am now trying to interpret it using the following:
nasm -f elf64 .asm
How can I possibly fix the assembly code in order to try to compile it with NASM? (It's already a slightly modified version of what I got from objdump.)
The answer that I found to this question was the following: instead of running the previously quoted commands, use the following:
gcc -S .cpp
this will produce the following code:
.file "1lim.cpp"
.text
.local _ZStL8__ioinit
.comm _ZStL8__ioinit,1,1
.section .rodata
.LC0:
.string "\n"
.text
.globl main
.type main, #function
main:
.LFB1731:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl $0, -4(%rbp)
movl $0, -12(%rbp)
movl $0, -8(%rbp)
jmp .L2
.L3:
addl $1, -12(%rbp)
leaq .LC0(%rip), %rax
movq %rax, %rsi
leaq _ZSt4cout(%rip), %rax
movq %rax, %rdi
call _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc#PLT
movq %rax, %rdx
movl -12(%rbp), %eax
movl %eax, %esi
movq %rdx, %rdi
call _ZNSolsEi#PLT
addl $1, -8(%rbp)
.L2:
cmpl $999999, -8(%rbp)
jle .L3
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE1731:
.size main, .-main
.type _Z41__static_initialization_and_destruction_0ii, #function
_Z41__static_initialization_and_destruction_0ii:
.LFB2229:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movl %edi, -4(%rbp)
movl %esi, -8(%rbp)
cmpl $1, -4(%rbp)
jne .L7
cmpl $65535, -8(%rbp)
jne .L7
leaq _ZStL8__ioinit(%rip), %rax
movq %rax, %rdi
call _ZNSt8ios_base4InitC1Ev#PLT
leaq __dso_handle(%rip), %rax
movq %rax, %rdx
leaq _ZStL8__ioinit(%rip), %rax
movq %rax, %rsi
movq _ZNSt8ios_base4InitD1Ev#GOTPCREL(%rip), %rax
movq %rax, %rdi
call __cxa_atexit#PLT
.L7:
nop
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2229:
.size _Z41__static_initialization_and_destruction_0ii, .-_Z41__static_initialization_and_destruction_0ii
.type _GLOBAL__sub_I_main, #function
_GLOBAL__sub_I_main:
.LFB2230:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
movl $65535, %esi
movl $1, %edi
call _Z41__static_initialization_and_destruction_0ii
popq %rbp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE2230:
.size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
.section .init_array,"aw"
.align 8
.quad _GLOBAL__sub_I_main
.hidden __dso_handle
.ident "GCC: (Ubuntu 11.2.0-19ubuntu1) 11.2.0"
.section .note.GNU-stack,"",#progbits
.section .note.gnu.property,"a"
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
0:
.string "GNU"
1:
.align 8
.long 0xc0000002
.long 3f - 2f
2:
.long 0x3
3:
.align 8
4:
To then compile it, just do the following:
g++ .s -o .s.bin
To then run it, it's just a matter of:
./.s.bin
Now my question is:
How can I do the same with executables or binaries?
Apparently, the answer lies in using a tool called objconv, which converts C++ binaries into assembly code correctly. Apparently, it can be installed on any operating system using the Anaconda package and environment manager.
cheers
I wrote a simple C program:
example.c:
// Minimal program: main does nothing and returns 0.
int main() {
return 0;
}
Then converted it to .ll by using
clang -S -emit-llvm example.c
Which generated a example.ll file which looks like this:
; ModuleID = 'example.c'
source_filename = "example.c"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
; Function Attrs: noinline nounwind optnone uwtable
; Allocates one i32 stack slot, stores 0 into it, and returns 0.
define dso_local i32 @main() #0 {
  %1 = alloca i32, align 4
  store i32 0, i32* %1, align 4
  ret i32 0
}
attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 8.0.0-3 (tags/RELEASE_800/final)"}
Then I converted .ll file to .o by using:
llc -filetype=obj example.ll
And then I tried to link that file to make it executable by using:
ld.lld example.o -o example -e main
Which created an executable ./example.
Running example yields a segmentation fault
29185 segmentation fault (core dumped) ./example
objdump of example.o looks like this:
example.o: file format elf64-x86-64
Disassembly of section .text:
0000000000000000 <main>:
0: 55 push %rbp
1: 48 89 e5 mov %rsp,%rbp
4: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp)
b: 31 c0 xor %eax,%eax
d: 5d pop %rbp
e: c3 retq
And the executable looks like this:
example: file format elf64-x86-64
Disassembly of section .text:
0000000000201000 <main>:
201000: 55 push %rbp
201001: 48 89 e5 mov %rsp,%rbp
201004: c7 45 fc 00 00 00 00 movl $0x0,-0x4(%rbp)
20100b: 31 c0 xor %eax,%eax
20100d: 5d pop %rbp
20100e: c3 retq
I also tried linking the object file with ld, but that didn't work either. Am I missing something? How can I make an LLVM object file executable? Please note that none of the commands yielded any errors or warnings.
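Well, this is not how you are supposed to link the executable. For example, the entry point is supposed to be named "_start", and so on, so you're missing a bunch of runtime initialization objects / libraries here. With -e main the process starts directly in main, so the final retq has no valid return address to return to, hence the segmentation fault.
Either link with clang (so, clang example.ll or clang example.o) or pass -v to the clang invocation to obtain the proper linker command line.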
I transform the following llvm-IR
; Function Attrs: noinline norecurse nounwind uwtable
; Branches on a comparison of constants, merges the two branch values with a
; phi in %if.end, and returns 0.
define i32 @main() #0{
entry:
  %sub = sub nsw i32 5, 3
  %cmp = icmp slt i32 %sub, 3
  br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
  %mul = mul nsw i32 %sub, 2
  br label %if.end
if.else: ; preds = %entry
  %sub1 = sub nsw i32 %sub, 3
  br label %if.end
if.end: ; preds = %if.else, %if.then
  ; %y.0 takes %mul when control arrives from %if.then and %sub1 when it
  ; arrives from %if.else.
  %y.0 = phi i32 [ %mul, %if.then ], [ %sub1, %if.else ]
  ; %sub2 and %add are not used by any later instruction; the return value is
  ; the constant 0.
  %sub2 = sub nsw i32 %sub, %y.0
  %add = add nsw i32 %sub, %y.0
  ret i32 0
}
to assembly code for x86_64-unknown-linux-gnu
using llc sample.ll
generated assembly code:
.text
.file "phi.cpp"
.globl main # -- Begin function main
.p2align 4, 0x90
.type main,#function
main: # #main
.cfi_startproc
# BB#0: # %entry
pushq %rbp
.Lcfi0:
.cfi_def_cfa_offset 16
.Lcfi1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Lcfi2:
.cfi_def_cfa_register %rbp
xorl %eax, %eax
testb %al, %al
xorl %eax, %eax
popq %rbp
retq
.Lfunc_end0:
.size main, .Lfunc_end0-main
.cfi_endproc
# -- End function
In the code above, %rbp is the base pointer, which points to the base of the current stack frame, and %rsp is the stack pointer, which points to the top of the current stack frame. Operands for arithmetic operations are stored in %eax and %al, but I can't find the instruction where a value is loaded into the %eax and %al registers.
I also want to know:
How does llc handle phi nodes at the assembly level?
llc defaults to -O2, and your code starts with a constant expression, sub nsw i32 5, 3. Thus your function does, basically, nothing, and the only thing LLVM has to keep is zeroing EAX.
If you run llc -O0 your.ll, you'll get much more verbose code that performs spills to the stack and register loads.
BTW, there is a pair of passes called mem2reg and reg2mem that convert code to and from SSA form. Specifically, these passes would convert phi nodes to branches and introduce explicit stores and loads in the IR.
Consider the following code
#include <stdio.h>
#include <string.h>
// Demonstrates writing to a const object through a pointer that was round-tripped via long.
main()
{
const int a = 2;
// The address of a is stored in a long, then converted back to a non-const int pointer.
long p = (long)&a;
int *c = (int *)p;
// Writes through c to the object declared const above; the answers below explain that this is undefined behaviour.
*c =3;
printf("%d", a);
}
This code can change the value of a in C but not in C++. I understand that C++ is applying optimization and replacing instances of a with 2. So was this a bug fix in C++ or was the bug fixed by chance due to optimization?
It's undefined behavior to modify a const value no matter directly or indirectly. This may compile in C and may even run without problem on your machine, but it's still undefined behavior.
The difference between C and C++ on this is: with const int a = 2, C++ treats a as a constant expression, for instance, you can use a as array dimension:
int n[a]; //fine in C++
But in C, a is not a constant expression, with the same code:
int n[a]; //VLA in C99
Here n is not a fixed-sized array, but a variable length array.
This is not a C vs C++ issue. By modifying a const value (as well as by double-casting a pointer via a long), you enter the realm of undefined behaviour in both languages. Therefore the difference is simply a matter of how the undefined behaviour chooses to manifest itself.
You are casting away the constness out of &a and modifying the pointed value, which is undefined behavior both in C and in C++ (the trip through long just adds some more gratuitous UB). In C++ your compiler happens to optimize more aggressively the constant, but the point of the situation is unchanged.
Your code generates undefined behavior on C++ since you're accessing memory you shouldn't
#include <stdio.h>
#include <string.h>

/*
 * Demonstration only: prints the sizes of long and void*, then writes to a
 * const object through a pointer that was round-tripped via long. The write
 * is kept on purpose; it is the undefined behaviour being illustrated.
 */
int main(void)
{
    const int a = 2;
    /* sizeof yields size_t, so the matching conversion is %zu. The two sizes
     * differ on LLP64 targets such as 64-bit Windows (4 != 8); there the
     * cast to long below cannot hold a full pointer. */
    printf("%zu != %zu !!", sizeof(long), sizeof(void*));
    long p = (long)&a;
    int *c = (int *)p;
    *c =3; /* modifies an object defined const: undefined behaviour */
    printf("%d", a);
    return 0;
}
and even if it works on a 32 bit system modifying const memory by casting away the constness is undefined behavior in both languages.
Following is the assembly code generated by g++. The compiler statically use "$2" instead of "a", but in case of gcc it doesn't perform any static optimization. I guess there shouldn't be any undefined behaviour.
.Ltext0:
.section .rodata
.LC0:
0000 256400 .string "%d"
.text
.globl main
main:
.LFB0:
.cfi_startproc
.cfi_personality 0x3,__gxx_personality_v0
.cfi_lsda 0x3,.LLSDA0
0000 55 pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
0001 4889E5 movq %rsp, %rbp
.cfi_def_cfa_register 6
0004 4883EC20 subq $32, %rsp
.LBB2:
0008 C745EC02 movl $2, -20(%rbp)
000000
000f 488D45EC leaq -20(%rbp), %rax
0013 488945F0 movq %rax, -16(%rbp)
0017 488B45F0 movq -16(%rbp), %rax
001b 488945F8 movq %rax, -8(%rbp)
001f 488B45F8 movq -8(%rbp), %rax
0023 C7000300 movl $3, (%rax)
0000
0029 488B45F8 movq -8(%rbp), %rax
002d 8B00 movl (%rax), %eax
002f 89C6 movl %eax, %esi
0031 BF000000 movl $.LC0, %edi
00
0036 B8000000 movl $0, %eax
00
.LEHB0:
003b E8000000 call printf
00
0040 BE020000 movl $2, %esi
00
0045 BF000000 movl $.LC0, %edi
00
004a B8000000 movl $0, %eax
00
004f E8000000 call printf
00
.LEHE0:
0054 B8000000 movl $0, %eax
00
0059 EB08 jmp .L5
.L4:
005b 4889C7 movq %rax, %rdi
.LEHB1:
005e E8000000 call _Unwind_Resume
00
.LEHE1:
.L5:
.LBE2:
0063 C9 leave
.cfi_def_cfa 7, 8
0064 C3 ret
.cfi_endproc
.LFE0:
.globl __gxx_personality_v0
.section .gcc_except_table,"a",#progbits
.LLSDA0:
0000 FF .byte 0xff
0001 FF .byte 0xff
0002 01 .byte 0x1
0003 08 .uleb128 .LLSDACSE0-.LLSDACSB0
.LLSDACSB0:
0004 3B .uleb128 .LEHB0-.LFB0
0005 19 .uleb128 .LEHE0-.LEHB0
0006 5B .uleb128 .L4-.LFB0
0007 00 .uleb128 0
0008 5E .uleb128 .LEHB1-.LFB0
0009 05 .uleb128 .LEHE1-.LEHB1
000a 00 .uleb128 0
000b 00 .uleb128 0
.LLSDACSE0:
.text
.Letext0:
Closed. This question does not meet Stack Overflow guidelines. It is not currently accepting answers.
Questions asking us to recommend or find a tool, library or favorite off-site resource are off-topic for Stack Overflow as they tend to attract opinionated answers and spam. Instead, describe the problem and what has been done so far to solve it.
Closed 9 years ago.
Improve this question
Usually I would let the compiler do its magic of optimizing complicated logical expressions; however, in this case the compiler I have to use is not very good at this (basically all it can do is replace things like /64 with bit-shifts and %512 with bitwise-and).
Is there any tool available that can analyze and provide optimized versions of expressions, (i.e. the same way good optimizing compilers do)?
e.g. I would like to optimize the following:
int w = 2 - z/2;
int y0 = y + (((v % 512) / 64) / 4) * 8 + ((v / 512) / mb)*16;
int x0 = x + (((v % 512) / 64) % 4) * 8 * (w - 1) + ((v / 512) % mb)*8 * w;
int i = x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2*z - 3) * (z/2);
Here's a test:
typedef int MyInt; // or unsigned int

/*
 * Evaluates the index expression from the question for the given inputs.
 * Common sub-expressions (z/2, v%512, v/512, (v%512)/64) are named once;
 * the arithmetic and its grouping are the same as in the question.
 * mb is used as a divisor.
 */
MyInt get(MyInt x, MyInt y, MyInt z, MyInt v, MyInt mb)
{
    const MyInt half_z = z / 2;
    const MyInt w = 2 - half_z;

    const MyInt low = v % 512;
    const MyInt high = v / 512;
    const MyInt cell = low / 64;

    const MyInt y0 = y + (cell / 4) * 8 + (high / mb) * 16;
    const MyInt x0 = x + (cell % 4) * 8 * (w - 1) + (high % mb) * 8 * w;

    return x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2 * z - 3) * half_z;
}
I compiled with GCC 4.7.0 with -O3.
With int:
.LFB0:
movl %ecx, %eax
movq %r12, -24(%rsp)
.LCFI0:
movl %edx, %r12d
sarl $31, %eax
shrl $31, %r12d
movq %r13, -16(%rsp)
shrl $23, %eax
addl %edx, %r12d
movq %rbx, -40(%rsp)
leal (%rcx,%rax), %r9d
movl %r12d, %r11d
movq %r14, -8(%rsp)
sarl %r11d
movq %rbp, -32(%rsp)
.LCFI1:
movl %edx, %ebp
andl $511, %r9d
negl %r11d
subl %eax, %r9d
leal 511(%rcx), %eax
testl %ecx, %ecx
leal 2(%r11), %r13d
leal 63(%r9), %ebx
cmovns %ecx, %eax
sarl $9, %eax
movl %r13d, %r14d
xorl $3, %r14d
movl %eax, %edx
testl %r9d, %r9d
cmovns %r9d, %ebx
sarl $31, %edx
addl $1, %r11d
idivl %r8d
movl %ebx, %r10d
sarl $31, %ebx
shrl $30, %ebx
sarl $6, %r10d
addl %ebx, %r10d
andl $3, %r10d
subl %ebx, %r10d
movq -40(%rsp), %rbx
sall $3, %r10d
sall $3, %edx
imull %r11d, %r10d
imull %r13d, %edx
movq -16(%rsp), %r13
addl %edi, %r10d
addl %edx, %r10d
leal 255(%r9), %edx
imull %r10d, %r14d
testl %r9d, %r9d
cmovs %edx, %r9d
sall $4, %eax
sarl %r12d
sarl $8, %r9d
leal (%rsi,%r9,8), %ecx
addl %eax, %ecx
leal -3(%rbp,%rbp), %eax
movq -32(%rsp), %rbp
imull %r8d, %ecx
imull %r12d, %eax
movq -24(%rsp), %r12
sall $4, %ecx
addl %r14d, %ecx
movq -8(%rsp), %r14
leal (%rax,%rcx,2), %eax
ret
With unsigned int:
.LFB0:
movl %ecx, %eax
movq %rbp, -16(%rsp)
movl %edx, %r11d
.LCFI0:
movl %edx, %ebp
shrl $9, %eax
xorl %edx, %edx
divl %r8d
movq %r12, -8(%rsp)
.LCFI1:
movl %ecx, %r12d
shrl %r11d
andl $511, %r12d
movq %rbx, -24(%rsp)
.LCFI2:
movl $2, %r10d
movl %r12d, %r9d
movl $1, %ebx
subl %r11d, %r10d
shrl $6, %r9d
subl %r11d, %ebx
shrl $8, %r12d
andl $3, %r9d
sall $4, %r8d
imull %ebx, %r9d
leal (%r12,%rax,2), %eax
movq -24(%rsp), %rbx
imull %r10d, %edx
xorl $3, %r10d
movq -8(%rsp), %r12
leal (%rsi,%rax,8), %eax
addl %edx, %r9d
leal (%rdi,%r9,8), %edi
imull %eax, %r8d
leal -3(%rbp,%rbp), %eax
movq -16(%rsp), %rbp
imull %r10d, %edi
imull %r11d, %eax
addl %edi, %r8d
leal (%rax,%r8,2), %eax
ret
"Optimizing" further by folding constants manually has (predictably) no further effect.
When I want optimizations, I tend to check what Clang generates as LLVM IR. It's more readable (I find) than pure assembly.
/*
 * Same index expression as in the question, with an explicit range check on w
 * placed before the rest of the computation. Returns 0 when w is outside
 * [0, 2]; otherwise returns the computed index. mb is used as a divisor only
 * after the range check has passed.
 */
int foo(int v, int mb, int x, int y, int z) {
    const int half_z = z / 2;
    const int w = 2 - half_z;

    // When you have specific constraints, tell the optimizer about it !
    if (!(w >= 0 && w <= 2)) {
        return 0;
    }

    const int low = v % 512;
    const int high = v / 512;
    const int cell = low / 64;

    const int y0 = y + (cell / 4) * 8 + (high / mb) * 16;
    const int x0 = x + (cell % 4) * 8 * (w - 1) + (high % mb) * 8 * w;

    return x0 * (w ^ 3) * 2 + y0 * mb * 16 * 2 + (2 * z - 3) * half_z;
}
Is transformed into:
; Optimized IR for foo: two comparisons guard the main computation, and the
; phi in block %31 selects either the computed value or 0.
define i32 @foo(i32 %v, i32 %mb, i32 %x, i32 %y, i32 %z) nounwind uwtable readnone {
  %1 = sdiv i32 %z, 2
  %2 = sub nsw i32 2, %1
  ; %2 is w. %3 tests w < 0; %4 tests z < -1, which is the form the w > 2
  ; check from the C source takes here.
  %3 = icmp slt i32 %2, 0
  %4 = icmp slt i32 %z, -1
  %or.cond = or i1 %3, %4
  br i1 %or.cond, label %31, label %5
; <label>:5 ; preds = %0
  %6 = srem i32 %v, 512
  %7 = sdiv i32 %6, 64
  %8 = sdiv i32 %6, 256
  %9 = shl i32 %8, 3
  %10 = sdiv i32 %v, 512
  %11 = sdiv i32 %10, %mb
  %12 = shl i32 %11, 4
  %13 = add i32 %9, %y
  %14 = add i32 %13, %12
  %15 = srem i32 %7, 4
  %16 = add nsw i32 %2, -1
  %17 = mul i32 %16, %15
  %18 = srem i32 %10, %mb
  %19 = mul i32 %2, %18
  %tmp = add i32 %19, %17
  %tmp2 = shl i32 %tmp, 3
  %20 = add nsw i32 %tmp2, %x
  %21 = shl i32 %2, 1
  %22 = xor i32 %21, 6
  %23 = mul i32 %22, %20
  %24 = shl i32 %mb, 5
  %25 = mul i32 %24, %14
  %26 = shl i32 %z, 1
  %27 = add nsw i32 %26, -3
  %28 = mul nsw i32 %1, %27
  %29 = add i32 %25, %28
  %30 = add i32 %29, %23
  br label %31
; <label>:31 ; preds = %5, %0
  ; Result is %30 when arriving from block %5, or 0 when the guard branched
  ; here directly from the entry block.
  %.0 = phi i32 [ %30, %5 ], [ 0, %0 ]
  ret i32 %.0
}
I do not know whether it is optimal, but it certainly is relatively readable.
It would be great if you could indicate all your constraints on the input (all five of them if necessary) because the optimizer might be able to use them.