I am trying to run this code but I am getting:
error: expected instruction opcode
label_3:
this is the relevant part of the code:
define void #main(){
%r1 = alloca [50 x i32]
%r7 = alloca i32
store i32 0 , i32* %r7
label_3:
%r9 = load i32 , i32* %r7
%r8 = getelementptr [258 x i32], [258 x i32]* %r6 , i32 0 , i32 %r9
store i32 0 , i32* %r8
%r10 = add i32 1 , %r9
store i32 %r10 , i32* %r7
%r11 = icmp eq i32 256 , i32 %r10
br i1 %r11 , label %label_4 , label %label_3
label_4:
.....
Thanks in advance!
I solved the problem. Every basic block must end with a "terminator" instruction, so the block that precedes the loop has to be closed explicitly before label_3: begins. I fixed it by adding br label %label_3 on the line before label_3:
for more details read this:
https://zanopia.wordpress.com/2010/09/14/understanding-llvm-assembly-with-fractals-part-i/
Related
I've been playing around with compilers and have been working on my own toy C compiler. Currently I'm attempting to target LLVM IR, but I'm having trouble wrapping my head around the syntax.
My current issue: why is this valid IR syntax:
; First example from the question, presented there as IR that is accepted.
; Every labelled block ends with exactly one terminator (br or ret), so the
; unnamed values run consecutively from %1 to %11 with no gaps.
; NOTE(review): '#main' stands where LLVM IR expects '@main' — likely an
; extraction artifact; confirm against the original post.
define i32 #main() {
%1 = alloca i32, align 4
%2 = add i32 0, 0
store i32 %2, i32* %1, align 4 ; slot %1 starts at 0
%3 = alloca i32, align 4
%4 = add i32 0, 1
store i32 %4, i32* %3, align 4 ; slot %3 starts at 1
%5 = load i32, i32* %1, align 4
%6 = icmp ne i32 %5, 0
br i1 %6, label %true0, label %else0 ; terminator closing the entry block
true0: ; preds %0
%7 = add i32 0, 1
store i32 %7, i32* %3, align 4
br label %end0
else0: ; preds %0
%8 = load i32, i32* %3, align 4
%9 = icmp ne i32 %8, 0
br i1 %9, label %true1, label %end1
true1: ; preds %else0
%10 = add i32 0, 2
store i32 %10, i32* %3, align 4
br label %end1
end1: ; preds %true1, %else0
br label %end0
end0: ; preds %true0, %end1
%11 = load i32, i32* %3, align 4 ; the function returns whatever slot %3 holds
ret i32 %11
}
but this is not:
; Second example from the question, the one the tool rejects.
; In %true0 and %else0 a 'ret' is followed by a 'br'. The 'ret' already ends
; the block, so the 'br' that follows sits in a new block that has no label.
; NOTE(review): '#main' stands where LLVM IR expects '@main' — likely an
; extraction artifact; confirm against the original post.
define i32 #main() {
%1 = alloca i32, align 4
%2 = add i32 0, 0
store i32 %2, i32* %1, align 4 ; variable a
%3 = load i32, i32* %1, align 4
%4 = icmp ne i32 %3, 0
br i1 %4, label %true0, label %else0
true0: ; preds %0
%5 = add i32 0, 1
ret i32 %5 ; terminator: %true0 ends here
br label %end0 ; starts an unlabelled block, which takes the next number (%6)
else0: ; preds %0
%6 = add i32 0, 2 ; reported error site: %6 is already taken, so '%7' is expected
ret i32 %6
br label %end0 ; again an unlabelled block following a terminator
end0: ; reached only from the two unlabelled 'br' blocks; %true0 and %else0 end at their 'ret'
ret i32 0
}
I get the error:
llc-6.0: test2.ll:13:1: error: instruction expected to be numbered '%7'
%6 = add i32 0, 2
^
I don't understand why that block needs to be %7, given the previously used number was %6. Compare the %else0 label of the first example, that's very similar syntax and works fine.
And yes, my compiler needs a lot of optimization, but I'm not finished yet :)
Your code is invalid because there is actually another basic block that you did not label:
true0: ; preds %0
%5 = add i32 0, 1
ret i32 %5
hidden_bb: ; without a label, this block is named %6 by default
br label %end0
else0: ; preds %0
If it has a label then the error will be gone. Note that every terminator instruction, such as br or ret, ends the current basic block, so whatever follows it starts a new basic block of its own.
Please consider following code:
/* Stores i into tab[i] for each i in [0, len). The function is declared to
   return float, yet control reaches the closing brace without a return. */
float test(int len, int* tab)
{
for(int i = 0; i<len; i++)
tab[i] = i;
}
Obviously the return statement is missing. In this scenario, both clang and the NDK compiler for ARM generate an infinite loop. The disassembly shows that the compiler emits an unconditional branch instead of a conditional one.
@ Disassembly quoted in the question. r0 is the counter (starts at 0) and
@ r1 is the base address that is stored through.
mov r0, #0
.LBB0_1:
str r0, [r1, r0, lsl #2]        @ store r0 at address r1 + (r0 << 2)
add r0, r0, #1                  @ advance the counter
b .LBB0_1                       @ unconditional: no comparison or exit path
The example with an error can be found here: https://godbolt.org/z/YDSFw-
Please note that the C++ specification says a missing return is undefined behaviour, but as I understand it this applies only to the returned value; it should not affect the preceding instructions.
Am I missing something here? Any thoughts?
No, you can't reason that way with undefined behaviour.
The compiler is free to use undefined behaviour and assumptions around it for optimizations. The compiler is free to assume your code will not contain undefined behaviour.
In this case, the compiler can assume that the code with undefined behaviour won't be reached. As the end of the function contains undefined behaviour, the compiler concludes that the end of the function actually never will be reached, and thus can optimize the loop.
If you remove the -Oz and add -emit-llvm to the compiler explorer command, you'll see what LLVM IR clang produces originally, when not doing optimizations:
https://godbolt.org/z/-dbeNj
; Unoptimized IR quoted in the answer for the C function test(len, tab).
; Stack slots: %3 holds the first parameter, %4 the second, %5 the loop
; counter (initialised to 0). Blocks: 6 = loop test, 10 = loop body,
; 15 = increment, 18 = exit of the loop.
; NOTE(review): '#_Z4testiPi' / '#llvm.trap' stand where LLVM IR expects
; '@...' — likely an extraction artifact; confirm.
define dso_local float #_Z4testiPi(i32 %0, i32* %1) #0 {
%3 = alloca i32, align 4
%4 = alloca i32*, align 4
%5 = alloca i32, align 4
store i32 %0, i32* %3, align 4
store i32* %1, i32** %4, align 4
store i32 0, i32* %5, align 4
br label %6
6: ; preds = %15, %2
%7 = load i32, i32* %5, align 4
%8 = load i32, i32* %3, align 4
%9 = icmp slt i32 %7, %8 ; counter < first parameter (signed)
br i1 %9, label %10, label %18
10: ; preds = %6
%11 = load i32, i32* %5, align 4
%12 = load i32*, i32** %4, align 4
%13 = load i32, i32* %5, align 4
%14 = getelementptr inbounds i32, i32* %12, i32 %13
store i32 %11, i32* %14, align 4 ; write the counter value at index counter
br label %15
15: ; preds = %10
%16 = load i32, i32* %5, align 4
%17 = add nsw i32 %16, 1
store i32 %17, i32* %5, align 4
br label %6
18: ; preds = %6
; No 'ret' on this path: the block traps and is then marked unreachable.
call void #llvm.trap()
unreachable
}
The end of the loop, label 18, contains unreachable. This can be used for further optimizations, getting rid of the branch and comparison at the start of the loop.
Edit:
There's an excellent blog post from John Regehr about how to reason around undefined behaviour in C and C++. It's a bit long but well worth a read.
Let's say I have this function in C/C++:
/* Recursive product: returns 1 when x <= 1, otherwise x * foo(x - 1). */
int foo(int x) {
if (x <= 1) return 1;
return x * foo(x-1);
}
And I compile it with Clang.
Clang generates the following IR:
; Function Attrs: ssp uwtable
define i32 #_Z3fooi(i32 %x) #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
store i32 %x, i32* %2, align 4
%3 = load i32, i32* %2, align 4
%4 = icmp sle i32 %3, 1
br i1 %4, label %5, label %6
; <label>:5 ; preds = %0
store i32 1, i32* %1, align 4
br label %12
; <label>:6 ; preds = %0
%7 = load i32, i32* %2, align 4
%8 = load i32, i32* %2, align 4
%9 = sub nsw i32 %8, 1
%10 = call i32 #_Z3fooi(i32 %9)
%11 = mul nsw i32 %7, %10
store i32 %11, i32* %1, align 4
br label %12
; <label>:12 ; preds = %6, %5
%13 = load i32, i32* %1, align 4
ret i32 %13
}
As you can see, the LLVM passes optimize the code and create a "return register" (where I put the return value) and a "return block" (where the return value is actually returned).
I'm trying to get the same effect, but when I use the SROA pass or the Instruction Combining pass, they turn the exits into a phi instruction:
; Function Attrs: nounwind ssp uwtable
; The questioner's own output for the same function. No stack slots are
; used: %x is read directly and the two results are merged by the phi in
; block 7, which is the only block containing 'ret'.
; NOTE(review): '#__HF3fooTi' stands where LLVM IR expects '@__HF3fooTi' —
; likely an extraction artifact; confirm.
define i32 #__HF3fooTi(i32 %x) #0 {
%1 = icmp sle i32 %x, 1
br i1 %1, label %2, label %3
; <label>:2 ; preds = %0
br label %7
; <label>:3 ; preds = %0
%4 = sub nsw i32 %x, 1
%5 = call i32 #__HF3fooTi(i32 %4) ; recursive call with x - 1
%6 = mul nsw i32 %x, %5
br label %7
; <label>:7 ; preds = %3, %2
%.0 = phi i32 [ 1, %2 ], [ %6, %3 ] ; 1 when arriving from block 2, %6 when arriving from block 3
ret i32 %.0
}
My question is: which solution is faster? And which pass is Clang using to achieve this? (In the Clang source files I found the 2 passes I used, and they give me this different result)
I tried to insert line 35 and line 36 into the code, but an error happened when I tried to use llc to generate an .o file.
35 %12 = ptrtoint i32* %1 to i64
36 call void #__Storemy(i32 10, i64 %12)
37 store i32 %10, i32* %1
38 br label %18
error information:
Instruction does not dominate all uses!
%12 = ptrtoint i32* %1 to i64
call void #__StoreTo(i32 15, i64 %12)
Broken module found, compilation aborted!
0 libLLVM-3.4.so.1 0x00007f6d31fe25d2 llvm::sys::PrintStackTrace(_IO_FILE*) + 34
1 libLLVM-3.4.so.1 0x00007f6d31fe23c4
2 libc.so.6 0x00007f6d30a62d40
3 libc.so.6 0x00007f6d30a62cc9 gsignal + 57
4 libc.so.6 0x00007f6d30a660d8 abort + 328
5 libLLVM-3.4.so.1 0x00007f6d319d2a41
6 libLLVM-3.4.so.1 0x00007f6d319dbb03
7 libLLVM-3.4.so.1 0x00007f6d319b2f77 llvm::FPPassManager::runOnFunction(llvm::Function&) + 471
8 libLLVM-3.4.so.1 0x00007f6d319b2ffb llvm::FPPassManager::runOnModule(llvm::Module&) + 43
9 libLLVM-3.4.so.1 0x00007f6d319b54b5 llvm::legacy::PassManagerImpl::run(llvm::Module&) + 693
10 llc 0x000000000040c0d4
11 llc 0x000000000040b150 main + 368
12 libc.so.6 0x00007f6d30a4dec5 __libc_start_main + 245
13 llc 0x000000000040b1a9
Stack dump:
0. Program arguments: llc -filetype=obj test.bc -o test.o
1. Running pass 'Function Pass Manager' on module 'test.bc'.
2. Running pass 'Module Verifier' on function '#dblfun'
this is the whole IR for this function:
; Full function quoted in the question; the verifier rejects it with
; "Instruction does not dominate all uses!" for %12.
; Stack slots: %1 receives the value that block 18 loads and returns,
; %2 holds %a, %3 holds %x. Control flow: entry -> _then -> 8 -> 18 and
; entry -> _else -> 13 -> 18.
; NOTE(review): '#name' stands where LLVM IR expects '@name' throughout —
; likely an extraction artifact; confirm.
define i32 #dbl(i32* %a, i32 %x) #0 {
call void #__myFuncCall(i32 1, i32 1)
%1 = alloca i32, align 4
%2 = alloca i32*, align 8
%3 = alloca i32, align 4
store i32* %a, i32** %2, align 8
%4 = ptrtoint i32* %3 to i64
call void #__myStore(i32 2, i64 %4)
store i32 %x, i32* %3, align 4
%5 = load i32* %3, align 4
%6 = ptrtoint i32* %3 to i64
call void #__myLoad(i32 3, i64 %6, i32 %5)
call void #__myLoad(i32 4, i64 0, i32 5)
%7 = icmp sgt i32 %5, 5
call void #__myApply(i32 5, i32 14, i1 %7)
br i1 %7, label %_then, label %_else
; <label>:8 ; preds = %_then
%9 = call i32 (i8*, ...)* #printf(i8* getelementptr inbounds ([3 x i8]* #.str, i32 0, i32 0))
call void #__Clear(i32 8)
%10 = load i32* %3, align 4
%11 = ptrtoint i32* %3 to i64
call void #__myLoad(i32 9, i64 %11, i32 %10)
; %12 is defined here, inside block 8, which runs only on the _then path.
%12 = ptrtoint i32* %1 to i64
call void #__StoreTo(i32 10, i64 %12)
store i32 %10, i32* %1
br label %18
_else: ; preds = %0
call void #__test(i32 7, i32 2, i32 0)
br label %13
_then: ; preds = %0
call void #__test(i32 6, i32 1, i32 1)
br label %8
; <label>:13 ; preds = %_else
%14 = call i32 (i8*, ...)* #printf(i8* getelementptr inbounds ([3 x i8]* #.str1, i32 0, i32 0))
call void #__Clear(i32 11)
%15 = load i32* %3, align 4
%16 = ptrtoint i32* %3 to i64
call void #__myLoad(i32 12, i64 0, i32 2)
call void #__myLoad(i32 13, i64 %16, i32 %15)
%17 = mul nsw i32 2, %15
call void #__myApply(i32 14, i32 2, i32 %17)
; Offending use: block 13 is entered from _else and never passes through
; block 8, so the definition of %12 does not dominate this call.
call void #__myStore(i32 15, i64 %12)
store i32 %17, i32* %1
br label %18
; <label>:18 ; preds = %13, %8
%19 = load i32* %1
%20 = ptrtoint i32* %1 to i64
call void #__myLoad(i32 16, i64 %20, i32 %19)
call void #__myReturn(i32 17)
ret i32 %19
}
I can't find any problems with this code. Can anyone give me some suggestions?
The control flow graph of your function looks like this:
entry
/ \
/ \
_then _else
| |
8 13
\ /
\ /
18
%12 is defined in block 8. It's used once immediately afterwards which is fine, and then there is another use in block 13, which you can see from the diagram is not dominated by 8 (actually it's not even reachable from 8).
In this case, you should be able to move the ptrtoint instruction up to the entry block -- that way it'll be accessible in every other block.
I have a C program, "factorial.c", that calculates the factorial of an int. I compile it to human-readable LLVM code, "factorial.ll", and then modify that compiled LLVM code.
The objective is to execute the modified LLVM code and see its output. How can I do this?
It will depend on how your LLVM output is assembled and what libraries it links against. For example, you can execute the factorial.ll shown below with the lli shell command:
$ lli factorial.ll
Factorial of 10 = 3628800
This runs the main function with the JIT and uses the standard printf to write the result to stdout.
; Format string for the result line: 20 visible characters plus "\0A"
; (newline) and the "\00" terminator, 22 bytes in total.
; NOTE(review): '#.str' / '#printf' stand where LLVM IR expects '@.str' /
; '@printf' — likely an extraction artifact; confirm.
#.str = private unnamed_addr constant [22 x i8] c"Factorial of %d = %d\0A\00", align 1
; Variadic printf: declared only, with no body in this listing.
declare i32 #printf(i8*, ...)
; Recursive factorial: yields 1 when %n <= 1, otherwise %n * factorial(%n - 1).
; %n is spilled to the stack slot %n.addr and reloaded wherever it is needed.
; NOTE(review): '#factorial' stands where LLVM IR expects '@factorial' —
; likely an extraction artifact; confirm.
define i32 #factorial(i32 %n) nounwind uwtable {
entry:
%n.addr = alloca i32, align 4
store i32 %n, i32* %n.addr, align 4
%0 = load i32* %n.addr, align 4
%cmp = icmp sle i32 %0, 1
br i1 %cmp, label %cond.true, label %cond.false
cond.true: ; preds = %entry
br label %cond.end
cond.false: ; preds = %entry
%1 = load i32* %n.addr, align 4
%2 = load i32* %n.addr, align 4
%sub = sub nsw i32 %2, 1
%call = call i32 #factorial(i32 %sub) ; recursive call with n - 1
%mul = mul nsw i32 %1, %call
br label %cond.end
cond.end: ; preds = %cond.false, %cond.true
%cond = phi i32 [ 1, %cond.true ], [ %mul, %cond.false ] ; 1 from cond.true, %mul from cond.false
ret i32 %cond
}
; Entry point: calls factorial(10), passes 10 and the result to printf with
; the .str format string, then returns 0. The %retval, %argc.addr and
; %argv.addr slots are written but never read inside this function.
; NOTE(review): '#main', '#factorial', '#printf' and '#.str' stand where
; LLVM IR expects '@...' — likely an extraction artifact; confirm.
define i32 #main(i32 %argc, i8** %argv) nounwind uwtable {
entry:
%retval = alloca i32, align 4
%argc.addr = alloca i32, align 4
%argv.addr = alloca i8**, align 8
store i32 0, i32* %retval
store i32 %argc, i32* %argc.addr, align 4
store i8** %argv, i8*** %argv.addr, align 8
%call = call i32 #factorial(i32 10)
%call1 = call i32 (i8*, ...)* #printf(i8* getelementptr inbounds ([22 x i8]* #.str, i32 0, i32 0), i32 10, i32 %call)
ret i32 0
}