I am new to LLVM. I am compiling a scripting language I wrote into LLVM IR and running it with the JIT.
This is my script code:
for (i = 0; i < 10; ++i) {
if (i == 5) {
print(100);
break;
}
print(i);
}
This is the compiled LLVM IR:
; ModuleID = 'jit'
source_filename = "jit"
declare i32 @print(i32)
define i32 @main() {
entry:
%i = alloca i32, align 4
store i32 0, i32* %i, align 4
br label %loop
loop: ; preds = %loopStep, %entry
%i1 = load i32, i32* %i, align 4
%0 = icmp slt i32 %i1, 10
br i1 %0, label %loopBody, label %loopEnd
loopBody: ; preds = %loop
%i2 = load i32, i32* %i, align 4
%1 = icmp eq i32 %i2, 5
br i1 %1, label %then, label %else
loopStep: ; preds = %merge
%i4 = load i32, i32* %i, align 4
%2 = add i32 %i4, 1
store i32 %2, i32* %i, align 4
br label %loop
loopEnd: ; preds = %then, %loop
ret i32 0
merge: ; preds = %else, %then
%i3 = load i32, i32* %i, align 4
%3 = call i32 @print(i32 %i3)
br label %loopStep
then: ; preds = %loopBody
%4 = call i32 @print(i32 100)
br label %loopEnd
br label %merge
else: ; preds = %loopBody
br label %merge
}
This is the result of running jit:
0
1
2
3
4
100
5
6
7
8
9
I think the correct route when i==5 is %then -> %loopEnd -> ret, but the result of running the JIT is completely unexpected.
I don't know what went wrong, thanks for the help
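I thought LLVM IR was as flexible as assembly, but that is not the case. When generating LLVM IR, a lot of processing needs to be done. For example, there cannot be multiple consecutive brs or multiple consecutive rets: each basic block must contain exactly one terminator instruction, as its last instruction, so the second br emitted in %then (br label %merge right after br label %loopEnd) makes the IR invalid.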
Related
I am new to llvm framework and was able to run a basic pass to iterate over instructions in a simple IR function with only entry basic block, but to expand upon that I got an .ll file from clang for a simple c function ( don't mind the correctness of the function I don't care about it for the sake of learning llvm at least for now ).
// fact.c
// Multiplies t (starting at 1) by every i from 2 up to and including n and
// returns the product; when n < 2 the loop body never runs and 1 is returned.
int fact(int n){
int t =1;
for(int i = 2;i<=n;i++){
t = t*i;
}
return t;
}
I was able to get a fact.ll file for this function, given below, and there are 3 functions in fact.ll which I hand-coded into the IR: foo, add and bar. I attempt to run a simple pass which iterates over each BasicBlock, gathers its instruction opcodes and simply prints them at the end. My issue is that the opt tool is able to do this for the foo, add and bar functions but not for the fact function.
Pass file :
#include "llvm/Transforms/Utils/MyHello.h"
#include <string>
using namespace llvm;
/// Writes the function's name to errs(), then the opcode name of every
/// instruction in the function (one per line, in iteration order) followed by
/// an extra newline. Reports all analyses as preserved.
PreservedAnalyses MyHelloPass::run(Function &F,FunctionAnalysisManager &AM) {
  errs() << F.getName() << "\n";

  // Collect everything first so the opcode list is emitted in a single write.
  std::string OpcodeNames;
  for (const auto &Block : F) {
    for (const auto &Inst : Block) {
      OpcodeNames += Inst.getOpcodeName();
      OpcodeNames += '\n';
    }
  }

  errs() << OpcodeNames << '\n';
  return PreservedAnalyses::all();
}
fact.ll
; ModuleID = 'fact.c'
source_filename = "fact.c"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @fact(i32 noundef %n) #0 {
entry:
%n.addr = alloca i32, align 4
%t = alloca i32, align 4
%i = alloca i32, align 4
store i32 %n, i32* %n.addr, align 4
store i32 1, i32* %t, align 4
store i32 2, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%1 = load i32, i32* %n.addr, align 4
%cmp = icmp sle i32 %0, %1
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%2 = load i32, i32* %t, align 4
%3 = load i32, i32* %i, align 4
%mul = mul nsw i32 %2, %3
store i32 %mul, i32* %t, align 4
br label %for.inc
for.inc: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%inc = add nsw i32 %4, 1
store i32 %inc, i32* %i, align 4
br label %for.cond, !llvm.loop !6
for.end: ; preds = %for.cond
%5 = load i32, i32* %t, align 4
ret i32 %5
}
define i32 @foo(){
%a = add i32 2,3
ret i32 %a
}
define i32 @add(i32 %a,i32 %b){
%c = add i32 %a,%b
%d = add i32 %c,%c
%e = sub i32 %c, %d
%f = mul i32 %d, %e
ret i32 %f
}
define void @bar(){
ret void
}
attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
!llvm.module.flags = !{!0, !1, !2, !3, !4}
!llvm.ident = !{!5}
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{i32 7, !"PIC Level", i32 2}
!2 = !{i32 7, !"PIE Level", i32 2}
!3 = !{i32 7, !"uwtable", i32 2}
!4 = !{i32 7, !"frame-pointer", i32 2}
!5 = !{!"AMD\C2\A0\C2\A0-DCLANG_REPOSITORY_STRING=CLANG: clang version 15.0.0 (CLANG: Jenkins CPUPC_Mirror_To_Staging_Merge-Build#892) (based on LLVM Mirror.Version.14.0.0)"}
!6 = distinct !{!6, !7}
!7 = !{!"llvm.loop.mustprogress"}
run command : opt -disable-output fact.ll -passes="myhello"
Output :
foo
add
ret
add
add
add
sub
mul
ret
bar
ret
See the optnone in:
; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @fact(i32 noundef %n) #0 {
That means that this function is opting out of optimizations, hence your pass will not be run on that function.
You can manually remove the optnone from the definition of #0 at the bottom (note: the ; Function Attrs: ... line is merely a comment, changing it has no effect) or you can build your LLVM IR with "clang -O2". You may want to also add -mllvm -disable-llvm-optzns if you want clang to produce IR that could be optimized but hasn't been run through LLVM passes.
I notice that in LLVM bitcode files, all PHI instructions in the same basic block often have the same set of incoming blocks.
Does anyone know if it is always true for all LLVM bitcode files?
Or is there any optimization pass for doing so?
For example, here is my C code:
// test.c
// Assigns p and q on different control-flow paths and then conditionally
// stores q through p, so both values depend on which path was taken.
int main(){
int **p, *q;
// a, b, c and d are declared without initializers.
int *a, *b, c, d;
p = &a;
if (p) {
// On this path q is assigned only when c is non-zero.
if (c) {
q = &c;
}
}
else{
// On this path both p and q are reassigned.
p = &b;
q = &d;
}
if (d) {
*p = q;
}
}
After being compiled by clang and opt:
clang -Xclang -disable-O0-optnone -c -emit-llvm test.c
opt -mem2reg test.bc -o test.opt.bc
Here is the output test.opt.bc, where all PHI instructions in block 12 have the same incoming blocks 10 and 11:
; Function Attrs: noinline nounwind uwtable
define dso_local i32 @main() #0 {
%1 = alloca i32*, align 8
%2 = alloca i32*, align 8
%3 = alloca i32, align 4
%4 = alloca i32, align 4
%5 = icmp ne i32** %1, null
br i1 %5, label %6, label %11
6: ; preds = %0
%7 = load i32, i32* %3, align 4
%8 = icmp ne i32 %7, 0
br i1 %8, label %9, label %10
9: ; preds = %6
br label %10
10: ; preds = %9, %6
br label %12
11: ; preds = %0
br label %12
12: ; preds = %11, %10
%.01 = phi i32** [ %1, %10 ], [ %2, %11 ]
%.1 = phi i32* [ %3, %10 ], [ %4, %11 ]
%13 = load i32, i32* %4, align 4
%14 = icmp ne i32 %13, 0
br i1 %14, label %15, label %16
15: ; preds = %12
store i32* %.1, i32** %.01, align 8
br label %16
16: ; preds = %15, %12
ret i32 0
}
The answer is yes.
It directly follows from the phi instruction description from the language reference:
After this, the ‘phi’ instruction takes a list of pairs as arguments, with one pair for each predecessor basic block of the current block.
Since all of phi instructions in question belong to the same basic block, these lists of pairs should reference the same set of predecessors.
I've been playing around with compilers and have been working on my own toy C compiler. Currently I'm attempting to target LLVM IR, but I'm having trouble wrapping my head around the syntax.
My current issue: why is this valid IR syntax:
define i32 @main() {
%1 = alloca i32, align 4
%2 = add i32 0, 0
store i32 %2, i32* %1, align 4
%3 = alloca i32, align 4
%4 = add i32 0, 1
store i32 %4, i32* %3, align 4
%5 = load i32, i32* %1, align 4
%6 = icmp ne i32 %5, 0
br i1 %6, label %true0, label %else0
true0: ; preds %0
%7 = add i32 0, 1
store i32 %7, i32* %3, align 4
br label %end0
else0: ; preds %0
%8 = load i32, i32* %3, align 4
%9 = icmp ne i32 %8, 0
br i1 %9, label %true1, label %end1
true1: ; preds %else0
%10 = add i32 0, 2
store i32 %10, i32* %3, align 4
br label %end1
end1: ; preds %true1, %else0
br label %end0
end0: ; preds %true0, %else1
%11 = load i32, i32* %3, align 4
ret i32 %11
}
but this is not:
define i32 @main() {
%1 = alloca i32, align 4
%2 = add i32 0, 0
store i32 %2, i32* %1, align 4 ; variable a
%3 = load i32, i32* %1, align 4
%4 = icmp ne i32 %3, 0
br i1 %4, label %true0, label %else0
true0: ; preds %0
%5 = add i32 0, 1
ret i32 %5
br label %end0
else0: ; preds %0
%6 = add i32 0, 2
ret i32 %6
br label %end0
end0: ; % preds %true0, %else0
ret i32 0
}
I get the error:
llc-6.0: test2.ll:13:1: error: instruction expected to be numbered '%7'
%6 = add i32 0, 2
^
I don't understand why that block needs to be %7, given the previously used number was %6. Compare the %else0 label of the first example, that's very similar syntax and works fine.
And yes, my compiler needs a lot of optimization, but I'm not finished yet :)
Your code is invalid because there is actually another basic block that you did not label:
true0: ; preds %0
%5 = add i32 0, 1
ret i32 %5
hidden_bb: ; this will named as %6 by default
br label %end0
else0: ; preds %0
If it has a label then the error will be gone. Note that every terminator instruction, like br and ret, ends its basic block, so whatever follows it starts a new basic block.
I am trying to perform -O2 optimisation with LLVM IR obtained by calling CLANG API. Unfortunately, optimisation works only with IR created with manual calls. I have the following function:
// Returns y + 1 + 2 when x > 2 and y - 1 + 2 otherwise; each branch applies
// its two constants as separate operations.
int mult_add(int x, int y){
if(x > 2){
return y + 1 + 2;
} else {
return y - 1 + 2;
}
}
And with these calls:
clang -S -emit-llvm main.cpp
opt main.ll -o opt.ll -S -O2
I get the correct result:
define i32 @_Z8mult_addii(i32, i32) local_unnamed_addr #0 {
%3 = icmp sgt i32 %0, 2
%.sink = select i1 %3, i32 3, i32 1
%4 = add nsw i32 %.sink, %1
ret i32 %4
}
Unfortunately, when I do it through the LLVM API with legacy::PassManager and legacy::FunctionPassManager, the optimisation simply does not work and I get long, ugly code:
define i32 @_Z8mult_addii(i32, i32) #0 {
%3 = alloca i32, align 4
%4 = alloca i32, align 4
%5 = alloca i32, align 4
store i32 %0, i32* %4, align 4
store i32 %1, i32* %5, align 4
%6 = load i32, i32* %4, align 4
%7 = icmp sgt i32 %6, 2
br i1 %7, label %8, label %12
; <label>:8: ; preds = %2
%9 = load i32, i32* %5, align 4
%10 = add nsw i32 %9, 1
%11 = add nsw i32 %10, 2
store i32 %11, i32* %3, align 4
br label %16
; <label>:12: ; preds = %2
%13 = load i32, i32* %5, align 4
%14 = sub nsw i32 %13, 1
%15 = add nsw i32 %14, 2
store i32 %15, i32* %3, align 4
br label %16
; <label>:16: ; preds = %12, %8
%17 = load i32, i32* %3, align 4
ret i32 %17
}
Seems like CLANG creates IR in some unoptimisable state? Because running the passes on a manual created IR works fine.
By the way, PMBuilder.populateModulePassManager is called, here is the code:
// Module-level pass manager; this is the one that is run on M2 below.
legacy::PassManager Passes;
// Function-level pass manager; it is passed to AddOptimizationPasses but is
// not run anywhere in this snippet.
legacy::FunctionPassManager FPasses(M2.get());
// Requests OptLevel 2 and SizeLevel 0.
AddOptimizationPasses(Passes, FPasses, &(TheJIT->getTargetMachine()), 2, 0);
// Added after the optimization passes so the module is printed to outs().
Passes.add(createPrintModulePass(outs()));
Passes.run(*M2);
And AddOptimizationPasses is stolen and simplified from opt utility:
/// Populates MPM with the module pipeline that PassManagerBuilder produces for
/// the requested OptLevel / SizeLevel.
///
/// FPM only receives a verifier pass: the populateFunctionPassManager call is
/// left out, so no function-level optimization pipeline is added to it.
/// TM may be null; when it is not, it is given the builder to adjust before
/// the module pipeline is populated.
static void AddOptimizationPasses(legacy::PassManagerBase &MPM,
                                  legacy::FunctionPassManager &FPM,
                                  TargetMachine *TM, unsigned OptLevel,
                                  unsigned SizeLevel) {
  FPM.add(createVerifierPass());

  PassManagerBuilder PMB;
  PMB.SizeLevel = SizeLevel;
  PMB.OptLevel = OptLevel;
  PMB.DisableUnrollLoops = false;
  PMB.DisableUnitAtATime = true;
  PMB.Inliner = createFunctionInliningPass(50);

  if (TM != nullptr) {
    TM->adjustPassManager(PMB);
  }

  PMB.populateModulePassManager(MPM);
}
By the way, initialisation is following:
// Initialisation calls for all targets, their MC components and their asm
// printers (as the function names indicate).
InitializeAllTargets();
InitializeAllTargetMCs();
InitializeAllAsmPrinters();
Unfortunately, it does not work.
Did you forget to populate the pass manager?
// Sketch: configure a PassManagerBuilder for OptLevel 2, let it populate the
// module pass manager PM, then run PM on the module M. The `...` initializers
// are placeholders the reader fills in.
PassManagerBase& PM = ...; // create the pass manager.
PassManagerBuilder PMBuilder;
PMBuilder.OptLevel = 2;
PMBuilder.DisableUnrollLoops = false;
PMBuilder.Inliner = createFunctionInliningPass(50);
PMBuilder.populateModulePassManager(PM);
// The reference needs a name: PM.run(M) below uses it.
Module& M = ...; // your IR module here
PM.run(M);
Note that a "FunctionPassManager" may not do what you need. You're likely looking for legacy::PassManager instead (which can hold any type of pass).
Let's say I have this function in C/C++:
// Recursive product: returns 1 when x <= 1, otherwise x * foo(x-1).
// The function has two separate return statements.
int foo(int x) {
if (x <= 1) return 1;
return x * foo(x-1);
}
And I compile it with Clang.
Clang generates the following IR:
; Function Attrs: ssp uwtable
define i32 @_Z3fooi(i32 %x) #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
store i32 %x, i32* %2, align 4
%3 = load i32, i32* %2, align 4
%4 = icmp sle i32 %3, 1
br i1 %4, label %5, label %6
; <label>:5 ; preds = %0
store i32 1, i32* %1, align 4
br label %12
; <label>:6 ; preds = %0
%7 = load i32, i32* %2, align 4
%8 = load i32, i32* %2, align 4
%9 = sub nsw i32 %8, 1
%10 = call i32 @_Z3fooi(i32 %9)
%11 = mul nsw i32 %7, %10
store i32 %11, i32* %1, align 4
br label %12
; <label>:12 ; preds = %6, %5
%13 = load i32, i32* %1, align 4
ret i32 %13
}
As you can see, the LLVM passes optimize the code and create a "return register" (where I put the return value), and a "return block" (where the return value is effectively returned).
I'm trying to get the same effect, but when I use the SROA pass or the Instruction Combining pass, they translate the exits into a phi instruction:
; Function Attrs: nounwind ssp uwtable
define i32 @__HF3fooTi(i32 %x) #0 {
%1 = icmp sle i32 %x, 1
br i1 %1, label %2, label %3
; <label>:2 ; preds = %0
br label %7
; <label>:3 ; preds = %0
%4 = sub nsw i32 %x, 1
%5 = call i32 @__HF3fooTi(i32 %4)
%6 = mul nsw i32 %x, %5
br label %7
; <label>:7 ; preds = %3, %2
%.0 = phi i32 [ 1, %2 ], [ %6, %3 ]
ret i32 %.0
}
My question is: which solution is faster? And which pass is Clang using to achieve this? (In the Clang source files I found the 2 passes I used, and they give me this different result)