Insert an LLVM instruction using insertAfter() - C++

I'm new to LLVM, and I'm doing some experiments with it, such as inserting an instruction.
My main.c is shown below:
int foo(int e, int a) {
  int b = a + 1;
  int c = b * 2;
  b = e << 1;
  int d = b / 4;
  return c * d;
}
I use the commands below to generate the LLVM IR
clang-12 -O0 -Xclang -disable-O0-optnone -emit-llvm -c main.c -o main.bc
opt-12 -S -mem2reg main.bc -o main.ll
The generated IR is
; Function Attrs: noinline nounwind uwtable
define dso_local i32 @foo(i32 %0, i32 %1) #0 {
  %3 = add nsw i32 %1, 1
  %4 = mul nsw i32 %3, 2
  %5 = shl i32 %0, 1
  %6 = sdiv i32 %5, 4
  %7 = mul nsw i32 %4, %6
  ret i32 %7
}
And I use this code to insert a new instruction after the first instruction:
bool runOnBasicBlock(BasicBlock &B) {
  // get the first instruction of the block
  Instruction &Inst1st = *B.begin();
  Instruction *NewInst = BinaryOperator::Create(
      Instruction::Add, Inst1st.getOperand(0), Inst1st.getOperand(0));
  NewInst->insertAfter(&Inst1st);
  ...
}
After I run this pass, the IR is changed to
; Function Attrs: noinline nounwind uwtable
define dso_local i32 @foo(i32 %0, i32 %1) #0 {
  %3 = add nsw i32 %1, 1
  %4 = add i32 %1, %1
  %5 = mul nsw i32 %4, 2
  %6 = shl i32 %0, 1
  %7 = sdiv i32 %6, 4
  %8 = mul nsw i32 %5, %7
  ret i32 %8
}
It seems that the inserted instruction is equivalent to b = a + a;, and the original %4 = mul nsw i32 %3, 2 has become %5 = mul nsw i32 %4, 2. I cannot understand why. Any help?

As far as I know, NewInst->insertAfter(&Inst1st); turns the block
int b = a + 1;
int c = b * 2;
into the following block
int b = a + 1;
b = a + a;
int c = b * 2;
so b drops its previous value (%3) and takes the new value (%4), and the subsequent mul uses that new value of b.
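For anyone who wants to try this end to end, here is a rough sketch of a legacy FunctionPass that wraps the runOnBasicBlock from the question (the pass class name and registration string are my own, and it assumes the first instruction of each block has an operand that can legally be added to itself, as in the example above):
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Pass.h"

using namespace llvm;

namespace {
struct InsertAddPass : public FunctionPass {
  static char ID;
  InsertAddPass() : FunctionPass(ID) {}

  // Same body as in the question: build "op0 + op0" and place it right
  // after the first instruction of the block.
  bool runOnBasicBlock(BasicBlock &B) {
    if (B.empty())
      return false;
    Instruction &Inst1st = *B.begin();
    Instruction *NewInst = BinaryOperator::Create(
        Instruction::Add, Inst1st.getOperand(0), Inst1st.getOperand(0));
    NewInst->insertAfter(&Inst1st);
    return true; // the IR was modified
  }

  bool runOnFunction(Function &F) override {
    bool Changed = false;
    for (BasicBlock &B : F)
      Changed |= runOnBasicBlock(B);
    return Changed;
  }
};
} // namespace

char InsertAddPass::ID = 0;
static RegisterPass<InsertAddPass> X("insert-add", "Insert an add after the first instruction");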

Related

Do all LLVM PHI instructions in the same basic block always have the same set of incoming blocks?

I notice that in LLVM bitcode files, all PHI instructions in the same basic block often have the same set of incoming blocks.
Does anyone know whether this is always true for all LLVM bitcode files?
Or is there an optimization pass that makes it so?
For example, here is my C code:
// test.c
int main() {
  int **p, *q;
  int *a, *b, c, d;
  p = &a;
  if (p) {
    if (c) {
      q = &c;
    }
  } else {
    p = &b;
    q = &d;
  }
  if (d) {
    *p = q;
  }
}
After being compiled by clang and opt:
clang -Xclang -disable-O0-optnone -c -emit-llvm test.c
opt -mem2reg test.bc -o test.opt.bc
Here is the output test.opt.bc, where all PHI instructions in block 12 have the same incoming blocks %10 and %11:
; Function Attrs: noinline nounwind uwtable
define dso_local i32 @main() #0 {
  %1 = alloca i32*, align 8
  %2 = alloca i32*, align 8
  %3 = alloca i32, align 4
  %4 = alloca i32, align 4
  %5 = icmp ne i32** %1, null
  br i1 %5, label %6, label %11

6:                                                ; preds = %0
  %7 = load i32, i32* %3, align 4
  %8 = icmp ne i32 %7, 0
  br i1 %8, label %9, label %10

9:                                                ; preds = %6
  br label %10

10:                                               ; preds = %9, %6
  br label %12

11:                                               ; preds = %0
  br label %12

12:                                               ; preds = %11, %10
  %.01 = phi i32** [ %1, %10 ], [ %2, %11 ]
  %.1 = phi i32* [ %3, %10 ], [ %4, %11 ]
  %13 = load i32, i32* %4, align 4
  %14 = icmp ne i32 %13, 0
  br i1 %14, label %15, label %16

15:                                               ; preds = %12
  store i32* %.1, i32** %.01, align 8
  br label %16

16:                                               ; preds = %15, %12
  ret i32 0
}
The answer is yes.
It follows directly from the description of the phi instruction in the language reference:
After this, the ‘phi’ instruction takes a list of pairs as arguments, with one pair for each predecessor basic block of the current block.
Since all of the phi instructions in question belong to the same basic block, their lists of pairs must reference the same set of predecessors.
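If you want to check this property programmatically, here is a rough sketch (the function name is mine, and it assumes a reasonably recent LLVM that has BasicBlock::phis()). Note that the order of the pairs may differ between phis in the same block; only the set of incoming blocks has to match, with one entry per predecessor edge:
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Instructions.h"
#include <iterator>

using namespace llvm;

// Sketch: every phi in BB should carry one incoming pair per predecessor
// edge, and every incoming block should actually be a predecessor.
static bool phisMatchPredecessors(const BasicBlock &BB) {
  SmallPtrSet<const BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
  unsigned NumPredEdges = std::distance(pred_begin(&BB), pred_end(&BB));

  for (const PHINode &Phi : BB.phis()) {
    if (Phi.getNumIncomingValues() != NumPredEdges)
      return false; // missing or extra incoming pair
    for (unsigned I = 0, E = Phi.getNumIncomingValues(); I != E; ++I)
      if (!Preds.count(Phi.getIncomingBlock(I)))
        return false; // incoming block is not a predecessor
  }
  return true;
}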

LLVM API optimisation run

I am trying to run -O2 optimisation on LLVM IR obtained through the Clang API. Unfortunately, the optimisation only works on IR created with manual command-line calls. I have the following function:
int mult_add(int x, int y) {
  if (x > 2) {
    return y + 1 + 2;
  } else {
    return y - 1 + 2;
  }
}
And with these calls:
clang -S -emit-llvm main.cpp
opt main.ll -o opt.ll -S -O2
I get the correct result:
define i32 @_Z8mult_addii(i32, i32) local_unnamed_addr #0 {
  %3 = icmp sgt i32 %0, 2
  %.sink = select i1 %3, i32 3, i32 1
  %4 = add nsw i32 %.sink, %1
  ret i32 %4
}
Unfortunately, when I do it through the LLVM API with legacy::PassManager and legacy::FunctionPassManager, the optimisation simply does not happen and I get this long, unoptimised code:
define i32 @_Z8mult_addii(i32, i32) #0 {
  %3 = alloca i32, align 4
  %4 = alloca i32, align 4
  %5 = alloca i32, align 4
  store i32 %0, i32* %4, align 4
  store i32 %1, i32* %5, align 4
  %6 = load i32, i32* %4, align 4
  %7 = icmp sgt i32 %6, 2
  br i1 %7, label %8, label %12

; <label>:8:                                      ; preds = %2
  %9 = load i32, i32* %5, align 4
  %10 = add nsw i32 %9, 1
  %11 = add nsw i32 %10, 2
  store i32 %11, i32* %3, align 4
  br label %16

; <label>:12:                                     ; preds = %2
  %13 = load i32, i32* %5, align 4
  %14 = sub nsw i32 %13, 1
  %15 = add nsw i32 %14, 2
  store i32 %15, i32* %3, align 4
  br label %16

; <label>:16:                                     ; preds = %12, %8
  %17 = load i32, i32* %3, align 4
  ret i32 %17
}
Does Clang create the IR in some unoptimisable state? Running the passes on manually created IR works fine.
By the way, populateModulePassManager is called; here is the code:
legacy::PassManager Passes;
legacy::FunctionPassManager FPasses(M2.get());
AddOptimizationPasses(Passes, FPasses, &(TheJIT->getTargetMachine()), 2, 0);
Passes.add(createPrintModulePass(outs()));
Passes.run(*M2);
And AddOptimizationPasses is stolen and simplified from the opt utility:
static void AddOptimizationPasses(legacy::PassManagerBase &MPM,
                                  legacy::FunctionPassManager &FPM,
                                  TargetMachine *TM, unsigned OptLevel,
                                  unsigned SizeLevel) {
  FPM.add(createVerifierPass());
  PassManagerBuilder Builder;
  Builder.OptLevel = OptLevel;
  Builder.SizeLevel = SizeLevel;
  Builder.Inliner = createFunctionInliningPass(50);
  Builder.DisableUnitAtATime = true; // !UnitAtATime
  Builder.DisableUnrollLoops = false;
  if (TM)
    TM->adjustPassManager(Builder);
  // Builder.populateFunctionPassManager(FPM);
  Builder.populateModulePassManager(MPM);
}
By the way, the initialisation is the following:
InitializeAllTargets();
InitializeAllTargetMCs();
InitializeAllAsmPrinters();
Unfortunately, it does not work.
Did you forget to populate the pass manager?
PassManagerBase& PM = ...; // create the pass manager.
PassManagerBuilder PMBuilder;
PMBuilder.OptLevel = 2;
PMBuilder.DisableUnrollLoops = false;
PMBuilder.Inliner = createFunctionInliningPass(50);
PMBuilder.populateModulePassManager(PM);
Module &M = ...; // your IR module here
PM.run(M);
Note that a "FunctionPassManager" may not do what you need. You're likely looking for legacy::PassManager instead (which can hold any type of pass).
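For completeness, here is a minimal sketch of what that can look like when both pass managers are built from one PassManagerBuilder and the function passes are actually run (the function name is mine; this assumes the legacy pass-manager era the question is written against):
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

using namespace llvm;

void runO2(Module &M) {
  legacy::PassManager MPM;
  legacy::FunctionPassManager FPM(&M);

  PassManagerBuilder Builder;
  Builder.OptLevel = 2;
  Builder.SizeLevel = 0;
  Builder.Inliner = createFunctionInliningPass(50);
  Builder.populateFunctionPassManager(FPM); // per-function cleanup passes
  Builder.populateModulePassManager(MPM);   // the main -O2 pipeline

  // Run the function passes over every function, then the module passes.
  FPM.doInitialization();
  for (Function &F : M)
    FPM.run(F);
  FPM.doFinalization();

  MPM.run(M);
}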

Why is this block of LLVM instructions generated?

The DataFlowSanitizer pass on LLVM 3.8.0, 64 bit (Ubuntu 16.04.2) generates the following IR from source:
The source:
test.c
#include <sanitizer/dfsan_interface.h>
int main(void) {
  int i = 1;
  dfsan_label i_label = dfsan_create_label("i", 0);
  dfsan_set_label(i_label, &i, sizeof(i));
  return 0;
}
The commands to generate the IR:
clang -c -emit-llvm -fsanitize=dataflow test.c -o test.bc
llvm-dis test.bc
The disassembly:
test.ll
; Function Attrs: nounwind uwtable
define i32 @main() #0 {
entry:
  %0 = alloca i16
  %retval = alloca i32, align 4
  %i = alloca i32, align 4
  %1 = alloca i16
  %i_label = alloca i16, align 2
  store i16 0, i16* %0
  store i32 0, i32* %retval, align 4
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  %2 = ptrtoint i32* %i to i64
  %3 = and i64 %2, -123145302310913
  %4 = mul i64 %3, 2
  %5 = inttoptr i64 %4 to i16*
  %6 = bitcast i16* %5 to i64*
  store i64 0, i64* %6, align 2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  store i32 1, i32* %i, align 4
  %call = call zeroext i16 @dfsan_create_label(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str, i32 0, i32 0), i8* null)
  store i16 0, i16* %1
  store i16 %call, i16* %i_label, align 2
  %7 = load i16, i16* %1
  %8 = load i16, i16* %i_label, align 2
  %9 = bitcast i32* %i to i8*
  call void @dfsan_set_label(i16 zeroext %8, i8* %9, i64 4)
  ret i32 0
}
I don't understand why the block of instructions I separated out is being generated. Looking at Transforms/Instrumentation/DataFlowSanitizer.cpp, I can't find the code that inserts the instrumentation above. Can anyone explain this behavior?

Unifying function exits with LLVM

Let's say I have this function in C/C++:
int foo(int x) {
  if (x <= 1) return 1;
  return x * foo(x - 1);
}
And I compile it with Clang.
Clang generates the following IR:
; Function Attrs: ssp uwtable
define i32 @_Z3fooi(i32 %x) #0 {
  %1 = alloca i32, align 4
  %2 = alloca i32, align 4
  store i32 %x, i32* %2, align 4
  %3 = load i32, i32* %2, align 4
  %4 = icmp sle i32 %3, 1
  br i1 %4, label %5, label %6

; <label>:5                                       ; preds = %0
  store i32 1, i32* %1, align 4
  br label %12

; <label>:6                                       ; preds = %0
  %7 = load i32, i32* %2, align 4
  %8 = load i32, i32* %2, align 4
  %9 = sub nsw i32 %8, 1
  %10 = call i32 @_Z3fooi(i32 %9)
  %11 = mul nsw i32 %7, %10
  store i32 %11, i32* %1, align 4
  br label %12

; <label>:12                                      ; preds = %6, %5
  %13 = load i32, i32* %1, align 4
  ret i32 %13
}
As you can see, the LLVM passes optimize the code and create a "return register" (where I put the return value) and a "return block" (where the return value is effectively returned).
I'm trying to get the same effect, but when I use the SROA pass or the Instruction Combining pass, they translate the exits into a phi instruction:
; Function Attrs: nounwind ssp uwtable
define i32 @__HF3fooTi(i32 %x) #0 {
  %1 = icmp sle i32 %x, 1
  br i1 %1, label %2, label %3

; <label>:2                                       ; preds = %0
  br label %7

; <label>:3                                       ; preds = %0
  %4 = sub nsw i32 %x, 1
  %5 = call i32 @__HF3fooTi(i32 %4)
  %6 = mul nsw i32 %x, %5
  br label %7

; <label>:7                                       ; preds = %3, %2
  %.0 = phi i32 [ 1, %2 ], [ %6, %3 ]
  ret i32 %.0
}
My question is: which solution is faster? And which pass does Clang use to achieve this? (In the Clang source files I found the two passes I used, and they give me this different result.)
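For what it's worth, here is a sketch of how the phi-producing variant can be reproduced programmatically with the legacy pass manager (the function name is mine, and whether this matches the setup that produced the second listing is an assumption):
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

using namespace llvm;

// Sketch: run SROA over every function; promoting the return-slot alloca
// is what introduces the phi seen in the second listing.
static void promoteReturnSlots(Module &M) {
  legacy::FunctionPassManager FPM(&M);
  FPM.add(createSROAPass());
  FPM.doInitialization();
  for (Function &F : M)
    FPM.run(F);
  FPM.doFinalization();
}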

How to execute llvm code

I have a C program, factorial.c, that calculates the factorial of an int. I compile it to human-readable LLVM IR, factorial.ll, and then modify the compiled LLVM code.
The objective is to execute the modified LLVM code and to see its output. How can I do this?
It will depend on how your output LLVM IR is assembled and what libraries it links against, but, for example, executing the following factorial.ll with the shell command lli
$ lli factorial.ll
Factorial of 10 = 3628800
will execute the main function with the JIT and use the standard printf to output the result to stdout.
@.str = private unnamed_addr constant [22 x i8] c"Factorial of %d = %d\0A\00", align 1

declare i32 @printf(i8*, ...)

define i32 @factorial(i32 %n) nounwind uwtable {
entry:
  %n.addr = alloca i32, align 4
  store i32 %n, i32* %n.addr, align 4
  %0 = load i32* %n.addr, align 4
  %cmp = icmp sle i32 %0, 1
  br i1 %cmp, label %cond.true, label %cond.false

cond.true:                                        ; preds = %entry
  br label %cond.end

cond.false:                                       ; preds = %entry
  %1 = load i32* %n.addr, align 4
  %2 = load i32* %n.addr, align 4
  %sub = sub nsw i32 %2, 1
  %call = call i32 @factorial(i32 %sub)
  %mul = mul nsw i32 %1, %call
  br label %cond.end

cond.end:                                         ; preds = %cond.false, %cond.true
  %cond = phi i32 [ 1, %cond.true ], [ %mul, %cond.false ]
  ret i32 %cond
}

define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
entry:
  %retval = alloca i32, align 4
  %argc.addr = alloca i32, align 4
  %argv.addr = alloca i8**, align 8
  store i32 0, i32* %retval
  store i32 %argc, i32* %argc.addr, align 4
  store i8** %argv, i8*** %argv.addr, align 8
  %call = call i32 @factorial(i32 10)
  %call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8]* @.str, i32 0, i32 0), i32 10, i32 %call)
  ret i32 0
}