I just started learning LLVM and I am wondering why we have two indices in getelementptr? what are the first and second indices (0 and 0) used for?
#tmp = global [18 x i8] c"Hello world!: %d\0A\00"
declare i32 #printf(i8* %0, ...)
define i32 #fact(i32 %x) {
0:
%1 = icmp sle i32 %x, 0
br i1 %1, label %2, label %3
2:
ret i32 1
3:
%4 = sub i32 %x, 1
%5 = call i32 #fact(i32 %4)
%6 = mul i32 %x, %5
ret i32 %6
}
define i32 #main() {
entry:
%0 = getelementptr [18 x i8], [18 x i8]* #tmp, i32 0, i32 0 ; <---- HERE
%1 = call i32 #fact(i32 23)
%2 = call i32 (i8*, ...) #printf(i8* %0, i32 %1)
ret i32 1
}
enter code here
The DataFlowSanitizer pass on LLVM 3.8.0, 64 bit (Ubuntu 16.04.2) generates the following IR from source:
The source:
test.c
#include <sanitizer/dfsan_interface.h>
int main(void) {
int i = 1;
dfsan_label i_label = dfsan_create_label("i", 0);
dfsan_set_label(i_label, &i, sizeof(i));
return 0;
}
The commands to generate the IR:
clang -c -emit-llvm -fsanitize=dataflow test.c -o test.bc
llvm-dis test.bc
The disassembly:
test.ll
; Function Attrs: nounwind uwtable
define i32 #main() #0 {
entry:
%0 = alloca i16
%retval = alloca i32, align 4
%i = alloca i32, align 4
%1 = alloca i16
%i_label = alloca i16, align 2
store i16 0, i16* %0
store i32 0, i32* %retval, align 4
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%2 = ptrtoint i32* %i to i64
%3 = and i64 %2, -123145302310913
%4 = mul i64 %3, 2
%5 = inttoptr i64 %4 to i16*
%6 = bitcast i16* %5 to i64*
store i64 0, i64* %6, align 2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
store i32 1, i32* %i, align 4
%call = call zeroext i16 #dfsan_create_label(i8* getelementptr inbounds ([2 x i8], [2 x i8]* #.str, i32 0, i32 0), i8* null)
store i16 0, i16* %1
store i16 %call, i16* %i_label, align 2
%7 = load i16, i16* %1
%8 = load i16, i16* %i_label, align 2
%9 = bitcast i32* %i to i8*
call void #dfsan_set_label(i16 zeroext %8, i8* %9, i64 4)
ret i32 0
}
I don't understand why the block of instruction I separated out is being generated. Looking at the Transform/Instrumentation/DataFlowsanitizer.cpp, I can't find the code that inserts the instrumentation above. Can anyone explain this behavior?
Let's say I have this function in C/C++:
int foo(int x) {
if (x <= 1) return 1;
return x * foo(x-1);
}
And I compile it with Clang.
Clang generates the following IR:
; Function Attrs: ssp uwtable
define i32 #_Z3fooi(i32 %x) #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
store i32 %x, i32* %2, align 4
%3 = load i32, i32* %2, align 4
%4 = icmp sle i32 %3, 1
br i1 %4, label %5, label %6
; <label>:5 ; preds = %0
store i32 1, i32* %1, align 4
br label %12
; <label>:6 ; preds = %0
%7 = load i32, i32* %2, align 4
%8 = load i32, i32* %2, align 4
%9 = sub nsw i32 %8, 1
%10 = call i32 #_Z3fooi(i32 %9)
%11 = mul nsw i32 %7, %10
store i32 %11, i32* %1, align 4
br label %12
; <label>:12 ; preds = %6, %5
%13 = load i32, i32* %1, align 4
ret i32 %13
}
As you can see, LLVM passes optimizes out the code and creates a "return register" (where I put the return value), and a "return block" (where the return value is effectively returned).
I'm trying to get the same effect, but when I use SROA pass or the Instruction Combining pass, they translate the exits in a phi instruction:
; Function Attrs: nounwind ssp uwtable
define i32 #__HF3fooTi(i32 %x) #0 {
%1 = icmp sle i32 %x, 1
br i1 %1, label %2, label %3
; <label>:2 ; preds = %0
br label %7
; <label>:3 ; preds = %0
%4 = sub nsw i32 %x, 1
%5 = call i32 #__HF3fooTi(i32 %4)
%6 = mul nsw i32 %x, %5
br label %7
; <label>:7 ; preds = %3, %2
%.0 = phi i32 [ 1, %2 ], [ %6, %3 ]
ret i32 %.0
}
My question is: which solution is faster? And which pass is Clang using to achieve this? (In the Clang source files I found the 2 passes I used, and they give me this different result)
When building a project with LLVM, some function calls will be replaced by intrinsic functions. Is the replacement completed by the front-end (e.g. clang) or the LLVM back-end?
Discussions through the Internet indicate that the intrinsic functions replacement is related to optimization options. So does it mean if there is no optimization option, then no intrinsic replacement will happen? Or in fact, there are some default intrinsic functions replacement that cannot be disabled?
If there is any method to disable all the intrinsic functions, how should I do that?
It depends. Intrinsics written in code are emitted through the front-end directly. Intrinsics like llvm.memset are introduced to the code during optimization at IR level (eigther the front-end nor the back-end perform this optimizations).
Here is a (quite stupid) example:
int main(int argc, char** argv)
{
int a[8];
for (int i = 0; i != 8; ++i)
a[i] = 0;
for (int i = 7; i >= 0; --i)
a[i] = a[i+1] + argc;
return a[0];
}
Compiled with clang 3.5 (clang -S -emit-llvm) you will get the following IR without any intrinsics:
; Function Attrs: nounwind uwtable
define i32 #main(i32 %argc, i8** %argv) #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
%3 = alloca i8**, align 8
%a = alloca [8 x i32], align 16
%i = alloca i32, align 4
%i1 = alloca i32, align 4
store i32 0, i32* %1
store i32 %argc, i32* %2, align 4
store i8** %argv, i8*** %3, align 8
store i32 0, i32* %i, align 4
br label %4
; <label>:4 ; preds = %11, %0
%5 = load i32* %i, align 4
%6 = icmp ne i32 %5, 8
br i1 %6, label %7, label %14
; <label>:7 ; preds = %4
%8 = load i32* %i, align 4
%9 = sext i32 %8 to i64
%10 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 %9
store i32 0, i32* %10, align 4
br label %11
; <label>:11 ; preds = %7
%12 = load i32* %i, align 4
%13 = add nsw i32 %12, 1
store i32 %13, i32* %i, align 4
br label %4
; <label>:14 ; preds = %4
store i32 7, i32* %i1, align 4
br label %15
; <label>:15 ; preds = %29, %14
%16 = load i32* %i1, align 4
%17 = icmp sge i32 %16, 0
br i1 %17, label %18, label %32
; <label>:18 ; preds = %15
%19 = load i32* %i1, align 4
%20 = add nsw i32 %19, 1
%21 = sext i32 %20 to i64
%22 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 %21
%23 = load i32* %22, align 4
%24 = load i32* %2, align 4
%25 = add nsw i32 %23, %24
%26 = load i32* %i1, align 4
%27 = sext i32 %26 to i64
%28 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 %27
store i32 %25, i32* %28, align 4
br label %29
; <label>:29 ; preds = %18
%30 = load i32* %i1, align 4
%31 = add nsw i32 %30, -1
store i32 %31, i32* %i1, align 4
br label %15
; <label>:32 ; preds = %15
%33 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 0
%34 = load i32* %33, align 4
ret i32 %34
}
Compiled again with clang -emit-llvm -O1 you will see this:
; Function Attrs: nounwind readnone uwtable
define i32 #main(i32 %argc, i8** nocapture readnone %argv) #0 {
.preheader:
%a = alloca [8 x i32], align 16
%a6 = bitcast [8 x i32]* %a to i8*
call void #llvm.memset.p0i8.i64(i8* %a6, i8 0, i64 32, i32 4, i1 false)
br label %0
; <label>:0 ; preds = %.preheader, %0
%indvars.iv = phi i64 [ 7, %.preheader ], [ %indvars.iv.next, %0 ]
%1 = add nsw i64 %indvars.iv, 1
%2 = getelementptr inbounds [8 x i32]* %a, i64 0, i64 %1
%3 = load i32* %2, align 4, !tbaa !1
%4 = add nsw i32 %3, %argc
%5 = getelementptr inbounds [8 x i32]* %a, i64 0, i64 %indvars.iv
store i32 %4, i32* %5, align 4, !tbaa !1
%indvars.iv.next = add nsw i64 %indvars.iv, -1
%6 = trunc i64 %indvars.iv to i32
%7 = icmp sgt i32 %6, 0
br i1 %7, label %0, label %8
; <label>:8 ; preds = %0
%9 = getelementptr inbounds [8 x i32]* %a, i64 0, i64 0
%10 = load i32* %9, align 16, !tbaa !1
ret i32 %10
}
The initialization loop was replaced by the llvm.memset intrinsic. The back-end is free to handle the intrinsic as it want's but commonly llvm.memset is lowered to a libc library call.
To answer your first question: Yes, if you don't optimize your code, then you will not get intrinsics in your IR.
To prevent intrinsics being introduced in your code all you have to do is find the optimization pass on your IR and don't run it. Here is a related question how to find out what passes are done on the IR: Where to find the optimization sequence for clang -OX?
for -O1 we get:
prune-eh -inline-cost -always-inline -functionattrs -sroa -domtree
-early-cse -lazy-value-info -jump-threading -correlated-propagation -simplifycfg -instcombine -tailcallelim -simplifycfg -reassociate -domtree -loops -loop-simplify -lcssa -loop-rotate -licm -loop-unswitch -instcombine -scalar-evolution -lcssa -indvars -loop-idiom -loop-deletion -loop-unroll -memdep -memcpyopt -sccp -instcombine -lazy-value-info -jump-threading -correlated-propagation -domtree -memdep -dse -adce -simplifycfg -instcombine -barrier -domtree -loops -loop-simplify -lcssa -branch-prob -block-freq -scalar-evolution -loop-vectorize -instcombine -simplifycfg -strip-dead-prototypes -verify
A wild guess: instcombine is introducing the llvm.memset. I run the passes without instcombine and opt on the unoptimized IR and get this:
; Function Attrs: nounwind readnone uwtable
define i32 #main(i32 %argc, i8** %argv) #0 {
%a = alloca [8 x i32], align 16
%1 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 8
%2 = load i32* %1, align 4
%3 = add nsw i32 %2, %argc
%4 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 7
store i32 %3, i32* %4, align 4
%5 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 7
%6 = load i32* %5, align 4
%7 = add nsw i32 %6, %argc
%8 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 6
store i32 %7, i32* %8, align 4
%9 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 6
%10 = load i32* %9, align 4
%11 = add nsw i32 %10, %argc
%12 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 5
store i32 %11, i32* %12, align 4
%13 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 5
%14 = load i32* %13, align 4
%15 = add nsw i32 %14, %argc
%16 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 4
store i32 %15, i32* %16, align 4
%17 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 4
%18 = load i32* %17, align 4
%19 = add nsw i32 %18, %argc
%20 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 3
store i32 %19, i32* %20, align 4
%21 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 3
%22 = load i32* %21, align 4
%23 = add nsw i32 %22, %argc
%24 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 2
store i32 %23, i32* %24, align 4
%25 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 2
%26 = load i32* %25, align 4
%27 = add nsw i32 %26, %argc
%28 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 1
store i32 %27, i32* %28, align 4
%29 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 1
%30 = load i32* %29, align 4
%31 = add nsw i32 %30, %argc
%32 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 0
store i32 %31, i32* %32, align 4
%33 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 0
%34 = load i32* %33, align 4
ret i32 %34
}
No instructions. So to prevent (at least the memset) intrinsics in your code don't run instcombine on your IR. However, instcombine is a mighty opt pass that realy shortens the code.
Now you have two options:
don't use opt passes that introduce intrinsics
write your own llvm
opt pass that transforms intrinsics back to whatever they could be
replaced with an run it after optimization and before the back-end
starts working
I hope this helps you somehow. Cheers!
I have a c code that calculates the factorial of an int "factorial.c". I compile it to llvm readable code "factorial.ll" and I modify in the compiled llvm code.
The objective is to execute the modified llvm code and to see its output, How can I do this?
It will depend on how your outputted LLVM is assembled and what libraries it links against, but for example executing the following factorial.ll with the shell command lli
$ lli factorial.ll
Factorial of 10 = 3628800
Will execute the main function with the JIT and use the standard printf to output the result to stdout.
#.str = private unnamed_addr constant [22 x i8] c"Factorial of %d = %d\0A\00", align 1
declare i32 #printf(i8*, ...)
define i32 #factorial(i32 %n) nounwind uwtable {
entry:
%n.addr = alloca i32, align 4
store i32 %n, i32* %n.addr, align 4
%0 = load i32* %n.addr, align 4
%cmp = icmp sle i32 %0, 1
br i1 %cmp, label %cond.true, label %cond.false
cond.true: ; preds = %entry
br label %cond.end
cond.false: ; preds = %entry
%1 = load i32* %n.addr, align 4
%2 = load i32* %n.addr, align 4
%sub = sub nsw i32 %2, 1
%call = call i32 #factorial(i32 %sub)
%mul = mul nsw i32 %1, %call
br label %cond.end
cond.end: ; preds = %cond.false, %cond.true
%cond = phi i32 [ 1, %cond.true ], [ %mul, %cond.false ]
ret i32 %cond
}
define i32 #main(i32 %argc, i8** %argv) nounwind uwtable {
entry:
%retval = alloca i32, align 4
%argc.addr = alloca i32, align 4
%argv.addr = alloca i8**, align 8
store i32 0, i32* %retval
store i32 %argc, i32* %argc.addr, align 4
store i8** %argv, i8*** %argv.addr, align 8
%call = call i32 #factorial(i32 10)
%call1 = call i32 (i8*, ...)* #printf(i8* getelementptr inbounds ([22 x i8]* #.str, i32 0, i32 0), i32 10, i32 %call)
ret i32 0
}