I'm interested in how the Microsoft Visual C++ compiler treats/optimizes static variables.
My code:
#include <cstdlib>
// Declares a function-local static that is initialized with the constant 3.
// The variable is never read, so the function has no observable effect.
void no_static_initialization()
{
static int value = 3;
}
// Declares a function-local static whose initializer is the run-time argument.
// A static local is initialized only the first time control passes through its
// declaration, so the new_value passed to any later call is ignored.
void static_initialization(int new_value)
{
static int value = new_value;
}
// Driver: calls the constant-initialized variant once and the
// argument-initialized variant twice (once with a literal, once with rand()).
int main()
{
no_static_initialization();
// First call: this is the one that initializes the static.
static_initialization(1);
// Second call: the static already exists, so the argument is not stored.
static_initialization(std::rand());
return 0;
}
Here's the assembly for the code (compiled with optimizations):
My main area of interest is the last case.
Here, the first statement got fully optimized away, and the two calls of the second function were inlined; they are actually represented by similar chunks of code.
Each of them tests something and then makes a short jump if the test wasn't successful (these jumps obviously point to the end of the corresponding routine).
Does the compiler make an explicit check on every function call to see whether the function is being called for the first time?
Does the compiler actually have a flag, which indicates if this is the first time the function was called or not?
Where is it stored (I guess all that test stuff is about it, but I'm not exactly sure)?
Yes, the compiler has to add a hidden flag to test whether it is the first call to the function and initialize or not depending on that. In both snippets it is testing the flag, if it is raised it will jump to the end of the function or else it will initialize the static variable. Note that since the compiler has inlined the function it could as well optimize away the second test, knowing that the flag is to be tested only on the first call.
The flag seems to be located at address 0x00403374, and takes a byte, while the variable itself is located at address 0x00403370.
I like to use LLVM because the code it generates tells you a bit more explicitly what it's doing:
The actual code is below, because it's kind of a long read. Yes, LLVM creates guard variables for static values. Notice how static_initialization/bb: acquires the guard, checks whether it holds the value that corresponds to "already initialized", and branches either to bb1 if it needs to initialize or to bb2 if it doesn't. This isn't the only possible way to satisfy the single-initialization requirement, but it's the usual way.
; ModuleID = '/tmp/webcompile/_31867_0.bc'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-linux-gnu"
; Guard flag (i64) and storage (i32) for the function-local static; both start at zero.
@guard variable for static_initialization(int)::value = internal global i64 0 ; <i64*> [#uses=3]
@static_initialization(int)::value = internal global i32 0 ; <i32*> [#uses=1]
; No guard check or store is emitted here: the body only branches to the return block.
define void @no_static_initialization()() nounwind {
entry:
br label %return
return: ; preds = %entry
ret void
}
; Stores %new_value into the function-local static, gated by its guard variable.
define void @static_initialization(int)(i32 %new_value) nounwind {
entry:
%new_value_addr = alloca i32 ; <i32*> [#uses=2]
%0 = alloca i8 ; <i8*> [#uses=2]
%retval.1 = alloca i8 ; <i8*> [#uses=2]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
store i32 %new_value, i32* %new_value_addr
; Fast path: read the first byte of the guard; a non-zero byte skips straight to bb2.
%1 = load i8* bitcast (i64* @guard variable for static_initialization(int)::value to i8*), align 1 ; <i8> [#uses=1]
%2 = icmp eq i8 %1, 0 ; <i1> [#uses=1]
br i1 %2, label %bb, label %bb2
bb: ; preds = %entry
; A non-zero result from __cxa_guard_acquire selects bb1, which performs the store.
%3 = call i32 @__cxa_guard_acquire(i64* @guard variable for static_initialization(int)::value) nounwind ; <i32> [#uses=1]
%4 = icmp ne i32 %3, 0 ; <i1> [#uses=1]
%5 = zext i1 %4 to i8 ; <i8> [#uses=1]
store i8 %5, i8* %retval.1, align 1
%6 = load i8* %retval.1, align 1 ; <i8> [#uses=1]
%toBool = icmp ne i8 %6, 0 ; <i1> [#uses=1]
br i1 %toBool, label %bb1, label %bb2
bb1: ; preds = %bb
store i8 0, i8* %0, align 1
; The actual initialization: copy the argument into the static's storage.
%7 = load i32* %new_value_addr, align 4 ; <i32> [#uses=1]
store i32 %7, i32* @static_initialization(int)::value, align 4
store i8 1, i8* %0, align 1
call void @__cxa_guard_release(i64* @guard variable for static_initialization(int)::value) nounwind
br label %bb2
bb2: ; preds = %bb1, %bb, %entry
br label %return
return: ; preds = %bb2
ret void
}
; External guard helpers called by static_initialization; both take the i64 guard by pointer.
declare i32 @__cxa_guard_acquire(i64*) nounwind
declare void @__cxa_guard_release(i64*) nounwind
; Calls no_static_initialization once and static_initialization twice (with 1, then with rand()), and returns 0.
define i32 @main() nounwind {
entry:
%retval = alloca i32 ; <i32*> [#uses=2]
%0 = alloca i32 ; <i32*> [#uses=2]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
call void @no_static_initialization()() nounwind
call void @static_initialization(int)(i32 1) nounwind
%1 = call i32 @rand() nounwind ; <i32> [#uses=1]
call void @static_initialization(int)(i32 %1) nounwind
store i32 0, i32* %0, align 4
%2 = load i32* %0, align 4 ; <i32> [#uses=1]
store i32 %2, i32* %retval, align 4
br label %return
return: ; preds = %entry
%retval1 = load i32* %retval ; <i32> [#uses=1]
ret i32 %retval1
}
; External declaration of rand, called from main.
declare i32 @rand() nounwind
Related
I've been playing around with compilers and have been working on my own toy C compiler. Currently I'm attempting to target LLVM IR, but I'm having trouble wrapping my head around the syntax.
My current issue: why is this valid IR syntax:
; Accepted example: every block after the entry block carries an explicit label,
; and no instruction follows a terminator inside a block.
define i32 @main() {
%1 = alloca i32, align 4
%2 = add i32 0, 0
store i32 %2, i32* %1, align 4
%3 = alloca i32, align 4
%4 = add i32 0, 1
store i32 %4, i32* %3, align 4
%5 = load i32, i32* %1, align 4
%6 = icmp ne i32 %5, 0
br i1 %6, label %true0, label %else0
true0: ; preds %0
%7 = add i32 0, 1
store i32 %7, i32* %3, align 4
br label %end0
else0: ; preds %0
%8 = load i32, i32* %3, align 4
%9 = icmp ne i32 %8, 0
br i1 %9, label %true1, label %end1
true1: ; preds %else0
%10 = add i32 0, 2
store i32 %10, i32* %3, align 4
br label %end1
end1: ; preds %true1, %else0
br label %end0
end0: ; preds %true0, %end1
%11 = load i32, i32* %3, align 4
ret i32 %11
}
but this is not:
define i32 @main() { ; rejected example: the numbering error is intentionally left in place
%1 = alloca i32, align 4
%2 = add i32 0, 0
store i32 %2, i32* %1, align 4 ; variable a
%3 = load i32, i32* %1, align 4
%4 = icmp ne i32 %3, 0
br i1 %4, label %true0, label %else0
true0: ; preds %0
%5 = add i32 0, 1
ret i32 %5
br label %end0 ; follows a terminator (ret) with no label in between
else0: ; preds %0
%6 = add i32 0, 2
ret i32 %6
br label %end0 ; follows a terminator (ret) with no label in between
end0: ; preds %true0, %else0
ret i32 0
}
I get the error:
llc-6.0: test2.ll:13:1: error: instruction expected to be numbered '%7'
%6 = add i32 0, 2
^
I don't understand why that block needs to be %7, given the previously used number was %6. Compare the %else0 label of the first example, that's very similar syntax and works fine.
And yes, my compiler needs a lot of optimization, but I'm not finished yet :)
Your code is invalid because there is actually another basic block that you did not label:
true0: ; preds %0
%5 = add i32 0, 1
ret i32 %5
hidden_bb: ; this block will be named %6 by default
br label %end0
else0: ; preds %0
If it has a label, then the error will be gone. Note that every terminator instruction, such as br or ret, ends its basic block, so any instruction that follows it starts a new basic block.
Let's say I have this function in C/C++:
// Recursive factorial: returns 1 when x <= 1, otherwise x * foo(x - 1).
int foo(int x) {
// Base case; this early return is the first of the function's two exits.
if (x <= 1) return 1;
return x * foo(x-1);
}
And I compile it with Clang.
Clang generates the following IR:
; Function Attrs: ssp uwtable
; %1 is the return-value slot: both paths store into it and then branch to the
; single return block %12, which loads it and returns.
define i32 @_Z3fooi(i32 %x) #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
store i32 %x, i32* %2, align 4
%3 = load i32, i32* %2, align 4
%4 = icmp sle i32 %3, 1
br i1 %4, label %5, label %6
; <label>:5 ; preds = %0
store i32 1, i32* %1, align 4
br label %12
; <label>:6 ; preds = %0
%7 = load i32, i32* %2, align 4
%8 = load i32, i32* %2, align 4
%9 = sub nsw i32 %8, 1
%10 = call i32 @_Z3fooi(i32 %9)
%11 = mul nsw i32 %7, %10
store i32 %11, i32* %1, align 4
br label %12
; <label>:12 ; preds = %6, %5
%13 = load i32, i32* %1, align 4
ret i32 %13
}
As you can see, LLVM passes optimize the code and create a "return register" (where I put the return value) and a "return block" (where the return value is effectively returned).
I'm trying to get the same effect, but when I use the SROA pass or the Instruction Combining pass, they translate the exits into a phi instruction:
; Function Attrs: nounwind ssp uwtable
; Same function with no stack slots: the return value is chosen by the phi in
; block %7 according to which predecessor branched there.
define i32 @__HF3fooTi(i32 %x) #0 {
%1 = icmp sle i32 %x, 1
br i1 %1, label %2, label %3
; <label>:2 ; preds = %0
br label %7
; <label>:3 ; preds = %0
%4 = sub nsw i32 %x, 1
%5 = call i32 @__HF3fooTi(i32 %4)
%6 = mul nsw i32 %x, %5
br label %7
; <label>:7 ; preds = %3, %2
%.0 = phi i32 [ 1, %2 ], [ %6, %3 ]
ret i32 %.0
}
My question is: which solution is faster? And which pass is Clang using to achieve this? (In the Clang source files I found the 2 passes I used, and they give me this different result)
I have been trying to create a function using the module pass in LLVM. What I am trying to do is create a variable argument function and then add the logic to manipulate the variable arguments.
For example:
/* can do this */
// Variadic example: reads one double from the variable arguments that follow `a`.
// The caller must pass at least one double after `a`. Always returns 0.
int foo(int a, ...)
{
    double var1;
    // can't figure out how to add any of this using llvm
    va_list ap;
    va_start(ap, a);            // `a` is the last named parameter
    var1 = va_arg(ap, double);  // va_arg takes the va_list and the type, and yields the value
    va_end(ap);
    (void)var1;                 // the value is only fetched, not otherwise used
    return 0;                   // avoid falling off the end of a non-void function
}
Creating the function type is easy because I just set the vararg boolean to true. What do I do after that?
I always use clang to check how it translates C/C++ constructs.
Use the LLVM va_arg instruction and the intrinsics llvm.va_start, llvm.va_end, and llvm.va_copy to make use of LLVM's variable-argument support.
You also need the target-specific value type "va_list" for functions that operate on such arguments.
; This struct is different for every platform. For most platforms,
; it is merely an i8*.
%struct.va_list = type { i8* }
; For Unix x86_64 platforms, va_list is the following struct:
; %struct.va_list = type { i32, i32, i8*, i8* }
ref http://llvm.org/docs/LangRef.html#variable-argument-handling-intrinsics
for your listed code,
; ModuleID = 'test.c'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
%struct.__va_list_tag = type { i32, i32, i8*, i8* }
; Function Attrs: nounwind uwtable
; Variadic function: initializes %ap with llvm.va_start, fetches one double through
; one of two paths selected by field 1 of the va_list tag, then calls llvm.va_end.
define i32 @foo(i32 %a, ...) #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
%var1 = alloca double, align 8
%ap = alloca [1 x %struct.__va_list_tag], align 16
store i32 %a, i32* %2, align 4
%3 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
%4 = bitcast %struct.__va_list_tag* %3 to i8*
call void @llvm.va_start(i8* %4)
%5 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
; Load field 1 of the tag and compare it with 160 to pick the path.
%6 = getelementptr inbounds %struct.__va_list_tag* %5, i32 0, i32 1
%7 = load i32* %6
%8 = icmp ule i32 %7, 160
br i1 %8, label %9, label %15
; <label>:9 ; preds = %0
; Path 1: address = (pointer in field 3) + field 1; field 1 is then advanced by 16.
%10 = getelementptr inbounds %struct.__va_list_tag* %5, i32 0, i32 3
%11 = load i8** %10
%12 = getelementptr i8* %11, i32 %7
%13 = bitcast i8* %12 to double*
%14 = add i32 %7, 16
store i32 %14, i32* %6
br label %20
; <label>:15 ; preds = %0
; Path 2: address = pointer in field 2; that pointer is then advanced by 8 bytes.
%16 = getelementptr inbounds %struct.__va_list_tag* %5, i32 0, i32 2
%17 = load i8** %16
%18 = bitcast i8* %17 to double*
%19 = getelementptr i8* %17, i32 8
store i8* %19, i8** %16
br label %20
; <label>:20 ; preds = %15, %9
%21 = phi double* [ %13, %9 ], [ %18, %15 ]
%22 = load double* %21
%23 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
%24 = bitcast %struct.__va_list_tag* %23 to i8*
call void @llvm.va_end(i8* %24)
%25 = load i32* %1
ret i32 %25
}
; Function Attrs: nounwind
declare void @llvm.va_start(i8*) #1
; Function Attrs: nounwind
declare void @llvm.va_end(i8*) #1
; Function Attrs: nounwind uwtable
; Empty entry point: returns 0 without calling foo.
define i32 @main() #0 {
ret i32 0
}
How can I change a loop in do-while form into a loop in while-form in LLVM IR?
Here we have a little loop example. The loops are just running through a boolean array until they find the first occurrence of true. I compiled it with clang -emit-llvm to get the optimized llvm IR.
#include <stdio.h>
#include <string.h>
// Returns the offset of the first element equal to true, scanning from `start`.
// while-form: *start itself is tested before the pointer is ever advanced.
// The caller must guarantee that a true element exists; the scan has no bound.
int foo(bool* start){
    bool* cond = start;
    while (*cond != true)
        cond++;
    // The pointer difference is ptrdiff_t; narrow it explicitly to the int return type.
    return static_cast<int>(cond - start);
}
// do-while counterpart of foo: returns the offset of the first true element
// found strictly after `start`.
int bar(bool* start){
bool* cond = start;
// The loop body is empty; the pre-increment inside the condition advances cond
// before every test, so *start itself is never examined.
do {
}while (*(++cond) != true);
return cond - start;
}
// Driver: builds an 8-element array in which only index 5 is true and prints
// the results of foo and bar on it.
int main(){
bool cond[8];
// Zero all eight elements before setting the single true entry.
memset(&cond, 0, sizeof(bool)*8);
cond[5] = true;
printf("%i %i\n", foo(cond), bar(cond));
}
The IR for the foo function (using just a while loop) looks like this:
; Function Attrs: nounwind uwtable
; while-form loop: block %3 tests the condition first, block %9 is the body and
; branches back to %3, and block %12 computes the returned pointer difference.
define i32 @_Z3fooPb(i8* %start) #0 {
%1 = alloca i8*, align 8
%cond = alloca i8*, align 8
store i8* %start, i8** %1, align 8
%2 = load i8** %1, align 8
store i8* %2, i8** %cond, align 8
br label %3
; <label>:3 ; preds = %9, %0
%4 = load i8** %cond, align 8
%5 = load i8* %4, align 1
%6 = trunc i8 %5 to i1
%7 = zext i1 %6 to i32
%8 = icmp ne i32 %7, 1
br i1 %8, label %9, label %12
; <label>:9 ; preds = %3
%10 = load i8** %cond, align 8
%11 = getelementptr inbounds i8* %10, i32 1
store i8* %11, i8** %cond, align 8
br label %3
; <label>:12 ; preds = %3
%13 = load i8** %cond, align 8
%14 = load i8** %1, align 8
%15 = ptrtoint i8* %13 to i64
%16 = ptrtoint i8* %14 to i64
%17 = sub i64 %15, %16
%18 = trunc i64 %17 to i32
ret i32 %18
}
and for bar, which is using a do while we get:
; Function Attrs: nounwind uwtable
; do-while form: block %3 (the empty body) falls through to block %4, which
; advances the pointer, tests the condition, and branches back to %3 or on to %11.
define i32 @_Z3barPb(i8* %start) #0 {
%1 = alloca i8*, align 8
%cond = alloca i8*, align 8
store i8* %start, i8** %1, align 8
%2 = load i8** %1, align 8
store i8* %2, i8** %cond, align 8
br label %3
; <label>:3 ; preds = %4, %0
br label %4
; <label>:4 ; preds = %3
%5 = load i8** %cond, align 8
%6 = getelementptr inbounds i8* %5, i32 1
store i8* %6, i8** %cond, align 8
%7 = load i8* %6, align 1
%8 = trunc i8 %7 to i1
%9 = zext i1 %8 to i32
%10 = icmp ne i32 %9, 1
br i1 %10, label %3, label %11
; <label>:11 ; preds = %4
%12 = load i8** %cond, align 8
%13 = load i8** %1, align 8
%14 = ptrtoint i8* %12 to i64
%15 = ptrtoint i8* %13 to i64
%16 = sub i64 %14, %15
%17 = trunc i64 %16 to i32
ret i32 %17
}
The differences are very small: for bar we have one additional label and an additional br, because we jump straight to the body of the loop and execute it before we evaluate the condition.
So the first step in transforming a do-while loop is to get rid of the branch and just jump to the condition. Now it's a while loop, where the condition is evaluated first. That is easy. Now you have two choices for how to handle the condition. You can try to modify the condition, which is a really hard task because you can put almost anything inside a loop's condition. The easy way is to just copy the loop body once (everything from ;<label>:4 to ;<label>:11) to a position before the first branch of the loop. That way you won't change the correctness of your code, and your do-while loop will become a while loop with one execution of the loop body in front of it.
You can copy the loop body with CloneBasicBlock from llvm/Transforms/Utils/Cloning.h:
/// CloneBasicBlock - Return a copy of the specified basic block, but without
/// embedding the block into a particular function. The block returned is an
/// exact copy of the specified basic block, without any remapping having been
/// performed. Because of this, this is only suitable for applications where
/// the basic block will be inserted into the same function that it was cloned
/// from (loop unrolling would use this, for example).
///
/// Also, note that this function makes a direct copy of the basic block, and
/// can thus produce illegal LLVM code. In particular, it will copy any PHI
/// nodes from the original block, even though there are no predecessors for the
/// newly cloned block (thus, phi nodes will have to be updated). Also, this
/// block will branch to the old successors of the original block: these
/// successors will have to have any PHI nodes updated to account for the new
/// incoming edges.
///
/// The correlation between instructions in the source and result basic blocks
/// is recorded in the VMap map.
///
/// If you have a particular suffix you'd like to use to add to any cloned
/// names, specify it as the optional third parameter.
///
/// If you would like the basic block to be auto-inserted into the end of a
/// function, you can specify it as the optional fourth parameter.
///
/// If you would like to collect additional information about the cloned
/// function, you can specify a ClonedCodeInfo object with the optional fifth
/// parameter.
///
BasicBlock *CloneBasicBlock(const BasicBlock *BB,
ValueToValueMapTy &VMap,
const Twine &NameSuffix = "", Function *F = nullptr,
ClonedCodeInfo *CodeInfo = nullptr);
I hope this is a little help. Have Fun!
I have C code that calculates the factorial of an int, "factorial.c". I compile it to readable LLVM code, "factorial.ll", and then I modify the compiled LLVM code.
The objective is to execute the modified LLVM code and see its output. How can I do this?
It will depend on how your outputted LLVM is assembled and what libraries it links against, but for example executing the following factorial.ll with the shell command lli
$ lli factorial.ll
Factorial of 10 = 3628800
Will execute the main function with the JIT and use the standard printf to output the result to stdout.
; Format string passed to printf by main, and the external printf declaration.
@.str = private unnamed_addr constant [22 x i8] c"Factorial of %d = %d\0A\00", align 1
declare i32 @printf(i8*, ...)
; Recursive factorial: yields 1 when %n <= 1, otherwise %n * factorial(%n - 1);
; the phi in cond.end selects between the two results.
define i32 @factorial(i32 %n) nounwind uwtable {
entry:
%n.addr = alloca i32, align 4
store i32 %n, i32* %n.addr, align 4
%0 = load i32* %n.addr, align 4
%cmp = icmp sle i32 %0, 1
br i1 %cmp, label %cond.true, label %cond.false
cond.true: ; preds = %entry
br label %cond.end
cond.false: ; preds = %entry
%1 = load i32* %n.addr, align 4
%2 = load i32* %n.addr, align 4
%sub = sub nsw i32 %2, 1
%call = call i32 @factorial(i32 %sub)
%mul = mul nsw i32 %1, %call
br label %cond.end
cond.end: ; preds = %cond.false, %cond.true
%cond = phi i32 [ 1, %cond.true ], [ %mul, %cond.false ]
ret i32 %cond
}
; Entry point: computes factorial(10), prints it with printf using @.str, and returns 0.
define i32 @main(i32 %argc, i8** %argv) nounwind uwtable {
entry:
%retval = alloca i32, align 4
%argc.addr = alloca i32, align 4
%argv.addr = alloca i8**, align 8
store i32 0, i32* %retval
store i32 %argc, i32* %argc.addr, align 4
store i8** %argv, i8*** %argv.addr, align 8
%call = call i32 @factorial(i32 10)
%call1 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([22 x i8]* @.str, i32 0, i32 0), i32 10, i32 %call)
ret i32 0
}