LLVM LoopPass values used outside the loop - llvm

I'm writing an LLVM LoopPass in which I need to know which values
are used outside the loop. For that I have this code:
virtual bool runOnLoop(Loop *loop, LPPassManager &LPM)
{
for (auto it = loop->block_begin(); it != loop->block_end(); it++)
{
for (auto inst = (*it)->begin(); inst != (*it)->end(); inst++)
{
if (Is_Used_Outside_This_loop(loop,(Instruction *) inst))
{
errs() << inst->getName().str();
errs() << " is used outside the loop\n";
}
}
}
// ...
}
The inner function seemed right at first, but with the *.ll file below,
it gives incorrect classification for %tmp5 since it is used twice
inside a basic block of the loop.
bool Is_Used_Outside_This_loop(Loop *loop, Value *v)
{
int n=0;
int numUses = v->getNumUses();
for (auto it = loop->block_begin(); it != loop->block_end(); it++)
{
if (v->isUsedInBasicBlock(*it))
{
n++;
}
}
if (n == numUses) return false;
else return true;
}
The following *.ll code shows that %tmp5 is used twice
inside a basic block of the loop. When I carefully searched the API,
I couldn't find anything like Value::numUsesInBasicBlock( ... )
; Function Attrs: nounwind uwtable
define internal void #foo(i8* %s) #0 {
entry:
%s.addr = alloca i8*, align 8
%c = alloca i8, align 1
store i8* %s, i8** %s.addr, align 8
store i8 0, i8* %c, align 1
br label %while.cond
while.cond: ; preds = %while.body, %entry
%tmp = load i8*, i8** %s.addr, align 8
%tmp1 = load i8, i8* %tmp, align 1
%conv = sext i8 %tmp1 to i32
%cmp = icmp eq i32 %conv, 97
br i1 %cmp, label %lor.end, label %lor.rhs
lor.rhs: ; preds = %while.cond
%tmp2 = load i8*, i8** %s.addr, align 8
%tmp3 = load i8, i8* %tmp2, align 1
%conv2 = sext i8 %tmp3 to i32
%cmp3 = icmp eq i32 %conv2, 98
br label %lor.end
lor.end:; preds = %lor.rhs, %while.cond
%tmp4 = phi i1 [ true, %while.cond ], [ %cmp3, %lor.rhs ]
br i1 %tmp4, label %while.body, label %while.end
while.body: ; preds = %lor.end
%tmp5 = load i8*, i8** %s.addr, align 8
%incdec.ptr = getelementptr inbounds i8, i8* %tmp5, i32 1
store i8* %incdec.ptr, i8** %s.addr, align 8
%tmp6 = load i8, i8* %tmp5, align 1
store i8 %tmp6, i8* %c, align 1
br label %while.cond
while.end: ; preds = %lor.end
%tmp7 = load i8*, i8** %s.addr, align 8
%tmp8 = load i8, i8* %tmp7, align 1
%conv5 = sext i8 %tmp8 to i32
%cmp6 = icmp eq i32 %conv5, 99
br i1 %cmp6, label %if.then, label %if.end
if.then: ; preds = %while.end
%tmp9 = load i8*, i8** %s.addr, align 8
%incdec.ptr8 = getelementptr inbounds i8, i8* %tmp9, i32 1
store i8* %incdec.ptr8, i8** %s.addr, align 8
br label %if.end
if.end: ; preds = %if.then, %while.end
ret void
}
Clearly, there's got to be a way of doing this, right? Thanks!

The problem is your exit condition. You ask for uses, but then compare with the number of blocks that the value is used in.
So, if the value is used twice in the same basic block, the number of uses is 2 and the n counter of basic blocks that used the value is only incremented once, hence the mismatch of n and numUses.
Maybe a more concise way to do what you want is:
void FindUsesNotIn(
llvm::SmallPtrSetImpl<llvm::BasicBlock *> &Blocks,
llvm::SmallPtrSetImpl<llvm::Value *> &OutUses) {
for(const auto &b : Blocks)
for(auto &i : *b)
for(const auto &u : i.users()) {
auto *userInst = llvm::dyn_cast<llvm::Instruction>(u);
if(userInst && !Blocks.count(userInst->getParent())) {
OutUses.insert(&i);
break;
}
}
}
and then in the runOnLoop method have something like this:
virtual bool runOnLoop(llvm::Loop *loop, llvm::LPPassManager &LPM) {
llvm::SmallPtrSet<llvm::BasicBlock*, 10> loopBlocks(loop->block_begin(), loop->block_end());
llvm::SmallPtrSet<llvm::Value *, 10> outs;
FindUsesNotIn(loopBlocks, outs);
for(const auto *e : outs)
llvm::dbgs() << *e << '\n';
return false;
}

Here's how I solved it, though it looks overly complicated:
virtual bool runOnLoop(Loop *loop, LPPassManager &LPM)
{
for (auto it = loop->block_begin(); it != loop->block_end(); it++)
{
for (auto inst = (*it)->begin(); inst != (*it)->end(); inst++)
{
int n=0;
for (auto use = inst->use_begin(); use != inst->use_end(); use++)
{
Instruction *i = (Instruction *) use->getUser();
if (BasicBlockBelongsToLoop(i->getParent(),loop))
{
n++;
}
}
assert(n <= inst->getNumUses());
if (n < inst->getNumUses())
{
errs() << inst->getName().str();
errs() << " is used outside the loop\n";
}
}
}
// ...
I also couldn't found in the API, how to check if a basic block belongs to a loop,
so I had to write my own BasicBlockBelongsToLoop, here it is:
bool BasicBlockBelongsToLoop(BasicBlock *BB, Loop *loop)
{
for (auto it = loop->block_begin(); it != loop->block_end(); it++)
{
if (BB == (*it))
{
return true;
}
}
return false;
}

Related

Algorithm that was supposed to go into recurse does not go into recurse

I am pretty new to LLVM IR and I am trying to convert the following into LLVM IR
double sum(double *input, int n) {
double result = input[0] + ... + input[n-1]
return result
}
and this is what I have done:
%free_func = type void (double*)*
%list = type { %list*, double*, %free_func }
define double #sum(double* %ptr, i32 %n) {
entry:
%var1 = alloca i32
store i32 %n, i32* %var1
%var2 = alloca double*
store double* %ptr, double** %var2
%conv1.1 = load i32, i32* %var1
%tmp1 = icmp sle i32 %conv1.1, 0
br i1 %tmp1, label %recurse, label %done
recurse:
%0 = bitcast double** %var2 to %list* ; base field
%1 = bitcast double** %var2 to %list** ; next field
%2 = bitcast double** %var2 to %list*** ; next next field
%3 = bitcast %list* %0 to i16*
%conv2.1 = load i16, i16* %3
%4 = bitcast %list** %1 to i16*
%conv2.2 = load i16, i16* %4
%tmp2 = add nsw i16 %conv2.2, %conv2.1 ; add base value & next value
%tmp3 = alloca i16
store i16 %tmp2, i16* %tmp3
%5 = bitcast i16* %tmp3 to %list* ; convert vector back to list
store %list* %5, %list** %1 ; replace next value with sum
%conv3.1 = load %list*, %list** %1 ; replace base with next
%conv3.2 = load %list**, %list*** %2 ; replace next with next next
%6 = bitcast %list* %conv3.1 to double* ; convert list back to double
%tmp5 = sub nsw i32 %conv1.1, 1
%tmp6 = call double #sum(double* %6, i32 %tmp5)
ret double %tmp6
done:
%conv6.1 = load double, double* %ptr
ret double %conv6.1
}
It seems to go straight to done without going through recurse and I am not sure what is wrong. May I ask for tips on how to make this work? Thank you and I am sorry if the code looks messy because I've only just started learning and am practising.

Delete complete branch from llvm ir

There is a branch in ir that I want to delete completely(condtion + branch + true_basic_block + false_basic_block). It looks like this:
%4 = icmp sge i32 %2, %3
br i1 %4, label %5, label %7
; <label>:5 ; preds = %0
%6 = load i32* %x, align 4
store i32 %6, i32* %z, align 4
br label %9
; <label>:7 ; preds = %0
%8 = load i32* %y, align 4
store i32 %8, i32* %z, align 4
br label %9
; <label>:9 ; preds = %7, %5
%10 = call dereferenceable(140) %"class.std::basic_ostream"*#_ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc(%"class.std::basic_ostream"* dereferenceable(140) #_ZSt4cout, i8* getelementptr inbounds ([5 x i8]* #.str, i32 0, i32 0))
%11 = load i32* %z, align 4
%12 = call dereferenceable(140) %"class.std::basic_ostream"* #_ZNSolsEi(%"class.std::basic_ostream"* %10, i32 %11)
%13 = call dereferenceable(140) %"class.std::basic_ostream"* #_ZNSolsEPFRSoS_E(%"class.std::basic_ostream"* %12, %"class.std::basic_ostream"* (%"class.std::basic_ostream"*)* #_ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_)
ret i32 0
Now to delete it , is there a removeBranch function , or do I need to delete instructions one by one. I have been trying the latter way but I have seen every error from "Basic block in main does not have an terminator" to "use remains when def is destroyed", and many more.. I have used erasefromparent, replaceinstwithvalue, replaceinstwithinst, removefromparent, etc.
Can anyone be kind enough to point me in the correct direction?
This is my function_pass :
bool runOnFunction(Function &F) override {
for (auto& B : F)
for (auto& I : B)
if(auto* brn = dyn_cast<BranchInst>(&I))
if(brn->isConditional()){
Instruction* cond = dyn_cast<Instruction>(brn->getCondition());
if(cond->getOpcode() == Instruction::ICmp){
branch_vector.push_back(brn);
//removeConditionalBranch(dyn_cast<BranchInst>(brn));
}
}
/*For now just delete the branches in the vector.*/
for(auto b : branch_vector)
removeConditionalBranch(dyn_cast<BranchInst>(b));
return true;
}
This is the output :
I don't know of any RemoveBranch utility function, but something like this should work. The idea is to delete the branch instruction, then delete anything that becomes dead as a result, and then merge the initial block with the join block.
// for DeleteDeadBlock, MergeBlockIntoPredecessor
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
// for RecursivelyDeleteTriviallyDeadInstructions
#include "llvm/Transforms/Utils/Local.h"
void removeConditionalBranch(BranchInst *Branch) {
assert(Branch &&
Branch->isConditional() &&
Branch->getNumSuccessors() == 2);
BasicBlock *Parent = Branch->getParent();
BasicBlock *ThenBlock = Branch->getSuccessor(0);
BasicBlock *ElseBlock = Branch->getSuccessor(1);
BasicBlock *ThenSuccessor = ThenBlock->getUniqueSuccessor();
BasicBlock *ElseSuccessor = ElseBlock->getUniqueSuccessor();
assert(ThenSuccessor && ElseSuccessor && ThenSuccessor == ElseSuccessor);
Branch->eraseFromParent();
RecursivelyDeleteTriviallyDeadInstructions(Branch->getCondition());
DeleteDeadBlock(ThenBlock);
DeleteDeadBlock(ElseBlock);
IRBuilder<> Builder(Parent);
Builder.CreateBr(ThenSuccessor);
bool Merged = MergeBlockIntoPredecessor(ThenSuccessor);
assert(Merged);
}
This code only handles the simple case you've shown, with the then and else blocks both jumping unconditionally to a common join block (it will fail with an assertion error for anything more complicated). More complicated control flow will be a bit trickier to handle, but you should still be able to use this code as a starting point.

LLVM "Instruction does not dominate all uses" - Inserting new Instruction

I am getting the following error while inserting an instruction using an llvm pass:
Instruction does not dominate all uses!
%add = add nsw i32 10, 2
%cmp3 = icmp ne i32 %a.01, %add
Broken module found, compilation aborted!
I have the source code in a bitcode file whose snippet is:
if.then: ; preds = %entry
%add = add nsw i32 10, 2
br label %if.end
if.else: ; preds = %entry
%sub = sub nsw i32 10, 2
br label %if.end
if.end: ; preds = %if.else, %if.then
%a.0 = phi i32 [ %add, %if.then ], [ %sub, %if.else ]
%a.01 = call i32 #tauInt32Ty(i32 %a.0) ; line A
%add3 = add nsw i32 %a.01, 2
%add4 = add nsw i32 %a.01, 3
%call5 = call i32 (i8*, ...)* #printf(i8* getelementptr inbounds ([7 x i8]* #.str2, i32 0, i32 0), i32 %add3, i32 %add4)
I want to insert a new instruction after "line A" which is :
%cmp3 = icmp ne i32 %a.01, %add
And I have written a function pass whose snippet of the code which does this task is :
for (Function::iterator bb = F.begin(), e = F.end(); bb != e; ++bb) {
for (BasicBlock::iterator i = bb->begin(), e = bb->end(); i != e; ++i) {
std::string str;
if(isa<CallInst>(i))// || true) {
BasicBlock::iterator next_it = i;
next_it++;
Instruction* next = dyn_cast<Instruction>(&*next_it);
CallInst* ci = dyn_cast<CallInst>(&*i);
Function* ff = ci->getCalledFunction();
str = ff->getName();
errs()<<"> "<<str<<"\n";
if(!str.compare("tauInt32Ty")) {
hotPathSSA1::varVersionWithPathsSet::iterator start = tauArguments[&*ci].begin();
hotPathSSA1::varVersionWithPathsSet::iterator end = tauArguments[&*ci].end();
Value* specArgs = start->second; // specArgs points to %add
ICmpInst* int1_cmp_56 = new ICmpInst(next, ICmpInst::ICMP_NE, ci, specArgs, "cmp3");
}
}
}
}
I have not encountered such a problem jet but I think your problem is the if statement. %add belonges to the if.then BasicBlock and it is not accessable from the if.end block. This is why the phi instruction "chooses" which value is available %add or %sub. So you have to take %a.0 for your IcmpInst as argument not %add.

How to create an array of classes types?

I have a single class "Base", and a few tens of classes derived from Base. I would like to have a method that creates me the right class by an index. Like this:
class Base
{
};
class A : public Base
{
}
class B : public Base
{
}
class C : public Base
{
}
Type array = { A, B, C };
and then I could do new array[i];
How could this be achieved with C++(0x)? Usually I would use an the Abstract Factory Pattern. But since I have a LOT of derived classes, this would really slow down the program.
Since the derived classes will be used only once I also taught to use this:
Base *array = { new A, new B, new C };
But this would lead to huge memory consumption, not counting that not every class will always be used.
Any suggestion?
You cannot use an array of classes, but you can use an array of pointers to functions.
typedef std::unique_ptr<Base> (*Creator)();
template <typename T>
std::unique_ptr<Base> make() { return new T{}; }
Creator const array[] = { make<A>, make<B>, make<C> };
int main() {
std::unique_ptr<Base> b = array[1]();
b->foo();
}
For those worried by the cost of creating so many template functions, here is an example:
#include <stdio.h>
struct Base { virtual void foo() const = 0; };
struct A: Base { void foo() const { printf("A"); } };
struct B: Base { void foo() const { printf("B"); } };
struct C: Base { void foo() const { printf("C"); } };
typedef Base* (*Creator)();
template <typename T>
static Base* make() { return new T{}; }
static Creator const array[] = { make<A>, make<B>, make<C> };
Base* select_array(int i) {
return array[i]();
}
Base* select_switch(int i) {
switch(i) {
case 0: return make<A>();
case 1: return make<B>();
case 2: return make<C>();
default: return 0;
}
}
LLVM/Clang generates the following output:
define %struct.Base* #select_array(int)(i32 %i) uwtable {
%1 = sext i32 %i to i64
%2 = getelementptr inbounds [3 x %struct.Base* ()*]* #array, i64 0, i64 %1
%3 = load %struct.Base* ()** %2, align 8, !tbaa !0
%4 = tail call %struct.Base* %3()
ret %struct.Base* %4
}
define noalias %struct.Base* #select_switch(int)(i32 %i) uwtable {
switch i32 %i, label %13 [
i32 0, label %1
i32 1, label %5
i32 2, label %9
]
; <label>:1 ; preds = %0
%2 = tail call noalias i8* #operator new(unsigned long)(i64 8)
%3 = bitcast i8* %2 to i32 (...)***
store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*]* #vtable for A, i64 0, i64 2) to i32 (...)**), i32 (...)*** %3, align 8
%4 = bitcast i8* %2 to %struct.Base*
br label %13
; <label>:5 ; preds = %0
%6 = tail call noalias i8* #operator new(unsigned long)(i64 8)
%7 = bitcast i8* %6 to i32 (...)***
store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*]* #vtable for B, i64 0, i64 2) to i32 (...)**), i32 (...)*** %7, align 8
%8 = bitcast i8* %6 to %struct.Base*
br label %13
; <label>:9 ; preds = %0
%10 = tail call noalias i8* #operator new(unsigned long)(i64 8)
%11 = bitcast i8* %10 to i32 (...)***
store i32 (...)** bitcast (i8** getelementptr inbounds ([3 x i8*]* #vtable for C, i64 0, i64 2) to i32 (...)**), i32 (...)*** %11, align 8
%12 = bitcast i8* %10 to %struct.Base*
br label %13
; <label>:13 ; preds = %9, %5, %1, %0
%.0 = phi %struct.Base* [ %12, %9 ], [ %8, %5 ], [ %4, %1 ], [ null, %0 ]
ret %struct.Base* %.0
}
Unfortunately, it is not quite intelligent enough to automatically inline the functions with a regular array code (known issue with the LLVM optimizer, I don't know if gcc does better)... but using switch it is indeed possible.
typedef Base* BaseMaker();
template <class X> Base* make() {
return new X;
}
BaseMaker* makers[] = { make<A>, make<B>, make<C> };
Base* b = makers[2]();

nested if vs loop condition

I have to do a comparison and I want to know which will be faster.
1)
for (i=0;i<4;i++){
if (object1(i)==object2(i))
retval = true;
else {
retval = false;
break;
}
}
2)
if ( (object1(0)==object2(0) && (object1(1)==object2(1) && (object1(2)==object2(2) && (object1(3)==object2(3)){
retval = true;
else
retval = false;
Or both will perform the same?
Thanks for Advice
Strictly speaking the most efficient path would be:
retval = object1(0) == object2(0) && object1(1) == object2(1).....
This basically does the same as your loop, but doesn't have to compare the result to true to determine the outcome of the condition.
However, I strongly recommend keeping the loop, as it is far easier to adapt to add or remove numbers.
You need to measure. But in any case the first code can be simplified quite a bit:
for (i = 0; i < 4; ++i)
if (object1(i) != object2(i))
return false;
return true;
Now choose the more readable form. I’d choose the loop here, unless you have confirmed that there is a performance problem caused by this code.
If the optimization flags are on, then the compiler might produce same machine instructtions for both code, unlooping the for loop completely, as the exact number of iteration is known to the compiler:
loop unrolling
By the way, if you care so much, then you could write this:
bool retValue = (object1(0)==object2(0)) &&
(object1(1)==object2(1)) &&
(object1(2)==object2(2)) &&
(object1(3)==object2(3));
which avoids both: for loop, as well as if-else branch, and it doesn't depend on compiler optimization.
As always with optimization, the one and single rule is MEASURE.
Furthermore, I guess that the compiler could optimize this code in some ways you (and I) couldn't even imagine. Therefore I'd suggest to write it in the most readable form.
I like to play with the Try out LLVM and Clang page for this:
struct Object {
int operator()(int i) const;
};
bool loop(Object const& left, Object const& right) {
bool retval = false;
for (int i = 0; i < 4; i++) {
if (left(i) == right(i) )
retval = true;
else {
retval = false;
break;
}
}
return true;
}
bool inlineif(Object const& left, Object const& right) {
bool retval = true;
if ( left(0) == right(0) &&
left(1) == right(1) &&
left(2) == right(2) &&
left(3) == right(3))
retval = true;
else
retval = false;
return retval;
}
bool betterloop(Object const& left, Object const& right) {
for (int i = 0; i < 4; ++i)
if (left(i) != right(i))
return false;
return true;
}
bool betterif(Object const& left, Object const& right) {
return left(0) == right(0) &&
left(1) == right(1) &&
left(2) == right(2) &&
left(3) == right(3);
}
Produces the following IR for loops (regardless of how they are written):
define zeroext i1 #_Z4loopRK6ObjectS1_(%struct.Object* %left, %struct.Object* %right) uwtable {
br label %1
; <label>:1 ; preds = %7, %0
%i.0 = phi i32 [ 0, %0 ], [ %8, %7 ]
%2 = icmp slt i32 %i.0, 4
br i1 %2, label %3, label %9
; <label>:3 ; preds = %1
%4 = tail call i32 #_ZNK6ObjectclEi(%struct.Object* %left, i32 %i.0)
%5 = tail call i32 #_ZNK6ObjectclEi(%struct.Object* %right, i32 %i.0)
%6 = icmp eq i32 %4, %5
br i1 %6, label %7, label %9
; <label>:7 ; preds = %3
%8 = add nsw i32 %i.0, 1
br label %1
; <label>:9 ; preds = %3, %1
ret i1 true
}
And a very similar IR for the two if (so I'll give only one):
define zeroext i1 #_Z8betterifRK6ObjectS1_(%struct.Object* %left, %struct.Object* %right) uwtable {
%1 = tail call i32 #_ZNK6ObjectclEi(%struct.Object* %left, i32 0)
%2 = tail call i32 #_ZNK6ObjectclEi(%struct.Object* %right, i32 0)
%3 = icmp eq i32 %1, %2
br i1 %3, label %4, label %16
; <label>:4 ; preds = %0
%5 = tail call i32 #_ZNK6ObjectclEi(%struct.Object* %left, i32 1)
%6 = tail call i32 #_ZNK6ObjectclEi(%struct.Object* %right, i32 1)
%7 = icmp eq i32 %5, %6
br i1 %7, label %8, label %16
; <label>:8 ; preds = %4
%9 = tail call i32 #_ZNK6ObjectclEi(%struct.Object* %left, i32 2)
%10 = tail call i32 #_ZNK6ObjectclEi(%struct.Object* %right, i32 2)
%11 = icmp eq i32 %9, %10
br i1 %11, label %12, label %16
; <label>:12 ; preds = %8
%13 = tail call i32 #_ZNK6ObjectclEi(%struct.Object* %left, i32 3)
%14 = tail call i32 #_ZNK6ObjectclEi(%struct.Object* %right, i32 3)
%15 = icmp eq i32 %13, %14
br label %16
; <label>:16 ; preds = %12, %8, %4, %0
%17 = phi i1 [ false, %8 ], [ false, %4 ], [ false, %0 ], [ %15, %12 ]
ret i1 %17
}
The important instructions here is br which is the branching instruction. It can be used either as a simple goto or with conditions on the edges:
br i1 %11, label %12, label %16
means if i1 is true, go to label %12, otherwise go to label %16.
It seems that "naturally" LLVM will not unroll the traditional loop version, so the if version performs better here. I am quite surprised, actually, that it does not and I cannot figure out why it would not...
So, the inline if code might be a bit faster, but it might also be unnoticeable depending on the cost of left(i) == right(i) (and even then), as CPU are quite good at branch prediction.