Why doesn't LLVM SIMD-vectorize this code? - llvm

I have the following IR:
; ModuleID = 'vec.ir'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin15.3.0"
define void @patch(i64) {
entry:
%1 = load float, float* inttoptr (i64 4388240000 to float*)
%2 = load float, float* inttoptr (i64 4387644544 to float*)
%3 = fadd float %1, %2
%4 = load float, float* inttoptr (i64 4387729024 to float*)
%5 = fadd float %1, %4
%6 = load float, float* inttoptr (i64 4387730560 to float*)
%7 = fadd float %1, %6
%8 = load float, float* inttoptr (i64 4387513984 to float*)
%9 = fadd float %1, %8
store float %3, float* inttoptr (i64 4371309760 to float*)
call void @__tickValue(i64 105553117467608, i64 %0)
store float %5, float* inttoptr (i64 4371851456 to float*)
call void @__tickValue(i64 105553117465688, i64 %0)
store float %7, float* inttoptr (i64 4371574976 to float*)
call void @__tickValue(i64 105553117465528, i64 %0)
store float %9, float* inttoptr (i64 4371576512 to float*)
call void @__tickValue(i64 105553117466648, i64 %0)
ret void
}
declare void @__tickValue(i64, i64)
When I run /usr/local/opt/llvm/bin/opt -S -O3 vec.ir > vec-opt.ir, I get:
; ModuleID = 'vec.ir'
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin15.3.0"
define void @patch(i64) {
entry:
%1 = load float, float* inttoptr (i64 4388240000 to float*), align 128
%2 = load float, float* inttoptr (i64 4387644544 to float*), align 128
%3 = fadd float %1, %2
%4 = load float, float* inttoptr (i64 4387729024 to float*), align 128
%5 = fadd float %1, %4
%6 = load float, float* inttoptr (i64 4387730560 to float*), align 128
%7 = fadd float %1, %6
%8 = load float, float* inttoptr (i64 4387513984 to float*), align 128
%9 = fadd float %1, %8
store float %3, float* inttoptr (i64 4371309760 to float*), align 64
tail call void @__tickValue(i64 105553117467608, i64 %0)
store float %5, float* inttoptr (i64 4371851456 to float*), align 64
tail call void @__tickValue(i64 105553117465688, i64 %0)
store float %7, float* inttoptr (i64 4371574976 to float*), align 64
tail call void @__tickValue(i64 105553117465528, i64 %0)
store float %9, float* inttoptr (i64 4371576512 to float*), align 64
tail call void @__tickValue(i64 105553117466648, i64 %0)
ret void
}
declare void @__tickValue(i64, i64)
Per http://llvm.org/docs/Vectorizers.html#the-slp-vectorizer, I was hoping the fadd instructions would be combined.
How can I determine more information about why the optimizer isn't vectorizing?

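LLVM has internal cost models that automatically decide whether SIMD vectorization is beneficial. You can get diagnostic information about those decisions by adding flags to your build line (for example, clang's -Rpass=slp-vectorizer, -Rpass-missed=slp-vectorizer and -Rpass-analysis=slp-vectorizer optimization remarks).
You can also try to "force" vectorization by adding directives to your source code (for example, #pragma clang loop vectorize(enable) on a loop).
If you start from LLVM IR rather than from source code, opt has corresponding command-line switches (for example, -pass-remarks=slp-vectorizer, -pass-remarks-missed=slp-vectorizer and -pass-remarks-analysis=slp-vectorizer).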

Related

LLVM IR get a pointer from a llvalue

I'm working with LLVM IR in OCaml to build a toy language, and my current problem is how to obtain a reference (pointer) to a variable.
In other words, my simple program is this
int main(){
int i;
i = 2;
int *p;
p = &i;
print(*p);
return 0;
}
My problem is how to get a pointer to the variable i for the statement p = &i;.
The IR I currently generate is:
define i32 @main() {
entry:
%i = alloca i32
store i32 2, i32* %i
%p = alloca i32*
%0 = getelementptr i32, i32* %i, i32 0
store i32* %0, i32** %p
%1 = load i32*, i32** %p
%2 = load i32, i32* %1
call void @print(i32 %2)
ret i32 0
}
I don't like the line %0 = getelementptr i32, i32* %i, i32 0, and I suspect it is only by luck that my code works as expected.
To summarize, my question is: what is the good practice for performing this kind of memory operation on a variable, as in C? In particular, I need to do the following:
i = 2;
int *p;
p = &i;
And also
int *p;
p = &i;
*p = *p + 2;
I must be missing something, because when I try to compile code like *p = *p + 2; I get a core dump.
I also noticed that clang doesn't use getelementptr for my first example, but generates code like this:
; Function Attrs: noinline nounwind optnone uwtable
define dso_local i32 @main() #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
%3 = alloca i32*, align 8
store i32 0, i32* %1, align 4
store i32 2, i32* %2, align 4
store i32* %2, i32** %3, align 8
%4 = load i32*, i32** %3, align 8
%5 = load i32, i32* %4, align 4
%6 = call i32 (i32, ...) bitcast (i32 (...)* @print to i32 (i32, ...)*)(i32 %5)
ret i32 0
}
In my grammar, *p is a pointer, and I translate it into an LLVM pointer type in the IR.

LLVM Register use in function

In the below code the function has 2 arguments, which I assume are stored in %0 and %1.
Yet the function starts with %3.
What is %2 used for?
define void @swap(i32*, i32*) #0 {
%3 = alloca i32*, align 8
%4 = alloca i32*, align 8
%5 = alloca i32*, align 8
store i32* %0, i32** %3, align 8
store i32* %1, i32** %4, align 8
%6 = load i32*, i32** %3, align 8
store i32* %6, i32** %5, align 8
%7 = load i32*, i32** %4, align 8
store i32* %7, i32** %3, align 8
%8 = load i32*, i32** %5, align 8
store i32* %8, i32** %4, align 8
ret void
}
The above LLVM code was generated with clang from this C code:
void swap(int* i, int* j){
int* temp = i;
i = j;
j = temp;
}
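%2 is the implicit name of the entry basic block: unnamed values (arguments, basic blocks and instruction results) are numbered sequentially, so the two arguments take %0 and %1 and the entry block takes %2. Non-entry basic blocks have their labels printed explicitly, like <label>:123.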

LoadInst and StoreInst Values and addresses LLVM

I have a file print.c, which has two functions:
void printLoad(...) {
// print address and value of the memory location from which the value is loaded
printf("address=... value=...", ...);
}
void printStore(...) {
// print address and value of the memory location to which the value is stored
}
I have an LLVM pass which iterates over the instructions and inserts a CallInst to either printLoad or printStore (depending on the instruction type) after the current load/store instruction.
In order to call this printStore or printLoad I need to add appropriate arguments to CallInst::Create function, which are the address and the value of the memory location.
This is an example of what I want to achieve:
define void @mains() #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
store i32 0, i32* %1, align 4
store i32 5, i32* %1, align 4
store i32 2, i32* %2, align 4
store i32 4, i32* %2, align 4
%3 = load i32, i32* %2, align 4
%4 = add nsw i32 %3, 5
store i32 %4, i32* %1, align 4
ret void
}
The output should be:
store instruction:
address=... // address of %1
value=0
...
...
...
load instruction:
address=... // address of %2
value=4
store instruction:
address=... // address of %1
value=9
Progress so far:
I am able to get the addresses of the operands using getPointerOperand() on LoadInst/StoreInst.
I can also get the value of StoreInst in the first 4 store instructions by casting the operand to ConstantInt, but I don't know how to extract the value in the last StoreInst. Is it even possible?
EDITED:
Using
void printLoad(int32_t p)
and
Constant *hookLoadFunc = M.getOrInsertFunction("printLoad", Type::getVoidTy(M.getContext()), Type::getInt32Ty(M.getContext()));
.
%1 = alloca i32, align 4
%2 = alloca i32, align 4
%3 = alloca i32, align 4
store i32 0, i32* %1, align 4
call void @printStore(i32 0)
store i32 0, i32* %2, align 4
call void @printStore(i32 0)
store i32 5, i32* %2, align 4
call void @printStore(i32 5)
store i32 2, i32* %3, align 4
call void @printStore(i32 2)
store i32 4, i32* %3, align 4
call void @printStore(i32 4)
%4 = load i32, i32* %3, align 4
%5 = add nsw i32 %4, 5
store i32 %5, i32* %2, align 4
call void @printStore(i32 %5)
ret i32 0
%2 = alloca i32, align 4
store i32 %0, i32* %2, align 4
call void @printStore(i32 %0)
%3 = load i32, i32* %2, align 4
%4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str, i32 0, i32 0), i32 %3)
ret void
%2 = alloca i32, align 4
store i32 %0, i32* %2, align 4
call void @printStore(i32 %0)
%3 = load i32, i32* %2, align 4
%4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @.str.1, i32 0, i32 0), i32 %3)
ret void
This causes Segmentation fault: 11 when run.
SOLVED:
I figured out that I had an infinite loop (due to recursion): printStore itself uses load/store instructions, so instrumenting it inserts another call to printStore, and so on.
Assuming that you have an llvm::Function that represents printLoad() and printStore():
llvm::Function * print_load = ....
llvm::Function * print_store = ...
You can emit a CallInst for each LoadInst and StoreInst.
For LoadInst:
LoadInst * some_load = ...
Value * address_of_load = some_load->getOperand(0);
Value * print_load_arguments[] = { address_of_load, some_load };
// Insert a CallInst just after the load.
CallInst::Create(print_load, print_load_arguments )->insertAfter( some_load );
Remember that in llvm the value loaded by the LoadInst is the same thing as the LoadInst itself.
For StoreInst:
StoreInst * some_store = ...
Value * value_to_store = some_store->getOperand(0);
Value * address_of_store = some_store->getOperand(1);
Value * print_store_arguments[] = { address_of_store, value_to_store };
// Insert a CallInst just after the store.
CallInst::Create(print_store, print_store_arguments)->insertAfter(some_store);
This will work if all the types match. Otherwise, you have to insert BitCast instructions just before calling printStore() or printLoad().

Why is this block of LLVM instructions generated?

The DataFlowSanitizer pass on LLVM 3.8.0, 64 bit (Ubuntu 16.04.2) generates the following IR from source:
The source:
test.c
#include <sanitizer/dfsan_interface.h>
int main(void) {
int i = 1;
dfsan_label i_label = dfsan_create_label("i", 0);
dfsan_set_label(i_label, &i, sizeof(i));
return 0;
}
The commands to generate the IR:
clang -c -emit-llvm -fsanitize=dataflow test.c -o test.bc
llvm-dis test.bc
The disassembly:
test.ll
; Function Attrs: nounwind uwtable
define i32 @main() #0 {
entry:
%0 = alloca i16
%retval = alloca i32, align 4
%i = alloca i32, align 4
%1 = alloca i16
%i_label = alloca i16, align 2
store i16 0, i16* %0
store i32 0, i32* %retval, align 4
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
%2 = ptrtoint i32* %i to i64
%3 = and i64 %2, -123145302310913
%4 = mul i64 %3, 2
%5 = inttoptr i64 %4 to i16*
%6 = bitcast i16* %5 to i64*
store i64 0, i64* %6, align 2
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
store i32 1, i32* %i, align 4
%call = call zeroext i16 @dfsan_create_label(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str, i32 0, i32 0), i8* null)
store i16 0, i16* %1
store i16 %call, i16* %i_label, align 2
%7 = load i16, i16* %1
%8 = load i16, i16* %i_label, align 2
%9 = bitcast i32* %i to i8*
call void @dfsan_set_label(i16 zeroext %8, i8* %9, i64 4)
ret i32 0
}
I don't understand why the block of instructions I separated out (between the two lines of semicolons) is being generated. Looking at Transforms/Instrumentation/DataFlowSanitizer.cpp, I can't find the code that inserts the instrumentation above. Can anyone explain this behavior?

How to put Load Instructions into InsertElement Instructions through LLVM pass?

I would like to auto-generate Load and InsertElement instructions in an LLVM pass.
My problem is how to feed the results of Load instructions such as %4, %6, %8 and %10 into InsertElement instructions, as in this target LLVM IR:
%4 = load i32, i32* %1, align 4
%5 = insertelement <4 x i32> undef, i32 %4, i32 0
%6 = load i32, i32* %1, align 4
%7 = insertelement <4 x i32> %5, i32 %6, i32 1
%8 = load i32, i32* %1, align 4
%9 = insertelement <4 x i32> %7, i32 %8, i32 2
%10 = load i32, i32* %1, align 4
%11 = insertelement <4 x i32> %9, i32 %10, i32 3
and my pass:
if (auto *op = dyn_cast<LoadInst>(&I)) {
...
Value* loadinst_ptr=op->getPointerOperand();
Type* load_ty= loadinst_ptr->getType();
Value* val = UndefValue::get(VectorType::get(load_ty, 4));
for (unsigned i = 0; i < 4; i++)//for 4 copies
{
LoadInst* load_val=builder.CreateLoad(loadinst_ptr);
load_val->setAlignment(4);
//my problem is here, parameter load_val is invalid on CreateInsertElement.
val = builder.CreateInsertElement(val,load_val, builder.getInt32(i));
}
}
Here is the error message:
opt: /home/shun/llvm-proj/llvm/lib/IR/Instructions.cpp:1748:
llvm::InsertElementInst::InsertElementInst(llvm::Value*, llvm::Value*, llvm::Value*, const llvm::Twine&, llvm::Instruction*):
Assertion `isValidOperands(Vec, Elt, Index) && "Invalid insertelement instruction operands!"' failed.