ML-IR to LLVM IR generates wrong IR - llvm

I am following this toy example to lower MLIR to LLVM IR and compile the result. Following the instructions, I can build the project. However, the generated LLVM IR (the print.ll file) differs from what is in the GitHub repository. For instance, print.ll contains ptr where the repository version has i8* (example: declare ptr @malloc(i64)). As a result, lli cannot parse the file and reports an error. I am using LLVM version 14.0.0 on Ubuntu.
Error:
lli: lli: ./print2.ll:9:9: error: expected type declare ptr #malloc(i64)
File print.ll:
; ModuleID = 'LLVMDialectModule'
source_filename = "LLVMDialectModule"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
#nl = internal constant [2 x i8] c"\0A\00"
#frmt_spec = internal constant [4 x i8] c"%f \00"
**declare ptr #malloc(i64)**
declare void #free(ptr)
declare i32 #printf(ptr, ...)
define void #main() {
%1 = call ptr #malloc(i64 ptrtoint (ptr getelementptr (double, ptr null, i64 6) to i64))
%2 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } undef, ptr %1, 0
%3 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %2, ptr %1, 1
%4 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %3, i64 0, 2
%5 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %4, i64 2, 3, 0
%6 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %5, i64 3, 3, 1
%7 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %6, i64 3, 4, 0
%8 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %7, i64 1, 4, 1
%9 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %8, 1
%10 = getelementptr double, ptr %9, i64 0
store double 1.000000e+00, ptr %10, align 8
%11 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %8, 1
%12 = getelementptr double, ptr %11, i64 1
store double 2.000000e+00, ptr %12, align 8
%13 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %8, 1
%14 = getelementptr double, ptr %13, i64 2
store double 3.000000e+00, ptr %14, align 8
%15 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %8, 1
%16 = getelementptr double, ptr %15, i64 3
store double 4.000000e+00, ptr %16, align 8
%17 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %8, 1
%18 = getelementptr double, ptr %17, i64 4
store double 5.000000e+00, ptr %18, align 8
%19 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %8, 1
%20 = getelementptr double, ptr %19, i64 5
store double 6.000000e+00, ptr %20, align 8
br label %21
21: ; preds = %36, %0
%22 = phi i64 [ 0, %0 ], [ %38, %36 ]
%23 = icmp slt i64 %22, 2
br i1 %23, label %24, label %39
24: ; preds = %21
br label %25
25: ; preds = %28, %24
%26 = phi i64 [ 0, %24 ], [ %35, %28 ]
%27 = icmp slt i64 %26, 3
br i1 %27, label %28, label %36
28: ; preds = %25
%29 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %8, 1
%30 = mul i64 %22, 3
%31 = add i64 %30, %26
%32 = getelementptr double, ptr %29, i64 %31
%33 = load double, ptr %32, align 8
%34 = call i32 (ptr, ...) #printf(ptr #frmt_spec, double %33)
%35 = add i64 %26, 1
br label %25
36: ; preds = %25
%37 = call i32 (ptr, ...) #printf(ptr #nl)
%38 = add i64 %22, 1
br label %21
39: ; preds = %21
%40 = extractvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %8, 0
call void #free(ptr %40)
ret void
}
!llvm.module.flags = !{!0}
!0 = !{i32 2, !"Debug Info Version", i32 3}

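The LLVM version that MLIR was built against (and that generated the IR) must match the version of the LLVM tools you run locally. In your case the generated IR uses opaque pointers (the ptr type), which your local LLVM 14.0.0 lli does not accept, so you need to upgrade it to a newer release – one that does support opaque pointers.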
You can install the latest LLVM using instructions from https://apt.llvm.org/

Related

Getting the value stored from register in llvm IR

I have a simple C program.
int
1.main(int argc, char **argv) {
2. unsigned buffer[4] = { 0, 0, 0, 0 };
3. return buffer[argc];
4. }
And the IR code is as below
; Function Attrs: norecurse nounwind readnone uwtable
define i32 #main(i32 %argc, i8** nocapture readnone %argv) #0 !dbg !6 {
%buffer = alloca [4 x i32], align 16
tail call void #llvm.dbg.value(metadata i32 %argc, i64 0, metadata !14, metadata !21), !dbg !22
tail call void #llvm.dbg.value(metadata i8** %argv, i64 0, metadata !15, metadata !21), !dbg !23
%1 = bitcast [4 x i32]* %buffer to i8*, !dbg !24
call void #llvm.lifetime.start(i64 16, i8* %1) #3, !dbg !24
tail call void #llvm.dbg.declare(metadata [4 x i32]* %buffer, metadata !16, metadata !21), !dbg !25
call void #llvm.memset.p0i8.i64(i8* %1, i8 0, i64 16, i32 16, i1 false), !dbg !26
%2 = sext i32 %argc to i64, !dbg !28
%3 = getelementptr inbounds [4 x i32], [4 x i32]* %buffer, i64 0, i64 %2, !dbg !28
%4 = load i32, i32* %3, align 4, !dbg !28, !tbaa !29
call void #llvm.lifetime.end(i64 16, i8* %1) #3, !dbg !33
ret i32 %4, !dbg !34
}
I want to check whether the index used for the access at line 3 is a valid index. For this comparison I need to extract the value stored in argc. Below is the piece of code I have written to obtain the value of argc:
auto gep = llvm::dyn_cast<llvm::GetElementPtrInst>(inst);
auto operand2 = gep->getOperand(2);
outs() << "operand 2 "<<*operand2<<"\n";
auto newOperand =operand2.getOperand(0);
outs()<<"New operand "<<*newOperand<<"\n";
Output :-
operand 2 %2 = sext i32 %argc to i64, !dbg !28
New operand i32 %argc
How can I get the value of %argc?
The return value of getOperand is a Value* that represents the argc variable (newOperand in your code). Because %argc is a function argument, its concrete value is only known at run time; what you can do is pass that Value* to any new instruction you inject into the IR (for example, one built with CreateICmpEQ) to compare the value of argc with some constant.

LLVM IR: <N x i1> vector instructions ordering causing different results

I have been using LLVM as a backend for my compiler (I'm not using the LLVM libraries but my own to generate the necessary IR for numerous reasons). At the moment, I am implementing vector operations. Comparisons of vectors emit vectors of booleans <N x i1> and these are causing me problems.
To access vector elements, I have been using extractelement and insertelement; however, I am getting some weird behaviour when I execute these instructions in a different order. The code examples below have the same instructions and should be logically the same. Version 1 outputs BAA while Version 2 outputs BAB. Version 2 is the logically correct version, but I cannot figure out why Version 1 outputs the wrong result when it has the exact same instructions, just in a different order.
EDIT: A workaround to solve this problem is to pass the IR to opt -mem2reg and then compile the bitcode. This is okay but not useful for debugging and thus does not solve my problem.
Version 1
; Version 1 - Generated by my naïve SSA generator
; Outputs: BAA (incorrect)
declare i32 #putchar(i32)
define void #main() {
entry:
%0 = alloca <8 x i1>, align 8 ; v
store <8 x i1> zeroinitializer, <8 x i1>* %0
%1 = alloca <8 x i1>, align 8
store <8 x i1> zeroinitializer, <8 x i1>* %1
%2 = load <8 x i1>, <8 x i1>* %1, align 8
%3 = insertelement <8 x i1> %2, i1 true, i64 0
%4 = insertelement <8 x i1> %3, i1 false, i64 1
%5 = insertelement <8 x i1> %4, i1 true, i64 2
%6 = insertelement <8 x i1> %5, i1 false, i64 3
%7 = insertelement <8 x i1> %6, i1 true, i64 4
%8 = insertelement <8 x i1> %7, i1 false, i64 5
%9 = insertelement <8 x i1> %8, i1 true, i64 6
%10 = insertelement <8 x i1> %9, i1 false, i64 7
store <8 x i1> %10, <8 x i1>* %0
%11 = load <8 x i1>, <8 x i1>* %0, align 8
%12 = extractelement <8 x i1> %11, i64 0
%13 = zext i1 %12 to i32
%14 = add i32 %13, 65 ; + 'A'
%15 = call i32 #putchar(i32 %14)
%16 = load <8 x i1>, <8 x i1>* %0, align 8
%17 = extractelement <8 x i1> %16, i64 1
%18 = zext i1 %17 to i32
%19 = add i32 %18, 65 ; + 'A'
%20 = call i32 #putchar(i32 %19)
%21 = load <8 x i1>, <8 x i1>* %0, align 8
%22 = extractelement <8 x i1> %21, i64 2
%23 = zext i1 %22 to i32
%24 = add i32 %23, 65 ; + 'A'
%25 = call i32 #putchar(i32 %24)
%26 = call i32 #putchar(i32 10) ; \n
ret void
}
Version 2
; Version 2 - Manually modified version of Version 1
; Outputs: BAB (correct)
declare i32 #putchar(i32)
define void #main() {
entry:
%0 = alloca <8 x i1>, align 8 ; v
store <8 x i1> zeroinitializer, <8 x i1>* %0
%1 = alloca <8 x i1>, align 8
store <8 x i1> zeroinitializer, <8 x i1>* %1
%2 = load <8 x i1>, <8 x i1>* %1, align 8
%3 = insertelement <8 x i1> %2, i1 true, i64 0
%4 = insertelement <8 x i1> %3, i1 false, i64 1
%5 = insertelement <8 x i1> %4, i1 true, i64 2
%6 = insertelement <8 x i1> %5, i1 false, i64 3
%7 = insertelement <8 x i1> %6, i1 true, i64 4
%8 = insertelement <8 x i1> %7, i1 false, i64 5
%9 = insertelement <8 x i1> %8, i1 true, i64 6
%10 = insertelement <8 x i1> %9, i1 false, i64 7
store <8 x i1> %10, <8 x i1>* %0
%11 = load <8 x i1>, <8 x i1>* %0, align 8
%12 = load <8 x i1>, <8 x i1>* %0, align 8
%13 = load <8 x i1>, <8 x i1>* %0, align 8
%14 = extractelement <8 x i1> %11, i64 0
%15 = extractelement <8 x i1> %12, i64 1
%16 = extractelement <8 x i1> %13, i64 2
%17 = zext i1 %14 to i32
%18 = zext i1 %15 to i32
%19 = zext i1 %16 to i32
%20 = add i32 %17, 65 ; + 'A'
%21 = add i32 %18, 65 ; + 'A'
%22 = add i32 %19, 65 ; + 'A'
%23 = call i32 #putchar(i32 %20)
%24 = call i32 #putchar(i32 %21)
%25 = call i32 #putchar(i32 %22)
%26 = call i32 #putchar(i32 10) ; \n
ret void
}

Conversion of vector of bool's to integer in llvm ir

I am writing LLVM IR code that involves vector operations. I did an integer vector comparison with the 'icmp' instruction, which resulted in a vector of bools, say <8 x i1>. My problem is that I want to convert these 8 bits to the corresponding integer value without traversing the vector (i.e. without extracting elements from it). I tried 'bitcast <8 x i1> to i8', which seems to convert only the first bit of the vector to i8; correct me if I am wrong. Can someone suggest a way to do this?
define i8 #main() #0 {
entry:
%A = alloca [8 x i32], align 16
%B = alloca [8 x i32], align 16
%arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 0
store i32 90, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 1
store i32 91, i32* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 2
store i32 92, i32* %arrayidx2, align 8
%arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 3
store i32 93, i32* %arrayidx3, align 4
%arrayidx4 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 0
store i32 90, i32* %arrayidx4, align 4
%arrayidx5 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 1
store i32 1, i32* %arrayidx5, align 4
%arrayidx6 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 2
store i32 92, i32* %arrayidx6, align 8
%arrayidx7 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 3
store i32 93, i32* %arrayidx7, align 4
br label %vector.body
vector.body:
%0 = bitcast [8 x i32]* %A to <8 x i32>*
%1 = bitcast [8 x i32]* %B to <8 x i32>*
%2 = load <8 x i32>, <8 x i32>* %0
%3 = load <8 x i32>, <8 x i32>* %1
%4 = icmp eq <8 x i32> %2, %3
%5 = bitcast <8 x i1> %4 to i8
ret i8 %5;
}
I am using 'lli' to run this code without any flags. The output is expected to be 11, but I am getting 1 or 0.
Thank you so much in advance.
As far as I understand, you can't do that without calling a platform-specific intrinsic. I noticed this when I was unable to write target-independent code for it in C++.
For example, the code below:
typedef int v8i __attribute__((vector_size(32)));
int main() {
v8i a = { 1, 2, 3, 4, 5, 6, 7, 8};
v8i b = { 0, 2, 3, 4, 5, 6, 7, 0};
v8i cmp = (a == b);
char res = *(char*)&cmp;
printf("%d\n", res);
return 0;
}
produces LLVM IR which is quite close to what you wrote (with the appropriate bitcast).
Unfortunately it didn't work as expected.
That's because <8 x i1> doesn't exist on the processor. For example, in x86 AVX2, _mm256_cmpeq_epi32 yields a __m256i.
Bitcasting that to a char will just take the first 8 bits of that register.
Instead, I wrote Intel AVX2-specific code and found the appropriate instruction in the Intel Intrinsics Guide.
So this code does what you need:
#include <cstdio>
#include <cstdlib>
#include <immintrin.h>
int main() {
__m256i a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
__m256i b = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 0);
__m256i eq = _mm256_cmpeq_epi32(a, b);
int res = _mm256_movemask_ps(_mm256_castsi256_ps(eq));
printf("res = %d\n", res);
for(int i = 0; i < 8; ++i) {
printf("%d %d -> %d\n", _mm256_extract_epi32(a, i), _mm256_extract_epi32(b, i), !!((res << i) & 0x80));
}
return 0;
}
In terms of .ll code, it turns out you need a few additional conversions (a sign extension and a bitcast to float) and a call to the intrinsic
@llvm.x86.avx.movmsk.ps.256
Rewriting the LLVM IR by hand leads to:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
#formatString = private constant [4 x i8] c"%d\0A\00"
define i32 #main() #0 {
%a = alloca <8 x i32>, align 32
%b = alloca <8 x i32>, align 32
store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %a, align 32
store <8 x i32> <i32 0, i32 2, i32 3, i32 0, i32 5, i32 0, i32 7, i32 0>, <8 x i32>* %b, align 32
%1 = load <8 x i32>, <8 x i32>* %a, align 32
%2 = load <8 x i32>, <8 x i32>* %b, align 32
%3 = icmp eq <8 x i32> %1, %2
%4 = sext <8 x i1> %3 to <8 x i32>
%5 = bitcast <8 x i32> %4 to <8 x float>
%res = call i32 #llvm.x86.avx.movmsk.ps.256(<8 x float> %5)
%6 = call i32 (i8*, ...) #printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* #formatString, i32 0, i32 0), i32 %res)
ret i32 0
}
declare i32 #llvm.x86.avx.movmsk.ps.256(<8 x float>) #1
declare i32 #printf(i8*, ...) #2
attributes #0 = { norecurse uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-hle,-pku,-prfchw,-rdseed,-rtm,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-hle,-pku,-prfchw,-rdseed,-rtm,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
The generated assembly (by llc) looks quite optimal:
vmovaps .LCPI0_0(%rip), %ymm0 # ymm0 = [1,2,3,4,5,6,7,8]
vmovaps %ymm0, 32(%rsp)
vmovdqa .LCPI0_1(%rip), %ymm0 # ymm0 = [0,2,3,0,5,0,7,0]
vmovdqa %ymm0, (%rsp)
vpcmpeqd 32(%rsp), %ymm0, %ymm0
vmovmskps %ymm0, %esi
I found this way working.
define i8 #main() #0 {
entry:
%0 = icmp eq <8 x i32> <i32 90,i32 91,i32 92,i32 93, i32 94,i32 95,i32 96,i32 97>, <i32 90,i32 91,i32 92,i32 93, i32 94,i32 95,i32 96,i32 97>
%1 = bitcast <8 x i1> %0 to <1 x i8>
%2 = extractelement <1 x i8> %1, i32 0
ret i8 %2
}
This is similar to the code I posted in the question. I checked the result with "echo $?" and I am getting the expected result.

LLVM Create VarArg Function and access var args

I have been trying to create a function using the module pass in LLVM. What I am trying to do is create a variable argument function and then add the logic to manipulate the variable arguments.
For example:
/* can do this */
int foo(int a, ...)
{
double var1;
//can't figure out how to add any of this using llvm
va_list ap;
va_start(ap, a);
va_arg(var1,double);
va_end(ap);
}
Creating the function type is easy because I just set the vararg boolean to true. What do I do after that?
I always use clang to check what a C/C++ construct gets converted to.
Use the LLVM instruction va_arg and the intrinsics llvm.va_start, llvm.va_end and llvm.va_copy to use LLVM's variable argument support.
You also need the target-specific value type "va_list" for functions that operate on arguments using this mechanism.
; This struct is different for every platform. For most platforms,
; it is merely an i8*.
%struct.va_list = type { i8* }
; For Unix x86_64 platforms, va_list is the following struct:
; %struct.va_list = type { i32, i32, i8*, i8* }
ref http://llvm.org/docs/LangRef.html#variable-argument-handling-intrinsics
for your listed code,
; ModuleID = 'test.c'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
%struct.__va_list_tag = type { i32, i32, i8*, i8* }
; Function Attrs: nounwind uwtable
define i32 #foo(i32 %a, ...) #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
%var1 = alloca double, align 8
%ap = alloca [1 x %struct.__va_list_tag], align 16
store i32 %a, i32* %2, align 4
%3 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
%4 = bitcast %struct.__va_list_tag* %3 to i8*
call void #llvm.va_start(i8* %4)
%5 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
%6 = getelementptr inbounds %struct.__va_list_tag* %5, i32 0, i32 1
%7 = load i32* %6
%8 = icmp ule i32 %7, 160
br i1 %8, label %9, label %15
; <label>:9 ; preds = %0
%10 = getelementptr inbounds %struct.__va_list_tag* %5, i32 0, i32 3
%11 = load i8** %10
%12 = getelementptr i8* %11, i32 %7
%13 = bitcast i8* %12 to double*
%14 = add i32 %7, 16
store i32 %14, i32* %6
br label %20
; <label>:15 ; preds = %0
%16 = getelementptr inbounds %struct.__va_list_tag* %5, i32 0, i32 2
%17 = load i8** %16
%18 = bitcast i8* %17 to double*
%19 = getelementptr i8* %17, i32 8
store i8* %19, i8** %16
br label %20
; <label>:20 ; preds = %15, %9
%21 = phi double* [ %13, %9 ], [ %18, %15 ]
%22 = load double* %21
%23 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i32 0, i32 0
%24 = bitcast %struct.__va_list_tag* %23 to i8*
call void #llvm.va_end(i8* %24)
%25 = load i32* %1
ret i32 %25
}
; Function Attrs: nounwind
declare void #llvm.va_start(i8*) #1
; Function Attrs: nounwind
declare void #llvm.va_end(i8*) #1
; Function Attrs: nounwind uwtable
define i32 #main() #0 {
ret i32 0
}

LLVM intrinsic functions

When building a project with LLVM, some function calls will be replaced by intrinsic functions. Is the replacement completed by the front-end (e.g. clang) or the LLVM back-end?
Discussions through the Internet indicate that the intrinsic functions replacement is related to optimization options. So does it mean if there is no optimization option, then no intrinsic replacement will happen? Or in fact, there are some default intrinsic functions replacement that cannot be disabled?
If there is any method to disable all the intrinsic functions, how should I do that?
It depends. Intrinsics written in code are emitted by the front-end directly. Intrinsics like llvm.memset are introduced into the code during optimization at the IR level (neither the front-end nor the back-end performs these optimizations).
Here is a (quite stupid) example:
int main(int argc, char** argv)
{
int a[8];
for (int i = 0; i != 8; ++i)
a[i] = 0;
for (int i = 7; i >= 0; --i)
a[i] = a[i+1] + argc;
return a[0];
}
Compiled with clang 3.5 (clang -S -emit-llvm) you will get the following IR without any intrinsics:
; Function Attrs: nounwind uwtable
define i32 #main(i32 %argc, i8** %argv) #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
%3 = alloca i8**, align 8
%a = alloca [8 x i32], align 16
%i = alloca i32, align 4
%i1 = alloca i32, align 4
store i32 0, i32* %1
store i32 %argc, i32* %2, align 4
store i8** %argv, i8*** %3, align 8
store i32 0, i32* %i, align 4
br label %4
; <label>:4 ; preds = %11, %0
%5 = load i32* %i, align 4
%6 = icmp ne i32 %5, 8
br i1 %6, label %7, label %14
; <label>:7 ; preds = %4
%8 = load i32* %i, align 4
%9 = sext i32 %8 to i64
%10 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 %9
store i32 0, i32* %10, align 4
br label %11
; <label>:11 ; preds = %7
%12 = load i32* %i, align 4
%13 = add nsw i32 %12, 1
store i32 %13, i32* %i, align 4
br label %4
; <label>:14 ; preds = %4
store i32 7, i32* %i1, align 4
br label %15
; <label>:15 ; preds = %29, %14
%16 = load i32* %i1, align 4
%17 = icmp sge i32 %16, 0
br i1 %17, label %18, label %32
; <label>:18 ; preds = %15
%19 = load i32* %i1, align 4
%20 = add nsw i32 %19, 1
%21 = sext i32 %20 to i64
%22 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 %21
%23 = load i32* %22, align 4
%24 = load i32* %2, align 4
%25 = add nsw i32 %23, %24
%26 = load i32* %i1, align 4
%27 = sext i32 %26 to i64
%28 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 %27
store i32 %25, i32* %28, align 4
br label %29
; <label>:29 ; preds = %18
%30 = load i32* %i1, align 4
%31 = add nsw i32 %30, -1
store i32 %31, i32* %i1, align 4
br label %15
; <label>:32 ; preds = %15
%33 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 0
%34 = load i32* %33, align 4
ret i32 %34
}
Compiled again with clang -emit-llvm -O1 you will see this:
; Function Attrs: nounwind readnone uwtable
define i32 #main(i32 %argc, i8** nocapture readnone %argv) #0 {
.preheader:
%a = alloca [8 x i32], align 16
%a6 = bitcast [8 x i32]* %a to i8*
call void #llvm.memset.p0i8.i64(i8* %a6, i8 0, i64 32, i32 4, i1 false)
br label %0
; <label>:0 ; preds = %.preheader, %0
%indvars.iv = phi i64 [ 7, %.preheader ], [ %indvars.iv.next, %0 ]
%1 = add nsw i64 %indvars.iv, 1
%2 = getelementptr inbounds [8 x i32]* %a, i64 0, i64 %1
%3 = load i32* %2, align 4, !tbaa !1
%4 = add nsw i32 %3, %argc
%5 = getelementptr inbounds [8 x i32]* %a, i64 0, i64 %indvars.iv
store i32 %4, i32* %5, align 4, !tbaa !1
%indvars.iv.next = add nsw i64 %indvars.iv, -1
%6 = trunc i64 %indvars.iv to i32
%7 = icmp sgt i32 %6, 0
br i1 %7, label %0, label %8
; <label>:8 ; preds = %0
%9 = getelementptr inbounds [8 x i32]* %a, i64 0, i64 0
%10 = load i32* %9, align 16, !tbaa !1
ret i32 %10
}
The initialization loop was replaced by the llvm.memset intrinsic. The back-end is free to handle the intrinsic as it wants, but commonly llvm.memset is lowered to a libc library call.
To answer your first question: Yes, if you don't optimize your code, then you will not get intrinsics in your IR.
To prevent intrinsics from being introduced into your code, all you have to do is find the optimization pass that introduces them and not run it on your IR. Here is a related question on how to find out which passes are run on the IR: Where to find the optimization sequence for clang -OX?
for -O1 we get:
prune-eh -inline-cost -always-inline -functionattrs -sroa -domtree
-early-cse -lazy-value-info -jump-threading -correlated-propagation -simplifycfg -instcombine -tailcallelim -simplifycfg -reassociate -domtree -loops -loop-simplify -lcssa -loop-rotate -licm -loop-unswitch -instcombine -scalar-evolution -lcssa -indvars -loop-idiom -loop-deletion -loop-unroll -memdep -memcpyopt -sccp -instcombine -lazy-value-info -jump-threading -correlated-propagation -domtree -memdep -dse -adce -simplifycfg -instcombine -barrier -domtree -loops -loop-simplify -lcssa -branch-prob -block-freq -scalar-evolution -loop-vectorize -instcombine -simplifycfg -strip-dead-prototypes -verify
A wild guess: instcombine is introducing the llvm.memset. I ran the same passes, minus instcombine, with opt on the unoptimized IR and got this:
; Function Attrs: nounwind readnone uwtable
define i32 #main(i32 %argc, i8** %argv) #0 {
%a = alloca [8 x i32], align 16
%1 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 8
%2 = load i32* %1, align 4
%3 = add nsw i32 %2, %argc
%4 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 7
store i32 %3, i32* %4, align 4
%5 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 7
%6 = load i32* %5, align 4
%7 = add nsw i32 %6, %argc
%8 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 6
store i32 %7, i32* %8, align 4
%9 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 6
%10 = load i32* %9, align 4
%11 = add nsw i32 %10, %argc
%12 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 5
store i32 %11, i32* %12, align 4
%13 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 5
%14 = load i32* %13, align 4
%15 = add nsw i32 %14, %argc
%16 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 4
store i32 %15, i32* %16, align 4
%17 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 4
%18 = load i32* %17, align 4
%19 = add nsw i32 %18, %argc
%20 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 3
store i32 %19, i32* %20, align 4
%21 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 3
%22 = load i32* %21, align 4
%23 = add nsw i32 %22, %argc
%24 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 2
store i32 %23, i32* %24, align 4
%25 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 2
%26 = load i32* %25, align 4
%27 = add nsw i32 %26, %argc
%28 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 1
store i32 %27, i32* %28, align 4
%29 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 1
%30 = load i32* %29, align 4
%31 = add nsw i32 %30, %argc
%32 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 0
store i32 %31, i32* %32, align 4
%33 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 0
%34 = load i32* %33, align 4
ret i32 %34
}
No intrinsics. So to prevent (at least the memset) intrinsics in your code, don't run instcombine on your IR. However, instcombine is a mighty opt pass that really shortens the code.
Now you have two options:
1. Don't use opt passes that introduce intrinsics.
2. Write your own LLVM opt pass that transforms intrinsics back to whatever they could be replaced with, and run it after optimization and before the back-end starts working.
I hope this helps you somehow. Cheers!