LLVM IR: <N x i1> vector instructions ordering causing different results

I have been using LLVM as a backend for my compiler (for numerous reasons I am not using the LLVM libraries but my own to generate the necessary IR). At the moment, I am implementing vector operations. Comparisons of vectors emit vectors of booleans, <N x i1>, and these are causing me problems.
To access vector elements, I have been using extractelement and insertelement; however, I am getting some weird behaviour when I execute these instructions in different orders. The code examples below contain the same instructions and should be logically the same, yet Version 1 outputs BAA while Version 2 outputs BAB. Version 2 is the logically correct one, but I cannot figure out why Version 1 produces the wrong output when it has exactly the same instructions, just in a different order.
EDIT: A workaround for this problem is to pass the IR through opt -mem2reg and then compile the bitcode. This is okay, but it is not useful for debugging and thus does not solve my problem.
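For reference, here is a sketch of my own (abbreviated to the first element, with value names I chose) of the shape Version 1 takes after opt -mem2reg: the allocas are promoted away, so every extractelement reads the insertelement chain directly instead of round-tripping the <8 x i1> through memory.
; sketch, not actual compiler output: Version 1 after mem2reg, first element only
declare i32 @putchar(i32)
define void @main() {
entry:
  %v0 = insertelement <8 x i1> zeroinitializer, i1 true, i64 0
  %v1 = insertelement <8 x i1> %v0, i1 false, i64 1
  %v2 = insertelement <8 x i1> %v1, i1 true, i64 2
  %e0 = extractelement <8 x i1> %v2, i64 0
  %z0 = zext i1 %e0 to i32
  %c0 = add i32 %z0, 65 ; + 'A'
  %p0 = call i32 @putchar(i32 %c0)
  ret void
}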
Version 1
; Version 1 - Generated by my naïve SSA generator
; Outputs: BAA (incorrect)
declare i32 @putchar(i32)
define void @main() {
entry:
%0 = alloca <8 x i1>, align 8 ; v
store <8 x i1> zeroinitializer, <8 x i1>* %0
%1 = alloca <8 x i1>, align 8
store <8 x i1> zeroinitializer, <8 x i1>* %1
%2 = load <8 x i1>, <8 x i1>* %1, align 8
%3 = insertelement <8 x i1> %2, i1 true, i64 0
%4 = insertelement <8 x i1> %3, i1 false, i64 1
%5 = insertelement <8 x i1> %4, i1 true, i64 2
%6 = insertelement <8 x i1> %5, i1 false, i64 3
%7 = insertelement <8 x i1> %6, i1 true, i64 4
%8 = insertelement <8 x i1> %7, i1 false, i64 5
%9 = insertelement <8 x i1> %8, i1 true, i64 6
%10 = insertelement <8 x i1> %9, i1 false, i64 7
store <8 x i1> %10, <8 x i1>* %0
%11 = load <8 x i1>, <8 x i1>* %0, align 8
%12 = extractelement <8 x i1> %11, i64 0
%13 = zext i1 %12 to i32
%14 = add i32 %13, 65 ; + 'A'
%15 = call i32 @putchar(i32 %14)
%16 = load <8 x i1>, <8 x i1>* %0, align 8
%17 = extractelement <8 x i1> %16, i64 1
%18 = zext i1 %17 to i32
%19 = add i32 %18, 65 ; + 'A'
%20 = call i32 @putchar(i32 %19)
%21 = load <8 x i1>, <8 x i1>* %0, align 8
%22 = extractelement <8 x i1> %21, i64 2
%23 = zext i1 %22 to i32
%24 = add i32 %23, 65 ; + 'A'
%25 = call i32 @putchar(i32 %24)
%26 = call i32 @putchar(i32 10) ; \n
ret void
}
Version 2
; Version 2 - Manually modified version of Version 1
; Outputs: BAB (correct)
declare i32 @putchar(i32)
define void @main() {
entry:
%0 = alloca <8 x i1>, align 8 ; v
store <8 x i1> zeroinitializer, <8 x i1>* %0
%1 = alloca <8 x i1>, align 8
store <8 x i1> zeroinitializer, <8 x i1>* %1
%2 = load <8 x i1>, <8 x i1>* %1, align 8
%3 = insertelement <8 x i1> %2, i1 true, i64 0
%4 = insertelement <8 x i1> %3, i1 false, i64 1
%5 = insertelement <8 x i1> %4, i1 true, i64 2
%6 = insertelement <8 x i1> %5, i1 false, i64 3
%7 = insertelement <8 x i1> %6, i1 true, i64 4
%8 = insertelement <8 x i1> %7, i1 false, i64 5
%9 = insertelement <8 x i1> %8, i1 true, i64 6
%10 = insertelement <8 x i1> %9, i1 false, i64 7
store <8 x i1> %10, <8 x i1>* %0
%11 = load <8 x i1>, <8 x i1>* %0, align 8
%12 = load <8 x i1>, <8 x i1>* %0, align 8
%13 = load <8 x i1>, <8 x i1>* %0, align 8
%14 = extractelement <8 x i1> %11, i64 0
%15 = extractelement <8 x i1> %12, i64 1
%16 = extractelement <8 x i1> %13, i64 2
%17 = zext i1 %14 to i32
%18 = zext i1 %15 to i32
%19 = zext i1 %16 to i32
%20 = add i32 %17, 65 ; + 'A'
%21 = add i32 %18, 65 ; + 'A'
%22 = add i32 %19, 65 ; + 'A'
%23 = call i32 @putchar(i32 %20)
%24 = call i32 @putchar(i32 %21)
%25 = call i32 @putchar(i32 %22)
%26 = call i32 @putchar(i32 10) ; \n
ret void
}

Related

What's the instruction for '&&' in LLVM IR?

I want to write an LLVM pass that reduces && in LLVM IR, but I can't find a specific instruction for it in the IR. For example,
#include <iostream>
int main(){
    bool a = true;
    bool b = false;
    bool c = a && b;
    return 0;
}
and I get the IR,
define dso_local i32 @main() #4 {
%1 = alloca i32, align 4
%2 = alloca i8, align 1
%3 = alloca i8, align 1
%4 = alloca i8, align 1
store i32 0, i32* %1, align 4
store i8 1, i8* %2, align 1
store i8 0, i8* %3, align 1
%5 = load i8, i8* %2, align 1
%6 = trunc i8 %5 to i1
br i1 %6, label %7, label %10
7: ; preds = %0
%8 = load i8, i8* %3, align 1
%9 = trunc i8 %8 to i1
br label %10
10: ; preds = %7, %0
%11 = phi i1 [ false, %0 ], [ %9, %7 ]
%12 = zext i1 %11 to i8
store i8 %12, i8* %4, align 1
ret i32 0
}
but I tried this one,
#include <iostream>
int main(){
    int a = 10;
    int b = 10;
    int c;
    c = a && b;
    return 0;
}
and I get this
define dso_local i32 @main() #4 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
%3 = alloca i32, align 4
%4 = alloca i32, align 4
store i32 0, i32* %1, align 4
store i32 10, i32* %2, align 4
store i32 10, i32* %3, align 4
%5 = load i32, i32* %2, align 4
%6 = icmp ne i32 %5, 0
br i1 %6, label %7, label %10
7: ; preds = %0
%8 = load i32, i32* %3, align 4
%9 = icmp ne i32 %8, 0
br label %10
10: ; preds = %7, %0
%11 = phi i1 [ false, %0 ], [ %9, %7 ]
%12 = zext i1 %11 to i32
store i32 %12, i32* %4, align 4
ret i32 0
}
I use LLVM 10 on Ubuntu. I'd appreciate any answers or suggestions.
There is no LLVM instruction that specifically corresponds to the && operator. It can and will be translated in different ways depending on the expression and the optimization settings.
When optimizations are enabled, the operands are side-effect free (and not expensive to evaluate), and the whole expression can't be optimized away, clang will usually convert both operands to i1 and apply a bitwise and to them.
When optimizations are disabled or the operands have side effects, it'll usually be translated using branch instructions. That's the case in the two examples you posted.
Note that expr1 && expr2 is semantically equivalent to expr1 ? expr2 : false and you'll generally get the same LLVM code for both.
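To make those two shapes concrete, here is a minimal sketch (function and value names are mine) of the forms an optimized a && b commonly takes when the operands are side-effect free:
; sketch: operands converted to i1, then a plain bitwise and
define i1 @and_form(i32 %a, i32 %b) {
entry:
  %a.bool = icmp ne i32 %a, 0
  %b.bool = icmp ne i32 %b, 0
  %r = and i1 %a.bool, %b.bool
  ret i1 %r
}
; sketch: the select form of `a ? b : false`
define i1 @select_form(i1 %a, i1 %b) {
entry:
  %r = select i1 %a, i1 %b, i1 false
  ret i1 %r
}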
If you're okay with treating expr1 ? expr2 : false and other equivalent code (for example, using if statements) the same as &&, you can try to detect the branching pattern they create. If you need your pass to also be applicable after optimizations, you'll also have to detect at least the pattern of converting to i1 and and-ing.
If you only want your transformation to apply to && and nothing else, you simply can't do it at the LLVM level. You'd need an AST transformation at the Clang level.

Optimization generates a super long function definition

In my code, I generate the following function:
define i32 @gl.qi([500 x i32] %x, i32 %i) {
entry:
%x. = alloca [500 x i32]
%i. = alloca i32
%0 = alloca [500 x i32]
store [500 x i32] %x, [500 x i32]* %x.
store i32 %i, i32* %i.
%x.1 = load [500 x i32], [500 x i32]* %x.
%i.2 = load i32, i32* %i.
store [500 x i32] %x.1, [500 x i32]* %0
%1 = icmp slt i32 %i.2, 500
br i1 %1, label %in-bound, label %out-of-bound
out-of-bound: ; preds = %entry
call void @gen.panic(i8* getelementptr inbounds ([22 x i8], [22 x i8]* @pool.str.2, i32 0, i32 0))
unreachable
in-bound: ; preds = %entry
%2 = getelementptr inbounds [500 x i32], [500 x i32]* %0, i32 0, i32 %i.2
%idx = load i32, i32* %2
ret i32 %idx
}
The high-level functionality is to use %i to index %x; if %i is out of bounds, a panic function is called instead.
Consider the store line:
store [500 x i32] %x, [500 x i32]* %x.
Once I pass this function to opt -O1 -S --verify --verify-each, it generates code like this:
define i32 @gl.qi([500 x i32] %x, i32 %i) local_unnamed_addr {
entry:
%0 = alloca [500 x i32], align 4
%x.fca.0.extract = extractvalue [500 x i32] %x, 0
%x.fca.1.extract = extractvalue [500 x i32] %x, 1
%x.fca.2.extract = extractvalue [500 x i32] %x, 2
%x.fca.3.extract = extractvalue [500 x i32] %x, 3
%x.fca.4.extract = extractvalue [500 x i32] %x, 4
%x.fca.5.extract = extractvalue [500 x i32] %x, 5
and so on, until 500. I raised the size to 50000 and the expansion just keeps going.
This is puzzling. I am not sure why a store command must be expanded into a sequence of extractvalues followed by stores. Is there a way to turn off this particular optimization without turning off optimization entirely?
Or am I going about this simple task the wrong way?
To store %x in %x., you need to first read the values out of %x (the function-call argument) before storing them back in %x. (the local variable). That's what the extractvalue and subsequent store instructions are for. The same applies to %i. So it's not an optimization; it's just the way a variable a gets assigned to another variable b.
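If the goal is simply to index the array, one alternative (my own sketch, not part of the original answer) is to pass the array by pointer rather than by value, so no aggregate copy exists for the optimizer to scalarize:
; sketch, assumed signature: index the caller's array in place
define i32 @gl.qi.byptr([500 x i32]* %x, i32 %i) {
entry:
  %in.bounds = icmp slt i32 %i, 500
  br i1 %in.bounds, label %in-bound, label %out-of-bound
out-of-bound:
  ; the real code would call its panic helper here
  unreachable
in-bound:
  %slot = getelementptr inbounds [500 x i32], [500 x i32]* %x, i32 0, i32 %i
  %idx = load i32, i32* %slot
  ret i32 %idx
}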

How to put Load Instructions into InsertElement Instructions through an LLVM pass?

I would like to auto-generate load and insertelement instructions through an LLVM pass.
My problem is how to feed load instructions such as %4, %6, %8, %10 into insertelement instructions, as in the LLVM IR I am aiming for:
%4 = load i32, i32* %1, align 4
%5 = insertelement <4 x i32> undef, i32 %4, i32 0
%6 = load i32, i32* %1, align 4
%7 = insertelement <4 x i32> %5, i32 %6, i32 1
%8 = load i32, i32* %1, align 4
%9 = insertelement <4 x i32> %7, i32 %8, i32 2
%10 = load i32, i32* %1, align 4
%11 = insertelement <4 x i32> %9, i32 %10, i32 3
and my pass:
if (auto *op = dyn_cast<LoadInst>(&I)) {
  ...
  Value* loadinst_ptr = op->getPointerOperand();
  Type* load_ty = loadinst_ptr->getType();
  Value* val = UndefValue::get(VectorType::get(load_ty, 4));
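  // (note: load_ty is the POINTER type here, e.g. i32*, so val is a
  // <4 x i32*> vector while each loaded element is an i32 -- that type
  // mismatch is what fails isValidOperands in the assertion below)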
  for (unsigned i = 0; i < 4; i++) // for 4 copies
  {
    LoadInst* load_val = builder.CreateLoad(loadinst_ptr);
    load_val->setAlignment(4);
    // my problem is here, parameter load_val is invalid on CreateInsertElement.
    val = builder.CreateInsertElement(val, load_val, builder.getInt32(i));
  }
}
Here is the error message:
opt: /home/shun/llvm-proj/llvm/lib/IR/Instructions.cpp:1748:
llvm::InsertElementInst::InsertElementInst(llvm::Value*, llvm::Value*, llvm::Value*, const llvm::Twine&, llvm::Instruction*):
Assertion `isValidOperands(Vec, Elt, Index) && "Invalid insertelement instruction operands!"' failed.

Conversion of a vector of bools to an integer in LLVM IR

I am writing LLVM IR code which involves vector operations. I did an integer vector comparison with the icmp instruction, which resulted in a vector of bools, say <8 x i1>. My problem is that I want to convert these 8 bits to their corresponding integer value without traversing the vector (extracting elements from it). I tried 'bitcast <8 x i1> to i8', which seems to convert only the first bit of the vector to i8; correct me if I am wrong. Can someone suggest a way to do this?
define i8 @main() #0 {
entry:
%A = alloca [8 x i32], align 16
%B = alloca [8 x i32], align 16
%arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 0
store i32 90, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 1
store i32 91, i32* %arrayidx1, align 4
%arrayidx2 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 2
store i32 92, i32* %arrayidx2, align 8
%arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32]* %A, i64 0, i64 3
store i32 93, i32* %arrayidx3, align 4
%arrayidx4 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 0
store i32 90, i32* %arrayidx4, align 4
%arrayidx5 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 1
store i32 1, i32* %arrayidx5, align 4
%arrayidx6 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 2
store i32 92, i32* %arrayidx6, align 8
%arrayidx7 = getelementptr inbounds [8 x i32], [8 x i32]* %B, i64 0, i64 3
store i32 93, i32* %arrayidx7, align 4
br label %vector.body
vector.body:
%0 = bitcast [8 x i32]* %A to <8 x i32>*
%1 = bitcast [8 x i32]* %B to <8 x i32>*
%2 = load <8 x i32>, <8 x i32>* %0
%3 = load <8 x i32>, <8 x i32>* %1
%4 = icmp eq <8 x i32> %2, %3
%5 = bitcast <8 x i1> %4 to i8
ret i8 %5
}
I am running this code with lli, without any flags. The output is expected to be 11 but I am getting 1 or 0.
Thank you so much in advance.
As far as I understand, you can't do that without calling a platform-specific intrinsic. I noticed that when I found myself unable to write target-independent code for it in C++.
For example, the code below:
#include <cstdio> // for printf
typedef int v8i __attribute__((vector_size(32)));
int main() {
    v8i a = { 1, 2, 3, 4, 5, 6, 7, 8};
    v8i b = { 0, 2, 3, 4, 5, 6, 7, 0};
    v8i cmp = (a == b);
    char res = *(char*)&cmp;
    printf("%d\n", res);
    return 0;
}
produces LLVM IR quite close to what you wrote (with the appropriate bitcast).
Unfortunately, it didn't work as expected.
That's because <8 x i1> doesn't exist on the processor. For example, on x86 with AVX2, _mm256_cmpeq_epi32 yields a __m256i.
Bitcasting that to a char will just take the first 8 bits of that register.
Instead, I wrote Intel AVX2-specific code and found the appropriate instruction in the Intel intrinsics guide.
So this code does what you need:
#include <cstdio>
#include <cstdlib>
#include <immintrin.h>
int main() {
    __m256i a = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1);
    __m256i b = _mm256_set_epi32(0, 7, 6, 5, 4, 3, 2, 0);
    __m256i eq = _mm256_cmpeq_epi32(a, b);
    int res = _mm256_movemask_ps(_mm256_castsi256_ps(eq));
    printf("res = %d\n", res);
    for(int i = 0; i < 8; ++i) {
        // bit i of the mask corresponds to element i
        // (note: _mm256_extract_epi32 formally takes a constant index)
        printf("%d %d -> %d\n", _mm256_extract_epi32(a, i), _mm256_extract_epi32(b, i), (res >> i) & 1);
    }
    return 0;
}
In terms of .ll code, it turns out you need a few additional bitcasts (to float) and a call to the intrinsic
@llvm.x86.avx.movmsk.ps.256
Rewriting the LLVM IR by hand leads to:
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc-linux-gnu"
@formatString = private constant [4 x i8] c"%d\0A\00"
define i32 @main() #0 {
%a = alloca <8 x i32>, align 32
%b = alloca <8 x i32>, align 32
store <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>, <8 x i32>* %a, align 32
store <8 x i32> <i32 0, i32 2, i32 3, i32 0, i32 5, i32 0, i32 7, i32 0>, <8 x i32>* %b, align 32
%1 = load <8 x i32>, <8 x i32>* %a, align 32
%2 = load <8 x i32>, <8 x i32>* %b, align 32
%3 = icmp eq <8 x i32> %1, %2
%4 = sext <8 x i1> %3 to <8 x i32>
%5 = bitcast <8 x i32> %4 to <8 x float>
%res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %5)
%6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @formatString, i32 0, i32 0), i32 %res)
ret i32 0
}
declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) #1
declare i32 @printf(i8*, ...) #2
attributes #0 = { norecurse uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-hle,-pku,-prfchw,-rdseed,-rtm,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cmov,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+xsave,+xsaveopt,-adx,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512pf,-avx512vl,-fma4,-hle,-pku,-prfchw,-rdseed,-rtm,-sha,-sse4a,-tbm,-xop,-xsavec,-xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
The generated assembly (by llc) looks quite optimal:
vmovaps .LCPI0_0(%rip), %ymm0 # ymm0 = [1,2,3,4,5,6,7,8]
vmovaps %ymm0, 32(%rsp)
vmovdqa .LCPI0_1(%rip), %ymm0 # ymm0 = [0,2,3,0,5,0,7,0]
vmovdqa %ymm0, (%rsp)
vpcmpeqd 32(%rsp), %ymm0, %ymm0
vmovmskps %ymm0, %esi
I found that this way works:
define i8 @main() #0 {
entry:
%0 = icmp eq <8 x i32> <i32 90,i32 91,i32 92,i32 93, i32 94,i32 95,i32 96,i32 97>, <i32 90,i32 91,i32 92,i32 93, i32 94,i32 95,i32 96,i32 97>
%1 = bitcast <8 x i1> %0 to <1 x i8>
%2 = extractelement <1 x i8> %1, i32 0
ret i8 %2
}
This is similar to the code I posted in the question; I checked the result with "echo $?" and I am getting the expected result.
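The same trick can be applied to a live comparison rather than a constant one; a minimal sketch (mine, with assumed names):
; sketch: bitcast the <8 x i1> icmp result through <1 x i8> to get an i8 mask
define i8 @mask(<8 x i32> %a, <8 x i32> %b) {
entry:
  %cmp = icmp eq <8 x i32> %a, %b
  %vec = bitcast <8 x i1> %cmp to <1 x i8>
  %r = extractelement <1 x i8> %vec, i32 0
  ret i8 %r
}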

LLVM intrinsic functions

When building a project with LLVM, some function calls will be replaced by intrinsic functions. Is the replacement done by the front-end (e.g. clang) or the LLVM back-end?
Discussions on the Internet indicate that intrinsic function replacement is related to the optimization options. So does that mean that without any optimization option, no intrinsic replacement will happen? Or are there in fact some default intrinsic function replacements that cannot be disabled?
If there is a method to disable all intrinsic functions, how should I do that?
It depends. Intrinsics written in code are emitted through the front-end directly. Intrinsics like llvm.memset are introduced to the code during optimization at the IR level (neither the front-end nor the back-end performs these optimizations).
Here is a (quite stupid) example:
int main(int argc, char** argv)
{
    int a[8];
    for (int i = 0; i != 8; ++i)
        a[i] = 0;
    for (int i = 7; i >= 0; --i)
        a[i] = a[i+1] + argc;
    return a[0];
}
Compiled with clang 3.5 (clang -S -emit-llvm) you will get the following IR without any intrinsics:
; Function Attrs: nounwind uwtable
define i32 @main(i32 %argc, i8** %argv) #0 {
%1 = alloca i32, align 4
%2 = alloca i32, align 4
%3 = alloca i8**, align 8
%a = alloca [8 x i32], align 16
%i = alloca i32, align 4
%i1 = alloca i32, align 4
store i32 0, i32* %1
store i32 %argc, i32* %2, align 4
store i8** %argv, i8*** %3, align 8
store i32 0, i32* %i, align 4
br label %4
; <label>:4 ; preds = %11, %0
%5 = load i32* %i, align 4
%6 = icmp ne i32 %5, 8
br i1 %6, label %7, label %14
; <label>:7 ; preds = %4
%8 = load i32* %i, align 4
%9 = sext i32 %8 to i64
%10 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 %9
store i32 0, i32* %10, align 4
br label %11
; <label>:11 ; preds = %7
%12 = load i32* %i, align 4
%13 = add nsw i32 %12, 1
store i32 %13, i32* %i, align 4
br label %4
; <label>:14 ; preds = %4
store i32 7, i32* %i1, align 4
br label %15
; <label>:15 ; preds = %29, %14
%16 = load i32* %i1, align 4
%17 = icmp sge i32 %16, 0
br i1 %17, label %18, label %32
; <label>:18 ; preds = %15
%19 = load i32* %i1, align 4
%20 = add nsw i32 %19, 1
%21 = sext i32 %20 to i64
%22 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 %21
%23 = load i32* %22, align 4
%24 = load i32* %2, align 4
%25 = add nsw i32 %23, %24
%26 = load i32* %i1, align 4
%27 = sext i32 %26 to i64
%28 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 %27
store i32 %25, i32* %28, align 4
br label %29
; <label>:29 ; preds = %18
%30 = load i32* %i1, align 4
%31 = add nsw i32 %30, -1
store i32 %31, i32* %i1, align 4
br label %15
; <label>:32 ; preds = %15
%33 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 0
%34 = load i32* %33, align 4
ret i32 %34
}
Compiled again with clang -emit-llvm -O1 you will see this:
; Function Attrs: nounwind readnone uwtable
define i32 @main(i32 %argc, i8** nocapture readnone %argv) #0 {
.preheader:
%a = alloca [8 x i32], align 16
%a6 = bitcast [8 x i32]* %a to i8*
call void @llvm.memset.p0i8.i64(i8* %a6, i8 0, i64 32, i32 4, i1 false)
br label %0
; <label>:0 ; preds = %.preheader, %0
%indvars.iv = phi i64 [ 7, %.preheader ], [ %indvars.iv.next, %0 ]
%1 = add nsw i64 %indvars.iv, 1
%2 = getelementptr inbounds [8 x i32]* %a, i64 0, i64 %1
%3 = load i32* %2, align 4, !tbaa !1
%4 = add nsw i32 %3, %argc
%5 = getelementptr inbounds [8 x i32]* %a, i64 0, i64 %indvars.iv
store i32 %4, i32* %5, align 4, !tbaa !1
%indvars.iv.next = add nsw i64 %indvars.iv, -1
%6 = trunc i64 %indvars.iv to i32
%7 = icmp sgt i32 %6, 0
br i1 %7, label %0, label %8
; <label>:8 ; preds = %0
%9 = getelementptr inbounds [8 x i32]* %a, i64 0, i64 0
%10 = load i32* %9, align 16, !tbaa !1
ret i32 %10
}
The initialization loop was replaced by the llvm.memset intrinsic. The back-end is free to handle the intrinsic as it wants, but commonly llvm.memset is lowered to a libc library call.
To answer your first question: Yes, if you don't optimize your code, then you will not get intrinsics in your IR.
To prevent intrinsics from being introduced in your code, all you have to do is find the optimization pass that introduces them and not run it. Here is a related question on how to find out what passes are run on the IR: Where to find the optimization sequence for clang -OX?
For -O1 we get:
prune-eh -inline-cost -always-inline -functionattrs -sroa -domtree -early-cse -lazy-value-info -jump-threading -correlated-propagation -simplifycfg -instcombine -tailcallelim -simplifycfg -reassociate -domtree -loops -loop-simplify -lcssa -loop-rotate -licm -loop-unswitch -instcombine -scalar-evolution -lcssa -indvars -loop-idiom -loop-deletion -loop-unroll -memdep -memcpyopt -sccp -instcombine -lazy-value-info -jump-threading -correlated-propagation -domtree -memdep -dse -adce -simplifycfg -instcombine -barrier -domtree -loops -loop-simplify -lcssa -branch-prob -block-freq -scalar-evolution -loop-vectorize -instcombine -simplifycfg -strip-dead-prototypes -verify
A wild guess: instcombine is introducing the llvm.memset. I ran the passes without instcombine through opt on the unoptimized IR and got this:
; Function Attrs: nounwind readnone uwtable
define i32 @main(i32 %argc, i8** %argv) #0 {
%a = alloca [8 x i32], align 16
%1 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 8
%2 = load i32* %1, align 4
%3 = add nsw i32 %2, %argc
%4 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 7
store i32 %3, i32* %4, align 4
%5 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 7
%6 = load i32* %5, align 4
%7 = add nsw i32 %6, %argc
%8 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 6
store i32 %7, i32* %8, align 4
%9 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 6
%10 = load i32* %9, align 4
%11 = add nsw i32 %10, %argc
%12 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 5
store i32 %11, i32* %12, align 4
%13 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 5
%14 = load i32* %13, align 4
%15 = add nsw i32 %14, %argc
%16 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 4
store i32 %15, i32* %16, align 4
%17 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 4
%18 = load i32* %17, align 4
%19 = add nsw i32 %18, %argc
%20 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 3
store i32 %19, i32* %20, align 4
%21 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 3
%22 = load i32* %21, align 4
%23 = add nsw i32 %22, %argc
%24 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 2
store i32 %23, i32* %24, align 4
%25 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 2
%26 = load i32* %25, align 4
%27 = add nsw i32 %26, %argc
%28 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 1
store i32 %27, i32* %28, align 4
%29 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 1
%30 = load i32* %29, align 4
%31 = add nsw i32 %30, %argc
%32 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 0
store i32 %31, i32* %32, align 4
%33 = getelementptr inbounds [8 x i32]* %a, i32 0, i64 0
%34 = load i32* %33, align 4
ret i32 %34
}
No intrinsics. So, to prevent (at least the memset) intrinsics in your code, don't run instcombine on your IR. However, instcombine is a mighty opt pass that really shortens the code.
Now you have two options:
don't use opt passes that introduce intrinsics, or
write your own LLVM opt pass that transforms intrinsics back into whatever they could be replaced with, and run it after optimization and before the back-end starts working (see the sketch below).
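A rough sketch (my own, not from the answer) of the kind of IR such a de-intrinsic pass could emit in place of the llvm.memset call above, i.e. an explicit loop zeroing the same 32 bytes as 8 i32 stores:
define void @zero8(i32* %base) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %slot = getelementptr inbounds i32, i32* %base, i64 %i
  store i32 0, i32* %slot, align 4
  %i.next = add nuw nsw i64 %i, 1
  %done = icmp eq i64 %i.next, 8
  br i1 %done, label %exit, label %loop
exit:
  ret void
}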
I hope this helps you somehow. Cheers!