Related
There is a bool variable named "Enable". When "Enable" is false, I want to generate the following function:
// Variant for Enable == false: loads src and dst into NEON registers, adds dst
// to src twice (q0 = src + dst + dst), stores the sum back into dst and prints
// the four resulting floats.
void test_false()
{
float dst[4] = {1.0, 1.0, 1.0, 1.0};
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
float * src_addr = src;
asm volatile (
// q0 <- the four floats at src, q1 <- the four floats at dst
"vld1.32 {q0}, [%[src]] \n"
"vld1.32 {q1}, [%[dst]] \n"
// two lane-wise adds of q1 into q0
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n"
// write q0 back over dst
"vst1.32 {q0}, [%[dst]] \n"
// both pointers are declared as read-write ("+r") register operands, although
// the template above never changes them
:[src]"+r"(src_addr),
[dst]"+r"(dst_addr)
:
// q2 and q3 are listed as clobbered even though the template does not name
// them; "memory" covers the accesses made through the raw pointers
: "q0", "q1", "q2", "q3", "memory"
);
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);// the adds above give 3.0, 4.0, 5.0, 6.0 (the earlier "0.0 0.0 0.0 0.0" note does not match the arithmetic)
}
}
And when "Enable" is true, I want to generate the following function:
// Variant for Enable == true: loads src and dst into NEON registers, adds dst
// to src three times (q0 = src + 3 * dst), stores the sum back into dst and
// prints the four resulting floats.
void test_true()
{
float dst[4] = {1.0, 1.0, 1.0, 1.0};
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
float * src_addr = src;
asm volatile (
// q0 <- the four floats at src, q1 <- the four floats at dst
"vld1.32 {q0}, [%[src]] \n"
"vld1.32 {q1}, [%[dst]] \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n" // this third add is the only difference from test_false()
// write q0 back over dst
"vst1.32 {q0}, [%[dst]] \n"
// both pointers are declared as read-write ("+r") register operands, although
// the template above never changes them
:[src]"+r"(src_addr),
[dst]"+r"(dst_addr)
:
// q2 and q3 are listed as clobbered even though the template does not name
// them; "memory" covers the accesses made through the raw pointers
: "q0", "q1", "q2", "q3", "memory"
);
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);// the adds above give 4.0, 5.0, 6.0, 7.0 (the earlier "0.0 0.0 0.0 0.0" note does not match the arithmetic)
}
}
But I don't want to keep two copies of the code, because most of it is the same. I want to use "C++ template + conditional compilation" to solve my problem. The code is as follows, but it doesn't work: whether Enable is true or false, the compiler generates the same code as test_true().
// Attempt at a single template for both variants: the third vadd.f32 is meant
// to be emitted only when Enable is true.
// NOTE: this does not do what is intended. #define / #ifdef are handled by the
// preprocessor before the template is ever instantiated, so the `if (Enable)`
// below does not gate FUSE_; the macro is defined for every instantiation.
template<bool Enable>
void test_tmp()
{
float dst[4] = {1.0, 1.0, 1.0, 1.0};
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
float * src_addr = src;
if (Enable)
{
// the directive takes effect regardless of the surrounding `if`; the braces
// end up enclosing an empty statement block
#define FUSE_
}
asm volatile (
// q0 <- the four floats at src, q1 <- the four floats at dst
"vld1.32 {q0}, [%[src]] \n"
"vld1.32 {q1}, [%[dst]] \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n"
// FUSE_ is defined at this point for both instantiations, so the third add
// is always part of the template string
#ifdef FUSE_
"vadd.f32 q0, q0, q1 \n"
#endif
// write q0 back over dst
"vst1.32 {q0}, [%[dst]] \n"
:[src]"+r"(src_addr),
[dst]"+r"(dst_addr)
:
: "q0", "q1", "q2", "q3", "memory"
);
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);// with the third add always present this gives 4.0, 5.0, 6.0, 7.0 (the earlier "0.0 0.0 0.0 0.0" note does not match the arithmetic)
}
#undef FUSE_
}
// Explicit instantiations so that both variants are emitted for inspection.
template void test_tmp<true>();
template void test_tmp<false>();
It doesn't seem possible to write code like function test_tmp(). Does anyone know how to solve my problem? Thanks a lot.
If you use C temporaries and output operands for all live registers in the first half that line up with input constraints for the second half, you should be able to split up your inline asm without any performance loss, especially if you use specific memory input/output constraints instead of a catch-all "memory" clobber. But it will get a lot more complicated.
This obviously doesn't work, because the C preprocessor runs before the C++ compiler even looks at if() statements.
if (Enable) {
#define FUSE_ // always defined, regardless of Enable
}
But the GNU assembler has its own macro / conditional-assembly directives like .if which operate on the asm the compiler emits after making text substitutions into the asm() template, including actual numeric values for immediate input operands.
Use the bool as an input operand for an assembler .if directive
Use an "i" (Enable) input constraint. Normally the %0 or %[enable] expansion of that would be #0 or #1, because that's how to print an ARM immediate. But GCC has a %c0 / %c[enable] modifier that will print a constant without punctuation. (It's documented for x86, but works the same way for ARM and presumably all other architectures. Documentation for ARM / AArch64 operand modifiers is being worked on; I've been sitting on an email about that...)
".if %c[enable] \n\t" for [enable] "i" (c_var) will substitute as .if 0 or .if 1 into the inline-asm template, exactly what we need to make .if / .endif work at assemble time.
Full example:
/// Adds `dst` to a fixed source vector {1, 2, 3, 4} twice (three times when
/// `Enable` is true) using NEON, and stores the four sums back into `dst`.
///
/// The optional third add is selected at assemble time rather than by the C
/// preprocessor: `Enable` is passed as an "i" (immediate) operand and printed
/// with the `%c` modifier, so the emitted asm contains `.if 0` or `.if 1`.
///
/// The dummy "+m" / "m" operands tell the compiler which memory is read and
/// written, which is why neither `volatile` nor a "memory" clobber is used.
///
/// @tparam Enable  when true, the extra `vadd.f32` is assembled.
/// @param  dst     four floats, read and overwritten in place.
template<bool Enable>
void test_tmp(float dst[4])
{
    //float dst[4] = {1.0, 1.0, 1.0, 1.0};
    // static const   // non-static-const so we can see the memory clobber vs. dummy src stop this from optimizing away init of src[] on the stack
    float src[4] = {1.0, 2.0, 3.0, 4.0};
    float * dst_addr = dst;
    const float * src_addr = src;
    asm (
        // In ARM GAS syntax '@' starts a comment anywhere on a line; '#' does so
        // only as the first character of a line, so the trailing notes use '@'.
        "vld1.32 {q1}, [%[dst]] @ dummy dst = %[dummy_memdst]\n" // hopefully they pick the same regs?
        "vld1.32 {q0}, [%[src]] @ dummy src = %[dummy_memsrc]\n"
        "vadd.f32 q0, q0, q1 \n" // TODO: optimize to q1+q1 first, without a dep on src
        "vadd.f32 q0, q0, q1 \n" // allowing q0+=q1 and q1+=q1 in parallel if we need q0 += 3*q1
        // #ifdef FUSE_
        ".if %c[enable]\n" // %c modifier: print constant without punctuation, same as documented for x86
        "vadd.f32 q0, q0, q1 \n"
        ".endif \n"
        // #endif
        "vst1.32 {q0}, [%[dst]] \n"
        // output: the four floats at dst are both read and written
        : [dummy_memdst] "+m" (*(float(*)[4])dst_addr)
        // inputs: raw pointers in registers, the template flag as an immediate,
        // and the four floats at src as a read-only memory operand
        : [src]"r"(src_addr),
          [dst]"r"(dst_addr),
          [enable]"i"(Enable)
          , [dummy_memsrc] "m" (*(const float(*)[4])src_addr)
        : "q0", "q1", "q2", "q3" //, "memory"
    );
    /*
    for (int i = 0; i < 4; i++)
    {
    printf("%f, ", dst[i]);//0.0 0.0 0.0 0.0
    }
    */
}
// File-scope sample buffer; nothing in the code shown here references it.
float dst[4] = {1.0, 1.0, 1.0, 1.0};
// Explicit instantiations so that both variants are emitted for inspection.
template void test_tmp<true>(float *);
template void test_tmp<false>(float *);
compiles with GCC and Clang on the Godbolt compiler explorer
With gcc, you only get the compiler's .s output, so you have to turn off some of the usual compiler-explorer filters and look through the directives. All 3 vadd.f32 instructions are there in the false version, but one of them is surrounded by .if 0 / .endif.
But clang's built-in assembler processes assembler directives internally before turning things back into asm if that output is requested. (Normally clang/LLVM goes straight to machine code, unlike gcc which always runs a separate assembler).
Just to be clear, this works with gcc and clang, but it's just easier to see it on Godbolt with clang. (Because Godbolt doesn't have a "binary" mode that actually assembles and then disassembles, except for x86). Clang output for the false version
...
vld1.32 {d2, d3}, [r0] @ dummy dst = [r0]
vld1.32 {d0, d1}, [r1] @ dummy src = [r1]
vadd.f32 q0, q0, q1
vadd.f32 q0, q0, q1
vst1.32 {d0, d1}, [r0]
...
Notice that clang picked the same GP register for the raw pointers as it used for the memory operand. (gcc seems to choose [sp] for src_mem, but a different reg for the pointer input that you use manually inside an addressing mode). If you hadn't forced it to have the pointers in registers, it could have used an SP-relative addressing mode with an offset for the vector loads, potentially taking advantage of ARM addressing modes.
If you're really not going to modify the pointers inside the asm (e.g. with post-increment addressing modes), then "r" input-only operands makes the most sense. If we'd left in the printf loop, the compiler would have needed dst again after the asm, so it would benefit from having it still in a register. A "+r"(dst_addr) input forces the compiler to assume that that register is no longer usable as a copy of dst. Anyway, gcc always copies the registers, even when it doesn't need it later, whether I make it "r" or "+r", so that's weird.
Using (dummy) memory inputs / outputs means we can drop the volatile, so the compiler can optimize it normally as a pure function of its inputs. (And optimize it away if the result is unused.)
Hopefully this isn't worse code-gen than with the "memory" clobber. But it would probably be better if you just used the "=m" and "m" memory operands, and didn't ask for pointers in registers at all. (That doesn't help if you're going to loop over the array with inline asm, though.)
See also Looping over arrays with inline assembly
I haven't done any ARM assembly for a few years, and I never really bothered to learn GCC inline assembly properly, but I think your code can be rewritten like this, using intrinsics:
#include <cstdio>
#include <arm_neon.h>
/// Computes {1, 2, 3, 4} plus the all-ones vector twice with NEON intrinsics
/// (a third time when Enable is true) and prints the four resulting lanes.
///
/// @tparam Enable  when true, the ones vector is added one extra time.
template<bool Enable>
void test_tmp()
{
    const float32x4_t base = {1.0, 2.0, 3.0, 4.0};
    const float32x4_t ones = {1.0, 1.0, 1.0, 1.0};

    // Two unconditional lane-wise adds, chained.
    float32x4_t acc = vaddq_f32(vaddq_f32(base, ones), ones);
    if (Enable)
    {
        acc = vaddq_f32(acc, ones);
    }

    // Spill the vector to memory so the lanes can be printed one by one.
    float lanes[4];
    vst1q_f32(lanes, acc);

    // Enable == false yields 3, 4, 5, 6; Enable == true yields 4, 5, 6, 7.
    const float *lane = lanes;
    const float *const lanes_end = lanes + 4;
    while (lane != lanes_end)
    {
        printf("%f, ", *lane);
        ++lane;
    }
}
template void test_tmp<true>();
template void test_tmp<false>();
You can see resulting machine code + toy around live at: https://godbolt.org/z/Fg7Tci
Compiled with ARM gcc8.2 and command line options "-O3 -mfloat-abi=softfp -mfpu=neon" the "true" variant is:
void test_tmp<true>():
vmov.f32 q9, #1.0e+0 @ v4sf
vldr d16, .L6
vldr d17, .L6+8
@ and the FALSE variant has one less vadd.f32 in this part
vadd.f32 q8, q8, q9
vadd.f32 q8, q8, q9
vadd.f32 q8, q8, q9
push {r4, r5, r6, lr}
sub sp, sp, #16
vst1.32 {d16-d17}, [sp:64]
mov r4, sp
ldr r5, .L6+16
add r6, sp, #16
.L2:
vldmia.32 r4!, {s15}
vcvt.f64.f32 d16, s15
mov r0, r5
vmov r2, r3, d16
bl printf
cmp r4, r6
bne .L2
add sp, sp, #16
pop {r4, r5, r6, pc}
.L6:
.word 1065353216
.word 1073741824
.word 1077936128
.word 1082130432
.word .LC0
.LC0:
.ascii "%f, \000"
This still leaves me profoundly confused about why gcc doesn't simply compute the final output string at compile time, since the inputs are constant. Maybe some floating-point precision rule prevents it from doing that at compile time, because the result could differ slightly from what the FPU of the actual target hardware would produce? I.e. with some fast-math switch it would probably drop that code completely and just produce one output string...
But I guess your code is not actually proper "MCVE" of what you are doing, and the test values would be fed into some real function you are testing, or something like that.
Anyway, if you are working on performance optimizations, you should probably rather avoid inline assembly completely and use intrinsics instead, as that allows the compiler to better allocate registers and optimize code around the calculations (I didn't track it precisely, but I think the last version of this experiment in godbolt was 2-4 instructions shorter/simpler than the original using inline assembly).
Plus you will avoid the incorrect asm constraints like your example code has, those are always tricky to get correctly and pure PITA to maintain if you keep modifying the inlined code often.
For example:
%3 = load float, float* addrspacecast (float addrspace(3)* @_ZZ16imGlobalFunctionvE1a to float*), align 4
Here, what I want to do is to change
float* addrspacecast (float addrspace(3)* @_ZZ16imGlobalFunctionvE1a to float*)
into
float addrspace(3)* @_ZZ16imGlobalFunctionvE1a
But float* addrspacecast (float addrspace(3)* @_ZZ16imGlobalFunctionvE1a to float*) is a ConstantExpr, and I cannot find a way to extract its operand, float addrspace(3)* @_ZZ16imGlobalFunctionvE1a, from it.
How can I do this?
I have a piece of code that I want to unroll by various unroll factors and then look at the resulting bitcode. To do so I'm doing the following:
1) I first compile the code using clang:
clang -O0 -S -emit-llvm trainingCode.cpp -o trainingCode.ll
2) I then run a couple of passes on the resulting bitcode (as recommended by this):
opt -mem2reg -simplifycfg -loops -lcssa -loop-simplify -loop-rotate -inline -inline-threshold=1000000 trainingCode.ll -o trainingCode.bc > /dev/null
3) Finally, I run the unrolling pass:
opt -loop-unroll -unroll-count=2 -unroll-allow-partial trainingCode.bc -o unrolledTrainingCode.bc > /dev/null
I then repeat this with various unroll factors from 1 to 4.
For a simple piece of code like the following, this works exactly like I need it to:
#include <math.h>
// Small loop body used as the baseline for the unrolling experiment: ten
// iterations, each adding the same constant-derived value to an accumulator.
int main() {
// volatile, so every read and write of the accumulator is kept
volatile float checksum = 0.0;
for (int i = 0; i < 10; i++) {
// fff is rebuilt from the same constant on every iteration; it does not depend on i
float fff = 0.112345;
fff *= fff;
fff += 1.13;
checksum += fff/10000;
}
}
But I get really weird behavior when I increase the complexity/size of the body to this for instance:
#include <math.h>
// Larger loop body for the unrolling experiment: ten iterations, each running
// a chain of ten "multiply by (sqrt(fff) + fff), then add a constant" steps
// before updating the accumulator.
int main() {
// volatile, so every read and write of the accumulator is kept
volatile float checksum = 0.0;
for (int i = 0; i < 10; i++) {
// fff starts from the same constant on every iteration
float fff = 0.112345;
fff *= sqrt(fff) + fff;
fff += 1.13;
fff *= sqrt(fff) + fff;
fff += 17.16;
fff *= sqrt(fff) + fff;
fff += 15.13;
fff *= sqrt(fff) + fff;
fff += 21.13;
fff *= sqrt(fff) + fff;
fff += 81.13;
fff *= sqrt(fff) + fff;
fff += 11.13;
fff *= sqrt(fff) + fff;
fff += 81.13;
fff *= sqrt(fff) + fff;
fff += 11.13;
fff *= sqrt(fff) + fff;
fff += 91.13;
fff *= sqrt(fff) + fff;
fff += 11.13;
// the loop index enters only here, so the value added differs per iteration
checksum += (fff + i)/10000;
}
}
For unroll factors of 1-2, everything works fine, but if I try to unroll using a factor greater than 2, LLVM completely unrolls the loop. This happens for any loop with sufficiently large of a body. For instance, this is (an excerpt) of the resulting bitcode of using any unroll factor greater than 2 for the code directly above:
; ModuleID = 'unrolledtrainingCode3.bc'
source_filename = "p1HighComplexity.cpp"
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse uwtable
define i32 @main() #0 {
entry:
%checksum = alloca float, align 4
store volatile float 0.000000e+00, float* %checksum, align 4
br label %for.body
for.body: ; preds = %entry
%call.i = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add = fadd float %call.i, 0x3FBCC2A460000000
%mul = fmul float 0x3FBCC2A460000000, %add
%conv = fpext float %mul to double
%add1 = fadd double %conv, 1.130000e+00
%conv2 = fptrunc double %add1 to float
%call.i2 = call float #sqrtf(float %conv2) #2
%add4 = fadd float %call.i2, %conv2
%mul5 = fmul float %conv2, %add4
%conv6 = fpext float %mul5 to double
%add7 = fadd double %conv6, 1.716000e+01
%conv8 = fptrunc double %add7 to float
%call.i3 = call float #sqrtf(float %conv8) #2
%add10 = fadd float %call.i3, %conv8
%mul11 = fmul float %conv8, %add10
%conv12 = fpext float %mul11 to double
%add13 = fadd double %conv12, 1.513000e+01
%conv14 = fptrunc double %add13 to float
%call.i4 = call float #sqrtf(float %conv14) #2
%add16 = fadd float %call.i4, %conv14
%mul17 = fmul float %conv14, %add16
%conv18 = fpext float %mul17 to double
%add19 = fadd double %conv18, 2.113000e+01
%conv20 = fptrunc double %add19 to float
%call.i5 = call float #sqrtf(float %conv20) #2
%add22 = fadd float %call.i5, %conv20
%mul23 = fmul float %conv20, %add22
%conv24 = fpext float %mul23 to double
%add25 = fadd double %conv24, 0x40544851EB851EB8
%conv26 = fptrunc double %add25 to float
%call.i6 = call float #sqrtf(float %conv26) #2
%add28 = fadd float %call.i6, %conv26
%mul29 = fmul float %conv26, %add28
%conv30 = fpext float %mul29 to double
%add31 = fadd double %conv30, 1.113000e+01
%conv32 = fptrunc double %add31 to float
%call.i7 = call float #sqrtf(float %conv32) #2
%add34 = fadd float %call.i7, %conv32
%mul35 = fmul float %conv32, %add34
%conv36 = fpext float %mul35 to double
%add37 = fadd double %conv36, 0x40544851EB851EB8
%conv38 = fptrunc double %add37 to float
%call.i8 = call float #sqrtf(float %conv38) #2
%add40 = fadd float %call.i8, %conv38
%mul41 = fmul float %conv38, %add40
%conv42 = fpext float %mul41 to double
%add43 = fadd double %conv42, 1.113000e+01
%conv44 = fptrunc double %add43 to float
%call.i9 = call float #sqrtf(float %conv44) #2
%add46 = fadd float %call.i9, %conv44
%mul47 = fmul float %conv44, %add46
%conv48 = fpext float %mul47 to double
%add49 = fadd double %conv48, 0x4056C851EB851EB8
%conv50 = fptrunc double %add49 to float
%call.i10 = call float #sqrtf(float %conv50) #2
%add52 = fadd float %call.i10, %conv50
%mul53 = fmul float %conv50, %add52
%conv54 = fpext float %mul53 to double
%add55 = fadd double %conv54, 1.113000e+01
%conv56 = fptrunc double %add55 to float
%div = fdiv float %conv56, 1.000000e+04
%0 = load volatile float, float* %checksum, align 4
%add57 = fadd float %0, %div
store volatile float %add57, float* %checksum, align 4
%call.i.1 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.1 = fadd float %call.i.1, 0x3FBCC2A460000000
%mul.1 = fmul float 0x3FBCC2A460000000, %add.1
%conv.1 = fpext float %mul.1 to double
%add1.1 = fadd double %conv.1, 1.130000e+00
%conv2.1 = fptrunc double %add1.1 to float
%call.i2.1 = call float #sqrtf(float %conv2.1) #2
%add4.1 = fadd float %call.i2.1, %conv2.1
%mul5.1 = fmul float %conv2.1, %add4.1
%conv6.1 = fpext float %mul5.1 to double
%add7.1 = fadd double %conv6.1, 1.716000e+01
%conv8.1 = fptrunc double %add7.1 to float
%call.i3.1 = call float #sqrtf(float %conv8.1) #2
%add10.1 = fadd float %call.i3.1, %conv8.1
%mul11.1 = fmul float %conv8.1, %add10.1
%conv12.1 = fpext float %mul11.1 to double
%add13.1 = fadd double %conv12.1, 1.513000e+01
%conv14.1 = fptrunc double %add13.1 to float
%call.i4.1 = call float #sqrtf(float %conv14.1) #2
%add16.1 = fadd float %call.i4.1, %conv14.1
%mul17.1 = fmul float %conv14.1, %add16.1
%conv18.1 = fpext float %mul17.1 to double
%add19.1 = fadd double %conv18.1, 2.113000e+01
%conv20.1 = fptrunc double %add19.1 to float
%call.i5.1 = call float #sqrtf(float %conv20.1) #2
%add22.1 = fadd float %call.i5.1, %conv20.1
%mul23.1 = fmul float %conv20.1, %add22.1
%conv24.1 = fpext float %mul23.1 to double
%add25.1 = fadd double %conv24.1, 0x40544851EB851EB8
%conv26.1 = fptrunc double %add25.1 to float
%call.i6.1 = call float #sqrtf(float %conv26.1) #2
%add28.1 = fadd float %call.i6.1, %conv26.1
%mul29.1 = fmul float %conv26.1, %add28.1
%conv30.1 = fpext float %mul29.1 to double
%add31.1 = fadd double %conv30.1, 1.113000e+01
%conv32.1 = fptrunc double %add31.1 to float
%call.i7.1 = call float #sqrtf(float %conv32.1) #2
%add34.1 = fadd float %call.i7.1, %conv32.1
%mul35.1 = fmul float %conv32.1, %add34.1
%conv36.1 = fpext float %mul35.1 to double
%add37.1 = fadd double %conv36.1, 0x40544851EB851EB8
%conv38.1 = fptrunc double %add37.1 to float
%call.i8.1 = call float #sqrtf(float %conv38.1) #2
%add40.1 = fadd float %call.i8.1, %conv38.1
%mul41.1 = fmul float %conv38.1, %add40.1
%conv42.1 = fpext float %mul41.1 to double
%add43.1 = fadd double %conv42.1, 1.113000e+01
%conv44.1 = fptrunc double %add43.1 to float
%call.i9.1 = call float #sqrtf(float %conv44.1) #2
%add46.1 = fadd float %call.i9.1, %conv44.1
%mul47.1 = fmul float %conv44.1, %add46.1
%conv48.1 = fpext float %mul47.1 to double
%add49.1 = fadd double %conv48.1, 0x4056C851EB851EB8
%conv50.1 = fptrunc double %add49.1 to float
%call.i10.1 = call float #sqrtf(float %conv50.1) #2
%add52.1 = fadd float %call.i10.1, %conv50.1
%mul53.1 = fmul float %conv50.1, %add52.1
%conv54.1 = fpext float %mul53.1 to double
%add55.1 = fadd double %conv54.1, 1.113000e+01
%conv56.1 = fptrunc double %add55.1 to float
%div.1 = fdiv float %conv56.1, 1.000000e+04
%1 = load volatile float, float* %checksum, align 4
%add57.1 = fadd float %1, %div.1
store volatile float %add57.1, float* %checksum, align 4
%call.i.2 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.2 = fadd float %call.i.2, 0x3FBCC2A460000000
%mul.2 = fmul float 0x3FBCC2A460000000, %add.2
%conv.2 = fpext float %mul.2 to double
%add1.2 = fadd double %conv.2, 1.130000e+00
%conv2.2 = fptrunc double %add1.2 to float
%call.i2.2 = call float #sqrtf(float %conv2.2) #2
%add4.2 = fadd float %call.i2.2, %conv2.2
%mul5.2 = fmul float %conv2.2, %add4.2
%conv6.2 = fpext float %mul5.2 to double
%add7.2 = fadd double %conv6.2, 1.716000e+01
%conv8.2 = fptrunc double %add7.2 to float
%call.i3.2 = call float #sqrtf(float %conv8.2) #2
%add10.2 = fadd float %call.i3.2, %conv8.2
%mul11.2 = fmul float %conv8.2, %add10.2
%conv12.2 = fpext float %mul11.2 to double
%add13.2 = fadd double %conv12.2, 1.513000e+01
%conv14.2 = fptrunc double %add13.2 to float
%call.i4.2 = call float #sqrtf(float %conv14.2) #2
%add16.2 = fadd float %call.i4.2, %conv14.2
%mul17.2 = fmul float %conv14.2, %add16.2
%conv18.2 = fpext float %mul17.2 to double
%add19.2 = fadd double %conv18.2, 2.113000e+01
%conv20.2 = fptrunc double %add19.2 to float
%call.i5.2 = call float #sqrtf(float %conv20.2) #2
%add22.2 = fadd float %call.i5.2, %conv20.2
%mul23.2 = fmul float %conv20.2, %add22.2
%conv24.2 = fpext float %mul23.2 to double
%add25.2 = fadd double %conv24.2, 0x40544851EB851EB8
%conv26.2 = fptrunc double %add25.2 to float
%call.i6.2 = call float #sqrtf(float %conv26.2) #2
%add28.2 = fadd float %call.i6.2, %conv26.2
%mul29.2 = fmul float %conv26.2, %add28.2
%conv30.2 = fpext float %mul29.2 to double
%add31.2 = fadd double %conv30.2, 1.113000e+01
%conv32.2 = fptrunc double %add31.2 to float
%call.i7.2 = call float #sqrtf(float %conv32.2) #2
%add34.2 = fadd float %call.i7.2, %conv32.2
%mul35.2 = fmul float %conv32.2, %add34.2
%conv36.2 = fpext float %mul35.2 to double
%add37.2 = fadd double %conv36.2, 0x40544851EB851EB8
%conv38.2 = fptrunc double %add37.2 to float
%call.i8.2 = call float #sqrtf(float %conv38.2) #2
%add40.2 = fadd float %call.i8.2, %conv38.2
%mul41.2 = fmul float %conv38.2, %add40.2
%conv42.2 = fpext float %mul41.2 to double
%add43.2 = fadd double %conv42.2, 1.113000e+01
%conv44.2 = fptrunc double %add43.2 to float
%call.i9.2 = call float #sqrtf(float %conv44.2) #2
%add46.2 = fadd float %call.i9.2, %conv44.2
%mul47.2 = fmul float %conv44.2, %add46.2
%conv48.2 = fpext float %mul47.2 to double
%add49.2 = fadd double %conv48.2, 0x4056C851EB851EB8
%conv50.2 = fptrunc double %add49.2 to float
%call.i10.2 = call float #sqrtf(float %conv50.2) #2
%add52.2 = fadd float %call.i10.2, %conv50.2
%mul53.2 = fmul float %conv50.2, %add52.2
%conv54.2 = fpext float %mul53.2 to double
%add55.2 = fadd double %conv54.2, 1.113000e+01
%conv56.2 = fptrunc double %add55.2 to float
%div.2 = fdiv float %conv56.2, 1.000000e+04
%2 = load volatile float, float* %checksum, align 4
%add57.2 = fadd float %2, %div.2
store volatile float %add57.2, float* %checksum, align 4
%call.i.3 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.3 = fadd float %call.i.3, 0x3FBCC2A460000000
%mul.3 = fmul float 0x3FBCC2A460000000, %add.3
%conv.3 = fpext float %mul.3 to double
%add1.3 = fadd double %conv.3, 1.130000e+00
%conv2.3 = fptrunc double %add1.3 to float
%call.i2.3 = call float #sqrtf(float %conv2.3) #2
%add4.3 = fadd float %call.i2.3, %conv2.3
%mul5.3 = fmul float %conv2.3, %add4.3
%conv6.3 = fpext float %mul5.3 to double
%add7.3 = fadd double %conv6.3, 1.716000e+01
%conv8.3 = fptrunc double %add7.3 to float
%call.i3.3 = call float #sqrtf(float %conv8.3) #2
%add10.3 = fadd float %call.i3.3, %conv8.3
%mul11.3 = fmul float %conv8.3, %add10.3
%conv12.3 = fpext float %mul11.3 to double
%add13.3 = fadd double %conv12.3, 1.513000e+01
%conv14.3 = fptrunc double %add13.3 to float
%call.i4.3 = call float #sqrtf(float %conv14.3) #2
%add16.3 = fadd float %call.i4.3, %conv14.3
%mul17.3 = fmul float %conv14.3, %add16.3
%conv18.3 = fpext float %mul17.3 to double
%add19.3 = fadd double %conv18.3, 2.113000e+01
%conv20.3 = fptrunc double %add19.3 to float
%call.i5.3 = call float #sqrtf(float %conv20.3) #2
%add22.3 = fadd float %call.i5.3, %conv20.3
%mul23.3 = fmul float %conv20.3, %add22.3
%conv24.3 = fpext float %mul23.3 to double
%add25.3 = fadd double %conv24.3, 0x40544851EB851EB8
%conv26.3 = fptrunc double %add25.3 to float
%call.i6.3 = call float #sqrtf(float %conv26.3) #2
%add28.3 = fadd float %call.i6.3, %conv26.3
%mul29.3 = fmul float %conv26.3, %add28.3
%conv30.3 = fpext float %mul29.3 to double
%add31.3 = fadd double %conv30.3, 1.113000e+01
%conv32.3 = fptrunc double %add31.3 to float
%call.i7.3 = call float #sqrtf(float %conv32.3) #2
%add34.3 = fadd float %call.i7.3, %conv32.3
%mul35.3 = fmul float %conv32.3, %add34.3
%conv36.3 = fpext float %mul35.3 to double
%add37.3 = fadd double %conv36.3, 0x40544851EB851EB8
%conv38.3 = fptrunc double %add37.3 to float
%call.i8.3 = call float #sqrtf(float %conv38.3) #2
%add40.3 = fadd float %call.i8.3, %conv38.3
%mul41.3 = fmul float %conv38.3, %add40.3
%conv42.3 = fpext float %mul41.3 to double
%add43.3 = fadd double %conv42.3, 1.113000e+01
%conv44.3 = fptrunc double %add43.3 to float
%call.i9.3 = call float #sqrtf(float %conv44.3) #2
%add46.3 = fadd float %call.i9.3, %conv44.3
%mul47.3 = fmul float %conv44.3, %add46.3
%conv48.3 = fpext float %mul47.3 to double
%add49.3 = fadd double %conv48.3, 0x4056C851EB851EB8
%conv50.3 = fptrunc double %add49.3 to float
%call.i10.3 = call float #sqrtf(float %conv50.3) #2
%add52.3 = fadd float %call.i10.3, %conv50.3
%mul53.3 = fmul float %conv50.3, %add52.3
%conv54.3 = fpext float %mul53.3 to double
%add55.3 = fadd double %conv54.3, 1.113000e+01
%conv56.3 = fptrunc double %add55.3 to float
%div.3 = fdiv float %conv56.3, 1.000000e+04
%3 = load volatile float, float* %checksum, align 4
%add57.3 = fadd float %3, %div.3
store volatile float %add57.3, float* %checksum, align 4
%call.i.4 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.4 = fadd float %call.i.4, 0x3FBCC2A460000000
%mul.4 = fmul float 0x3FBCC2A460000000, %add.4
%conv.4 = fpext float %mul.4 to double
%add1.4 = fadd double %conv.4, 1.130000e+00
%conv2.4 = fptrunc double %add1.4 to float
%call.i2.4 = call float #sqrtf(float %conv2.4) #2
%add4.4 = fadd float %call.i2.4, %conv2.4
%mul5.4 = fmul float %conv2.4, %add4.4
%conv6.4 = fpext float %mul5.4 to double
%add7.4 = fadd double %conv6.4, 1.716000e+01
%conv8.4 = fptrunc double %add7.4 to float
%call.i3.4 = call float #sqrtf(float %conv8.4) #2
%add10.4 = fadd float %call.i3.4, %conv8.4
%mul11.4 = fmul float %conv8.4, %add10.4
%conv12.4 = fpext float %mul11.4 to double
%add13.4 = fadd double %conv12.4, 1.513000e+01
%conv14.4 = fptrunc double %add13.4 to float
%call.i4.4 = call float #sqrtf(float %conv14.4) #2
%add16.4 = fadd float %call.i4.4, %conv14.4
%mul17.4 = fmul float %conv14.4, %add16.4
%conv18.4 = fpext float %mul17.4 to double
%add19.4 = fadd double %conv18.4, 2.113000e+01
%conv20.4 = fptrunc double %add19.4 to float
%call.i5.4 = call float #sqrtf(float %conv20.4) #2
%add22.4 = fadd float %call.i5.4, %conv20.4
%mul23.4 = fmul float %conv20.4, %add22.4
%conv24.4 = fpext float %mul23.4 to double
%add25.4 = fadd double %conv24.4, 0x40544851EB851EB8
%conv26.4 = fptrunc double %add25.4 to float
%call.i6.4 = call float #sqrtf(float %conv26.4) #2
%add28.4 = fadd float %call.i6.4, %conv26.4
%mul29.4 = fmul float %conv26.4, %add28.4
%conv30.4 = fpext float %mul29.4 to double
%add31.4 = fadd double %conv30.4, 1.113000e+01
%conv32.4 = fptrunc double %add31.4 to float
%call.i7.4 = call float #sqrtf(float %conv32.4) #2
%add34.4 = fadd float %call.i7.4, %conv32.4
%mul35.4 = fmul float %conv32.4, %add34.4
%conv36.4 = fpext float %mul35.4 to double
%add37.4 = fadd double %conv36.4, 0x40544851EB851EB8
%conv38.4 = fptrunc double %add37.4 to float
%call.i8.4 = call float #sqrtf(float %conv38.4) #2
%add40.4 = fadd float %call.i8.4, %conv38.4
%mul41.4 = fmul float %conv38.4, %add40.4
%conv42.4 = fpext float %mul41.4 to double
%add43.4 = fadd double %conv42.4, 1.113000e+01
%conv44.4 = fptrunc double %add43.4 to float
%call.i9.4 = call float #sqrtf(float %conv44.4) #2
%add46.4 = fadd float %call.i9.4, %conv44.4
%mul47.4 = fmul float %conv44.4, %add46.4
%conv48.4 = fpext float %mul47.4 to double
%add49.4 = fadd double %conv48.4, 0x4056C851EB851EB8
%conv50.4 = fptrunc double %add49.4 to float
%call.i10.4 = call float #sqrtf(float %conv50.4) #2
%add52.4 = fadd float %call.i10.4, %conv50.4
%mul53.4 = fmul float %conv50.4, %add52.4
%conv54.4 = fpext float %mul53.4 to double
%add55.4 = fadd double %conv54.4, 1.113000e+01
%conv56.4 = fptrunc double %add55.4 to float
%div.4 = fdiv float %conv56.4, 1.000000e+04
%4 = load volatile float, float* %checksum, align 4
%add57.4 = fadd float %4, %div.4
store volatile float %add57.4, float* %checksum, align 4
%call.i.5 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.5 = fadd float %call.i.5, 0x3FBCC2A460000000
%mul.5 = fmul float 0x3FBCC2A460000000, %add.5
%conv.5 = fpext float %mul.5 to double
%add1.5 = fadd double %conv.5, 1.130000e+00
%conv2.5 = fptrunc double %add1.5 to float
%call.i2.5 = call float #sqrtf(float %conv2.5) #2
%add4.5 = fadd float %call.i2.5, %conv2.5
%mul5.5 = fmul float %conv2.5, %add4.5
%conv6.5 = fpext float %mul5.5 to double
%add7.5 = fadd double %conv6.5, 1.716000e+01
%conv8.5 = fptrunc double %add7.5 to float
%call.i3.5 = call float #sqrtf(float %conv8.5) #2
%add10.5 = fadd float %call.i3.5, %conv8.5
%mul11.5 = fmul float %conv8.5, %add10.5
%conv12.5 = fpext float %mul11.5 to double
%add13.5 = fadd double %conv12.5, 1.513000e+01
%conv14.5 = fptrunc double %add13.5 to float
%call.i4.5 = call float #sqrtf(float %conv14.5) #2
%add16.5 = fadd float %call.i4.5, %conv14.5
%mul17.5 = fmul float %conv14.5, %add16.5
%conv18.5 = fpext float %mul17.5 to double
%add19.5 = fadd double %conv18.5, 2.113000e+01
%conv20.5 = fptrunc double %add19.5 to float
%call.i5.5 = call float #sqrtf(float %conv20.5) #2
%add22.5 = fadd float %call.i5.5, %conv20.5
%mul23.5 = fmul float %conv20.5, %add22.5
%conv24.5 = fpext float %mul23.5 to double
%add25.5 = fadd double %conv24.5, 0x40544851EB851EB8
%conv26.5 = fptrunc double %add25.5 to float
%call.i6.5 = call float #sqrtf(float %conv26.5) #2
%add28.5 = fadd float %call.i6.5, %conv26.5
%mul29.5 = fmul float %conv26.5, %add28.5
%conv30.5 = fpext float %mul29.5 to double
%add31.5 = fadd double %conv30.5, 1.113000e+01
%conv32.5 = fptrunc double %add31.5 to float
%call.i7.5 = call float #sqrtf(float %conv32.5) #2
%add34.5 = fadd float %call.i7.5, %conv32.5
%mul35.5 = fmul float %conv32.5, %add34.5
%conv36.5 = fpext float %mul35.5 to double
%add37.5 = fadd double %conv36.5, 0x40544851EB851EB8
%conv38.5 = fptrunc double %add37.5 to float
%call.i8.5 = call float #sqrtf(float %conv38.5) #2
%add40.5 = fadd float %call.i8.5, %conv38.5
%mul41.5 = fmul float %conv38.5, %add40.5
%conv42.5 = fpext float %mul41.5 to double
%add43.5 = fadd double %conv42.5, 1.113000e+01
%conv44.5 = fptrunc double %add43.5 to float
%call.i9.5 = call float #sqrtf(float %conv44.5) #2
%add46.5 = fadd float %call.i9.5, %conv44.5
%mul47.5 = fmul float %conv44.5, %add46.5
%conv48.5 = fpext float %mul47.5 to double
%add49.5 = fadd double %conv48.5, 0x4056C851EB851EB8
%conv50.5 = fptrunc double %add49.5 to float
%call.i10.5 = call float #sqrtf(float %conv50.5) #2
%add52.5 = fadd float %call.i10.5, %conv50.5
%mul53.5 = fmul float %conv50.5, %add52.5
%conv54.5 = fpext float %mul53.5 to double
%add55.5 = fadd double %conv54.5, 1.113000e+01
%conv56.5 = fptrunc double %add55.5 to float
%div.5 = fdiv float %conv56.5, 1.000000e+04
%5 = load volatile float, float* %checksum, align 4
%add57.5 = fadd float %5, %div.5
store volatile float %add57.5, float* %checksum, align 4
%call.i.6 = call float #sqrtf(float 0x3FBCC2A460000000) #2
%add.6 = fadd float %call.i.6, 0x3FBCC2A460000000
%mul.6 = fmul float 0x3FBCC2A460000000, %add.6
%conv.6 = fpext float %mul.6 to double
%add1.6 = fadd double %conv.6, 1.130000e+00
%conv2.6 = fptrunc double %add1.6 to float
%call.i2.6 = call float #sqrtf(float %conv2.6) #2
%add4.6 = fadd float %call.i2.6, %conv2.6
%mul5.6 = fmul float %conv2.6, %add4.6
%conv6.6 = fpext float %mul5.6 to double
%add7.6 = fadd double %conv6.6, 1.716000e+01
%conv8.6 = fptrunc double %add7.6 to float
%call.i3.6 = call float #sqrtf(float %conv8.6) #2
%add10.6 = fadd float %call.i3.6, %conv8.6
%mul11.6 = fmul float %conv8.6, %add10.6
%conv12.6 = fpext float %mul11.6 to double
%add13.6 = fadd double %conv12.6, 1.513000e+01
%conv14.6 = fptrunc double %add13.6 to float
%call.i4.6 = call float #sqrtf(float %conv14.6) #2
%add16.6 = fadd float %call.i4.6, %conv14.6
%mul17.6 = fmul float %conv14.6, %add16.6
%conv18.6 = fpext float %mul17.6 to double
%add19.6 = fadd double %conv18.6, 2.113000e+01
%conv20.6 = fptrunc double %add19.6 to float
%call.i5.6 = call float #sqrtf(float %conv20.6) #2
%add22.6 = fadd float %call.i5.6, %conv20.6
%mul23.6 = fmul float %conv20.6, %add22.6
%conv24.6 = fpext float %mul23.6 to double
%add25.6 = fadd double %conv24.6, 0x40544851EB851EB8
%conv26.6 = fptrunc double %add25.6 to float
%call.i6.6 = call float #sqrtf(float %conv26.6) #2
%add28.6 = fadd float %call.i6.6, %conv26.6
%mul29.6 = fmul float %conv26.6, %add28.6
%conv30.6 = fpext float %mul29.6 to double
%add31.6 = fadd double %conv30.6, 1.113000e+01
%conv32.6 = fptrunc double %add31.6 to float
%call.i7.6 = call float #sqrtf(float %conv32.6) #2
%add34.6 = fadd float %call.i7.6, %conv32.6
%mul35.6 = fmul float %conv32.6, %add34.6
%conv36.6 = fpext float %mul35.6 to double
%add37.6 = fadd double %conv36.6, 0x40544851EB851EB8
%conv38.6 = fptrunc double %add37.6 to float
%call.i8.6 = call float #sqrtf(float %conv38.6) #2
%add40.6 = fadd float %call.i8.6, %conv38.6
%mul41.6 = fmul float %conv38.6, %add40.6
%conv42.6 = fpext float %mul41.6 to double
%add43.6 = fadd double %conv42.6, 1.113000e+01
%conv44.6 = fptrunc double %add43.6 to float
%call.i9.6 = call float #sqrtf(float %conv44.6) #2
%add46.6 = fadd float %call.i9.6, %conv44.6
%mul47.6 = fmul float %conv44.6, %add46.6
%conv48.6 = fpext float %mul47.6 to double
%add49.6 = fadd double %conv48.6, 0x4056C851EB851EB8
%conv50.6 = fptrunc double %add49.6 to float
%call.i10.6 = call float #sqrtf(float %conv50.6) #2
%add52.6 = fadd float %call.i10.6, %conv50.6
%mul53.6 = fmul float %conv50.6, %add52.6
%conv54.6 = fpext float %mul53.6 to double
%add55.6 = fadd double %conv54.6, 1.113000e+01
%conv56.6 = fptrunc double %add55.6 to float
%div.6 = fdiv float %conv56.6, 1.000000e+04
%6 = load volatile float, float* %checksum, align 4
%add57.6 = fadd float %6, %div.6
As you can see, the loop was fully unrolled, despite me specifying an unroll factor of 3. I was hoping someone would have some insight into why this was happening...
LLVM has different heuristics for determining whether to unroll and whether to unroll fully, because unrolling a loop fully is often more beneficial than unrolling it partially, particularly if the trip count is fixed and known at compile time, because then all the checks and branches can be eliminated.
My quick search only turned up the source code and not the documentation,
but I think full unrolling is limited by a different setting: -unroll-full-max-count.
I'm trying to craft some inline assembly to test the performance of rotate on ARM. The code is part of a C++ code base, so the rotates are template specializations. The code is below, but it's producing messages that don't make a lot of sense to me.
According to ARM Assembly Language, the instructions are roughly:
# rotate - rotate instruction
# dst - output operand
# lhs - value to be rotated
# rhs - rotate amount (immediate or register)
<rotate> <dst>, <lhs>, <rhs>
They don't make a lot of sense because (to me), for example, I use g to constrain the output register, and that's just a general purpose register per Simple Constraints. ARM is supposed to have a lot of them, and Machine Specific Constraints does not appear to change the behavior of the constraint.
I'm not sure the best way to approach this, so I'm going to ask three questions:
How do I encode the rotate when using a constant or immediate value?
How do I encode the rotate when using a value passed through a register?
How would thumb mode change the inline assembly?
arm-linux-androideabi-g++ -DNDEBUG -g2 -Os -pipe -fPIC -mfloat-abi=softfp
-mfpu=vfpv3-d16 -mthumb --sysroot=/opt/android-ndk-r10e/platforms/android-21/arch-arm
-I/opt/android-ndk-r10e/sources/cxx-stl/stlport/stlport/ -c camellia.cpp
In file included from seckey.h:9:0,
from camellia.h:9,
from camellia.cpp:14:
misc.h: In function 'T CryptoPP::rotlFixed(T, unsigned int) [with T = unsigned int]':
misc.h:1121:71: error: matching constraint not valid in output operand
__asm__ ("rol %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
^
misc.h:1121:71: error: matching constraint references invalid operand number
misc.h: In function 'T CryptoPP::rotrFixed(T, unsigned int) [with T = unsigned int]':
misc.h:1129:71: error: matching constraint not valid in output operand
__asm__ ("ror %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
^
misc.h:1129:71: error: matching constraint references invalid operand number
misc.h: In function 'T CryptoPP::rotlVariable(T, unsigned int) [with T = unsigned int]':
misc.h:1137:72: error: matching constraint not valid in output operand
__asm__ ("rol %2, %0, %1" : "=g2" (z) : "g0" (x), "g1" ((int)(y%32)));
^
misc.h:1137:72: error: matching constraint references invalid operand number
misc.h: In function 'T CryptoPP::rotrVariable(T, unsigned int) [with T = unsigned int]':
misc.h:1145:72: error: matching constraint not valid in output operand
__asm__ ("ror %2, %0, %1" : "=g2" (z) : "g0" (x), "g1" ((int)(y%32)));
^
misc.h:1145:72: error: matching constraint references invalid operand number
misc.h: In function 'T CryptoPP::rotrFixed(T, unsigned int) [with T = unsigned int]':
misc.h:1129:71: error: matching constraint not valid in output operand
__asm__ ("ror %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
^
misc.h:1129:71: error: invalid lvalue in asm output 0
misc.h:1129:71: error: matching constraint references invalid operand number
misc.h: In function 'T CryptoPP::rotlFixed(T, unsigned int) [with T = unsigned int]':
misc.h:1121:71: error: matching constraint not valid in output operand
__asm__ ("rol %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
^
misc.h:1121:71: error: invalid lvalue in asm output 0
misc.h:1121:71: error: matching constraint references invalid operand number
// ROL #n Rotate left immediate
// word32 specialization of rotlFixed: passes x and the count (y % 32) to a
// single inline-assembly "rol" instruction and returns the asm output z.
// NOTE(review): this is the statement the quoted GCC output rejects with
// "matching constraint not valid in output operand". In a GCC constraint
// string a digit is a matching constraint, so "=g2", "g0" and "M1" do not
// number the operands; operands are numbered by position (z=%0, x=%1, y=%2).
// NOTE(review): the template "rol %2, %0, %1" therefore places the count
// first and the output second, unlike the "<rotate> <dst>, <lhs>, <rhs>" form.
template<> inline word32 rotlFixed<word32>(word32 x, unsigned int y)
{
// Receives the asm output; converted back to word32 on return.
int z;
__asm__ ("rol %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
return static_cast<word32>(z);
}
// ROR #n Rotate right immediate
// word32 specialization of rotrFixed: passes x and the count (y % 32) to a
// single inline-assembly "ror" instruction and returns the asm output z.
// NOTE(review): this is the statement the quoted GCC output rejects with
// "matching constraint not valid in output operand". In a GCC constraint
// string a digit is a matching constraint, so "=g2", "g0" and "M1" do not
// number the operands; operands are numbered by position (z=%0, x=%1, y=%2).
// NOTE(review): the template "ror %2, %0, %1" therefore places the count
// first and the output second, unlike the "<rotate> <dst>, <lhs>, <rhs>" form.
template<> inline word32 rotrFixed<word32>(word32 x, unsigned int y)
{
// Receives the asm output; converted back to word32 on return.
int z;
__asm__ ("ror %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
return static_cast<word32>(z);
}
// ROL rn Rotate left by a register
// word32 specialization of rotlVariable: passes x and the count (y % 32) to a
// single inline-assembly "rol" instruction and returns the asm output z.
// Differs from rotlFixed only in the count's constraint ("g1" instead of "M1").
// NOTE(review): this is the statement the quoted GCC output rejects with
// "matching constraint not valid in output operand". In a GCC constraint
// string a digit is a matching constraint, so "=g2", "g0" and "g1" do not
// number the operands; operands are numbered by position (z=%0, x=%1, y=%2).
template<> inline word32 rotlVariable<word32>(word32 x, unsigned int y)
{
// Receives the asm output; converted back to word32 on return.
int z;
__asm__ ("rol %2, %0, %1" : "=g2" (z) : "g0" (x), "g1" ((int)(y%32)));
return static_cast<word32>(z);
}
// ROR rn Rotate right by a register
// word32 specialization of rotrVariable: passes x and the count (y % 32) to a
// single inline-assembly "ror" instruction and returns the asm output z.
// Differs from rotrFixed only in the count's constraint ("g1" instead of "M1").
// NOTE(review): this is the statement the quoted GCC output rejects with
// "matching constraint not valid in output operand". In a GCC constraint
// string a digit is a matching constraint, so "=g2", "g0" and "g1" do not
// number the operands; operands are numbered by position (z=%0, x=%1, y=%2).
template<> inline word32 rotrVariable<word32>(word32 x, unsigned int y)
{
// Receives the asm output; converted back to word32 on return.
int z;
__asm__ ("ror %2, %0, %1" : "=g2" (z) : "g0" (x), "g1" ((int)(y%32)));
return static_cast<word32>(z);
}
// word32 specialization of rotlMod: forwards x and y unchanged to
// rotlVariable<word32> and returns its result.
template<> inline word32 rotlMod<word32>(word32 x, unsigned int y)
{
    const word32 rotated = rotlVariable<word32>(x, y);
    return rotated;
}
// word32 specialization of rotrMod: forwards x and y unchanged to
// rotrVariable<word32> and returns its result.
template<> inline word32 rotrMod<word32>(word32 x, unsigned int y)
{
    const word32 rotated = rotrVariable<word32>(x, y);
    return rotated;
}
First, ARM does not have rotate left (ROL), you need to emulate that through ROR.
Second, the M constraint for some reason accepts 0 to 32, but ROR only accepts 0 to 31 when dealing with immediates.
Third, the g constraint is too generic because it also allows memory operands that ROR does not accept. Better use r instead.
This is what I came up with:
// Rotate right
// Rotates the 32-bit value x right by y bits using the ARM ROR instruction.
//   x - value to rotate
//   y - rotate count; a compile-time-constant count is reduced modulo 32 and
//       encoded as an immediate, otherwise the count is passed in a register
// Returns the rotated value. A constant count of 0 (mod 32) returns x
// unchanged.
inline word32 rotr(word32 x, unsigned int y)
{
    // Start from x so the "constant count of 0" path, which emits no
    // instruction, returns a defined value instead of an uninitialized one.
    word32 z = x;
    if (__builtin_constant_p(y))
    {
        y &= 31;
        if (y != 0) // this should be optimized away by the compiler
        {
            __asm__ ("ror %0, %1, %2" : "=r" (z) : "r" (x), "M" (y));
        }
    }
    else
    {
        __asm__ ("ror %0, %1, %2" : "=r" (z) : "r" (x), "r" (y));
    }
    return z;
}
// Rotate left
// Rotates the 32-bit value x left by y bits. ARM has no rotate-left
// instruction, so the rotate is emitted as ROR by (32 - y).
//   x - value to rotate
//   y - rotate count; a compile-time-constant count is reduced modulo 32 and
//       encoded as an immediate, otherwise (32 - y) is passed in a register
// Returns the rotated value. A constant count of 0 (mod 32) returns x
// unchanged.
inline word32 rotl(word32 x, unsigned int y)
{
    // Start from x so the "constant count of 0" path, which emits no
    // instruction, returns a defined value instead of an uninitialized one.
    word32 z = x;
    if (__builtin_constant_p(y))
    {
        y &= 31;
        if (y != 0) // this should be optimized away by the compiler
        {
            // y is in 1..31 here, so 32 - y is also in 1..31.
            __asm__ ("ror %0, %1, %2" : "=r" (z) : "r" (x), "M" (32 - y));
        }
    }
    else
    {
        __asm__ ("ror %0, %1, %2" : "=r" (z) : "r" (x), "r" (32 - y));
    }
    return z;
}
I can tell you that THUMB mode handles bit rotates very differently. ARM mode has what's called a "barrel shifter" where you can bit shift or bit rotate any parameter without actually changing it. So let's consider the following:
ADD r0, r0, r1, ROR #1
This roughly translates to "Rotate r1 right once, add it to r0, then store the result in r0." You get to decide whether you shift/rotate one of the operands and by how much. There is no ROL but ROR #31 equals what ROL #1 would do if ARM had it, so use that to your advantage.
The actual value stored in r1 doesn't change, the shift/rotate only applies during this instruction. This only works in ARM mode, in THUMB mode you will have to use more traditional shift/rotate commands typical of other processors such as x86, 68000, etc.
I can't seem to find reference to intrinsics in the official LLVM OCaml binding, beyond the is_intrinsic function.
I am building a backend which needs to perform some target-specific code-generation (for SSE, AVX, and NEON), and intrinsics are the standard path in the C++ API.
The OCaml binding supports intrinsics in exactly the same way as the C language binding:
There is no special support for them in the interface (as there is in the full C++ interface), but they can be declared extern and called just like any other functions.
e.g.:
open Llvm
(* Builds a module "test" that calls the llvm.x86.sse.sqrt.ps intrinsic on a
 * constant <4 x float> vector, passes the result to an external printv, and
 * dumps the module's IR. The intrinsic is declared like any other external
 * function, by name and type. *)
let () =
  let c = create_context () in
  let f32_t = float_type c in
  let f32x4_t = vector_type f32_t 4 in
  (* i32 is main's return type; it was used below without being bound. *)
  let i32_t = i32_type c in
  let m = create_module c "test" in

  (* declare void @printv(<4 x float>)
   * nonce extern which forces preservation of vector results *)
  let printv =
    declare_function "printv"
      (function_type (void_type c) [|f32x4_t|]) m in

  (* declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone *)
  let sqrtps =
    declare_function "llvm.x86.sse.sqrt.ps"
      (function_type f32x4_t [|f32x4_t|]) m in

  (* define i32 @main() { entry: *)
  let main = define_function "main" (function_type i32_t [| |]) m in
  let at_entry = builder_at_end c (entry_block main) in

  (* %sqrtps = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>) *)
  let cv1234 = const_vector [| const_float f32_t 1.0; const_float f32_t 2.0;
                               const_float f32_t 3.0; const_float f32_t 4.0 |] in
  let sqrt = build_call sqrtps [| cv1234 |] "sqrtps" at_entry in

  (* call void @printv(sqrtps) *)
  ignore (build_call printv [| sqrt |] "" at_entry);

  (* ret i32 0 *)
  ignore (build_ret (const_null i32_t) at_entry);

  (* Print .ll to stderr *)
  dump_module m
produces:
; ModuleID = 'test'
declare void @printv(<4 x float>)
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define i32 @main() {
entry:
%sqrtps = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>)
call void @printv(<4 x float> %sqrtps)
ret i32 0
}
which compiles into x86 correctly invoking sqrtps on the xmm registers.
Disclaimer: I have never used LLVM. After a quick look at the binding documentation, it looks like the answer is "no". There is support for inline assembly, however, which may or may not suit your needs.
Finally, it seems it is accepted by the LLVM developers that the OCaml binding is not complete: if you want to, you can add some more functionality (if you're not familiar with OCaml, C bindings are really not the easiest part, but the LLVM bindings are full of example code that you could probably successfully adapt to other LLVM functions), then provide a patch for it on the LLVMdev list.