ARM inline assembly for 32-bit word rotate - c++

I'm trying to craft some inline assembly to test performance of rotate on ARM. The code is part of a C++ code base, so the rotates are template specializations. The code is below, but it's producing messages that don't make a lot of sense to me.
According to ARM Assembly Language, the instructions are roughly:
# rotate - rotate instruction
# dst - output operand
# lhs - value to be rotated
# rhs - rotate amount (immediate or register)
<rotate> <dst>, <lhs>, <rhs>
They don't make a lot of sense because (to me), for example, I use g to constrain the output register, and that's just a general purpose register per Simple Constraints. ARM is supposed to have a lot of them, and Machine Specific Constraints does not appear to change behavior of the constraint.
I'm not sure the best way to approach this, so I'm going to ask three questions:
How do I encode the rotate when using a constant or immediate value?
How do I encode the rotate when using a value passed through a register?
How would Thumb mode change the inline assembly?
arm-linux-androideabi-g++ -DNDEBUG -g2 -Os -pipe -fPIC -mfloat-abi=softfp
-mfpu=vfpv3-d16 -mthumb --sysroot=/opt/android-ndk-r10e/platforms/android-21/arch-arm
-I/opt/android-ndk-r10e/sources/cxx-stl/stlport/stlport/ -c camellia.cpp
In file included from seckey.h:9:0,
from camellia.h:9,
from camellia.cpp:14:
misc.h: In function 'T CryptoPP::rotlFixed(T, unsigned int) [with T = unsigned int]':
misc.h:1121:71: error: matching constraint not valid in output operand
__asm__ ("rol %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
^
misc.h:1121:71: error: matching constraint references invalid operand number
misc.h: In function 'T CryptoPP::rotrFixed(T, unsigned int) [with T = unsigned int]':
misc.h:1129:71: error: matching constraint not valid in output operand
__asm__ ("ror %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
^
misc.h:1129:71: error: matching constraint references invalid operand number
misc.h: In function 'T CryptoPP::rotlVariable(T, unsigned int) [with T = unsigned int]':
misc.h:1137:72: error: matching constraint not valid in output operand
__asm__ ("rol %2, %0, %1" : "=g2" (z) : "g0" (x), "g1" ((int)(y%32)));
^
misc.h:1137:72: error: matching constraint references invalid operand number
misc.h: In function 'T CryptoPP::rotrVariable(T, unsigned int) [with T = unsigned int]':
misc.h:1145:72: error: matching constraint not valid in output operand
__asm__ ("ror %2, %0, %1" : "=g2" (z) : "g0" (x), "g1" ((int)(y%32)));
^
misc.h:1145:72: error: matching constraint references invalid operand number
misc.h: In function 'T CryptoPP::rotrFixed(T, unsigned int) [with T = unsigned int]':
misc.h:1129:71: error: matching constraint not valid in output operand
__asm__ ("ror %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
^
misc.h:1129:71: error: invalid lvalue in asm output 0
misc.h:1129:71: error: matching constraint references invalid operand number
misc.h: In function 'T CryptoPP::rotlFixed(T, unsigned int) [with T = unsigned int]':
misc.h:1121:71: error: matching constraint not valid in output operand
__asm__ ("rol %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
^
misc.h:1121:71: error: invalid lvalue in asm output 0
misc.h:1121:71: error: matching constraint references invalid operand number
// ROL #n Rotate left immediate
// NOTE(review): this is the code the compiler rejects above.  Two problems
// are visible from the diagnostics and the answer: "=g2"/"g0"/"M1" glue a
// constraint letter to a matching-operand digit (a matching constraint is the
// digit alone, and is never valid on an output operand), and ARM has no ROL
// instruction at all, so "rol" cannot be emitted.  Also, per the instruction
// format quoted in the question (<rotate> <dst>, <lhs>, <rhs>), the output %0
// is placed in the <lhs> slot here instead of <dst>.
template<> inline word32 rotlFixed<word32>(word32 x, unsigned int y)
{
int z;
__asm__ ("rol %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
return static_cast<word32>(z);
}
// ROR #n Rotate right immediate
// NOTE(review): same malformed constraints as rotlFixed above -- "=g2",
// "g0" and "M1" fuse a constraint letter with a matching-operand digit,
// which is what produces "matching constraint not valid in output operand".
// The operand order also puts the output %0 in the <lhs> position rather
// than the <dst> position of the format quoted in the question.
template<> inline word32 rotrFixed<word32>(word32 x, unsigned int y)
{
int z;
__asm__ ("ror %2, %0, %1" : "=g2" (z) : "g0" (x), "M1" ((int)(y%32)));
return static_cast<word32>(z);
}
// ROR rn Rotate left by a register
// NOTE(review): besides the malformed "=g2"/"g0"/"g1" constraints (digit
// fused onto the letter), ARM has no ROL, so a register-count rotate left
// has to be synthesized from ROR (see the answer below, which uses 32 - y).
template<> inline word32 rotlVariable<word32>(word32 x, unsigned int y)
{
int z;
__asm__ ("rol %2, %0, %1" : "=g2" (z) : "g0" (x), "g1" ((int)(y%32)));
return static_cast<word32>(z);
}
// ROR rn Rotate right by a register
// NOTE(review): the "=g2"/"g0"/"g1" constraints are malformed (a matching
// constraint is a bare digit, never attached to a letter, and not allowed on
// outputs), and "g" also permits memory operands that ROR cannot take --
// the answer below switches to plain "r".
template<> inline word32 rotrVariable<word32>(word32 x, unsigned int y)
{
int z;
__asm__ ("ror %2, %0, %1" : "=g2" (z) : "g0" (x), "g1" ((int)(y%32)));
return static_cast<word32>(z);
}
// Rotate-left with the amount reduced modulo the word size.  Simply forwards
// to the variable-rotate specialization, which applies y % 32 itself.
template<> inline word32 rotlMod<word32>(word32 x, unsigned int y)
{
    const word32 rotated = rotlVariable<word32>(x, y);
    return rotated;
}
// Rotate-right with the amount reduced modulo the word size.  Simply forwards
// to the variable-rotate specialization, which applies y % 32 itself.
template<> inline word32 rotrMod<word32>(word32 x, unsigned int y)
{
    const word32 rotated = rotrVariable<word32>(x, y);
    return rotated;
}

First, ARM does not have rotate left (ROL), you need to emulate that through ROR.
Second, the M constraint for some reason accepts 0 to 32, but ROL only accepts 0 to 31 when dealing with immediates.
Third, the g constraint is too generic because it also allows memory operands that ROR does not accept. Better use r instead.
This is what I came up with:
// Rotate right by y.  For a compile-time-constant amount the immediate form
// of ROR is used; otherwise the register form is used.
// Fix: the original declared `int z;` and, when y was a constant with
// y % 32 == 0, skipped the asm entirely and returned the uninitialized z.
// A rotate by 0 is the identity, so initialize z with x.
inline word32 rotr(word32 x, unsigned int y)
{
    word32 z = x;                 // rotate by 0 leaves the value unchanged
    if (__builtin_constant_p(y))
    {
        y &= 31;                  // keep the immediate inside the "M" range
        if (y != 0)               // this branch folds away for constant y
        {
            __asm__ ("ror %0, %1, %2" : "=r" (z) : "r" (x), "M" (y));
        }
    }
    else
    {
        __asm__ ("ror %0, %1, %2" : "=r" (z) : "r" (x), "r" (y));
    }
    return z;
}
// Rotate left by y.  ARM has no ROL, so a left rotate by y is implemented as
// a right rotate by 32 - y (see the discussion below).
// Fix: the original declared `int z;` and, when y was a constant with
// y % 32 == 0, skipped the asm entirely and returned the uninitialized z.
// A rotate by 0 is the identity, so initialize z with x.
inline word32 rotl(word32 x, unsigned int y)
{
    word32 z = x;                 // rotate by 0 leaves the value unchanged
    if (__builtin_constant_p(y))
    {
        y &= 31;                  // keep 32 - y inside the "M" range
        if (y != 0)               // this branch folds away for constant y
        {
            __asm__ ("ror %0, %1, %2" : "=r" (z) : "r" (x), "M" (32 - y));
        }
    }
    else
    {
        // For y == 0 this requests a register rotate by 32, which also
        // yields x unchanged.
        __asm__ ("ror %0, %1, %2" : "=r" (z) : "r" (x), "r" (32 - y));
    }
    return z;
}

I can tell you that THUMB mode handles bit rotates very differently. ARM mode has what's called a "barrel shifter" where you can bit shift or bit rotate any parameter without actually changing it. So let's consider the following:
ADD r0,r0,r1 ror #1
This roughly translates to "Rotate r1 right once, add it to r0, then store the result in r0." You get to decide whether you shift/rotate one of the operands and by how much. There is no ROL but ROR #31 equals what ROL #1 would do if ARM had it, so use that to your advantage.
The actual value stored in r1 doesn't change, the shift/rotate only applies during this instruction. This only works in ARM mode, in THUMB mode you will have to use more traditional shift/rotate commands typical of other processors such as x86, 68000, etc.

Related

Assembly Code only returns two of possible four values in comparison

I am currently writing a Program that requires C++ and Arm Assembly language to be implemented. The C++ code uses functions from the assembly file to perform particular tasks on a STM32F4discovery board touchscreen. One of the particular functions is a function that returns what quadrant the user is touching currently, and we had to write that function in assembly. No matter what I do, this function always returns quadrant 1 or 3. I think I am having an issue with branching properly, as this is one of my weak points, but I do not know for sure. Here is the function in context of the C++ code:
//all of this is above main
extern "C" uint32_t getQuad(uint16_t, uint16_t);
TS_DISCO_F429ZI ts; //touch screen sensing object
TS_StateTypeDef TS_State;
//other code here
ts.GetState(&TS_State);
uint16_t x = TS_State.X;
uint16_t y = TS_State.Y;
uint32_t quad = getQuad(x, y);
And here is the function in the assembly file
AREA Program3_F20_Gibbs, CODE, READONLY
GLOBAL getQuad
; uint32_t getQuad ( uint16_t x, uint16_t y)
; NOTE(review): the arguments arrive BY VALUE in R0/R1; the two LDRs below
; treat them as addresses and dereference them, so the comparisons run on
; whatever memory those "addresses" point at (see the answer below).
; NOTE(review): BLE is the signed compare-branch; for coordinate values the
; unsigned BLS would be the safer choice, as the first answer points out.
getQuad LDR R0, [R0]
LDR R1, [R1]
CMP R0, #120            ; x <= 120 ? then quadrant 2 or 3
BLE TwoThree
CMP R1, #160            ; x > 120: y <= 160 ? then quadrant 4
BLE Four
MOV R0, #1              ; x > 120 && y > 160
BX LR
TwoThree CMP R1, #160   ; x <= 120: y <= 160 ? then quadrant 3
BLE Three
MOV R0, #2              ; x <= 120 && y > 160
BX LR
Four MOV R0, #4
BX LR
Three MOV R0, #3
BX LR
I should probably also note that this quadrant system follows an x/y coordinate system with a bit of difference. Quadrant one has x/y coordinates with x > 120 and y > 160, Quadrant two has x/y coordinates with x < 120 and y > 160, Quadrant three has x/y coordinates with x < 120 and y < 160, and Quadrant four has x/y coordinates with x > 120 and y < 160. If I need to give any more info, please let me know, and thank you for any help in advance!
You can get away without any (conditional) branch:
; Branchless quadrant lookup: build a 0..3 index from the two comparisons
; and read the quadrant number from the 4-byte table that follows.
getQuad
cmp r0, #120            ; x <= 120 ?
adr r12, %f1            ; r12 = address of the table below
addls r12, r12, #2      ; unsigned lower-or-same: step into the x<=120 half
cmp r1, #160            ; y <= 160 ?
addls r12, r12, #1      ; step to the y<=160 entry
ldrb r0, [r12]          ; return value = table[(x<=120)*2 + (y<=160)]
bx lr
1
dcb 1, 4, 2, 3
You really don't need to write it in assembly:
// Map a touch coordinate to its screen quadrant (1..4):
//   1: x > 120 && y > 160      2: x <= 120 && y > 160
//   3: x <= 120 && y <= 160    4: x > 120 && y <= 160
// Fix: the original assigned the address of a const array to a non-const
// uint8_t*, which is ill-formed in C++ (and a constraint violation in C).
// Indexing the table directly avoids the pointer altogether.
static inline uint32_t getQuad(uint16_t x, uint16_t y)
{
    // Table indexed by (x <= 120)*2 + (y <= 160).
    static const uint8_t quads[4] = {1, 4, 2, 3};
    unsigned idx = 0;
    if (x <= 120) idx += 2;   // left half of the 240-wide screen
    if (y <= 160) idx += 1;   // lower half of the 320-tall screen
    return quads[idx];
}
It's a very short function, hence you better inline it, and put it in the header file.
The pesky function call overhead is gone then.
PS: le (less or equal) is for signed values. You should use ls (lower or same) instead. le works in this case, but if you are dealing with 32-bit values, it could cause disastrous problems that are hard to debug.
Your logic seems sound except for the fact there should be some <= type operations in the text:
Quadrant one has x/y coordinates with x > 120 and y > 160, Quadrant two has x/y coordinates with x <= 120 and y > 160, Quadrant three has x/y coordinates with x <= 120 and y <= 160, and Quadrant four has x/y coordinates with x > 120 and y <= 160.
However, unless the calling convention is very strange, you probably don't want to be doing this at the start:
LDR R0, [R0]
LDR R1, [R1]
This is what you do if you're passing the addresses of variables and you want to dereference those addresses to get the values. Therefore you will be using the wrong values to figure out which quadrant you're in.

How to use c++ template to conditionally compile asm code?

There is a bool variable named "Enable", when "Enable" is false, I want to create following function:
// Desired codegen when Enable == false: two accumulations, dst = src + 2*dst.
void test_false()
{
float dst[4] = {1.0, 1.0, 1.0, 1.0};
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
float * src_addr = src;
// q0 = src, q1 = dst, q0 += q1 twice, and the result overwrites dst.
asm volatile (
"vld1.32 {q0}, [%[src]] \n"
"vld1.32 {q1}, [%[dst]] \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n"
"vst1.32 {q0}, [%[dst]] \n"
:[src]"+r"(src_addr),
[dst]"+r"(dst_addr)
:
: "q0", "q1", "q2", "q3", "memory"
);
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);// expected: 3.0 4.0 5.0 6.0 (src + 2*dst)
}
}
And when "Enable" is true, I want to create following function:
// Desired codegen when Enable == true: three accumulations, dst = src + 3*dst.
void test_true()
{
float dst[4] = {1.0, 1.0, 1.0, 1.0};
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
float * src_addr = src;
// Identical to test_false() except for the third vadd.f32 below.
asm volatile (
"vld1.32 {q0}, [%[src]] \n"
"vld1.32 {q1}, [%[dst]] \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n" //Only here is different from test_false()
"vst1.32 {q0}, [%[dst]] \n"
:[src]"+r"(src_addr),
[dst]"+r"(dst_addr)
:
: "q0", "q1", "q2", "q3", "memory"
);
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);// expected: 4.0 5.0 6.0 7.0 (src + 3*dst)
}
}
But I don't want to save two copies of code, because most of them are the same. I want to use “c++ Template + Conditional Compile” to solve my problem. The code is as follows. But it didn't work. Whether the Enable is true or false, the compiler creates the code same as test_true().
// Attempt to share one asm body between the two cases.  This does NOT work:
// the preprocessor runs before the compiler ever evaluates if (Enable), so
// FUSE_ is defined unconditionally and BOTH instantiations get the extra
// vadd (as the answer below explains).
template<bool Enable>
void test_tmp()
{
float dst[4] = {1.0, 1.0, 1.0, 1.0};
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
float * src_addr = src;
if (Enable)
{
#define FUSE_
}
asm volatile (
"vld1.32 {q0}, [%[src]] \n"
"vld1.32 {q1}, [%[dst]] \n"
"vadd.f32 q0, q0, q1 \n"
"vadd.f32 q0, q0, q1 \n"
#ifdef FUSE_
"vadd.f32 q0, q0, q1 \n"
#endif
"vst1.32 {q0}, [%[dst]] \n"
:[src]"+r"(src_addr),
[dst]"+r"(dst_addr)
:
: "q0", "q1", "q2", "q3", "memory"
);
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);//0.0 0.0 0.0 0.0
}
#undef FUSE_
}
template void test_tmp<true>();
template void test_tmp<false>();
It doesn't seem possible to write code like function test_tmp(). Does anyone know how to solve my problem? Thanks a lot.
If you use C temporaries and output operands for all live registers in the first half that line up with input constraints for the 2nd half, you should be able to split it up your inline asm without any performance loss, especially if you use specific memory input/output constraints instead of a catch-all "memory" clobber. But it will get a lot more complicated.
This obviously doesn't work, because the C preprocessor runs before the C++ compiler even looks at if() statements.
if (Enable) {
#define FUSE_ // always defined, regardless of Enable
}
But the GNU assembler has its own macro / conditional-assembly directives like .if which operate on the asm the compiler emits after making text substitutions into the asm() template, including actual numeric values for immediate input operands.
Use the bool as an input operand for an assembler .if directive
Use an "i" (Enable) input constraint. Normally the %0 or %[enable] expansion of that would be #0 or #1, because that's how to print an ARM immediate. But GCC has a %c0 / %c[enable] modifier that will print a constant without punctuation. (It's documented for x86, but works the same way for ARM and presumably all other architectures. Documentation for ARM / AArch64 operand modifiers is being worked on; I've been sitting on an email about that...)
".if %c[enable] \n\t" for [enable] "i" (c_var) will substitute as .if 0 or .if 1 into the inline-asm template, exactly what we need to make .if / .endif work at assemble time.
Full example:
// Working approach: pass Enable into the template as an "i" (immediate)
// operand and gate the extra vadd with the assembler's own .if/.endif
// directives, expanded with the %c operand modifier so the constant prints
// without the '#' punctuation.
template<bool Enable>
void test_tmp(float dst[4])
{
//float dst[4] = {1.0, 1.0, 1.0, 1.0};
// static const // non-static-const so we can see the memory clobber vs. dummy src stop this from optimizing away init of src[] on the stack
float src[4] = {1.0, 2.0, 3.0, 4.0};
float * dst_addr = dst;
const float * src_addr = src;
asm (
"vld1.32 {q1}, [%[dst]] # dummy dst = %[dummy_memdst]\n" // hopefully they pick the same regs?
"vld1.32 {q0}, [%[src]] # dummy src = %[dummy_memsrc]\n"
"vadd.f32 q0, q0, q1 \n" // TODO: optimize to q1+q1 first, without a dep on src
"vadd.f32 q0, q0, q1 \n" // allowing q0+=q1 and q1+=q1 in parallel if we need q0 += 3*q1
// #ifdef FUSE_
".if %c[enable]\n" // %c modifier: print constant without punctuation, same as documented for x86
"vadd.f32 q0, q0, q1 \n"
".endif \n"
// #endif
"vst1.32 {q0}, [%[dst]] \n"
: [dummy_memdst] "+m" (*(float(*)[4])dst_addr)
: [src]"r"(src_addr),
[dst]"r"(dst_addr),
[enable]"i"(Enable)
, [dummy_memsrc] "m" (*(const float(*)[4])src_addr)
: "q0", "q1", "q2", "q3" //, "memory"
);
/*
for (int i = 0; i < 4; i++)
{
printf("%f, ", dst[i]);//0.0 0.0 0.0 0.0
}
*/
}
float dst[4] = {1.0, 1.0, 1.0, 1.0};
template void test_tmp<true>(float *);
template void test_tmp<false>(float *);
compiles with GCC and Clang on the Godbolt compiler explorer
With gcc, you only get the compiler's .s output, so you have to turn off some of the usual compiler-explorer filters and look through the directives. All 3 vadd.f32 instructions are there in the false version, but one of them is surrounded by .if 0 / .endif.
But clang's built-in assembler processes assembler directives internally before turning things back into asm if that output is requested. (Normally clang/LLVM goes straight to machine code, unlike gcc which always runs a separate assembler).
Just to be clear, this works with gcc and clang, but it's just easier to see it on Godbolt with clang. (Because Godbolt doesn't have a "binary" mode that actually assembles and then disassembles, except for x86). Clang output for the false version
...
vld1.32 {d2, d3}, [r0] # dummy dst = [r0]
vld1.32 {d0, d1}, [r1] # dummy src = [r1]
vadd.f32 q0, q0, q1
vadd.f32 q0, q0, q1
vst1.32 {d0, d1}, [r0]
...
Notice that clang picked the same GP register for the raw pointers as it used for the memory operand. (gcc seems to choose [sp] for src_mem, but a different reg for the pointer input that you use manually inside an addressing mode). If you hadn't forced it to have the pointers in registers, it could have used an SP-relative addressing mode with an offset for the vector loads, potentially taking advantage of ARM addressing modes.
If you're really not going to modify the pointers inside the asm (e.g. with post-increment addressing modes), then "r" input-only operands makes the most sense. If we'd left in the printf loop, the compiler would have needed dst again after the asm, so it would benefit from having it still in a register. A "+r"(dst_addr) input forces the compiler to assume that that register is no longer usable as a copy of dst. Anyway, gcc always copies the registers, even when it doesn't need it later, whether I make it "r" or "+r", so that's weird.
Using (dummy) memory inputs / outputs means we can drop the volatile, so the compiler can optimize it normally as a pure function of its inputs. (And optimize it away if the result is unused.)
Hopefully this isn't worse code-gen that with the "memory" clobber. But it would probably be better if you just used the "=m" and "m" memory operands, and didn't ask for pointers in registers at all. (That doesn't help if you're going to loop over the array with inline asm, though.)
See also Looping over arrays with inline assembly
I haven't been doing ARM assembly for few years, and I never really bothered to learn GCC inline assembly properly, but I think your code can be rewritten like this, using intrinsics:
#include <cstdio>
#include <arm_neon.h>
template<bool Enable>
void test_tmp()
{
const float32x4_t src = {1.0, 2.0, 3.0, 4.0};
const float32x4_t src2 = {1.0, 1.0, 1.0, 1.0};
float32x4_t z;
z = vaddq_f32(src, src2);
z = vaddq_f32(z, src2);
if (Enable) z = vaddq_f32(z, src2);
float result[4];
vst1q_f32(result, z);
for (int i = 0; i < 4; i++)
{
printf("%f, ", result[i]);//0.0 0.0 0.0 0.0
}
}
template void test_tmp<true>();
template void test_tmp<false>();
You can see resulting machine code + toy around live at: https://godbolt.org/z/Fg7Tci
Compiled with ARM gcc8.2 and command line options "-O3 -mfloat-abi=softfp -mfpu=neon" the "true" variant is:
void test_tmp<true>():
vmov.f32 q9, #1.0e+0 # v4sf
vldr d16, .L6
vldr d17, .L6+8
# and the FALSE variant has one less vadd.f32 in this part
vadd.f32 q8, q8, q9
vadd.f32 q8, q8, q9
vadd.f32 q8, q8, q9
push {r4, r5, r6, lr}
sub sp, sp, #16
vst1.32 {d16-d17}, [sp:64]
mov r4, sp
ldr r5, .L6+16
add r6, sp, #16
.L2:
vldmia.32 r4!, {s15}
vcvt.f64.f32 d16, s15
mov r0, r5
vmov r2, r3, d16
bl printf
cmp r4, r6
bne .L2
add sp, sp, #16
pop {r4, r5, r6, pc}
.L6:
.word 1065353216
.word 1073741824
.word 1077936128
.word 1082130432
.word .LC0
.LC0:
.ascii "%f, \000"
This still leaves me profoundly confused by why the gcc doesn't simply calculate final string with values as string for output, as the inputs are constant. Maybe it's some math-rule about precision preventing it to do that in compile-time as the result could differ slightly from actual target HW platform FPU? I.e. with some fast-math switch it would probably drop that code completely and just produce one output string...
But I guess your code is not actually proper "MCVE" of what you are doing, and the test values would be fed into some real function you are testing, or something like that.
Anyway, if you are working on performance optimizations, you should probably rather avoid inline assembly completely and use intrinsics instead, as that allows the compiler to better allocate registers and optimize code around the calculations (I didn't track it precisely, but I think the last version of this experiment in godbolt was 2-4 instructions shorter/simpler than the original using inline assembly).
Plus you will avoid the incorrect asm constraints like your example code has, those are always tricky to get correctly and pure PITA to maintain if you keep modifying the inlined code often.

LLVM ERROR: Cannot select: 0x5586a71270c0: f32 = Constant<1036831949>

I get the error LLVM ERROR: Cannot select: 0x5644a6291a10: f32 = Constant<1036831949> somewhere inside:
; The fragment the selector rejects: it tries to store (float)(a == 0) | a.
%a2 = load float, float* %a
%a3 = load float, float* %a
%cmp = fcmp one float %a3, 0.000000e+00
%not = xor i1 %cmp, true
%convert = zext i1 %not to i32
%conv = sitofp i32 %convert to float
; NOTE(review): 'or' is defined only for integer (and integer-vector)
; operands, so 'or float' below is invalid IR -- this is the instruction
; the "Cannot select" error points at (see the accepted fix below).
%cmp2 = or float %conv, %a2
store float %cmp2, float* %a
Is there a possible type mismatch occurring here? I have encountered this error before, but in the context of a mismatch of types. Not sure what is wrong here, though.
Found the issue. %cmp2 = or float %conv, %a2 is invalid because or only takes int types.
Tip to other newbies, try running llc myfile.llvm to find issues in your LLVM IR.

Maximum optimization of element wise multiplication via ARM NEON assembly

I'm optimizing an element wise multiplication of two single dimensional arrays for a dual Cortex-A9 processor. Linux is running on the board and I'm using the GCC 4.5.2 compiler.
So the following is my C++ inline assembler function. src1, src2 and dst are 16 byte aligned.
Update: Testable Code:
// Element-wise product of two float arrays using NEON, 4 floats per step.
// src1/src2/dst must be 16-byte aligned (the :128 qualifiers assert it) and
// width*height must be a multiple of 4.
// Fixes versus the original:
//  - the clobber list ended in an unterminated string literal ("d5 without a
//    closing quote), so the function did not even compile;
//  - the pointers and the counter are modified inside the asm (post-increment
//    and subs) but were declared as input-only "r" operands -- they must be
//    read-write "+r";
//  - 'bge' looped while the counter was still >= 0 after the decrement, i.e.
//    loopBound+1 iterations, overrunning all three arrays; 'bgt' runs exactly
//    loopBound iterations;
//  - '.loop:' is a global symbol and would collide if the asm were emitted
//    twice (e.g. after inlining); a numbered local label is safe.
void Multiply(
    const float* __restrict__ src1,
    const float* __restrict__ src2,
    float* __restrict__ dst,
    const unsigned int width,
    const unsigned int height)
{
    int loopBound = (width * height) / 4;
    if (loopBound <= 0)
        return;                       // nothing to do; the loop body would still run once
    asm volatile(
        "1:                                \n\t"
        "vld1.32 {q1}, [%[src1]:128]!      \n\t"
        "vld1.32 {q2}, [%[src2]:128]!      \n\t"
        "vmul.f32 q0, q1, q2               \n\t"
        "vst1.32 {q0}, [%[dst]:128]!       \n\t"
        "subs %[lBound], %[lBound], $1     \n\t"
        "bgt 1b                            \n\t"
        : [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
          [lBound] "+r" (loopBound)
        :
        : "memory", "d0", "d1", "d2", "d3", "d4", "d5"
    );
}
//The following function describes how to test the element wise multiplication
void Test()
{
const unsigned int width = 1024, height = 1024;
// NOTE(review): aligned(16) here decorates the POINTER VARIABLES, not the
// heap blocks returned by new[], so the 16-byte alignment the asm's :128
// qualifiers rely on is not actually guaranteed -- confirm with an aligned
// allocator (e.g. posix_memalign) on the target toolchain.
float* src1 __attribute__((aligned(16))) = new float[width * height];
float* src2 __attribute__((aligned(16))) = new float[width * height];
float* dst __attribute__((aligned(16))) = new float[width * height];
// Fill the sources with arbitrary values so the product is non-trivial.
for(unsigned int i = 0; i < (width * height); i++)
{
src1[i] = (float)rand();
src2[i] = (float)rand();
}
Multiply(src1, src2, dst, width, height);
// Print one element so the computation is observably used.
std::cout << dst[0] << std::endl;
// NOTE(review): the three new[] blocks are never delete[]d -- leaked here.
}
The calculation of 1024*1024 values takes ~0.016 s. (Two threads - each thread calculates a half of the array). Naively interpreted, the calculation of one iteration takes 122 cycles. This seems to be a bit slow. But where is the bottleneck?
I even tried the pld command for preloading elements in the L2 cache, "unrolling" the loop by calculating up to 20 values per iteration and reordering the instructions to make sure the processor is not waiting for memory. I didn't get that much speedup (max 0.001 s faster).
Do you have any suggestions for speeding up the calculation?
I don't really know that much about the NEON. However, I think that you have data dependencies that cause performance issues. I would suggest you prime the loop with some loads and then place them between the multiply and store. I think the store is probably blocking until the multiply is done.
// Software-pipelined variant: the loads for the next iteration sit between
// the multiply and the store so they can overlap with the arithmetic.
// NOTE: as the text below says, with 'bge' this performs loopBound+1
// iterations and loads one vector past the end of each source, so either
// over-allocate the sources or adjust the loop bound and do a final
// multiply+store after the loop.
// Fixes versus the original listing: the clobber list's last string literal
// was unterminated ("d5 without a closing quote), the post-incremented
// pointers and the decremented counter were declared input-only instead of
// read-write "+r", and the global '.loop:' label is replaced by a numbered
// local label so the asm can appear more than once in a translation unit.
asm volatile(
    "vld1.32 {q1}, [%[src1]:128]!      \n\t"
    "vld1.32 {q2}, [%[src2]:128]!      \n\t"
    "1:                                \n\t"
    "vmul.f32 q0, q1, q2               \n\t"
    "vld1.32 {q1}, [%[src1]:128]!      \n\t"
    "vld1.32 {q2}, [%[src2]:128]!      \n\t"
    "vst1.32 {q0}, [%[dst]:128]!       \n\t"
    "subs %[lBound], %[lBound], $1     \n\t"
    "bge 1b                            \n\t"
    : [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
      [lBound] "+r" (loopBound)
    :
    : "memory", "d0", "d1", "d2", "d3", "d4", "d5"
);
This way you should be able to parallel the loads with the multiply. You will need to over-allocate the source arrays or change the loop index and do a final multiply and store. If the NEON ops are not affecting the condition codes, you can re-order the subs as well and place it earlier.
Edit: In fact, the Cortex A-9 Media processing Engine document recommends interleaving ARM and NEON instructions as they can execute in parallel. Also, the NEON instructions seem to set FPSCR and not the ARM CPSR so re-ordering the subs would decrease the execution time. You may also cache align the loop.

Does the LLVM OCaml binding include intrinsic support?

I can't seem to find reference to intrinsics in the official LLVM OCaml binding, beyond the is_intrinsic function.
I am building a backend which needs to perform some target-specific code-generation (for SSE, AVX, and NEON), and intrinsics are the standard path in the C++ API.
The OCaml binding supports intrinsics in exactly the same way as the C language binding:
There is no special support for them in the interface (as there is in the full C++ interface), but they can be declared extern and called just like any other functions.
e.g.:
open Llvm

(* Demonstrates calling an LLVM intrinsic through the OCaml bindings:
   an intrinsic is simply declared like any other external function and
   called with build_call.
   Fixes versus the original listing: i32_t was used but never bound
   (added [let i32_t = i32_type c]), and the '@' sigils in the quoted IR
   comments had been mangled to '#'. *)
let () =
  let c = create_context () in
  let f32_t = float_type c in
  let f32x4_t = vector_type f32_t 4 in
  let i32_t = i32_type c in          (* missing in the original listing *)
  let m = create_module c "test" in
  (* declare void @printv(<4 x float>)
   * nonce extern which forces preservation of vector results *)
  let printv =
    declare_function "printv"
      (function_type (void_type c) [|f32x4_t|]) m in
  (* declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone *)
  let sqrtps =
    declare_function "llvm.x86.sse.sqrt.ps"
      (function_type f32x4_t [|f32x4_t|]) m in
  (* define i32 @main() { entry: *)
  let main = define_function "main" (function_type i32_t [| |]) m in
  let at_entry = builder_at_end c (entry_block main) in
  (* %sqrtps = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>) *)
  let cv1234 = const_vector [| const_float f32_t 1.0; const_float f32_t 2.0;
                               const_float f32_t 3.0; const_float f32_t 4.0 |] in
  let sqrt = build_call sqrtps [| cv1234 |] "sqrtps" at_entry in
  (* call void @printv(<4 x float> %sqrtps) *)
  ignore (build_call printv [| sqrt |] "" at_entry);
  (* ret i32 0 *)
  ignore (build_ret (const_null i32_t) at_entry);
  (* Print the .ll to stderr *)
  dump_module m
produces:
; ModuleID = 'test'
declare void @printv(<4 x float>)
declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
define i32 @main() {
entry:
%sqrtps = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>)
call void @printv(<4 x float> %sqrtps)
ret i32 0
}
which compiles into x86 correctly invoking sqrtps on the xmm registers.
Disclaimer: I never used LLVM. After a quick look at the binding documentation, it looks like the answer is "no". There is a support for inline assembly however, which may or may not suit your needs.
Finally, it seems it is accepted by the LLVM developers that the OCaml binding is not complete: if you want to, you can add some more functionality (if you're not familiar with OCaml, C bindings are really not the easiest part, but the LLVM bindings are full of example code that you could probably successfully adapt to other LLVM functions), then provide a patch for it on the LLVMdev list.