Proper way to load structs in LLVM - llvm

I have been experimenting with MCJIT in LLVM and have come across a problem when trying to load struct types.
The relevant struct I am trying to load can be seen here:
/* Complex single-precision value shared between the JIT'd IR and C:
 * x = real part, y = imaginary part (cabs below computes
 * sqrtf(x*x + y*y), i.e. the complex magnitude). */
typedef struct {
    float x;
    float y;
} cfloat;
The generated LLVM code can be seen below.
--LLVM IR--
; ModuleID = 'Test_Module'
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Two-float aggregate mirroring the C 'cfloat' struct.
; NOTE(review): passing this *by value* must match the host C ABI —
; presumably the y component is lost because the native ABI packs
; {float, float} differently than a literal two-field IR struct
; (the <2 x float> workaround mentioned in the EDIT supports this).
%cfloat = type { float, float }

; NOTE(review): global symbols in LLVM IR are spelled with '@'; the
; original paste showed '#', which is not valid IR syntax and appears
; to be a formatting artifact. Restored to '@' throughout.
declare float @cabs(%cfloat)
declare float @cabsptr(%cfloat*)

define i32 @test(i32 %argc, i8** %args) {
entry:
  ; args is an i8** argv-style array: slot 0 = input cfloat*, slot 1 = output float*
  %argloc0 = getelementptr i8*, i8** %args, i64 0
  %argloc1 = getelementptr i8*, i8** %args, i64 1
  %arg0 = load i8*, i8** %argloc0
  %iptr = bitcast i8* %arg0 to %cfloat*
  %arg1 = load i8*, i8** %argloc1
  %optr = bitcast i8* %arg1 to float*
  ; load the whole struct and pass it by value (the failing path) ...
  %ival = load %cfloat, %cfloat* %iptr
  %oval = call float @cabs(%cfloat %ival)
  ; ... and also pass the pointer (the path that works)
  %oval2 = call float @cabsptr(%cfloat* %iptr)
  store float %oval, float* %optr
  ret i32 %argc
}
The relevant structs and functions:
/* By-value callee: prints the components it received, returns the
 * complex magnitude sqrtf(x^2 + y^2).  extern "C" keeps the symbol
 * unmangled so MCJIT can resolve it by name.  Per the question text,
 * this is the path that misreads val.y (prints 300.0000, 0.0000). */
extern "C" float cabs(cfloat val)
{
    printf("%f, %f\n", val.x, val.y);
    return sqrtf(val.x * val.x + val.y * val.y);
}
/* By-pointer callee: identical computation to cabs, but receives a
 * cfloat*, so no ABI struct-passing is involved.  Per the question
 * text, this path prints the correct values (300.0000, -400.0000). */
extern "C" float cabsptr(cfloat *val)
{
    printf("%f, %f\n", val->x, val->y);
    return sqrtf(val->x * val->x + val->y * val->y);
}
When I pass in cfloat val = {300, -400}; to the function
cabs prints 300.0000, 0.0000.
cabsptr prints 300.0000, -400.0000.
I'd expect both the functions to print the input values.
Is this a bug I am encountering or did I miss something when generating the LLVM function?
The full source code to reproduce this issue can be seen here:
https://gist.github.com/pavanky/4439f1a6c2ca182058c19187aa8215a7
EDIT:
Defining my custom type to be a VectorType of <2 x float> seems to be a work around for me. But I'd like to know a proper solution if available.

Related

SSE vector Operation on type double

I want to use SIMD operations on vectors containing double values on the AMD64 architecture. Below is a simple example of my problem: it works fine if I print float values, but not for double. I need precision of up to 9 decimal digits.
#include<stdio.h>
#include<emmintrin.h>
/* Union wrapper around a single SSE2 vector of two doubles.
 * NOTE(review): the names "f4vector"/"float4" suggest four floats, but
 * the payload is __m128d (two doubles) — the naming is misleading. */
typedef union f4vector
{
    __m128d v;
}float4;
/* Question's reproduction case: multiply two 2-double vectors and
 * print the result.  The printed values are wrong — see the bug note
 * on the printf call. */
int main()
{
    float4 x,y,z;
    double f0[2]={2334, 5};
    double f1[2]={2334.32345324 , 5};
    double f3[2];
    /* _mm_set_pd(hi, lo) — note it also reverses element order
     * relative to the source arrays. */
    x.v=_mm_set_pd(f0[0], f0[1]);
    y.v = _mm_set_pd(f1[0], f1[1]);
    z.v = _mm_mul_pd(x.v , y.v);
    /* z.v[i] subscripting of __m128d is a GCC/Clang vector extension. */
    f3[0]=z.v[0];
    f3[1]=z.v[1];
    /* BUG (the subject of the answer): %d is the int specifier;
     * printing doubles requires %f — mismatched specifiers are UB. */
    printf("%d, %d\n", f3[0], f3[1]); // doesnt print correct values.
}
You have some mistakes:
Using %d format specifier instead of %f in function printf.
To use SIMD instructions effectively you have to load and store data using vector instructions such as _mm_loadu_pd/_mm_storeu_pd. The intrinsic _mm_set_pd is very inefficient.
Below I write correct example:
#include<stdio.h>
#include<emmintrin.h>
/* Corrected example: data moves through _mm_loadu_pd/_mm_storeu_pd
 * and is printed with %f.  The output shown just below corresponds
 * to this exact code. */
int main()
{
    double d0[2] = { 2334, 5 };
    double d1[2] = { 2334.32345324 , 5 };
    double d2[2] = { 0, 0 };
    __m128d v0 = _mm_loadu_pd(d0);   /* unaligned 2 x double load  */
    __m128d v1 = _mm_loadu_pd(d1);
    __m128d v2 = _mm_mul_pd(v0, v1); /* elementwise multiply       */
    _mm_storeu_pd(d2, v2);           /* unaligned 2 x double store */
    printf("%f, %f\n", d2[0], d2[1]);
}
Output:
5448310.939862, 25.000000
In printf you used the %d format specifier; you need to use %f, since you want to print a double value.

Fast conversion of 16-bit big-endian to little-endian in ARM

I need to convert big arrays of 16-bit integer values from big-endian to little-endian format.
Now I use for conversion the following function:
/* Swap the two bytes of one 16-bit value (endianness reversal).
 * NOTE(review): the casts type-pun uint8_t* as uint16_t*, which is a
 * strict-aliasing / alignment violation in ISO C.  It works under the
 * asker's GCC build, but memcpy would be the well-defined form. */
inline void Reorder16bit(const uint8_t * src, uint8_t * dst)
{
    uint16_t value = *(uint16_t*)src;
    *(uint16_t*)dst = value >> 8 | value << 8;
}
/* Byte-swap an entire buffer of 16-bit samples; size is in bytes.
 * NOTE(review): two functions named Reorder16bit with different
 * parameter lists means this snippet is C++ (overloading), not C. */
void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst)
{
    assert(size%2 == 0);   /* whole 16-bit units only */
    for(size_t i = 0; i < size; i += 2)
        Reorder16bit(src + i, dst + i);
}
I use GCC. Target platform is ARMv7 (Raspberry Pi 2 B).
Is there any way to optimize it?
This conversion is needed for loading audio samples, which can be in either little-endian or big-endian format. Of course it is not a bottleneck now, but it takes about 10% of total processing time. And I think that is too much for such a simple operation.
If you want to improve the performance of your code you can do the following:
1) Processing of 4-bytes for one step:
/* Scalar helper (unchanged from the question): swap the two bytes of
 * a single 16-bit value; used for the tail of odd-sized buffers. */
inline void Reorder16bit(const uint8_t * src, uint8_t * dst)
{
    uint16_t value = *(uint16_t*)src;
    *(uint16_t*)dst = value >> 8 | value << 8;
}
/*
 * Byte-swap two 16-bit values packed into 4 bytes (endianness reversal).
 *
 * Fixes relative to the snippet as posted:
 *  - BUG: the original stored through *(size_t*)dst — size_t is 8 bytes
 *    on a 64-bit target, so the store wrote 4 bytes past the intended
 *    range.  Both the load and the store now use exactly 4 bytes.
 *  - The pointer-cast loads/stores were strict-aliasing and possible
 *    misalignment UB; memcpy is the well-defined equivalent and
 *    compiles to the same single load/store at -O2.
 *  - 'static' gives the helper internal linkage so a plain C build does
 *    not require a separate external definition of the inline function.
 */
static inline void Reorder16bit2(const uint8_t * src, uint8_t * dst)
{
    uint32_t value;
    memcpy(&value, src, sizeof value);
    value = (value & 0xFF00FF00) >> 8 | (value & 0x00FF00FF) << 8;
    memcpy(dst, &value, sizeof value);
}
/* Driver: swap 4 bytes (two 16-bit values) per iteration, then finish
 * any 2-byte remainder with the scalar helper; size is in bytes. */
void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst)
{
    assert(size%2 == 0);
    size_t alignedSize = size/4*4;   /* largest multiple of 4 <= size */
    for(size_t i = 0; i < alignedSize; i += 4)
        Reorder16bit2(src + i, dst + i);
    for(size_t i = alignedSize; i < size; i += 2)
        Reorder16bit(src + i, dst + i);
}
If you use a 64-bit platform, it is possible to process 8 bytes for one step the same way.
2) ARMv7 platform supports SIMD instructions called NEON.
Using them, you can make your code even faster than in 1):
/* Scalar tail helper, identical to the earlier versions: swap the two
 * bytes of one 16-bit value. */
inline void Reorder16bit(const uint8_t * src, uint8_t * dst)
{
    uint16_t value = *(uint16_t*)src;
    *(uint16_t*)dst = value >> 8 | value << 8;
}
/* NEON helper: load 16 bytes, reverse the byte order inside each
 * 16-bit lane (vrev16q_u8), store 16 bytes — eight swaps per call. */
inline void Reorder16bit8(const uint8_t * src, uint8_t * dst)
{
    uint8x16_t _src = vld1q_u8(src);
    vst1q_u8(dst, vrev16q_u8(_src));
}
/* Driver: 16 bytes (eight 16-bit values) per NEON iteration, scalar
 * loop for the 2..14-byte remainder; size is in bytes. */
void Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst)
{
    assert(size%2 == 0);
    size_t alignedSize = size/16*16;   /* largest multiple of 16 <= size */
    for(size_t i = 0; i < alignedSize; i += 16)
        Reorder16bit8(src + i, dst + i);
    for(size_t i = alignedSize; i < size; i += 2)
        Reorder16bit(src + i, dst + i);
}
https://goo.gl/4bRGNh
/* Demonstration: GCC lowers __builtin_bswap16 to REV16 (plus UXTH to
 * zero-extend the 16-bit result into the int return) on ARM, as the
 * assembly shown just below illustrates. */
int swap(int b) {
    return __builtin_bswap16(b);
}
becomes
swap(int):
rev16 r0, r0
uxth r0, r0
bx lr
So yours could be written as (gcc-explorer: https://goo.gl/HFLdMb)
/* Buffer byte-swap built on __builtin_bswap16 so GCC can emit REV16.
 * NOTE(review): unlike the earlier byte-oriented versions, src/dst are
 * uint16_t* and the loop advances one *element* per step — so "size"
 * here is presumably the element count, not a byte count, and the
 * size%2 assert looks inherited from the byte version.  Confirm the
 * intended units against callers before reusing this. */
void fast_Reorder16bit(const uint16_t * src, size_t size, uint16_t * dst)
{
    assert(size%2 == 0);
    for(size_t i = 0; i < size; i++)
        dst[i] = __builtin_bswap16(src[i]);
}
which should produce the following loop:
.L13:
ldrh r4, [r0, r3]
rev16 r4, r4
strh r4, [r2, r3] # movhi
adds r3, r3, #2
cmp r3, r1
bne .L13
read more about __builtin_bswap16 at GCC builtin docs.
Neon suggestion (kinda tested, gcc-explorer: https://goo.gl/fLNYuc):
/* NEON one-liner per 16 bytes: load, reverse bytes within each 16-bit
 * lane, store.  This variant has no scalar tail, so size must be a
 * whole number of 16-byte vectors. */
void neon_Reorder16bit(const uint8_t * src, size_t size, uint8_t * dst)
{
    assert(size%16 == 0);
    //uint16x8_t vld1q_u16 (const uint16_t *)
    //vrev64q_u16(uint16x8_t vec);
    //void vst1q_u16 (uint16_t *, uint16x8_t)
    for (size_t i = 0; i < size; i += 16)
        vst1q_u8(dst + i, vrev16q_u8(vld1q_u8(src + i)));
}
which becomes
.L23:
adds r5, r0, r3
adds r4, r2, r3
adds r3, r3, #16
vld1.8 {d16-d17}, [r5]
cmp r1, r3
vrev16.8 q8, q8
vst1.8 {d16-d17}, [r4]
bhi .L23
See more about neon intrinsics here: https://gcc.gnu.org/onlinedocs/gcc-4.4.1/gcc/ARM-NEON-Intrinsics.html
Bonus from ARM ARM A8.8.386:
VREV16 (Vector Reverse in halfwords) reverses the order of 8-bit elements in each halfword of the vector, and places the result in the corresponding destination vector.
VREV32 (Vector Reverse in words) reverses the order of 8-bit or 16-bit elements in each word of the vector, and places the result in the corresponding destination vector.
VREV64 (Vector Reverse in doublewords) reverses the order of 8-bit, 16-bit, or 32-bit elements in each doubleword of the vector, and places the result in the corresponding destination vector.
There is no distinction between data types, other than size.
If it's specifically for ARM there's a REV instruction, specifically REV16 which would do two 16 bit ints at a time.
I don't know much about ARM instruction sets, but I guess there are some special instruction to endianess conversion. Apparently, ARMv7 has things like rev etc.
Have you tried the compiler intrinsic __builtin_bswap16? It should compile to CPU-specific code, e.g. rev on ARM. In addition, it helps the compiler to recognize that you are actually doing a byte-swap, and it performs other optimizations with that knowledge, e.g. eliminate redundant byte swaps entirely in cases like y=swap(x); y &= some_value; x = swap(y);.
I googled a little bit, and this thread discusses an issue with the optimization potential. According to this discussion, the compiler can also vectorize the conversion if the CPU supports the vrev NEON instruction.
You'd want to measure to see which is faster, but an alternative body for Reorder16bit would be
*(uint16_t*)dst = 256 * src[0] + src[1];
assuming that your native ints are little-endian. Another possibility:
dst[0] = src[1];
dst[1] = src[0];

How to prevent FTZ for a single line in CUDA

I am working on a particle code where flushing-to-zero is extensively used to extract performance. However there is a single floating point comparison statement that I do not wish to be flushed. One solution is to use inline PTX, but it introduces unnecessary instructions since there is no boolean type, but just predicate registers, in PTX:
C++ code:
float a, b;
if ( a < b ) do_something;
// compiles into SASS:
// FSETP.LT.FTZ.AND P0, PT, A, B, PT;
// #P0 DO_SOMETHING
PTX:
float a, b;
uint p;
asm("{.reg .pred p; setp.lt.f32 p, %1, %2; selp %0, 1, 0, p;}" : "=r"(p) : "f"(a), "f"(b) );
if (p) do_something;
// compiled into SASS:
// FSETP.LT.AND P0, PT, A, B, PT;
// SEL R2, RZ, 0x1, !P0;
// ISETP.NE.AND P0, PT, R2, RZ, PT;
// #P0 DO_SOMETHING
Is there a way that I can do the non-FTZ comparison with a single instruction without coding the entire thing in PTX/SASS?

get xyz from vector

Below is a function which helps control the car, it creates a matrix then creates a vector using that matrix and now I want to be able to get the x,y and z from that vector.
I've tried i32 CarX = vecVel.GetX; but I'm getting these three errors:
error C3867: 'Vec2::GetX': function call missing argument list; use '&Vec2::GetX' to create a pointer to member
error C2440: '=' : cannot convert from 'const float (__thiscall Vec2::* )(void) const' to 'i32'
IntelliSense: a pointer to a bound function may only be used to call the function
I thought i32 carX = vecVel[0]; might work, but no. i32 carX = vecVel.FX; doesn't work because it says FX is protected.
/* Per-frame car update: integrates velocity into the object's matrix,
 * applies friction and wheel rotation, then resolves collisions by
 * reflecting the velocity off the contact normal ("bounce").
 * NOTE(review): Matrix/Vec3/Car and the Level_/Collision_/Trail_ calls
 * are project APIs not visible here; comments about them are inferred
 * from usage and should be confirmed against their headers. */
void APIENTRY Car_Update(Object *pObject)
{
    Car *pCar=(Car *)pObject;
    Matrix mat;
    Matrix *pmat;
    Object_GetMatrix(pObject,&mat);      /* local working copy           */
    Object_GetMatrixPtr(pObject,&pmat);  /* last committed matrix        */
    /* velocity = bounce impulse + forward axis (column 2) * speed */
    Vec3 vecVel(pCar->vecBounce + mat.GetColumn(2) * pCar->fSpeed);
    pCar->vecBounce = pCar->vecBounce *0.5f;   /* decay bounce each frame */
    mat.RotY(pCar->fRot);
    /* column 3 presumably holds translation: advance by velocity */
    mat.SetColumn(3, vecVel + mat.GetColumn(3));
    //mat.GetColumn;
    pCar->fSpeed *= 0.8f;// friction
    pCar->fRotWheelLast = pCar->fRotWheel;
    pCar->fRotWheel += (pCar->fSpeed*30.f);
    Level_GenerateDraw( &mat.GetColumn(3) );
    Level_GenerateAlphas( &mat.GetColumn(3) );
    Car_Light(pCar);
    Collision_UpdateMat(pCar->pBox, &mat);
    float fCol=1.f;
    /* Iteratively damp the bounce until the box no longer collides. */
    while(Level_TestBoxCollide( pCar->pBox ))
    {
        ColData Data;
        float fDot;
        Collision_GetColData(&Data);
        /* reflect velocity about the contact normal; -1.8 acts like a
         * restitution factor */
        fDot = -1.8f * Data.normal.Dot(vecVel);
        pCar->vecBounce = (vecVel + Data.normal * fDot)*fCol;
        pCar->fSpeed = 0.f;
        /* retry from the previously committed position plus the bounce */
        mat.SetColumn(3, pmat->GetColumn(3) + pCar->vecBounce);
        Collision_UpdateMat(pCar->pBox, &mat);
        vecVel = pCar->vecBounce;
        fCol-=0.1f;   /* shrink the bounce on every retry */
        if(fCol<0.f)
        {
            /* give up: zero all motion and restore the old position */
            pCar->vecBounce.Set(0.f,0.f,0.f);
            mat.SetColumn(3, pmat->GetColumn(3));
            Collision_UpdateMat(pCar->pBox, &mat);
            ASSERT(!Level_TestBoxCollide( pCar->pBox ), "still colliding");
        }
    }
    Object_SetMatrix(pObject, &mat);
    /* trail points under the two wheels: offset sideways along
     * column 0 and dropped 0.7 units in Y */
    Vec3 vecWidth(mat.GetColumn(0)*0.2f);
    Vec3 vecWheel1(mat.GetColumn(3) - mat.GetColumn(0)*0.6f);
    Vec3 vecWheel2(mat.GetColumn(3) + mat.GetColumn(0)*0.6f);
    vecWheel1.SetY( vecWheel1.GetY() - 0.7f );
    vecWheel2.SetY( vecWheel2.GetY() - 0.7f );
    Trail_AddPoint((Object*)pCar->pTrail[0], vecWheel1, vecWidth);
    Trail_AddPoint((Object*)pCar->pTrail[1], vecWheel2, vecWidth);
}
The compiler is telling you that in order to call a function on a class you must follow the call with parentheses: i32 carX = vecVel.FX(). That is assuming that FX is a function that returns an i32 value on Vec2.

I have 3 errors:expected a ")" , expected an expression, argument of type long is incompatible with parameter of type U32

/*******************************************************************/
/* BUG (the subject of the question's errors): the trailing ';' is part
 * of each macro's replacement text — cdisp(a, src, col) expands to a
 * bare ';' and FL_wpset_U8 expands to "256;" — so using them inside an
 * expression injects a stray statement separator.  As the answers
 * below state, the semicolons should be removed. */
#define cdisp(a, src, col);
#define FL_wpset_U8 256;
/*******************************************************************/
void main(void)
{
int posx= 100, posy=100, dx=300, dy=300;
long length=5000;
int threshold=125;
int lx, x0=0, y0=0;
int res1=0,res2=0, *rlc, *input, i;
long dest1,dest2,desttemp, addr;
char c;
image Area, Ovl;
ScrSetLogPage((int)ScrGetPhysPage);
OvlSetLogPage((int)OvlGetPhysPage);
OvlClearAll;
set_ovlmask(255);
ImageAssign(&Area,ScrByteAddr(posx,posy), dx, dy, ScrGetPitch);
ImageAssign(&Ovl,OvlBitAddr(posx,posy), dx, dy, OvlGetPitch);
frameo(&Ovl);
vmode(vmOvlLive);
/* follow contour */
dest1=DRAMWordMalloc((long)length);
dest2=DRAMWordMalloc((long)length);
The 2 errors are in the line frameo(&Ovl) (expected a ")" , expected an expression).
desttemp = dest1;
res1 = contour8(&Area,x0,y0,~2,threshold,length,&desttemp);
The last error is in the last line (argument of type long is incompatible with parameter of type U32). The function signature of contour8 is I32 contour8(image *a, I32 x0, I32 y0, I32 dir, I32 thr, U32 lng, U32 **dst).
I don't know how to solve it ,thanks in advance.
You almost certainly don't want those semicolons at the end of your #define lines.
Using them will most likely inject the statement separator into the middle of expressions.
You should remove the ; from defines.