CreateTruncOrBitCast or CreateAnd - llvm

I am using LLVM for code instrumentation, where each byte of a variable needs to be extracted. I have two alternatives:
for (int i = 0; i < 8; i++)
{
Value* cur_byte = CreateTruncOrBitCast(var, Int8Ty);
cur_byte = IRB.CreateZExt(cur_byte, Int64Ty);
/* do something */
var = IRB.CreateLShr(var, 8);
}
Or
for (int i = 0; i < 8; i++)
{
Value* cur_byte = CreateAnd(var, 0xFF);
/* do something */
var = IRB.CreateLShr(var, 8);
I would like to consult which method is faster, or is there any other better method?

Related

Bullet Physics - How can I convert btHeightfieldTerrainShape to btTriangleMeshShape?

I found that btStridingMeshInterface is required to create btBvhTriangleMeshShape.
But all I have is as much data as (Width * Height) for btTerrainHeightfieldShape. There is no index information on the data.
Then, I found this post on Forum : https://pybullet.org/Bullet/phpBB3/viewtopic.php?p=38852&hilit=btTriangleIndexVertexArray#p38852
so I modify that codes like this :
auto TerrainShape = new btHeightfieldTerrainShape(mWidth * mTerrainScale.x, mDepth * mTerrainScale.z, mHeightmapData, minHeight, maxHeight, 1, false);
TerrainShape->setLocalScaling(btVector3(1, mTerrainScale.y, 1));
btVector3 aabbMin, aabbMax;
for (int k = 0; k < 3; k++)
{
aabbMin[k] = -BT_LARGE_FLOAT;
aabbMax[k] = BT_LARGE_FLOAT;
}
std::vector<XMFLOAT3> vertices;
std::vector<unsigned short> Targetindices;
btTriangleCollector collector;
collector.m_pVerticesOut = &vertices;
collector.m_pIndicesOut = &Targetindices;
TerrainShape->processAllTriangles(&collector, aabbMin, aabbMax);
mVertexArray = std::make_shared<btTriangleIndexVertexArray>();
btIndexedMesh tempMesh;
mVertexArray->addIndexedMesh(tempMesh, PHY_FLOAT);
btIndexedMesh& mesh = mVertexArray->getIndexedMeshArray()[0];
const int32_t VERTICES_PER_TRIANGLE = 3;
size_t numIndices = Targetindices.size();
mesh.m_numTriangles = numIndices / VERTICES_PER_TRIANGLE;
if (numIndices < std::numeric_limits<int16_t>::max())
{
mesh.m_triangleIndexBase = new unsigned char[sizeof(int16_t) * (size_t)numIndices];
mesh.m_indexType = PHY_SHORT;
mesh.m_triangleIndexStride = VERTICES_PER_TRIANGLE * sizeof(int16_t);
}
else
{
mesh.m_triangleIndexBase = new unsigned char[sizeof(int32_t) * (size_t)numIndices];
mesh.m_indexType = PHY_INTEGER;
mesh.m_triangleIndexStride = VERTICES_PER_TRIANGLE * sizeof(int32_t);
}
mesh.m_numVertices = vertices.size();
mesh.m_vertexBase = new unsigned char[VERTICES_PER_TRIANGLE * sizeof(btScalar) * (size_t)mesh.m_numVertices];
mesh.m_vertexStride = VERTICES_PER_TRIANGLE * sizeof(btScalar);
btScalar* vertexData = static_cast<btScalar*>((void*)(mesh.m_vertexBase));
for (int32_t i = 0; i < mesh.m_numVertices; ++i)
{
int32_t j = i * VERTICES_PER_TRIANGLE;
const XMFLOAT3& point = vertices[i];
vertexData[j] = point.x;
vertexData[j + 1] = point.y;
vertexData[j + 2] = point.z;
}
if (numIndices < std::numeric_limits<int16_t>::max())
{
int16_t* indices = static_cast<int16_t*>((void*)(mesh.m_triangleIndexBase));
for (int32_t i = 0; i < numIndices; ++i) {
indices[i] = (int16_t)Targetindices[i];
}
}
else
{
int32_t* indices = static_cast<int32_t*>((void*)(mesh.m_triangleIndexBase));
for (int32_t i = 0; i < numIndices; ++i) {
indices[i] = Targetindices[i];
}
}
btBvhTriangleMeshShape* shape = new btBvhTriangleMeshShape(mVertexArray.get(), true);
btTransform btTerrainTransform;
btTerrainTransform.setIdentity();
btTerrainTransform.setOrigin(btVector3(0, 0, 0));
mBtRigidBody = physics->CreateRigidBody(0.0f, btTerrainTransform, shape);
What I try to do is simple. make btHeightfieldTerrainShape, and collect all triangle data by TriangleCallback(collector) and make btTriangleIndexVertexArray by that data sets.
problem is data that I collected seems wrong.
if this codes works well, mBtRigdBody's AABB must be min(-511.5, 4, -511.5), max(511.5, 242, 511.5),
but, actually what i got is min(-511.5, -105.389..., -511.5), max(511.5, 25.8... ,-500.5)
definitely wrong values.
I think btTriangleIndexVertexArray doesn't have right data of terrain, but I can't find where's wrong.

SSE addition and conversion

Here's the thing, how can I add two unsigned char arrays and store the result in an unsigned short array by using SSE. Can anyone give me some help or hint. This is what I have done so far. I just don't know where the error is..need some help
#include<iostream>
#include<intrin.h>
#include<windows.h>
#include<emmintrin.h>
#include<iterator>
using namespace std;
void sse_add(unsigned char * input1, unsigned char *input2, unsigned short *output, const int N)
{
unsigned char *op3 = new unsigned char[N];
unsigned char *op4 = new unsigned char[N];
__m128i *sse_op3 = (__m128i*)op3;
__m128i *sse_op4 = (__m128i*)op4;
__m128i *sse_result = (__m128i*)output;
for (int i = 0; i < N; i = i + 16)
{
__m128i src = _mm_loadu_si128((__m128i*)input1);
__m128i zero = _mm_setzero_si128();
__m128i higher = _mm_unpackhi_epi8(src, zero);
__m128i lower = _mm_unpacklo_epi8(src, zero);
_mm_storeu_si128(sse_op3, lower);
sse_op3 = sse_op3 + 1;
_mm_storeu_si128(sse_op3, higher);
sse_op3 = sse_op3 + 1;
input1 = input1 + 16;
}
for (int j = 0; j < N; j = j + 16)
{
__m128i src1 = _mm_loadu_si128((__m128i*)input2);
__m128i zero1 = _mm_setzero_si128();
__m128i higher1 = _mm_unpackhi_epi8(src1, zero1);
__m128i lower1 = _mm_unpacklo_epi8(src1, zero1);
_mm_storeu_si128(sse_op4, lower1);
sse_op4 = sse_op4 + 1;
_mm_storeu_si128(sse_op4, higher1);
sse_op4 = sse_op4 + 1;
input2 = input2 + 16;
}
__m128i *sse_op3_new = (__m128i*)op3;
__m128i *sse_op4_new = (__m128i*)op4;
for (int y = 0; y < N; y = y + 8)
{
*sse_result = _mm_adds_epi16(*sse_op3_new, *sse_op4_new);
sse_result = sse_result + 1;
sse_op3_new = sse_op3_new + 1;
sse_op4_new = sse_op4_new + 1;
}
}
void C_add(unsigned char * input1, unsigned char *input2, unsigned short *output, int N)
{
for (int i = 0; i < N; i++)
output[i] = (unsigned short)input1[i] + (unsigned short)input2[i];
}
int main()
{
int n = 1023;
unsigned char *p0 = new unsigned char[n];
unsigned char *p1 = new unsigned char[n];
unsigned short *p21 = new unsigned short[n];
unsigned short *p22 = new unsigned short[n];
for (int j = 0; j < n; j++)
{
p21[j] = rand() % 256;
p22[j] = rand() % 256;
}
C_add(p0, p1, p22, n);
cout << "C_add finished!" << endl;
sse_add(p0, p1, p21, n);
cout << "sse_add finished!" << endl;
for (int j = 0; j < n; j++)
{
if (p21[j] != p22[j])
{
cout << "diff!!!!!#######" << endl;
}
}
//system("pause");
delete[] p0;
delete[] p1;
delete[] p21;
delete[] p22;
return 0;
}
Assuming everything is aligned to _Alignof(__m128i) and the size of the array is a multiple of sizeof(__m128i), something like this should work:
void addw(size_t size, uint16_t res[size], uint8_t a[size], uint8_t b[size]) {
__m128i* r = (__m128i*) res;
__m128i* ap = (__m128i*) a;
__m128i* bp = (__m128i*) b;
for (size_t i = 0 ; i < (size / sizeof(__m128i)) ; i++) {
r[(i * 2)] = _mm_add_epi16(_mm_cvtepu8_epi16(ap[i]), _mm_cvtepu8_epi16(bp[i]));
r[(i * 2) + 1] = _mm_add_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(ap[i], 8)), _mm_cvtepu8_epi16(_mm_srli_si128(bp[i], 8)));
}
}
FWIW, NEON would be a bit simpler (using vaddl_u8 and vaddl_high_u8).
If you're dealing with unaligned data you can use _mm_loadu_si128/_mm_storeu_si128. If size isn't a multiple of 16 you'll just have to do the remainder without SSE.
Note that this may be something your compiler can do automatically (I haven't checked). You may want to try something like this:
#pragma omp simd
for (size_t i = 0 ; i < size ; i++) {
res[i] = ((uint16_t) a[i]) + ((uint16_t) b[i]);
}
That uses OpenMP 4, but there is also Cilk++ (#pragma simd), clang (#pragma clang loop vectorize(enable)), gcc (#pragma GCC ivdep), or you could just hope the compiler is smart enough without the pragma hint.

Multiple sequence alignment with gaps in input (Seqan)

I have a sequence of strings like so: ["123-5", "1-45", "--345"]. In result is perfect to get 12345. So sometimes I know that in certain position I have symbol, but do not know which. From examples I get this code.
typedef String<char> TSequence; // sequence type
typedef Align<TSequence, ArrayGaps> TAlign; // align type
unsigned int plate_count = plates.size();
TAlign align;
resize(rows(align), plate_count);
for (unsigned int i = 0; i < plate_count; ++i)
assignSource(row(align, i), plates[i]);
globalMsaAlignment(align, SimpleScore(5, -3, -1, -3));
// create the profile string
String<ProfileChar<char> > profile;
resize(profile, length(row(align, 0)));
for (unsigned rowNo = 0; rowNo < plate_count; ++rowNo)
for (unsigned i = 0; i < length(row(align, rowNo)); ++i)
profile[i].count[ordValue(row(align, rowNo)[i])] += 1;
// call consensus from this string
String<char> consensus;
for (unsigned i = 0; i < length(profile); ++i)
{
char idx = (char)getMaxIndex(profile[i]);
if (idx == '-') {
int bck = profile[i].count[ordValue('-')];
profile[i].count[ordValue('-')] = 0;
idx = (char)getMaxIndex(profile[i]);
if (profile[i].count[ordValue(idx)] == 1) { // ignore single recognitions
idx = '#';
}
profile[i].count[ordValue('-')] = bck;
}
appendValue(consensus, idx);
}
return string(toCString(consensus));
How can I tell Seqan that there is a symbol in specific position?

Free memory after malloc in a loop

I got some memory allocated in a loop - how to free it when I am done with tr_data variable ?
(I am fairly new to C++)
#define Malloc(type,n) (type *)malloc((n)*sizeof(type))
struct svm_problem tr_data;
tr_data.l = (int) prm_num_samples_anchored.array[bar];
tr_data.y = Malloc(double, tr_data.l);
tr_data.x = Malloc(struct svm_node*, tr_data.l);
for (int row = 0; row < tr_data.l; row++)
{
tr_data.y[row] = ta0.array[bar-row-1];
//leak
svm_node* tr_data_x_onerow = Malloc(svm_node, num_features+1);
tr_data_x_onerow[0].index = 1; tr_data_x_onerow[0].value = in0.array[bar-row-1]; tr_data_x_onerow[1].index = 2; tr_data_x_onerow[1].value = in1.array[bar-row-1]; tr_data_x_onerow[2].index = 3; tr_data_x_onerow[2].value = in2.array[bar-row-1]; tr_data_x_onerow[3].index = 4; tr_data_x_onerow[3].value = in3.array[bar-row-1]; tr_data_x_onerow[4].index = 5; tr_data_x_onerow[4].value = in4.array[bar-row-1]; tr_data_x_onerow[5].index = 6; tr_data_x_onerow[5].value = in5.array[bar-row-1]; tr_data_x_onerow[6].index = 7; tr_data_x_onerow[6].value = in6.array[bar-row-1]; tr_data_x_onerow[7].index = 8; tr_data_x_onerow[7].value = in7.array[bar-row-1]; tr_data_x_onerow[8].index = 9; tr_data_x_onerow[8].value = in8.array[bar-row-1]; tr_data_x_onerow[9].index = 10;
tr_data_x_onerow[num_features].index = -1; //Each row of properties should be terminated with a -1 according to the readme
tr_data.x[row] = tr_data_x_onerow;
}
... few operation on tr_data
... and this does not work
for (int row = 0; row <tr_data.l; row++)
{
free(tr_data_x_onerow);
}
for (int row = 0; row <tr_data.l; row++)
{
free(tr_data.x[row]);
}
But please, just don't do this. This is C++. Use a vector or some other sane collection.

PIC C18: Reading bits from a byte

I have a very elementary question. However, what ever I tried, I couldn't successfully implement this.
I have a shift register (74LS164) connected to PIC18F2550 with the following hardware configuration:
// Data pin
#define SCLCD_DATA LATBbits.LATB7
#define SCLCD_DATA_TRIS TRISBbits.TRISB7
// Clock pin
#define SCLCD_CLOCK LATBbits.LATB6
#define SCLCD_CLOCK_TRIS TRISBbits.TRISB6
LEDs are connected to the output pins of 74LS164 to view its status. I have an 8-bit variable declared as unsigned char. I want to send the bits of this variable to the shift register. The shift register has internal flip-flops whose outputs are named as Q0-Q7. The first sent bit loads into Q0, when you send a second bit, previous Q0 shifts to Q1 and the newly sent bit comes to Q0, and this goes so on as you send succeeding bits. When the sending is completed, LSB of the variable is supposed to be on the Q0 of the shift register, and MSB will be on Q7.
My code is like this (Language is C18):
void SCLCD_SendSerialBits(unsigned char unRegister)
{
// ucRegister is always passed as 0b10101010 for test
for (i=0; i<8; i++)
{
SCLCD_CLOCK = 0;
SCLCD_DATA = ((ucRegister & 0b10000000) == 0b10000000) ? 1 : 0;
ucRegister = ucRegister << 1;
SCLCD_CLOCK = 1;
}
}
The code above doesn't run as I want to. When I run it, all the LEDs light on, as if I had loaded 0b11111111 into the ucRegister variable.
However, the following one works very well:
void SCLCD_SendSerialBits(void)
{
SCLCD_CLOCK = 0; SCLCD_DATA = 1; SCLCD_CLOCK = 1;
SCLCD_CLOCK = 0; SCLCD_DATA = 0; SCLCD_CLOCK = 1;
SCLCD_CLOCK = 0; SCLCD_DATA = 1; SCLCD_CLOCK = 1;
SCLCD_CLOCK = 0; SCLCD_DATA = 0; SCLCD_CLOCK = 1;
SCLCD_CLOCK = 0; SCLCD_DATA = 1; SCLCD_CLOCK = 1;
SCLCD_CLOCK = 0; SCLCD_DATA = 0; SCLCD_CLOCK = 1;
SCLCD_CLOCK = 0; SCLCD_DATA = 1; SCLCD_CLOCK = 1;
SCLCD_CLOCK = 0; SCLCD_DATA = 0; SCLCD_CLOCK = 1;
}
What is wrong with my code? I thing the error is most likely to be on the line SCLCD_DATA = ((ucRegister & 0b10000000) == 0b10000000) ? 1 : 0;, but no matter how much I look at it, it looks perfectly OK to me. What is wrong with my code?
Any help will be appreciated.
Your code looks like it should work. I would write it like this to be more readable and efficient (assuming your system has a barrel shifter):
for (i=7; i>=0; i--)
{
SCLCD_CLOCK = 0;
SCLCD_DATA = ((ucRegister >> i) & 1);
SCLCD_CLOCK = 1;
}
For systems without a barrel shifter, a variation of your code
unsigned char ucMask = 0x80;
for (i=0; i<8; i++)
{
SCLCD_CLOCK = 0;
SCLCD_DATA = (ucRegister & ucMask) ? 1:0;
ucMask >>= 1;
SCLCD_CLOCK = 1;
}
If my first or second examples work, then it sounds like the compiler is not handling the constant values or compares correctly in your original code.
Might just be a typo, but your parameter is unRegister not ucRegister. Is it possible that ucRegister is a global which is 0b11111111?