AltiVec vec_msum equivalent for float values

AltiVec vec_msum equivalent for float values - c++

Is anybody aware of a method to achieve vec_msum functionality against a vector of float values?
I'm quite new to SIMD, and although I think I'm starting to make sense of it - there are still a few puzzles.
My end goal is to rewrite the function "convolve_altivec" (as found in the accepted answer for this question) such that it accepts input parameters as float values, instead of short's.
That is, the prototype should be
float convolve_altivec(const float *a, const float *b, int n)
I'm trying to match the functionality provided by the original non-optimised function below:
float convolve(const float *a, const float *b, int n)
{
float out = 0.f;
while (n --)
out += (*(a ++)) * (*(b ++));
return out;
}
My initial efforts have seen me trying to port an existing SSE version of this same function to altivec instructions.

For a float version you need vec_madd.
Here's a float version of the previous 16 bit int version and test harness I posted in response to your earlier question:
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <altivec.h>
static float convolve_ref(const float *a, const float *b, int n)
{
float sum = 0.0f;
int i;
for (i = 0; i < n; ++i)
{
sum += a[i] * b[i];
}
return sum;
}
static inline float convolve_altivec(const float *a, const float *b, int n)
{
float sum = 0.0f;
vector float vsum = { 0.0f, 0.0f, 0.0f, 0.0f };
union {
vector float v;
float a[4];
} usum;
vector float *pa = (vector float *)a;
vector float *pb = (vector float *)b;
assert(((unsigned long)a & 15) == 0);
assert(((unsigned long)b & 15) == 0);
while (n >= 4)
{
vsum = vec_madd(*pa, *pb, vsum);
pa++;
pb++;
n -= 4;
}
usum.v = vsum;
sum = usum.a[0] + usum.a[1] + usum.a[2] + usum.a[3];
a = (float *)pa;
b = (float *)pb;
while (n --)
{
sum += (*a++ * *b++);
}
return sum;
}
int main(void)
{
const int n = 1002;
vector float _a[n / 4 + 1];
vector float _b[n / 4 + 1];
float *a = (float *)_a;
float *b = (float *)_b;
float sum_ref, sum_test;
int i;
for (i = 0; i < n; ++i)
{
a[i] = (float)rand();
b[i] = (float)rand();
}
sum_ref = convolve_ref(a, b, n);
sum_test = convolve_altivec(a, b, n);
printf("sum_ref = %g\n", sum_ref);
printf("sum_test = %g\n", sum_test);
printf("%s\n", fabsf((sum_ref - sum_test) / sum_ref) < 1.0e-6 ? "PASS" : "FAIL");
return 0;
}

Related

Recursive function outputs a wrong value

In this code the output is 'r' instead of 'r0'
Instead of doing the operations it outputs me the first 'r' (equals 100) and does not do the process.
I´m trying to program an operation like (x_0 = x + (nt²/(2(x+(n(t-1)²/2(x+(n(t-3)²/2(x + (n(t-4)²...)²)²)²)²)²)²)²)²) in where the process is repeated until the variable 't' is '0'(because each time the operation is done 't' get a '-1').
#include <iostream>
#include "math.h"
using namespace std;
int operation(float r,
float r0,
float recursiva,
float operacion,
float recursivaPrincipal2,
float recursivaPrincipal,
float p,
float n,
long long t,
float q,
float potenciaQ,
float c,
float potenciaC,
float t2,
float division);
float r = 100;
float t = 10000;
float r0;
float recursiva;
float operacion;
float recursivaPrincipal2;
float recursivaPrincipal;
float p;
float n;
float q;
float potenciaQ;
float c;
float potenciaC;
float t2;
float division;
int main() {
r0 = r + operacion;
potenciaQ = pow(10,10);
q = 6 * potenciaQ;
potenciaC = pow(10,2);
c = 5 * potenciaC;
while (t = 10000, t = t - 1, t > 0) {
t2 = t * t;
n = q * t2;
operacion = n / recursivaPrincipal;
recursivaPrincipal2 = recursiva * recursiva;
recursivaPrincipal = 2 * recursivaPrincipal2;
recursiva = r + operacion;
if (t == 0) {
system("pause");
return 0;
}
cout << "Solucion: " << r0 << endl;
}
}
i want to do something like this
I'm so sorry if this code offended you (comments look like it) but I'm not very good, this is my first c++ code (and last I think)

The answer is based on what i get from your question
Please do expand your mathematical expression for t=3 and append an image of it
by far what i got from your expression you need this
float func(int t,int n,int x)
{
if (t==1)
{
return (x + (n/2)*(n/2)) * (x + (n/2)*(n/2));
}
return x + (n*t*t)/(2*func(t-1,n,x)) ;
}
According to the picture you have uploaded this is my code
Don't use 0 for n
#include<iostream>
using namespace std;
double partSolver(int x,int p, int n)
{
if(n==0) return 2*x*x;
double val = x - ( (p*n*n) / partSolver(x,p,n-1) );
return 2*val*val ;
}
double solver(int x,int p,int n)
{
return (n*n * 2) / partSolver(x,p,n-1);
}
int main()
{
cout<<"The Solution is: "<<solver(3,2,1)<<endl;
return 0;
}

Is it possible to use CUDA parallelizing this nested for loop?

I want to speed up this nested for loop, just start learn CUDA, how could I use CUDA to parallel this c++ code ?
#define PI 3.14159265
using namespace std;
int main()
{
int nbint = 2;
int hits = 20;
int nbinp = 2;
float _theta, _phi, _l, _m, _n, _k = 0, delta = 5;
float x[20],y[20],z[20],a[20],t[20];
for (int i = 0; i < hits; ++i)
{
x[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
y[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
z[i] = rand() / (float)(RAND_MAX / 100);
}
for (int i = 0; i < hits; ++i)
{
a[i] = rand() / (float)(RAND_MAX / 100);
}
float maxforall = 1e-6;
float theta0;
float phi0;
for (int i = 0; i < nbint; i++)
{
_theta = (0.5 + i)*delta;
for (int j = 0; j < nbinp; j++)
{
_phi = (0.5 + j)*delta / _theta;
_l = sin(_theta* PI / 180.0)*cos(_phi* PI / 180.0);
_m = sin(_theta* PI / 180.0)*sin(_phi* PI / 180.0);
_n = cos(_theta* PI / 180.0);
for (int k = 0; k < hits; k++)
{
_k = -(_l*x[k] + _m*y[k] + _n*z[k]);
t[k] = a[k] - _k;
}
qsort(t, 0, hits - 1);
float max = t[0];
for (int k = 0; k < hits; k++)
{
if (max < t[k])
max = t[k];
}
if (max > maxforall)
{
maxforall = max;
}
}
}
return 0;
}
I want to put innermost for loop and the sort part(maybe the whole nested loop) into parallel. After sort those array I found the maximum of all arrays. I use maximum to simplify the code. The reason I need sort is that maximum represent
here is a continuous time information(all arrays contain time information). The sort part make those time from lowest to highest. Then I compare the a specific time interval(not a single value). The compare process almost like I choose maximum but with a continuous interval not a single value.

Your 3 nested loops calculate nbint*nbinp*hits values. Since each of those values is independent from each other, all values can be calculated in parallel.
You stated in your comments that you have a commutative and associative "filter condition" which reduces the output to a single scalar value. This can be exploited to avoid sorting and storing the temporary values. Instead, we can calculate the values on-the-fly and then apply a parallel reduction to determine the end result.
This can be done in "raw" CUDA, below I implemented this idea using thrust. The main idea is to run grid_op nbint*nbinp*hits times in parallel. In order to find out the three original "loop indices" from the single scalar index which is passed to grid_op the algorithm from this SO question is used.
thrust::transform_reduce performs the on-the-fly transformation and the subsequent parallel reduction (here thrust::maximum is used as a substitute).
#include <cmath>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tuple.h>
// ### BEGIN utility for demo ####
#include <iostream>
#include <thrust/random.h>
thrust::host_vector<float> random_vector(const size_t N)
{
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
thrust::host_vector<float> temp(N);
for(size_t i = 0; i < N; i++) {
temp[i] = u01(rng);
}
return temp;
}
// ### END utility for demo ####
template <typename... Iterators>
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
{
return thrust::make_zip_iterator(thrust::make_tuple(its...));
}
template <typename ZipIterator>
class grid_op
{
public:
grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2) : zipIt(zipIt), dim1(dim1), dim2(dim2){}
__host__ __device__
float operator()(std::size_t index) const
{
const auto coords = unflatten_3d_index(index, dim1, dim2);
const auto values = zipIt[thrust::get<2>(coords)];
const float delta = 5;
const float _theta = (0.5f + thrust::get<0>(coords))*delta;
const float _phi = (0.5f + thrust::get<1>(coords))*delta / _theta;
const float _l = sin(_theta* M_PI / 180.0)*cos(_phi* M_PI / 180.0);
const float _m = sin(_theta* M_PI / 180.0)*sin(_phi* M_PI / 180.0);
const float _n = cos(_theta* M_PI / 180.0);
const float _k = -(_l*thrust::get<0>(values) + _m*thrust::get<1>(values) + _n*thrust::get<2>(values));
return (thrust::get<3>(values) - _k);
}
private:
__host__ __device__
thrust::tuple<std::size_t, std::size_t, std::size_t>
unflatten_3d_index(std::size_t index, std::size_t dim1, std::size_t dim2) const
{
// taken from https://stackoverflow.com/questions/29142417/4d-position-from-1d-index
std::size_t x = index % dim1;
std::size_t y = ( ( index - x ) / dim1 ) % dim2;
std::size_t z = ( ( index - y * dim1 - x ) / (dim1 * dim2) );
return thrust::make_tuple(x,y,z);
}
ZipIterator zipIt;
std::size_t dim1;
std::size_t dim2;
};
template <typename ZipIterator>
grid_op<ZipIterator> make_grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2)
{
return grid_op<ZipIterator>(zipIt, dim1, dim2);
}
int main()
{
const int nbint = 3;
const int nbinp = 4;
const int hits = 20;
const std::size_t N = nbint * nbinp * hits;
thrust::device_vector<float> d_x = random_vector(hits);
thrust::device_vector<float> d_y = random_vector(hits);
thrust::device_vector<float> d_z = random_vector(hits);
thrust::device_vector<float> d_a = random_vector(hits);
auto zipIt = zip(d_x.begin(), d_y.begin(), d_z.begin(), d_a.begin());
auto countingIt = thrust::counting_iterator<std::size_t>(0);
auto unary_op = make_grid_op(zipIt, nbint, nbinp);
auto binary_op = thrust::maximum<float>();
const float init = 0;
float max = thrust::transform_reduce(
countingIt, countingIt+N,
unary_op,
init,
binary_op
);
std::cout << "max = " << max << std::endl;
}

Fast implementation of covariance of two 8-bit arrays

I need to compare a big amount of similar images of small size (up to 200x200).
So I try to implement SSIM (Structural similarity see https://en.wikipedia.org/wiki/Structural_similarity ) algorithm.
SSIM requires calculation of covariance of two 8-bit gray images.
A trivial implementation look like:
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
float sum = 0;
for(size_t i = 0; i < size; ++i)
sum += (x[i] - averageX) * (y[i] - averageY);
return sum / size;
}
But it has poor performance.
So I hope to improve it with using SIMD or CUDA (I heard that it can be done).
Unfortunately I have no experience to do this.
How it will look? And where I have to go?

I have another nice solution!
At first I want to mention some mathematical formulas:
averageX = Sum(x[i])/size;
averageY = Sum(y[i])/size;
And therefore:
Sum((x[i] - averageX)*(y[i] - averageY))/size =
Sum(x[i]*y[i])/size - Sum(x[i]*averageY)/size -
Sum(averageX*y[i])/size + Sum(averageX*averageY)/size =
Sum(x[i]*y[i])/size - averageY*Sum(x[i])/size -
averageX*Sum(y[i])/size + averageX*averageY*Sum(1)/size =
Sum(x[i]*y[i])/size - averageY*averageX -
averageX*averageY + averageX*averageY =
Sum(x[i]*y[i])/size - averageY*averageX;
It allows to modify our algorithm:
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
uint32_t sum = 0; // If images will have size greater then 256x256 than you have to use uint64_t.
for(size_t i = 0; i < size; ++i)
sum += x[i]*y[i];
return sum / size - averageY*averageX;
}
And only after that we can use SIMD (I used SSE2):
#include <emmintrin.h>
inline __m128i SigmaXY(__m128i x, __m128i y)
{
__m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(x, _mm_setzero_si128()), _mm_unpacklo_epi8(y, _mm_setzero_si128()));
__m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(y, _mm_setzero_si128()), _mm_unpackhi_epi8(y, _mm_setzero_si128()));
return _mm_add_epi32(lo, hi);
}
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
uint32_t sum = 0;
size_t i = 0, alignedSize = size/16*16;
if(size >= 16)
{
__m128i sums = _mm_setzero_si128();
for(; i < alignedSize; i += 16)
{
__m128i _x = _mm_loadu_si128((__m128i*)(x + i));
__m128i _y = _mm_loadu_si128((__m128i*)(y + i));
sums = _mm_add_epi32(sums, SigmaXY(_x, _y));
}
uint32_t _sums[4];
_mm_storeu_si128(_sums, sums);
sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
}
for(; i < size; ++i)
sum += x[i]*y[i];
return sum / size - averageY*averageX;
}

There is a SIMD implementation of the algorithm (I used SSE4.1):
#include <smmintrin.h>
template <int shift> inline __m128 SigmaXY(const __m128i & x, const __m128i & y, __m128 & averageX, __m128 & averageY)
{
__m128 _x = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(x, shift)));
__m128 _y = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(_mm_srli_si128(y, shift)));
return _mm_mul_ps(_mm_sub_ps(_x, averageX), _mm_sub_ps(_y, averageY))
}
float SigmaXY(const uint8_t * x, const uint8_t * y, size_t size, float averageX, float averageY)
{
float sum = 0;
size_t i = 0, alignedSize = size/16*16;
if(size >= 16)
{
__m128 sums = _mm_setzero_ps();
__m128 avgX = _mm_set1_ps(averageX);
__m128 avgY = _mm_set1_ps(averageY);
for(; i < alignedSize; i += 16)
{
__m128i _x = _mm_loadu_si128((__m128i*)(x + i));
__m128i _y = _mm_loadu_si128((__m128i*)(y + i));
sums = _mm_add_ps(sums, SigmaXY<0>(_x, _y, avgX, avgY);
sums = _mm_add_ps(sums, SigmaXY<4>(_x, _y, avgX, avgY);
sums = _mm_add_ps(sums, SigmaXY<8>(_x, _y, avgX, avgY);
sums = _mm_add_ps(sums, SigmaXY<12>(_x, _y, avgX, avgY);
}
float _sums[4];
_mm_storeu_ps(_sums, sums);
sum = _sums[0] + _sums[1] + _sums[2] + _sums[3];
}
for(; i < size; ++i)
sum += (x[i] - averageX) * (y[i] - averageY);
return sum / size;
}
I hope that it will useful for you.

opencv: Rigid Transformation between two 3D point clouds

I have two 3D point clouds, and I'd like to use opencv to find the rigid transformation matrix (translation, rotation, constant scaling among all 3 axes).
I've found an estimateRigidTransformation function, but it's only for 2D points apparently
In addition, I've found estimateAffine3D, but it doesn't seem to support rigid transformation mode.
Do I need to just write my own rigid transformation function?

I did not find the required functionality in OpenCV so I have written my own implementation. Based on ideas from OpenSFM.
cv::Vec3d
CalculateMean(const cv::Mat_<cv::Vec3d> &points)
{
cv::Mat_<cv::Vec3d> result;
cv::reduce(points, result, 0, CV_REDUCE_AVG);
return result(0, 0);
}
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3d> &points1, const cv::Mat_<cv::Vec3d> points2)
{
/* Calculate centroids. */
cv::Vec3d t1 = -CalculateMean(points1);
cv::Vec3d t2 = -CalculateMean(points2);
cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
T1(0, 3) = t1[0];
T1(1, 3) = t1[1];
T1(2, 3) = t1[2];
cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
T2(0, 3) = -t2[0];
T2(1, 3) = -t2[1];
T2(2, 3) = -t2[2];
/* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
* which is used for scale calculation.
*/
cv::Mat_<double> C(3, 3, 0.0);
double p1Rms = 0, p2Rms = 0;
for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
cv::Vec3d p1 = points1(ptIdx, 0) + t1;
cv::Vec3d p2 = points2(ptIdx, 0) + t2;
p1Rms += p1.dot(p1);
p2Rms += p2.dot(p2);
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
C(i, j) += p2[i] * p1[j];
}
}
}
cv::Mat_<double> u, s, vh;
cv::SVD::compute(C, s, u, vh);
cv::Mat_<double> R = u * vh;
if (cv::determinant(R) < 0) {
R -= u.col(2) * (vh.row(2) * 2.0);
}
double scale = sqrt(p2Rms / p1Rms);
R *= scale;
cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
R.copyTo(M.colRange(0, 3).rowRange(0, 3));
cv::Mat_<double> result = T2 * M * T1;
result /= result(3, 3);
return result.rowRange(0, 3);
}

I've found PCL to be a nice adjunct to OpenCV. Take a look at their Iterative Closest Point (ICP) example. The provided example registers the two point clouds and then displays the rigid transformation.

Here's my rmsd code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
typedef struct
{
float m[4][4];
} MATRIX;
#define vdiff2(a,b) ( ((a)[0]-(b)[0]) * ((a)[0]-(b)[0]) + \
((a)[1]-(b)[1]) * ((a)[1]-(b)[1]) + \
((a)[2]-(b)[2]) * ((a)[2]-(b)[2]) )
static double alignedrmsd(float *v1, float *v2, int N);
static void centroid(float *ret, float *v, int N);
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx);
static void crossproduct(float *ans, float *pt1, float *pt2);
static void mtx_root(MATRIX *mtx);
static int almostequal(MATRIX *a, MATRIX *b);
static void mulpt(MATRIX *mtx, float *pt);
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y);
static void mtx_identity(MATRIX *mtx);
static void mtx_trans(MATRIX *mtx, float x, float y, float z);
static int mtx_invert(float *mtx, int N);
static float absmaxv(float *v, int N);
/*
calculate rmsd between two structures
Params: v1 - first set of points
v2 - second set of points
N - number of points
mtx - return for transfrom matrix used to align structures
Returns: rmsd score
Notes: mtx can be null. Transform will be rigid. Inputs must
be previously aligned for sequence alignment
*/
double rmsd(float *v1, float *v2, int N, float *mtx)
{
float cent1[3];
float cent2[3];
MATRIX tmtx;
MATRIX tempmtx;
MATRIX move1;
MATRIX move2;
int i;
double answer;
float *temp1 = 0;
float *temp2 = 0;
int err;
assert(N > 3);
temp1 = malloc(N * 3 * sizeof(float));
temp2 = malloc(N * 3 * sizeof(float));
if(!temp1 || !temp2)
goto error_exit;
centroid(cent1, v1, N);
centroid(cent2, v2, N);
for(i=0;i<N;i++)
{
temp1[i*3+0] = v1[i*3+0] - cent1[0];
temp1[i*3+1] = v1[i*3+1] - cent1[1];
temp1[i*3+2] = v1[i*3+2] - cent1[2];
temp2[i*3+0] = v2[i*3+0] - cent2[0];
temp2[i*3+1] = v2[i*3+1] - cent2[1];
temp2[i*3+2] = v2[i*3+2] - cent2[2];
}
err = getalignmtx(temp1, temp2, N, &tmtx);
if(err == -1)
goto error_exit;
mtx_trans(&move1, -cent2[0], -cent2[1], -cent2[2]);
mtx_mul(&tempmtx, &move1, &tmtx);
mtx_trans(&move2, cent1[0], cent1[1], cent1[2]);
mtx_mul(&tmtx, &tempmtx, &move2);
memcpy(temp2, v2, N * sizeof(float) * 3);
for(i=0;i<N;i++)
mulpt(&tmtx, temp2 + i * 3);
answer = alignedrmsd(v1, temp2, N);
free(temp1);
free(temp2);
if(mtx)
memcpy(mtx, &tmtx.m, 16 * sizeof(float));
return answer;
error_exit:
free(temp1);
free(temp2);
if(mtx)
{
for(i=0;i<16;i++)
mtx[i] = 0;
}
return sqrt(-1.0);
}
/*
calculate rmsd between two aligned structures (trivial)
Params: v1 - first structure
v2 - second structure
N - number of points
Returns: rmsd
*/
static double alignedrmsd(float *v1, float *v2, int N)
{
double answer =0;
int i;
for(i=0;i<N;i++)
answer += vdiff2(v1 + i *3, v2 + i * 3);
return sqrt(answer/N);
}
/*
compute the centroid
*/
static void centroid(float *ret, float *v, int N)
{
int i;
ret[0] = 0;
ret[1] = 0;
ret[2] = 0;
for(i=0;i<N;i++)
{
ret[0] += v[i*3+0];
ret[1] += v[i*3+1];
ret[2] += v[i*3+2];
}
ret[0] /= N;
ret[1] /= N;
ret[2] /= N;
}
/*
get the matrix needed to align two structures
Params: v1 - reference structure
v2 - structure to align
N - number of points
mtx - return for rigid body alignment matrix
Notes: only calculates rotation part of matrix.
assumes input has been aligned to centroids
*/
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx)
{
MATRIX A = { {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,1}} };
MATRIX At;
MATRIX Ainv;
MATRIX temp;
float tv[3];
float tw[3];
float tv2[3];
float tw2[3];
int k, i, j;
int flag = 0;
float correction;
correction = absmaxv(v1, N * 3) * absmaxv(v2, N * 3);
for(k=0;k<N;k++)
for(i=0;i<3;i++)
for(j=0;j<3;j++)
A.m[i][j] += (v1[k*3+i] * v2[k*3+j])/correction;
while(flag < 3)
{
for(i=0;i<4;i++)
for(j=0;j<4;j++)
At.m[i][j] = A.m[j][i];
memcpy(&Ainv, &A, sizeof(MATRIX));
/* this will happen if all points are in a plane */
if( mtx_invert((float *) &Ainv, 4) == -1)
{
if(flag == 0)
{
crossproduct(tv, v1, v1+3);
crossproduct(tw, v2, v2+3);
}
else
{
crossproduct(tv2, tv, v1);
crossproduct(tw2, tw, v2);
memcpy(tv, tv2, 3 * sizeof(float));
memcpy(tw, tw2, 3 * sizeof(float));
}
for(i=0;i<3;i++)
for(j=0;j<3;j++)
A.m[i][j] += tv[i] * tw[j];
flag++;
}
else
flag = 5;
}
if(flag != 5)
return -1;
mtx_mul(&temp, &At, &A);
mtx_root(&temp);
mtx_mul(mtx, &temp, &Ainv);
return 0;
}
/*
get the crossproduct of two vectors.
Params: ans - return pinter for answer.
pt1 - first vector
pt2 - second vector.
Notes: crossproduct is at right angles to the two vectors.
*/
static void crossproduct(float *ans, float *pt1, float *pt2)
{
ans[0] = pt1[1] * pt2[2] - pt1[2] * pt2[1];
ans[1] = pt1[0] * pt2[2] - pt1[2] * pt2[0];
ans[2] = pt1[0] * pt2[1] - pt1[1] * pt2[0];
}
/*
Denman-Beavers square root iteration
*/
static void mtx_root(MATRIX *mtx)
{
MATRIX Y = *mtx;
MATRIX Z;
MATRIX Y1;
MATRIX Z1;
MATRIX invY;
MATRIX invZ;
MATRIX Y2;
int iter = 0;
int i, ii;
mtx_identity(&Z);
do
{
invY = Y;
invZ = Z;
if( mtx_invert((float *) &invY, 4) == -1)
return;
if( mtx_invert((float *) &invZ, 4) == -1)
return;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
Y1.m[i][ii] = 0.5 * (Y.m[i][ii] + invZ.m[i][ii]);
Z1.m[i][ii] = 0.5 * (Z.m[i][ii] + invY.m[i][ii]);
}
Y = Y1;
Z = Z1;
mtx_mul(&Y2, &Y, &Y);
}
while(!almostequal(&Y2, mtx) && iter++ < 20 );
*mtx = Y;
}
/*
Check two matrices for near-enough equality
Params: a - first matrix
b - second matrix
Returns: 1 if almost equal, else 0, epsilon 0.0001f.
*/
static int almostequal(MATRIX *a, MATRIX *b)
{
int i, ii;
float epsilon = 0.001f;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
if(fabs(a->m[i][ii] - b->m[i][ii]) > epsilon)
return 0;
return 1;
}
/*
multiply a point by a matrix.
Params: mtx - matrix
pt - the point (transformed)
*/
static void mulpt(MATRIX *mtx, float *pt)
{
float ans[4] = {0};
int i;
int ii;
for(i=0;i<4;i++)
{
for(ii=0;ii<3;ii++)
{
ans[i] += pt[ii] * mtx->m[ii][i];
}
ans[i] += mtx->m[3][i];
}
pt[0] = ans[0];
pt[1] = ans[1];
pt[2] = ans[2];
}
/*
multiply two matrices.
Params: ans - return pointer for answer.
x - first matrix
y - second matrix.
Notes: ans may not be equal to x or y.
*/
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y)
{
int i;
int ii;
int iii;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
ans->m[i][ii] = 0;
for(iii=0;iii<4;iii++)
ans->m[i][ii] += x->m[i][iii] * y->m[iii][ii];
}
}
/*
create an identity matrix.
Params: mtx - return pointer.
*/
static void mtx_identity(MATRIX *mtx)
{
int i;
int ii;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
if(i==ii)
mtx->m[i][ii] = 1.0f;
else
mtx->m[i][ii] = 0;
}
}
/*
create a translation matrix.
Params: mtx - return pointer for matrix.
x - x translation.
y - y translation.
z - z translation
*/
static void mtx_trans(MATRIX *mtx, float x, float y, float z)
{
mtx->m[0][0] = 1;
mtx->m[0][1] = 0;
mtx->m[0][2] = 0;
mtx->m[0][3] = 0;
mtx->m[1][0] = 0;
mtx->m[1][1] = 1;
mtx->m[1][2] = 0;
mtx->m[1][3] = 0;
mtx->m[2][0] = 0;
mtx->m[2][1] = 0;
mtx->m[2][2] = 1;
mtx->m[2][3] = 0;
mtx->m[3][0] = x;
mtx->m[3][1] = y;
mtx->m[3][2] = z;
mtx->m[3][3] = 1;
}
/*
matrix invert routine
Params: mtx - the matrix in raw format, in/out
N - width and height
Returns: 0 on success, -1 on fail
*/
static int mtx_invert(float *mtx, int N)
{
int indxc[100]; /* these 100s are the only restriction on matrix size */
int indxr[100];
int ipiv[100];
int i, j, k;
int irow, icol;
double big;
double pinv;
int l, ll;
double dum;
double temp;
assert(N <= 100);
for(i=0;i<N;i++)
ipiv[i] = 0;
for(i=0;i<N;i++)
{
big = 0.0;
/* find biggest element */
for(j=0;j<N;j++)
if(ipiv[j] != 1)
for(k=0;k<N;k++)
if(ipiv[k] == 0)
if(fabs(mtx[j*N+k]) >= big)
{
big = fabs(mtx[j*N+k]);
irow = j;
icol = k;
}
ipiv[icol]=1;
if(irow != icol)
for(l=0;l<N;l++)
{
temp = mtx[irow * N + l];
mtx[irow * N + l] = mtx[icol * N + l];
mtx[icol * N + l] = temp;
}
indxr[i] = irow;
indxc[i] = icol;
/* if biggest element is zero matrix is singular, bail */
if(mtx[icol* N + icol] == 0)
goto error_exit;
pinv = 1.0/mtx[icol * N + icol];
mtx[icol * N + icol] = 1.0;
for(l=0;l<N;l++)
mtx[icol * N + l] *= pinv;
for(ll=0;ll<N;ll++)
if(ll != icol)
{
dum = mtx[ll * N + icol];
mtx[ll * N + icol] = 0.0;
for(l=0;l<N;l++)
mtx[ll * N + l] -= mtx[icol * N + l]*dum;
}
}
/* unscramble matrix */
for (l=N-1;l>=0;l--)
{
if (indxr[l] != indxc[l])
for (k=0;k<N;k++)
{
temp = mtx[k * N + indxr[l]];
mtx[k * N + indxr[l]] = mtx[k * N + indxc[l]];
mtx[k * N + indxc[l]] = temp;
}
}
return 0;
error_exit:
return -1;
}
/*
get the asolute maximum of an array
*/
static float absmaxv(float *v, int N)
{
float answer;
int i;
for(i=0;i<N;i++)
if(answer < fabs(v[i]))
answer = fabs(v[i]);
return answer;
}
#include <stdio.h>
/*
debug utlitiy
*/
static void printmtx(FILE *fp, MATRIX *mtx)
{
int i, ii;
for(i=0;i<4;i++)
{
for(ii=0;ii<4;ii++)
fprintf(fp, "%f, ", mtx->m[i][ii]);
fprintf(fp, "\n");
}
}
int rmsdmain(void)
{
float one[4*3] = {0,0,0, 1,0,0, 2,1,0, 0,3,1};
float two[4*3] = {0,0,0, 0,1,0, 1,2,0, 3,0,1};
MATRIX mtx;
double diff;
int i;
diff = rmsd(one, two, 4, (float *) &mtx.m);
printf("%f\n", diff);
printmtx(stdout, &mtx);
for(i=0;i<4;i++)
{
mulpt(&mtx, two + i * 3);
printf("%f %f %f\n", two[i*3], two[i*3+1], two[i*3+2]);
}
return 0;
}

I took #vagran's implementation and added RANSAC on top of it, since estimateRigidTransform2d does it and it was helpful for me since my data is noisy. (Note: This code doesn't have constant scaling along all 3 axes; you can add it back in easily by comparing to vargran's).
cv::Vec3f CalculateMean(const cv::Mat_<cv::Vec3f> &points)
{
if(points.size().height == 0){
return 0;
}
assert(points.size().width == 1);
double mx = 0.0;
double my = 0.0;
double mz = 0.0;
int n_points = points.size().height;
for(int i = 0; i < n_points; i++){
double x = double(points(i)[0]);
double y = double(points(i)[1]);
double z = double(points(i)[2]);
mx += x;
my += y;
mz += z;
}
return cv::Vec3f(mx/n_points, my/n_points, mz/n_points);
}
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> points2)
{
/* Calculate centroids. */
cv::Vec3f t1 = CalculateMean(points1);
cv::Vec3f t2 = CalculateMean(points2);
cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
T1(0, 3) = double(-t1[0]);
T1(1, 3) = double(-t1[1]);
T1(2, 3) = double(-t1[2]);
cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
T2(0, 3) = double(t2[0]);
T2(1, 3) = double(t2[1]);
T2(2, 3) = double(t2[2]);
/* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
* which is used for scale calculation.
*/
cv::Mat_<double> C(3, 3, 0.0);
for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
cv::Vec3f p1 = points1(ptIdx) - t1;
cv::Vec3f p2 = points2(ptIdx) - t2;
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
C(i, j) += double(p2[i] * p1[j]);
}
}
}
cv::Mat_<double> u, s, vt;
cv::SVD::compute(C, s, u, vt);
cv::Mat_<double> R = u * vt;
if (cv::determinant(R) < 0) {
R -= u.col(2) * (vt.row(2) * 2.0);
}
cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
R.copyTo(M.colRange(0, 3).rowRange(0, 3));
cv::Mat_<double> result = T2 * M * T1;
result /= result(3, 3);
return result;
}
cv::Mat_<double> RANSACFindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> &points2)
{
cv::Mat points1Homo;
cv::convertPointsToHomogeneous(points1, points1Homo);
int iterations = 100;
int min_n_points = 3;
int n_points = points1.size().height;
std::vector<int> range(n_points);
cv::Mat_<double> best;
int best_inliers = -1;
// inlier points should be projected within this many units
float threshold = .02;
std::iota(range.begin(), range.end(), 0);
auto gen = std::mt19937{std::random_device{}()};
for(int i = 0; i < iterations; i++) {
std::shuffle(range.begin(), range.end(), gen);
cv::Mat_<cv::Vec3f> points1subset(min_n_points, 1, cv::Vec3f(0,0,0));
cv::Mat_<cv::Vec3f> points2subset(min_n_points, 1, cv::Vec3f(0,0,0));
for(int j = 0; j < min_n_points; j++) {
points1subset(j) = points1(range[j]);
points2subset(j) = points2(range[j]);
}
cv::Mat_<float> rigidT = FindRigidTransform(points1subset, points2subset);
cv::Mat_<float> rigidT_float = cv::Mat::eye(4, 4, CV_32F);
rigidT.convertTo(rigidT_float, CV_32F);
std::vector<int> inliers;
for(int j = 0; j < n_points; j++) {
cv::Mat_<float> t1_3d = rigidT_float * cv::Mat_<float>(points1Homo.at<cv::Vec4f>(j));
if(t1_3d(3) == 0) {
continue; // Avoid 0 division
}
float dx = (t1_3d(0)/t1_3d(3) - points2(j)[0]);
float dy = (t1_3d(1)/t1_3d(3) - points2(j)[1]);
float dz = (t1_3d(2)/t1_3d(3) - points2(j)[2]);
float square_dist = dx * dx + dy * dy + dz * dz;
if(square_dist < threshold * threshold){
inliers.push_back(j);
}
}
int n_inliers = inliers.size();
if(n_inliers > best_inliers) {
best_inliers = n_inliers;
best = rigidT;
}
}
return best;
}

#vagran Thanks for the code! Seems to work very well.
I do have a little terminology suggestion though. Since you are estimating and applying a scale during the transformation, it is a 7-parameter transformation, or Helmert / similarity transformation. And in a rigid transformation, no scaling is applied because all Euclidiean distances need to be reserved.
I would've added this as comment, but don't have enough points.. D: sorry for that.
rigid transformation: https://en.wikipedia.org/wiki/Rigid_transformation
Helmert transformation: https://www.researchgate.net/publication/322841143_Parameter_estimation_in_3D_affine_and_similarity_transformation_implementation_of_variance_component_estimation

how can I replicate accelerate (apple DSP library) functions in windows?

I'm going to make this as succinct as I can:
I have a project that I am needing to port to windows due to some very specific hardware constraints. There's a little utility class which performs vector distance calculations using Accelerate, the Apple DSP library. I need to rewrite this so that it functions without said library, but have been unable to find a suitable replacement. What is my best course of action?
#include <Accelerate/Accelerate.h>
inline float distBetween(float *x, float *y, unsigned int count) {
float *tmp = (float*)malloc(count * sizeof(float));
// float tmp[count];
//t = y - x
vDSP_vsub(x, 1, y, 1, tmp, 1, count);
//t.squared
vDSP_vsq(tmp, 1, tmp, 1, count);
//t.sum
float sum;
vDSP_sve(tmp, 1, &sum, count);
delete tmp;
return sqrt(sum);
}
inline float cosineDistance(float *x, float *y, unsigned int count) {
float dotProd, magX, magY;
float *tmp = (float*)malloc(count * sizeof(float));
vDSP_dotpr(x, 1, y, 1, &dotProd, count);
vDSP_vsq(x, 1, tmp, 1, count);
vDSP_sve(tmp, 1, &magX, count);
magX = sqrt(magX);
vDSP_vsq(y, 1, tmp, 1, count);
vDSP_sve(tmp, 1, &magY, count);
magY = sqrt(magY);
delete tmp;
return 1.0 - (dotProd / (magX * magY));
}

Vector functions are usually implemented through a specific assembly language instructions. This implementation is very slow. Perhaps you need a library that uses the SSE instructions.
In your code, all the arguments stride_x, stride_y, stride_res equal to 1, so I recommend you remove them from the functions arguments. Сode should be faster.
//t = y - x
float
vDSP_vsub(float *x, int stride_x, float *y, int stride_y, float *res, int stride_res, int count)
{
while(count > 0)
{
// may be *x - *y ?
*res = *y - *x;
res += stride_res;
x += stride_x;
y += stride_y;
count--;
}
}
//t.squared
float
vDSP_vsq(float *x, int stride_x, float *res, int stride_res, int count)
{
while(count > 0)
{
*res += (*x) * (*x);
x += stride_x;
res += stride_res;
count--;
}
}
//t.sum
float
vDSP_sve(float *x, int stride_x, float *res, int count)
{
*res = 0.0;
while(count > 0)
{
*res += *x;
x += stride_x;
count--;
}
}
float
vDSP_dotpr(float *x, int stride_x, float *y, int stride_y, float *res, int count)
{
*res = 0.0;
while(count > 0)
{
*res += (*x) * (*y);
x += stride_x;
y += stride_y;
count--;
}
}

Have a look at Intel's IPP libraries.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

AltiVec vec_msum equivalent for float values - c++

Related

Recursive function outputs a wrong value

Is it possible to use CUDA parallelizing this nested for loop?

Fast implementation of covariance of two 8-bit arrays

opencv: Rigid Transformation between two 3D point clouds

how can I replicate accelerate (apple DSP library) functions in windows?

Categories

Resources