how can I replicate accelerate (apple DSP library) functions in windows? - c++

I'm going to make this as succinct as I can:
I have a project that I need to port to Windows due to some very specific hardware constraints. There's a little utility class which performs vector distance calculations using Accelerate, the Apple DSP library. I need to rewrite this so that it works without that library, but I have been unable to find a suitable replacement. What is my best course of action?
#include <Accelerate/Accelerate.h>
// Euclidean distance between two float vectors of `count` elements,
// computed with Accelerate's vDSP routines:
//   tmp = y - x;  tmp = tmp * tmp (element-wise);  sum = sum of tmp.
// Returns sqrt(sum).
inline float distBetween(float *x, float *y, unsigned int count) {
    // Scratch buffer holding the per-element differences, then their squares.
    float *tmp = (float*)malloc(count * sizeof(float));
    //t = y - x
    vDSP_vsub(x, 1, y, 1, tmp, 1, count);
    vDSP_vsq(tmp, 1, tmp, 1, count);
    float sum = 0.0f;
    vDSP_sve(tmp, 1, &sum, count);
    // The buffer came from malloc(), so it must be released with free(),
    // not delete.
    free(tmp);
    return sqrt(sum);
}
// Cosine distance between two float vectors of `count` elements:
//   1 - dot(x, y) / (|x| * |y|)
// computed with Accelerate's vDSP routines.
// NOTE(review): a zero-length vector makes the denominator 0; callers
// presumably never pass one -- confirm.
inline float cosineDistance(float *x, float *y, unsigned int count) {
    float dotProd, magX, magY;
    // Scratch buffer for the element-wise squares of x, then of y.
    float *tmp = (float*)malloc(count * sizeof(float));
    vDSP_dotpr(x, 1, y, 1, &dotProd, count);
    vDSP_vsq(x, 1, tmp, 1, count);
    vDSP_sve(tmp, 1, &magX, count);
    magX = sqrt(magX);
    vDSP_vsq(y, 1, tmp, 1, count);
    vDSP_sve(tmp, 1, &magY, count);
    magY = sqrt(magY);
    // The buffer came from malloc(), so it must be released with free(),
    // not delete.
    free(tmp);
    return 1.0 - (dotProd / (magX * magY));
}

Vector functions are usually implemented with dedicated SIMD assembly instructions. The plain implementation below is very slow by comparison; you may want a library that uses the SSE instructions instead.
In your code, all of the stride arguments (stride_x, stride_y, stride_res) are equal to 1, so I recommend removing them from the function arguments. The code should then be faster.
//t = y - x
/*
 * Portable stand-in for Accelerate's vDSP_vsub.
 * For each of `count` elements: res[i] = y[i] - x[i], where each vector is
 * walked with its own stride. Note the operand order: the FIRST vector is
 * subtracted from the SECOND.
 */
void vDSP_vsub(float *x, int stride_x, float *y, int stride_y, float *res, int stride_res, int count)
{
    while (count > 0)
    {
        *res = *y - *x;
        res += stride_res;
        x += stride_x;
        y += stride_y;
        count--;    /* without this the loop never terminates */
    }
}
/*
 * Portable stand-in for Accelerate's vDSP_vsq.
 * For each of `count` elements: res[i] = x[i] * x[i].
 * The result is stored, not accumulated, so it is safe to call in place
 * (res == x) and with an uninitialized output buffer.
 */
void vDSP_vsq(float *x, int stride_x, float *res, int stride_res, int count)
{
    while (count > 0)
    {
        *res = (*x) * (*x);
        x += stride_x;
        res += stride_res;
        count--;
    }
}
/*
 * Portable stand-in for Accelerate's vDSP_sve.
 * Stores in *res the sum of `count` elements of x, walked with stride_x.
 * *res is 0 when count is 0.
 */
void vDSP_sve(float *x, int stride_x, float *res, int count)
{
    *res = 0.0;
    while (count > 0)
    {
        *res += *x;
        x += stride_x;
        count--;
    }
}
/*
 * Portable stand-in for Accelerate's vDSP_dotpr.
 * Stores in *res the dot product of `count` elements of x and y, each
 * walked with its own stride. *res is 0 when count is 0.
 */
void vDSP_dotpr(float *x, int stride_x, float *y, int stride_y, float *res, int count)
{
    *res = 0.0;
    while (count > 0)
    {
        *res += (*x) * (*y);
        x += stride_x;
        y += stride_y;
        count--;
    }
}

Have a look at Intel's IPP libraries.


How to call ** var in C/C++ 10.5 chapter numerical recipes

I have a problem with calling this function:
void powell(float p[], float **xi, int n,
float ftol, int *iter, float *fret,
float (*func)(float []))
I don't know what argument I have to pass for **xi in order to run my code.
Whole function below:
void powell(float p[], float** xi, int n, float ftol, int* iter, float* fret, float (*func)(float[]))
void linmin(float p[], float xi[], int n, float* fret, float (*func)(float[]));
int i, ibig, j;
float del, fp, fptt, t, *pt, *ptt, *xit;
pt = vector(1, n);
ptt = vector(1, n);
xit = vector(1, n);
*fret = (*func)(p);
for (j = 1; j <= n; j++)
pt[j] = p[j];
for (*iter = 1;; ++(*iter)) {
fp = (*fret);
ibig = 0;
del = 0.0;
for (i = 1; i <= n; i++) {
for (j = 1; j <= n; j++)
xit[j] = xi[j][i];
fptt = (*fret);
linmin(p, xit, n, fret, func);
if (fptt - (*fret) > del) {
del = fptt - (*fret);
ibig = i;
if (2.0 * (fp - (*fret)) <= ftol * (fabs(fp) + fabs(*fret)) + TINY) {
free_vector(xit, 1, n);
free_vector(ptt, 1, n);
free_vector(pt, 1, n);
if (*iter == ITMAX)
nrerror("powell exceeding maximum iterations.");
for (j = 1; j <= n; j++) {
ptt[j] = 2.0 * p[j] - pt[j];
xit[j] = p[j] - pt[j];
pt[j] = p[j];
fptt = (*func)(ptt);
if (fptt < fp) {
t = 2.0 * (fp - 2.0 * (*fret) + fptt) * SQR(fp - (*fret) - del) - del * SQR(fp - fptt);
if (t < 0.0) {
linmin(p, xit, n, fret, func);
for (j = 1; j <= n; j++) {
xi[j][ibig] = xi[j][n];
xi[j][n] = xit[j];
Thanks in advance.
A double pointer means that the function wants the address of a pointer.
// Allocates an array of 42 ints and stores its address in the caller's
// pointer (passed by address). The caller owns the array and must release
// it with delete[].
void my_function(int **p_pointer)
{
    *p_pointer = new int[42];
}
int main(void)
int * pointer = nullptr;
return 0;
In C++, the double pointer can be avoided by using reference:
// Allocates an array of 256 ints and stores its address in the caller's
// pointer, which is passed by reference. The caller owns the array and
// must release it with delete[].
void another_function(int *& pointer)
{
    pointer = new int [256];
}
int main(void)
int p = nullptr;
return 0;
One of the primary concerns with pointers is that they can point to anywhere, a defined location or not. Testing a pointer for validity is complex because it depends on the range (or ranges) that are valid for the current platform. With references, the reference is valid, by definition, so no validity checks need to be performed.

Recursive function outputs a wrong value

In this code the output is 'r' instead of 'r0'.
Instead of performing the operations, it outputs the initial 'r' (equal to 100) and does not run the process.
I'm trying to program an operation like (x_0 = x + (nt²/(2(x+(n(t-1)²/2(x+(n(t-3)²/2(x + (n(t-4)²...)²)²)²)²)²)²)²)²), where the process is repeated until the variable 't' reaches '0' (because each time the operation is performed, 't' is decreased by 1).
#include <iostream>
#include "math.h"
using namespace std;
// Forward declaration of operation(). No definition of it and no call to it
// appear in the visible listing; main() works on the global variables
// instead.
// NOTE(review): `t` is declared `long long` here, while the global `t`
// below is a float.
int operation(float r,
float r0,
float recursiva,
float operacion,
float recursivaPrincipal2,
float recursivaPrincipal,
float p,
float n,
long long t,
float q,
float potenciaQ,
float c,
float potenciaC,
float t2,
float division);
// File-scope state used by main(). Only r and t have explicit initializers;
// every other variable here starts at 0 (static storage duration).
float r = 100;
float t = 10000;
float r0;
float recursiva;
float operacion;
float recursivaPrincipal2;
float recursivaPrincipal;
float p;
float n;
float q;
float potenciaQ;
float c;
float potenciaC;
float t2;
float division;
// Entry point of the question's program.
// NOTE(review): the closing braces are missing from this listing, so where
// the loop and the `if` end cannot be determined from here.
int main() {
// r0 is computed once, before the loop runs. `operacion` is still the
// zero-initialized global at this point, so r0 equals r (100). Nothing
// below assigns r0 again.
r0 = r + operacion;
potenciaQ = pow(10,10);
q = 6 * potenciaQ;
potenciaC = pow(10,2);
c = 5 * potenciaC;
// The condition is a comma expression: every time it is evaluated, t is
// reset to 10000 and then decremented to 9999, so `t > 0` is always true.
while (t = 10000, t = t - 1, t > 0) {
t2 = t * t;
n = q * t2;
// On the first pass recursivaPrincipal is still 0, so this divides by zero.
operacion = n / recursivaPrincipal;
recursivaPrincipal2 = recursiva * recursiva;
recursivaPrincipal = 2 * recursivaPrincipal2;
recursiva = r + operacion;
// t is 9999 whenever the loop body runs (see the loop condition), so this
// test is never true.
if (t == 0) {
return 0;
// Prints r0, which was fixed before the loop; the values computed in the
// loop (recursiva, operacion) are never printed.
cout << "Solucion: " << r0 << endl;
i want to do something like this
I'm so sorry if this code offended you (comments look like it) but I'm not very good, this is my first c++ code (and last I think)
This answer is based on what I understood from your question.
Please expand your mathematical expression for t=3 and attach an image of it.
From what I understand of your expression so far, you need this:
/*
 * Recursive evaluation of the nested expression, as the answer's author
 * understood it:
 *   func(1) = (x + (n/2)*(n/2))^2
 *   func(t) = x + n*t*t / (2 * func(t-1))     for t > 1
 * Values of t below 1 are treated like t == 1 so the recursion always
 * terminates.
 * NOTE(review): n/2 is an integer division because n is an int -- confirm
 * that truncation is intended.
 */
float func(int t,int n,int x)
{
    if (t <= 1)
        return (x + (n/2)*(n/2)) * (x + (n/2)*(n/2));
    return x + (n*t*t)/(2*func(t-1,n,x)) ;
}
According to the picture you have uploaded this is my code
Don't use 0 for n
using namespace std;
// Recursive denominator of the expression:
//   partSolver(x, p, 0) = 2 * x^2
//   partSolver(x, p, n) = 2 * (x - p*n^2 / partSolver(x, p, n-1))^2
// Values of n below 0 are treated like n == 0 so the recursion always
// terminates.
double partSolver(int x,int p, int n)
{
    if(n<=0) return 2*x*x;
    double val = x - ( (p*n*n) / partSolver(x,p,n-1) );
    return 2*val*val ;
}
double solver(int x,int p,int n)
return (n*n * 2) / partSolver(x,p,n-1);
int main()
cout<<"The Solution is: "<<solver(3,2,1)<<endl;
return 0;

Code for Julia set always generates a grey image in CUDA C

I've written the following code for generating a Julia set fractal in CUDA C/C++ with some help from online sources. I've been trying for hours now, but I'm unable to figure out as to why this always generates a grey image rather than the one I get when I run the CPU code. I'm new to CUDA C and parallel programming, and I'm currently referring to CUDA by Example by Sanders and Kandrot.
Here is the CPU variant of code, which runs fine with all the necessary imports in VS2013:
#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <string.h>
#include <IL/il.h>
#include <IL/ilu.h>
#include <time.h>
using namespace std;
#define N 1024
#define SQRT_2 1.4142
#define MAX_ITER 512
void HSVtoRGB( float *r, float *g, float *b, float h, float s, float v );
void saveImage(int width, int height, unsigned char * bitmap, complex<float> seed);
void compute_julia(complex<float> c, unsigned char * image);
// CPU entry point: renders an N x N Julia set for seed c into an RGB
// buffer and saves it via saveImage().
// NOTE(review): the braces and the body of the `if` branch are missing
// from this listing; the usage text suggests the branch read the seed
// from argv[1] and argv[2] -- confirm against the original.
int main(int argc, char **argv)
// Default seed, used when no seed is given on the command line.
complex<float> c(0.285f, 0.01f);
if(argc > 2)
} else
fprintf(stderr, "Usage: %s <real> <imag>\nWhere <real> and <imag> form the complex seed for the Julia set.\n", argv[0]);
// 3 bytes per pixel.
unsigned char *image = new unsigned char[N*N*3]; //RGB image
compute_julia(c, image);
saveImage(N, N, image, c);
delete[] image;
// Renders the Julia set for seed c into `image`, an N*N*3 byte buffer.
// Pixel (x, y) maps to the complex start value
//   z = (4x/N - 2) + i(4y/N - 2),
// which is iterated as z = z^2 + c until |z|^2 > 4 or MAX_ITER is reached.
// Points that never escape are black; the others get a hue derived from
// the escape iteration.
void compute_julia(complex<float> c, unsigned char * image)
{
    complex<float> z_new(0.0f, 0.0f);
    for(int y=0; y<N; y++)
    {
        for(int x=0; x<N; x++)
        {
            z_new.real(4.0f * x / (N) - 2.0f);
            z_new.imag(4.0f * y / (N) - 2.0f);
            int i;
            for(i=0; i<MAX_ITER; i++)
            {
                z_new = pow(z_new, 2);
                z_new += c;
                if(norm(z_new) > 4.0f) break;
            }
            // i == MAX_ITER means the point did not escape.
            float brightness = (i<MAX_ITER) ? 1.0f : 0.0f;
            float hue = (i % MAX_ITER)/float(MAX_ITER - 1);
            hue = (120*sqrtf(hue) + 150);
            float r, g, b;
            HSVtoRGB(&r, &g, &b, hue, 1.0f, brightness);
            // Stored in B, G, R byte order.
            image[(x + y*N)*3 + 0] = (unsigned char)(b*255);
            image[(x + y*N)*3 + 1] = (unsigned char)(g*255);
            image[(x + y*N)*3 + 2] = (unsigned char)(r*255);
        }
    }
}
void saveImage(int width, int height, unsigned char * bitmap, complex<float> seed)
ILuint imageID = ilGenImage();
ilTexImage(width, height, 1, 3, IL_RGB, IL_UNSIGNED_BYTE, bitmap);
char imageName[256];
sprintf(imageName, "Julia %.3f + i%.3f.png", seed.real(), seed.imag());
ilSave(IL_PNG, imageName);
fprintf(stderr, "Image saved as: %s\n", imageName);
// r,g,b values are from 0 to 1
// h = [0,360], s = [0,1], v = [0,1]
// if s == 0, then h = -1 (undefined)
// Converts an HSV colour to RGB.
//   h in [0,360), s and v in [0,1]; *r, *g, *b receive values in [0,1].
// When s == 0 the colour is a grey of brightness v and h is ignored.
void HSVtoRGB( float *r, float *g, float *b, float h, float s, float v )
{
    int i;
    float f, p, q, t;
    if( s == 0 ) {
        // achromatic (grey)
        *r = *g = *b = v;
        return;
    }
    h /= 60;            // sector 0 to 5
    i = floor( h );
    f = h - i;          // fractional part of h
    p = v * ( 1 - s );
    q = v * ( 1 - s * f );
    t = v * ( 1 - s * ( 1 - f ) );
    // Each sector must end with break; otherwise every case falls through
    // to the default and overwrites the result.
    switch( i ) {
        case 0:
            *r = v;
            *g = t;
            *b = p;
            break;
        case 1:
            *r = q;
            *g = v;
            *b = p;
            break;
        case 2:
            *r = p;
            *g = v;
            *b = t;
            break;
        case 3:
            *r = p;
            *g = q;
            *b = v;
            break;
        case 4:
            *r = t;
            *g = p;
            *b = v;
            break;
        default:        // case 5:
            *r = v;
            *g = p;
            *b = q;
            break;
    }
}
And here is the corresponding GPU version (note that it is pretty unrefined at the moment, I'll do that once I'm able to get basic functionality out of it):
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <complex>
#include <string.h>
#include <IL/il.h>
#include <IL/ilu.h>
#include <time.h>
using namespace std;
#define N 1024
#define SQRT_2 1.4142
#define MAX_ITER 512
// Minimal complex number usable from both host and device code.
struct cuComplex {
    float r;    // real part
    float i;    // imaginary part
    __host__ __device__ cuComplex(float a, float b) : r(a), i(b) {}
    // Squared magnitude |z|^2 (no square root taken).
    __host__ __device__ float magnitude2(void) const {
        return r * r + i * i;
    }
    // Complex product.
    __host__ __device__ cuComplex operator*(const cuComplex& a) const {
        return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
    }
    // Complex sum.
    __host__ __device__ cuComplex operator+(const cuComplex& a) const {
        return cuComplex(r + a.r, i + a.i);
    }
};
void HSVtoRGB(float *r, float *g, float *b, float h, float s, float v);
void saveImage(int width, int height, unsigned char * bitmap, cuComplex seed);
void compute_julia(complex<float> c, unsigned char * image);
__global__ void compute_julia_gpu(unsigned char* image);
__device__ void HSVtoRGB_GPU(float *r, float *g, float *b, float h, float s, float v);
// GPU entry point: launches one block per pixel to render the Julia set,
// copies the result back and saves it.
// NOTE(review): the braces and part of the argument handling are missing
// from this listing.
int main(int argc, char **argv)
// Seed used for the output file name; the kernel takes only the image
// pointer as its argument.
cuComplex c(-0.8f, 0.156f);
if (argc > 2)
fprintf(stderr, "Usage: %s <real> <imag>\nWhere <real> and <imag> form the complex seed for the Julia set.\n", argv[0]);
// N x N grid, launched below with one thread per block.
dim3 grid(N, N);
unsigned char *image = new unsigned char[N*N * 3]; //RGB image
// NOTE(review): `image` is a pointer, so sizeof(image) is the size of the
// pointer, not of the N*N*3-byte buffer. This `size` is what is passed to
// cudaMalloc and cudaMemcpy below.
size_t size = sizeof(image);
unsigned char *d_image; //RGB image
cudaMalloc((void **)&d_image, size);
compute_julia_gpu<<<grid, 1>>>(d_image);
cudaMemcpy(image, d_image, size, cudaMemcpyDeviceToHost);
saveImage(N, N, image, c);
// NOTE(review): d_image is not released in the visible code, and no CUDA
// return value is checked.
delete[] image;
// Kernel: each block computes one pixel (x = blockIdx.x, y = blockIdx.y) of
// the Julia set and writes its 3 colour bytes into `image`, which must hold
// at least N*N*3 bytes.
// The seed c is hard-coded here; it is not a kernel argument.
__global__ void compute_julia_gpu(unsigned char* image) {
    cuComplex z_new(0.0, 0.0);
    cuComplex c(-0.8f, 0.156f);
    int x = blockIdx.x;
    int y = blockIdx.y;
    // Map the pixel to a start value in [-2, 2) x [-2, 2).
    z_new.r = (4.0f * x / (N)-2.0f);
    z_new.i = (4.0f * y / (N)-2.0f);
    int i = 0;
    for (i = 0; i<MAX_ITER; i++)
    {
        z_new = (z_new * z_new) + c;
        if (z_new.magnitude2() > 4.0f) break;
    }
    // i == MAX_ITER means the point did not escape.
    float brightness = (i<MAX_ITER) ? 1.0f : 0.0f;
    float hue = (i % MAX_ITER) / float(MAX_ITER - 1);
    hue = (120 * sqrtf(hue) + 150);
    float r, g, b;
    HSVtoRGB_GPU(&r, &g, &b, hue, 1.0f, brightness);
    // Stored in B, G, R byte order.
    image[(x + y*N) * 3 + 0] = (unsigned char)(b * 255);
    image[(x + y*N) * 3 + 1] = (unsigned char)(g * 255);
    image[(x + y*N) * 3 + 2] = (unsigned char)(r * 255);
}
// Saves a width x height RGB byte buffer as a PNG named after the seed,
// using DevIL.
// NOTE(review): the generated imageID is never bound in this listing; an
// ilBindImage(imageID) call may have been lost -- confirm against the
// original.
void saveImage(int width, int height, unsigned char * bitmap, cuComplex seed)
{
    ILuint imageID = ilGenImage();
    ilTexImage(width, height, 1, 3, IL_RGB, IL_UNSIGNED_BYTE, bitmap);
    char imageName[256];
    // snprintf bounds the write to the buffer size.
    snprintf(imageName, sizeof imageName, "Julia %.3f + i%.3f.png", seed.r, seed.i);
    ilSave(IL_PNG, imageName);
    fprintf(stderr, "Image saved as: %s\n", imageName);
}
// Device-side HSV to RGB conversion.
//   h in [0,360), s and v in [0,1]; *r, *g, *b receive values in [0,1].
// When s == 0 the colour is a grey of brightness v and h is ignored.
__device__ void HSVtoRGB_GPU(float *r, float *g, float *b, float h, float s, float v)
{
    int i;
    float f, p, q, t;
    if (s == 0) {
        // achromatic (grey)
        *r = *g = *b = v;
        return;
    }
    h /= 60;            // sector 0 to 5
    i = floor(h);
    f = h - i;          // fractional part of h
    p = v * (1 - s);
    q = v * (1 - s * f);
    t = v * (1 - s * (1 - f));
    // Each sector must end with break; otherwise every case falls through
    // to the default and overwrites the result.
    switch (i) {
    case 0:
        *r = v;
        *g = t;
        *b = p;
        break;
    case 1:
        *r = q;
        *g = v;
        *b = p;
        break;
    case 2:
        *r = p;
        *g = v;
        *b = t;
        break;
    case 3:
        *r = p;
        *g = q;
        *b = v;
        break;
    case 4:
        *r = t;
        *g = p;
        *b = v;
        break;
    default:        // case 5:
        *r = v;
        *g = p;
        *b = q;
        break;
    }
}
Any help is appreciated, thanks.
The problem is your size variable:
#include <iostream>
#include <string.h>
using namespace std;
#define N 1024
// Demonstrates that sizeof applied to a pointer yields the size of the
// pointer itself, not the size of the buffer it points to.
int main() {
    unsigned char *image = new unsigned char[N*N * 3]; //RGB image
    size_t size = sizeof(image);    // size of `unsigned char *`
    cout << size;
    delete[] image;
    return 0;
}
In this case the output is 4 (on a 32-bit architecture), because sizeof returns the size of the variable's type. Here that type is unsigned char *, which is 4 bytes long.
You can run cuda-memcheck ./yourExecutable and you will see errors where your code performs out-of-bounds accesses to the GPU's global memory. You will see a lot of errors, because you are allocating only 4 bytes of global memory for your d_image array.

opencv: Rigid Transformation between two 3D point clouds

I have two 3D point clouds, and I'd like to use opencv to find the rigid transformation matrix (translation, rotation, constant scaling among all 3 axes).
I've found an estimateRigidTransformation function, but it's only for 2D points apparently
In addition, I've found estimateAffine3D, but it doesn't seem to support rigid transformation mode.
Do I need to just write my own rigid transformation function?
I did not find the required functionality in OpenCV so I have written my own implementation. Based on ideas from OpenSFM.
/* Returns the mean of all points, which are expected one per row in a
 * single-column matrix (the reduction averages over rows).
 */
cv::Vec3d
CalculateMean(const cv::Mat_<cv::Vec3d> &points)
{
    cv::Mat_<cv::Vec3d> result;
    cv::reduce(points, result, 0, CV_REDUCE_AVG);
    return result(0, 0);
}
/* Estimates the similarity transform (rotation, uniform scale, translation)
 * that maps points1 onto points2. Both inputs hold one point per row and
 * must be in corresponding order. Returns the top 3 rows of the 4x4
 * homogeneous matrix, i.e. a 3x4 matrix.
 */
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3d> &points1, const cv::Mat_<cv::Vec3d> points2)
{
    /* Calculate centroids. t1 and t2 are the NEGATED centroids. */
    cv::Vec3d t1 = -CalculateMean(points1);
    cv::Vec3d t2 = -CalculateMean(points2);

    /* T1 moves the centroid of points1 to the origin. */
    cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
    T1(0, 3) = t1[0];
    T1(1, 3) = t1[1];
    T1(2, 3) = t1[2];

    /* T2 moves the origin to the centroid of points2. */
    cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
    T2(0, 3) = -t2[0];
    T2(1, 3) = -t2[1];
    T2(2, 3) = -t2[2];

    /* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
     * which is used for scale calculation.
     */
    cv::Mat_<double> C(3, 3, 0.0);
    double p1Rms = 0, p2Rms = 0;
    for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
        cv::Vec3d p1 = points1(ptIdx, 0) + t1;
        cv::Vec3d p2 = points2(ptIdx, 0) + t2;
        /* Sum of squared distances from the centroid. */
        p1Rms += p1.dot(p1);
        p2Rms += p2.dot(p2);
        for (int i = 0; i < 3; i++) {
            for (int j = 0; j < 3; j++) {
                C(i, j) += p2[i] * p1[j];
            }
        }
    }

    cv::Mat_<double> u, s, vh;
    cv::SVD::compute(C, s, u, vh);

    cv::Mat_<double> R = u * vh;

    /* A negative determinant means a reflection; flip the last singular
     * direction to obtain a proper rotation.
     */
    if (cv::determinant(R) < 0) {
        R -= u.col(2) * (vh.row(2) * 2.0);
    }

    double scale = sqrt(p2Rms / p1Rms);
    R *= scale;

    cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
    R.copyTo(M.colRange(0, 3).rowRange(0, 3));

    cv::Mat_<double> result = T2 * M * T1;
    result /= result(3, 3);

    return result.rowRange(0, 3);
}
I've found PCL to be a nice adjunct to OpenCV. Take a look at their Iterative Closest Point (ICP) example. The provided example registers the two point clouds and then displays the rigid transformation.
Here's my rmsd code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
/* 4x4 float matrix used for homogeneous 3-D transforms. */
typedef struct
{
  float m[4][4];
} MATRIX;
/* Squared difference of coordinate k of two 3-vectors. */
#define VDIFF2_TERM_(a,b,k) ( ((a)[k]-(b)[k]) * ((a)[k]-(b)[k]) )
/* Squared Euclidean distance between two 3-vectors. */
#define vdiff2(a,b) ( VDIFF2_TERM_(a,b,0) + \
                      VDIFF2_TERM_(a,b,1) + \
                      VDIFF2_TERM_(a,b,2) )
static double alignedrmsd(float *v1, float *v2, int N);
static void centroid(float *ret, float *v, int N);
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx);
static void crossproduct(float *ans, float *pt1, float *pt2);
static void mtx_root(MATRIX *mtx);
static int almostequal(MATRIX *a, MATRIX *b);
static void mulpt(MATRIX *mtx, float *pt);
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y);
static void mtx_identity(MATRIX *mtx);
static void mtx_trans(MATRIX *mtx, float x, float y, float z);
static int mtx_invert(float *mtx, int N);
static float absmaxv(float *v, int N);
calculate rmsd between two structures
Params: v1 - first set of points
v2 - second set of points
N - number of points
mtx - return for transfrom matrix used to align structures
Returns: rmsd score
Notes: mtx can be null. Transform will be rigid. Inputs must
be previously aligned for sequence alignment
double rmsd(float *v1, float *v2, int N, float *mtx)
float cent1[3];
float cent2[3];
MATRIX tmtx;
MATRIX tempmtx;
MATRIX move1;
MATRIX move2;
int i;
double answer;
float *temp1 = 0;
float *temp2 = 0;
int err;
assert(N > 3);
temp1 = malloc(N * 3 * sizeof(float));
temp2 = malloc(N * 3 * sizeof(float));
if(!temp1 || !temp2)
goto error_exit;
centroid(cent1, v1, N);
centroid(cent2, v2, N);
temp1[i*3+0] = v1[i*3+0] - cent1[0];
temp1[i*3+1] = v1[i*3+1] - cent1[1];
temp1[i*3+2] = v1[i*3+2] - cent1[2];
temp2[i*3+0] = v2[i*3+0] - cent2[0];
temp2[i*3+1] = v2[i*3+1] - cent2[1];
temp2[i*3+2] = v2[i*3+2] - cent2[2];
err = getalignmtx(temp1, temp2, N, &tmtx);
if(err == -1)
goto error_exit;
mtx_trans(&move1, -cent2[0], -cent2[1], -cent2[2]);
mtx_mul(&tempmtx, &move1, &tmtx);
mtx_trans(&move2, cent1[0], cent1[1], cent1[2]);
mtx_mul(&tmtx, &tempmtx, &move2);
memcpy(temp2, v2, N * sizeof(float) * 3);
mulpt(&tmtx, temp2 + i * 3);
answer = alignedrmsd(v1, temp2, N);
memcpy(mtx, &tmtx.m, 16 * sizeof(float));
return answer;
mtx[i] = 0;
return sqrt(-1.0);
/*
  calculate rmsd between two aligned structures (trivial)
  Params: v1 - first structure
          v2 - second structure
          N - number of points
  Returns: rmsd
 */
static double alignedrmsd(float *v1, float *v2, int N)
{
  double answer =0;
  int i;

  /* sum of squared point-to-point distances */
  for(i=0;i<N;i++)
    answer += vdiff2(v1 + i *3, v2 + i * 3);
  return sqrt(answer/N);
}
/*
  compute the centroid
  Params: ret - return for the centroid (3 floats)
          v - the points, as x,y,z triples
          N - number of points (must be greater than zero)
 */
static void centroid(float *ret, float *v, int N)
{
  int i;

  ret[0] = 0;
  ret[1] = 0;
  ret[2] = 0;
  for(i=0;i<N;i++)
  {
    ret[0] += v[i*3+0];
    ret[1] += v[i*3+1];
    ret[2] += v[i*3+2];
  }
  ret[0] /= N;
  ret[1] /= N;
  ret[2] /= N;
}
/*
  get the matrix needed to align two structures
  Params: v1 - reference structure
          v2 - structure to align
          N - number of points
          mtx - return for rigid body alignment matrix
  Returns: 0 on success, -1 if no invertible correlation matrix was found
  Notes: only calculates rotation part of matrix.
         assumes input has been aligned to centroids
 */
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx)
{
  MATRIX A = { {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,1}} };
  MATRIX At;
  MATRIX Ainv;
  MATRIX temp;
  float tv[3];
  float tw[3];
  float tv2[3];
  float tw2[3];
  int k, i, j;
  int flag = 0;
  float correction;

  /* scale factor that keeps the accumulated products small */
  correction = absmaxv(v1, N * 3) * absmaxv(v2, N * 3);

  /* correlation matrix of the two point sets */
  for(k=0;k<N;k++)
    for(i=0;i<3;i++)
      for(j=0;j<3;j++)
        A.m[i][j] += (v1[k*3+i] * v2[k*3+j])/correction;

  /* flag counts failed inversions (up to 3); 5 marks success */
  while(flag < 3)
  {
    for(i=0;i<4;i++)
      for(j=0;j<4;j++)
        At.m[i][j] = A.m[j][i];

    memcpy(&Ainv, &A, sizeof(MATRIX));
    /* this will happen if all points are in a plane */
    if( mtx_invert((float *) &Ainv, 4) == -1)
    {
      /* add an out-of-plane term built from cross products and retry */
      if(flag == 0)
      {
        crossproduct(tv, v1, v1+3);
        crossproduct(tw, v2, v2+3);
      }
      else
      {
        crossproduct(tv2, tv, v1);
        crossproduct(tw2, tw, v2);
        memcpy(tv, tv2, 3 * sizeof(float));
        memcpy(tw, tw2, 3 * sizeof(float));
      }
      for(i=0;i<3;i++)
        for(j=0;j<3;j++)
          A.m[i][j] += tv[i] * tw[j];

      flag++;
    }
    else
      flag = 5;
  }
  if(flag != 5)
    return -1;

  /* rotation = sqrt(At * A) * inverse(A).
     NOTE(review): the mtx_root call is restored from the algorithm (the
     listing had lost it) -- confirm against the original. */
  mtx_mul(&temp, &At, &A);
  mtx_root(&temp);
  mtx_mul(mtx, &temp, &Ainv);
  return 0;
}
/*
  get the crossproduct of two vectors.
  Params: ans - return pointer for answer.
          pt1 - first vector
          pt2 - second vector.
  Notes: crossproduct is at right angles to the two vectors.
         ans must not overlap pt1 or pt2.
 */
static void crossproduct(float *ans, float *pt1, float *pt2)
{
  ans[0] = pt1[1] * pt2[2] - pt1[2] * pt2[1];
  /* the y component is z*x - x*z; the opposite order flips its sign */
  ans[1] = pt1[2] * pt2[0] - pt1[0] * pt2[2];
  ans[2] = pt1[0] * pt2[1] - pt1[1] * pt2[0];
}
Denman-Beavers square root iteration
static void mtx_root(MATRIX *mtx)
MATRIX Y = *mtx;
int iter = 0;
int i, ii;
invY = Y;
invZ = Z;
if( mtx_invert((float *) &invY, 4) == -1)
if( mtx_invert((float *) &invZ, 4) == -1)
Y1.m[i][ii] = 0.5 * (Y.m[i][ii] + invZ.m[i][ii]);
Z1.m[i][ii] = 0.5 * (Z.m[i][ii] + invY.m[i][ii]);
Y = Y1;
Z = Z1;
mtx_mul(&Y2, &Y, &Y);
while(!almostequal(&Y2, mtx) && iter++ < 20 );
*mtx = Y;
Check two matrices for near-enough equality
Params: a - first matrix
b - second matrix
Returns: 1 if almost equal, else 0, epsilon 0.0001f.
static int almostequal(MATRIX *a, MATRIX *b)
int i, ii;
float epsilon = 0.001f;
if(fabs(a->m[i][ii] - b->m[i][ii]) > epsilon)
return 0;
return 1;
multiply a point by a matrix.
Params: mtx - matrix
pt - the point (transformed)
static void mulpt(MATRIX *mtx, float *pt)
float ans[4] = {0};
int i;
int ii;
ans[i] += pt[ii] * mtx->m[ii][i];
ans[i] += mtx->m[3][i];
pt[0] = ans[0];
pt[1] = ans[1];
pt[2] = ans[2];
multiply two matrices.
Params: ans - return pointer for answer.
x - first matrix
y - second matrix.
Notes: ans may not be equal to x or y.
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y)
int i;
int ii;
int iii;
ans->m[i][ii] = 0;
ans->m[i][ii] += x->m[i][iii] * y->m[iii][ii];
create an identity matrix.
Params: mtx - return pointer.
static void mtx_identity(MATRIX *mtx)
int i;
int ii;
mtx->m[i][ii] = 1.0f;
mtx->m[i][ii] = 0;
create a translation matrix.
Params: mtx - return pointer for matrix.
x - x translation.
y - y translation.
z - z translation
static void mtx_trans(MATRIX *mtx, float x, float y, float z)
mtx->m[0][0] = 1;
mtx->m[0][1] = 0;
mtx->m[0][2] = 0;
mtx->m[0][3] = 0;
mtx->m[1][0] = 0;
mtx->m[1][1] = 1;
mtx->m[1][2] = 0;
mtx->m[1][3] = 0;
mtx->m[2][0] = 0;
mtx->m[2][1] = 0;
mtx->m[2][2] = 1;
mtx->m[2][3] = 0;
mtx->m[3][0] = x;
mtx->m[3][1] = y;
mtx->m[3][2] = z;
mtx->m[3][3] = 1;
/*
  matrix invert routine
  Params: mtx - the matrix in raw format, in/out
          N - width and height
  Returns: 0 on success, -1 on fail
  Notes: Gauss-Jordan elimination with full pivoting, done in place.
         On failure (singular matrix) mtx is left partially reduced.
 */
static int mtx_invert(float *mtx, int N)
{
  int indxc[100]; /* these 100s are the only restriction on matrix size */
  int indxr[100];
  int ipiv[100];
  int i, j, k;
  int irow = 0, icol = 0;
  double big;
  double pinv;
  int l, ll;
  double dum;
  double temp;

  assert(N <= 100);

  for(i=0;i<N;i++)
    ipiv[i] = 0;

  for(i=0;i<N;i++)
  {
    big = 0.0;

    /* find biggest element */
    for(j=0;j<N;j++)
      if(ipiv[j] != 1)
        for(k=0;k<N;k++)
          if(ipiv[k] == 0)
            if(fabs(mtx[j*N+k]) >= big)
            {
              big = fabs(mtx[j*N+k]);
              irow = j;
              icol = k;
            }

    /* mark this column as used for a pivot */
    ipiv[icol] = 1;

    /* move the pivot onto the diagonal by swapping rows */
    if(irow != icol)
      for(l=0;l<N;l++)
      {
        temp = mtx[irow * N + l];
        mtx[irow * N + l] = mtx[icol * N + l];
        mtx[icol * N + l] = temp;
      }

    indxr[i] = irow;
    indxc[i] = icol;

    /* if biggest element is zero matrix is singular, bail */
    if(mtx[icol* N + icol] == 0)
      goto error_exit;

    pinv = 1.0/mtx[icol * N + icol];

    mtx[icol * N + icol] = 1.0;

    /* normalize the pivot row */
    for(l=0;l<N;l++)
      mtx[icol * N + l] *= pinv;

    /* eliminate the pivot column from every other row */
    for(ll=0;ll<N;ll++)
      if(ll != icol)
      {
        dum = mtx[ll * N + icol];
        mtx[ll * N + icol] = 0.0;
        for(l=0;l<N;l++)
          mtx[ll * N + l] -= mtx[icol * N + l]*dum;
      }
  }

  /* unscramble matrix */
  for (l=N-1;l>=0;l--)
  {
    if (indxr[l] != indxc[l])
      for (k=0;k<N;k++)
      {
        temp = mtx[k * N + indxr[l]];
        mtx[k * N + indxr[l]] = mtx[k * N + indxc[l]];
        mtx[k * N + indxc[l]] = temp;
      }
  }

  return 0;

 error_exit:
  return -1;
}
/*
  get the absolute maximum of an array
  Params: v - the array
          N - number of elements
  Returns: largest absolute value in v, 0 if N is zero.
 */
static float absmaxv(float *v, int N)
{
  /* start from 0: absolute values are never negative, and reading an
     uninitialized value here would be undefined */
  float answer = 0;
  int i;

  for(i=0;i<N;i++)
    if(answer < fabs(v[i]))
      answer = fabs(v[i]);
  return answer;
}
#include <stdio.h>
/*
  debug utility
  Params: fp - stream to write to
          mtx - matrix to print, one row per line
 */
static void printmtx(FILE *fp, MATRIX *mtx)
{
  int i, ii;

  for(i=0;i<4;i++)
  {
    for(ii=0;ii<4;ii++)
      fprintf(fp, "%f, ", mtx->m[i][ii]);
    fprintf(fp, "\n");
  }
}
int rmsdmain(void)
float one[4*3] = {0,0,0, 1,0,0, 2,1,0, 0,3,1};
float two[4*3] = {0,0,0, 0,1,0, 1,2,0, 3,0,1};
double diff;
int i;
diff = rmsd(one, two, 4, (float *) &mtx.m);
printf("%f\n", diff);
printmtx(stdout, &mtx);
mulpt(&mtx, two + i * 3);
printf("%f %f %f\n", two[i*3], two[i*3+1], two[i*3+2]);
return 0;
I took @vagran's implementation and added RANSAC on top of it, since estimateRigidTransform2d does the same, and it was helpful for me because my data is noisy. (Note: this code does not apply the constant scaling along all 3 axes; you can easily add it back by comparing with vagran's version.)
// Returns the mean of the points, which must be stored one per row in a
// single-column matrix. An empty matrix yields the zero vector.
// The sums are accumulated in double precision.
cv::Vec3f CalculateMean(const cv::Mat_<cv::Vec3f> &points)
{
    if(points.size().height == 0){
        return 0;
    }
    assert(points.size().width == 1);
    double mx = 0.0;
    double my = 0.0;
    double mz = 0.0;
    int n_points = points.size().height;
    for(int i = 0; i < n_points; i++){
        double x = double(points(i)[0]);
        double y = double(points(i)[1]);
        double z = double(points(i)[2]);
        mx += x;
        my += y;
        mz += z;
    }
    return cv::Vec3f(mx/n_points, my/n_points, mz/n_points);
}
/* Estimates the rigid transform (rotation and translation, no scaling) that
 * maps points1 onto points2. Both inputs hold one point per row and must be
 * in corresponding order. Returns the 4x4 homogeneous matrix.
 */
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> points2)
{
    /* Calculate centroids. */
    cv::Vec3f t1 = CalculateMean(points1);
    cv::Vec3f t2 = CalculateMean(points2);

    /* T1 moves the centroid of points1 to the origin. */
    cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
    T1(0, 3) = double(-t1[0]);
    T1(1, 3) = double(-t1[1]);
    T1(2, 3) = double(-t1[2]);

    /* T2 moves the origin to the centroid of points2. */
    cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
    T2(0, 3) = double(t2[0]);
    T2(1, 3) = double(t2[1]);
    T2(2, 3) = double(t2[2]);

    /* Calculate covariance matrix for the centered input points. */
    cv::Mat_<double> C(3, 3, 0.0);
    for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
        cv::Vec3f p1 = points1(ptIdx) - t1;
        cv::Vec3f p2 = points2(ptIdx) - t2;
        for (int i = 0; i < 3; i++) {
            for (int j = 0; j < 3; j++) {
                C(i, j) += double(p2[i] * p1[j]);
            }
        }
    }

    cv::Mat_<double> u, s, vt;
    cv::SVD::compute(C, s, u, vt);

    cv::Mat_<double> R = u * vt;

    /* A negative determinant means a reflection; flip the last singular
     * direction to obtain a proper rotation.
     */
    if (cv::determinant(R) < 0) {
        R -= u.col(2) * (vt.row(2) * 2.0);
    }

    cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
    R.copyTo(M.colRange(0, 3).rowRange(0, 3));

    cv::Mat_<double> result = T2 * M * T1;
    result /= result(3, 3);

    return result;
}
// RANSAC wrapper around FindRigidTransform.
// Repeatedly fits a transform to 3 randomly chosen correspondences, counts
// how many points of points1 land within `threshold` units of their partner
// in points2, and returns the candidate with the most inliers.
// Returns an empty matrix when fewer than 3 correspondences are given.
cv::Mat_<double> RANSACFindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> &points2)
{
    cv::Mat points1Homo;
    cv::convertPointsToHomogeneous(points1, points1Homo);

    int iterations = 100;
    int min_n_points = 3;
    int n_points = points1.size().height;
    std::vector<int> range(n_points);
    cv::Mat_<double> best;
    int best_inliers = -1;
    // inlier points should be projected within this many units
    float threshold = .02;

    // Not enough correspondences to draw a sample from.
    if(n_points < min_n_points) {
        return best;
    }

    std::iota(range.begin(), range.end(), 0);
    auto gen = std::mt19937{std::random_device{}()};
    for(int i = 0; i < iterations; i++) {
        // The first min_n_points shuffled indices form the random sample.
        std::shuffle(range.begin(), range.end(), gen);
        cv::Mat_<cv::Vec3f> points1subset(min_n_points, 1, cv::Vec3f(0,0,0));
        cv::Mat_<cv::Vec3f> points2subset(min_n_points, 1, cv::Vec3f(0,0,0));
        for(int j = 0; j < min_n_points; j++) {
            points1subset(j) = points1(range[j]);
            points2subset(j) = points2(range[j]);
        }
        // Keep the candidate in double precision; a float copy is used only
        // for projecting the points.
        cv::Mat_<double> rigidT = FindRigidTransform(points1subset, points2subset);
        cv::Mat_<float> rigidT_float = cv::Mat::eye(4, 4, CV_32F);
        rigidT.convertTo(rigidT_float, CV_32F);
        std::vector<int> inliers;
        for(int j = 0; j < n_points; j++) {
            cv::Mat_<float> t1_3d = rigidT_float * cv::Mat_<float>(points1Homo.at<cv::Vec4f>(j));
            if(t1_3d(3) == 0) {
                continue; // Avoid 0 division
            }
            float dx = (t1_3d(0)/t1_3d(3) - points2(j)[0]);
            float dy = (t1_3d(1)/t1_3d(3) - points2(j)[1]);
            float dz = (t1_3d(2)/t1_3d(3) - points2(j)[2]);
            float square_dist = dx * dx + dy * dy + dz * dz;
            if(square_dist < threshold * threshold){
                inliers.push_back(j);
            }
        }
        int n_inliers = inliers.size();
        if(n_inliers > best_inliers) {
            best_inliers = n_inliers;
            best = rigidT;
        }
    }
    return best;
}
#vagran Thanks for the code! Seems to work very well.
I do have a small terminology suggestion, though. Since you are estimating and applying a scale during the transformation, it is a 7-parameter transformation, also known as a Helmert or similarity transformation. In a rigid transformation, no scaling is applied, because all Euclidean distances need to be preserved.
I would've added this as comment, but don't have enough points.. D: sorry for that.
rigid transformation:
Helmert transformation:

AltiVec vec_msum equivalent for float values

Is anybody aware of a method to achieve vec_msum functionality against a vector of float values?
I'm quite new to SIMD, and although I think I'm starting to make sense of it - there are still a few puzzles.
My end goal is to rewrite the function "convolve_altivec" (as found in the accepted answer for this question) such that it accepts input parameters as float values, instead of short's.
That is, the prototype should be
float convolve_altivec(const float *a, const float *b, int n)
I'm trying to match the functionality provided by the original non-optimised function below:
/* Reference (non-optimised) version: returns the sum of a[i] * b[i] over
 * the first n elements. n must not be negative.
 */
float convolve(const float *a, const float *b, int n)
{
    float out = 0.f;
    while (n --)
        out += (*(a ++)) * (*(b ++));
    return out;
}
My initial efforts have seen me trying to port an existing SSE version of this same function to altivec instructions.
For a float version you need vec_madd.
Here's a float version of the previous 16 bit int version and test harness I posted in response to your earlier question:
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <altivec.h>
/* Scalar reference implementation: returns the sum of a[i] * b[i] over the
 * first n elements.
 */
static float convolve_ref(const float *a, const float *b, int n)
{
    float sum = 0.0f;
    int i;

    for (i = 0; i < n; ++i)
    {
        sum += a[i] * b[i];
    }
    return sum;
}
static inline float convolve_altivec(const float *a, const float *b, int n)
float sum = 0.0f;
vector float vsum = { 0.0f, 0.0f, 0.0f, 0.0f };
union {
vector float v;
float a[4];
} usum;
vector float *pa = (vector float *)a;
vector float *pb = (vector float *)b;
assert(((unsigned long)a & 15) == 0);
assert(((unsigned long)b & 15) == 0);
while (n >= 4)
vsum = vec_madd(*pa, *pb, vsum);
n -= 4;
usum.v = vsum;
sum = usum.a[0] + usum.a[1] + usum.a[2] + usum.a[3];
a = (float *)pa;
b = (float *)pb;
while (n --)
sum += (*a++ * *b++);
return sum;
int main(void)
const int n = 1002;
vector float _a[n / 4 + 1];
vector float _b[n / 4 + 1];
float *a = (float *)_a;
float *b = (float *)_b;
float sum_ref, sum_test;
int i;
for (i = 0; i < n; ++i)
a[i] = (float)rand();
b[i] = (float)rand();
sum_ref = convolve_ref(a, b, n);
sum_test = convolve_altivec(a, b, n);
printf("sum_ref = %g\n", sum_ref);
printf("sum_test = %g\n", sum_test);
printf("%s\n", fabsf((sum_ref - sum_test) / sum_ref) < 1.0e-6 ? "PASS" : "FAIL");
return 0;