Reducing the error of my derivative function - c++

I would like to solve a formula
which describes a complex function E extending in the r-direction (in a cylindrical coordinate system) and propagating in the z-direction. Because it cannot be rewritten as a time derivative, I wanted to solve it as
by using the following code:
// MWE_derivation.cpp : Defines the entry point for the console application.
#include "stdafx.h"
#include <cmath>
#include <iostream>
#include <vector>
#include <complex>
#include <fstream>
//Some typedefs
typedef std::complex<double> COMPLEX;
typedef std::vector<COMPLEX> C_VECTOR;
typedef std::vector<C_VECTOR> C_MATRIX;
//Some constants
#define L0 1.2e-6
#define C90 299792458
#define K0 (2 * M_PI / L0)
#define W0 (2 * M_PI*C90 / L0)
#define FWHM0 250e-15
#define W_PULSE 3.25e-6
#define E_VAL 1.60217662e-19
#define ME 9.10938356e-31
#define M0 0.15*ME
//const double rho0 = 0;
#define BETA0 2e-11
#define HBAR 1.0545718e-34
#define N0 3.52
#define N2 4.5e-18
#define ENERGY 2.5e-6
#define POWER 1e7
#define RANGE_TIME 3
#define RANGE_SPACE 3
//Printing function
void print_vector(const C_VECTOR &data, std::string filename)
std::ofstream file_out;;
for (unsigned int j = 0; j < data.size(); j++)
file_out << data[j] << ' ';
file_out << '\n';
//Derivative functions
COMPLEX forward_diff_first(const C_VECTOR &x, const int p0, const double dx)
return (x[p0 + 1] - x[p0]) / dx;
COMPLEX forward_diff_second(const C_VECTOR &x, const int p0, const double dx)
return (x[p0 + 2] - x[p0 + 1] - x[p0 + 1] + x[p0]) / (dx*dx);
COMPLEX backward_diff_first(const C_VECTOR &x, const int p0, const double dx)
return (x[p0] - x[p0 - 1]) / dx;
COMPLEX backward_diff_second(const C_VECTOR &x, const int p0, const double dx)
return (x[p0 - 2] - x[p0 - 1] - x[p0 - 1] + x[p0]) / (dx*dx);
COMPLEX central_diff_first(const C_VECTOR &x, const int p0, const double dx)
return (x[p0 + 1] - x[p0 - 1]) / (2 * dx);
/**
 * Second derivative of the sampled function `x` at index `p0`, computed with
 * the compact three-point central stencil
 *
 *     (x[p0 + 1] - 2 * x[p0] + x[p0 - 1]) / dx^2
 *
 * The previous stencil used the neighbours at distance 2 (p0 - 2, p0 + 2).
 * That is the same formula on a grid of spacing 2 * dx: its truncation error
 * is four times larger, and because it only couples samples of equal parity
 * it lets even and odd grid points drift apart, which shows up as
 * point-to-point oscillations.
 *
 * @param x   sampled function values
 * @param p0  index at which to differentiate; requires 1 <= p0 and
 *            p0 + 1 < x.size()
 * @param dx  grid spacing, must be non-zero
 * @return    second-order accurate approximation of the second derivative
 */
COMPLEX central_diff_second(const C_VECTOR &x, const int p0, const double dx)
{
    return (x[p0 + 1] - 2.0 * x[p0] + x[p0 - 1]) / (dx * dx);
}
C_VECTOR central_diff_f(const C_VECTOR &x, const double dx)
C_VECTOR cdf = C_VECTOR(x.size());
cdf[0] = forward_diff_first(x, 0, dx);
cdf[x.size() - 1] = backward_diff_first(x, x.size() - 1, dx);
for (unsigned int i = 1; i < x.size() - 1; i++)
cdf[i] = central_diff_first(x, i, dx);
return cdf;
C_VECTOR forward_diff_f(const C_VECTOR &x, const double dx)
C_VECTOR cdf = C_VECTOR(x.size());
cdf[x.size() - 1] = backward_diff_first(x, x.size() - 1, dx);
for (unsigned int i = 0; i < x.size() - 1; i++)
cdf[i] = forward_diff_first(x, i, dx);
return cdf;
C_VECTOR backward_diff_f(const C_VECTOR &x, const double dx)
C_VECTOR cdf = C_VECTOR(x.size());
cdf[0] = forward_diff_first(x, 0, dx);
for (unsigned int i = 1; i < x.size(); i++)
cdf[i] = backward_diff_first(x, i, dx);
return cdf;
C_VECTOR central_diff_s(const C_VECTOR &x, const double dx)
C_VECTOR cdf = C_VECTOR(x.size());
cdf[x.size() - 1] = backward_diff_second(x, x.size() - 1, dx);
cdf[x.size() - 2] = backward_diff_second(x, x.size() - 2, dx);
cdf[0] = forward_diff_second(x, 0, dx);
cdf[1] = forward_diff_second(x, 1, dx);
for (unsigned int i = 2; i < x.size() - 2; i++)
cdf[i] = central_diff_second(x, i, dx);
return cdf;
//Auxiliary functions
bool isnan(const COMPLEX &in)
return isnan(in.real()) || isnan(in.imag());
/**
 * Returns `num_in` evenly spaced values covering the closed interval
 * [start_in, end_in].
 *
 * The spacing is (end_in - start_in) / (num_in - 1), so the first element is
 * start_in and the last element is exactly end_in.
 *
 * @param start_in first value
 * @param end_in   last value (included)
 * @param num_in   number of samples; <= 0 yields an empty vector, 1 yields
 *                 a vector holding only start_in
 * @return         vector with max(num_in, 0) elements
 */
template<typename T>
std::vector<T> linspace(T start_in, T end_in, int num_in)
{
    std::vector<T> linspaced;
    if (num_in <= 0)
        return linspaced;

    linspaced.reserve(num_in);
    if (num_in == 1)
    {
        // A single sample has no spacing; avoid the division by zero below.
        linspaced.push_back(start_in);
        return linspaced;
    }

    const double start = static_cast<double>(start_in);
    const double end = static_cast<double>(end_in);
    const double delta = (end - start) / (num_in - 1);
    for (int i = 0; i < num_in - 1; ++i)
        linspaced.push_back(static_cast<T>(start + delta * i));
    // Append the end point itself so rounding in start + delta * i cannot
    // make the last sample miss it.
    linspaced.push_back(end_in);
    return linspaced;
}
C_VECTOR gaussian(const double w, const std::vector<double> r, const double P0)
C_VECTOR ret_val(r.size());
for (unsigned int i = 0; i < r.size(); i++)
ret_val[i] = P0 * exp(-COMPLEX(2, 0) * r[i] * r[i] / (w*w));
return ret_val;
double get_abs(COMPLEX number)
return number.real()*number.real() + number.imag()*number.imag();
//Main function
C_VECTOR deriv_func(const C_VECTOR &in, const double dx)
return central_diff_f(in, dx);
/**
 * Advances the radial profile `in` by `steps` explicit update steps and
 * returns the resulting profile.
 *
 * In every step the current profile is written to "new_vec_<i>.txt", its
 * first and second radial derivatives are evaluated by finite differences,
 * and each sample is incremented by the sum of four terms (transverse
 * derivative term, two terms proportional to get_abs(E) * E, and a term
 * proportional to the constant density 1e20).
 *
 * Handling of the axis: the expression E'/r + E'' is singular at r = 0.
 * For a profile that is symmetric about the axis E'(0) = 0, hence
 * E'/r -> E'' and the sum tends to 2 * E''(0); with the mirror sample
 * E(-dr) = E(dr) this is 4 * (E[1] - E[0]) / dr^2. Using that limit instead
 * of dividing a one-sided difference by a tiny offset removes the huge
 * values that otherwise appear at the left border after a few steps.
 *
 * @param in    initial profile; needs at least 4 samples
 * @param r     radial positions, one per sample, uniformly spaced
 * @param dz    step length
 * @param steps number of steps to perform
 * @param pref  complex prefactor of the transverse derivative term
 * @return      profile after the last step; `in` unchanged if the inputs are
 *              too small; the profile reached so far if a NaN derivative is
 *              detected
 */
C_VECTOR calc_steps(const C_VECTOR &in, const std::vector<double> r, const double dz, const int steps, const COMPLEX pref)
{
    C_VECTOR calc_val = in;
    // The one-sided second differences at the borders read up to 4 samples.
    if (in.size() < 4 || r.size() < in.size())
        return calc_val;

    // std::abs, not abs: the unqualified name may select the int overload
    // and truncate the spacing to zero.
    const double dr = std::abs(r[1] - r[0]);
    const C_VECTOR rho_vals(in.size(), COMPLEX(1e20, 0));

    for (int i = 0; i < steps; i++)
    {
        print_vector(calc_val, "new_vec_" + std::to_string(i) + ".txt");
        const C_VECTOR deriv = deriv_func(calc_val, dr);
        const C_VECTOR deriv2 = central_diff_s(calc_val, dr);

        // Check every sample, including index 0 where trouble shows first.
        for (unsigned int j = 0; j < in.size(); j++)
        {
            if (isnan(deriv[j]) || isnan(deriv2[j]))
            {
                std::cout << "isnan value in round " << i << " detected.\n ";
                return calc_val;
            }
        }

        // On-axis limit of E'/r + E'', taken from the not yet updated profile.
        const COMPLEX axis_term = 4.0 * (calc_val[1] - calc_val[0]) / (dr * dr);

        for (unsigned int j = 0; j < in.size(); j++)
        {
            const COMPLEX transverse = (r[j] == 0.0) ? axis_term
                                                     : deriv[j] / r[j] + deriv2[j];
            const double intensity = get_abs(calc_val[j]);
            // NOTE(review): only the first term carries dz, and the
            // transverse term is multiplied by E once more. Both are kept as
            // in the original update formula -- confirm against the equation
            // being solved.
            const COMPLEX add = pref * dz * transverse * calc_val[j]
                + COMPLEX(0, 1) * (K0) * N2 / N0 * intensity * calc_val[j]
                - BETA0 / 2 * calc_val[j] * intensity
                - 2 * M_PI * COMPLEX(0, 1) * K0 * E_VAL * E_VAL / (N0 * N0 * M0 * W0 * W0) * rho_vals[j] * calc_val[j];
            calc_val[j] += add;
        }
    }
    return calc_val;
}
int main()
double w = 3.25e-6;
int num_vals = 1000;
std::vector<double> r_val = linspace<double>(0.0, 3 * w, num_vals);
C_VECTOR gauss = gaussian(w, r_val, 1e-2);
print_vector(gauss, "gauss.txt");
std::cout << "pref is " << COMPLEX(0, 1 / (2 * K0)) << '\n';
C_VECTOR new_vec = calc_steps(gauss, r_val, 1e-7, 1000, COMPLEX(0, 1)/(2*K0));
print_vector(new_vec, "new_vec.txt");
return 0;
Unfortunately, my derivative functions add artifacts to the function, resulting in overshooting and extremely large numbers at the borders. This can be seen in the code above with the current values, already in iteration 2 (in the file new_vec_2.txt, at the left border). The resulting function is no longer smooth. Adding more points only delays the problem but does not fix it. Is there anything I can do to improve the code and prevent the overshooting? Or is the algorithm at fault, so that I have to develop another one?


How to find the smallest (optimized) distance between two vectors in C++

I'm translating the Python version of 'page_dewarper' into C++. I'm going to use dlib, which is a fantastic tool that has helped me with a few optimization problems before. In line 748 of the GitHub repo, Matt uses the optimize function from SciPy to find the minimal distance between two vectors. I think my C++ equivalent should be solve_least_squares_lm() or solve_least_squares(). I'll give a concrete example to analyze.
My data:
a) dstpoints is a vector with OpenCV points - std::vector<cv::Point2f> (I have 162 points in this example, they are not changing),
b) ppts is also std::vector<cv::Point2f> and the same size as dstpoints.
std::vector<cv::Point2f> ppts = project_keypoints(params, input);
It is dependent on:
- dlib::column_vector 'input' is 2*162=324 long and is not changing,
- dlib::column_vector 'params' is 189 long and its values should be changed to get the minimal value of variable 'suma', something like this:
double suma = 0.0;
for (int i=0; i<dstpoints_size; i++)
suma += pow(dstpoints[i].x - ppts[i].x, 2);
suma += pow(dstpoints[i].y - ppts[i].y, 2);
I'm looking for the 'params' vector that gives the smallest value of the variable 'suma'. A least-squares algorithm seems to be a good option to solve this, but I don't know whether it suits my case.
I think, my problem is that for every different 'params' vector I get different 'ppts' vector, not only single value, and I don't know if solve_least_squares function can match my example.
I must calculate residual for every point. I think, my 'list' from aforementioned link should be something like this:
(ppts[i].x - dstpoints[i].x, ppts[i].y - dstpoints[i].y, ppts[i+1].x - dstpoints[i+1].x, ppts[i+1].y - dstpoints[i+1].y, etc.)
, where 'ppts' vector depends on 'params' vector and then this problem can be solved with least squares algorithm. I don't know how to create data_samples with these assumptions, because it requires dlib::input_vector for every sample, as it is shown in example:
Am I thinking right?
I'm doing the same thing these days. My solution is to write a Powell class myself. It works, but really slowly: the program takes 2 minutes to dewarp linguistics_thesis.jpg.
I don't know what causes the program to run so slowly. Maybe it is the algorithm, or the code has some extra loops. I'm a Chinese student and my school only has Java lessons, so it is normal if you find some superfluous code in my program.
Here is my Powell class.
using namespace std;
using namespace cv;
class MyPowell
vector<vector<double>> xi;
vector<double> pcom;
vector<double> xicom;
vector<Point2d> dstpoints;
vector<double> myparams;
vector<double> params;
vector<Point> keypoint_index;
Point2d dst_br;
Point2d dims;
int N;
int itmax;
int ncom;
int iter;
double fret, ftol;
int usingAorB;
MyPowell(vector<Point2d> &dstpoints, vector<double> &params, vector<Point> &keypoint_index);
MyPowell(Point2d &dst_br, vector<double> &params, Point2d & dims);
double obj(vector<double> &params);
void powell(vector<double> &p, vector<vector<double>> &xi, double ftol, double &fret);
double sign(double a);// , double b);
double sqr(double a);
void linmin(vector<double> &p, vector<double> &xit, int n, double &fret);
void mnbrak(double & ax, double & bx, double & cx,
double & fa, double & fb, double & fc);
double f1dim(double x);
double brent(double ax, double bx, double cx, double & xmin, double tol);
vector<double> usePowell();
void erase(vector<double>& pbar, vector<double> &prr, vector<double> &pr);
MyPowell::MyPowell(vector<Point2d> &dstpoints, vector<double>& params, vector<Point> &keypoint_index)
this->dstpoints = dstpoints;
this->myparams = params;
this->keypoint_index = keypoint_index;
N = params.size();
itmax = N * N;
usingAorB = 1;
MyPowell::MyPowell(Point2d & dst_br, vector<double>& params, Point2d & dims)
this->dst_br = dst_br;
this->params = params;
this->dims = dims;
N = 2;
itmax = N * 1000;
usingAorB = 2;
usingAorB = 3;
double MyPowell::obj(vector<double> &myparams)
if (1 == usingAorB)
vector<Point2d> ppts = Dewarp::projectKeypoints(keypoint_index, myparams);
double total = 0;
for (int i = 0; i < ppts.size(); i++)
double x = dstpoints[i].x - ppts[i].x;
double y = dstpoints[i].y - ppts[i].y;
total += (x * x + y * y);
return total;
else if(2 == usingAorB)
dims.x = myparams[0];
dims.y = myparams[1];
//cout << "dims.x " << dims.x << " dims.y " << dims.y << endl;
vector<Point2d> vdims = { dims };
vector<Point2d> proj_br = Dewarp::projectXY(vdims, params);
double total = 0;
double x = dst_br.x - proj_br[0].x;
double y = dst_br.y - proj_br[0].y;
total += (x * x + y * y);
return total;
return 0;
void MyPowell::powell(vector<double> &x, vector<vector<double>> &direc, double ftol, double &fval)
vector<double> x1;
vector<double> x2;
vector<double> direc1;
int myitmax = 20;
myitmax = 10;
else if (N > 300)
myitmax = 15;
double fx2, t, fx, dum, delta;
fval = obj(x);
int bigind;
for (int j = 0; j < N; j++)
int iter = 0;
while (true)
iter += 1;
fx = fval;
bigind = 0;
delta = 0.0;
for (int i = 0; i < N; i++)
direc1 = direc[i];
fx2 = fval;
linmin(x, direc1, N, fval);
if (fabs(fx2 - fval) > delta)
delta = fabs(fx2 - fval);
bigind = i;
if (2.0 * fabs(fx - fval) <= ftol * (fabs(fx) + fabs(fval)) + 1e-7)
erase(direc1, x2, x1);
if (iter >= itmax)
cout << "powell exceeding maximum iterations" << endl;
if (!x2.empty())
for (int j = 0; j < N; j++)
x2.push_back(2.0*x[j] - x1[j]);
direc1[j] = x[j] - x1[j];
x1[j] = x[j];
cout << fx2 << endl;
fx2 = obj(x2);
if (myitmax < 0)
} while (fx2 >= fx);
dum = fx - 2 * fval + fx2;
t = 2.0*dum*pow((fx - fval - delta), 2) - delta * pow((fx - fx2), 2);
} while (t >= 0.0);
linmin(x, direc1, N, fval);
direc[bigind] = direc1;
double MyPowell::sign(double a)//, double b)
if (a > 0.0)
return 1;
if (a < 0.0)
return -1;
return 0;
double MyPowell::sqr(double a)
return a * a;
void MyPowell::linmin(vector<double>& p, vector<double>& xit, int n, double &fret)
double tol = 1e-2;
ncom = n;
pcom = p;
xicom = xit;
double ax = 0.0;
double xx = 1.0;
double bx = 0.0;
double fa, fb, fx, xmin;
mnbrak(ax, xx, bx, fa, fx, fb);
fret = brent(ax, xx, bx, xmin, tol);
for (int i = 0; i < n; i++)
xit[i] = (xmin * xit[i]);
p[i] += xit[i];
void MyPowell::mnbrak(double & ax, double & bx, double & cx,
double & fa, double & fb, double & fc)
const double GOLD = 1.618034, GLIMIT = 110.0, TINY = 1e-20;
double val, fw, tmp2, tmp1, w, wlim;
double denom;
fa = f1dim(ax);
fb = f1dim(bx);
if (fb > fa)
val = ax;
ax = bx;
bx = val;
val = fb;
fb = fa;
fa = val;
cx = bx + GOLD * (bx - ax);
fc = f1dim(cx);
int iter = 0;
while (fb >= fc)
tmp1 = (bx - ax) * (fb - fc);
tmp2 = (bx - cx) * (fb - fa);
val = tmp2 - tmp1;
if (fabs(val) < TINY)
denom = 2.0*TINY;
denom = 2.0*val;
w = bx - ((bx - cx)*tmp2 - (bx - ax)*tmp1) / (denom);
wlim = bx + GLIMIT * (cx - bx);
if ((bx - w) * (w - cx) > 0.0)
fw = f1dim(w);
if (fw < fc)
ax = bx;
fa = fb;
bx = w;
fb = fw;
else if (fw > fb)
cx = w;
fc = fw;
w = cx + GOLD * (cx - bx);
fw = f1dim(w);
if ((cx - w)*(w - wlim) >= 0.0)
fw = f1dim(w);
if (fw < fc)
bx = cx;
cx = w;
w = cx + GOLD * (cx - bx);
fb = fc;
fc = fw;
fw = f1dim(w);
else if ((w - wlim)*(wlim - cx) >= 0.0)
w = wlim;
fw = f1dim(w);
w = cx + GOLD * (cx - bx);
fw = f1dim(w);
ax = bx;
bx = cx;
cx = w;
fa = fb;
fb = fc;
fc = fw;
double MyPowell::f1dim(double x)
vector<double> xt;
for (int j = 0; j < ncom; j++)
xt.push_back(pcom[j] + x * xicom[j]);
return obj(xt);
double MyPowell::brent(double ax, double bx, double cx, double & xmin, double tol = 1.48e-8)
const double CGOLD = 0.3819660, ZEPS = 1.0e-4;
int itmax = 500;
double a = MIN(ax, cx);
double b = MAX(ax, cx);
double v = bx;
double w = v, x = v;
double deltax = 0.0;
double fx = f1dim(x);
double fv = fx;
double fw = fx;
double rat = 0, u = 0, fu;
int iter;
int done;
double dx_temp, xmid, tol1, tol2, tmp1, tmp2, p;
for (iter = 0; iter < 500; iter++)
xmid = 0.5 * (a + b);
tol1 = tol * fabs(x) + ZEPS;
tol2 = 2.0*tol1;
if (fabs(x - xmid) <= (tol2 - 0.5*(b - a)))
done = -1;
if (fabs(deltax) > tol1)
tmp1 = (x - w) * (fx - fv);
tmp2 = (x - v) * (fx - fw);
p = (x - v) * tmp2 - (x - w) * tmp1;
tmp2 = 2.0 * (tmp2 - tmp1);
if (tmp2 > 0.0)
p = -p;
tmp2 = fabs(tmp2);
dx_temp = deltax;
deltax = rat;
if ((p > tmp2 * (a - x)) && (p < tmp2 * (b - x)) &&
fabs(p) < fabs(0.5 * tmp2 * dx_temp))
rat = p / tmp2;
u = x + rat;
if ((u - a) < tol2 || (b - u) < tol2)
rat = fabs(tol1) * sign(xmid - x);
done = 0;
if (x >= xmid)
deltax = a - x;
deltax = b - x;
rat = CGOLD * deltax;
if (fabs(rat) >= tol1)
u = x + rat;
u = x + fabs(tol1) * sign(rat);
fu = f1dim(u);
if (fu > fx)
if (u < x)
a = u;
b = u;
if (fu <= fw || w == x)
v = w;
w = u;
fv = fw;
fw = fu;
else if (fu <= fv || v == x || v == w)
v = u;
fv = fu;
if (u >= x)
a = x;
b = x;
v = w;
w = x;
x = u;
fv = fw;
fw = fx;
fx = fu;
if(iter > itmax)
cout << "\n Brent exceed maximum iterations.\n\n";
xmin = x;
return fx;
vector<double> MyPowell::usePowell()
ftol = 1e-4;
vector<vector<double>> xi;
for (int i = 0; i < N; i++)
vector<double> xii;
for (int j = 0; j < N; j++)
double fret = 0;
powell(myparams, xi, ftol, fret);
//for (int i = 0; i < xi.size(); i++)
// double a = obj(xi[i]);
// if (fret > a)
// {
// fret = a;
// myparams = xi[i];
// }
cout << "final result" << fret << endl;
return myparams;
// Sets every element of the three vectors to zero; their sizes are kept.
void MyPowell::erase(vector<double>& pbar, vector<double>& prr, vector<double>& pr)
{
    for (double &value : pbar)
        value = 0;
    for (double &value : prr)
        value = 0;
    for (double &value : pr)
        value = 0;
}
I used PRAXIS library, because it doesn't need derivative information and is fast.
I modified the code a little to my needs and now it is faster than original version written in Python.

Cannot convert complex<double> to double

I'm trying to fix a program that I found so that it accepts different values than the ones it uses as a self-test. The program should be able to take an array of values that represent a mathematical function as a signal, and the output should be the Fast Fourier Transform of that signal. Here is what I have already changed in the code:
#include <complex>
#include <iostream>
#include <valarray>
#define fnc(x) (x)
const double PI = 3.141592653589793238460;
typedef std::valarray<double> CArray;
double d;
int i;
void fft(CArray& x)
const size_t N = x.size();
if (N <= 1) return;
// divide
CArray even = x[std::slice(0, N/2, 2)];
CArray odd = x[std::slice(1, N/2, 2)];
// conquer
// combine
for (size_t k = 0; k < N/2; ++k)
double t = std::polar(1.0, -2 * PI * k / N) * odd[k];
x[k ] = even[k] + t;
x[k+N/2] = even[k] - t;
//Complex f = 1.0 / sqrt(N);
//for (unsigned int i = 0; i < N; i++)
// x[i] *= f;
int main()
double test[num.i];
for(i.i=1; i.i < num.i;++i.i)
test[i.i] = (double)fnc(i.i);
CArray data(test, num.d);
// forward fft
std::cout << "fft" << std::endl;
for (i.i = 0; i.i < num.i; ++i.i)
std::cout << data[i.i] << std::endl;
return 0;
When I try to compile it, it shows me the following
error: cannot convert 'std::complex' to 'double' in initialization|
on the 34th line, on the line marked in this part:
for (size_t k = 0; k < N/2; ++k)
double t = std::polar(1.0, -2 * PI * k / N) * odd[k];
x[k ] = even[k] + t;
x[k+N/2] = even[k] - t;
specifically this one:
double t = std::polar(1.0, -2 * PI * k / N) * odd[k];
If anyone could tell me how to fix it I would be very thankful.
For reference, this is the original code, in case anyone can tell me a better way to change it so that it does what I want.
#include <complex>
#include <iostream>
#include <valarray>
const double PI = 3.141592653589793238460;
typedef std::complex<double> Complex;
typedef std::valarray<Complex> CArray;
// Cooley–Tukey FFT (in-place, divide-and-conquer)
// Higher memory requirements and redundancy although more intuitive
void fft(CArray& x)
const size_t N = x.size();
if (N <= 1) return;
// divide
CArray even = x[std::slice(0, N/2, 2)];
CArray odd = x[std::slice(1, N/2, 2)];
// conquer
// combine
for (size_t k = 0; k < N/2; ++k)
Complex t = std::polar(1.0, -2 * PI * k / N) * odd[k];
x[k ] = even[k] + t;
x[k+N/2] = even[k] - t;
// Cooley-Tukey FFT (in-place, breadth-first, decimation-in-frequency)
// Better optimized but less intuitive
// !!! Warning : in some cases this code make result different from not optimased version above (need to fix bug)
// The bug is now fixed #2017/05/30
// In-place forward FFT, breadth-first decimation-in-frequency variant.
// The butterfly passes halve the span k each round and the result is then
// put into natural order by reversing the low log2(N) bits of every index,
// so x.size() is expected to be a power of two. Arrays with fewer than two
// elements are returned unchanged (the bit-reversal shift below would be a
// shift by the full word width for N == 1, and N == 0 would divide by zero).
// The output is not normalized.
void fft(CArray &x)
{
    const unsigned int N = static_cast<unsigned int>(x.size());
    if (N <= 1)
        return;

    // DFT
    unsigned int k = N, n;
    double thetaT = 3.14159265358979323846264338328L / N;
    Complex phiT = Complex(cos(thetaT), -sin(thetaT)), T;
    while (k > 1)
    {
        n = k;
        k >>= 1;
        phiT = phiT * phiT;
        T = 1.0L;
        for (unsigned int l = 0; l < k; l++)
        {
            for (unsigned int a = l; a < N; a += n)
            {
                unsigned int b = a + k;
                Complex t = x[a] - x[b];
                x[a] += x[b];
                x[b] = t * T;
            }
            T *= phiT;
        }
    }
    // Decimate
    // Number of index bits, counted with integers so no floating-point
    // rounding is involved.
    unsigned int m = 0;
    while ((1u << m) < N)
        ++m;
    for (unsigned int a = 0; a < N; a++)
    {
        unsigned int b = a;
        // Reverse bits
        b = (((b & 0xaaaaaaaa) >> 1) | ((b & 0x55555555) << 1));
        b = (((b & 0xcccccccc) >> 2) | ((b & 0x33333333) << 2));
        b = (((b & 0xf0f0f0f0) >> 4) | ((b & 0x0f0f0f0f) << 4));
        b = (((b & 0xff00ff00) >> 8) | ((b & 0x00ff00ff) << 8));
        b = ((b >> 16) | (b << 16)) >> (32 - m);
        if (b > a)
        {
            Complex t = x[a];
            x[a] = x[b];
            x[b] = t;
        }
    }
    //// Normalize (This section make it not working correctly)
    //Complex f = 1.0 / sqrt(N);
    //for (unsigned int i = 0; i < N; i++)
    //	x[i] *= f;
}
// inverse fft (in-place)
void ifft(CArray& x)
// conjugate the complex numbers
x = x.apply(std::conj);
// forward fft
fft( x );
// conjugate the complex numbers again
x = x.apply(std::conj);
// scale the numbers
x /= x.size();
int main()
const Complex test[] = { 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0 };
CArray data(test, 8);
// forward fft
std::cout << "fft" << std::endl;
for (int i = 0; i < 8; ++i)
std::cout << data[i] << std::endl;
// inverse fft
std::cout << std::endl << "ifft" << std::endl;
for (int i = 0; i < 8; ++i)
std::cout << data[i] << std::endl;
return 0;
Ps. If anyone knows a better code for what I need I could also use it.
std::complex<double> and double are incompatible types.
Change this:
double t = std::polar(1.0, -2 * PI * k / N) * odd[k];
to this:
std::complex<double> t = std::polar(1.0, -2 * PI * k / N) * odd[k];
since std::polar returns:
The complex cartesian equivalent to the polar format formed by rho and theta.
The error message is pretty explicit: std::polar returns a std::complex, not a double. Looking at the rest of the code, maybe just change the type of t?

Is it possible to use CUDA parallelizing this nested for loop?

I want to speed up this nested for loop. I have just started learning CUDA; how can I use CUDA to parallelize this C++ code?
#define PI 3.14159265
using namespace std;
int main()
int nbint = 2;
int hits = 20;
int nbinp = 2;
float _theta, _phi, _l, _m, _n, _k = 0, delta = 5;
float x[20],y[20],z[20],a[20],t[20];
for (int i = 0; i < hits; ++i)
x[i] = rand() / (float)(RAND_MAX / 100);
for (int i = 0; i < hits; ++i)
y[i] = rand() / (float)(RAND_MAX / 100);
for (int i = 0; i < hits; ++i)
z[i] = rand() / (float)(RAND_MAX / 100);
for (int i = 0; i < hits; ++i)
a[i] = rand() / (float)(RAND_MAX / 100);
float maxforall = 1e-6;
float theta0;
float phi0;
for (int i = 0; i < nbint; i++)
_theta = (0.5 + i)*delta;
for (int j = 0; j < nbinp; j++)
_phi = (0.5 + j)*delta / _theta;
_l = sin(_theta* PI / 180.0)*cos(_phi* PI / 180.0);
_m = sin(_theta* PI / 180.0)*sin(_phi* PI / 180.0);
_n = cos(_theta* PI / 180.0);
for (int k = 0; k < hits; k++)
_k = -(_l*x[k] + _m*y[k] + _n*z[k]);
t[k] = a[k] - _k;
qsort(t, 0, hits - 1);
float max = t[0];
for (int k = 0; k < hits; k++)
if (max < t[k])
max = t[k];
if (max > maxforall)
maxforall = max;
return 0;
I want to parallelize the innermost for loop and the sort (maybe the whole nested loop). After sorting those arrays I find the maximum over all arrays. I use the maximum only to simplify the code. The reason I need the sort is that what the maximum represents
here is continuous time information (all arrays contain time information). The sort orders those times from lowest to highest. Then I compare a specific time interval (not a single value). The comparison is almost like choosing the maximum, but over a continuous interval rather than a single value.
Your 3 nested loops calculate nbint*nbinp*hits values. Since each of those values is independent from each other, all values can be calculated in parallel.
You stated in your comments that you have a commutative and associative "filter condition" which reduces the output to a single scalar value. This can be exploited to avoid sorting and storing the temporary values. Instead, we can calculate the values on-the-fly and then apply a parallel reduction to determine the end result.
This can be done in "raw" CUDA, below I implemented this idea using thrust. The main idea is to run grid_op nbint*nbinp*hits times in parallel. In order to find out the three original "loop indices" from the single scalar index which is passed to grid_op the algorithm from this SO question is used.
thrust::transform_reduce performs the on-the-fly transformation and the subsequent parallel reduction (here thrust::maximum is used as a substitute).
#include <cmath>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/transform_reduce.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tuple.h>
// ### BEGIN utility for demo ####
#include <iostream>
#include <thrust/random.h>
thrust::host_vector<float> random_vector(const size_t N)
thrust::default_random_engine rng;
thrust::uniform_real_distribution<float> u01(0.0f, 1.0f);
thrust::host_vector<float> temp(N);
for(size_t i = 0; i < N; i++) {
temp[i] = u01(rng);
return temp;
// ### END utility for demo ####
template <typename... Iterators>
thrust::zip_iterator<thrust::tuple<Iterators...>> zip(Iterators... its)
return thrust::make_zip_iterator(thrust::make_tuple(its...));
template <typename ZipIterator>
class grid_op
grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2) : zipIt(zipIt), dim1(dim1), dim2(dim2){}
__host__ __device__
float operator()(std::size_t index) const
const auto coords = unflatten_3d_index(index, dim1, dim2);
const auto values = zipIt[thrust::get<2>(coords)];
const float delta = 5;
const float _theta = (0.5f + thrust::get<0>(coords))*delta;
const float _phi = (0.5f + thrust::get<1>(coords))*delta / _theta;
const float _l = sin(_theta* M_PI / 180.0)*cos(_phi* M_PI / 180.0);
const float _m = sin(_theta* M_PI / 180.0)*sin(_phi* M_PI / 180.0);
const float _n = cos(_theta* M_PI / 180.0);
const float _k = -(_l*thrust::get<0>(values) + _m*thrust::get<1>(values) + _n*thrust::get<2>(values));
return (thrust::get<3>(values) - _k);
__host__ __device__
thrust::tuple<std::size_t, std::size_t, std::size_t>
unflatten_3d_index(std::size_t index, std::size_t dim1, std::size_t dim2) const
// taken from
std::size_t x = index % dim1;
std::size_t y = ( ( index - x ) / dim1 ) % dim2;
std::size_t z = ( ( index - y * dim1 - x ) / (dim1 * dim2) );
return thrust::make_tuple(x,y,z);
ZipIterator zipIt;
std::size_t dim1;
std::size_t dim2;
template <typename ZipIterator>
grid_op<ZipIterator> make_grid_op(ZipIterator zipIt, std::size_t dim1, std::size_t dim2)
return grid_op<ZipIterator>(zipIt, dim1, dim2);
int main()
const int nbint = 3;
const int nbinp = 4;
const int hits = 20;
const std::size_t N = nbint * nbinp * hits;
thrust::device_vector<float> d_x = random_vector(hits);
thrust::device_vector<float> d_y = random_vector(hits);
thrust::device_vector<float> d_z = random_vector(hits);
thrust::device_vector<float> d_a = random_vector(hits);
auto zipIt = zip(d_x.begin(), d_y.begin(), d_z.begin(), d_a.begin());
auto countingIt = thrust::counting_iterator<std::size_t>(0);
auto unary_op = make_grid_op(zipIt, nbint, nbinp);
auto binary_op = thrust::maximum<float>();
const float init = 0;
float max = thrust::transform_reduce(
countingIt, countingIt+N,
std::cout << "max = " << max << std::endl;

C++ while loop using bisection method. Help on break

I need some help here. Please excuse the complexity of the code. Basically, I am looking to use the bisection method to find a value "Theta" at each i increment.
I know that all the calculations work fine when I know the Theta, and I have the code run to just simply calculate all the values, but when I introduce a while loop and the bisection method to have the code approximate Theta, I can't seem to get it to run correctly. I am assuming I have my while loop set up incorrectly....
#include <math.h>
#include <iostream>
#include <vector>
#include <iomanip>
#include <algorithm> // std::max
using namespace std;
// Returns log(max(r + (theta + F - 0.5 * G * Gprime) * d_t, eps)) / sig.
// The argument of the logarithm is floored at eps = 0.0001 so that a
// non-positive value never reaches log().
double FuncM(double theta, double r, double F, double G, double Gprime, double d_t, double sig)
{
    const double eps = 0.0001;
    const double drifted = r + (theta + F - 0.5 * G * Gprime) * d_t;
    return log(std::max(drifted, eps)) / sig;
}
// Returns (m - x_0) / d_x rounded to the nearest integer (halves round up),
// as a double.
// floor(v + 0.5) is used instead of int(v + 0.5): the integer cast truncates
// toward zero and therefore rounds negative values to the wrong neighbour
// (e.g. -1.4 became 0 instead of -1).
double FuncJSTAR(double m, double x_0, double d_x)
{
    return floor((m - x_0) / d_x + 0.5);
}
// Returns m - x_0 - j * d_x, i.e. the offset of m from the point
// x_0 + j * d_x.
double FuncCN(double m, double x_0, double j, double d_x)
{
    return m - x_0 - j * d_x;
}
// Returns (d_t + cn^2) / (2 * d_x^2) + cn / (2 * d_x).
double FuncPup(double d_t, double cn, double d_x)
{
    const double spread = (d_t + cn * cn) / (2.0 * d_x * d_x);
    return spread + cn / (2.0 * d_x);
}
// Returns (d_t + cn^2) / (2 * d_x^2) - cn / (2 * d_x).
double FuncPdn(double d_t, double cn, double d_x)
{
    const double spread = (d_t + cn * cn) / (2.0 * d_x * d_x);
    return spread - cn / (2.0 * d_x);
}
// Returns 1 - pu - pd, the remainder left after the two given values.
double FuncPmd(double pd, double pu)
{
    return 1 - pu - pd;
}
int main()
const int Maturities = 5;
const double EPS = 0.00001;
double TermStructure[Maturities][2] = {
{0.5 , 0.05},
{1.0 , 0.06},
{1.5 , 0.07},
{2.0 , 0.075},
{3.0 , 0.085} };
vector<double> Price(Maturities);
double Initial_Price = 1.00;
for (int i = 0; i < Maturities; i++)
Price[i] = Initial_Price * exp(-TermStructure[i][1] * TermStructure[i][0]);
int j_max = 8;
int j_range = ((j_max * 2) + 1);
// Set up vector of possible j values
vector<int> j_value(j_range);
for (int j = 0; j < j_range; j++)
j_value[j] = j_max - j;
double dt = 0.5;
double dx = sqrt(3 * dt);
double sigma = 0.15;
double mean_reversion = 0.2; // "a" value
double r0 = TermStructure[0][1]; // Initialise r(0) in case no corresponding dt rate in term structure
double x0 = log(r0) / sigma;
vector<double> r_j(j_range); // rate at each j
vector<double> F_r(j_range);
vector<double> G_r(j_range);
vector<double> G_prime_r(j_range);
for(int j = 0; j < j_range; j++)
if (j == j_max)
r_j[j] = r0;
r_j[j] = exp((x0 + j_value[j]*dx) * sigma);
F_r[j] = -mean_reversion * r_j[j];
G_r[j] = sigma * r_j[j];
G_prime_r[j] = sigma;
vector<vector<double>> m((j_range), vector<double>(Maturities));
vector<vector<int>> j_star((j_range), vector<int>(Maturities));
vector<vector<double>> Central_Node((j_range), vector<double>(Maturities));
vector<double> Theta(Maturities - 1);
vector<vector<double>> Pu((j_range), vector<double>(Maturities));
vector<vector<double>> Pd((j_range), vector<double>(Maturities));
vector<vector<double>> Pm((j_range), vector<double>(Maturities));
vector<vector<double>> Q((j_range), vector<double>(Maturities));// = {}; // Arrow Debreu Price. Initialised all array values to 0
vector<double> Q_dt_sum(Maturities);// = {}; // Sum of Arrow Debreu Price at each time step. Initialised all array values to 0
double Theta_A, Theta_B, Theta_C;
int JEND;
int TempStart;
int TempEnd;
int max;
int min;
vector<vector<int>> Up((j_range), vector<int>(Maturities));
vector<vector<int>> Down((j_range), vector<int>(Maturities));
// Theta[0] = 0.0498039349327417;
// Theta[1] = 0.0538710670441647;
// Theta[2] = 0.0181648634139392;
// Theta[3] = 0.0381183886467521;
for(int i = 0; i < (Maturities-1); i++)
Theta_A = 0.00;
Theta_B = TermStructure[i][1];
Q_dt_sum[0] = Initial_Price;
Q_dt_sum[i+1] = 0.0;
while (fabs(Theta_A - Theta_B) >= 0.0000001)
max = 1;
min = 10;
if (i == 0)
JSTART = j_max;
JEND = j_max;
JSTART = TempStart;
JEND = TempEnd;
for(int j = JSTART; j >= JEND; j--)
Theta_C = (Theta_A + Theta_B) / 2.0; // If Theta C is too low, the associated Price will be higher than Price from initial term structure. (ie P(Theta C) > P(i+2) for Theta C < Theta)
// If P_C > P(i+2), set Theta_B = Theta_C, else if P_C < P(i+2), set Theta_A = Theta_C, Else if P_C = P(i+2), Theta_C = Theta[i]
//cout << Theta_A << " " << Theta_B << " " << Theta_C << endl;
m[j][i] = FuncM(Theta[i], r_j[j], F_r[j], G_r[j], G_prime_r[j], dt, sigma);
j_star[j][i] = FuncJSTAR(m[j][i], x0, dx);
Central_Node[j][i] = FuncCN(m[j][i], x0, j_star[j][i], dx);
Pu[j][i] = FuncPup(dt, Central_Node[j][i], dx);
Pd[j][i] = FuncPdn(dt, Central_Node[j][i], dx);
Pm[j][i] = FuncPmd(Pd[j][i], Pu[j][i]);
for (int p = 0; p < j_range; p++)
Q[p][i] = 0; // Clear Q array
Q[j_max][0] = Initial_Price;
Q[j_max -(j_star[j][i]+1)][i+1] = Q[j_max - (j_star[j][i]+1)][i+1] + Q[j][i] * Pu[j][i] * exp(-r_j[j] * dt);
Q[j_max -(j_star[j][i] )][i+1] = Q[j_max - (j_star[j][i] )][i+1] + Q[j][i] * Pm[j][i] * exp(-r_j[j] * dt);
Q[j_max -(j_star[j][i]-1)][i+1] = Q[j_max - (j_star[j][i]-1)][i+1] + Q[j][i] * Pd[j][i] * exp(-r_j[j] * dt);
for (int j = 0; j < j_range; j++)
Up[j][i] = j_star[j][i] + 1;
Down[j][i] = j_star[j][i] - 1;
if (Up[j][i] > max)
max = Up[j][i];
if ((Down[j][i] < min) && (Down[j][i] > 0))
min = Down[j][i];
TempEnd = j_max - (max);
TempStart = j_max - (min);
for (int j = 0; j < j_range; j++)
Q_dt_sum[i+1] = Q_dt_sum[i+1] + Q[j][i] * exp(-r_j[j] * dt);
cout << Q_dt_sum[i+1] << endl;
if (Q_dt_sum[i+1] == Price[i+2])
Theta[i] = Theta_C;
if (Q_dt_sum[i+1] > Price[i+2])
Theta_B = Theta_C;
else if (Q_dt_sum[i+1] < Price[i+2])
Theta_A = Theta_C;
cout << Theta[i] << endl;
return 0;
Ok, my bad. I had a value being called incorrectly.
All good.

opencv: Rigid Transformation between two 3D point clouds

I have two 3D point clouds, and I'd like to use opencv to find the rigid transformation matrix (translation, rotation, constant scaling among all 3 axes).
I've found an estimateRigidTransformation function, but it's only for 2D points apparently
In addition, I've found estimateAffine3D, but it doesn't seem to support rigid transformation mode.
Do I need to just write my own rigid transformation function?
I did not find the required functionality in OpenCV so I have written my own implementation. Based on ideas from OpenSFM.
/* Returns the average of the points stored as the rows of `points`
 * (cv::reduce along dimension 0 with CV_REDUCE_AVG; the single remaining
 * element at (0, 0) is returned).
 */
cv::Vec3d
CalculateMean(const cv::Mat_<cv::Vec3d> &points)
{
    cv::Mat_<cv::Vec3d> result;
    cv::reduce(points, result, 0, CV_REDUCE_AVG);
    return result(0, 0);
}
/* Estimates the similarity transform (rotation, uniform scale, translation)
 * that maps points1 onto points2, where row i of both matrices holds a pair
 * of corresponding points.
 *
 * Steps: both clouds are centred on their centroids, the 3x3 covariance of
 * the centred pairs is decomposed by SVD to obtain the rotation, the scale
 * is the ratio of the RMS deviations from the centroids, and the result is
 * composed as T2 * (scale * R) * T1.
 *
 * Returns the upper 3x4 part of the homogeneous 4x4 transform.
 * Both inputs must have the same number of rows.
 */
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3d> &points1, const cv::Mat_<cv::Vec3d> points2)
{
    CV_Assert(points1.rows == points2.rows);

    /* Calculate centroids. */
    cv::Vec3d t1 = -CalculateMean(points1);
    cv::Vec3d t2 = -CalculateMean(points2);

    /* T1 moves the centroid of points1 to the origin. */
    cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
    T1(0, 3) = t1[0];
    T1(1, 3) = t1[1];
    T1(2, 3) = t1[2];

    /* T2 moves the origin to the centroid of points2. */
    cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
    T2(0, 3) = -t2[0];
    T2(1, 3) = -t2[1];
    T2(2, 3) = -t2[2];

    /* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
     * which is used for scale calculation.
     */
    cv::Mat_<double> C(3, 3, 0.0);
    double p1Rms = 0, p2Rms = 0;
    for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
        cv::Vec3d p1 = points1(ptIdx, 0) + t1;
        cv::Vec3d p2 = points2(ptIdx, 0) + t2;
        /* Sums of squared distances; only their ratio is used below. */
        p1Rms += p1.dot(p1);
        p2Rms += p2.dot(p2);
        for (int i = 0; i < 3; i++) {
            for (int j = 0; j < 3; j++) {
                C(i, j) += p2[i] * p1[j];
            }
        }
    }

    cv::Mat_<double> u, s, vh;
    cv::SVD::compute(C, s, u, vh);

    cv::Mat_<double> R = u * vh;

    /* A negative determinant means a reflection; flip the sign of the last
     * singular direction to obtain a proper rotation.
     */
    if (cv::determinant(R) < 0) {
        R -= u.col(2) * (vh.row(2) * 2.0);
    }

    /* If all points of the first cloud coincide the scale is undefined;
     * fall back to 1 instead of dividing by zero.
     */
    double scale = (p1Rms > 0) ? sqrt(p2Rms / p1Rms) : 1.0;
    R *= scale;

    cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
    R.copyTo(M.colRange(0, 3).rowRange(0, 3));

    cv::Mat_<double> result = T2 * M * T1;
    result /= result(3, 3);

    return result.rowRange(0, 3);
}
I've found PCL to be a nice adjunct to OpenCV. Take a look at their Iterative Closest Point (ICP) example. The provided example registers the two point clouds and then displays the rigid transformation.
Here's my rmsd code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
/* 4x4 single-precision matrix, addressed as m[row][column]. */
typedef struct
{
  float m[4][4];
} MATRIX;
/* Squared Euclidean distance between the 3-component vectors a and b.
   Each argument is expanded six times, so pass expressions without
   side effects. */
#define vdiff2(a,b) \
  ( ((a)[0]-(b)[0]) * ((a)[0]-(b)[0]) \
  + ((a)[1]-(b)[1]) * ((a)[1]-(b)[1]) \
  + ((a)[2]-(b)[2]) * ((a)[2]-(b)[2]) )
static double alignedrmsd(float *v1, float *v2, int N);
static void centroid(float *ret, float *v, int N);
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx);
static void crossproduct(float *ans, float *pt1, float *pt2);
static void mtx_root(MATRIX *mtx);
static int almostequal(MATRIX *a, MATRIX *b);
static void mulpt(MATRIX *mtx, float *pt);
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y);
static void mtx_identity(MATRIX *mtx);
static void mtx_trans(MATRIX *mtx, float x, float y, float z);
static int mtx_invert(float *mtx, int N);
static float absmaxv(float *v, int N);
calculate rmsd between two structures
Params: v1 - first set of points
v2 - second set of points
N - number of points
mtx - return for transfrom matrix used to align structures
Returns: rmsd score
Notes: mtx can be null. Transform will be rigid. Inputs must
be previously aligned for sequence alignment
double rmsd(float *v1, float *v2, int N, float *mtx)
float cent1[3];
float cent2[3];
MATRIX tmtx;
MATRIX tempmtx;
MATRIX move1;
MATRIX move2;
int i;
double answer;
float *temp1 = 0;
float *temp2 = 0;
int err;
assert(N > 3);
temp1 = malloc(N * 3 * sizeof(float));
temp2 = malloc(N * 3 * sizeof(float));
if(!temp1 || !temp2)
goto error_exit;
centroid(cent1, v1, N);
centroid(cent2, v2, N);
temp1[i*3+0] = v1[i*3+0] - cent1[0];
temp1[i*3+1] = v1[i*3+1] - cent1[1];
temp1[i*3+2] = v1[i*3+2] - cent1[2];
temp2[i*3+0] = v2[i*3+0] - cent2[0];
temp2[i*3+1] = v2[i*3+1] - cent2[1];
temp2[i*3+2] = v2[i*3+2] - cent2[2];
err = getalignmtx(temp1, temp2, N, &tmtx);
if(err == -1)
goto error_exit;
mtx_trans(&move1, -cent2[0], -cent2[1], -cent2[2]);
mtx_mul(&tempmtx, &move1, &tmtx);
mtx_trans(&move2, cent1[0], cent1[1], cent1[2]);
mtx_mul(&tmtx, &tempmtx, &move2);
memcpy(temp2, v2, N * sizeof(float) * 3);
mulpt(&tmtx, temp2 + i * 3);
answer = alignedrmsd(v1, temp2, N);
memcpy(mtx, &tmtx.m, 16 * sizeof(float));
return answer;
mtx[i] = 0;
return sqrt(-1.0);
/*
  calculate rmsd between two aligned structures (trivial)
  Params: v1 - first structure (x, y, z triples)
          v2 - second structure (x, y, z triples)
          N - number of points
  Returns: rmsd, i.e. the square root of the mean squared distance
           between corresponding points
 */
static double alignedrmsd(float *v1, float *v2, int N)
{
  double answer = 0;
  int i;

  for(i=0;i<N;i++)
    answer += vdiff2(v1 + i * 3, v2 + i * 3);
  return sqrt(answer/N);
}
/*
  compute the centroid
  Params: ret - return for the centroid (3 floats)
          v - the points (x, y, z triples)
          N - number of points
 */
static void centroid(float *ret, float *v, int N)
{
  int i;

  ret[0] = 0;
  ret[1] = 0;
  ret[2] = 0;
  for(i=0;i<N;i++)
  {
    ret[0] += v[i*3+0];
    ret[1] += v[i*3+1];
    ret[2] += v[i*3+2];
  }
  ret[0] /= N;
  ret[1] /= N;
  ret[2] /= N;
}
get the matrix needed to align two structures
Params: v1 - reference structure
v2 - structure to align
N - number of points
mtx - return for rigid body alignment matrix
Notes: only calculates rotation part of matrix.
assumes input has been aligned to centroids
/* Builds the alignment matrix for v2 against v1 and stores it in *mtx.
   Returns 0 on success and -1 when the loop ends with flag != 5.
   NOTE(review): this listing is incomplete - At and Ainv are used but never
   declared, no loop headers over i, j and k are visible, flag is never
   incremented, and mtx_root (declared in this file) is not called here.
   Restore those from the original source before relying on this code. */
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx)
/* all zero except m[3][3] = 1 */
MATRIX A = { {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,1}} };
MATRIX temp;
float tv[3];
float tw[3];
float tv2[3];
float tw2[3];
int k, i, j;
int flag = 0;
float correction;
/* product of the largest absolute coordinates; used below to scale the
   products added into A */
correction = absmaxv(v1, N * 3) * absmaxv(v2, N * 3);
/* adds the scaled products of v1 and v2 coordinates into A
   (presumably for i, j in 0..2 and k over all N points - TODO confirm) */
A.m[i][j] += (v1[k*3+i] * v2[k*3+j])/correction;
while(flag < 3)
/* At is the transpose of A */
At.m[i][j] = A.m[j][i];
/* mtx_invert works in place, so invert a copy */
memcpy(&Ainv, &A, sizeof(MATRIX));
/* this will happen if all points are in a plane */
if( mtx_invert((float *) &Ainv, 4) == -1)
if(flag == 0)
/* cross products of the first two points of each structure */
crossproduct(tv, v1, v1+3);
crossproduct(tw, v2, v2+3);
/* NOTE(review): presumably the else branch for later attempts - confirm */
crossproduct(tv2, tv, v1);
crossproduct(tw2, tw, v2);
memcpy(tv, tv2, 3 * sizeof(float));
memcpy(tw, tw2, 3 * sizeof(float));
/* add the products of the two helper vectors into A so that the next
   inversion attempt sees a different matrix */
A.m[i][j] += tv[i] * tw[j];
/* 5 marks a successful inversion and ends the while loop */
flag = 5;
if(flag != 5)
return -1;
/* *mtx = At * A * Ainv, built in two steps through temp */
mtx_mul(&temp, &At, &A);
mtx_mul(mtx, &temp, &Ainv);
return 0;
/*
  get the crossproduct of two vectors.
  Params: ans - return pointer for answer.
          pt1 - first vector
          pt2 - second vector.
  Notes: crossproduct is at right angles to the two vectors.
         ans must not point at pt1 or pt2, because the components of
         ans are written while the inputs are still being read.
 */
static void crossproduct(float *ans, float *pt1, float *pt2)
{
  ans[0] = pt1[1] * pt2[2] - pt1[2] * pt2[1];
  /* the middle component is z*x - x*z; the opposite order flips its sign
     and the result is no longer perpendicular to the inputs */
  ans[1] = pt1[2] * pt2[0] - pt1[0] * pt2[2];
  ans[2] = pt1[0] * pt2[1] - pt1[1] * pt2[0];
}
Denman-Beavers square root iteration
/* Replaces *mtx with an iteratively computed square root of itself.
   NOTE(review): this listing is incomplete - Z, Y1, Z1, Y2, invY and invZ
   are used but never declared, the loop opening that matches the trailing
   while(...) is missing, there are no loop headers over i and ii, and the
   two mtx_invert failure checks have no visible body. */
static void mtx_root(MATRIX *mtx)
/* Y starts as the input matrix (Z presumably starts as identity - TODO confirm) */
MATRIX Y = *mtx;
int iter = 0;
int i, ii;
/* mtx_invert works in place, so invert copies of Y and Z */
invY = Y;
invZ = Z;
if( mtx_invert((float *) &invY, 4) == -1)
if( mtx_invert((float *) &invZ, 4) == -1)
/* each new iterate is the average of one matrix and the inverse of the other */
Y1.m[i][ii] = 0.5 * (Y.m[i][ii] + invZ.m[i][ii]);
Z1.m[i][ii] = 0.5 * (Z.m[i][ii] + invY.m[i][ii]);
Y = Y1;
Z = Z1;
/* Y2 = Y * Y, compared against the input to test convergence */
mtx_mul(&Y2, &Y, &Y);
/* stop once Y * Y is close enough to the input, or when the iter++ < 20
   limit is reached */
while(!almostequal(&Y2, mtx) && iter++ < 20 );
*mtx = Y;
Check two matrices for near-enough equality
Params: a - first matrix
b - second matrix
Returns: 1 if almost equal, else 0, epsilon 0.0001f.
static int almostequal(MATRIX *a, MATRIX *b)
int i, ii;
float epsilon = 0.001f;
if(fabs(a->m[i][ii] - b->m[i][ii]) > epsilon)
return 0;
return 1;
multiply a point by a matrix.
Params: mtx - matrix
pt - the point (transformed)
static void mulpt(MATRIX *mtx, float *pt)
float ans[4] = {0};
int i;
int ii;
ans[i] += pt[ii] * mtx->m[ii][i];
ans[i] += mtx->m[3][i];
pt[0] = ans[0];
pt[1] = ans[1];
pt[2] = ans[2];
multiply two matrices.
Params: ans - return pointer for answer.
x - first matrix
y - second matrix.
Notes: ans may not be equal to x or y.
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y)
int i;
int ii;
int iii;
ans->m[i][ii] = 0;
ans->m[i][ii] += x->m[i][iii] * y->m[iii][ii];
create an identity matrix.
Params: mtx - return pointer.
static void mtx_identity(MATRIX *mtx)
int i;
int ii;
mtx->m[i][ii] = 1.0f;
mtx->m[i][ii] = 0;
create a translation matrix.
Params: mtx - return pointer for matrix.
x - x translation.
y - y translation.
z - z translation
static void mtx_trans(MATRIX *mtx, float x, float y, float z)
mtx->m[0][0] = 1;
mtx->m[0][1] = 0;
mtx->m[0][2] = 0;
mtx->m[0][3] = 0;
mtx->m[1][0] = 0;
mtx->m[1][1] = 1;
mtx->m[1][2] = 0;
mtx->m[1][3] = 0;
mtx->m[2][0] = 0;
mtx->m[2][1] = 0;
mtx->m[2][2] = 1;
mtx->m[2][3] = 0;
mtx->m[3][0] = x;
mtx->m[3][1] = y;
mtx->m[3][2] = z;
mtx->m[3][3] = 1;
matrix invert routine
Params: mtx - the matrix in raw format, in/out
N - width and height
Returns: 0 on success, -1 on fail
/* Inverts the N x N matrix stored row by row in mtx (element (r, c) is
   mtx[r*N+c]), overwriting it with the inverse.
   Returns 0 on success and -1 if a zero pivot is found.
   NOTE(review): looks like Gauss-Jordan elimination with full pivoting, but
   this listing is incomplete - the loop headers over i, j, k, l and ll, the
   ipiv[icol] update and the error_exit label are not visible. Confirm
   against the original source. */
static int mtx_invert(float *mtx, int N)
int indxc[100]; /* these 100s are the only restriction on matrix size */
int indxr[100];
int ipiv[100];
int i, j, k;
int irow, icol;
double big;
double pinv;
int l, ll;
double dum;
double temp;
/* the bookkeeping arrays above hold at most 100 entries */
assert(N <= 100);
/* clear the pivot flags */
ipiv[i] = 0;
big = 0.0;
/* find biggest element */
if(ipiv[j] != 1)
if(ipiv[k] == 0)
if(fabs(mtx[j*N+k]) >= big)
big = fabs(mtx[j*N+k]);
irow = j;
icol = k;
/* swap rows irow and icol so that the pivot lands on the diagonal */
if(irow != icol)
temp = mtx[irow * N + l];
mtx[irow * N + l] = mtx[icol * N + l];
mtx[icol * N + l] = temp;
/* remember the swap; it is undone in the unscramble step below */
indxr[i] = irow;
indxc[i] = icol;
/* if biggest element is zero matrix is singular, bail */
if(mtx[icol* N + icol] == 0)
goto error_exit;
/* scale the pivot row by 1/pivot */
pinv = 1.0/mtx[icol * N + icol];
mtx[icol * N + icol] = 1.0;
mtx[icol * N + l] *= pinv;
/* subtract the pivot row from every other row ll */
if(ll != icol)
dum = mtx[ll * N + icol];
mtx[ll * N + icol] = 0.0;
mtx[ll * N + l] -= mtx[icol * N + l]*dum;
/* unscramble matrix */
for (l=N-1;l>=0;l--)
if (indxr[l] != indxc[l])
for (k=0;k<N;k++)
temp = mtx[k * N + indxr[l]];
mtx[k * N + indxr[l]] = mtx[k * N + indxc[l]];
mtx[k * N + indxc[l]] = temp;
return 0;
/* NOTE(review): presumably reached through the error_exit label - confirm */
return -1;
/*
  get the absolute maximum of an array
  Params: v - the values
          N - number of values
  Returns: the largest absolute value in v, 0 if N is 0 or less
 */
static float absmaxv(float *v, int N)
{
  /* must start at 0: an uninitialised start value makes the result
     undefined */
  float answer = 0;
  int i;

  for(i=0;i<N;i++)
    if(answer < fabs(v[i]))
      answer = (float) fabs(v[i]);
  return answer;
}
#include <stdio.h>
debug utility
/* Writes the elements of *mtx to fp as "%f, " items, followed by newlines.
   NOTE(review): the loop headers over i and ii are not visible in this
   listing; presumably one output line per matrix row - confirm. */
static void printmtx(FILE *fp, MATRIX *mtx)
int i, ii;
fprintf(fp, "%f, ", mtx->m[i][ii]);
fprintf(fp, "\n");
/* Small driver: aligns two hard-coded sets of 4 points, prints the rmsd and
   the alignment matrix, then prints the points of "two" after transforming
   them with that matrix.
   NOTE(review): the declaration of mtx and the loop header over i are not
   visible in this listing. */
int rmsdmain(void)
/* four points each, stored as x, y, z triples */
float one[4*3] = {0,0,0, 1,0,0, 2,1,0, 0,3,1};
float two[4*3] = {0,0,0, 0,1,0, 1,2,0, 3,0,1};
double diff;
int i;
diff = rmsd(one, two, 4, (float *) &mtx.m);
printf("%f\n", diff);
printmtx(stdout, &mtx);
/* transform each point of two in place and print it */
mulpt(&mtx, two + i * 3);
printf("%f %f %f\n", two[i*3], two[i*3+1], two[i*3+2]);
return 0;
I took @vagran's implementation and added RANSAC on top of it, since OpenCV's 2D estimateRigidTransform does the same and it was helpful for me because my data is noisy. (Note: this code does not estimate the uniform scale factor; you can easily add it back by comparing with vagran's version.)
/**
 * Compute the mean of a column of 3D points.
 *
 * The sums are accumulated in double precision and the result is converted
 * back to float.
 *
 * @param points  N x 1 matrix of points (the width is asserted to be 1).
 * @return the mean point, or (0, 0, 0) when there are no points.
 */
cv::Vec3f CalculateMean(const cv::Mat_<cv::Vec3f> &points)
{
    const int n_points = points.size().height;
    if (n_points == 0) {
        return cv::Vec3f(0, 0, 0);
    }
    assert(points.size().width == 1);

    double mx = 0.0;
    double my = 0.0;
    double mz = 0.0;
    for (int i = 0; i < n_points; i++) {
        const cv::Vec3f &p = points(i);
        mx += double(p[0]);
        my += double(p[1]);
        mz += double(p[2]);
    }
    return cv::Vec3f(static_cast<float>(mx / n_points),
                     static_cast<float>(my / n_points),
                     static_cast<float>(mz / n_points));
}
/**
 * Estimate the rigid transform (rotation and translation, no scale) that maps
 * points1 onto points2.
 *
 * Both sets are centred on their means and the rotation is taken from the SVD
 * of the 3x3 cross-covariance of the centred points.
 *
 * @param points1  source points, read as points1(i).
 * @param points2  destination points, read as points2(i); row i corresponds
 *                 to row i of points1, so it needs at least points1.rows rows.
 * @return the 4x4 homogeneous transform T2 * M * T1.
 */
cv::Mat_<double> FindRigidTransform(const cv::Mat_<cv::Vec3f> &points1,
                                    const cv::Mat_<cv::Vec3f> &points2)
{
    /* The loop below reads points2 at every row index of points1. */
    CV_Assert(points2.rows >= points1.rows);

    /* Calculate centroids. */
    const cv::Vec3f t1 = CalculateMean(points1);
    const cv::Vec3f t2 = CalculateMean(points2);

    /* T1 moves the centroid of points1 to the origin. */
    cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
    T1(0, 3) = double(-t1[0]);
    T1(1, 3) = double(-t1[1]);
    T1(2, 3) = double(-t1[2]);

    /* T2 moves the origin to the centroid of points2. */
    cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
    T2(0, 3) = double(t2[0]);
    T2(1, 3) = double(t2[1]);
    T2(2, 3) = double(t2[2]);

    /* Calculate covariance matrix for the centred input points. */
    cv::Mat_<double> C(3, 3, 0.0);
    for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
        const cv::Vec3f p1 = points1(ptIdx) - t1;
        const cv::Vec3f p2 = points2(ptIdx) - t2;
        for (int i = 0; i < 3; i++) {
            for (int j = 0; j < 3; j++) {
                /* widen before multiplying so the product is not rounded to float */
                C(i, j) += double(p2[i]) * double(p1[j]);
            }
        }
    }

    cv::Mat_<double> u, s, vt;
    cv::SVD::compute(C, s, u, vt);

    cv::Mat_<double> R = u * vt;

    /* A negative determinant means u * vt is a reflection; flipping the sign
     * of the third singular direction turns it into a rotation. */
    if (cv::determinant(R) < 0) {
        R -= u.col(2) * (vt.row(2) * 2.0);
    }

    cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
    R.copyTo(M.colRange(0, 3).rowRange(0, 3));

    cv::Mat_<double> result = T2 * M * T1;
    result /= result(3, 3);

    return result;
}
/**
 * RANSAC wrapper around FindRigidTransform for noisy correspondences.
 *
 * In every iteration three random correspondences are used to fit a
 * transform; the transform is then applied to every point of points1 and the
 * points that land within `threshold` of their partner in points2 are counted
 * as inliers. The transform with the most inliers wins.
 *
 * @param points1     source points (N x 1).
 * @param points2     destination points; row i corresponds to row i of points1.
 * @param iterations  number of random samples to try.
 * @param threshold   inlier points should be projected within this many units.
 * @return the best 4x4 transform found, or an empty matrix when there are
 *         fewer than three correspondences or iterations is not positive.
 *
 * Note: the result is the model fitted to the best three-point sample; it is
 * not re-fitted on all inliers.
 */
cv::Mat_<double> RANSACFindRigidTransform(const cv::Mat_<cv::Vec3f> &points1,
                                          const cv::Mat_<cv::Vec3f> &points2,
                                          int iterations = 100,
                                          float threshold = .02f)
{
    const int min_n_points = 3;
    const int n_points = points1.size().height;

    cv::Mat_<double> best;
    /* Not enough correspondences to draw a sample from. */
    if (n_points < min_n_points || points2.size().height < n_points) {
        return best;
    }

    cv::Mat points1Homo;
    cv::convertPointsToHomogeneous(points1, points1Homo);

    std::vector<int> range(n_points);
    std::iota(range.begin(), range.end(), 0);
    auto gen = std::mt19937{std::random_device{}()};

    int best_inliers = -1;
    const float threshold_sq = threshold * threshold;

    for (int i = 0; i < iterations; i++) {
        /* The first min_n_points entries of the shuffled index list form the sample. */
        std::shuffle(range.begin(), range.end(), gen);
        cv::Mat_<cv::Vec3f> points1subset(min_n_points, 1, cv::Vec3f(0, 0, 0));
        cv::Mat_<cv::Vec3f> points2subset(min_n_points, 1, cv::Vec3f(0, 0, 0));
        for (int j = 0; j < min_n_points; j++) {
            points1subset(j) = points1(range[j]);
            points2subset(j) = points2(range[j]);
        }

        /* Keep the model in double precision; only the copy used for scoring is float. */
        const cv::Mat_<double> rigidT = FindRigidTransform(points1subset, points2subset);
        cv::Mat_<float> rigidT_float;
        rigidT.convertTo(rigidT_float, CV_32F);

        int n_inliers = 0;
        for (int j = 0; j < n_points; j++) {
            const cv::Mat_<float> t1_3d =
                rigidT_float * cv::Mat_<float>(points1Homo.at<cv::Vec4f>(j));
            if (t1_3d(3) == 0) {
                continue; // Avoid 0 division
            }
            const float dx = (t1_3d(0) / t1_3d(3) - points2(j)[0]);
            const float dy = (t1_3d(1) / t1_3d(3) - points2(j)[1]);
            const float dz = (t1_3d(2) / t1_3d(3) - points2(j)[2]);
            const float square_dist = dx * dx + dy * dy + dz * dz;
            if (square_dist < threshold_sq) {
                ++n_inliers;
            }
        }

        if (n_inliers > best_inliers) {
            best_inliers = n_inliers;
            best = rigidT;
        }
    }
    return best;
}
@vagran Thanks for the code! It seems to work very well.
I do have a small terminology suggestion, though. Since you estimate and apply a scale during the transformation, it is a 7-parameter transformation, also called a Helmert or similarity transformation. In a rigid transformation no scaling is applied, because all Euclidean distances must be preserved.
I would have added this as a comment, but I don't have enough reputation points, sorry for that.
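rigid transformation: $\mathbf{x}' = R\,\mathbf{x} + \mathbf{t}$ (rotation $R$ and translation $\mathbf{t}$; 6 parameters)
Helmert transformation: $\mathbf{x}' = s\,R\,\mathbf{x} + \mathbf{t}$ (the same with an additional uniform scale $s$; 7 parameters)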