I have implemented an in-place cache-oblivious matrix transposition algorithm in C++ as below:
// Recursively walks the region [x, x+delx) x [y, y+dely) of the row-major
// N x N `matrix`, halving the longer side at each step until a single element
// (delx == 1 and dely == 1) remains, and swaps that element with its mirror.
// NOTE(review): the single-element case swaps (y, x) with (x, y)
// unconditionally. When the recursion is started on the whole matrix it
// reaches both (x, y) and (y, x), so each off-diagonal pair is swapped twice
// and ends up in its original place.
void CacheObliviousTransposition(int x, int delx, int y, int dely, int N, int* matrix) {
// Base case: one element left, exchange it with its transposed position.
if ((delx == 1) && (dely == 1)) {
int tmp = matrix[(N*y) + x];
matrix[(N*y) + x] = matrix[(N*x) + y];
matrix[(N*x) + y] = tmp;
// Region is at least as wide as it is tall: split along x.
if (delx >= dely) {
int xmid = delx / 2;
CacheObliviousTransposition(x, xmid, y, dely, N, matrix);
CacheObliviousTransposition(x + xmid, delx - xmid, y, dely, N, matrix);
// Otherwise split along y.
int ymid = dely / 2;
CacheObliviousTransposition(x, delx, y, ymid, N, matrix);
CacheObliviousTransposition(x, delx, y + ymid, dely - ymid, N, matrix);
However, when I call the method below after the transposition to check that it worked correctly, the `if` branch is entered, so I assume something is wrong with the implementation.
// Verifies a transposed N x N row-major matrix whose element (i, j) originally
// held (i*N + j + 42): after transposition element (i, j) must hold
// (j*N + i + 42). Prints one message for every element that does not match.
void CheckTransposition(int N, int* matrix)
for (int i = 0; i < N; i++)
for (int j = 0; j < N; j++)
// Expected value is the original content of the mirrored position (j, i).
if (matrix[(i*N) + j] != (j*N) + i + 42)
cout << "Transposition failed at i=" << i << ", j=" << j << "\n";
Can anyone help me identify what is wrong?
Note: variable matrix is a dynamically assigned integer array as below, as matrix is stored row by row in N*N consecutive memory locations
// Allocates an N x N matrix as one row-major array of N*N ints with new[] and
// fills element (i, j) with (i*N + j + 42). The caller receives ownership of
// the returned array.
int* MatrixInit(int N)
int* matrix = new int[N*N];
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
// Each element gets a unique value derived from its row-major offset.
matrix[(i*N) + j] = (i*N) + j + 42;
return matrix;
The above code will transpose your elements twice. For example, once CacheObliviousTransposition reaches the single element [0,1], it will transpose it with [1,0]. However, a separate recursion will later on reach [1,0], and transpose that with [0,1] again. Ultimately, all elements will be back in their original places.
To ensure that elements are only transposed once, you could check that x is less than y before switching:
// Transposes, in place, the part of the row-major N x N `matrix` covered by
// the region [x, x+delx) x [y, y+dely). Call it with (0, N, 0, N, N, matrix)
// to transpose the whole matrix.
//
// The region is halved along its longer side until a single element remains.
// The recursion visits both (x, y) and (y, x), so the swap is performed only
// for x < y; this way every off-diagonal pair is exchanged exactly once and
// diagonal elements are left alone.
//
//   x, delx  - first column and number of columns of the region
//   y, dely  - first row and number of rows of the region
//   N        - side length of the full matrix
//   matrix   - row-major storage of N*N ints
void CacheObliviousTransposition(int x, int delx, int y, int dely, int N, int* matrix) {
    // An empty region has nothing to swap; this also stops the recursion from
    // looping forever when it is invoked with N == 0.
    if (delx <= 0 || dely <= 0) {
        return;
    }

    if ((delx == 1) && (dely == 1)) {
        // Swap only from one side of the diagonal so a pair is not swapped back.
        if (x < y) {
            int tmp = matrix[(N*y) + x];
            matrix[(N*y) + x] = matrix[(N*x) + y];
            matrix[(N*x) + y] = tmp;
        }
    } else if (delx >= dely) {
        // Wider than tall: split the columns.
        int xmid = delx / 2;
        CacheObliviousTransposition(x, xmid, y, dely, N, matrix);
        CacheObliviousTransposition(x + xmid, delx - xmid, y, dely, N, matrix);
    } else {
        // Taller than wide: split the rows.
        int ymid = dely / 2;
        CacheObliviousTransposition(x, delx, y, ymid, N, matrix);
        CacheObliviousTransposition(x, delx, y + ymid, dely - ymid, N, matrix);
    }
}
I'm creating a Matrix math library with CUDA to improve my CNNs performance (and to understand C++ better).
I would like to be able to add error handling and tell the user (me) what has gone wrong when using the matrix class.
This can be seen in my main file where, in this case, I'm trying to add a 10 * 10 matrix to a 15 * 15 matrix. This is an impossible operation, and I would like some output that tells the user so, for example:
Error in file "Main.cu" on line: 9 (Dimensions inconsistent)
If I perform the check inside the function, the reported line number is the line of the check itself rather than the line of the caller. I've looked at using macros for the check, but I'm wondering whether there is another way that doesn't require calling a macro every time I add two matrices together.
#include "Matrix.cuh"
int main() {
double* init;
cudaMallocManaged(&init, sizeof(double));
Matrix A(10, 10, 2);
Matrix B(15, 15, 3);
Matrix C = A + B;
return 0;
#include "Matrix.cuh"

#include <stdexcept>
// Element-wise sum: C[y][x] = A[y][x] + B[y][x] for one (x, y) per thread.
// x and y are derived from the block and thread indices using BLOCK_SIZE
// (defined elsewhere) as the block edge, so the launch presumably has to use
// BLOCK_SIZE x BLOCK_SIZE blocks — TODO confirm against the launch sites.
// NOTE(review): only A's RowCount/ColumnCount are used for the bounds check
// and for indexing B and C; nothing in this kernel verifies that B and C have
// the same shape as A.
void sumMatrix(Matrix* A, Matrix* B, Matrix* C)
int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;
// Threads that fall outside A do nothing.
if (x < A->ColumnCount && y < A->RowCount)
C->VALUES[y * A->ColumnCount + x] = A->VALUES[y * A->ColumnCount + x] + B->VALUES[y * A->ColumnCount + x];
// Fills an R x C row-major buffer VALUES with `val` and sets up the per-row
// descriptors in `rows`. One (x, y) element per thread; x and y are derived
// from the block and thread indices with BLOCK_SIZE (defined elsewhere) as the
// block edge.
void matrixInit(Row* rows, int R, int C, double* VALUES, double val) {
int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;
// Threads outside the R x C extent do nothing.
if (x < C && y < R)
// The thread in column 0 of a row initialises that row's descriptor:
// its element count and the address of its first value.
if (x == 0)
rows[y].Count = C;
rows[y].values = VALUES + C * y;
VALUES[y * C + x] = val;
// Constructs an R x C matrix in CUDA managed memory with every element set to
// `val`.
//
//   R, C - number of rows / columns
//   val  - initial value of every element
//
// For R <= 0 or C <= 0 no memory is allocated and no kernel is launched (a
// grid with a zero dimension is an invalid launch configuration); VALUES and
// rows are left as nullptr.
//
// The constructor synchronises with the device before returning so the host
// can read VALUES/rows right away (e.g. from printM or operator[]).
// cudaCheckErrors is the project macro already used in this file; it is
// assumed to inspect the last CUDA error — TODO confirm.
Matrix::Matrix(int R, int C, double val)
{
    RowCount = R;
    ColumnCount = C;
    VALUES = nullptr;
    rows = nullptr;

    if (R <= 0 || C <= 0)
        return;

    // Compute the byte counts in size_t so large matrices do not overflow int.
    cudaMallocManaged(&VALUES, static_cast<size_t>(R) * static_cast<size_t>(C) * sizeof(double));
    cudaMallocManaged(&rows, static_cast<size_t>(R) * sizeof(Row));
    cudaCheckErrors("MATRIX ALLOC");

    // Ceil-divide so partially filled blocks at the right/bottom edge are covered.
    dim3 gridDim((C + BLOCK_SIZE - 1) / BLOCK_SIZE, (R + BLOCK_SIZE - 1) / BLOCK_SIZE, 1);
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
    matrixInit<<<gridDim, blockDim>>>(rows, R, C, VALUES, val);

    // Kernel launches are asynchronous; wait for completion before the host
    // is allowed to touch the managed buffers, then report any error.
    cudaDeviceSynchronize();
    cudaCheckErrors("MATRIX INIT VAL");
}
// Constructs an R x C matrix with every element set to 0.
// Delegates to the (R, C, val) constructor so the allocation and
// initialisation logic exists in one place only.
Matrix::Matrix(int R, int C) : Matrix(R, C, 0.0)
{
}
// Re-derives every row descriptor's `values` pointer from VALUES so that row i
// points at element (i, 0) of the row-major buffer.
void Matrix::updatePointers()
for (size_t i = 0; i < RowCount; i++)
rows[i].values = VALUES + (i * ColumnCount);
// Clears both buffer pointers without freeing what they pointed to.
// NOTE(review): presumably used to keep a destructor from releasing buffers
// that another Matrix object still references — confirm against the destructor.
void Matrix::removePointers()
VALUES = nullptr;
rows = nullptr;
// Prints a header "Matrix <msg>: <rows>*<cols>" followed by the matrix, one
// row per output line with elements separated by spaces. Reads the elements on
// the host through rows[i][j].
void Matrix::printM(const char* msg)
std::cout << "Matrix " << msg << ": " << RowCount << "*" << ColumnCount << std::endl;
for (size_t i = 0; i < RowCount; i++)
for (size_t j = 0; j < ColumnCount; j++)
std::cout << rows[i][j] << " ";
std::cout << std::endl;
// Returns the element-wise sum of this matrix and B.
//
// Throws std::invalid_argument when the two matrices do not have the same
// number of rows and columns; the sumMatrix kernel indexes B with this
// matrix's dimensions and would otherwise read outside B's buffer.
//
// The kernel takes Matrix pointers, so bitwise copies of the three Matrix
// headers are staged in managed memory for the duration of the launch. Those
// staging copies share their VALUES/rows pointers with the originals, which
// means the kernel writes straight into C's buffer.
Matrix Matrix::sum(Matrix B)
{
    if (RowCount != B.RowCount || ColumnCount != B.ColumnCount)
        throw std::invalid_argument("Matrix::sum: Dimensions inconsistent");

    Matrix C(RowCount, ColumnCount);

    Matrix* A_p = nullptr;
    Matrix* B_p = nullptr;
    Matrix* C_p = nullptr;
    cudaMallocManaged(&A_p, sizeof(Matrix));
    cudaMallocManaged(&B_p, sizeof(Matrix));
    cudaMallocManaged(&C_p, sizeof(Matrix));
    cudaCheckErrors("MATRIX SUM ALLOC");

    memcpy(A_p, this, sizeof(Matrix));
    memcpy(B_p, &B, sizeof(Matrix));
    memcpy(C_p, &C, sizeof(Matrix));

    dim3 gridDim(ceil(ColumnCount / (double)BLOCK_SIZE), ceil(RowCount / (double)BLOCK_SIZE), 1);
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
    sumMatrix<<<gridDim, blockDim>>>(A_p, B_p, C_p);

    // The launch is asynchronous: wait until the result has been written
    // before the host reads it, and surface launch/execution errors.
    cudaDeviceSynchronize();
    cudaCheckErrors("MATRIX SUM");

    // The staging headers are raw bitwise copies, so releasing them does not
    // touch the value buffers they pointed at.
    cudaFree(A_p);
    cudaFree(B_p);
    cudaFree(C_p);

    // C owns the buffer the kernel wrote into (C_p was a bitwise copy of it).
    return C;
}
// Returns a reference to row i.
// Throws std::out_of_range when i is not a valid row index. The message
// "OUT OF BOUNDS" is still printed first, as before, but the function no
// longer goes on to index past the end of `rows`.
Row& Matrix::operator[](size_t i)
{
    if (i >= RowCount)
    {
        std::cout << "OUT OF BOUNDS" << std::endl;
        throw std::out_of_range("Matrix::operator[]: row index out of bounds");
    }
    return rows[i];
}
// Adds B to this matrix via sum() and returns the result by reference.
// The result header is copied bitwise into a freshly allocated managed-memory
// Matrix and a reference to that allocation is returned.
// NOTE(review): nothing in this function frees C_p, so every call leaves one
// sizeof(Matrix) allocation behind. The returned object shares its VALUES and
// rows pointers with the local C, which goes out of scope on return — verify
// against the destructor that those buffers stay valid.
Matrix& Matrix::operator+(Matrix B)
Matrix C = sum(B);
Matrix* C_p;
cudaMallocManaged(&C_p, sizeof(Matrix));
memcpy(C_p, &C, sizeof(Matrix));
return *C_p;
if (VALUES != nullptr && rows != nullptr)
I have two overloaded functions: "ChooseElements", which chooses elements from passed array, and "SortElements", which sorts elements of passed array. One pair works with INT data, and another one with FLOAT.
// Copies the strictly positive elements of X[0..n) into a newly allocated
// array, preserving their order.
//
//   X - input array (may be NULL when n is 0)
//   n - number of elements in X
//   m - out: number of elements in the returned array (0 when none qualify)
//
// Returns an array of m ints allocated with new[] that the caller must release
// with delete[], or NULL when no element is positive.
//
// The result is sized with a counting pass and allocated once, so the new[]
// storage is never handed to realloc (which is only valid for memory obtained
// from malloc/calloc/realloc).
int * ChooseElements(int * X, int n, int & m)
{
    m = 0;
    if (X == NULL)
        return NULL;

    // First pass: count the elements that will be kept.
    for (int i = 0; i < n; i++)
    {
        if (X[i] > 0)
            m++;
    }
    if (m == 0)
        return NULL;

    // Second pass: copy them in order.
    int * Y = new int[m];
    int next = 0;
    for (int i = 0; i < n; i++)
    {
        if (X[i] > 0)
            Y[next++] = X[i];
    }
    return Y;
}
// Copies the strictly positive elements of X[0..n) into a newly allocated
// array, preserving their order.
//
//   X - input array (may be NULL when n is 0)
//   n - number of elements in X
//   m - out: number of elements in the returned array (0 when none qualify)
//
// Returns an array of m floats allocated with new[] that the caller must
// release with delete[], or NULL when no element is positive.
//
// The result is sized with a counting pass and allocated once, so the new[]
// storage is never handed to realloc (which is only valid for memory obtained
// from malloc/calloc/realloc).
float * ChooseElements(float * X, int n, int & m)
{
    m = 0;
    if (X == NULL)
        return NULL;

    // First pass: count the elements that will be kept.
    for (int i = 0; i < n; i++)
    {
        if (X[i] > 0)
            m++;
    }
    if (m == 0)
        return NULL;

    // Second pass: copy them in order.
    float * Y = new float[m];
    int next = 0;
    for (int i = 0; i < n; i++)
    {
        if (X[i] > 0)
            Y[next++] = X[i];
    }
    return Y;
}
// Sorts the first m elements of Y into ascending order in place with a bubble
// sort and returns Y.
// NOTE: Y is declared as a reference to a non-const pointer, so the argument
// must be an lvalue; the pointer itself is never reassigned in this body.
int * SortElements(int m, int *& Y)
// After pass i the largest i elements are in their final positions at the end.
for (int i = 1; i < m; i++)
for (int j = 0; j < m - i; j++)
// Exchange neighbours that are out of order.
if (Y[j] > Y[j + 1])
int Temp = Y[j];
Y[j] = Y[j + 1];
Y[j + 1] = Temp;
return Y;
// Sorts the first m elements of Y into ascending order in place with a bubble
// sort and returns Y.
// NOTE: Y is declared as a reference to a non-const pointer, so the argument
// must be an lvalue; the pointer itself is never reassigned in this body.
float * SortElements(int m, float *& Y)
// After pass i the largest i elements are in their final positions at the end.
for (int i = 1; i < m; i++)
for (int j = 0; j < m - i; j++)
// Exchange neighbours that are out of order.
if (Y[j] > Y[j + 1])
float Temp = Y[j];
Y[j] = Y[j + 1];
Y[j + 1] = Temp;
return Y;
What I want to do is pass first function as argument to second one. Like that:
int n, m;
int * X = NULL, * Y = NULL;
/* ...
Some code in which n and X are initialized
... */
Y = SortElements(m, ChooseElements(X, n, m));
However, when I try to do that, Visual Studio 2017 tells me:
no instance of overloaded function "SortElements" matches the argument list
argument types are: (int, int *)
If I do this instead:
Y = ChooseElements(X, n, m);
Y = SortElements(m, Y);
everything works fine.
If I remove overloads and leave only INT pair and once again try
int n, m;
int * X = NULL, * Y = NULL;
/* ...
Some code in which n and X are initialized
... */
Y = SortElements(m, ChooseElements(X, n, m));
I get another problem:
int *ChooseElements(int *X, int n, int &m)
initial value of reference to non-const value must be an lvalue
What am I doing wrong? My teacher asks for a function which uses another function as an argument. What I have written does not work, and I have no idea what could be done here.
In your int * SortElements(int m, int *& Y)
function you are using : int *& Y. So you have a reference to a int pointer. My guess is that you don't need that.
You can just use int * Y as a parameter as a solution.
`int *& Y` needs an lvalue (like your variable `Y`), but your ChooseElements function returns only a temporary object (an rvalue) because it returns by value.
I have this sample of code that I am trying to understand:
// Horizontal box filter of radius r: for every pixel of a row, writes the
// average of the 2r+1 texels centred on it, packed as an unsigned int, to od.
// The row index y comes from the x-dimension block/thread indices, so each
// thread processes one complete row; threads with y >= h do nothing.
// Input is read with tex2D from rgbaTex (declared elsewhere).
// NOTE(review): the loops sample x coordinates from -r up to w - 1 + r; what
// is returned outside [0, w) depends on the texture's addressing mode, which
// is configured outside this block.
__global__ void
d_boxfilter_rgba_x(unsigned int *od, int w, int h, int r)
// 1 / (2r + 1): turns the window sum into an average.
float scale = 1.0f / (float)((r << 1) + 1);
unsigned int y = blockIdx.x*blockDim.x + threadIdx.x;
if (y < h)
float4 t = make_float4(0.0f);
// Sum of the window centred on x = 0.
for (int x = -r; x <= r; x++)
t += tex2D(rgbaTex, x, y);
od[y * w] = rgbaFloatToInt(t * scale);
// Slide the window one pixel to the right per iteration: add the texel that
// enters on the right and subtract the one that leaves on the left.
for (int x = 1; x < w; x++)
t += tex2D(rgbaTex, x + r, y);
t -= tex2D(rgbaTex, x - r - 1, y);
od[y * w + x] = rgbaFloatToInt(t * scale);
// Vertical box filter of radius r on packed-RGBA pixels: reads column x of id
// and writes the running average of 2r+1 vertically adjacent pixels to the
// same column of od. The column index x comes from the x-dimension
// block/thread indices, so each thread processes one complete column.
// Rows above the first and below the last are treated as copies of the first
// and last row respectively.
// NOTE(review): there is no check of x against w in this block, so the launch
// presumably has to cover exactly w columns — confirm at the launch site.
__global__ void
d_boxfilter_rgba_y(unsigned int *id, unsigned int *od, int w, int h, int r)
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
// Rebase both pointers to the top of this thread's column; rows are then
// addressed as multiples of w.
id = &id[x];
od = &od[x];
// 1 / (2r + 1): turns the window sum into an average.
float scale = 1.0f / (float)((r << 1) + 1);
float4 t;
// left part
// The r rows above the image are replaced by r copies of the first pixel.
t = rgbaIntToFloat(id[0]) * r;
for (int y = 0; y < (r + 1); y++)
t += rgbaIntToFloat(id[y*w]);
od[0] = rgbaFloatToInt(t * scale);
// While the window still overhangs the top, the pixel leaving it is always
// the replicated first pixel.
for (int y = 1; y < (r + 1); y++)
t += rgbaIntToFloat(id[(y + r) * w]);
t -= rgbaIntToFloat(id[0]);
od[y * w] = rgbaFloatToInt(t * scale);
// main loop
// Window lies fully inside the column: add row y + r, drop row y - r - 1.
for (int y = (r + 1); y < (h - r); y++)
t += rgbaIntToFloat(id[(y + r) * w]);
t -= rgbaIntToFloat(id[((y - r) * w) - w]);
od[y * w] = rgbaFloatToInt(t * scale);
// right side
// Window overhangs the bottom: the pixel entering is always the last row.
for (int y = h - r; y < h; y++)
t += rgbaIntToFloat(id[(h - 1) * w]);
t -= rgbaIntToFloat(id[((y - r) * w) - w]);
od[y * w] = rgbaFloatToInt(t * scale);
This should be a box filter with CUDA.
From what I have read this should make an average with a given radius.
But in d_boxfilter_rgba_y make something like this:
od[0] = rgbaFloatToInt(t * scale);
I don't understand why this scale is used, and why there are all those loops when there should be just one: to sum the values from -r to +r and divide the result by the number of pixels.
Can somebody help me?
To calculate the average of a box with radius 1 (3 values), you do:
(box[0] + box[1] + box[2]) / 3 // which is equal to
(box[0] + box[1] + box[2]) * 1/3 // where 1/3 is your scale factor
The calculation of scale is:
1.0f / (float)((r << 1) + 1); // equal to
1 / ((r * 2) + 1) // equal to
1 / (2r + 1) // 2r because you go to the left and right and +1 for the middle
The two for loops are used, because the "sliding window" optimisation is used. First the first box is calculated:
for (int x = -r; x <= r; x++)
t += tex2D(rgbaTex, x, y);
And then for each step to the right, the value right of the box is added and the most left value of the box is removed. That way you can calculate the sum of the box with just 2 operations instead of 2*r + 1 operations.
for (int x = 1; x < w; x++)
t += tex2D(rgbaTex, x + r, y);
t -= tex2D(rgbaTex, x - r - 1, y);
od[y * w + x] = rgbaFloatToInt(t * scale);
I want to implement the harris corner detector. I found this page to be very helpful, since it shows how the detector is implemented using the basic opencv functions (like gaussianBlur and Sobel):
Now I also want to implement the Gaussian blur and Sobel myself. If I run my Gaussian or Sobel over some images, it works, but in combination with my corner detector it does not work. Can anybody help me, please? The full code is below, thanks.
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
using namespace cv;
using namespace std;
/// Global variables
Mat src, src_gray, dst;
int thresh = 200;
int max_thresh = 255;
char* source_window = "Source Image";
char* corners_window = "Corner Image";
/// Function header
void cornerHarris_demo(int, void*);
void cornerHarrisMe(int, int, double);
int xGradient(Mat, int, int);
int yGradient(Mat, int, int);
void SobelMe(Mat&,Mat&,int,int);
int borderCheck(int M, int x);
void SepGaussian(Mat&, Mat&, int, int);
/** #function main */
int main(int argc, char** argv)
/// Load source image and convert it to gray
src = imread("data/a-real-big-church.jpg", 1);
//Mat src_gray(src.size(), CV_8UC1);
cvtColor(src, src_gray, CV_BGR2GRAY);
/// Create a window and a trackbar
namedWindow(source_window, CV_WINDOW_AUTOSIZE);
createTrackbar("Threshold: ", source_window, &thresh, max_thresh, cornerHarris_demo);
imshow(source_window, src);
cornerHarris_demo(0, 0);
/** #function cornerHarris_demo */
void cornerHarris_demo(int, void*)
Mat dst_norm, dst_norm_scaled;
/// Detector parameters
int blockSize = 2;
int apertureSize = 3;
double k = 0.04;
/// Detecting corners
cornerHarrisMe(blockSize, apertureSize, k);
/// Normalizing
normalize(dst, dst_norm, 0, 255, NORM_MINMAX, CV_32FC1, Mat());
convertScaleAbs(dst_norm, dst_norm_scaled);
/// Drawing a circle around corners
for (int j = 0; j < dst_norm.rows; j++)
for (int i = 0; i < dst_norm.cols; i++)
if ((int)dst_norm.at<float>(j, i) > thresh)
circle(dst_norm_scaled, Point(i, j), 5, Scalar(255), 2, 8, 0);
/// Showing the result
namedWindow(corners_window, CV_WINDOW_AUTOSIZE);
imshow(corners_window, dst_norm_scaled);
void cornerHarrisMe(int blockSize, int apertureSize, double k)
Mat x2y2, xy, mtrace, x_der, y_der, x2_der, y2_der, xy_der, x2g_der, y2g_der, xyg_der;
//1: calculate x and y derivative of image via Sobel
SobelMe(src_gray, x_der, 1, 0);
SobelMe(src_gray, y_der, 0, 1);
//2: calculate other three images in M
pow(x_der, blockSize, x2_der);
pow(y_der, blockSize, y2_der);
multiply(x_der, y_der, xy_der);
//3: gaussain
SepGaussian(x2_der, x2g_der, 1, 0);
SepGaussian(y2_der, y2g_der, 0, 1);
SepGaussian(xy_der, xyg_der, 1, 1);
//4. calculating R with k
multiply(x2g_der, y2g_der, x2y2);
multiply(xyg_der, xyg_der, xy);
pow((x2g_der + y2g_der), blockSize, mtrace);
dst = (x2y2 - xy) - k * mtrace;
// gradient in the x direction
int xGradient(Mat image, int x, int y)
return image.at<uchar>(y - 1, x - 1) +
2 * image.at<uchar>(y, x - 1) +
image.at<uchar>(y + 1, x - 1) -
image.at<uchar>(y - 1, x + 1) -
2 * image.at<uchar>(y, x + 1) -
image.at<uchar>(y + 1, x + 1);
// gradient in the y direction
int yGradient(Mat image, int x, int y)
return image.at<uchar>(y - 1, x - 1) +
2 * image.at<uchar>(y - 1, x) +
image.at<uchar>(y - 1, x + 1) -
image.at<uchar>(y + 1, x - 1) -
2 * image.at<uchar>(y + 1, x) -
image.at<uchar>(y + 1, x + 1);
void SobelMe(Mat& source, Mat& destination, int xOrder, int yOrder){
int gradX, gradY, sum;
destination = source.clone();
if (xOrder == 1 && yOrder == 0){
for (int y = 1; y < source.rows - 1; y++){
for (int x = 1; x < source.cols - 1; x++){
gradX = xGradient(source, x, y);
sum = abs(gradX);
sum = sum > 255 ? 255 : sum;
sum = sum < 0 ? 0 : sum;
destination.at<uchar>(y, x) = sum;
else if (xOrder == 0 && yOrder == 1){
for (int y = 1; y < source.rows - 1; y++){
for (int x = 1; x < source.cols - 1; x++){
gradY = yGradient(source, x, y);
sum = abs(gradY);
sum = sum > 255 ? 255 : sum;
sum = sum < 0 ? 0 : sum;
destination.at<uchar>(y, x) = sum;
else if (xOrder == 1 && yOrder == 1)
for (int y = 1; y < source.rows - 1; y++){
for (int x = 1; x < source.cols - 1; x++){
gradX = xGradient(source, x, y);
gradY = yGradient(source, x, y);
sum = abs(gradX) + abs(gradY);
sum = sum > 255 ? 255 : sum;
sum = sum < 0 ? 0 : sum;
destination.at<uchar>(y, x) = sum;
// Maps an index x that may lie outside [0, M) back into range by mirroring it
// at the borders, edge sample included: ... 2 1 0 | 0 1 2 ... M-1 | M-1 M-2 ...
//
//   M - number of valid samples
//   x - requested index, possibly negative or >= M
//
// Returns an index in [0, M). For M <= 0 there is no valid index and 0 is
// returned.
//
// The reflection is repeated until the index is in range, so the result is
// also valid when x lies further outside than M samples (for example a
// 5-tap kernel applied to an image that is only one or two pixels wide).
int borderCheck(int M, int x){
    if (M <= 0)
        return 0;

    while (x < 0 || x >= M){
        if (x < 0)
            x = -x - 1;
        else
            x = 2 * M - x - 1;
    }
    return x;
}
void SepGaussian(Mat& source, Mat& desination, int sigmaX, int sigmaY){
// coefficients of 1D gaussian kernel with sigma = 1
double coeffs[] = { 0.0545, 0.2442, 0.4026, 0.2442, 0.0545 };
Mat tempX, tempY;
float sum, x1, y1;
desination = source.clone();
tempY = source.clone();
tempX = source.clone();
// along y - direction
if (sigmaX == 0 && sigmaY == 1){
for (int y = 0; y < source.rows; y++){
for (int x = 0; x < source.cols; x++){
sum = 0.0;
for (int i = -2; i <= 2; i++){
y1 = borderCheck(source.rows, y - i);
sum = sum + coeffs[i + 2] * source.at<uchar>(y1, x);
desination.at<uchar>(y, x) = sum;
// along x - direction
else if (sigmaX == 1 && sigmaY == 0){
for (int y = 0; y < source.rows; y++){
for (int x = 0; x < source.cols; x++){
sum = 0.0;
for (int i = -2; i <= 2; i++){
x1 = borderCheck(source.cols, x - i);
sum = sum + coeffs[i + 2] * source.at<uchar>(y, x1);
desination.at<uchar>(y, x) = sum;
// along xy - direction
else if (sigmaX == 1 && sigmaY == 1){
for (int y = 0; y < source.rows; y++){
for (int x = 0; x < source.cols; x++){
sum = 0.0;
for (int i = -2; i <= 2; i++){
y1 = borderCheck(source.rows, y - i);
sum = sum + coeffs[i + 2] * source.at<uchar>(y1, x);
tempY.at<uchar>(y, x) = sum;
for (int y = 0; y < source.rows; y++){
for (int x = 0; x < source.cols; x++){
sum = 0.0;
for (int i = -2; i <= 2; i++){
x1 = borderCheck(source.cols, x - i);
sum = sum + coeffs[i + 2] * tempY.at<uchar>(y, x1);
desination.at<uchar>(y, x) = sum;
The Result:
Here is the a picture of the Result.
The result is now the other way around: it detects areas where there are no corners.
In case there are some questions, feel free to ask me.
I have two 3D point clouds, and I'd like to use opencv to find the rigid transformation matrix (translation, rotation, constant scaling among all 3 axes).
I've found an estimateRigidTransformation function, but it's only for 2D points apparently
In addition, I've found estimateAffine3D, but it doesn't seem to support rigid transformation mode.
Do I need to just write my own rigid transformation function?
I did not find the required functionality in OpenCV so I have written my own implementation. Based on ideas from OpenSFM.
CalculateMean(const cv::Mat_<cv::Vec3d> &points)
cv::Mat_<cv::Vec3d> result;
cv::reduce(points, result, 0, CV_REDUCE_AVG);
return result(0, 0);
FindRigidTransform(const cv::Mat_<cv::Vec3d> &points1, const cv::Mat_<cv::Vec3d> points2)
/* Calculate centroids. */
cv::Vec3d t1 = -CalculateMean(points1);
cv::Vec3d t2 = -CalculateMean(points2);
cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
T1(0, 3) = t1[0];
T1(1, 3) = t1[1];
T1(2, 3) = t1[2];
cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
T2(0, 3) = -t2[0];
T2(1, 3) = -t2[1];
T2(2, 3) = -t2[2];
/* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
* which is used for scale calculation.
cv::Mat_<double> C(3, 3, 0.0);
double p1Rms = 0, p2Rms = 0;
for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
cv::Vec3d p1 = points1(ptIdx, 0) + t1;
cv::Vec3d p2 = points2(ptIdx, 0) + t2;
p1Rms += p1.dot(p1);
p2Rms += p2.dot(p2);
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
C(i, j) += p2[i] * p1[j];
cv::Mat_<double> u, s, vh;
cv::SVD::compute(C, s, u, vh);
cv::Mat_<double> R = u * vh;
if (cv::determinant(R) < 0) {
R -= u.col(2) * (vh.row(2) * 2.0);
double scale = sqrt(p2Rms / p1Rms);
R *= scale;
cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
R.copyTo(M.colRange(0, 3).rowRange(0, 3));
cv::Mat_<double> result = T2 * M * T1;
result /= result(3, 3);
return result.rowRange(0, 3);
I've found PCL to be a nice adjunct to OpenCV. Take a look at their Iterative Closest Point (ICP) example. The provided example registers the two point clouds and then displays the rigid transformation.
Here's my rmsd code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
typedef struct
float m[4][4];
#define vdiff2(a,b) ( ((a)[0]-(b)[0]) * ((a)[0]-(b)[0]) + \
((a)[1]-(b)[1]) * ((a)[1]-(b)[1]) + \
((a)[2]-(b)[2]) * ((a)[2]-(b)[2]) )
static double alignedrmsd(float *v1, float *v2, int N);
static void centroid(float *ret, float *v, int N);
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx);
static void crossproduct(float *ans, float *pt1, float *pt2);
static void mtx_root(MATRIX *mtx);
static int almostequal(MATRIX *a, MATRIX *b);
static void mulpt(MATRIX *mtx, float *pt);
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y);
static void mtx_identity(MATRIX *mtx);
static void mtx_trans(MATRIX *mtx, float x, float y, float z);
static int mtx_invert(float *mtx, int N);
static float absmaxv(float *v, int N);
calculate rmsd between two structures
Params: v1 - first set of points
v2 - second set of points
N - number of points
mtx - return for transfrom matrix used to align structures
Returns: rmsd score
Notes: mtx can be null. Transform will be rigid. Inputs must
be previously aligned for sequence alignment
double rmsd(float *v1, float *v2, int N, float *mtx)
float cent1[3];
float cent2[3];
MATRIX tmtx;
MATRIX tempmtx;
MATRIX move1;
MATRIX move2;
int i;
double answer;
float *temp1 = 0;
float *temp2 = 0;
int err;
assert(N > 3);
temp1 = malloc(N * 3 * sizeof(float));
temp2 = malloc(N * 3 * sizeof(float));
if(!temp1 || !temp2)
goto error_exit;
centroid(cent1, v1, N);
centroid(cent2, v2, N);
temp1[i*3+0] = v1[i*3+0] - cent1[0];
temp1[i*3+1] = v1[i*3+1] - cent1[1];
temp1[i*3+2] = v1[i*3+2] - cent1[2];
temp2[i*3+0] = v2[i*3+0] - cent2[0];
temp2[i*3+1] = v2[i*3+1] - cent2[1];
temp2[i*3+2] = v2[i*3+2] - cent2[2];
err = getalignmtx(temp1, temp2, N, &tmtx);
if(err == -1)
goto error_exit;
mtx_trans(&move1, -cent2[0], -cent2[1], -cent2[2]);
mtx_mul(&tempmtx, &move1, &tmtx);
mtx_trans(&move2, cent1[0], cent1[1], cent1[2]);
mtx_mul(&tmtx, &tempmtx, &move2);
memcpy(temp2, v2, N * sizeof(float) * 3);
mulpt(&tmtx, temp2 + i * 3);
answer = alignedrmsd(v1, temp2, N);
memcpy(mtx, &tmtx.m, 16 * sizeof(float));
return answer;
mtx[i] = 0;
return sqrt(-1.0);
calculate rmsd between two aligned structures (trivial)
Params: v1 - first structure
v2 - second structure
N - number of points
Returns: rmsd
static double alignedrmsd(float *v1, float *v2, int N)
double answer =0;
int i;
answer += vdiff2(v1 + i *3, v2 + i * 3);
return sqrt(answer/N);
compute the centroid
/*
  compute the centroid of a set of 3D points
  Params: ret - return for the centroid (3 floats)
          v - the points, packed x,y,z
          N - number of points
  Notes: ret is set to 0,0,0 when N is not positive, instead of
    dividing by zero.
 */
static void centroid(float *ret, float *v, int N)
{
  int i;
  double sx = 0;
  double sy = 0;
  double sz = 0;

  ret[0] = 0;
  ret[1] = 0;
  ret[2] = 0;
  if(N <= 0)
    return;

  /* accumulate in double so long point lists do not lose precision */
  for(i=0;i<N;i++)
  {
    sx += v[i*3+0];
    sy += v[i*3+1];
    sz += v[i*3+2];
  }

  ret[0] = (float) (sx / N);
  ret[1] = (float) (sy / N);
  ret[2] = (float) (sz / N);
}
get the matrix needed to align two structures
Params: v1 - reference structure
v2 - structure to align
N - number of points
mtx - return for rigid body alignment matrix
Notes: only calculates rotation part of matrix.
assumes input has been aligned to centroids
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx)
MATRIX A = { {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,1}} };
MATRIX temp;
float tv[3];
float tw[3];
float tv2[3];
float tw2[3];
int k, i, j;
int flag = 0;
float correction;
correction = absmaxv(v1, N * 3) * absmaxv(v2, N * 3);
A.m[i][j] += (v1[k*3+i] * v2[k*3+j])/correction;
while(flag < 3)
At.m[i][j] = A.m[j][i];
memcpy(&Ainv, &A, sizeof(MATRIX));
/* this will happen if all points are in a plane */
if( mtx_invert((float *) &Ainv, 4) == -1)
if(flag == 0)
crossproduct(tv, v1, v1+3);
crossproduct(tw, v2, v2+3);
crossproduct(tv2, tv, v1);
crossproduct(tw2, tw, v2);
memcpy(tv, tv2, 3 * sizeof(float));
memcpy(tw, tw2, 3 * sizeof(float));
A.m[i][j] += tv[i] * tw[j];
flag = 5;
if(flag != 5)
return -1;
mtx_mul(&temp, &At, &A);
mtx_mul(mtx, &temp, &Ainv);
return 0;
get the crossproduct of two vectors.
Params: ans - return pointer for answer.
pt1 - first vector
pt2 - second vector.
Notes: crossproduct is at right angles to the two vectors.
/*
  get the crossproduct of two 3D vectors: ans = pt1 x pt2.
  Params: ans - return pointer for answer (3 floats).
          pt1 - first vector
          pt2 - second vector.
  Notes: the y component is pt1[2]*pt2[0] - pt1[0]*pt2[2]; with the
    operands the other way round the result is mirrored and is no longer
    perpendicular-and-right-handed.
    The components are computed into temporaries first, so ans may be
    the same array as pt1 or pt2.
 */
static void crossproduct(float *ans, float *pt1, float *pt2)
{
  float cx = pt1[1] * pt2[2] - pt1[2] * pt2[1];
  float cy = pt1[2] * pt2[0] - pt1[0] * pt2[2];
  float cz = pt1[0] * pt2[1] - pt1[1] * pt2[0];

  ans[0] = cx;
  ans[1] = cy;
  ans[2] = cz;
}
Denman-Beavers square root iteration
static void mtx_root(MATRIX *mtx)
MATRIX Y = *mtx;
int iter = 0;
int i, ii;
invY = Y;
invZ = Z;
if( mtx_invert((float *) &invY, 4) == -1)
if( mtx_invert((float *) &invZ, 4) == -1)
Y1.m[i][ii] = 0.5 * (Y.m[i][ii] + invZ.m[i][ii]);
Z1.m[i][ii] = 0.5 * (Z.m[i][ii] + invY.m[i][ii]);
Y = Y1;
Z = Z1;
mtx_mul(&Y2, &Y, &Y);
while(!almostequal(&Y2, mtx) && iter++ < 20 );
*mtx = Y;
Check two matrices for near-enough equality
Params: a - first matrix
b - second matrix
Returns: 1 if almost equal, else 0, epsilon 0.001f.
static int almostequal(MATRIX *a, MATRIX *b)
int i, ii;
float epsilon = 0.001f;
if(fabs(a->m[i][ii] - b->m[i][ii]) > epsilon)
return 0;
return 1;
multiply a point by a matrix.
Params: mtx - matrix
pt - the point (transformed)
static void mulpt(MATRIX *mtx, float *pt)
float ans[4] = {0};
int i;
int ii;
ans[i] += pt[ii] * mtx->m[ii][i];
ans[i] += mtx->m[3][i];
pt[0] = ans[0];
pt[1] = ans[1];
pt[2] = ans[2];
multiply two matrices.
Params: ans - return pointer for answer.
x - first matrix
y - second matrix.
Notes: ans may not be equal to x or y.
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y)
int i;
int ii;
int iii;
ans->m[i][ii] = 0;
ans->m[i][ii] += x->m[i][iii] * y->m[iii][ii];
create an identity matrix.
Params: mtx - return pointer.
static void mtx_identity(MATRIX *mtx)
int i;
int ii;
mtx->m[i][ii] = 1.0f;
mtx->m[i][ii] = 0;
create a translation matrix.
Params: mtx - return pointer for matrix.
x - x translation.
y - y translation.
z - z translation
static void mtx_trans(MATRIX *mtx, float x, float y, float z)
mtx->m[0][0] = 1;
mtx->m[0][1] = 0;
mtx->m[0][2] = 0;
mtx->m[0][3] = 0;
mtx->m[1][0] = 0;
mtx->m[1][1] = 1;
mtx->m[1][2] = 0;
mtx->m[1][3] = 0;
mtx->m[2][0] = 0;
mtx->m[2][1] = 0;
mtx->m[2][2] = 1;
mtx->m[2][3] = 0;
mtx->m[3][0] = x;
mtx->m[3][1] = y;
mtx->m[3][2] = z;
mtx->m[3][3] = 1;
matrix invert routine
Params: mtx - the matrix in raw format, in/out
N - width and height
Returns: 0 on success, -1 on fail
static int mtx_invert(float *mtx, int N)
int indxc[100]; /* these 100s are the only restriction on matrix size */
int indxr[100];
int ipiv[100];
int i, j, k;
int irow, icol;
double big;
double pinv;
int l, ll;
double dum;
double temp;
assert(N <= 100);
ipiv[i] = 0;
big = 0.0;
/* find biggest element */
if(ipiv[j] != 1)
if(ipiv[k] == 0)
if(fabs(mtx[j*N+k]) >= big)
big = fabs(mtx[j*N+k]);
irow = j;
icol = k;
if(irow != icol)
temp = mtx[irow * N + l];
mtx[irow * N + l] = mtx[icol * N + l];
mtx[icol * N + l] = temp;
indxr[i] = irow;
indxc[i] = icol;
/* if biggest element is zero matrix is singular, bail */
if(mtx[icol* N + icol] == 0)
goto error_exit;
pinv = 1.0/mtx[icol * N + icol];
mtx[icol * N + icol] = 1.0;
mtx[icol * N + l] *= pinv;
if(ll != icol)
dum = mtx[ll * N + icol];
mtx[ll * N + icol] = 0.0;
mtx[ll * N + l] -= mtx[icol * N + l]*dum;
/* unscramble matrix */
for (l=N-1;l>=0;l--)
if (indxr[l] != indxc[l])
for (k=0;k<N;k++)
temp = mtx[k * N + indxr[l]];
mtx[k * N + indxr[l]] = mtx[k * N + indxc[l]];
mtx[k * N + indxc[l]] = temp;
return 0;
return -1;
get the absolute maximum of an array
/*
  get the largest absolute value in an array
  Params: v - the array
          N - number of elements in v
  Returns: max |v[i]| for 0 <= i < N, or 0 when N is not positive.
 */
static float absmaxv(float *v, int N)
{
  float answer = 0.0f;
  float mag;
  int i;

  for(i=0;i<N;i++)
  {
    mag = (float) fabs(v[i]);
    if(answer < mag)
      answer = mag;
  }

  return answer;
}
#include <stdio.h>
debug utlitiy
static void printmtx(FILE *fp, MATRIX *mtx)
int i, ii;
fprintf(fp, "%f, ", mtx->m[i][ii]);
fprintf(fp, "\n");
int rmsdmain(void)
float one[4*3] = {0,0,0, 1,0,0, 2,1,0, 0,3,1};
float two[4*3] = {0,0,0, 0,1,0, 1,2,0, 3,0,1};
double diff;
int i;
diff = rmsd(one, two, 4, (float *) &mtx.m);
printf("%f\n", diff);
printmtx(stdout, &mtx);
mulpt(&mtx, two + i * 3);
printf("%f %f %f\n", two[i*3], two[i*3+1], two[i*3+2]);
return 0;
I took @vagran's implementation and added RANSAC on top of it, since estimateRigidTransform2d does it and it was helpful for me since my data is noisy. (Note: This code doesn't have constant scaling along all 3 axes; you can add it back in easily by comparing to vagran's).
cv::Vec3f CalculateMean(const cv::Mat_<cv::Vec3f> &points)
if(points.size().height == 0){
return 0;
assert(points.size().width == 1);
double mx = 0.0;
double my = 0.0;
double mz = 0.0;
int n_points = points.size().height;
for(int i = 0; i < n_points; i++){
double x = double(points(i)[0]);
double y = double(points(i)[1]);
double z = double(points(i)[2]);
mx += x;
my += y;
mz += z;
return cv::Vec3f(mx/n_points, my/n_points, mz/n_points);
FindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> points2)
/* Calculate centroids. */
cv::Vec3f t1 = CalculateMean(points1);
cv::Vec3f t2 = CalculateMean(points2);
cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
T1(0, 3) = double(-t1[0]);
T1(1, 3) = double(-t1[1]);
T1(2, 3) = double(-t1[2]);
cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
T2(0, 3) = double(t2[0]);
T2(1, 3) = double(t2[1]);
T2(2, 3) = double(t2[2]);
/* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
* which is used for scale calculation.
cv::Mat_<double> C(3, 3, 0.0);
for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
cv::Vec3f p1 = points1(ptIdx) - t1;
cv::Vec3f p2 = points2(ptIdx) - t2;
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
C(i, j) += double(p2[i] * p1[j]);
cv::Mat_<double> u, s, vt;
cv::SVD::compute(C, s, u, vt);
cv::Mat_<double> R = u * vt;
if (cv::determinant(R) < 0) {
R -= u.col(2) * (vt.row(2) * 2.0);
cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
R.copyTo(M.colRange(0, 3).rowRange(0, 3));
cv::Mat_<double> result = T2 * M * T1;
result /= result(3, 3);
return result;
// Estimates the rigid transform that maps points1 onto points2 with RANSAC.
//
//   points1, points2 - N x 1 matrices of corresponding 3D points
//
// Each iteration fits a transform to 3 randomly chosen correspondences with
// FindRigidTransform, applies it to every point of points1 and counts the
// points that land within `threshold` units of their partner in points2. The
// transform with the most inliers is returned.
//
// Returns an empty matrix when fewer than 3 correspondences are supplied or
// when the two inputs do not have the same number of points.
cv::Mat_<double> RANSACFindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> &points2)
{
    const int iterations = 100;
    const int min_n_points = 3;
    // inlier points should be projected within this many units
    const float threshold = .02;

    cv::Mat_<double> best;
    int best_inliers = -1;

    const int n_points = points1.size().height;
    // Sampling 3 indices from a shorter list would read past the end of it.
    if (n_points < min_n_points || points2.size().height != n_points) {
        return best;
    }

    cv::Mat points1Homo;
    cv::convertPointsToHomogeneous(points1, points1Homo);

    std::vector<int> range(n_points);
    std::iota(range.begin(), range.end(), 0);
    auto gen = std::mt19937{std::random_device{}()};

    for(int i = 0; i < iterations; i++) {
        // The first min_n_points entries of the shuffled index list form the sample.
        std::shuffle(range.begin(), range.end(), gen);
        cv::Mat_<cv::Vec3f> points1subset(min_n_points, 1, cv::Vec3f(0,0,0));
        cv::Mat_<cv::Vec3f> points2subset(min_n_points, 1, cv::Vec3f(0,0,0));
        for(int j = 0; j < min_n_points; j++) {
            points1subset(j) = points1(range[j]);
            points2subset(j) = points2(range[j]);
        }

        // Keep the candidate in double precision for the result; a float copy
        // is used only for scoring against the float input points.
        cv::Mat_<double> rigidT = FindRigidTransform(points1subset, points2subset);
        cv::Mat_<float> rigidT_float = cv::Mat::eye(4, 4, CV_32F);
        rigidT.convertTo(rigidT_float, CV_32F);

        int n_inliers = 0;
        for(int j = 0; j < n_points; j++) {
            cv::Mat_<float> t1_3d = rigidT_float * cv::Mat_<float>(points1Homo.at<cv::Vec4f>(j));
            if(t1_3d(3) == 0) {
                continue; // Avoid 0 division
            }
            float dx = (t1_3d(0)/t1_3d(3) - points2(j)[0]);
            float dy = (t1_3d(1)/t1_3d(3) - points2(j)[1]);
            float dz = (t1_3d(2)/t1_3d(3) - points2(j)[2]);
            float square_dist = dx * dx + dy * dy + dz * dz;
            if(square_dist < threshold * threshold){
                // Count the point; without this every candidate scores 0 and
                // the first sample would always be kept.
                ++n_inliers;
            }
        }

        if(n_inliers > best_inliers) {
            best_inliers = n_inliers;
            best = rigidT;
        }
    }
    return best;
}
@vagran Thanks for the code! Seems to work very well.
I do have a small terminology suggestion, though. Since you are estimating and applying a scale during the transformation, it is a 7-parameter transformation, also called a Helmert or similarity transformation. In a rigid transformation no scaling is applied, because all Euclidean distances need to be preserved.
I would've added this as comment, but don't have enough points.. D: sorry for that.
rigid transformation: https://en.wikipedia.org/wiki/Rigid_transformation
Helmert transformation: https://www.researchgate.net/publication/322841143_Parameter_estimation_in_3D_affine_and_similarity_transformation_implementation_of_variance_component_estimation