I'm currently struggling to properly work with 2D arrays within my CUDA kernel. 1D was fine but so far had no luck with it moving on to 2D. Here is my host function and kernel:
__global__ void add_d2D(double *x, double *y,double *z, int n, int m){
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x){
for(int j = blockIdx.y * blockDim.y + threadIdx.y; j < m; j += blockDim.y * gridDim.y){
z[i*m + j] = x[i*m + j] + y[i*m + j];
}
}
}
__host__ void add2D(double *a, double *b, double *result, int N, int M){
double *a_d, *b_d, *c_d;
size_t pitcha;
size_t pitchb;
size_t pitchc;
cudaErrchk(cudaMallocPitch(&a_d,&pitcha, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&b_d,&pitchb, M*sizeof(double),N));
cudaErrchk(cudaMallocPitch(&c_d,&pitchc, M*sizeof(double),N));
cudaErrchk(cudaMemcpy2D(a_d,M*sizeof(double), a,pitcha, M*sizeof(double),N, cudaMemcpyHostToDevice));
cudaErrchk(cudaMemcpy2D(b_d,M*sizeof(double), b,pitchb, M*sizeof(double),N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
dim3 numBlocks(N/threadsPerBlock.x, M/threadsPerBlock.y);
add_d2D<<<numBlocks, threadsPerBlock>>>(a_d, b_d, c_d , N, M);
cudaDeviceSynchronize();
cudaErrchk(cudaMemcpy2D(result,M*sizeof(double), c_d,pitchc, M*sizeof(double),N, cudaMemcpyDeviceToHost));
cudaFree(a_d);
cudaFree(b_d);
cudaFree(c_d);
}
And below my example to test it. It prints out the first 10 values of C correctly but all others remain 0. I believe the problem is within the kernel. Where it can't find the correct values due to the pitch, but not sure how to solve it correctly though.
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
ertiscuda::add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i ++){
for (int j = 0; j < 10; j ++){
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j] << std::endl;
}
}
You have two mistakes
Each thread in the kernel should perform one operation rather than all the operations. (For memory reasons you might want to do more, be we will keep this example simple).
You had the destination and source pitches switched when loading the data onto the device.
Here is a working version
#include <cuda_runtime.h>
#include <stdlib.h>
#include <iostream>
#include <sstream>
#define CUDASAFECALL( err ) cuda_safe_call(err, __FILE__, __LINE__ )
void cuda_safe_call(const cudaError err, const char *file, const int line)
{
if (cudaSuccess != err)
{
std::stringstream error_msg;
error_msg << "cuda_safe_call() failed at " << file << ":" << line << ":" << cudaGetErrorString(err);
const auto error_msg_str = error_msg.str();
std::cout << error_msg_str << std::endl;
throw std::runtime_error(error_msg_str);
}
}
__global__ void add_d2D(const double *x, const double *y, double *z, int n, int m, int m_pitch_elements)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col = blockIdx.y * blockDim.y + threadIdx.y;
if (row< n && col <m )
{
auto idx = row*m_pitch_elements + col;
z[idx] = x[idx] + y[idx];
//z[idx] = idx;
}
}
__host__ void add2D(const double *a,const double *b, double *result, int N, int M) {
double *a_d, *b_d, *c_d;
size_t pitcha,pitchb,pitchc;
CUDASAFECALL(cudaMallocPitch(&a_d, &pitcha, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&b_d, &pitchb, M * sizeof(double), N));
CUDASAFECALL(cudaMallocPitch(&c_d, &pitchc, M * sizeof(double), N));
CUDASAFECALL(cudaMemcpy2D(a_d, pitcha, a, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
CUDASAFECALL(cudaMemcpy2D(b_d, pitchb, b, M * sizeof(double), M * sizeof(double), N, cudaMemcpyHostToDevice));
dim3 threadsPerBlock(2, 2);
auto safediv = [](auto a, auto b) {return static_cast<unsigned int>(ceil(a / (b*1.0))); };
dim3 numBlocks(safediv(N, threadsPerBlock.x), safediv( M, threadsPerBlock.y));
//all the pitches should be the same
auto pitch_elements = pitcha / sizeof(double);
add_d2D << <numBlocks, threadsPerBlock >> >(a_d, b_d, c_d, N, M, pitch_elements);
CUDASAFECALL(cudaDeviceSynchronize());
CUDASAFECALL(cudaMemcpy2D(result, M * sizeof(double), c_d, pitchc, M * sizeof(double), N, cudaMemcpyDeviceToHost));
CUDASAFECALL(cudaFree(a_d));
CUDASAFECALL(cudaFree(b_d));
CUDASAFECALL(cudaFree(c_d));
}
int main()
{
double a[4][10];
double b[4][10];
double c[4][10];
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
a[i][j] = 0 + rand() % 10;
b[i][j] = 0 + rand() % 10;
}
}
add2D((double *)a, (double *)b, (double *)c, 4, 10);
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 10; j++) {
std::cout << a[i][j] << " " << b[i][j] << " " << c[i][j]<< "|"<< a[i][j]+ b[i][j] << std::endl;
}
}
return 0;
}
I have 200 matrices A[i] (whose dimension is 4096*48), and 48 vectors v[j](whose dimension is 48*1). I want to calculate A[i]*v[j], (i=0:199,j=1:47).
I think about how to arrange my grid size and block size from yesterday. But I don't figure out an answer now. Could anyone give me some advice?
Max num of per block is 512. This is my working environment.
The following is my code. It works right. I have checked. But it is slower than Matlab :(
#include<iostream>
#include <mat.h>
#include <time.h>
#include <cuda_runtime.h>
#include "cuda.h"
using std::cout;
using std::endl;
using namespace cv;
using namespace std;
#include <limits>
#include <iostream>
#include <cstdlib>
using namespace std;
#define kernel_size 48
////////////////////////////////////////////
typedef struct {
int width;
int height;
int stride;
float* elements;
} Matrix;
// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);
// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
// Load A and B to device memory
Matrix d_A;
d_A.width = d_A.stride = A.width; d_A.height = A.height;
size_t size = A.width * A.height * sizeof(float);
cudaMalloc(&d_A.elements, size);
cudaMemcpy(d_A.elements, A.elements, size,
cudaMemcpyHostToDevice);
Matrix d_B;
d_B.width = d_B.stride = B.width; d_B.height = B.height;
size = B.width * B.height * sizeof(float);
cudaMalloc(&d_B.elements, size);
cudaMemcpy(d_B.elements, B.elements, size,
cudaMemcpyHostToDevice);
// Allocate C in device memory
Matrix d_C;
d_C.width = d_C.stride = C.width; d_C.height = C.height;
size = C.width * C.height * sizeof(float);
cudaMalloc(&d_C.elements, size);
// Invoke kernel
dim3 dimBlock(1,B.height);
dim3 dimGrid(A.height, C.width);
MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
// Read C from device memory
cudaMemcpy(C.elements, d_C.elements, size,
cudaMemcpyDeviceToHost);
// Free device memory
cudaFree(d_A.elements);
cudaFree(d_B.elements);
cudaFree(d_C.elements);
}
// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
// Block row and column
int blockCol = blockIdx.y;
int blockRow = blockIdx.x;
float Cvalue = 0;
// Thread row and column within Csub
int row = threadIdx.y;
int col = threadIdx.x;
// Loop over all the sub-matrices of A and B that are
// required to compute Csub
// Multiply each pair of sub-matrices together
// and accumulate the results
// Shared memory used to store Asub and Bsub respectively
__shared__ float As[1][kernel_size];
__shared__ float Bs[kernel_size][1];
// Load Asub and Bsub from device memory to shared memory
// Each thread loads one element of each sub-matrix
As[0][row] = A.elements[blockRow * A.stride + row+B.height*blockCol];
Bs[row][0] = B.elements[row];
// Synchronize to make sure the sub-matrices are loaded
// before starting the computation
__syncthreads();
// Multiply Asub and Bsub together
for (int e = 0; e < B.height; ++e)
{
Cvalue += As[0][e] * Bs[e][0];
}
// Synchronize to make sure that the preceding
// computation is done before loading two new
// sub-matrices of A and B in the next iteration
__syncthreads();
// Write Csub to device memory
// Each thread writes one element
C.elements[blockRow * C.stride +blockCol]= Cvalue;
}
//////////////////
float * gen_matrix(int n /*row*/, int m /*col*/){
float *A;
//srand(1023);
A = (float *) malloc(n*m*sizeof(float));
for(int row = 0;row < n;row++)
for(int col = 0;col < m;col++) {
A[row*m+col] = rand()%10;
}
/*
// print matrix elements.
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j)
cout << " [" << i << "," << j << "] " << A[i*m+j] ;
cout << endl;
}
*/
return A;
}
int main()
{
int k=kernel_size;
int s=2000;
int m =4096;
//int m=2;
//int s=1;
int n = k*s;
float *Ae = gen_matrix(m,n);
float *Be= gen_matrix(k,1);00
float *Ce=(float *) malloc(m*s*sizeof(float));
Matrix A ={n,m,n,Ae};
Matrix B ={1,k,1,Be};
Matrix C ={s,m,s,Ce};
const clock_t begin_time = clock();
MatMul(A, B, C);
std::cout << float( clock () - begin_time ) / CLOCKS_PER_SEC;
for (int i = 0; i < 3; ++i) {
for (int j = 0; j <7; ++j)
cout << " [" << i << "," << j << "] " << Ce[i*m+j] ;
cout << endl;
}
//check
float *Ce2=(float *) malloc(s*m*sizeof(float));
for (int i = 0; i < m; i++)
{
for (int j = 0; j < s; j++)
{
Ce2[i*s+j]=0;
}
}
for (int i = 0; i < m; i++)
{
for (int j = 0; j < s; j++)
{
for (int ind = 0; ind < k; ind++)
{
Ce2[i*s+j]=Ce2[i*s+j]+Ae[j*k+ind+i*k*s]*Be[ind];
// printf("%f---****%f\n",Ae[j*k+ind+i*k*s],Be[ind]);
}
if (Ce2[i*s+j]!= Ce[i*s+j])
{
printf("%f----%f\n",Ce2[i*s+j],Ce[i*s+j]);
}
}
}
free(Ae);
free(Be);
free(Ce);
}
This is just a matrix-matrix multiplication problem. If you want things to run fast, you should not be writing your own matrix-matrix multiply code. Use CUBLAS Sgemm.
Conceptually, if you arrange your A matrices like this:
[A0]
[A1]
[A2]
...
[A199]
then you will have a new matrix AA that is (4096*200) rows x 48 columns.
Arrange your 48 V vectors (48x1) in a 48x48 matrix (VV):
[V0][V1][V2]...[V47]
(each V vector is a column of the new matrix VV)
You now have a single matrix multiplication problem (AA*VV) that is (4096*200)x48 multiplied by 48x48 which yields a (4096*200) x 48 result. This result has one column vector of length 4096*200 that contains 200 results of the individual matrix-vector multiplications you were trying to do. The 200 results per column * 48 columns combine to give you all of the results that your original problem would create. The first column would contain the results of [V0] multiplied by each of the 200 A matrices, the second column would contain the results of [V1] multiplied by each of the 200 A matrices, etc.
Once you have arranged your data like this, using CUBLAS Sgemm should be the quickest possible approach on the GPU. Note that CUBLAS expects the underlying storage to be column-major, so if you are rearranging your data, you will probably want to keep this in mind. There is a CUDA sample code for CUBLAS matrix multiplication.
In your code it appears you actually have 2000 A matrices, but your question refers to 200. I have used 200 for example in my answer, but the concept would be the same with 2000 A matrices.
I have two 3D point clouds, and I'd like to use opencv to find the rigid transformation matrix (translation, rotation, constant scaling among all 3 axes).
I've found an estimateRigidTransformation function, but it's only for 2D points apparently
In addition, I've found estimateAffine3D, but it doesn't seem to support rigid transformation mode.
Do I need to just write my own rigid transformation function?
I did not find the required functionality in OpenCV so I have written my own implementation. Based on ideas from OpenSFM.
cv::Vec3d
CalculateMean(const cv::Mat_<cv::Vec3d> &points)
{
cv::Mat_<cv::Vec3d> result;
cv::reduce(points, result, 0, CV_REDUCE_AVG);
return result(0, 0);
}
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3d> &points1, const cv::Mat_<cv::Vec3d> points2)
{
/* Calculate centroids. */
cv::Vec3d t1 = -CalculateMean(points1);
cv::Vec3d t2 = -CalculateMean(points2);
cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
T1(0, 3) = t1[0];
T1(1, 3) = t1[1];
T1(2, 3) = t1[2];
cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
T2(0, 3) = -t2[0];
T2(1, 3) = -t2[1];
T2(2, 3) = -t2[2];
/* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
* which is used for scale calculation.
*/
cv::Mat_<double> C(3, 3, 0.0);
double p1Rms = 0, p2Rms = 0;
for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
cv::Vec3d p1 = points1(ptIdx, 0) + t1;
cv::Vec3d p2 = points2(ptIdx, 0) + t2;
p1Rms += p1.dot(p1);
p2Rms += p2.dot(p2);
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
C(i, j) += p2[i] * p1[j];
}
}
}
cv::Mat_<double> u, s, vh;
cv::SVD::compute(C, s, u, vh);
cv::Mat_<double> R = u * vh;
if (cv::determinant(R) < 0) {
R -= u.col(2) * (vh.row(2) * 2.0);
}
double scale = sqrt(p2Rms / p1Rms);
R *= scale;
cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
R.copyTo(M.colRange(0, 3).rowRange(0, 3));
cv::Mat_<double> result = T2 * M * T1;
result /= result(3, 3);
return result.rowRange(0, 3);
}
I've found PCL to be a nice adjunct to OpenCV. Take a look at their Iterative Closest Point (ICP) example. The provided example registers the two point clouds and then displays the rigid transformation.
Here's my rmsd code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
typedef struct
{
float m[4][4];
} MATRIX;
#define vdiff2(a,b) ( ((a)[0]-(b)[0]) * ((a)[0]-(b)[0]) + \
((a)[1]-(b)[1]) * ((a)[1]-(b)[1]) + \
((a)[2]-(b)[2]) * ((a)[2]-(b)[2]) )
static double alignedrmsd(float *v1, float *v2, int N);
static void centroid(float *ret, float *v, int N);
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx);
static void crossproduct(float *ans, float *pt1, float *pt2);
static void mtx_root(MATRIX *mtx);
static int almostequal(MATRIX *a, MATRIX *b);
static void mulpt(MATRIX *mtx, float *pt);
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y);
static void mtx_identity(MATRIX *mtx);
static void mtx_trans(MATRIX *mtx, float x, float y, float z);
static int mtx_invert(float *mtx, int N);
static float absmaxv(float *v, int N);
/*
calculate rmsd between two structures
Params: v1 - first set of points
v2 - second set of points
N - number of points
mtx - return for transfrom matrix used to align structures
Returns: rmsd score
Notes: mtx can be null. Transform will be rigid. Inputs must
be previously aligned for sequence alignment
*/
double rmsd(float *v1, float *v2, int N, float *mtx)
{
float cent1[3];
float cent2[3];
MATRIX tmtx;
MATRIX tempmtx;
MATRIX move1;
MATRIX move2;
int i;
double answer;
float *temp1 = 0;
float *temp2 = 0;
int err;
assert(N > 3);
temp1 = malloc(N * 3 * sizeof(float));
temp2 = malloc(N * 3 * sizeof(float));
if(!temp1 || !temp2)
goto error_exit;
centroid(cent1, v1, N);
centroid(cent2, v2, N);
for(i=0;i<N;i++)
{
temp1[i*3+0] = v1[i*3+0] - cent1[0];
temp1[i*3+1] = v1[i*3+1] - cent1[1];
temp1[i*3+2] = v1[i*3+2] - cent1[2];
temp2[i*3+0] = v2[i*3+0] - cent2[0];
temp2[i*3+1] = v2[i*3+1] - cent2[1];
temp2[i*3+2] = v2[i*3+2] - cent2[2];
}
err = getalignmtx(temp1, temp2, N, &tmtx);
if(err == -1)
goto error_exit;
mtx_trans(&move1, -cent2[0], -cent2[1], -cent2[2]);
mtx_mul(&tempmtx, &move1, &tmtx);
mtx_trans(&move2, cent1[0], cent1[1], cent1[2]);
mtx_mul(&tmtx, &tempmtx, &move2);
memcpy(temp2, v2, N * sizeof(float) * 3);
for(i=0;i<N;i++)
mulpt(&tmtx, temp2 + i * 3);
answer = alignedrmsd(v1, temp2, N);
free(temp1);
free(temp2);
if(mtx)
memcpy(mtx, &tmtx.m, 16 * sizeof(float));
return answer;
error_exit:
free(temp1);
free(temp2);
if(mtx)
{
for(i=0;i<16;i++)
mtx[i] = 0;
}
return sqrt(-1.0);
}
/*
calculate rmsd between two aligned structures (trivial)
Params: v1 - first structure
v2 - second structure
N - number of points
Returns: rmsd
*/
static double alignedrmsd(float *v1, float *v2, int N)
{
double answer =0;
int i;
for(i=0;i<N;i++)
answer += vdiff2(v1 + i *3, v2 + i * 3);
return sqrt(answer/N);
}
/*
compute the centroid
*/
static void centroid(float *ret, float *v, int N)
{
int i;
ret[0] = 0;
ret[1] = 0;
ret[2] = 0;
for(i=0;i<N;i++)
{
ret[0] += v[i*3+0];
ret[1] += v[i*3+1];
ret[2] += v[i*3+2];
}
ret[0] /= N;
ret[1] /= N;
ret[2] /= N;
}
/*
get the matrix needed to align two structures
Params: v1 - reference structure
v2 - structure to align
N - number of points
mtx - return for rigid body alignment matrix
Notes: only calculates rotation part of matrix.
assumes input has been aligned to centroids
*/
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx)
{
MATRIX A = { {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,1}} };
MATRIX At;
MATRIX Ainv;
MATRIX temp;
float tv[3];
float tw[3];
float tv2[3];
float tw2[3];
int k, i, j;
int flag = 0;
float correction;
correction = absmaxv(v1, N * 3) * absmaxv(v2, N * 3);
for(k=0;k<N;k++)
for(i=0;i<3;i++)
for(j=0;j<3;j++)
A.m[i][j] += (v1[k*3+i] * v2[k*3+j])/correction;
while(flag < 3)
{
for(i=0;i<4;i++)
for(j=0;j<4;j++)
At.m[i][j] = A.m[j][i];
memcpy(&Ainv, &A, sizeof(MATRIX));
/* this will happen if all points are in a plane */
if( mtx_invert((float *) &Ainv, 4) == -1)
{
if(flag == 0)
{
crossproduct(tv, v1, v1+3);
crossproduct(tw, v2, v2+3);
}
else
{
crossproduct(tv2, tv, v1);
crossproduct(tw2, tw, v2);
memcpy(tv, tv2, 3 * sizeof(float));
memcpy(tw, tw2, 3 * sizeof(float));
}
for(i=0;i<3;i++)
for(j=0;j<3;j++)
A.m[i][j] += tv[i] * tw[j];
flag++;
}
else
flag = 5;
}
if(flag != 5)
return -1;
mtx_mul(&temp, &At, &A);
mtx_root(&temp);
mtx_mul(mtx, &temp, &Ainv);
return 0;
}
/*
get the crossproduct of two vectors.
Params: ans - return pinter for answer.
pt1 - first vector
pt2 - second vector.
Notes: crossproduct is at right angles to the two vectors.
*/
static void crossproduct(float *ans, float *pt1, float *pt2)
{
ans[0] = pt1[1] * pt2[2] - pt1[2] * pt2[1];
ans[1] = pt1[0] * pt2[2] - pt1[2] * pt2[0];
ans[2] = pt1[0] * pt2[1] - pt1[1] * pt2[0];
}
/*
Denman-Beavers square root iteration
*/
static void mtx_root(MATRIX *mtx)
{
MATRIX Y = *mtx;
MATRIX Z;
MATRIX Y1;
MATRIX Z1;
MATRIX invY;
MATRIX invZ;
MATRIX Y2;
int iter = 0;
int i, ii;
mtx_identity(&Z);
do
{
invY = Y;
invZ = Z;
if( mtx_invert((float *) &invY, 4) == -1)
return;
if( mtx_invert((float *) &invZ, 4) == -1)
return;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
Y1.m[i][ii] = 0.5 * (Y.m[i][ii] + invZ.m[i][ii]);
Z1.m[i][ii] = 0.5 * (Z.m[i][ii] + invY.m[i][ii]);
}
Y = Y1;
Z = Z1;
mtx_mul(&Y2, &Y, &Y);
}
while(!almostequal(&Y2, mtx) && iter++ < 20 );
*mtx = Y;
}
/*
Check two matrices for near-enough equality
Params: a - first matrix
b - second matrix
Returns: 1 if almost equal, else 0, epsilon 0.0001f.
*/
static int almostequal(MATRIX *a, MATRIX *b)
{
int i, ii;
float epsilon = 0.001f;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
if(fabs(a->m[i][ii] - b->m[i][ii]) > epsilon)
return 0;
return 1;
}
/*
multiply a point by a matrix.
Params: mtx - matrix
pt - the point (transformed)
*/
static void mulpt(MATRIX *mtx, float *pt)
{
float ans[4] = {0};
int i;
int ii;
for(i=0;i<4;i++)
{
for(ii=0;ii<3;ii++)
{
ans[i] += pt[ii] * mtx->m[ii][i];
}
ans[i] += mtx->m[3][i];
}
pt[0] = ans[0];
pt[1] = ans[1];
pt[2] = ans[2];
}
/*
multiply two matrices.
Params: ans - return pointer for answer.
x - first matrix
y - second matrix.
Notes: ans may not be equal to x or y.
*/
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y)
{
int i;
int ii;
int iii;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
ans->m[i][ii] = 0;
for(iii=0;iii<4;iii++)
ans->m[i][ii] += x->m[i][iii] * y->m[iii][ii];
}
}
/*
create an identity matrix.
Params: mtx - return pointer.
*/
static void mtx_identity(MATRIX *mtx)
{
int i;
int ii;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
if(i==ii)
mtx->m[i][ii] = 1.0f;
else
mtx->m[i][ii] = 0;
}
}
/*
create a translation matrix.
Params: mtx - return pointer for matrix.
x - x translation.
y - y translation.
z - z translation
*/
static void mtx_trans(MATRIX *mtx, float x, float y, float z)
{
mtx->m[0][0] = 1;
mtx->m[0][1] = 0;
mtx->m[0][2] = 0;
mtx->m[0][3] = 0;
mtx->m[1][0] = 0;
mtx->m[1][1] = 1;
mtx->m[1][2] = 0;
mtx->m[1][3] = 0;
mtx->m[2][0] = 0;
mtx->m[2][1] = 0;
mtx->m[2][2] = 1;
mtx->m[2][3] = 0;
mtx->m[3][0] = x;
mtx->m[3][1] = y;
mtx->m[3][2] = z;
mtx->m[3][3] = 1;
}
/*
matrix invert routine
Params: mtx - the matrix in raw format, in/out
N - width and height
Returns: 0 on success, -1 on fail
*/
static int mtx_invert(float *mtx, int N)
{
int indxc[100]; /* these 100s are the only restriction on matrix size */
int indxr[100];
int ipiv[100];
int i, j, k;
int irow, icol;
double big;
double pinv;
int l, ll;
double dum;
double temp;
assert(N <= 100);
for(i=0;i<N;i++)
ipiv[i] = 0;
for(i=0;i<N;i++)
{
big = 0.0;
/* find biggest element */
for(j=0;j<N;j++)
if(ipiv[j] != 1)
for(k=0;k<N;k++)
if(ipiv[k] == 0)
if(fabs(mtx[j*N+k]) >= big)
{
big = fabs(mtx[j*N+k]);
irow = j;
icol = k;
}
ipiv[icol]=1;
if(irow != icol)
for(l=0;l<N;l++)
{
temp = mtx[irow * N + l];
mtx[irow * N + l] = mtx[icol * N + l];
mtx[icol * N + l] = temp;
}
indxr[i] = irow;
indxc[i] = icol;
/* if biggest element is zero matrix is singular, bail */
if(mtx[icol* N + icol] == 0)
goto error_exit;
pinv = 1.0/mtx[icol * N + icol];
mtx[icol * N + icol] = 1.0;
for(l=0;l<N;l++)
mtx[icol * N + l] *= pinv;
for(ll=0;ll<N;ll++)
if(ll != icol)
{
dum = mtx[ll * N + icol];
mtx[ll * N + icol] = 0.0;
for(l=0;l<N;l++)
mtx[ll * N + l] -= mtx[icol * N + l]*dum;
}
}
/* unscramble matrix */
for (l=N-1;l>=0;l--)
{
if (indxr[l] != indxc[l])
for (k=0;k<N;k++)
{
temp = mtx[k * N + indxr[l]];
mtx[k * N + indxr[l]] = mtx[k * N + indxc[l]];
mtx[k * N + indxc[l]] = temp;
}
}
return 0;
error_exit:
return -1;
}
/*
get the asolute maximum of an array
*/
static float absmaxv(float *v, int N)
{
float answer;
int i;
for(i=0;i<N;i++)
if(answer < fabs(v[i]))
answer = fabs(v[i]);
return answer;
}
#include <stdio.h>
/*
debug utlitiy
*/
static void printmtx(FILE *fp, MATRIX *mtx)
{
int i, ii;
for(i=0;i<4;i++)
{
for(ii=0;ii<4;ii++)
fprintf(fp, "%f, ", mtx->m[i][ii]);
fprintf(fp, "\n");
}
}
int rmsdmain(void)
{
float one[4*3] = {0,0,0, 1,0,0, 2,1,0, 0,3,1};
float two[4*3] = {0,0,0, 0,1,0, 1,2,0, 3,0,1};
MATRIX mtx;
double diff;
int i;
diff = rmsd(one, two, 4, (float *) &mtx.m);
printf("%f\n", diff);
printmtx(stdout, &mtx);
for(i=0;i<4;i++)
{
mulpt(&mtx, two + i * 3);
printf("%f %f %f\n", two[i*3], two[i*3+1], two[i*3+2]);
}
return 0;
}
I took #vagran's implementation and added RANSAC on top of it, since estimateRigidTransform2d does it and it was helpful for me since my data is noisy. (Note: This code doesn't have constant scaling along all 3 axes; you can add it back in easily by comparing to vargran's).
cv::Vec3f CalculateMean(const cv::Mat_<cv::Vec3f> &points)
{
if(points.size().height == 0){
return 0;
}
assert(points.size().width == 1);
double mx = 0.0;
double my = 0.0;
double mz = 0.0;
int n_points = points.size().height;
for(int i = 0; i < n_points; i++){
double x = double(points(i)[0]);
double y = double(points(i)[1]);
double z = double(points(i)[2]);
mx += x;
my += y;
mz += z;
}
return cv::Vec3f(mx/n_points, my/n_points, mz/n_points);
}
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> points2)
{
/* Calculate centroids. */
cv::Vec3f t1 = CalculateMean(points1);
cv::Vec3f t2 = CalculateMean(points2);
cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
T1(0, 3) = double(-t1[0]);
T1(1, 3) = double(-t1[1]);
T1(2, 3) = double(-t1[2]);
cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
T2(0, 3) = double(t2[0]);
T2(1, 3) = double(t2[1]);
T2(2, 3) = double(t2[2]);
/* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
* which is used for scale calculation.
*/
cv::Mat_<double> C(3, 3, 0.0);
for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
cv::Vec3f p1 = points1(ptIdx) - t1;
cv::Vec3f p2 = points2(ptIdx) - t2;
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
C(i, j) += double(p2[i] * p1[j]);
}
}
}
cv::Mat_<double> u, s, vt;
cv::SVD::compute(C, s, u, vt);
cv::Mat_<double> R = u * vt;
if (cv::determinant(R) < 0) {
R -= u.col(2) * (vt.row(2) * 2.0);
}
cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
R.copyTo(M.colRange(0, 3).rowRange(0, 3));
cv::Mat_<double> result = T2 * M * T1;
result /= result(3, 3);
return result;
}
cv::Mat_<double> RANSACFindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> &points2)
{
cv::Mat points1Homo;
cv::convertPointsToHomogeneous(points1, points1Homo);
int iterations = 100;
int min_n_points = 3;
int n_points = points1.size().height;
std::vector<int> range(n_points);
cv::Mat_<double> best;
int best_inliers = -1;
// inlier points should be projected within this many units
float threshold = .02;
std::iota(range.begin(), range.end(), 0);
auto gen = std::mt19937{std::random_device{}()};
for(int i = 0; i < iterations; i++) {
std::shuffle(range.begin(), range.end(), gen);
cv::Mat_<cv::Vec3f> points1subset(min_n_points, 1, cv::Vec3f(0,0,0));
cv::Mat_<cv::Vec3f> points2subset(min_n_points, 1, cv::Vec3f(0,0,0));
for(int j = 0; j < min_n_points; j++) {
points1subset(j) = points1(range[j]);
points2subset(j) = points2(range[j]);
}
cv::Mat_<float> rigidT = FindRigidTransform(points1subset, points2subset);
cv::Mat_<float> rigidT_float = cv::Mat::eye(4, 4, CV_32F);
rigidT.convertTo(rigidT_float, CV_32F);
std::vector<int> inliers;
for(int j = 0; j < n_points; j++) {
cv::Mat_<float> t1_3d = rigidT_float * cv::Mat_<float>(points1Homo.at<cv::Vec4f>(j));
if(t1_3d(3) == 0) {
continue; // Avoid 0 division
}
float dx = (t1_3d(0)/t1_3d(3) - points2(j)[0]);
float dy = (t1_3d(1)/t1_3d(3) - points2(j)[1]);
float dz = (t1_3d(2)/t1_3d(3) - points2(j)[2]);
float square_dist = dx * dx + dy * dy + dz * dz;
if(square_dist < threshold * threshold){
inliers.push_back(j);
}
}
int n_inliers = inliers.size();
if(n_inliers > best_inliers) {
best_inliers = n_inliers;
best = rigidT;
}
}
return best;
}
#vagran Thanks for the code! Seems to work very well.
I do have a little terminology suggestion though. Since you are estimating and applying a scale during the transformation, it is a 7-parameter transformation, or Helmert / similarity transformation. And in a rigid transformation, no scaling is applied because all Euclidiean distances need to be reserved.
I would've added this as comment, but don't have enough points.. D: sorry for that.
rigid transformation: https://en.wikipedia.org/wiki/Rigid_transformation
Helmert transformation: https://www.researchgate.net/publication/322841143_Parameter_estimation_in_3D_affine_and_similarity_transformation_implementation_of_variance_component_estimation