I need to convert this code into C code.
Questions:
Will MATLAB Coder generate C code that are memory safe, e.g they not using calloc or malloc. Misra C standard does not allow coder to use dynamical memory allocation. It's dangerous for embedded system due to memory leaks.
Will MATLAB Coder generate C code with dynamical matrix as argument e.g. functions with arguments foo(float* A, int m, int n) or foo(int m, int n, float A[m][n]) or is fix size example foo(float A[3][5]), only available as option?
Will MATLAB Coder generate C code that can be fitted into an embedded system. How about the internal C++ commands in the .m files such as horzcat, size and vertcat? Will they become 100% portable C-code?
Will MATLAB Coder generate functions that have call by reference? Example foo(float* input, float* output) instead of float* output = foo(float* input)
function [U] = mpc (A, B, C, x, N, r, lb)
## Find matrix
PHI = phiMat(A, C, N);
GAMMA = gammaMat(A, B, C, N);
## Solve first with no constraints
U = solve(PHI, GAMMA, x, N, r, 0, 0, false);
## Then use the last U as upper bound
U = solve(PHI, GAMMA, x, N, r, lb, U(end), true);
end
function U = solve(PHI, GAMMA, x, N, r, lb, ub, constraints)
## Set U
U = zeros(N, 1);
## Iterate Gaussian Elimination
for i = 1:N
## Solve u
if(i == 1)
u = (r - PHI(i,:)*x)/GAMMA(i,i)
else
u = (r - PHI(i,:)*x - GAMMA(i,1:i-1)*U(1:i-1) )/GAMMA(i,i)
end
## Constraints
if(constraints == true)
if(u > ub)
u = ub;
elseif(u < lb)
u = lb;
end
end
## Save u
U(i) = u
end
end
function PHI = phiMat(A, C, N)
## Create the special Observabillity matrix
PHI = [];
for i = 1:N
PHI = vertcat(PHI, C*A^i);
end
end
function GAMMA = gammaMat(A, B, C, N)
## Create the lower triangular toeplitz matrix
GAMMA = [];
for i = 1:N
GAMMA = horzcat(GAMMA, vertcat(zeros((i-1)*size(C*A*B, 1), size(C*A*B, 2)),cabMat(A, B, C, N-i+1)));
end
end
function CAB = cabMat(A, B, C, N)
## Create the column for the GAMMA matrix
CAB = [];
for i = 0:N-1
CAB = vertcat(CAB, C*A^i*B);
end
end
My C-code. Yes its working!
/*
* Generalized_Predictive_Control.c
*
* Created on:
* Author:
*/
#include "Generalized_Predictive_Control.h"
/*
* Parameters
*/
int adim;
int ydim;
int rdim;
int horizon;
/*
* Deceleration
*/
static void obsv(float* PHI, const float* A, const float* C);
static void kalman(float* x, const float* A, const float* B, float* u, const float* K, float* y, const float* C);
static void mul(float* A, float* B, float* C, int row_a, int column_a, int column_b);
static void tran(float* A, int row, int column);
static void CAB(float* GAMMA, float* PHI, const float* A, const float* B, const float* C);
static void solve(float* GAMMA, float* PHI, float* x, float* u, float* r, float lb, float ub, int constraintsON);
static void print(float* A, int row, int column);
void GPC(int adim_, int ydim_, int rdim_, int horizon_, const float* A, const float* B, const float* C, const float* D, const float* K, float* u, float* r, float* y, float* x){
/*
* Set the dimensions
*/
adim = adim_;
ydim = ydim_;
rdim = rdim_;
horizon = horizon_;
/*
* Identify the model - Extended Least Square
*/
int n = 5;
float* phi;
float* theta;
//els(phi, theta, n, y, u, P);
/*
* Create a state space model with Observable canonical form
*/
/*
* Create the extended observability matrix
*/
float PHI[horizon*ydim*adim];
memset(PHI, 0, horizon*ydim*adim*sizeof(float));
obsv(PHI, A, C);
/*
* Create the lower triangular toeplitz matrix
*/
float GAMMA[horizon*rdim*horizon*ydim];
memset(GAMMA, 0, horizon*rdim*horizon*ydim*sizeof(float));
CAB(GAMMA, PHI, A, B, C);
/*
* Solve the best input value
*/
solve(GAMMA, PHI, x, u, r, 0, 0, 0);
solve(GAMMA, PHI, x, u, r, 0, *(u), 1);
/*
* Estimate the state vector
*/
kalman(x, A, B, u, K, y, C);
}
/*
* Identify the model
*/
static void els(float* P, float* phi, float* theta, int polyLength, int totalPolyLength, float* y, float* u, float* e){
/*
* move phi with the inputs, outputs, errors one step to right
*/
for(int i = 0; i < polyLength; i++){
*(phi + i+1 + totalPolyLength*0) = *(phi + i + totalPolyLength*0); // Move one to right for the y's
*(phi + i+1 + totalPolyLength*1) = *(phi + i + totalPolyLength*1); // Move one to right for the u's
*(phi + i+1 + totalPolyLength*2) = *(phi + i + totalPolyLength*2); // Move one to right for the e's
}
/*
* Add the current y, u and e
(*phi + totalPolyLength*0) = -*(y + 0); // Need to be negative!
(*phi + totalPolyLength*1) = *(u + 0);
(*phi + totalPolyLength*2) = *(e + 0);
*/
/*
* phi'*theta
*/
float y_est = 0;
for(int i = 0; i < totalPolyLength; i++){
y_est += *(phi + i) * *(theta + i);
}
float epsilon = *(y + 0) - y_est; // In this case, y is only one element array
/*
* phi*epsilon
*/
float phi_epsilon[totalPolyLength];
memset(phi_epsilon, 0, totalPolyLength*sizeof(float));
for(int i = 0; i < totalPolyLength; i++){
*(phi_epsilon + i) = *(phi + i) * epsilon;
}
/*
* P_vec = P*phi_epsilon
*/
float P_vec[totalPolyLength];
memset(P_vec, 0, totalPolyLength*sizeof(float));
mul(P, phi_epsilon, P_vec, totalPolyLength, totalPolyLength, 1);
/*
* Update our estimated vector theta = theta + P_vec
*/
for(int i = 0; i < totalPolyLength; i++){
*(theta + i) = *(theta + i) + *(P_vec + i);
}
/*
* Update P = P - (P*phi*phi'*P)/(1 + phi'*P*phi)
*/
// Create phi'
float phiT[totalPolyLength];
memset(phiT, 0, totalPolyLength*sizeof(float));
memcpy(phiT, phi, totalPolyLength*sizeof(float));
tran(phiT, totalPolyLength, 1);
// phi'*P
float phiT_P[totalPolyLength];
memset(phiT_P, 0, totalPolyLength*sizeof(float));
mul(phiT, P, phiT_P, 1, totalPolyLength, totalPolyLength);
// phi*phi'*P
float phi_phiT_P[totalPolyLength*totalPolyLength];
memset(phi_phiT_P, 0, totalPolyLength*totalPolyLength*sizeof(float));
mul(phi, phiT_P, phi_phiT_P, totalPolyLength, 1, totalPolyLength);
// P*phi*phi'*P
float P_phi_phiT_P[totalPolyLength*totalPolyLength];
memset(P_phi_phiT_P, 0, totalPolyLength*totalPolyLength*sizeof(float));
mul(P, phi_phiT_P, P_phi_phiT_P, totalPolyLength, totalPolyLength, totalPolyLength);
// P*phi
float P_phi[totalPolyLength];
memset(P_phi, 0, totalPolyLength*sizeof(float));
mul(P, phi, P_phi, totalPolyLength, totalPolyLength, 1);
// phi'*P*phi
float phiT_P_phi[1];
memset(phiT_P_phi, 0, 1*sizeof(float));
mul(phiT, P_phi, phiT_P_phi, 1, totalPolyLength, 1);
// P = P - (P_phi_phiT_P) / (1+phi'*P*phi)
for(int i = 0; i < totalPolyLength*totalPolyLength; i++){
*(P + i) = *(P + i) - *(P_phi_phiT_P + i) / (1 + *(phiT_P_phi));
}
}
/*
* This will solve if GAMMA is square!
*/
static void solve(float* GAMMA, float* PHI, float* x, float* u, float* r, float lb, float ub, int constraintsON){
/*
* Now we are going to solve on the form
* Ax=b, where b = (R*r-PHI*x) and A = GAMMA and x = U
*/
/*
* R_vec = R*r
*/
float R_vec[horizon*ydim];
memset(R_vec, 0, horizon*ydim*sizeof(float));
for(int i = 0; i < horizon*ydim; i++){
for (int j = 0; j < rdim; j++) {
*(R_vec + i + j) = *(r + j);
}
i += rdim-1;
}
/*
* PHI_vec = PHI*x
*/
float PHI_vec[horizon*ydim];
memset(PHI_vec, 0, horizon * ydim * sizeof(float));
mul(PHI, x, PHI_vec, horizon*ydim, adim, 1);
/*
* Solve now (R_vec - PHI_vec) = GAMMA*U
* Notice that this is ONLY for Square GAMMA with lower triangular toeplitz matrix e.g SISO case
* This using Gaussian Elimination backward substitution
*/
float U[horizon];
float sum = 0.0;
memset(U, 0, horizon*sizeof(float));
for(int i = 0; i < horizon; i++){
for(int j = 0; j < i; j++){
sum += *(GAMMA + i*horizon + j) * *(U + j);
}
float newU = (*(R_vec + i) - *(PHI_vec + i) - sum) / (*(GAMMA + i*horizon + i));
if(constraintsON == 1){
if(newU > ub)
newU = ub;
if(newU < lb)
newU = lb;
}
*(U + i) = newU;
sum = 0.0;
}
//print(U, horizon, 1);
/*
* Set last U to u
*/
if(constraintsON == 0){
*(u + 0) = *(U + horizon - 1);
}else{
*(u + 0) = *(U + 0);
}
}
/*
* Lower traingular toeplitz of extended observability matrix
*/
static void CAB(float* GAMMA, float* PHI, const float* A, const float* B, const float* C){
/*
* First create the initial C*A^0*B == C*I*B == C*B
*/
float CB[ydim*rdim];
memset(CB, 0, ydim*rdim*sizeof(float));
mul((float*)C, (float*)B, CB, ydim, adim, rdim);
/*
* Take the transpose of CB so it will have dimension rdim*ydim instead
*/
tran(CB, ydim, rdim);
/*
* Create the CAB matrix from PHI*B
*/
float PHIB[horizon*ydim*rdim];
mul(PHI, (float*) B, PHIB, horizon*ydim, adim, rdim); // CAB = PHI*B
tran(PHIB, horizon*ydim, rdim);
/*
* We insert GAMMA = [CB PHI;
* 0 CB PHI;
* 0 0 CB PHI;
* 0 0 0 CB PHI] from left to right
*/
for(int i = 0; i < horizon; i++) {
for(int j = 0; j < rdim; j++) {
memcpy(GAMMA + horizon*ydim*(i*rdim+j) + ydim*i, CB + ydim*j, ydim*sizeof(float)); // Add CB
memcpy(GAMMA + horizon*ydim*(i*rdim+j) + ydim*i + ydim, PHIB + horizon*ydim*j, (horizon-i-1)*ydim*sizeof(float)); // Add PHI*B
}
}
/*
* Transpose of gamma
*/
tran(GAMMA, horizon*rdim, horizon*ydim);
//print(CB, rdim, ydim);
//print(PHIB, rdim, horizon*ydim);
//print(GAMMA, horizon*ydim, horizon*rdim);
}
/*
* Transpose
*/
static void tran(float* A, int row, int column) {
float B[row*column];
float* transpose;
float* ptr_A = A;
for (int i = 0; i < row; i++) {
transpose = &B[i];
for (int j = 0; j < column; j++) {
*transpose = *ptr_A;
ptr_A++;
transpose += row;
}
}
// Copy!
memcpy(A, B, row*column*sizeof(float));
}
/*
* [C*A^1; C*A^2; C*A^3; ... ; C*A^horizon] % Extended observability matrix
*/
static void obsv(float* PHI, const float* A, const float* C){
/*
* This matrix will A^(i+1) all the time
*/
float A_pow[adim*adim];
memset(A_pow, 0, adim * adim * sizeof(float));
float A_copy[adim*adim];
memcpy(A_copy, (float*) A, adim * adim * sizeof(float));
/*
* Temporary matrix
*/
float T[ydim*adim];
memset(T, 0, ydim * adim * sizeof(float));
/*
* Regular T = C*A^(1+i)
*/
mul((float*) C, (float*) A, T, ydim, adim, adim);
/*
* Insert temporary T into PHI
*/
memcpy(PHI, T, ydim*adim*sizeof(float));
/*
* Do the rest C*A^(i+1) because we have already done i = 0
*/
for(int i = 1; i < horizon; i++){
mul((float*) A, A_copy, A_pow, adim, adim, adim); // Matrix power A_pow = A*A_copy
mul((float*) C, A_pow, T, ydim, adim, adim); // T = C*A^(1+i)
memcpy(PHI + i*ydim*adim, T, ydim*adim*sizeof(float)); // Insert temporary T into PHI
memcpy(A_copy, A_pow, adim * adim * sizeof(float)); // A_copy <- A_pow
}
}
/*
* x = Ax - KCx + Bu + Ky % Kalman filter
*/
static void kalman(float* x, const float* A, const float* B, float* u, const float* K, float* y, const float* C) {
/*
* Compute the vector A_vec = A*x
*/
float A_vec[adim*1];
memset(A_vec, 0, adim*sizeof(float));
mul((float*) A, x, A_vec, adim, adim, 1);
/*
* Compute the vector B_vec = B*u
*/
float B_vec[adim*1];
memset(B_vec, 0, adim*sizeof(float));
mul((float*) B, u, B_vec, adim, rdim, 1);
/*
* Compute the vector C_vec = C*x
*/
float C_vec[ydim*1];
memset(C_vec, 0, ydim*sizeof(float));
mul((float*) C, x, C_vec, ydim, adim, 1);
/*
* Compute the vector KC_vec = K*C_vec
*/
float KC_vec[adim*1];
memset(KC_vec, 0, adim*sizeof(float));
mul((float*) K, C_vec, KC_vec, adim, ydim, 1);
/*
* Compute the vector Ky_vec = K*y
*/
float Ky_vec[adim*1];
memset(Ky_vec, 0, adim*sizeof(float));
mul((float*) K, y, Ky_vec, adim, ydim, 1);
/*
* Now add x = A_vec - KC_vec + B_vec + Ky_vec
*/
for(int i = 0; i < adim; i++){
*(x + i) = *(A_vec + i) - *(KC_vec + i) + *(B_vec + i) + *(Ky_vec + i);
}
}
/*
* C = A*B
*/
static void mul(float* A, float* B, float* C, int row_a, int column_a, int column_b) {
// Data matrix
float* data_a = A;
float* data_b = B;
for (int i = 0; i < row_a; i++) {
// Then we go through every column of b
for (int j = 0; j < column_b; j++) {
data_a = &A[i * column_a];
data_b = &B[j];
*C = 0; // Reset
// And we multiply rows from a with columns of b
for (int k = 0; k < column_a; k++) {
*C += *data_a * *data_b;
data_a++;
data_b += column_b;
}
C++; // ;)
}
}
}
/*
* Print matrix or vector - Just for error check
*/
static void print(float* A, int row, int column) {
for (int i = 0; i < row; i++) {
for (int j = 0; j < column; j++) {
printf("%0.18f ", *(A++));
}
printf("\n");
}
printf("\n");
}
Disclaimer: I work on MATLAB Coder
There is a configuration setting to tell MATLAB Coder to generate code without using dynamically allocated memory or issue an error telling you why it can't do so.
cfg = coder.config('lib');
cfg.DynamicMemoryAllocation = 'Off';
codegen -config cfg ...
MATLAB Coder supports generating code with fixed-size arrays, variable-sized arrays, and dynamically allocated arrays. The various generated signature formats are shown in the documentation. For non-dynamically allocated variable-sized arrays, a common signature is something like: foo(x_data[100], x_size[2])
Yes, the generated code is generally portable and independent of MATLAB for the hardware you specify when generating code. The full list of available functions and classes supported for code generation is listed here. In a very small number of cases, the generated code needs to depend on libraries from MATLAB. Those cases will be called out in the documentation. Fundamental operations like horzcat and vertcat produce portable code that is independent of MATLAB.
Yes. For array outputs and MATLAB functions with multiple outputs, the generated code will return outputs by reference. It also supports passing an argument by reference in some cases when the corresponding MATLAB function has the same variable as an input and output: function A = foo(A,B) with a call like: y = foo(y,z); can produce something like void foo(double A[100], const double B[20]); where A is an input and output.
Related
I have two vectors a and b. Each vector contains the coordinates of a 3d points x, y, z vector3f.
struct Vector3f
{
float x;
float y;
float z;
}
vector a has a size of n = 5000 points and vector b has a size of m = 4000. I need to do a tensor vector product between them like on the right side of the picture. the resulted vector should have a length size of 5000 * 4000 and contain float point where results are stored at c.
__global__ void tensor3dProdcutClassic(const int n, const int m, const Vector3f *a, const Vector3f *b, float *c) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
// int j = blockIdy.y * blockDim.y + threadIdx.y;
//check if the idx is out of range
if (i < n) {
for (int j = 0; j < m; j++) {
int idx = j + m * i;
c[idx] = a[i].x * b[j].x + a[i].y * b[j].y + a[i].z * b[j].z;
}
}
}
dim3 blockSize(32, 1, 1);
dim3 gridSize((n + blockSize.x - 1) / blockSize.x, 1, 1);
tensor3dProdcutClassic<<<gridSize, blockSize>>>(n, m, x, y, out);
I get high execution time on Volta arch which is a lot.
My question is how can I optimize the kernel to reduce time which is mainly because of the for loop inside the kernel. I know here that all global reads and writes are not coalesced.
You can make the kernel move through both a and b simultaneously, like this:
__global__ void tensor3dProdcutClassic(const int n, const int m, const Vector3f *a, const Vector3f *b, float *c)
{
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdy.y * blockDim.y + threadIdx.y;
if (i < n && j < m)
{
int idx = j + m * i;
c[idx] = a[i].x * b[j].x + a[i].y * b[j].y + a[i].z * b[j].z;
}
}
dim3 blockSize(32, 32);
dim3 gridSize((int)ceil(n / 32.0), (int)ceil(m / 32.0));
tensor3dProdcutClassic<<<gridSize, blockSize>>>(n, m, x, y, out);
Update
I tried to modify the code to use a single array with and without shared memory, the code without shared memory with always faster 3 or 4 times.
With shared memory:
#define BLOCK_SIZE 32
void tensor3dProdcut(const int n, const int m, const float* a, const float* b, float* c)
{
float* d_a;
size_t size = (uint64_t)n * 3 * sizeof(float);
cudaMalloc(&d_a, size);
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
float* d_b;
size = (uint64_t)m * 3 * sizeof(float);
cudaMalloc(&d_b, size);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
float* d_c;
size = (uint64_t)n * m * sizeof(float);
cudaMalloc(&d_c, size);
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid((int)ceil((double)n / BLOCK_SIZE), (int)ceil((double)m / BLOCK_SIZE));
tensor3dProdcutKernel<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, n, m);
cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
}
__global__ void tensor3dProdcutKernel(float* a, float* b, float* c, int n, int m)
{
int i, blockRow, blockCol, row, col;
float Cvalue;
blockRow = blockIdx.x;
blockCol = blockIdx.y;
row = threadIdx.x;
col = threadIdx.y;
if (blockRow * BLOCK_SIZE + row >= n || blockCol * BLOCK_SIZE + col >= m)
return;
__shared__ double as[BLOCK_SIZE][3];
__shared__ double bs[BLOCK_SIZE][3];
for (i = 0; i < 3; i++)
{
as[row][i] = a[(BLOCK_SIZE * blockRow + row) * 3 + i];
bs[col][i] = b[(BLOCK_SIZE * blockCol + col) * 3 + i];
}
__syncthreads();
Cvalue = 0;
for (i = 0; i < 3; i++)
Cvalue += as[row][i] * bs[col][i];
c[(BLOCK_SIZE * blockRow + row) * m + BLOCK_SIZE * blockCol + col] = Cvalue;
}
Without shared memory:
__global__ void tensor3dProdcutKernel(float* a, float* b, float* c, int n, int m)
{
int i, blockRow, blockCol, row, col;
float Cvalue;
blockRow = blockIdx.x;
blockCol = blockIdx.y;
row = threadIdx.x;
col = threadIdx.y;
if (blockRow * BLOCK_SIZE + row >= n || blockCol * BLOCK_SIZE + col >= m)
return;
Cvalue = 0;
for (i = 0; i < 3; i++)
Cvalue += a[(BLOCK_SIZE * blockRow + row) * 3 + i] * b[(BLOCK_SIZE * blockCol + col) * 3 + i];
c[(BLOCK_SIZE * blockRow + row) * m + BLOCK_SIZE * blockCol + col] = Cvalue;
}
Disclaimer: Im a cuda beginner.
typedef struct
{
int row_;
int col_;
float* element_;
int step;
}Matrix_t;
#define BLOCK_SIZE 64
__device__ float getElement(const Matrix_t A, int row, int col);
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col);
__device__ void setElement(Matrix_t A, int row, int col, float value);
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_);
float Matrix_dot_(float* M_dev_1, float* M_dev_2, int Number_of_cols, int Number_of_rows, int step);
the Matrix_t are used to link a cv::cuda::GpuMat to the C interface via the ptr() operator to get the GPU pointer to element.
__device__ float getElement(const Matrix_t A, int row, int col)
{
return A.element_[row* A.step + col];
}
__device__ void setElement(Matrix_t A, int row, int col, float value)
{
A.element_[row*A.step + col] = value;
}
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col)
{
Matrix_t A_sub;
A_sub.row_ = BLOCK_SIZE;
A_sub.col_ = BLOCK_SIZE;
A_sub.step = A.step;
A_sub.element_ = &A.element_[A.step * BLOCK_SIZE * row + BLOCK_SIZE * col];
return A_sub;
}
Here is the kernel:
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
{
int blockRow = blockIdx.y;
int blockCol = blockIdx.x;
float SubDotValue = 0.0f;
int row = threadIdx.y;
int col = threadIdx.x;
for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
{
//get subA & subB
Matrix_t A_sub = getSubMat(A, blockRow, m);
Matrix_t B_sub = getSubMat(B, blockRow, m);
//set Asub & Bsub to the __shared__ memory
__shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];
ASub[row][col] = getElement(A_sub, row, col);
BSub[row][col] = getElement(B_sub, row, col);
//Synchronize before calculations:
__syncthreads();
//Get the dot product of the vector Asub[] Bsub[]
for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
{
SubDotValue += ASub[row][el_] * BSub[row][el_];
}
__syncthreads();
}
dot_[row] = SubDotValue;
}
and the wrapper:
float Matrix_dot_(float* M_dev_1,float* M_dev_2, int Number_of_cols, int Number_of_rows, int step)
{
float retval = 0;
float* retval_partial;
float* retval_device;
Matrix_t A;
A.col_ = Number_of_cols;
A.row_ = Number_of_rows;
A.element_ = M_dev_1;
A.step = step;
Matrix_t B;
B.col_ = Number_of_cols;
B.row_ = Number_of_rows;
B.element_ = M_dev_2;
B.step = step;
retval_partial = (float*)malloc( B.row_*sizeof(float) );
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );
printf("\n Cuda malloc: %s", cudaGetErrorString(err));
std::cout<<std::flush;
dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
dim3 dimGrid(B.row_ / BLOCK_SIZE, B.col_ / BLOCK_SIZE);
MatrixDot<<<dimGrid, dimBlock>>>(A, B, retval_device);
err = cudaThreadSynchronize();
std::cout<<std::flush;
printf("\n Cuda kernel run: %s", cudaGetErrorString(err));
err = cudaMemcpy(retval_partial, retval_device, B.row_ / BLOCK_SIZE* sizeof(float), cudaMemcpyDeviceToHost);
printf("\n Cuda cudaMemcpy: %s", cudaGetErrorString(err));
err = cudaFree(retval_device);
printf("\n Cuda cudaFree: %s", cudaGetErrorString(err));
for(int i = 0; i<B.row_/ BLOCK_SIZE ; ++i)
{
retval+=retval_partial[i];
}
free(retval_partial);
return retval;
}
and the main:
int main(int argc, const char * argv[])
{
cv::cuda::DeviceInfo devInfo;
cv::cuda::setDevice(devInfo.deviceID());
cv::Mat cudatestA = cv::Mat(64*3, 64*3, CV_32FC1, 2);
cv::Mat cudatestB = cv::Mat(64*3, 64*3, CV_32FC1, 2);
double tr = (double) cv::getTickCount();
double res = cudatestA.dot(cudatestB);
tr = ((double)cv::getTickCount()-tr)/(double)cv::getTickFrequency();
cv::cuda::GpuMat ctA(cudatestA);
cv::cuda::GpuMat ctB(cudatestB);
double tm_ = (double) cv::getTickCount();
float res_m = 0;
res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step);
tm_ = ((double)cv::getTickCount()-tm_)/(double)cv::getTickFrequency();
printf("\nCPU: %0.4fms, res: %0.4f\nGPU_M: %0.4fms, res: %0.4f\n", tr*1000.0f, res, tm_*1000.0f,res_m);
return 0;
}
I'm currently stuck on various points:
1) it always output 0.
2) it can only work for matrix M*N Multiple of the defined BLOCK_SIZE (64).
for 1) I can't figure where my logic break, I could get the dot product to work on vector without any troubles but the matrix problem induced by the stride between each row prevent me to use the code (code deleted as the site tell me that there is too much code).
Partial answer:
In your kernel you aren't doing the good sum, nor taking the good elements, and your dim seems inverted
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
{
//int blockRow = blockIdx.y;
//int blockCol = blockIdx.x;
int blockRow = blockIdx.x;
int blockCol = blockIdx.y;
float SubDotValue = 0.0f;
//int row = threadIdx.y;
//int col = threadIdx.x;
int row = threadIdx.x;
int col = threadIdx.y;
for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
{
//get subA & subB
Matrix_t A_sub = getSubMat(A, m, blockCol);//getSubMat(A, blockRow, m)
Matrix_t B_sub = getSubMat(B, m, blockCol);//getSubMat(B, blockRow, m)
//set Asub & Bsub to the __shared__ memory
__shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];
ASub[row][col] = getElement(A_sub, row, col);
BSub[row][col] = getElement(B_sub, row, col);
//Synchronize before calculations:
__syncthreads();
//Get the dot product of the vector Asub[] Bsub[]
for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
{
SubDotValue += ASub[row][el_] * BSub[row][el_];
}
__syncthreads();
}
dot_[blockRow*BLOCK_SIZE + row] = SubDotValue; //dot_[row] = SubDotValue;
}
And your wrapper isn't also allocating the size you need:
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );
should be:
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_*sizeof(float) );
Note that other allocation related have to change too (Lazy me).
And your call in main need to divide the GpuMat step by the size of one element of the GpuMat
res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step/ctA.elemsize1());
You might also want to change your Matrix_t structure to use const float* instead of float to be able to use:
GpuMat_.ptr<float>();
instead of:
(float*)GpuMat.ptr();
Note that for a matrix of N rows you are starting N^2 threads doing the same thing. I don't have enough knowledge on Cuda to fix that.
So I'm trying to modify some code I found here to fit a different function, but my slightly modified version fails to converge and I don't understand why.
The function I'm trying to find the least squared fit for is "A + lambdalog(t) + blog(t)^2. Here's the code
main.cpp
#include <stdlib.h>
#include <stdio.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_multifit_nlin.h>
#include "expfit.c"
#define N 40 // All "N"s are 40
void print_state (size_t iter, gsl_multifit_fdfsolver * s);//The prototype of some function
int main (void){
const gsl_multifit_fdfsolver_type *T; //pointer to a newly allocated instance of a solver of type T
gsl_multifit_fdfsolver *s;
int status;
unsigned int i, iter = 0;
const size_t n = N; //N number of observations
const size_t p = 3;//3 parameters
gsl_matrix *covar = gsl_matrix_alloc (p, p); // creates a pxp gsl_matrix
double y[N], sigma[N]; //declares vector variables that will hold the noise data. They are "N" long
struct data d = { n, y, sigma}; //Populates struct d with variables n, y and sigma. Struct data is defined in expfit.c
gsl_multifit_function_fdf f;
double x_init[3] = { 0, -.1, -.1 }; //initial x values !These are initial guesses to the solution!
gsl_vector_view x = gsl_vector_view_array (x_init, p);//view arrays allow one to litterally view elements of a certain array without modifying or created a copy of the original array. Essentially a pointer to the original data.
const gsl_rng_type * type; // pointer to a new random number generator type. RNG type will be assigned later
gsl_rng * r; //Pointer to a new RNG
gsl_rng_env_setup();
type = gsl_rng_default; //Assigns random number generator type
r = gsl_rng_alloc (type); // Allocates memory for new RNG of type "type"
f.f = &logb_f;
f.df = &logb_df;
f.fdf = &logb_fdf;
f.n = n;
f.p = p;
f.params = &d;
for (i = 0; i < n; i++){// This is where the data is being generated
double t = i; // t is being redclared at each iteration for some reason wtf
if(t==0){//since log(0) is undefined, I said they equal 0 at t=0
y[i] = 2.0 -.5 * 0 - 0 + gsl_ran_gaussian (r, 0.1);
}else{
y[i] = 2.0 -.5 * log (t) - pow(log(t),2) + gsl_ran_gaussian (r, 0.1); //This is the noised up data
}
sigma[i] = .1; //not sure what this sigma does
printf ("data: %u %g %g\n", i, y[i], sigma[i]);//Printing out the data at each iteration
};
T = gsl_multifit_fdfsolver_lmsder; // Not sure what this is doing
s = gsl_multifit_fdfsolver_alloc (T, n, p);
gsl_multifit_fdfsolver_set (s, &f, &x.vector);
print_state (iter, s);
do{
iter++;
status = gsl_multifit_fdfsolver_iterate (s);
printf ("status = %s\n", gsl_strerror (status));
print_state (iter, s);
if (status)
break;
status = gsl_multifit_test_delta (s->dx, s->x, 1e-4, 1e-4);
}while (status == GSL_CONTINUE && iter < 500);
gsl_multifit_covar (s->J, 0.0, covar);
#define FIT(i) gsl_vector_get(s->x, i)
#define ERR(i) sqrt(gsl_matrix_get(covar,i,i))
{
double chi = gsl_blas_dnrm2(s->f);
double dof = n - p;
double c = GSL_MAX_DBL(1, chi / sqrt(dof));
printf("chisq/dof = %g\n", pow(chi, 2.0) / dof);
printf ("A = %.5f +/- %.5f\n", FIT(0), c*ERR(0));
printf ("lambda = %.5f +/- %.5f\n", FIT(1), c*ERR(1));
printf ("b = %.5f +/- %.5f\n", FIT(2), c*ERR(2));
}
printf ("status = %s\n", gsl_strerror (status));
gsl_multifit_fdfsolver_free (s);
gsl_matrix_free (covar);
gsl_rng_free (r);
return 0;
}
void print_state (size_t iter, gsl_multifit_fdfsolver * s){
printf ("iter: %3zu x = % 15.8f % 15.8f % 15.8f "
"|f(x)| = %g\n",
iter,
gsl_vector_get (s->x, 0),
gsl_vector_get (s->x, 1),
gsl_vector_get (s->x, 2),
gsl_blas_dnrm2 (s->f));
}
expfit.c
//
// expfit.c
// test
//
// Created by [] on 4/11/15.
// Copyright (c) 2015 []. All rights reserved.
//
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h>
#include <gsl/gsl_vector.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_multifit_nlin.h>
#include "expfit.h"
/* expfit.c -- model functions for exponential + background */
struct data {
size_t n;
double * y;
double * sigma;
};
int logb_f (const gsl_vector * x, void *data, gsl_vector * f){
size_t n = ((struct data *)data)->n;
double *y = ((struct data *)data)->y;
double *sigma = ((struct data *) data)->sigma;
double A = gsl_vector_get (x, 0);
double lambda = gsl_vector_get (x, 1);
double b = gsl_vector_get (x, 2);
double Yi;//will hold the value of the function to be stored into the vector set
double t;//time variable.
size_t i;//iterative variable
for (i = 0; i < n; i++){
/* Model Yi = A + lambda*log(i) + b*lambda*log(i)^2 */
t = i;
if(t==0){ //need if statement to bypass log(0) when t==0 since the value of log is undefined there
Yi = A + lambda * log(t) + b * pow(log(t),2);//function for t==0
}else{
Yi = A + lambda * log(t) + b * pow(log(t),2); // This is the function we used
}
gsl_vector_set (f, i, (Yi - y[i])/sigma[i]);
}
return GSL_SUCCESS;
}
int logb_df (const gsl_vector * x, void *data, gsl_matrix * J){
size_t n = ((struct data *)data)->n;
double *sigma = ((struct data *) data)->sigma;
//double A = gsl_vector_get (x, 0);
//double lambda = gsl_vector_get (x, 1);
//double b = gsl_vector_get(x,2);
size_t i;
for (i = 0; i < n; i++){
/* Jacobian matrix J(i,j) = dfi / dxj, */
/* where fi = (Yi - yi)/sigma[i], */
/* Yi = A + lambda*log(i) + b*log(i)^2 */
/* and the xj are the parameters (A,lambda,b) */
double t = i;
double s = sigma[i];
gsl_matrix_set (J, i, 0, 1/s);
gsl_matrix_set (J, i, 1, log(t)/s);
gsl_matrix_set (J, i, 2, pow(log(t),2)/s);
}
return GSL_SUCCESS;
}
int logb_fdf (const gsl_vector * x, void *data, gsl_vector * f, gsl_matrix * J){
logb_f (x, data, f);
logb_df (x, data, J);
return GSL_SUCCESS;
}
And here's the header file just incase you want it
//
// expfit.h
// test
//
// Created by [] on 4/11/15.
// Copyright (c) 2015 []. All rights reserved.
//
#ifndef __test__expfit__
#define __test__expfit__
#include <stdio.h>
#endif /* defined(__test__expfit__) */
When computing the fitting function, you consider the special case t=0 to avoid log(0), but the function values don't differ:
if(t==0){
Yi = A + lambda * log(t) + b * pow(log(t),2);//function for t==0
}else{
Yi = A + lambda * log(t) + b * pow(log(t),2); // This is the function we used
}
Furthermore, you don't consider this case when computing the derivatives.
So, I changed the function and the derivatives as follows:
int logb_f (const gsl_vector * x, void *data, gsl_vector * f){
size_t n = ((struct data *)data)->n;
double *y = ((struct data *)data)->y;
double *sigma = ((struct data *) data)->sigma;
double A = gsl_vector_get (x, 0);
double lambda = gsl_vector_get (x, 1);
double b = gsl_vector_get (x, 2);
double Yi;//will hold the value of the function to be stored into the vector set
double t;//time variable.
size_t i;//iterative variable
for (i = 0; i < n; i++){
/* Model Yi = A + lambda*log(i) + b*lambda*log(i)^2 */
t = i;
if(t==0){ //need if statement to bypass log(0) when t==0 since the value of log is undefined there
Yi = A ;
}else{
Yi = A + lambda * log(t) + b * pow(log(t),2); // This is the function we used
}
//Yi = A + lambda * log(t) + b * pow(log(t),2); // This is the function we used
gsl_vector_set (f, i, (Yi - y[i])/sigma[i]);
}
return GSL_SUCCESS;
}
int logb_df (const gsl_vector * x, void *data, gsl_matrix * J){
size_t n = ((struct data *)data)->n;
double *sigma = ((struct data *) data)->sigma;
//double A = gsl_vector_get (x, 0);
//double lambda = gsl_vector_get (x, 1);
//double b = gsl_vector_get(x,2);
size_t i;
for (i = 0; i < n; i++){
/* Jacobian matrix J(i,j) = dfi / dxj, */
/* where fi = (Yi - yi)/sigma[i], */
/* Yi = A + lambda*log(i) + b*log(i)^2 */
/* and the xj are the parameters (A,lambda,b) */
// d
double t = i;
double s = sigma[i];
if(i == 0)
{
gsl_matrix_set (J, i, 0, 0);
gsl_matrix_set (J, i, 1, 0);
gsl_matrix_set (J, i, 2, 0);
}
else
{
gsl_matrix_set (J, i, 0, 1/s);
gsl_matrix_set (J, i, 1, log(t)/s);
gsl_matrix_set (J, i, 2, pow(log(t),2)/s);
}
//Yi = A + lambda * log(t) + b * pow(log(t),2); // This is the function we used
}
return GSL_SUCCESS;
}
In this case the iteration converges:
iter: 0 x = 0.00000000 -0.10000000 -0.10000000 |f(x)| = 470.77
status = success
iter: 1 x = 2.08763815 -0.60282892 -0.97819822 |f(x)| = 5.5047
status = success
iter: 2 x = 2.08763815 -0.60282892 -0.97819822 |f(x)| = 5.5047
chisq/dof = 0.818964
A = 2.08764 +/- 0.08245
lambda = -0.60283 +/- 0.07702
b = -0.97820 +/- 0.01722
status = success
However, I recommend to verify the differentiability of the fit function for t -> 0.
If in doubt, you can also restrict the fit range in the above functions by only considering the values for t > 0:
for (i = 1; i < n; i++) ... instead of for (i = 0; i < n; i++) ...
I am trying to factorize a matrix with the QR factorization in C++, using Lapack's functions in order to solve a system of linear equations (Ax=b)
As far as I understood, dgeqrf computes the QR factorization and overwrites the input matrix. The output clearly contains values for L (upper triangular), but how do I obtain Q?
I tried dormqr, which is said to calculate Q from dgeqrf's output, but the result is the same matrix as in the previous call.
Here's my complete code:
boost::numeric::ublas::matrix<double> in_A(4, 3);
in_A(0, 0) = 1.0;
in_A(0, 1) = 2.0;
in_A(0, 2) = 3.0;
in_A(1, 1) = -3.0;
in_A(1, 2) = 2.0;
in_A(1, 3) = 1.0;
in_A(2, 1) = 2.0;
in_A(2, 2) = 0.0;
in_A(2, 3) = -1.0;
in_A(3, 1) = 3.0;
in_A(3, 2) = -1.0;
in_A(3, 3) = 2.0;
boost::numeric::ublas::vector<double> in_b(4);
in_b(0) = 2;
in_b(1) = 4;
in_b(2) = 6;
in_b(3) = 8;
int rows = in_A.size1();
int cols = in_A.size2();
double *A = (double *)malloc(rows*cols*sizeof(double));
double *b = (double *)malloc(in_b.size()*sizeof(double));
//Lapack has column-major order
for(size_t col=0; col<in_A.size2(); ++col)
{
for(size_t row = 0; row<in_A.size1(); ++row)
{
int D1_idx = col*in_A.size1() + row;
A[D1_idx] = in_A(row, col);
}
b[col] = in_b(col);
}
integer m = rows;
integer n = cols;
integer info = 0;
integer k = n; /* k = min(m,n); */
integer lda = m; /* lda = max(m,1); */
integer lwork = n; /* lwork = max(n,1); */
int max = lwork; /* max = max(lwork,1); */
double *work;
double *tau;
char *side = "L";
char *TR = "T";
integer one = 1;
int i;
double *vec;
work = (double *) malloc( max * sizeof( double ) );
tau = (double *) malloc( k * sizeof( double ) );
vec = (double *) malloc( m * sizeof( double ) );
memset(work, 0, max * sizeof(double));
memset(tau, 0, k * sizeof(double));
std::cout << std::endl;
for(size_t row = 0; row < rows; ++row)
{
for(size_t col = 0; col < cols; ++col)
{
size_t idx = col*rows + row;
std::cout << A[idx] << " ";
}
std::cout << std::endl;
}
dgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
//printf("tau[0] = %f tau[1] = %f\n",tau[0],tau[1]);
std::cout << std::endl;
for(size_t row = 0; row < rows; ++row)
{
for(size_t col = 0; col < cols; ++col)
{
size_t idx = col*rows + row;
std::cout << A[idx] << " ";
}
std::cout << std::endl;
}
memset(vec, 0, m * sizeof(double));
vec[2] = 1.0;
dormqr_(side, TR, &m, &one, &k, A, &lda, tau, vec, &lda, work, &lwork, &info);
free(vec);
free(tau);
free(work);
What's wrong with my code?
How can I factorize a matrix and solve a corresponding system of linear equations?
According to the documentation in
(http://www.netlib.org/lapack/explore-html/da/d82/dormqr_8f.html)
you are computing in vec the product Q^T*e3, where e3 is the third canonical basis vector (0,0,1,0,0,...,0). If you want to compute Q, then vec should contain a matrix sized array filled with the unit matrix, and TRANS should be "N".
dormqr (SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, WORK, LWORK, INFO)
SIDE = "L" for the normal QR decomposition with Q left,
TRANS = "N" to return QC in the place of C
A has layout LDA x K in memory, of which the upper M x K block is used and encodes K reflectors
tau contains the factors for the K reflectors
C has layout LDC x M in memory, of which the upper M x N block will be used to hold the result QC
For C to hold Q on return, C must be a square M x M matrix initialized as identity, i.e., with diagonal entries all 1.
You might consider to use the lapack numeric bindings provided for ublas, as in
(http://boost.2283326.n4.nabble.com/How-to-use-the-qr-decomposition-correctly-td2710159.html)
However, this project may be defunct or resting by now.
Lets start again from first principles: The aim is to solve Ax=b, or at least to minimize |Ax-b|+|x|. For that to be consistent one needs colsA=rowsx and rowsA=rowsb.
Now for the discussed code to work A has to be square or a tall rectangular matrix, colsA<=rowsA, so that the system is overdetermined.
Computation steps
Solve Q*R=A:
(http://www.netlib.no/netlib/lapack/double/dgeqrf.f)
DGEQRF( rowsA, colsA, A, rowsA, TAU, WORK, LWORK, INFO )
Multiply by QT to get QT*b as in R*x=QT*b
(http://www.netlib.no/netlib/lapack/double/dormqr.f)
DORMQR( 'L', 'T', rowsA, 1, colsA, A, rowsA, TAU, b, rowsA, WORK, LWORK, INFO )
Use back-substitution using the upper right part of A
(http://www.netlib.no/netlib/lapack/double/dtrtrs.f)
DTRTRS( 'U', 'N', 'N', colsA, 1, A, rowsA, b, rowsA, INFO )
Now the first colsA entries of b contain the solution vector x. The euclidean norm of the remaining entries at index colsA+1 and thereafter is the error |A*x-b| of the solution.
Remark: For the pure solution process there is no reason to compute 'Q' explicitly or to invoke the generic matrix multiplication DGEMM. These should be reserved for experiments to check if A-QR is sufficiently close to zero.
Remark: Explore the optimal allocation of the WORK array by performing a dry run with LWORK=-1.
To conclude some code that works, however, the connection between ublas and lapack seems suboptimal
#include "boost/numeric/ublas/matrix.hpp"
#include "boost/numeric/ublas/vector.hpp"
typedef boost::numeric::ublas::matrix<double> bmatrix;
typedef boost::numeric::ublas::vector<double> bvector;
namespace lapack {
extern "C" {
void dgeqrf_(int* M, int* N,
double* A, int* LDA, double* TAU,
double* WORK, int* LWORK, int* INFO );
void dormqr_(char* SIDE, char* TRANS,
int* M, int* N, int* K,
double* A, int* LDA, double* TAU,
double* C, int* LDC,
double* WORK, int* LWORK, int* INFO );
void dtrtrs_(char* UPLO, char* TRANS, char* DIAG,
int* N, int* NRHS,
double* A, int* LDA,
double* B, int* LDB,
int* INFO );
}
int geqrf(int m, int n,
double* A, int lda, double *tau) {
int info=0;
int lwork=-1;
double iwork;
dgeqrf_(&m, &n, A, &lda, tau,
&iwork, &lwork, &info);
lwork = (int)iwork;
double* work = new double[lwork];
dgeqrf_(&m, &n, A, &lda, tau,
work, &lwork, &info);
delete[] work;
return info;
}
int ormqr(char side, char trans, int m, int n, int k,
double *A, int lda, double *tau, double* C, int ldc) {
int info=0;
int lwork=-1;
double iwork;
dormqr_(&side, &trans, &m, &n, &k,
A, &lda, tau, C, &ldc, &iwork, &lwork, &info);
lwork = (int)iwork;
double* work = new double[lwork];
dormqr_(&side, &trans, &m, &n, &k,
A, &lda, tau, C, &ldc, work, &lwork, &info);
delete[] work;
return info;
}
int trtrs(char uplo, char trans, char diag,
int n, int nrhs,
double* A, int lda, double* B, int ldb
) {
int info = 0;
dtrtrs_(&uplo, &trans, &diag, &n, &nrhs,
A, &lda, B, &ldb, &info);
return info;
}
}
static void PrintMatrix(double A[], size_t rows, size_t cols) {
std::cout << std::endl;
for(size_t row = 0; row < rows; ++row)
{
for(size_t col = 0; col < cols; ++col)
{
// Lapack uses column major format
size_t idx = col*rows + row;
std::cout << A[idx] << " ";
}
std::cout << std::endl;
}
}
static int SolveQR(
const bmatrix &in_A, // IN
const bvector &in_b, // IN
bvector &out_x // OUT
) {
size_t rows = in_A.size1();
size_t cols = in_A.size2();
double *A = new double[rows*cols];
double *b = new double[in_b.size()];
//Lapack has column-major order
for(size_t col=0, D1_idx=0; col<cols; ++col)
{
for(size_t row = 0; row<rows; ++row)
{
// Lapack uses column major format
A[D1_idx++] = in_A(row, col);
}
b[col] = in_b(col);
}
for(size_t row = 0; row<rows; ++row)
{
b[row] = in_b(row);
}
// DGEQRF for Q*R=A, i.e., A and tau hold R and Householder reflectors
double* tau = new double[cols];
PrintMatrix(A, rows, cols);
lapack::geqrf(rows, cols, A, rows, tau);
PrintMatrix(A, rows, cols);
// DORMQR: to compute b := Q^T*b
lapack::ormqr('L', 'T', rows, 1, cols, A, rows, tau, b, rows);
PrintMatrix(b, rows, 1);
// DTRTRS: solve Rx=b by back substitution
lapack::trtrs('U', 'N', 'N', cols, 1, A, rows, b, rows);
for(size_t col=0; col<cols; col++) {
out_x(col)=b[col];
}
PrintMatrix(b,cols,1);
delete[] A;
delete[] b;
delete[] tau;
return 0;
}
int main() {
bmatrix in_A(4, 3);
in_A(0, 0) = 1.0; in_A(0, 1) = 2.0; in_A(0, 2) = 3.0;
in_A(1, 0) = -3.0; in_A(1, 1) = 2.0; in_A(1, 2) = 1.0;
in_A(2, 0) = 2.0; in_A(2, 1) = 0.0; in_A(2, 2) = -1.0;
in_A(3, 0) = 3.0; in_A(3, 1) = -1.0; in_A(3, 2) = 2.0;
bvector in_b(4);
in_b(0) = 2;
in_b(1) = 4;
in_b(2) = 6;
in_b(3) = 8;
bvector out_x(3);
SolveQR( in_A, in_b, out_x);
return 0;
}
While this is an old question, but if you are looking for a way to solve LLS using QR with LAPACK use dgels, it does the same as the answer above.
I have two 3D point clouds, and I'd like to use opencv to find the rigid transformation matrix (translation, rotation, constant scaling among all 3 axes).
I've found an estimateRigidTransformation function, but it's only for 2D points apparently
In addition, I've found estimateAffine3D, but it doesn't seem to support rigid transformation mode.
Do I need to just write my own rigid transformation function?
I did not find the required functionality in OpenCV so I have written my own implementation. Based on ideas from OpenSFM.
cv::Vec3d
CalculateMean(const cv::Mat_<cv::Vec3d> &points)
{
cv::Mat_<cv::Vec3d> result;
cv::reduce(points, result, 0, CV_REDUCE_AVG);
return result(0, 0);
}
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3d> &points1, const cv::Mat_<cv::Vec3d> points2)
{
/* Calculate centroids. */
cv::Vec3d t1 = -CalculateMean(points1);
cv::Vec3d t2 = -CalculateMean(points2);
cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
T1(0, 3) = t1[0];
T1(1, 3) = t1[1];
T1(2, 3) = t1[2];
cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
T2(0, 3) = -t2[0];
T2(1, 3) = -t2[1];
T2(2, 3) = -t2[2];
/* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
* which is used for scale calculation.
*/
cv::Mat_<double> C(3, 3, 0.0);
double p1Rms = 0, p2Rms = 0;
for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
cv::Vec3d p1 = points1(ptIdx, 0) + t1;
cv::Vec3d p2 = points2(ptIdx, 0) + t2;
p1Rms += p1.dot(p1);
p2Rms += p2.dot(p2);
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
C(i, j) += p2[i] * p1[j];
}
}
}
cv::Mat_<double> u, s, vh;
cv::SVD::compute(C, s, u, vh);
cv::Mat_<double> R = u * vh;
if (cv::determinant(R) < 0) {
R -= u.col(2) * (vh.row(2) * 2.0);
}
double scale = sqrt(p2Rms / p1Rms);
R *= scale;
cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
R.copyTo(M.colRange(0, 3).rowRange(0, 3));
cv::Mat_<double> result = T2 * M * T1;
result /= result(3, 3);
return result.rowRange(0, 3);
}
I've found PCL to be a nice adjunct to OpenCV. Take a look at their Iterative Closest Point (ICP) example. The provided example registers the two point clouds and then displays the rigid transformation.
Here's my rmsd code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
typedef struct
{
float m[4][4];
} MATRIX;
#define vdiff2(a,b) ( ((a)[0]-(b)[0]) * ((a)[0]-(b)[0]) + \
((a)[1]-(b)[1]) * ((a)[1]-(b)[1]) + \
((a)[2]-(b)[2]) * ((a)[2]-(b)[2]) )
static double alignedrmsd(float *v1, float *v2, int N);
static void centroid(float *ret, float *v, int N);
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx);
static void crossproduct(float *ans, float *pt1, float *pt2);
static void mtx_root(MATRIX *mtx);
static int almostequal(MATRIX *a, MATRIX *b);
static void mulpt(MATRIX *mtx, float *pt);
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y);
static void mtx_identity(MATRIX *mtx);
static void mtx_trans(MATRIX *mtx, float x, float y, float z);
static int mtx_invert(float *mtx, int N);
static float absmaxv(float *v, int N);
/*
calculate rmsd between two structures
Params: v1 - first set of points
v2 - second set of points
N - number of points
mtx - return for transfrom matrix used to align structures
Returns: rmsd score
Notes: mtx can be null. Transform will be rigid. Inputs must
be previously aligned for sequence alignment
*/
double rmsd(float *v1, float *v2, int N, float *mtx)
{
float cent1[3];
float cent2[3];
MATRIX tmtx;
MATRIX tempmtx;
MATRIX move1;
MATRIX move2;
int i;
double answer;
float *temp1 = 0;
float *temp2 = 0;
int err;
assert(N > 3);
temp1 = malloc(N * 3 * sizeof(float));
temp2 = malloc(N * 3 * sizeof(float));
if(!temp1 || !temp2)
goto error_exit;
centroid(cent1, v1, N);
centroid(cent2, v2, N);
for(i=0;i<N;i++)
{
temp1[i*3+0] = v1[i*3+0] - cent1[0];
temp1[i*3+1] = v1[i*3+1] - cent1[1];
temp1[i*3+2] = v1[i*3+2] - cent1[2];
temp2[i*3+0] = v2[i*3+0] - cent2[0];
temp2[i*3+1] = v2[i*3+1] - cent2[1];
temp2[i*3+2] = v2[i*3+2] - cent2[2];
}
err = getalignmtx(temp1, temp2, N, &tmtx);
if(err == -1)
goto error_exit;
mtx_trans(&move1, -cent2[0], -cent2[1], -cent2[2]);
mtx_mul(&tempmtx, &move1, &tmtx);
mtx_trans(&move2, cent1[0], cent1[1], cent1[2]);
mtx_mul(&tmtx, &tempmtx, &move2);
memcpy(temp2, v2, N * sizeof(float) * 3);
for(i=0;i<N;i++)
mulpt(&tmtx, temp2 + i * 3);
answer = alignedrmsd(v1, temp2, N);
free(temp1);
free(temp2);
if(mtx)
memcpy(mtx, &tmtx.m, 16 * sizeof(float));
return answer;
error_exit:
free(temp1);
free(temp2);
if(mtx)
{
for(i=0;i<16;i++)
mtx[i] = 0;
}
return sqrt(-1.0);
}
/*
calculate rmsd between two aligned structures (trivial)
Params: v1 - first structure
v2 - second structure
N - number of points
Returns: rmsd
*/
static double alignedrmsd(float *v1, float *v2, int N)
{
double answer =0;
int i;
for(i=0;i<N;i++)
answer += vdiff2(v1 + i *3, v2 + i * 3);
return sqrt(answer/N);
}
/*
compute the centroid
*/
static void centroid(float *ret, float *v, int N)
{
int i;
ret[0] = 0;
ret[1] = 0;
ret[2] = 0;
for(i=0;i<N;i++)
{
ret[0] += v[i*3+0];
ret[1] += v[i*3+1];
ret[2] += v[i*3+2];
}
ret[0] /= N;
ret[1] /= N;
ret[2] /= N;
}
/*
get the matrix needed to align two structures
Params: v1 - reference structure
v2 - structure to align
N - number of points
mtx - return for rigid body alignment matrix
Notes: only calculates rotation part of matrix.
assumes input has been aligned to centroids
*/
static int getalignmtx(float *v1, float *v2, int N, MATRIX *mtx)
{
MATRIX A = { {{0,0,0,0},{0,0,0,0},{0,0,0,0},{0,0,0,1}} };
MATRIX At;
MATRIX Ainv;
MATRIX temp;
float tv[3];
float tw[3];
float tv2[3];
float tw2[3];
int k, i, j;
int flag = 0;
float correction;
correction = absmaxv(v1, N * 3) * absmaxv(v2, N * 3);
for(k=0;k<N;k++)
for(i=0;i<3;i++)
for(j=0;j<3;j++)
A.m[i][j] += (v1[k*3+i] * v2[k*3+j])/correction;
while(flag < 3)
{
for(i=0;i<4;i++)
for(j=0;j<4;j++)
At.m[i][j] = A.m[j][i];
memcpy(&Ainv, &A, sizeof(MATRIX));
/* this will happen if all points are in a plane */
if( mtx_invert((float *) &Ainv, 4) == -1)
{
if(flag == 0)
{
crossproduct(tv, v1, v1+3);
crossproduct(tw, v2, v2+3);
}
else
{
crossproduct(tv2, tv, v1);
crossproduct(tw2, tw, v2);
memcpy(tv, tv2, 3 * sizeof(float));
memcpy(tw, tw2, 3 * sizeof(float));
}
for(i=0;i<3;i++)
for(j=0;j<3;j++)
A.m[i][j] += tv[i] * tw[j];
flag++;
}
else
flag = 5;
}
if(flag != 5)
return -1;
mtx_mul(&temp, &At, &A);
mtx_root(&temp);
mtx_mul(mtx, &temp, &Ainv);
return 0;
}
/*
get the crossproduct of two vectors.
Params: ans - return pinter for answer.
pt1 - first vector
pt2 - second vector.
Notes: crossproduct is at right angles to the two vectors.
*/
static void crossproduct(float *ans, float *pt1, float *pt2)
{
ans[0] = pt1[1] * pt2[2] - pt1[2] * pt2[1];
ans[1] = pt1[0] * pt2[2] - pt1[2] * pt2[0];
ans[2] = pt1[0] * pt2[1] - pt1[1] * pt2[0];
}
/*
Denman-Beavers square root iteration
*/
static void mtx_root(MATRIX *mtx)
{
MATRIX Y = *mtx;
MATRIX Z;
MATRIX Y1;
MATRIX Z1;
MATRIX invY;
MATRIX invZ;
MATRIX Y2;
int iter = 0;
int i, ii;
mtx_identity(&Z);
do
{
invY = Y;
invZ = Z;
if( mtx_invert((float *) &invY, 4) == -1)
return;
if( mtx_invert((float *) &invZ, 4) == -1)
return;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
Y1.m[i][ii] = 0.5 * (Y.m[i][ii] + invZ.m[i][ii]);
Z1.m[i][ii] = 0.5 * (Z.m[i][ii] + invY.m[i][ii]);
}
Y = Y1;
Z = Z1;
mtx_mul(&Y2, &Y, &Y);
}
while(!almostequal(&Y2, mtx) && iter++ < 20 );
*mtx = Y;
}
/*
Check two matrices for near-enough equality
Params: a - first matrix
b - second matrix
Returns: 1 if almost equal, else 0, epsilon 0.0001f.
*/
static int almostequal(MATRIX *a, MATRIX *b)
{
int i, ii;
float epsilon = 0.001f;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
if(fabs(a->m[i][ii] - b->m[i][ii]) > epsilon)
return 0;
return 1;
}
/*
multiply a point by a matrix.
Params: mtx - matrix
pt - the point (transformed)
*/
static void mulpt(MATRIX *mtx, float *pt)
{
float ans[4] = {0};
int i;
int ii;
for(i=0;i<4;i++)
{
for(ii=0;ii<3;ii++)
{
ans[i] += pt[ii] * mtx->m[ii][i];
}
ans[i] += mtx->m[3][i];
}
pt[0] = ans[0];
pt[1] = ans[1];
pt[2] = ans[2];
}
/*
multiply two matrices.
Params: ans - return pointer for answer.
x - first matrix
y - second matrix.
Notes: ans may not be equal to x or y.
*/
static void mtx_mul(MATRIX *ans, MATRIX *x, MATRIX *y)
{
int i;
int ii;
int iii;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
ans->m[i][ii] = 0;
for(iii=0;iii<4;iii++)
ans->m[i][ii] += x->m[i][iii] * y->m[iii][ii];
}
}
/*
create an identity matrix.
Params: mtx - return pointer.
*/
static void mtx_identity(MATRIX *mtx)
{
int i;
int ii;
for(i=0;i<4;i++)
for(ii=0;ii<4;ii++)
{
if(i==ii)
mtx->m[i][ii] = 1.0f;
else
mtx->m[i][ii] = 0;
}
}
/*
create a translation matrix.
Params: mtx - return pointer for matrix.
x - x translation.
y - y translation.
z - z translation
*/
static void mtx_trans(MATRIX *mtx, float x, float y, float z)
{
mtx->m[0][0] = 1;
mtx->m[0][1] = 0;
mtx->m[0][2] = 0;
mtx->m[0][3] = 0;
mtx->m[1][0] = 0;
mtx->m[1][1] = 1;
mtx->m[1][2] = 0;
mtx->m[1][3] = 0;
mtx->m[2][0] = 0;
mtx->m[2][1] = 0;
mtx->m[2][2] = 1;
mtx->m[2][3] = 0;
mtx->m[3][0] = x;
mtx->m[3][1] = y;
mtx->m[3][2] = z;
mtx->m[3][3] = 1;
}
/*
matrix invert routine
Params: mtx - the matrix in raw format, in/out
N - width and height
Returns: 0 on success, -1 on fail
*/
static int mtx_invert(float *mtx, int N)
{
int indxc[100]; /* these 100s are the only restriction on matrix size */
int indxr[100];
int ipiv[100];
int i, j, k;
int irow, icol;
double big;
double pinv;
int l, ll;
double dum;
double temp;
assert(N <= 100);
for(i=0;i<N;i++)
ipiv[i] = 0;
for(i=0;i<N;i++)
{
big = 0.0;
/* find biggest element */
for(j=0;j<N;j++)
if(ipiv[j] != 1)
for(k=0;k<N;k++)
if(ipiv[k] == 0)
if(fabs(mtx[j*N+k]) >= big)
{
big = fabs(mtx[j*N+k]);
irow = j;
icol = k;
}
ipiv[icol]=1;
if(irow != icol)
for(l=0;l<N;l++)
{
temp = mtx[irow * N + l];
mtx[irow * N + l] = mtx[icol * N + l];
mtx[icol * N + l] = temp;
}
indxr[i] = irow;
indxc[i] = icol;
/* if biggest element is zero matrix is singular, bail */
if(mtx[icol* N + icol] == 0)
goto error_exit;
pinv = 1.0/mtx[icol * N + icol];
mtx[icol * N + icol] = 1.0;
for(l=0;l<N;l++)
mtx[icol * N + l] *= pinv;
for(ll=0;ll<N;ll++)
if(ll != icol)
{
dum = mtx[ll * N + icol];
mtx[ll * N + icol] = 0.0;
for(l=0;l<N;l++)
mtx[ll * N + l] -= mtx[icol * N + l]*dum;
}
}
/* unscramble matrix */
for (l=N-1;l>=0;l--)
{
if (indxr[l] != indxc[l])
for (k=0;k<N;k++)
{
temp = mtx[k * N + indxr[l]];
mtx[k * N + indxr[l]] = mtx[k * N + indxc[l]];
mtx[k * N + indxc[l]] = temp;
}
}
return 0;
error_exit:
return -1;
}
/*
get the asolute maximum of an array
*/
static float absmaxv(float *v, int N)
{
float answer;
int i;
for(i=0;i<N;i++)
if(answer < fabs(v[i]))
answer = fabs(v[i]);
return answer;
}
#include <stdio.h>
/*
debug utlitiy
*/
static void printmtx(FILE *fp, MATRIX *mtx)
{
int i, ii;
for(i=0;i<4;i++)
{
for(ii=0;ii<4;ii++)
fprintf(fp, "%f, ", mtx->m[i][ii]);
fprintf(fp, "\n");
}
}
int rmsdmain(void)
{
float one[4*3] = {0,0,0, 1,0,0, 2,1,0, 0,3,1};
float two[4*3] = {0,0,0, 0,1,0, 1,2,0, 3,0,1};
MATRIX mtx;
double diff;
int i;
diff = rmsd(one, two, 4, (float *) &mtx.m);
printf("%f\n", diff);
printmtx(stdout, &mtx);
for(i=0;i<4;i++)
{
mulpt(&mtx, two + i * 3);
printf("%f %f %f\n", two[i*3], two[i*3+1], two[i*3+2]);
}
return 0;
}
I took #vagran's implementation and added RANSAC on top of it, since estimateRigidTransform2d does it and it was helpful for me since my data is noisy. (Note: This code doesn't have constant scaling along all 3 axes; you can add it back in easily by comparing to vargran's).
cv::Vec3f CalculateMean(const cv::Mat_<cv::Vec3f> &points)
{
if(points.size().height == 0){
return 0;
}
assert(points.size().width == 1);
double mx = 0.0;
double my = 0.0;
double mz = 0.0;
int n_points = points.size().height;
for(int i = 0; i < n_points; i++){
double x = double(points(i)[0]);
double y = double(points(i)[1]);
double z = double(points(i)[2]);
mx += x;
my += y;
mz += z;
}
return cv::Vec3f(mx/n_points, my/n_points, mz/n_points);
}
cv::Mat_<double>
FindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> points2)
{
/* Calculate centroids. */
cv::Vec3f t1 = CalculateMean(points1);
cv::Vec3f t2 = CalculateMean(points2);
cv::Mat_<double> T1 = cv::Mat_<double>::eye(4, 4);
T1(0, 3) = double(-t1[0]);
T1(1, 3) = double(-t1[1]);
T1(2, 3) = double(-t1[2]);
cv::Mat_<double> T2 = cv::Mat_<double>::eye(4, 4);
T2(0, 3) = double(t2[0]);
T2(1, 3) = double(t2[1]);
T2(2, 3) = double(t2[2]);
/* Calculate covariance matrix for input points. Also calculate RMS deviation from centroid
* which is used for scale calculation.
*/
cv::Mat_<double> C(3, 3, 0.0);
for (int ptIdx = 0; ptIdx < points1.rows; ptIdx++) {
cv::Vec3f p1 = points1(ptIdx) - t1;
cv::Vec3f p2 = points2(ptIdx) - t2;
for (int i = 0; i < 3; i++) {
for (int j = 0; j < 3; j++) {
C(i, j) += double(p2[i] * p1[j]);
}
}
}
cv::Mat_<double> u, s, vt;
cv::SVD::compute(C, s, u, vt);
cv::Mat_<double> R = u * vt;
if (cv::determinant(R) < 0) {
R -= u.col(2) * (vt.row(2) * 2.0);
}
cv::Mat_<double> M = cv::Mat_<double>::eye(4, 4);
R.copyTo(M.colRange(0, 3).rowRange(0, 3));
cv::Mat_<double> result = T2 * M * T1;
result /= result(3, 3);
return result;
}
cv::Mat_<double> RANSACFindRigidTransform(const cv::Mat_<cv::Vec3f> &points1, const cv::Mat_<cv::Vec3f> &points2)
{
cv::Mat points1Homo;
cv::convertPointsToHomogeneous(points1, points1Homo);
int iterations = 100;
int min_n_points = 3;
int n_points = points1.size().height;
std::vector<int> range(n_points);
cv::Mat_<double> best;
int best_inliers = -1;
// inlier points should be projected within this many units
float threshold = .02;
std::iota(range.begin(), range.end(), 0);
auto gen = std::mt19937{std::random_device{}()};
for(int i = 0; i < iterations; i++) {
std::shuffle(range.begin(), range.end(), gen);
cv::Mat_<cv::Vec3f> points1subset(min_n_points, 1, cv::Vec3f(0,0,0));
cv::Mat_<cv::Vec3f> points2subset(min_n_points, 1, cv::Vec3f(0,0,0));
for(int j = 0; j < min_n_points; j++) {
points1subset(j) = points1(range[j]);
points2subset(j) = points2(range[j]);
}
cv::Mat_<float> rigidT = FindRigidTransform(points1subset, points2subset);
cv::Mat_<float> rigidT_float = cv::Mat::eye(4, 4, CV_32F);
rigidT.convertTo(rigidT_float, CV_32F);
std::vector<int> inliers;
for(int j = 0; j < n_points; j++) {
cv::Mat_<float> t1_3d = rigidT_float * cv::Mat_<float>(points1Homo.at<cv::Vec4f>(j));
if(t1_3d(3) == 0) {
continue; // Avoid 0 division
}
float dx = (t1_3d(0)/t1_3d(3) - points2(j)[0]);
float dy = (t1_3d(1)/t1_3d(3) - points2(j)[1]);
float dz = (t1_3d(2)/t1_3d(3) - points2(j)[2]);
float square_dist = dx * dx + dy * dy + dz * dz;
if(square_dist < threshold * threshold){
inliers.push_back(j);
}
}
int n_inliers = inliers.size();
if(n_inliers > best_inliers) {
best_inliers = n_inliers;
best = rigidT;
}
}
return best;
}
#vagran Thanks for the code! Seems to work very well.
I do have a little terminology suggestion though. Since you are estimating and applying a scale during the transformation, it is a 7-parameter transformation, or Helmert / similarity transformation. And in a rigid transformation, no scaling is applied because all Euclidiean distances need to be reserved.
I would've added this as comment, but don't have enough points.. D: sorry for that.
rigid transformation: https://en.wikipedia.org/wiki/Rigid_transformation
Helmert transformation: https://www.researchgate.net/publication/322841143_Parameter_estimation_in_3D_affine_and_similarity_transformation_implementation_of_variance_component_estimation