I am trying to factorize a matrix with the QR factorization in C++, using Lapack's functions in order to solve a system of linear equations (Ax=b)
As far as I understand, dgeqrf computes the QR factorization and overwrites the input matrix. The output clearly contains the values of R (the upper triangular factor), but how do I obtain Q?
I tried dormqr, which is said to calculate Q from dgeqrf's output, but the result is the same matrix as in the previous call.
Here's my complete code:
// Build a 4x3 test matrix A and a 4-entry right-hand side b in Boost uBLAS containers,
// copy them into raw column-major arrays, factorize A with dgeqrf_ and then call dormqr_.
boost::numeric::ublas::matrix<double> in_A(4, 3);
in_A(0, 0) = 1.0;
in_A(0, 1) = 2.0;
in_A(0, 2) = 3.0;
// NOTE(review): in_A is 4x3, so valid column indices are 0..2. Rows 1..3 below are
// filled through columns 1..3: column 0 of those rows is never assigned and
// column index 3 is outside the declared size.
in_A(1, 1) = -3.0;
in_A(1, 2) = 2.0;
in_A(1, 3) = 1.0;
in_A(2, 1) = 2.0;
in_A(2, 2) = 0.0;
in_A(2, 3) = -1.0;
in_A(3, 1) = 3.0;
in_A(3, 2) = -1.0;
in_A(3, 3) = 2.0;
boost::numeric::ublas::vector<double> in_b(4);
in_b(0) = 2;
in_b(1) = 4;
in_b(2) = 6;
in_b(3) = 8;
int rows = in_A.size1();
int cols = in_A.size2();
// Raw buffers handed to LAPACK.
// NOTE(review): A and b are allocated with malloc here but no matching free() appears below.
double *A = (double *)malloc(rows*cols*sizeof(double));
double *b = (double *)malloc(in_b.size()*sizeof(double));
//Lapack has column-major order
for(size_t col=0; col<in_A.size2(); ++col)
{
for(size_t row = 0; row<in_A.size1(); ++row)
{
int D1_idx = col*in_A.size1() + row;
A[D1_idx] = in_A(row, col);
}
// NOTE(review): this copy runs once per column, so only b[0..cols-1] is filled
// although b was allocated with in_b.size() entries.
b[col] = in_b(col);
}
// Problem sizes and scratch buffers in the integer type expected by the LAPACK prototypes.
integer m = rows;
integer n = cols;
integer info = 0;
integer k = n; /* k = min(m,n); */
integer lda = m; /* lda = max(m,1); */
integer lwork = n; /* lwork = max(n,1); */
int max = lwork; /* max = max(lwork,1); */
double *work;
double *tau;
// SIDE / TRANS arguments for dormqr_ ("L", "T").
char *side = "L";
char *TR = "T";
integer one = 1;
int i;
double *vec;
work = (double *) malloc( max * sizeof( double ) );
tau = (double *) malloc( k * sizeof( double ) );
vec = (double *) malloc( m * sizeof( double ) );
memset(work, 0, max * sizeof(double));
memset(tau, 0, k * sizeof(double));
// Print A (stored column-major) row by row before the factorization.
std::cout << std::endl;
for(size_t row = 0; row < rows; ++row)
{
for(size_t col = 0; col < cols; ++col)
{
size_t idx = col*rows + row;
std::cout << A[idx] << " ";
}
std::cout << std::endl;
}
// In-place QR factorization: A and tau are overwritten by dgeqrf_.
// NOTE(review): info is not inspected after the call.
dgeqrf_(&m, &n, A, &lda, tau, work, &lwork, &info);
//printf("tau[0] = %f tau[1] = %f\n",tau[0],tau[1]);
// Print the overwritten A after the factorization.
std::cout << std::endl;
for(size_t row = 0; row < rows; ++row)
{
for(size_t col = 0; col < cols; ++col)
{
size_t idx = col*rows + row;
std::cout << A[idx] << " ";
}
std::cout << std::endl;
}
// vec is set to the third canonical basis vector (all zeros except vec[2] = 1)
// and passed to dormqr_ as the m x 1 matrix C, with side = "L" and TR = "T".
// NOTE(review): the result left in vec is neither printed nor used before it is freed.
memset(vec, 0, m * sizeof(double));
vec[2] = 1.0;
dormqr_(side, TR, &m, &one, &k, A, &lda, tau, vec, &lda, work, &lwork, &info);
free(vec);
free(tau);
free(work);
What's wrong with my code?
How can I factorize a matrix and solve a corresponding system of linear equations?
According to the documentation in
(http://www.netlib.org/lapack/explore-html/da/d82/dormqr_8f.html)
you are computing in vec the product Q^T*e3, where e3 is the third canonical basis vector (0,0,1,0,0,...,0). If you want to compute Q, then vec should contain a matrix sized array filled with the unit matrix, and TRANS should be "N".
dormqr (SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC, WORK, LWORK, INFO)
SIDE = "L" for the normal QR decomposition with Q left,
TRANS = "N" to return QC in the place of C
A has layout LDA x K in memory, of which the upper M x K block is used and encodes K reflectors
tau contains the factors for the K reflectors
C has layout LDC x M in memory, of which the upper M x N block will be used to hold the result QC
For C to hold Q on return, C must be a square M x M matrix initialized as identity, i.e., with diagonal entries all 1.
You might consider to use the lapack numeric bindings provided for ublas, as in
(http://boost.2283326.n4.nabble.com/How-to-use-the-qr-decomposition-correctly-td2710159.html)
However, this project may be defunct or resting by now.
Let's start again from first principles: the aim is to solve Ax=b, or at least to minimize |Ax-b|+|x|. For that to be consistent one needs colsA=rowsx and rowsA=rowsb.
Now for the discussed code to work A has to be square or a tall rectangular matrix, colsA<=rowsA, so that the system is overdetermined.
Computation steps
Solve Q*R=A:
(http://www.netlib.no/netlib/lapack/double/dgeqrf.f)
DGEQRF( rowsA, colsA, A, rowsA, TAU, WORK, LWORK, INFO )
Multiply by QT to get QT*b as in R*x=QT*b
(http://www.netlib.no/netlib/lapack/double/dormqr.f)
DORMQR( 'L', 'T', rowsA, 1, colsA, A, rowsA, TAU, b, rowsA, WORK, LWORK, INFO )
Use back-substitution using the upper right part of A
(http://www.netlib.no/netlib/lapack/double/dtrtrs.f)
DTRTRS( 'U', 'N', 'N', colsA, 1, A, rowsA, b, rowsA, INFO )
Now the first colsA entries of b contain the solution vector x. The euclidean norm of the remaining entries at index colsA+1 and thereafter is the error |A*x-b| of the solution.
Remark: For the pure solution process there is no reason to compute 'Q' explicitly or to invoke the generic matrix multiplication DGEMM. These should be reserved for experiments to check if A-QR is sufficiently close to zero.
Remark: Explore the optimal allocation of the WORK array by performing a dry run with LWORK=-1.
To conclude some code that works, however, the connection between ublas and lapack seems suboptimal
#include <iostream>
#include <vector>

#include "boost/numeric/ublas/matrix.hpp"
#include "boost/numeric/ublas/vector.hpp"
typedef boost::numeric::ublas::matrix<double> bmatrix;
typedef boost::numeric::ublas::vector<double> bvector;
// Thin C++ wrappers around the Fortran LAPACK routines used for a QR-based
// least-squares solve. All matrices are column-major, as LAPACK expects.
// Each wrapper returns LAPACK's INFO value: 0 on success, < 0 if argument
// number -INFO was illegal, > 0 for routine-specific numerical failures.
namespace lapack {
    extern "C" {
        void dgeqrf_(int* M, int* N,
                     double* A, int* LDA, double* TAU,
                     double* WORK, int* LWORK, int* INFO );
        void dormqr_(char* SIDE, char* TRANS,
                     int* M, int* N, int* K,
                     double* A, int* LDA, double* TAU,
                     double* C, int* LDC,
                     double* WORK, int* LWORK, int* INFO );
        void dtrtrs_(char* UPLO, char* TRANS, char* DIAG,
                     int* N, int* NRHS,
                     double* A, int* LDA,
                     double* B, int* LDB,
                     int* INFO );
    }

    // QR-factorize the m x n matrix A in place (A = Q*R). On return the upper
    // triangle of A holds R and the part below the diagonal, together with
    // tau (length min(m,n)), encodes the Householder reflectors forming Q.
    int geqrf(int m, int n,
              double* A, int lda, double *tau) {
        int info = 0;
        int lwork = -1;            // LWORK = -1 requests a workspace-size query only
        double work_query = 0.0;
        dgeqrf_(&m, &n, A, &lda, tau,
                &work_query, &lwork, &info);
        if (info != 0) {
            return info;           // query rejected: work_query is not meaningful
        }
        lwork = static_cast<int>(work_query);
        if (lwork < 1) {
            lwork = 1;             // LAPACK requires LWORK >= 1
        }
        std::vector<double> work(static_cast<std::size_t>(lwork));
        dgeqrf_(&m, &n, A, &lda, tau,
                work.data(), &lwork, &info);
        return info;
    }

    // Overwrite the m x n matrix C with Q*C, Q^T*C, C*Q or C*Q^T (selected by
    // side = 'L'/'R' and trans = 'N'/'T'), where Q is given by the k
    // reflectors stored in A and tau by a previous geqrf call.
    int ormqr(char side, char trans, int m, int n, int k,
              double *A, int lda, double *tau, double* C, int ldc) {
        int info = 0;
        int lwork = -1;            // workspace-size query
        double work_query = 0.0;
        dormqr_(&side, &trans, &m, &n, &k,
                A, &lda, tau, C, &ldc, &work_query, &lwork, &info);
        if (info != 0) {
            return info;
        }
        lwork = static_cast<int>(work_query);
        if (lwork < 1) {
            lwork = 1;
        }
        std::vector<double> work(static_cast<std::size_t>(lwork));
        dormqr_(&side, &trans, &m, &n, &k,
                A, &lda, tau, C, &ldc, work.data(), &lwork, &info);
        return info;
    }

    // Solve the triangular system op(A)*X = B for nrhs right-hand sides;
    // B (ldb x nrhs) is overwritten with X. A positive return value means a
    // diagonal entry of A is exactly zero, i.e. the matrix is singular.
    int trtrs(char uplo, char trans, char diag,
              int n, int nrhs,
              double* A, int lda, double* B, int ldb
              ) {
        int info = 0;
        dtrtrs_(&uplo, &trans, &diag, &n, &nrhs,
                A, &lda, B, &ldb, &info);
        return info;
    }
}
// Write a column-major rows x cols array to std::cout, one matrix row per
// output line, preceded by an empty line. Every value is followed by a space.
static void PrintMatrix(double A[], size_t rows, size_t cols) {
    std::cout << std::endl;
    for (size_t r = 0; r != rows; ++r) {
        // In column-major storage, consecutive entries of one matrix row
        // are `rows` elements apart.
        size_t offset = r;
        for (size_t c = 0; c != cols; ++c) {
            std::cout << A[offset] << " ";
            offset += rows;
        }
        std::cout << std::endl;
    }
}
static int SolveQR(
const bmatrix &in_A, // IN
const bvector &in_b, // IN
bvector &out_x // OUT
) {
size_t rows = in_A.size1();
size_t cols = in_A.size2();
double *A = new double[rows*cols];
double *b = new double[in_b.size()];
//Lapack has column-major order
for(size_t col=0, D1_idx=0; col<cols; ++col)
{
for(size_t row = 0; row<rows; ++row)
{
// Lapack uses column major format
A[D1_idx++] = in_A(row, col);
}
b[col] = in_b(col);
}
for(size_t row = 0; row<rows; ++row)
{
b[row] = in_b(row);
}
// DGEQRF for Q*R=A, i.e., A and tau hold R and Householder reflectors
double* tau = new double[cols];
PrintMatrix(A, rows, cols);
lapack::geqrf(rows, cols, A, rows, tau);
PrintMatrix(A, rows, cols);
// DORMQR: to compute b := Q^T*b
lapack::ormqr('L', 'T', rows, 1, cols, A, rows, tau, b, rows);
PrintMatrix(b, rows, 1);
// DTRTRS: solve Rx=b by back substitution
lapack::trtrs('U', 'N', 'N', cols, 1, A, rows, b, rows);
for(size_t col=0; col<cols; col++) {
out_x(col)=b[col];
}
PrintMatrix(b,cols,1);
delete[] A;
delete[] b;
delete[] tau;
return 0;
}
// Demo driver: set up a fixed 4x3 system and hand it to SolveQR.
int main() {
    const double coefficients[4][3] = {
        { 1.0,  2.0,  3.0},
        {-3.0,  2.0,  1.0},
        { 2.0,  0.0, -1.0},
        { 3.0, -1.0,  2.0},
    };
    const double rhs[4] = {2, 4, 6, 8};

    bmatrix in_A(4, 3);
    bvector in_b(4);
    for (unsigned row = 0; row < 4; ++row) {
        for (unsigned col = 0; col < 3; ++col) {
            in_A(row, col) = coefficients[row][col];
        }
        in_b(row) = rhs[row];
    }

    bvector out_x(3);
    SolveQR( in_A, in_b, out_x);
    return 0;
}
Although this is an old question: if you are looking for a way to solve linear least-squares (LLS) problems using QR with LAPACK, use dgels; it performs the same steps as the answer above.
Related
I need to convert this code into C code.
Questions:
Will MATLAB Coder generate C code that are memory safe, e.g they not using calloc or malloc. Misra C standard does not allow coder to use dynamical memory allocation. It's dangerous for embedded system due to memory leaks.
Will MATLAB Coder generate C code with dynamical matrix as argument e.g. functions with arguments foo(float* A, int m, int n) or foo(int m, int n, float A[m][n]) or is fix size example foo(float A[3][5]), only available as option?
Will MATLAB Coder generate C code that can be fitted into an embedded system. How about the internal C++ commands in the .m files such as horzcat, size and vertcat? Will they become 100% portable C-code?
Will MATLAB Coder generate functions that have call by reference? Example foo(float* input, float* output) instead of float* output = foo(float* input)
function [U] = mpc (A, B, C, x, N, r, lb)
  ## Two-pass predictive control: build the prediction matrices once, solve
  ## without constraints, then solve again with the last unconstrained input
  ## as the upper bound and lb as the lower bound.
  obsvMat = phiMat(A, C, N);
  toepMat = gammaMat(A, B, C, N);
  ## Pass 1: unconstrained solution
  unconstrained = solve(obsvMat, toepMat, x, N, r, 0, 0, false);
  ## Pass 2: constrained solution, bounded above by the last value of pass 1
  U = solve(obsvMat, toepMat, x, N, r, lb, unconstrained(end), true);
end
function U = solve(PHI, GAMMA, x, N, r, lb, ub, constraints)
  ## Solve GAMMA*U = r - PHI*x row by row (forward substitution on a lower
  ## triangular GAMMA), optionally clamping every input to [lb, ub].
  ##
  ## PHI, GAMMA   prediction matrices (see phiMat / gammaMat)
  ## x            current state vector
  ## N            horizon, number of inputs to compute
  ## r            reference value
  ## lb, ub       bounds, only applied when constraints == true
  ## Returns the N x 1 input sequence U. Nothing is printed: every
  ## assignment is terminated with a semicolon.
  U = zeros(N, 1);
  for i = 1:N
    ## For i == 1 the product GAMMA(i,1:0)*U(1:0) is empty and evaluates
    ## to 0, so no special case is needed for the first row.
    u = (r - PHI(i,:)*x - GAMMA(i,1:i-1)*U(1:i-1)) / GAMMA(i,i);
    ## Constraints
    if (constraints == true)
      if (u > ub)
        u = ub;
      elseif (u < lb)
        u = lb;
      end
    end
    U(i) = u;
  end
end
function PHI = phiMat(A, C, N)
  ## Stack C*A^1, C*A^2, ..., C*A^N on top of each other
  ## (the extended observability matrix used for prediction).
  PHI = [];
  for k = 1:N
    PHI = [PHI; C*A^k];
  end
end
function GAMMA = gammaMat(A, B, C, N)
  ## Build the lower triangular Toeplitz matrix column block by column
  ## block: block i is the cabMat column of length N-i+1 shifted down by
  ## i-1 zero blocks.
  GAMMA = [];
  for i = 1:N
    blk = C*A*B;                       # only its size is used
    padding = zeros((i-1)*size(blk, 1), size(blk, 2));
    columnBlock = [padding; cabMat(A, B, C, N-i+1)];
    GAMMA = [GAMMA, columnBlock];
  end
end
function CAB = cabMat(A, B, C, N)
  ## Stack C*A^0*B, C*A^1*B, ..., C*A^(N-1)*B: one column block of GAMMA.
  CAB = [];
  for k = 0:N-1
    CAB = [CAB; C*A^k*B];
  end
end
My C-code. Yes its working!
/*
* Generalized_Predictive_Control.c
*
* Created on:
* Author:
*/
#include "Generalized_Predictive_Control.h"
/*
* Parameters
*/
int adim;
int ydim;
int rdim;
int horizon;
/*
* Declarations
*/
static void obsv(float* PHI, const float* A, const float* C);
static void kalman(float* x, const float* A, const float* B, float* u, const float* K, float* y, const float* C);
static void mul(float* A, float* B, float* C, int row_a, int column_a, int column_b);
static void tran(float* A, int row, int column);
static void CAB(float* GAMMA, float* PHI, const float* A, const float* B, const float* C);
static void solve(float* GAMMA, float* PHI, float* x, float* u, float* r, float lb, float ub, int constraintsON);
static void print(float* A, int row, int column);
void GPC(int adim_, int ydim_, int rdim_, int horizon_, const float* A, const float* B, const float* C, const float* D, const float* K, float* u, float* r, float* y, float* x){
    /*
     * One controller step: build the prediction matrices from the state
     * space model (A, B, C), compute the next input u for reference r and
     * state x, then update the state estimate x from the measurement y
     * using the gain K.
     *
     * adim_, ydim_, rdim_, horizon_ are stored in the file-scope dimension
     * variables that all helper functions read.
     * D is accepted as part of the interface but is not used.
     */
    (void) D;

    /*
     * Set the dimensions
     */
    adim = adim_;
    ydim = ydim_;
    rdim = rdim_;
    horizon = horizon_;

    /*
     * Create the extended observability matrix
     */
    float PHI[horizon*ydim*adim];
    memset(PHI, 0, horizon*ydim*adim*sizeof(float));
    obsv(PHI, A, C);

    /*
     * Create the lower triangular toeplitz matrix
     */
    float GAMMA[horizon*rdim*horizon*ydim];
    memset(GAMMA, 0, horizon*rdim*horizon*ydim*sizeof(float));
    CAB(GAMMA, PHI, A, B, C);

    /*
     * Solve the best input value: first without constraints (u receives the
     * last element of the unconstrained sequence), then constrained with
     * that value as upper bound and 0 as lower bound.
     */
    solve(GAMMA, PHI, x, u, r, 0, 0, 0);
    solve(GAMMA, PHI, x, u, r, 0, *(u), 1);

    /*
     * Estimate the state vector
     */
    kalman(x, A, B, u, K, y, C);
}
/*
* Identify the model
*/
/*
 * One update step of a recursive parameter estimator: forms the prediction
 * error epsilon = y[0] - phi'*theta, updates theta by P*phi*epsilon and then
 * updates P by P - (P*phi*phi'*P)/(1 + phi'*P*phi).
 * NOTE(review): the only call to this function in GPC() is commented out.
 */
static void els(float* P, float* phi, float* theta, int polyLength, int totalPolyLength, float* y, float* u, float* e){
/*
* move phi with the inputs, outputs, errors one step to right
*/
/*
 * NOTE(review): the loop runs with ascending i and copies phi[i] into
 * phi[i+1], so each value written is the one stored in the previous
 * iteration; every touched slot ends up equal to the first element of its
 * segment. A right shift normally iterates from the end - confirm intent.
 * NOTE(review): the three segments are addressed with offsets of
 * totalPolyLength*k, while the rest of this function treats phi as a single
 * vector of totalPolyLength elements - confirm the intended layout.
 */
for(int i = 0; i < polyLength; i++){
*(phi + i+1 + totalPolyLength*0) = *(phi + i + totalPolyLength*0); // Move one to right for the y's
*(phi + i+1 + totalPolyLength*1) = *(phi + i + totalPolyLength*1); // Move one to right for the u's
*(phi + i+1 + totalPolyLength*2) = *(phi + i + totalPolyLength*2); // Move one to right for the e's
}
/*
* Add the current y, u and e
(*phi + totalPolyLength*0) = -*(y + 0); // Need to be negative!
(*phi + totalPolyLength*1) = *(u + 0);
(*phi + totalPolyLength*2) = *(e + 0);
*/
/* NOTE(review): the insertion of the current samples above is disabled (it sits inside the comment), so u and e are not read in this function. */
/*
* phi'*theta
*/
float y_est = 0;
for(int i = 0; i < totalPolyLength; i++){
y_est += *(phi + i) * *(theta + i);
}
float epsilon = *(y + 0) - y_est; // In this case, y is only one element array
/*
* phi*epsilon
*/
float phi_epsilon[totalPolyLength];
memset(phi_epsilon, 0, totalPolyLength*sizeof(float));
for(int i = 0; i < totalPolyLength; i++){
*(phi_epsilon + i) = *(phi + i) * epsilon;
}
/*
* P_vec = P*phi_epsilon
*/
float P_vec[totalPolyLength];
memset(P_vec, 0, totalPolyLength*sizeof(float));
mul(P, phi_epsilon, P_vec, totalPolyLength, totalPolyLength, 1);
/*
* Update our estimated vector theta = theta + P_vec
*/
for(int i = 0; i < totalPolyLength; i++){
*(theta + i) = *(theta + i) + *(P_vec + i);
}
/*
* Update P = P - (P*phi*phi'*P)/(1 + phi'*P*phi)
*/
// Create phi'
// (phi is copied and passed through tran() as a totalPolyLength x 1 matrix)
float phiT[totalPolyLength];
memset(phiT, 0, totalPolyLength*sizeof(float));
memcpy(phiT, phi, totalPolyLength*sizeof(float));
tran(phiT, totalPolyLength, 1);
// phi'*P
float phiT_P[totalPolyLength];
memset(phiT_P, 0, totalPolyLength*sizeof(float));
mul(phiT, P, phiT_P, 1, totalPolyLength, totalPolyLength);
// phi*phi'*P
float phi_phiT_P[totalPolyLength*totalPolyLength];
memset(phi_phiT_P, 0, totalPolyLength*totalPolyLength*sizeof(float));
mul(phi, phiT_P, phi_phiT_P, totalPolyLength, 1, totalPolyLength);
// P*phi*phi'*P
float P_phi_phiT_P[totalPolyLength*totalPolyLength];
memset(P_phi_phiT_P, 0, totalPolyLength*totalPolyLength*sizeof(float));
mul(P, phi_phiT_P, P_phi_phiT_P, totalPolyLength, totalPolyLength, totalPolyLength);
// P*phi
float P_phi[totalPolyLength];
memset(P_phi, 0, totalPolyLength*sizeof(float));
mul(P, phi, P_phi, totalPolyLength, totalPolyLength, 1);
// phi'*P*phi
// (a 1x1 result, used as the scalar in the denominator below)
float phiT_P_phi[1];
memset(phiT_P_phi, 0, 1*sizeof(float));
mul(phiT, P_phi, phiT_P_phi, 1, totalPolyLength, 1);
// P = P - (P_phi_phiT_P) / (1+phi'*P*phi)
for(int i = 0; i < totalPolyLength*totalPolyLength; i++){
*(P + i) = *(P + i) - *(P_phi_phiT_P + i) / (1 + *(phiT_P_phi));
}
}
/*
* This will solve if GAMMA is square!
*/
static void solve(float* GAMMA, float* PHI, float* x, float* u, float* r, float lb, float ub, int constraintsON){
    /*
     * Solve GAMMA*U = (R*r - PHI*x) for the input sequence U and write one
     * element of it to u[0]: the last element when constraintsON == 0,
     * otherwise the first element. With constraintsON == 1 every element
     * is clamped to [lb, ub] as it is computed.
     * Only valid for a square, lower triangular GAMMA (e.g. the SISO case).
     */
    const int n_out = horizon*ydim;

    /* R_vec: the reference r repeated in chunks of rdim entries */
    float R_vec[n_out];
    memset(R_vec, 0, n_out*sizeof(float));
    int base = 0;
    while(base < n_out){
        for(int j = 0; j < rdim; j++){
            R_vec[base + j] = r[j];
        }
        base += rdim;
    }

    /* PHI_vec = PHI*x */
    float PHI_vec[n_out];
    memset(PHI_vec, 0, n_out*sizeof(float));
    mul(PHI, x, PHI_vec, n_out, adim, 1);

    /* Row-by-row substitution: each row uses the inputs already computed */
    float U[horizon];
    memset(U, 0, horizon*sizeof(float));
    for(int row = 0; row < horizon; row++){
        const float* gamma_row = GAMMA + row*horizon;
        float known = 0.0;
        for(int col = 0; col < row; col++){
            known += gamma_row[col] * U[col];
        }
        float candidate = (R_vec[row] - PHI_vec[row] - known) / gamma_row[row];
        if(constraintsON == 1){
            if(candidate > ub)
                candidate = ub;
            if(candidate < lb)
                candidate = lb;
        }
        U[row] = candidate;
    }

    /* Hand back the element the caller asked for */
    u[0] = (constraintsON == 0) ? U[horizon - 1] : U[0];
}
/*
* Lower traingular toeplitz of extended observability matrix
*/
/*
 * Fill GAMMA from C*B and PHI*B. The blocks are first written column block
 * by column block (each as a contiguous run of floats) and the whole buffer
 * is transposed at the end. GAMMA is expected to be zero-initialised by the
 * caller: entries not covered by the memcpy calls below are never written.
 */
static void CAB(float* GAMMA, float* PHI, const float* A, const float* B, const float* C){
/*
* First create the initial C*A^0*B == C*I*B == C*B
*/
float CB[ydim*rdim];
memset(CB, 0, ydim*rdim*sizeof(float));
mul((float*)C, (float*)B, CB, ydim, adim, rdim);
/*
* Take the transpose of CB so it will have dimension rdim*ydim instead
*/
tran(CB, ydim, rdim);
/*
* Create the CAB matrix from PHI*B
*/
/* PHIB is not cleared first; mul() assigns every one of its horizon*ydim*rdim entries. */
float PHIB[horizon*ydim*rdim];
mul(PHI, (float*) B, PHIB, horizon*ydim, adim, rdim); // CAB = PHI*B
tran(PHIB, horizon*ydim, rdim);
/*
* We insert GAMMA = [CB PHI;
* 0 CB PHI;
* 0 0 CB PHI;
* 0 0 0 CB PHI] from left to right
*/
/*
 * For block i and input j the destination run starts at offset
 * horizon*ydim*(i*rdim+j) + ydim*i: ydim values of CB are copied there,
 * followed directly by the first (horizon-i-1)*ydim values of row j of the
 * transposed PHIB. For i == horizon-1 the second copy has length 0.
 */
for(int i = 0; i < horizon; i++) {
for(int j = 0; j < rdim; j++) {
memcpy(GAMMA + horizon*ydim*(i*rdim+j) + ydim*i, CB + ydim*j, ydim*sizeof(float)); // Add CB
memcpy(GAMMA + horizon*ydim*(i*rdim+j) + ydim*i + ydim, PHIB + horizon*ydim*j, (horizon-i-1)*ydim*sizeof(float)); // Add PHI*B
}
}
/*
* Transpose of gamma
*/
tran(GAMMA, horizon*rdim, horizon*ydim);
//print(CB, rdim, ydim);
//print(PHIB, rdim, horizon*ydim);
//print(GAMMA, horizon*ydim, horizon*rdim);
/* The parameter A is not referenced in this function. */
}
/*
* Transpose
*/
static void tran(float* A, int row, int column) {
    /*
     * In-place transpose of a row-major row x column matrix: on return A
     * holds the column x row transpose, also row-major. A temporary buffer
     * of row*column floats is used.
     */
    float scratch[row*column];
    for (int r = 0; r < row; r++) {
        for (int c = 0; c < column; c++) {
            /* element (r, c) of A becomes element (c, r) of the result */
            scratch[c*row + r] = A[r*column + c];
        }
    }
    memcpy(A, scratch, row*column*sizeof(float));
}
/*
* [C*A^1; C*A^2; C*A^3; ... ; C*A^horizon] % Extended observability matrix
*/
static void obsv(float* PHI, const float* A, const float* C){
    /*
     * Fill PHI with the stacked blocks C*A^1, C*A^2, ..., C*A^horizon.
     * Each block has ydim*adim floats and is written directly into its slot
     * in PHI. The first block is always written, as before.
     */
    const int block_len = ydim*adim;

    float power[adim*adim];  /* A^i for the block written last */
    float next[adim*adim];   /* A^(i+1) while it is being formed */
    memcpy(power, (float*) A, adim * adim * sizeof(float));

    /* Block 0: C*A */
    mul((float*) C, (float*) A, PHI, ydim, adim, adim);

    /* Blocks 1 .. horizon-1: multiply the running power by A once per block */
    for(int blk = 1; blk < horizon; blk++){
        mul((float*) A, power, next, adim, adim, adim);
        mul((float*) C, next, PHI + blk*block_len, ydim, adim, adim);
        memcpy(power, next, adim * adim * sizeof(float));
    }
}
/*
* x = Ax - KCx + Bu + Ky % Kalman filter
*/
static void kalman(float* x, const float* A, const float* B, float* u, const float* K, float* y, const float* C) {
    /*
     * State estimate update x <- A*x - K*(C*x) + B*u + K*y, overwriting x.
     * Every product is formed from the old x before x is modified. The
     * scratch vectors need no clearing because mul() assigns every entry.
     */
    float Ax[adim];
    float Bu[adim];
    float Cx[ydim];
    float KCx[adim];
    float Ky[adim];

    mul((float*) A, x, Ax, adim, adim, 1);
    mul((float*) B, u, Bu, adim, rdim, 1);
    mul((float*) C, x, Cx, ydim, adim, 1);
    mul((float*) K, Cx, KCx, adim, ydim, 1);
    mul((float*) K, y, Ky, adim, ydim, 1);

    for(int s = 0; s < adim; s++){
        x[s] = Ax[s] - KCx[s] + Bu[s] + Ky[s];
    }
}
/*
* C = A*B
*/
static void mul(float* A, float* B, float* C, int row_a, int column_a, int column_b) {
    /*
     * Row-major matrix product C = A*B with A of size row_a x column_a and
     * B of size column_a x column_b. Each entry of C is reset to zero and
     * then accumulated in place.
     */
    for (int i = 0; i < row_a; i++) {
        for (int j = 0; j < column_b; j++) {
            float* out = &C[i*column_b + j];
            *out = 0;
            /* dot product of row i of A with column j of B */
            for (int k = 0; k < column_a; k++) {
                *out += A[i*column_a + k] * B[k*column_b + j];
            }
        }
    }
}
/*
* Print matrix or vector - Just for error check
*/
static void print(float* A, int row, int column) {
    /*
     * Debug helper: print a row-major row x column matrix with 18 decimals,
     * one matrix row per line, followed by an empty line.
     */
    for (int r = 0; r < row; r++) {
        const float* line = A + r*column;
        for (int c = 0; c < column; c++) {
            printf("%0.18f ", line[c]);
        }
        printf("\n");
    }
    printf("\n");
}
Disclaimer: I work on MATLAB Coder
There is a configuration setting to tell MATLAB Coder to generate code without using dynamically allocated memory or issue an error telling you why it can't do so.
cfg = coder.config('lib');
cfg.DynamicMemoryAllocation = 'Off';
codegen -config cfg ...
MATLAB Coder supports generating code with fixed-size arrays, variable-sized arrays, and dynamically allocated arrays. The various generated signature formats are shown in the documentation. For non-dynamically allocated variable-sized arrays, a common signature is something like: foo(x_data[100], x_size[2])
Yes, the generated code is generally portable and independent of MATLAB for the hardware you specify when generating code. The full list of available functions and classes supported for code generation is listed here. In a very small number of cases, the generated code needs to depend on libraries from MATLAB. Those cases will be called out in the documentation. Fundamental operations like horzcat and vertcat produce portable code that is independent of MATLAB.
Yes. For array outputs and MATLAB functions with multiple outputs, the generated code will return outputs by reference. It also supports passing an argument by reference in some cases when the corresponding MATLAB function has the same variable as an input and output: function A = foo(A,B) with a call like: y = foo(y,z); can produce something like void foo(double A[100], const double B[20]); where A is an input and output.
Disclaimer: Im a cuda beginner.
// Plain descriptor of a row-major float matrix that is passed by value to
// the device functions and the kernel below.
typedef struct
{
int row_; // number of rows
int col_; // number of columns
float* element_; // pointer to the first element
int step; // distance between the starts of two consecutive rows, counted in floats (element (r, c) is element_[r*step + c])
}Matrix_t;
#define BLOCK_SIZE 64
__device__ float getElement(const Matrix_t A, int row, int col);
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col);
__device__ void setElement(Matrix_t A, int row, int col, float value);
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_);
float Matrix_dot_(float* M_dev_1, float* M_dev_2, int Number_of_cols, int Number_of_rows, int step);
the Matrix_t are used to link a cv::cuda::GpuMat to the C interface via the ptr() operator to get the GPU pointer to element.
__device__ float getElement(const Matrix_t A, int row, int col)
{
    // Rows are A.step floats apart; pick the row first, then the column.
    const float* row_start = A.element_ + row * A.step;
    return row_start[col];
}
__device__ void setElement(Matrix_t A, int row, int col, float value)
{
    // Rows are A.step floats apart; pick the row first, then the column.
    float* row_start = A.element_ + row * A.step;
    row_start[col] = value;
}
__device__ Matrix_t getSubMat(Matrix_t A, int row, int col)
{
    // Describe the BLOCK_SIZE x BLOCK_SIZE tile at tile coordinates
    // (row, col) of A. The tile shares A's storage and row stride.
    const int first_element = A.step * BLOCK_SIZE * row + BLOCK_SIZE * col;

    Matrix_t tile;
    tile.element_ = A.element_ + first_element;
    tile.step = A.step;
    tile.col_ = BLOCK_SIZE;
    tile.row_ = BLOCK_SIZE;
    return tile;
}
Here is the kernel:
// Kernel from the question. Each thread copies one element of a tile of A and
// of B into shared memory and then accumulates, for its own tile row, the sum
// of products of that row of the A tile with the same row of the B tile.
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
{
int blockRow = blockIdx.y;
// NOTE(review): blockCol is never read below.
int blockCol = blockIdx.x;
float SubDotValue = 0.0f;
int row = threadIdx.y;
int col = threadIdx.x;
// NOTE(review): m is passed to getSubMat as the tile *column*, but the loop
// bound is derived from A.row_ - confirm which dimension is intended.
for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
{
//get subA & subB
Matrix_t A_sub = getSubMat(A, blockRow, m);
Matrix_t B_sub = getSubMat(B, blockRow, m);
//set Asub & Bsub to the __shared__ memory
__shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];
ASub[row][col] = getElement(A_sub, row, col);
BSub[row][col] = getElement(B_sub, row, col);
//Synchronize before calculations:
__syncthreads();
//Get the dot product of the vector Asub[] Bsub[]
// The sum does not depend on col, so all threads with the same row compute the same value.
for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
{
SubDotValue += ASub[row][el_] * BSub[row][el_];
}
// Second barrier: keeps the next iteration from overwriting the tiles while they are still being read.
__syncthreads();
}
// NOTE(review): the index depends only on threadIdx.y, so threads of every
// block (and every col) store to the same BLOCK_SIZE slots of dot_.
dot_[row] = SubDotValue;
}
and the wrapper:
float Matrix_dot_(float* M_dev_1,float* M_dev_2, int Number_of_cols, int Number_of_rows, int step)
{
    // Host wrapper: describes the two device matrices, launches MatrixDot on
    // whole BLOCK_SIZE x BLOCK_SIZE tiles and sums the partial results that
    // are copied back.
    //
    // M_dev_1, M_dev_2  device pointers to the first element of each matrix
    // Number_of_cols/rows  matrix dimensions (the same for both matrices)
    // step              row stride in floats (not bytes)
    //
    // Returns the accumulated value, or 0 if an allocation, the launch or a
    // copy fails. The status of every CUDA call is printed as before.
    float retval = 0;

    Matrix_t A;
    A.col_ = Number_of_cols;
    A.row_ = Number_of_rows;
    A.element_ = M_dev_1;
    A.step = step;

    Matrix_t B;
    B.col_ = Number_of_cols;
    B.row_ = Number_of_rows;
    B.element_ = M_dev_2;
    B.step = step;

    // One slot per matrix row on host and device. The kernel stores to
    // indices up to BLOCK_SIZE-1 (or up to row count-1 in the corrected
    // variant), which did not fit in the former B.row_/BLOCK_SIZE-element
    // device buffer.
    const int partial_count = B.row_ / BLOCK_SIZE;
    const size_t buffer_bytes = (size_t)B.row_ * sizeof(float);

    float* retval_partial = (float*)malloc(buffer_bytes);
    if (retval_partial == NULL)
    {
        printf("\n Host malloc failed");
        return retval;
    }

    float* retval_device = NULL;
    cudaError_t err = cudaMalloc( (void**)&retval_device, buffer_bytes );
    printf("\n Cuda malloc: %s", cudaGetErrorString(err));
    std::cout<<std::flush;
    if (err == cudaSuccess)
    {
        // Slots the kernel does not write must not contain garbage.
        err = cudaMemset(retval_device, 0, buffer_bytes);
    }
    if (err != cudaSuccess)
    {
        cudaFree(retval_device);
        free(retval_partial);
        return retval;
    }

    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.row_ / BLOCK_SIZE, B.col_ / BLOCK_SIZE);
    MatrixDot<<<dimGrid, dimBlock>>>(A, B, retval_device);

    // A rejected launch configuration (e.g. too many threads per block or an
    // empty grid) is only reported through cudaGetLastError.
    err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("\n Cuda kernel launch: %s", cudaGetErrorString(err));
    }
    else
    {
        err = cudaDeviceSynchronize();
    }
    std::cout<<std::flush;
    printf("\n Cuda kernel run: %s", cudaGetErrorString(err));

    bool ok = (err == cudaSuccess);
    if (ok)
    {
        err = cudaMemcpy(retval_partial, retval_device, partial_count * sizeof(float), cudaMemcpyDeviceToHost);
        printf("\n Cuda cudaMemcpy: %s", cudaGetErrorString(err));
        ok = (err == cudaSuccess);
    }

    err = cudaFree(retval_device);
    printf("\n Cuda cudaFree: %s", cudaGetErrorString(err));

    if (ok)
    {
        for(int i = 0; i < partial_count; ++i)
        {
            retval += retval_partial[i];
        }
    }
    free(retval_partial);
    return retval;
}
and the main:
int main(int argc, const char * argv[])
{
cv::cuda::DeviceInfo devInfo;
cv::cuda::setDevice(devInfo.deviceID());
cv::Mat cudatestA = cv::Mat(64*3, 64*3, CV_32FC1, 2);
cv::Mat cudatestB = cv::Mat(64*3, 64*3, CV_32FC1, 2);
double tr = (double) cv::getTickCount();
double res = cudatestA.dot(cudatestB);
tr = ((double)cv::getTickCount()-tr)/(double)cv::getTickFrequency();
cv::cuda::GpuMat ctA(cudatestA);
cv::cuda::GpuMat ctB(cudatestB);
double tm_ = (double) cv::getTickCount();
float res_m = 0;
res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step);
tm_ = ((double)cv::getTickCount()-tm_)/(double)cv::getTickFrequency();
printf("\nCPU: %0.4fms, res: %0.4f\nGPU_M: %0.4fms, res: %0.4f\n", tr*1000.0f, res, tm_*1000.0f,res_m);
return 0;
}
I'm currently stuck on various points:
1) it always output 0.
2) it can only work for matrix M*N Multiple of the defined BLOCK_SIZE (64).
for 1) I can't figure where my logic break, I could get the dot product to work on vector without any troubles but the matrix problem induced by the stride between each row prevent me to use the code (code deleted as the site tell me that there is too much code).
Partial answer:
In your kernel you are not computing the right sum, nor reading the right elements, and your dimensions seem to be swapped.
// Variant of the kernel proposed in the answer: block/thread x and y roles are
// swapped, tiles are walked down a tile column (m is the tile row) and the
// result index includes the block row. The replaced lines are kept as comments.
// NOTE(review): the answer itself states that this is only a partial fix.
__global__ void MatrixDot(Matrix_t A, Matrix_t B, float* dot_)
{
//int blockRow = blockIdx.y;
//int blockCol = blockIdx.x;
int blockRow = blockIdx.x;
int blockCol = blockIdx.y;
float SubDotValue = 0.0f;
//int row = threadIdx.y;
//int col = threadIdx.x;
int row = threadIdx.x;
int col = threadIdx.y;
for(int m = 0; m < (A.row_ / BLOCK_SIZE); ++m)
{
//get subA & subB
Matrix_t A_sub = getSubMat(A, m, blockCol);//getSubMat(A, blockRow, m)
Matrix_t B_sub = getSubMat(B, m, blockCol);//getSubMat(B, blockRow, m)
//set Asub & Bsub to the __shared__ memory
__shared__ float ASub[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float BSub[BLOCK_SIZE][BLOCK_SIZE];
ASub[row][col] = getElement(A_sub, row, col);
BSub[row][col] = getElement(B_sub, row, col);
//Synchronize before calculations:
__syncthreads();
//Get the dot product of the vector Asub[] Bsub[]
// The sum does not depend on col, so all threads with the same row compute the same value.
for(int el_ = 0; el_ < BLOCK_SIZE; ++el_)
{
SubDotValue += ASub[row][el_] * BSub[row][el_];
}
// Second barrier: keeps the next iteration from overwriting the tiles while they are still being read.
__syncthreads();
}
// NOTE(review): the index does not depend on blockCol or col, so several
// threads and blocks store to the same slot of dot_.
dot_[blockRow*BLOCK_SIZE + row] = SubDotValue; //dot_[row] = SubDotValue;
}
And your wrapper isn't also allocating the size you need:
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_/ BLOCK_SIZE *sizeof(float) );
should be:
cudaError_t err = cudaMalloc( (void**)&retval_device,B.row_*sizeof(float) );
Note that other allocation related have to change too (Lazy me).
And your call in main need to divide the GpuMat step by the size of one element of the GpuMat
res_m = Matrix_dot_((float* )ctA.ptr(), (float*)ctB.ptr(), ctA.cols, ctA.rows, ctA.step/ctA.elemSize1()); // step is in bytes; elemSize1() is the size of one channel element
You might also want to change your Matrix_t structure to use const float* instead of float to be able to use:
GpuMat_.ptr<float>();
instead of:
(float*)GpuMat.ptr();
Note that for a matrix of N rows you are starting N^2 threads doing the same thing. I don't have enough knowledge on Cuda to fix that.
This is the original MATLAB implementation
function[m, p] = max2(im)
% MAX2 Maximum of a matrix together with its position.
%   m is the largest value, p = [row, column] of that value.
[columnMax, rowOfColumnMax] = max(im);   % per-column maxima and their rows
[m, bestColumn] = max(columnMax);        % largest of the column maxima
p = [rowOfColumnMax(bestColumn), bestColumn];
It is being used inside this functionality
% Search over numSteps logarithmically spaced scales between minR and maxR
% and remember position, size and score of the best correlation peak.
for r = 2.^linspace(log2(minR),log2(maxR),numSteps)
    itestSeek = imresize(itestBase,minR/r);
    icorr = normxcorr2(cc,itestSeek);
    [m,p] = max2(icorr); % peak value and its [row, column] position
    if (m>bestm)
        bestp = p*r;
        bests = ccSize*r;
        bestm = m;
    end
end
Here is my OpenCV 3.0.0/ c++ implementation
void Utilities::Max2(cv::Mat input_image, double& m, std::vector<int>& p)
{
    // Find the largest element of a single-channel float image in one pass.
    //
    // input_image  image whose elements are read as float
    // m            receives the maximum value
    // p            the row index and then the column index of the maximum
    //              are appended (p is not cleared first)
    //
    // If the maximum occurs more than once, the position with the smallest
    // column index wins, then the smallest row index - the same choice the
    // former column-by-column search made. For an empty image m and p are
    // left untouched.
    if (input_image.rows <= 0 || input_image.cols <= 0)
    {
        return;
    }

    float best = input_image.at<float>(0, 0);
    int best_row = 0;
    int best_col = 0;
    for (int r = 0; r < input_image.rows; ++r)
    {
        // Walk each row through its pointer: memory is traversed in storage
        // order and the per-element at<>() lookup is avoided.
        const float* row_ptr = input_image.ptr<float>(r);
        for (int c = 0; c < input_image.cols; ++c)
        {
            const float value = row_ptr[c];
            if (value > best || (value == best && c < best_col))
            {
                best = value;
                best_row = r;
                best_col = c;
            }
        }
    }

    m = best;
    p.push_back(best_row);
    p.push_back(best_col);
}
c++ usage of the function
std::vector<double> best_p;
std::vector<double> best_s;
for (double i = 0; i < linspace_vector.size(); i++)
{
cv::Mat i_test_seek;
cv::Mat i_corr;
double r = linspace_vector[i];
double resize_factor = min_r / r; // minR/r in matlab
cv::resize(i_test_base, i_test_seek, cv::Size(), resize_factor, resize_factor, cv::INTER_CUBIC);
cv::matchTemplate(i_test_seek, cc_template, i_corr, CV_TM_CCORR_NORMED);
cv::imshow("i_corr", i_corr);
cv::waitKey(0);
double m;
std::vector<int> p;
Utilities::Max2(i_corr, m, p);
if (m> best_m)
{
best_p.clear();
best_s.clear();
for (int i = 0; i < p.size(); ++i)
{
best_p.push_back(p[i] * r);
}
best_s.push_back(cc_size_height * r);
best_s.push_back(cc_size_width * r);
best_m = m;
}
}
Can you suggest a more efficient way of doing this?
I find the local maximum for each column and the index of that value.
Later I find the global maximum of all of the indices.
Can you try the following and benchmark, if the performance increases:
#include <limits>
void Utilities::Max2(cv::Mat input_image, double& m, std::vector<int>& p)
{
    // Single pass over a single-channel float image in row-major order.
    //
    // m  receives the maximum value (the lowest representable double if the
    //    image has no elements)
    // p  p[0] receives the row index and p[1] the column index of the first
    //    maximum found; p is grown to two entries if it is shorter, and is
    //    left unchanged for an image without elements.
    m = std::numeric_limits<double>::lowest();
    int best_row = 0;
    int best_col = 0;
    bool found = false;
    for (int r = 0; r < input_image.rows; ++r)
    {
        for (int c = 0; c < input_image.cols; ++c)
        {
            const double value = input_image.at<float>(r, c);
            if (m < value)
            {
                m = value;
                best_row = r;
                best_col = c;
                found = true;
            }
        }
    }
    if (!found)
    {
        return;
    }
    if (p.size() < 2)
    {
        p.resize(2);
    }
    p[0] = best_row;
    p[1] = best_col;
}
If there is a way to get the input as a vector and you can get the number col columns, for example using:
int cols = input_image.rows;
std::vector<double> v;
v.assign(input_image.datastart, input_image.dataend);
Then you can compute in just one go:
std::vector<double>::iterator iter = std::max_element(v.begin(), v.end());
double m = *iter;
int k = std::distance(v.begin(), iter);
int y = (int)k / cols;
int x = k % cols;
However, I am not sure if getting the data as a vector is an option nor the performance of convert it into a vector. Maybe you can run and see how it compares to your implementation.
The first piece of code is essentially finding the max value and its indices (both x and y) in an image to my understanding.
function[m, p] = max2(im)
[m1, k1] = max(im); %max value of each column and the row where it occurs
[m, k2] = max(m1); %largest of the column maxima; k2 is its column
x = k2; %column index of the overall maximum
y = k1(k2); %row index of the overall maximum
p = [y, x]; %position as [row, column]
This can be done using some iterations but iteration is almost always significantly slower than vector operations or Opencv functions.
So, if my understanding is correct, this operation can simply be done by
double minVal, maxVal;
Point minLoc, maxLoc;
minMaxLoc(im, &minVal, &maxVal, &minLoc, &maxLoc);
maxLoc.y will give the row, and maxLoc.x will give col.
update: Your Matlab code can also be simplified (which potentially will speed up too)
% Maximum over all elements and its [row, column] position.
% ind2sub converts the linear index correctly for every row; rem(ind, rows)
% would give 0 instead of the row count when the maximum is in the last row.
[mx, ind] = max(im(:));
[row, col] = ind2sub(size(im), ind);
p = [row col];
You could also try the following:
// creating a random matrix with 2 rows and 4 columns
Mat1d mat(2, 4);
double low = -7000.0; // minimum value for generating random numbers
double high = +7000.0; // maximum value for generating random numbers
randu(mat, Scalar(low), Scalar(high)); // generating random number matrix
double max_element = *std::max_element(mat.begin(),mat.end()); // get the max element in the matrix
int max_element_index = std::max_element(mat.begin(),mat.end()) - mat.begin(); // get the max_element_index from the matrix`
The max element index is a row-major offset that runs from 0 to the number of items in the matrix minus one, in this case 7.
// Print the matrix, then the maximum value, then its row-major index,
// each on its own line.
cout << mat << endl
     << max_element << endl
     << max_element_index << endl;
[Referred Generate random numbers matrix in OpenCV for the code above]
I found good code to do some polynomial least-squares fitting based on GSL.
I am using it with degree = 3, i.e. three coefficients: y = Cx² + Bx + A.
In my application I know that A must be zero. Is it possible to change the algorithm so that A will always be zero?
bool polynomialfit(int obs, int degree,
double *dx, double *dy, double *store) /* n, p */
{
gsl_multifit_linear_workspace *ws;
gsl_matrix *cov, *X;
gsl_vector *y, *c;
double chisq;
int i, j;
X = gsl_matrix_alloc(obs, degree);
y = gsl_vector_alloc(obs);
c = gsl_vector_alloc(degree);
cov = gsl_matrix_alloc(degree, degree);
for(i=0; i < obs; i++) {
gsl_matrix_set(X, i, 0, 1.0);
for(j=0; j < degree; j++) {
gsl_matrix_set(X, i, j, pow(dx[i], j));
}
gsl_vector_set(y, i, dy[i]);
}
ws = gsl_multifit_linear_alloc(obs, degree);
gsl_multifit_linear(X, y, c, cov, &chisq, ws);
/* store result ... */
for(i=0; i < degree; i++)
{
store[i] = gsl_vector_get(c, i);
}
gsl_multifit_linear_free(ws);
gsl_matrix_free(X);
gsl_matrix_free(cov);
gsl_vector_free(y);
gsl_vector_free(c);
return true; /* we do not "analyse" the result (cov matrix mainly)
to know if the fit is "good" */
}
You could replace y by y' = y/x and then fit a first-degree polynomial y' = Cx + B.
(If the point x = 0 is present in your data set you have to remove it for the fit. That point does not improve the fit under the A = 0 constraint anyway, and you can still use it when re-computing the goodness of fit.)
In the code you posted there is this loop:
/* Fills row i of the design matrix: column j receives dx[i]^j,
   so j == 0 produces the constant-term column. */
for(j=0; j < degree; j++) {
gsl_matrix_set(X, i, j, pow(dx[i], j));
}
and the function pow computes the x^j terms; to force A = 0 you have to "ignore" the term where j == 0.
I have no access to GSL and so the following is just off the top of my head and it is untested:
/* Least-squares polynomial fit without a constant term, built on
 * gsl_multifit_linear.
 *
 * obs            : number of (dx[i], dy[i]) samples.
 * polynom_degree : coefficient count of the full polynomial including the
 *                  dropped constant term; polynom_degree - 1 terms are fitted.
 * store          : receives polynom_degree - 1 coefficients; store[j]
 *                  multiplies x^(j+1), so for polynom_degree == 3 store[0] is
 *                  the linear and store[1] the quadratic coefficient.
 *
 * Returns false without touching store when there is nothing to fit or a
 * pointer is null, true otherwise. The covariance matrix and chi-square are
 * computed but not inspected.
 */
bool polynomialfit(int obs, int polynom_degree,
                   double *dx, double *dy, double *store) /* n, p */
{
    gsl_multifit_linear_workspace *ws;
    gsl_matrix *cov, *X;
    gsl_vector *y, *c;
    double chisq;
    int i, j;
    int degree = polynom_degree - 1; /* number of fitted terms: x^1 .. x^degree */

    /* Reject calls that would allocate empty GSL objects or dereference null. */
    if (obs <= 0 || degree <= 0 || !dx || !dy || !store) {
        return false;
    }

    X = gsl_matrix_alloc(obs, degree);
    y = gsl_vector_alloc(obs);
    c = gsl_vector_alloc(degree);
    cov = gsl_matrix_alloc(degree, degree);

    for (i = 0; i < obs; i++) {
        /* No column of ones: leaving out x^0 is what pins the constant term
           to zero. Column j holds x^(j+1). */
        for (j = 0; j < degree; j++) {
            gsl_matrix_set(X, i, j, pow(dx[i], j + 1));
        }
        gsl_vector_set(y, i, dy[i]);
    }

    ws = gsl_multifit_linear_alloc(obs, degree);
    gsl_multifit_linear(X, y, c, cov, &chisq, ws);

    /* store result ... */
    for (i = 0; i < degree; i++) {
        store[i] = gsl_vector_get(c, i);
    }

    gsl_multifit_linear_free(ws);
    gsl_matrix_free(X);
    gsl_matrix_free(cov);
    gsl_vector_free(y);
    gsl_vector_free(c);
    return true; /* we do not "analyse" the result (cov matrix mainly)
                    to know if the fit is "good" */
}
In order to fit to y=c*x*x+b*x you have to call it with polynom_degree set to 3.
You also may have a look at the theory:
Weisstein, Eric W. "Least Squares Fitting--Polynomial." From MathWorld--A Wolfram Web Resource. http://mathworld.wolfram.com/LeastSquaresFittingPolynomial.html
I'm trying to implement a matrix-vector Multiplication on GPU (using CUDA).
In my C++ code (CPU), I load the matrix as a dense matrix, and then I perform the matrix-vector multiplication using CUDA. I'm also using shared memory to improve the performance.
How can I load the matrix in an efficient way, knowing that my matrix is a sparse matrix?
Below is my C++ function to load the matrix:
// Reads a matrix given as a header line "rows cols num_entries" followed by
// num_entries "row col value" triplets into a newly allocated dense array.
//
// filename : path of the text file to read.
// matrix   : receives a new[]-allocated array of rows * cols floats; entries
//            not listed in the file are zero. The caller owns the array.
// dim      : if non-null, receives max(rows, cols).
// majority : ROW_MAJOR stores an entry at [cols * row + col], COLUMN_MAJOR at
//            [rows * col + row]; any other value stores nothing.
//
// Triplet indices are used as zero-based offsets. Entries whose indices fall
// outside the matrix are reported and skipped instead of written.
// Returns true (1) when the file could be opened and its header parsed,
// false (0) otherwise.
int readMatrix( char* filename, float* &matrix, unsigned int *dim = NULL, int majority = ROW_MAJOR )
{
    std::ifstream file( filename );
    if ( !file )
    {
        std::cout << "Unable to open file\n";
        return false;
    }

    unsigned int w = 0, h = 0, num_entries = 0;
    if ( !( file >> h >> w >> num_entries ) )
    {
        std::cout << "Unable to read matrix header\n";
        return false;
    }
    std::cout << w << " " << h << " " << num_entries << "\n";
    assert( w == h || w == 1 || h == 1 );
    if ( dim != NULL ) *dim = std::max( w, h );

    // Value-initialise: only the listed entries are assigned below, so every
    // other element has to start at zero.
    matrix = new float[ w * h ]();

    unsigned int i = 0;
    for ( ; i < num_entries; i++ )
    {
        unsigned int x = 0, y = 0;
        float val = 0.0f;
        // Test the extraction itself; eof() only turns true after a read has
        // already failed, which would store one stale triplet.
        if ( !( file >> y >> x >> val ) ) break;
        if ( y >= h || x >= w )
        {
            std::cout << "Skipping entry with out-of-range index (" << y << ", " << x << ")\n";
            continue;
        }
        if ( majority == ROW_MAJOR )
        {
            matrix[ w * y + x ] = val;
        }
        else if ( majority == COLUMN_MAJOR )
        {
            matrix[ h * x + y ] = val;
        }
    }
    file.close();

    if ( i == num_entries )
        std::cout << "\nFile read successfully\n";
    else
        std::cout << "\nFile read successfully but seems defective:\n num entries read = " << i << ", entries epected = " << num_entries << "\n";

    // print the matrix that was read
    if ( w == h )
    {
        for ( unsigned int r = 0; r < w; r++ )
        {
            printf("\n");
            for ( unsigned int c = 0; c < h; c++ )
            {
                printf("%.2f ", matrix[ c + w * r ] );
            }
        }
    }
    else
    {
        // Vector case: one of w, h is 1, so the element count is the larger of
        // the two (looping to h alone printed a single value for a 1 x w input).
        const unsigned int count = std::max( w, h );
        printf("\n");
        for ( unsigned int j = 0; j < count; j++ )
        {
            printf("%.2f ", matrix[ j ] );
        }
    }
    return true;
}
Below is my CUDA Kernel function that handles the matrix-vector multiplication:
// Dense matrix-vector product x = A * b, one thread per output element.
//
// A   : dim x dim matrix, read as A[row * dim + col].
// b   : input vector of length dim.
// x   : output vector of length dim.
// dim : number of rows and columns.
//
// The whole of b is staged in the dynamically sized shared array `vec`, so
// the launch has to supply at least dim * sizeof(float) bytes for it.
__global__ void
_cl_matrix_vector_( float *A, float *b, float *x, int dim )
{
    extern __shared__ float vec[];

    // Load the vector into shared memory. Each thread copies the elements
    // threadIdx.x, threadIdx.x + blockDim.x, ... and stops at dim. Running a
    // fixed dim / blockDim.x + 1 rounds would read b and write vec past dim.
    for (int i = threadIdx.x; i < dim; i += blockDim.x) {
        vec[i] = b[i];
    }
    // Every thread reads all of vec below, so the copy must be complete first.
    __syncthreads();

    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < dim) {
        // Dot product of matrix row `row` with the staged vector.
        float temp = 0.0;
        for (int i = 0; i < dim; i++) {
            temp += A[row * dim + i] * vec[i];
        }
        x[row] = temp;
    }
}
What are the necessary changes that I have to make on my CUDA code to take into account that my matrix is a sparse matrix?
I found out from a forum that we can also use padding to be able to optimize the performance, but this requires me to change the way I read the matrix / sort the matrix. Any ideas how to implement this padding in the way I read the matrix and perform the calculation?
This is a very old post and I want to highlight that cuSPARSE (since some time now) makes routines for the multiplication between sparse matrices or between a sparse matrix and a dense vector available.
For the csr format, the relevant routine for the multiplication between a sparse matrix and a dense vector is cusparse<t>csrmv. Below, a fully worked example showing its use.
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <assert.h>
#include "Utilities.cuh"
#include <cuda_runtime.h>
#include <cusparse_v2.h>
/********/
/* MAIN */
/********/
// Worked example: builds a 4 x 4 dense matrix on the host, converts it to CSR
// on the device with cusparseDdense2csr, multiplies it by a vector of ones
// with cusparseDcsrmv, prints the intermediate arrays and the result, and
// finally releases every host, device and cuSPARSE resource it created.
int main()
{
    // --- Initialize cuSPARSE
    cusparseHandle_t handle; cusparseSafeCall(cusparseCreate(&handle));

    /**************************/
    /* SETTING UP THE PROBLEM */
    /**************************/
    const int N = 4; // --- Number of rows and columns

    // --- Host side dense matrices
    double *h_A_dense = (double*)malloc(N * N * sizeof(double));
    double *h_x_dense = (double*)malloc(N * sizeof(double));
    double *h_y_dense = (double*)malloc(N * sizeof(double));

    // --- Column-major ordering
    h_A_dense[0] = 0.4612; h_A_dense[4] = -0.0006; h_A_dense[8] = 0.3566; h_A_dense[12] = 0.0;
    h_A_dense[1] = -0.0006; h_A_dense[5] = 0.4640; h_A_dense[9] = 0.0723; h_A_dense[13] = 0.0;
    h_A_dense[2] = 0.3566; h_A_dense[6] = 0.0723; h_A_dense[10] = 0.7543; h_A_dense[14] = 0.0;
    h_A_dense[3] = 0.; h_A_dense[7] = 0.0; h_A_dense[11] = 0.0; h_A_dense[15] = 0.1;

    // --- Initializing the data and result vectors
    for (int k = 0; k < N; k++) {
        h_x_dense[k] = 1.;
        h_y_dense[k] = 0.;
    }

    // --- Create device arrays and copy host arrays to them
    double *d_A_dense; gpuErrchk(cudaMalloc(&d_A_dense, N * N * sizeof(double)));
    double *d_x_dense; gpuErrchk(cudaMalloc(&d_x_dense, N * sizeof(double)));
    double *d_y_dense; gpuErrchk(cudaMalloc(&d_y_dense, N * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A_dense, h_A_dense, N * N * sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_x_dense, h_x_dense, N * sizeof(double), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_y_dense, h_y_dense, N * sizeof(double), cudaMemcpyHostToDevice));

    // --- Descriptor for sparse matrix A
    cusparseMatDescr_t descrA; cusparseSafeCall(cusparseCreateMatDescr(&descrA));
    cusparseSafeCall(cusparseSetMatType (descrA, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseSafeCall(cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ONE));

    int nnzA = 0; // --- Number of nonzero elements in dense matrix A
    const int lda = N; // --- Leading dimension of dense matrix

    // --- Device side number of nonzero elements per row of matrix A
    int *d_nnzPerVectorA; gpuErrchk(cudaMalloc(&d_nnzPerVectorA, N * sizeof(*d_nnzPerVectorA)));
    cusparseSafeCall(cusparseDnnz(handle, CUSPARSE_DIRECTION_ROW, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, &nnzA));

    // --- Host side number of nonzero elements per row of matrix A
    int *h_nnzPerVectorA = (int *)malloc(N * sizeof(*h_nnzPerVectorA));
    gpuErrchk(cudaMemcpy(h_nnzPerVectorA, d_nnzPerVectorA, N * sizeof(*h_nnzPerVectorA), cudaMemcpyDeviceToHost));

    printf("Number of nonzero elements in dense matrix A = %i\n\n", nnzA);
    for (int i = 0; i < N; ++i) printf("Number of nonzero elements in row %i for matrix = %i \n", i, h_nnzPerVectorA[i]);
    printf("\n");

    // --- Device side sparse matrix
    double *d_A; gpuErrchk(cudaMalloc(&d_A, nnzA * sizeof(*d_A)));
    int *d_A_RowIndices; gpuErrchk(cudaMalloc(&d_A_RowIndices, (N + 1) * sizeof(*d_A_RowIndices)));
    int *d_A_ColIndices; gpuErrchk(cudaMalloc(&d_A_ColIndices, nnzA * sizeof(*d_A_ColIndices)));
    cusparseSafeCall(cusparseDdense2csr(handle, N, N, descrA, d_A_dense, lda, d_nnzPerVectorA, d_A, d_A_RowIndices, d_A_ColIndices));

    // --- Host side sparse matrices
    double *h_A = (double *)malloc(nnzA * sizeof(*h_A));
    int *h_A_RowIndices = (int *)malloc((N + 1) * sizeof(*h_A_RowIndices));
    int *h_A_ColIndices = (int *)malloc(nnzA * sizeof(*h_A_ColIndices));
    gpuErrchk(cudaMemcpy(h_A, d_A, nnzA * sizeof(*h_A), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_RowIndices, d_A_RowIndices, (N + 1) * sizeof(*h_A_RowIndices), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_A_ColIndices, d_A_ColIndices, nnzA * sizeof(*h_A_ColIndices), cudaMemcpyDeviceToHost));

    printf("\nOriginal matrix A in CSR format\n\n");
    for (int i = 0; i < nnzA; ++i) printf("A[%i] = %f ", i, h_A[i]);
    printf("\n");
    printf("\n");
    for (int i = 0; i < (N + 1); ++i) printf("h_A_RowIndices[%i] = %i \n", i, h_A_RowIndices[i]);
    printf("\n");
    printf("\n");
    for (int i = 0; i < nnzA; ++i) printf("h_A_ColIndices[%i] = %i \n", i, h_A_ColIndices[i]);
    printf("\n");
    for (int i = 0; i < N; ++i) printf("h_x[%i] = %f \n", i, h_x_dense[i]);
    printf("\n");

    // --- y = alpha * A * x + beta * y, with alpha = 1 and beta = 0
    const double alpha = 1.;
    const double beta = 0.;
    cusparseSafeCall(cusparseDcsrmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, N, N, nnzA, &alpha, descrA, d_A, d_A_RowIndices, d_A_ColIndices, d_x_dense,
                                    &beta, d_y_dense));

    gpuErrchk(cudaMemcpy(h_y_dense, d_y_dense, N * sizeof(double), cudaMemcpyDeviceToHost));

    printf("\nResult vector\n\n");
    for (int i = 0; i < N; ++i) printf("h_y[%i] = %f ", i, h_y_dense[i]);
    printf("\n");

    // --- Release host buffers
    free(h_A_dense);
    free(h_x_dense);
    free(h_y_dense);
    free(h_nnzPerVectorA);
    free(h_A);
    free(h_A_RowIndices);
    free(h_A_ColIndices);

    // --- Release device buffers
    gpuErrchk(cudaFree(d_A_dense));
    gpuErrchk(cudaFree(d_x_dense));
    gpuErrchk(cudaFree(d_y_dense));
    gpuErrchk(cudaFree(d_nnzPerVectorA));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(d_A_RowIndices));
    gpuErrchk(cudaFree(d_A_ColIndices));

    // --- Release cuSPARSE objects
    cusparseSafeCall(cusparseDestroyMatDescr(descrA));
    cusparseSafeCall(cusparseDestroy(handle));

    return 0;
}
You might want to have a look at the very good CUSP library. They implement sparse matrices in a variety of formats (coo, csr, ellpack, diagonal and a hybrid between ellpack and coo). Each with their own advantages as described in the documentation. Most of them are "standard" sparse matrix formats about which you can find more information online. Not a complete answer to your question perhaps, but it should provide a starting point.