I wrote code that raises a zero-one matrix to the power 2. However, I want it to work for any power the user enters. I tried several times, but couldn't get it to work.
Here's the relevant part of the code.
Notes: suppose the user has entered an (n*m) matrix "a", where n and m are equal and both are denoted by s.
k = 0;
for (int j = 0; j < s; j++)
    for (int i = 0; i < s; i++)
    {
        m[k] = 0;
        for (int t = 0; t < s; t++)
            m[k] += a[j][t] * a[t][i];   // (A*A)[j][i], stored row by row in the flat array m
        k++;
    }
Here is my implementation for matrix exponentiation:
// Assumes intt (e.g. a typedef for long long), K (the maximum dimension) and MOD
// are defined elsewhere, and that <cstring> is included for memset.
struct matrix {
    intt m[K][K];

    matrix() {
        memset(m, 0, sizeof(m));        // start from the zero matrix
    }

    matrix operator * (matrix b) {
        matrix c = matrix();
        for (intt i = 0; i < K; i++) {
            for (intt k = 0; k < K; k++) {
                for (intt j = 0; j < K; j++) {
                    c.m[i][j] = (c.m[i][j] + m[i][k] * b.m[k][j]) % MOD;
                }
            }
        }
        return c;
    }

    matrix pow (intt n) {
        if (n <= 0) {                   // A^0 is the identity matrix
            matrix id = matrix();
            for (intt i = 0; i < K; i++)
                id.m[i][i] = 1;
            return id;
        }
        if (n == 1) {
            return *this;
        }
        if (n % 2 == 1) {               // odd power: peel one factor off
            return (*this) * pow(n - 1);
        } else {                        // even power: square the half power
            matrix X = pow(n / 2);
            return X * X;
        }
    }
};
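A minimal usage sketch for the question above (reading the user's s x s 0/1 matrix and a power p). It assumes intt, K and MOD are defined before the struct, for example typedef long long intt; const int K = 100; const intt MOD = 1000000007; — those values are placeholders, not part of the original code.

#include <iostream>
// struct matrix as defined above, plus the assumed definitions of intt, K and MOD

int main() {
    int s;                                  // matrix dimension entered by the user
    intt p;                                 // power entered by the user
    std::cin >> s >> p;
    matrix base;                            // entries beyond s stay zero and do not affect the product
    for (int i = 0; i < s; i++)
        for (int j = 0; j < s; j++)
            std::cin >> base.m[i][j];       // read the 0/1 matrix
    matrix result = base.pow(p);            // A^p (mod MOD) via repeated squaring
    for (int i = 0; i < s; i++) {
        for (int j = 0; j < s; j++)
            std::cout << result.m[i][j] << ' ';
        std::cout << '\n';
    }
    return 0;
}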
I don't know why, but my matrix multiplication is very slow and I need to optimize it. Printing the matrix (1000x1000) also takes a long time.
The aim of the program is to calculate the matrix exponential, but my main problem is that these 2 operations are very slow for large matrices like 1000x1000.
These 2 operations are implemented in the powerMat() and printResult() functions.
Here is the code:
#define M 1000
#define e 2.71828182845904523536
//declaration of the functions
void sumMatrices(vector<vector<double> >& mat1, vector<vector<double> >& mat2);
void printResult(vector<vector<double> >&matRes);
void mulMatWithFactorial(long factorialValue);
long factorialCalculate(int n);
void initializeMatrix();
void initializeIdenticalMatrix();
void checkIfTheMatrixIsDiagonal();
void calculateExpoMatrixWithDiagonalMatrix();
void readMatrixFromFile();
void powerMat(vector<vector<double> >& mat, int powNum);
//declaration of the variables
vector<vector<double>> inputMatrix(M, vector<double>(M));
vector<vector<double>> sumMatrixResult(M, vector<double>(M));
vector<vector<double>> powerMatrixResult(M, vector<double>(M));
vector<vector<double>> mulFactorialMatrixResult(M, vector<double>(M));
vector<vector<double>> finalMatrixResult(M, vector<double>(M));
vector<vector<double>> identicalMatrix(M, vector<double>(M));
vector<vector<vector<double>>> listOfMatrices;
bool matrixIsNilpotent = false;
int diagonaMatrixlFlag = 1;
int main() {
//variables
long factorialValue;
initializeIdenticalMatrix();
readMatrixFromFile();
//check if the matrix is diagonal - so we will have easier and faster compute
checkIfTheMatrixIsDiagonal();
if (diagonaMatrixlFlag == 1) {
calculateExpoMatrixWithDiagonalMatrix();
goto endOfLoop;
}
//loop for taylor series
for (int i = 0; i < 5; i++) {
if (i == 0) { // first we add identical matrix when the power is 0
sumMatrices(finalMatrixResult, identicalMatrix); // summarize between this 2 matrices
finalMatrixResult = sumMatrixResult; //copy matrices
}
if (i == 1) { // we add the matrix itself because the power is 1
sumMatrices(finalMatrixResult, inputMatrix);
finalMatrixResult = sumMatrixResult; //copy matrices
}
if (i > 1 ) {
powerMat(inputMatrix, i);
if (matrixIsNilpotent) { // A^i is 0 for some integer i, so the series terminates after a finite number of terms
goto endOfLoop;
}
factorialValue = factorialCalculate(i); // calculate the factorial of i
mulMatWithFactorial(factorialValue); // multiply (1/i!) * matrix^i - like in the algorithm
sumMatrices(finalMatrixResult, mulFactorialMatrixResult); // summarize it with the previous result
finalMatrixResult = sumMatrixResult; //copy matrices
}
}
endOfLoop:
printResult(finalMatrixResult); // print the final result - e^M
return 0;
}
//Summarize matrices
void sumMatrices(vector<vector<double> >& mat1, vector<vector<double> >& mat2) {
for (int i = 0; i < M; i++)
for (int j = 0; j < M; j++)
sumMatrixResult[i][j] = mat1[i][j] + mat2[i][j];
}
//Print matrix
void printResult(vector<vector<double> >& matRes) {
for (int i = 0; i < M; i++) {
for (int j = 0; j < M; j++) {
printf("%f ", matRes[i][j]);
if (j == M - 1) {
printf("\n");
}
}
}
}
//Calculate the factorial of n
long factorialCalculate(int n) {
long factorial = 1;
for (int i = 1; i <= n; ++i) {
factorial *= i;
}
return factorial;
}
// multiply the matrix by a scalar
void mulMatWithFactorial(long factorialValue) {
for (int i = 0; i < M; i++) {
for (int j = 0; j < M; j++) {
mulFactorialMatrixResult[i][j] = powerMatrixResult[i][j] * 1/factorialValue;
}
}
}
//initialize matrix
void initializeMatrix() {
for (int i = 0; i < M; i++) {
for (int j = 0; j < M; j++) {
powerMatrixResult[i][j] = 0;
}
}
}
void checkIfTheMatrixIsDiagonal() {
for (int i = 0; i < M; i++) {
for (int j = 0; j < M; j++) {
if (i == j)
{
if (inputMatrix[i][j] == 0) {
diagonaMatrixlFlag = 0;
goto endOfLoop;
}
}
else
{
if (inputMatrix[i][j] != 0) {
diagonaMatrixlFlag = 0;
goto endOfLoop;
}
}
}
}
endOfLoop:
return;
}
void calculateExpoMatrixWithDiagonalMatrix() {
for (int i = 0; i < M; i++) {
for (int j = 0; j < M; j++) {
if (i == j)
{
for (int k = 0; k < inputMatrix[i][j]; ++k)// loop to calculate the pow of e^alpha
{
finalMatrixResult[i][j] *= e;
}
}
else
{
finalMatrixResult[i][j] = 0;
}
}
}
}
void readMatrixFromFile() {
ifstream f("inv_matrix(1000x1000).txt");
for (int i = 0; i < M; i++)
for (int j = 0; j < M; j++) {
f >> inputMatrix[i][j];
if (f.peek() == ',')
f.ignore();
}
listOfMatrices.push_back(inputMatrix);
}
void initializeIdenticalMatrix() {
for (int i = 0; i < M; i++) {
for (int k = 0; k < M; k++) {
if (i == k) {
identicalMatrix[i][k] = 1;
}
else {
identicalMatrix[i][k] = 0;
}
}
}
}
void powerMat(vector<vector<double> >& mat, int powNum) {
int counterForNilpotent = 0;
initializeMatrix();
auto start = high_resolution_clock::now();
for (int i = 0; i < M; i++) {
for (int k = 0; k < M; k++) {
for (int j = 0; j < M; j++) {
powerMatrixResult[i][j] += mat[i][k] * listOfMatrices[powNum-2][k][j];
}
}
}
auto stop = high_resolution_clock::now();
auto duration = duration_cast<seconds>(stop - start);
cout << duration.count() << " seconds" << endl; // checking run time
listOfMatrices.push_back(powerMatrixResult);
// check if, after computing A^i, the matrix is equal to 0
for (int i = 0; i < M; i++) {
for (int j = 0; j < M; j++) {
if (powerMatrixResult[i][j] == 0) {
counterForNilpotent++;
}
}
}
if (counterForNilpotent == M * M) {
matrixIsNilpotent = true;
}
}
Going through each element of an n x n matrix has a computational cost of O(n^2), meaning for large matrices it will take a while, but not "lifetime-of-the-universe" lengths of time.
Usually, to do operations on massive matrices like this, they are first reduced to some simpler form so that the computation can get closer to O(n) or better, using known properties of the reduced forms.
So a faster implementation of matrix multiplication would start by applying some rref() function to both matrices and then only evaluate the parts of those matrices that still have non-zero entries in their rows and columns.
Here are some great places to review/learn (for free) Linear Algebra:
"3b1b (2016): Essence of Linear Algebra" = https://www.youtube.com/watch?v=kjBOesZCoqc&list=PL0-GT3co4r2y2YErbmuJw2L5tW4Ew2O5B
"MIT OpenCourseWare (2009): Linear Algebra" = https://www.youtube.com/watch?v=ZK3O402wf1c&list=PL49CF3715CB9EF31D&index=1
Use SSE2. It's not a library; it's a way to use the CPU's vector hardware.
You set up operations so that they run in parallel on multiple data elements.
https://en.wikipedia.org/wiki/SSE2
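For a concrete picture of what that means here, below is a minimal sketch of the inner loops of a double-precision matrix multiply written with SSE2 intrinsics from <emmintrin.h>. It assumes flat, row-major double arrays rather than the vector<vector<double>> used in the question, and the function name is illustrative only:

#include <emmintrin.h>   // SSE2 intrinsics

// C += A * B for M x M row-major matrices; C must be zero-initialized by the caller.
// Processes two doubles of a row of C per vector instruction.
void multiplySSE2(const double* A, const double* B, double* C, int M)
{
    for (int i = 0; i < M; i++) {
        for (int k = 0; k < M; k++) {
            const __m128d a = _mm_set1_pd(A[i * M + k]);       // broadcast A(i,k)
            int j = 0;
            for (; j + 1 < M; j += 2) {
                __m128d b = _mm_loadu_pd(&B[k * M + j]);       // B(k,j), B(k,j+1)
                __m128d c = _mm_loadu_pd(&C[i * M + j]);
                c = _mm_add_pd(c, _mm_mul_pd(a, b));           // C(i,j..j+1) += A(i,k) * B(k,j..j+1)
                _mm_storeu_pd(&C[i * M + j], c);
            }
            for (; j < M; j++)                                 // scalar tail for odd M
                C[i * M + j] += A[i * M + k] * B[k * M + j];
        }
    }
}

The i-k-j loop order keeps the accesses to B and C contiguous in memory, which is what lets the vector loads and stores operate on neighbouring elements.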
For educational purposes I'm developing a C++ library for operating on large numbers represented as vectors of chars (vector<char>).
Here is the algorithm I am using for multiplication:
string multiplicationInner(CharVector a, CharVector b) {
reverse(a.begin(), a.end());
reverse(b.begin(), b.end());
IntVector stack(a.size() + b.size() + 1);
int i, j;
for (i = 0; i < a.size(); i++)
for (j = 0; j < b.size(); j++)
stack[i + j] += charToInt(a[i]) * charToInt(b[j]);
for (int i = 0; i < stack.size(); i++) {
int num = stack[i] % 10;
int move = stack[i] / 10;
stack[i] = num;
if (stack[i + 1])
stack[i + 1] += move;
else if (move)
stack[i + 1] = move;
}
CharVector stackChar = intVectorToCharVector(&stack);
deleteZerosAtEnd(&stackChar);
reverse(stackChar.begin(), stackChar.end());
return charVectorToString(&stackChar);
};
This function is called a billion times in my program, so I would like to use #pragma omp parallel for in it.
My question is: how can I parallelize the first loop?
This is what I have tried:
int i, j;
#pragma omp parallel for
for (i = 0; i < a.size(); i++) {
for (j = 0; j < b.size(); j++)
stack[i + j] += charToInt(a[i]) * charToInt(b[j]);
}
With the pragma, the algorithm stops working properly. Any advice would be appreciated.
Edit:
This variant works correctly, but with omp parallel for the benchmark shows it is 15x-20x slower than without it (CPU: M1 Pro, 8 cores).
#pragma omp parallel for schedule(dynamic)
for (int k = 0; k < a.size() + b.size(); k++) {
for (int i = 0; i < a.size(); i++) {
int j = k - i;
if (j >= 0 && j < b.size()) {
stack[k] += charToInt(a[i]) * charToInt(b[j]);
}
}
}
This is the part of my program where multiplication is called most often (a Miller-Rabin test):
BigInt modularExponentiation(BigInt base, BigInt exponent, BigInt mod) {
BigInt x = B_ONE; // 1
BigInt y = base;
while (exponent > B_ZERO) { // while exponent > 0
if (isOdd(exponent))
x = (x * y) % mod;
y = (y * y) % mod;
exponent /= B_TWO; // exponent /= 2
}
return (x % mod);
};
bool isMillerRabinTestOk(BigInt candidate) {
if (candidate < B_TWO)
return false;
if (candidate != B_TWO && isEven(candidate))
return false;
BigInt canditateMinusOne = candidate - B_ONE;
BigInt s = canditateMinusOne;
while (isEven(s))
s /= B_TWO;
for (int i = 0; i < MILLER_RABIN_TEST_ITERATIONS; i++) {
BigInt a = BigInt(rand()) % canditateMinusOne + B_ONE;
BigInt temp = s;
BigInt mod = modularExponentiation(a, temp, candidate);
while (temp != canditateMinusOne && mod != B_ONE && mod != canditateMinusOne) {
mod = (mod * mod) % candidate;
temp *= B_TWO;
}
if (mod != canditateMinusOne && isEven(temp))
return false;
}
return true;
};
Your loops do not have the proper structure for parallelization. However, you can transform them:
for (k = 0; k < a.size() + b.size(); k++) {
    for (i = 0; i < a.size(); i++) {
        j = k - i;
        if (j >= 0 && j < b.size())      // stay inside b; k - i can fall outside its range
            stack[k] += a[i] * b[j];
    }
}
Now the outer loop has no conflicts: each iteration writes only to its own stack[k]. Look at this as a "coordinate transformation": you're still traversing the same i/j row/column space, but now in new coordinates, where k/i stands for diagonal/row.
By the way, this code is a little metaphorical. Check your loop bounds, and use the right multiplication; I'm just indicating the principle here.
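Following up on the edit above (the 15x-20x slowdown on the M1 Pro): for operands of only a few dozen digits, the cost of spinning up and scheduling threads dwarfs the arithmetic, so a common approach is to keep the loop serial for short numbers and only go parallel above a size threshold using OpenMP's if clause. A sketch under those assumptions (the 512-digit threshold is a placeholder to be tuned, and charToInt is assumed to be a plain '0' subtraction):

#include <vector>

void multiplyDigits(const std::vector<char>& a, const std::vector<char>& b,
                    std::vector<int>& stack)         // stack pre-sized to a.size() + b.size() + 1
{
    const int total = static_cast<int>(a.size() + b.size());
    #pragma omp parallel for schedule(static) if(total > 512)
    for (int k = 0; k < total; k++) {                // each k owns stack[k], so there is no data race
        int sum = 0;
        for (int i = 0; i < static_cast<int>(a.size()); i++) {
            const int j = k - i;
            if (j >= 0 && j < static_cast<int>(b.size()))
                sum += (a[i] - '0') * (b[j] - '0');  // assumed equivalent of charToInt
        }
        stack[k] += sum;
    }
}

Accumulating into a local sum and writing stack[k] once at the end also avoids false sharing between threads.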
I need to make a Pascal's triangle matrix using vectors and then print it.
This algorithm would work with plain arrays, but somehow it doesn't work with a matrix built from vectors.
#include <iomanip>
#include <iostream>
#include <vector>
typedef std::vector<std::vector<int>> Matrix;
int NumberOfRows(Matrix m) { return m.size(); }
int NumberOfColumns(Matrix m) {
if (m.size() != 0)
return m[0].size();
return 0;
}
Matrix PascalTriangle(int n) {
Matrix mat;
int a;
for (int i = 1; i <= n; i++) {
a = 1;
for (int j = 1; j <= i; j++) {
if (j == 1)
mat.push_back(j);
else
mat.push_back(a);
a = a * (i - j) / j;
}
}
return mat;
}
void PrintMatrix(Matrix m, int width) {
for (int i = 0; i < NumberOfRows(m); i++) {
for (int j = 0; j < NumberOfColumns(m); j++)
std::cout << std::setw(width) << m[i][j];
std::cout << std::endl;
}
}
int main() {
Matrix m = PascalTriangle(7);
PrintMatrix(m, 10);
return 0;
}
I get nothing on screen. I also have the same program written without vectors, and that version works fine.
Could you help me fix this code?
The main problem is that in PascalTriangle, you are starting out with an empty Matrix in both the number of rows and columns.
Since my comments mentioned push_back, here is the way to use it if you did not initialize the Matrix with the number of elements that are passed in.
The other issue is that NumberOfColumns should specify the row, not just the matrix vector.
The final issue is that you should be passing the Matrix by const reference, not by value.
Addressing all of these issues results in this:
Matrix PascalTriangle(int n)
{
Matrix mat;
for (int i = 0; i < n; i++)
{
mat.push_back({}); // creates a new empty row
std::vector<int>& newRow = mat.back(); // get reference to this row
int a = 1;
for (int j = 0; j < i + 1; j++)
{
if (j == 0)
newRow.push_back(1);
else
newRow.push_back(a);
a = a * (i - j) / (j + 1);
}
}
return mat;
}
And then in NumberOfColumns:
int NumberOfColumns(const Matrix& m, int row)
{
if (!m.empty())
return m[row].size();
return 0;
}
And then, NumberOfRows:
int NumberOfRows(const Matrix& m) { return m.size(); }
And last, PrintMatrix:
void PrintMatrix(const Matrix& m, int width)
{
for (int i = 0; i < NumberOfRows(m); i++)
{
for (int j = 0; j < NumberOfColumns(m, i); j++)
std::cout << std::setw(width) << m[i][j];
std::cout << std::endl;
}
}
Your code won't compile because you have numerous errors in PascalTriangle.
For one, you initialize a matrix with no elements. Additionally, you use matrix indices starting at 1 rather than 0.
The following prints things for me:
Matrix PascalTriangle(int n) {
Matrix mat(n, std::vector<int>(n, 0)); // Construct Matrix Properly
int a;
for (int i = 0; i < n; i++) { // Start index at 0
a = 1;
for (int j = 0; j < i + 1; j++) { // Start index at 0
if (j == 0) // Changed 1 to 0
mat[i][j] = 1;
else
mat[i][j] = a;
a = a * (i - j) / (j+1); // Changed j to j+1 since j starts at 0
}
}
return mat;
}
I was performance profiling our library and noticed that most time is spent in matrix manipulations.
I wanted to see whether I could improve performance by changing the order of the matrix loops or by changing the matrix class definition from row major to column major.
Questions:
Below I test 2 cases. Test case 1 is always the fastest, no matter whether my matrix is row or column major. Why is that?
Turning on vectorization improves Test case 1 by a factor of 2. Why is that?
Performance profiling is done with Very Sleepy.
I used Visual Studio 2019 (platform toolset v142) and compiled in 32-bit.
Our library defines a matrix template whose underlying storage is a dynamic array in column-major order (full code follows below):
Type& operator()(int row, int col)
{
return pArr[row + col * m_rows];
}
Type operator()(int row, int col) const
{
return pArr[row + col * m_rows];
}
We also have a matrix class specific for doubles:
class DMatrix : public TMatrix<double>
{
public:
// Constructors:
DMatrix() : TMatrix<double>() { }
DMatrix(int rows, int cols) : TMatrix<double>(rows, cols, true) {}
};
I ran 2 test cases that perform nested loop operations on randomly filled matrices. The difference between Test case 1 and 2 is the order of the inner loops.
int nrep = 10000; // Large number of calculations
int nstate = 400;
int nstep = 400;
int nsec = 3; // 100 times smaller than nstate and nstep
DMatrix value(nstate, nsec);
DMatrix Rc(nstate, 3 * nstep);
DMatrix rhs(nstate, nsec);
// Test case 1
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int i = 1; i < nstate; i++) {
for (int j = 0; j < nsec; j++) {
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
// Test case 2
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int j = 0; j < nsec; j++) {
for (int i = 1; i < nstate; i++) {
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
Since the matrix is column major, I expected that I would get the best performance when the inner loop follows a column, due to nearby elements being CPU cached, but instead it is doing the opposite. Note that nstep and nstate are typically 100 times larger than nsec.
When I turn on vectorization ("Advanced Vector Extensions 2" in Code Generation / Enable Enhanced Instruction Set), the performance difference gets even larger.
When I turn off the vectorization and make the matrix row major:
Type& operator()(int row, int col)
{
return pArr[col + row*m_cols];
}
Type operator()(int row, int col) const
{
return pArr[col + row*m_cols];
}
I don't get any difference in performance compared to when the matrix was column major, with or without vector optimizations.
The full code. matrix.h:
#ifndef __MATRIX_H
#define __MATRIX_H
#include <assert.h>
#include <iostream>
template<class Type>
class TMatrix
{
public:
TMatrix(); // Default constructor
TMatrix(int rows, int cols, bool init = false); // Constructor with dimensions + flag to default initialize or not
TMatrix(const TMatrix& mat); // Copy constructor
TMatrix& operator=(const TMatrix& mat); // Assignment operator
~TMatrix(); // Destructor
// Move constructor/assignment
TMatrix(TMatrix&& mat) noexcept;
TMatrix& operator=(TMatrix&& mat) noexcept;
// Get matrix dimensions
int no_rows() const { return m_rows; }
int no_columns() const { return m_cols; }
Type& operator()(int row, int col)
{
assert(row >= 0 && row < m_rows && col >= 0 && col < m_cols);
return pArr[row + col * m_rows]; // elements in a column lay next to each other
//return pArr[col + row*m_cols]; // elements in a row lay next to each other
}
Type operator()(int row, int col) const
{
assert(row >= 0 && row < m_rows && col >= 0 && col < m_cols);
return pArr[row + col * m_rows];
// return pArr[col + row*m_cols];
}
protected:
void clear();
Type* pArr;
int m_rows, m_cols;
};
//**************************************************************
// Implementation of TMatrix
//**************************************************************
// Default constructor
template<class Type>
TMatrix<Type>::TMatrix()
{
m_rows = 0;
m_cols = 0;
pArr = 0;
}
// Constructor with matrix dimensions (rows, cols)
template<class Type>
TMatrix<Type>::TMatrix(int rows, int cols, bool init)
{
pArr = 0;
m_rows = rows;
m_cols = cols;
if (m_rows > 0 && m_cols > 0)
if (init)
pArr = new Type[m_rows * m_cols]();
else
pArr = new Type[m_rows * m_cols]; // TODO: check for p = NULL (memory allocation error, which will triger a GPF)
else
{
m_rows = 0;
m_cols = 0;
}
}
// Copy constructor
template<class Type>
TMatrix<Type>::TMatrix(const TMatrix& mat)
{
pArr = 0;
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
int dim = m_rows * m_cols;
pArr = new Type[dim];
for (int i = 0; i < dim; i++)
pArr[i] = mat.pArr[i];
}
else
{
m_rows = m_cols = 0;
}
}
// Move constructors
template<class Type>
TMatrix<Type>::TMatrix(TMatrix&& mat) noexcept
{
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
pArr = mat.pArr;
}
else
{
m_rows = m_cols = 0;
pArr = 0;
}
mat.pArr = 0;
}
// Clear the matrix
template<class Type>
void TMatrix<Type>::clear()
{
delete[] pArr;
pArr = 0;
m_rows = m_cols = 0;
}
// Destructor
template<class Type>
TMatrix<Type>::~TMatrix()
{
clear();
}
// Move assignment
template<class Type>
TMatrix<Type>& TMatrix<Type>::operator=(TMatrix&& mat) noexcept
{
if (this != &mat) // Check for self assignment
{
clear();
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
pArr = mat.pArr;
}
else
{
m_rows = m_cols = 0;
}
mat.pArr = nullptr;
}
return *this;
}
// Assignment operator with check for self-assignment
template<class Type>
TMatrix<Type>& TMatrix<Type>::operator=(const TMatrix& mat)
{
if (this != &mat) // Guard against self assignment
{
clear();
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
int dim = m_rows * m_cols;
pArr = new Type[dim];
for (int i = 0; i < dim; i++)
pArr[i] = mat.pArr[i];
}
else
{
m_rows = m_cols = 0;
}
}
return *this;
}
#endif
dmatrix.h:
#ifndef __DMATRIX_H
#define __DMATRIX_H
#include "matrix.h"
class DMatrix : public TMatrix<double>
{
public:
// Constructors:
DMatrix() : TMatrix<double>() { }
DMatrix(int rows, int cols) : TMatrix<double>(rows, cols, true) {}
};
#endif
Main:
#include <iostream>
#include "dmatrix.h"
int main()
{
int nrep = 10000; // Large number of calculations
int nstate = 400;
int nstep = 400;
int nsec = 3; // 100 times smaller than nstate and nstep
DMatrix value(nstate, nsec);
DMatrix Rc(nstate, 3 * nstep);
DMatrix rhs(nstate, nsec);
// Give some random input
for (int i = 0; i < Rc.no_rows(); i++) {
for (int j = 0; j < Rc.no_columns(); j++) {
Rc(i, j) = double(std::rand()) / RAND_MAX;
}
}
for (int i = 0; i < value.no_rows(); i++) {
for (int j = 0; j < value.no_columns(); j++) {
value(i, j) = 1 + double(std::rand()) / RAND_MAX;
}
}
for (int i = 0; i < rhs.no_rows(); i++) {
for (int j = 0; j < rhs.no_columns(); j++) {
rhs(i, j) = 1 + double(std::rand()) / RAND_MAX;
}
}
// Test case 1
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int i = 1; i < nstate; i++) {
for (int j = 0; j < nsec; j++) { // Expectation: this is fast - inner loop follows row
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
// Test case 2
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int j = 0; j < nsec; j++) {
for (int i = 1; i < nstate; i++) { // Expectation: this is slow - inner loop walks down column
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
return 0;
}
Thanks in advance for your help.
Best regards,
Nele
As I mentioned in a comment, after some testing:
Rc is the largest matrix here (by roughly a factor of 100), and it is reasonable to assume that most of the running time is spent on handling it. When the inner loop is on j, you get significant improvement because Rc(i, diag - 1) and Rc(i, diag) can be reused in all iterations of the inner loop.
To make sure that this is the case, I changed the loops to the following:
// Test case 1
for (int k = 0; k < nrep; k++) {
for (int i = 1; i < nstate; i++) {
for (int j = 0; j < nsec; j++) { // Expectation: this is fast - inner loop follows row
value(i, j) = (rhs(i, j) - value(i - 1, j));
}
}
}
// Test case 2
for (int k = 0; k < nrep; k++) {
for (int j = 0; j < nsec; j++) {
for (int i = 1; i < nstate; i++) { // Expectation: this is slow - inner loop walks down column
value(i, j) = (rhs(i, j) - value(i - 1, j)) ;
}
}
}
With this calculation (and different matrix sizes - 2000 by 2000, for 200 repetitions), one test case runs 10 times faster than the other (no fancy profiling, but Linux's time gives 18s vs. ~2s).
When I switch between row-major and column-major, the trend is reversed.
EDIT:
Conclusion - you need to select row-major/column-major based on what works best for Rc, and always use Test case 1 (if this represents the problems you're actually trying to solve).
Regarding vectorization - I'm not sure how this works. Maybe someone else can offer an explanation.
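As an illustration of that reuse, here is the original Test case 1 block (a drop-in replacement for the loop in the question's main) with the two Rc loads hoisted out of the inner j loop into locals. An optimizing compiler will often do this anyway, but writing it out makes the effect explicit:

// Test case 1 with the Rc accesses hoisted out of the inner loop
for (int k = 0; k < nrep; k++) {
    for (int n = 0; n < nstep; n++) {
        int diag = 3 * n + 1;
        for (int i = 1; i < nstate; i++) {
            const double rcLower = Rc(i, diag - 1);   // read once per row
            const double rcDiag  = Rc(i, diag);
            for (int j = 0; j < nsec; j++) {
                value(i, j) = (rhs(i, j) - rcLower * value(i - 1, j)) / rcDiag;
            }
        }
    }
}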
I have a task to calculate a scalar product
s=(B*(r+q+r), A*A*p)
As I understand it, I need to calculate 2 vectors: first B*(r+q+r), second A*A*p, and then compute their scalar product.
#include <iostream>
#include <vector>
using namespace std;
using matrix = vector<vector<double>>;
matrix add(matrix A, matrix B) {
matrix C;
C.resize(A.size());
for (int i = 0; i< A.size(); i++) {
C[i].resize(B.size());
for (int j = 0; j < B.size(); j++) {
C[i][j] = A[i][j] + B[i][j];
}
}
return C;
}
matrix multiple(matrix A, matrix B)
{
matrix C;
C.reserve(100);
C.resize(B.size());
for (int i = 0; i < A.size(); i++) {
C[i].resize(B.size());
for (int j = 0; j < B.size(); j++) {
for (int k = 0; k < B.size(); k++)
C[i][j] += A[i][k] * B[k][j];
}
}
return C;
}
void main() {
matrix A = { {1,2,3}, {1,2,1}, {3,2,0} };
matrix B = { {4,1,2},{0,4,3},{1,1,1} };
matrix r = { {-0.7f, 1.3, 0.2} };
matrix q = { { -1.6f, 0.8, 1.1} };
matrix p = { {0.1, 1.7, -1.5} };
matrix r_q = add(r, q);
for (int i = 0; i < r_q.size(); i++) {
for (int j = 0; j < r_q.size(); j++) {
cout << r_q[i][j] << "\t";
}
cout << "\n";
}
matrix a_a = multiple(A, A);
matrix a_a_p = multiple(a_a,p);
getchar();
}
Problems:
The add method does not work correctly; it puts only one number in the result, the sum of the first items.
Multiplying matrices with the same dimensions (A*A) works correctly. Multiplying matrices with different dimensions (a_a * p) returns the error "vector subscript out of range".
Thanks for any advice.
The OP chose to implement both matrices and vectors using a std::vector<std::vector<double>>.
This may not be a good design choice in general; in particular, to be consistent with the mathematical meaning of all the involved operations, all the vectors should be treated (and declared) as "column" matrices (i.e. Nx1 matrices):
matrix r = { {-0.7}, {1.3}, {0.2} };
matrix q = { {-1.6}, {0.8}, {1.1} };
matrix p = { {0.1}, {1.7}, {-1.5} };
Then, in the functions that perform the calculations, special attention should be paid to the correct sizes of rows and columns to avoid out of bounds accesses.
matrix add(matrix const &A, matrix const &B)
{
if (A.size() != B.size() || A.size() == 0)
throw std::runtime_error("number of rows mismatch");
size_t columns = A[0].size();
matrix C(A.size(), std::vector<double>(columns, 0.0));
for (size_t i = 0; i < A.size(); i++)
{
if ( A[i].size() != columns || B[i].size() != columns )
throw std::runtime_error("number of columns mismatch");
for (size_t j = 0; j < columns; j++)
{
C[i][j] = A[i][j] + B[i][j];
}
}
return C;
}
matrix multiple(matrix const &A, matrix const &B)
{
if ( A.size() == 0 || B.size() == 0 || B[0].size() == 0)
throw std::runtime_error("size mismatch");
size_t columns = B[0].size();
matrix C(A.size(), std::vector<double>(columns, 0.0));
for (size_t i = 0; i < A.size(); i++)
{
if ( A[i].size() != B.size() || B[i].size() != columns )
throw std::runtime_error("inner size mismatch");
for (size_t j = 0; j < columns; j++)
{
for (size_t k = 0; k < B.size(); k++)
C[i][j] += A[i][k] * B[k][j];
}
}
return C;
}
The compiler should have also warned the OP about the incorrect use of void main instead of int main and about the comparisons between signed and unsigned integer expressions (I used size_t instead of int).
From a mathematical point of view, it's worth noting that to solve the OP's problem, that is to calculate the scalar product s = (B(r+q+r), AAp), the operations really needed (to be implemented) are the sum of vectors, the product of a matrix by a vector (easier and more efficient than matrix multiplication) and the dot product of two vectors:
t = r + q + r
b = Bt
u = Ap
a = Au
s = (b, a)
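A minimal sketch of that plan, using plain std::vector<double> for the vectors (the function names are illustrative, not part of the OP's code):

#include <vector>

using vec = std::vector<double>;
using matrix = std::vector<std::vector<double>>;

// w = x + y
vec addVec(const vec& x, const vec& y) {
    vec w(x.size());
    for (size_t i = 0; i < x.size(); i++)
        w[i] = x[i] + y[i];
    return w;
}

// u = M * v  (matrix times column vector)
vec matVec(const matrix& M, const vec& v) {
    vec u(M.size(), 0.0);
    for (size_t i = 0; i < M.size(); i++)
        for (size_t k = 0; k < v.size(); k++)
            u[i] += M[i][k] * v[k];
    return u;
}

// (x, y)
double dot(const vec& x, const vec& y) {
    double s = 0.0;
    for (size_t i = 0; i < x.size(); i++)
        s += x[i] * y[i];
    return s;
}

// s = (B*(r+q+r), A*A*p)
double scalarProduct(const matrix& A, const matrix& B,
                     const vec& r, const vec& q, const vec& p) {
    vec t = addVec(addVec(r, q), r);   // t = r + q + r
    vec b = matVec(B, t);              // b = B t
    vec a = matVec(A, matVec(A, p));   // a = A (A p)
    return dot(b, a);                  // s = (b, a)
}

Each matrix-vector product is O(N^2) rather than the O(N^3) of a full matrix-matrix product, which is why computing A(Ap) is preferable to forming A*A first.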