I was performance profiling our library and noticed that most time is spent in matrix manipulations.
I wanted to see whether I could improve performance by changing the order of the matrix loops or by changing the matrix class definition from row major to column major.
Questions:
Below I test 2 cases. Test case 1 is always the fastest, no matter whether my matrix is row or columns major. Why is that?
Turning on vectorization improves Test case 1 with a factor 2, why is that?
Performance profiling is done with Very Sleepy.
I used Visual Studio 2019 – platformtoolset v142, and compiled in 32-bit.
Our library defines a matrix template where the underlying is a dynamic array where the ordering is column major (full code follows below):
Type& operator()(int row, int col)
{
return pArr[row + col * m_rows];
}
Type operator()(int row, int col) const
{
return pArr[row + col * m_rows];
}
We also have a matrix class specific for doubles:
class DMatrix : public TMatrix<double>
{
public:
// Constructors:
DMatrix() : TMatrix<double>() { }
DMatrix(int rows, int cols) : TMatrix<double>(rows, cols, true) {}
};
I ran 2 test cases that perform nested loop operations on randomly filled matrices. The difference between Test case 1 and 2 is the order of the inner loops.
int nrep = 10000; // Large number of calculations
int nstate = 400;
int nstep = 400;
int nsec = 3; // 100 times smaller than nstate and nstep
DMatrix value(nstate, nsec);
DMatrix Rc(nstate, 3 * nstep);
DMatrix rhs(nstate, nsec);
// Test case 1
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int i = 1; i < nstate; i++) {
for (int j = 0; j < nsec; j++) {
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
// Test case 2
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int j = 0; j < nsec; j++) {
for (int i = 1; i < nstate; i++) {
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
Since the matrix is column major, I expected that I would get the best performance when the inner loop follows a column, due to nearby elements being CPU cached, but instead it is doing the opposite. Note that nstep and nstate are typically 100 times larger than nsec.
When I turn on vectorization:
“Advanced Vector Extensions 2” in Code Generation/Enable Enhanced Instruction Set, the performance difference gets even larger:
When I turn off the vectorization and make the matrix row major:
Type& operator()(int row, int col)
{
return pArr[col + row*m_cols];
}
Type operator()(int row, int col) const
{
return pArr[col + row*m_cols];
}
I don’t get any difference in performance compared to when the matrix was column major:
With vector optimizations:
The full code. matrix.h:
#ifndef __MATRIX_H
#define __MATRIX_H
#include <assert.h>
#include <iostream>
template<class Type>
class TMatrix
{
public:
TMatrix(); // Default constructor
TMatrix(int rows, int cols, bool init = false); // Constructor with dimensions + flag to default initialize or not
TMatrix(const TMatrix& mat); // Copy constructor
TMatrix& operator=(const TMatrix& mat); // Assignment operator
~TMatrix(); // Destructor
// Move constructor/assignment
TMatrix(TMatrix&& mat) noexcept;
TMatrix& operator=(TMatrix&& mat) noexcept;
// Get matrix dimensions
int no_rows() const { return m_rows; }
int no_columns() const { return m_cols; }
Type& operator()(int row, int col)
{
assert(row >= 0 && row < m_rows&& col >= 0 && col < m_cols);
return pArr[row + col * m_rows]; // elements in a column lay next to each other
//return pArr[col + row*m_cols]; // elements in a row lay next to each other
}
Type operator()(int row, int col) const
{
assert(row >= 0 && row < m_rows&& col >= 0 && col < m_cols);
return pArr[row + col * m_rows];
// return pArr[col + row*m_cols];
}
protected:
void clear();
Type* pArr;
int m_rows, m_cols;
};
//**************************************************************
// Implementation of TMatrix
//**************************************************************
// Default constructor
template<class Type>
TMatrix<Type>::TMatrix()
{
m_rows = 0;
m_cols = 0;
pArr = 0;
}
// Constructor with matrix dimensions (rows, cols)
template<class Type>
TMatrix<Type>::TMatrix(int rows, int cols, bool init)
{
pArr = 0;
m_rows = rows;
m_cols = cols;
if (m_rows > 0 && m_cols > 0)
if (init)
pArr = new Type[m_rows * m_cols]();
else
pArr = new Type[m_rows * m_cols]; // TODO: check for p = NULL (memory allocation error, which will triger a GPF)
else
{
m_rows = 0;
m_cols = 0;
}
}
// Copy constructor
template<class Type>
TMatrix<Type>::TMatrix(const TMatrix& mat)
{
pArr = 0;
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
int dim = m_rows * m_cols;
pArr = new Type[dim];
for (int i = 0; i < dim; i++)
pArr[i] = mat.pArr[i];
}
else
{
m_rows = m_cols = 0;
}
}
// Move constructors
template<class Type>
TMatrix<Type>::TMatrix(TMatrix&& mat) noexcept
{
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
pArr = mat.pArr;
}
else
{
m_rows = m_cols = 0;
pArr = 0;
}
mat.pArr = 0;
}
// Clear the matrix
template<class Type>
void TMatrix<Type>::clear()
{
delete[] pArr;
pArr = 0;
m_rows = m_cols = 0;
}
// Destructor
template<class Type>
TMatrix<Type>::~TMatrix()
{
clear();
}
// Move assignment
template<class Type>
TMatrix<Type>& TMatrix<Type>::operator=(TMatrix&& mat) noexcept
{
if (this != &mat) // Check for self assignment
{
clear();
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
pArr = mat.pArr;
}
else
{
m_rows = m_cols = 0;
}
mat.pArr = nullptr;
}
return *this;
}
// Assignment operator with check for self-assignment
template<class Type>
TMatrix<Type>& TMatrix<Type>::operator=(const TMatrix& mat)
{
if (this != &mat) // Guard against self assignment
{
clear();
m_rows = mat.m_rows;
m_cols = mat.m_cols;
if (m_rows > 0 && m_cols > 0)
{
int dim = m_rows * m_cols;
pArr = new Type[dim];
for (int i = 0; i < dim; i++)
pArr[i] = mat.pArr[i];
}
else
{
m_rows = m_cols = 0;
}
}
return *this;
}
#endif
dmatrix.h:
#ifndef __DMATRIX_H
#define __DMATRIX_H
#include "matrix.h"
class DMatrix : public TMatrix<double>
{
public:
// Constructors:
DMatrix() : TMatrix<double>() { }
DMatrix(int rows, int cols) : TMatrix<double>(rows, cols, true) {}
};
#endif
Main:
#include <iostream>
#include "dmatrix.h"
int main()
{
int nrep = 10000; // Large number of calculations
int nstate = 400;
int nstep = 400;
int nsec = 3; // 100 times smaller than nstate and nstep
DMatrix value(nstate, nsec);
DMatrix Rc(nstate, 3 * nstep);
DMatrix rhs(nstate, nsec);
// Give some random input
for (int i = 0; i < Rc.no_rows(); i++) {
for (int j = 0; j < Rc.no_columns(); j++) {
Rc(i, j) = double(std::rand()) / RAND_MAX;
}
}
for (int i = 0; i < value.no_rows(); i++) {
for (int j = 0; j < value.no_columns(); j++) {
value(i, j) = 1 + double(std::rand()) / RAND_MAX;
}
}
for (int i = 0; i < rhs.no_rows(); i++) {
for (int j = 0; j < rhs.no_columns(); j++) {
rhs(i, j) = 1 + double(std::rand()) / RAND_MAX;
}
}
// Test case 1
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int i = 1; i < nstate; i++) {
for (int j = 0; j < nsec; j++) { // Expectation: this is fast - inner loop follows row
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
// Test case 2
for (int k = 0; k < nrep; k++) {
for (int n = 0; n < nstep; n++) {
int diag = 3 * n + 1;
for (int j = 0; j < nsec; j++) {
for (int i = 1; i < nstate; i++) { // Expectation: this is slow - inner loop walks down column
value(i, j) = (rhs(i, j) - Rc(i, diag - 1) * value(i - 1, j)) / Rc(i, diag);
}
}
}
}
return 0;
}
Thanks in advance for your help.
Best regards,
Nele
As I mentioned in a comment, after some testing:
Rc is the largest matrix here (by roughly a factor of 100), and it is reasonable to assume that most of the running time is spent on handling it. When the inner loop is on j, you get significant improvement because Rc(i, diag - 1) and Rc(i, diag) can be reused in all iterations of the inner loop.
To make sure that this is the case, I changed the loops to the following:
// Test case 1
for (int k = 0; k < nrep; k++) {
for (int i = 1; i < nstate; i++) {
for (int j = 0; j < nsec; j++) { // Expectation: this is fast - inner loop follows row
value(i, j) = (rhs(i, j) - value(i - 1, j));
}
}
}
// Test case 2
for (int k = 0; k < nrep; k++) {
for (int j = 0; j < nsec; j++) {
for (int i = 1; i < nstate; i++) { // Expectation: this is slow - inner loop walks down column
value(i, j) = (rhs(i, j) - value(i - 1, j)) ;
}
}
}
With this calculation (and different matrix sizes - 2000 by 2000, for 200 repetitions), one test case runs 10 times faster than the other (no fancy profiling, but linux's time gives 18s vs. ~2s).
When I change row-major and column-major the trend is reversed.
EDIT:
Conclusion - you need to select row-major/column-major based on what workes best for Rc, and always use Test case 1 (if this represents the problems you're actually trying to solve).
Regarding vectorization - I'm not sure how this works. Maybe someone else can offer an explanation.
Related
I need make Pascal Triangle matrix using vectors and then print it.
This algorithm would work with arrays, but somehow it doesn't work with matrix using vectors.
#include <iomanip>
#include <iostream>
#include <vector>
typedef std::vector<std::vector<int>> Matrix;
int NumberOfRows(Matrix m) { return m.size(); }
int NumberOfColumns(Matrix m) {
if (m.size() != 0)
return m[0].size();
return 0;
}
Matrix PascalTriangle(int n) {
Matrix mat;
int a;
for (int i = 1; i <= n; i++) {
a = 1;
for (int j = 1; j <= i; j++) {
if (j == 1)
mat.push_back(j);
else
mat.push_back(a);
a = a * (i - j) / j;
}
}
return mat;
}
void PrintMatrix(Matrix m, int width) {
for (int i = 0; i < NumberOfRows(m); i++) {
for (int j = 0; j < NumberOfColumns(m); j++)
std::cout << std::setw(width) << m[i][j];
std::cout << std::endl;
}
}
int main() {
Matrix m = PascalTriangle(7);
PrintMatrix(m, 10);
return 0;
}
I get nothing on screen, and here's the same code just without matrix using vectors program (which works fine).
Could you help me fix this code?
The main problem is that in PascalTriangle, you are starting out with an empty Matrix in both the number of rows and columns.
Since my comments mentioned push_back, here is the way to use it if you did not initialize the Matrix with the number of elements that are passed in.
The other issue is that NumberOfColumns should specify the row, not just the matrix vector.
The final issue is that you should be passing the Matrix by const reference, not by value.
Addressing all of these issues, results in this:
Matrix PascalTriangle(int n)
{
Matrix mat;
for (int i = 0; i < n; i++)
{
mat.push_back({}); // creates a new empty row
std::vector<int>& newRow = mat.back(); // get reference to this row
int a = 1;
for (int j = 0; j < i + 1; j++)
{
if (j == 0)
newRow.push_back(1);
else
newRow.push_back(a);
a = a * (i - j) / (j + 1);
}
}
return mat;
}
And then in NumberOfColumns:
int NumberOfColumns(const Matrix& m, int row)
{
if (!m.empty())
return m[row].size();
return 0;
}
And then, NumberOfRows:
int NumberOfRows(const Matrix& m) { return m.size(); }
And last, PrintMatrix:
void PrintMatrix(const Matrix& m, int width)
{
for (int i = 0; i < NumberOfRows(m); i++)
{
for (int j = 0; j < NumberOfColumns(m, i); j++)
std::cout << std::setw(width) << m[i][j];
std::cout << std::endl;
}
}
Here is a live demo
Your code won't compile because you have numerous errors in PascalTriangle.
For one, you initialize a matrix with no elements. Additionally, you use matrix indices starting at 1 rather than 0.
The following prints things for me:
Matrix PascalTriangle(int n) {
Matrix mat(n, std::vector<int>(n, 0)); // Construct Matrix Properly
int a;
for (int i = 0; i < n; i++) { // Start index at 0
a = 1;
for (int j = 0; j < i + 1; j++) { // Start index at 0
if (j == 0) // Changed 1 to 0
mat[i][j] = 1;
else
mat[i][j] = a;
a = a * (i - j) / (j+1); // Changed j to j+1 since j starts at 0
}
}
return mat;
}
I tried to make the Matrix header file. I also gave it a go at representing the matrix with a single array. Every method I wrote worked just fine. On the contrary, I couldn't figure out the problem with the matrix multiplication.
It seems fine though. I have stripped some code to make it small.
any help is appreciated.
#ifndef MATRIX_HPP
#define MATRIX_HPP
#include <iostream>
#include <random>
#include <chrono>
#include <ctime>
#include <iomanip>
using std::cout;
using std::endl;
using std::ios;
static std::ostream & pretty_print(std::ostream & output) {
output.setf(ios::showpoint);
output.setf(ios::showpos);
output.width(6);
output.precision(2);
return output;
}
class Matrix {
public:
// Constructor functions
Matrix();
Matrix(size_t r, size_t c, double v = 0);
Matrix(size_t r, size_t c, double *array);
Matrix(const Matrix &mat);
// helping functions
void randomize(double a = -1, double b = 1);
void addMat(const Matrix &mat);
void subMat(const Matrix &mat);
void multiply_matrix(const Matrix &mat);
Matrix transpose();
double* toArray();
void display();
// Static functions
static Matrix matMul(const Matrix &mat1, const Matrix &mat2);
static Matrix transpose(const Matrix &mat);
static Matrix fromArray(double *arr, size_t size);
// Overloaded operator functions and some friend functions
/*friend ostream& operator<<(ostream &dout, Matrix &mat);
friend istream& operator>>(istream &din, Matrix &mat);*/
private:
size_t rows;
size_t cols;
double *matrix;
// Random number engine
static uint32_t generate_seed();
static double get_random(double a,double b);
};
// Private static functions
uint32_t Matrix::generate_seed() {
{
std::random_device random;
if (random.entropy() > 0.0) {
return random();
}
}
return std::chrono::high_resolution_clock::now().time_since_epoch().count();
}
//--------------------------------------------------------------------
double Matrix::get_random(double a, double b) {
static std::mt19937 random(Matrix::generate_seed());
std::uniform_real_distribution<double> double_dist{a, b};
return double_dist(random);
}
//--------------------------------------------------------------------
Matrix::Matrix() {
rows = 0;
cols = 0;
matrix = new double[rows * cols];
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
*(matrix + i * cols + j) = 0;
}
}
}
//--------------------------------------------------------------------
Matrix::Matrix::Matrix(size_t r, size_t c, double v) {
rows = r;
cols = c;
matrix = new double[rows * cols];
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
*(matrix + i * cols + j) = v;
}
}
}
//--------------------------------------------------------------------
Matrix::Matrix(size_t r, size_t c, double *array) {
rows = r;
cols = c;
matrix = new double[rows * cols];
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
*(matrix + i * cols + j) = *(array + i * cols + j);
}
}
}
//----------------------------------------------------------------
Matrix::Matrix(const Matrix &mat) {
rows = mat.rows;
cols = mat.cols;
matrix = new double[rows * cols];
for (size_t i = 0; i < rows; ++i) {
for (size_t j = 0; j < cols; ++j) {
*(matrix + i * cols + j) = *(mat.matrix + i * rows + cols);
}
}
}
//------------------------------------------------------------------
void Matrix::randomize(double a, double b) {
for (size_t i = 0; i < rows; i++) {
for (size_t j = 0; j < cols; j++) {
*(matrix + i * cols + j) = Matrix::get_random(a, b);
}
}
}
//-----------------------------------------------------------------------
Matrix Matrix::transpose() {
Matrix result(cols, rows);
for (size_t i = 0; i < result.rows; i++) {
for (size_t j = 0; j < result.cols; j++) {
*(result.matrix + i * result.cols + j) = *(matrix + j * cols + i);
}
}
return result;
}
//-----------------------------------------------------------------------
void Matrix::display() {
cout<<"[";
for (size_t i = 0; i < rows; i++) {
cout<<"[";
for (size_t j = 0; j < cols; j++) {
if (j != cols - 1) {
cout<<pretty_print<<*(matrix + i * cols + j)<<", ";
} else if (i != rows - 1 && j == cols - 1) {
cout<<pretty_print<<*(matrix + i * cols + j)<<"],"<<endl<<" ";
} else if (i == rows - 1 && j == cols - 1) {
cout<<pretty_print<<*(matrix + i * cols + j)<<"]]"<<endl;
}
}
}
cout<<endl;
}
//-----------------------------------------------------------------------
Matrix Matrix::matMul(const Matrix& mat1, const Matrix& mat2) {
if (mat1.cols == mat2.rows) {
Matrix result(mat1.rows, mat2.cols);
double sum; // mat1[i][k] * mat2[k][j];
for (size_t i = 0; i < result.rows; i++) {
for (size_t j = 0; j < result.cols; j++) {
sum = 0;
for (size_t k = 0; k < mat1.cols; k++) {
sum += *(mat1.matrix + i * mat1.cols + k) * *(mat2.matrix + k * mat2.cols + j);
}
*(result.matrix + i * result.cols + j) = sum;
}
}
return result;
} else {
cout<<"Matrix multiplication is not possible!"<<endl;
return Matrix();
}
}
I have two sorting methods: insertion sort and shell sort. Two of those working function I have adapted to C++ from plain C. The problem is that ins_sort function works just well and shell_sort fails. What reason for that can be?
bool less(QVector<int> &arr, int a, int b)
{
return arr[a] < arr[b];
}
// Performs swap on elements at a and b in QVector<int> arr
void qswap(QVector<int> &arr, int a, int b)
{
int temp = arr[a];
arr[a] = arr[b];
arr[b] = temp;
}
/* Failure is thrown in this method */
void shell_sort(GraphicsView &window, SwapManager &manager)
{
auto list = window.items();
QVector<int> arr;
for (auto item : list)
arr.push_back(static_cast<QGraphicsRectWidget*>(item)->m_number);
int N = arr.size();
int h = 1;
while (h < N/3) h = 3*h + 1;
while (h >= 1)
{
for (int i = h; i < N; ++i)
{
for (int j = i; less(arr, j, j-h) && j >= h; j -= h)
{
qswap(arr, j, j-h);
manager.addPair(j, j - h);
}
}
h /= 3;
}
}
And that one does well.
/* This method works just fine */
void ins_sort(GraphicsView &window, SwapManager &manager)
{
auto list = window.items();
int i, j;
QVector<int> arr;
for (auto item : list)
{
arr.push_back(static_cast<QGraphicsRectWidget*>(item)->m_number);
}
int N = arr.size();
for (i = 1; i < N; ++i)
{
for (j = i - 1; j != -1 && less(arr, j + 1, j); --j)
{
qswap(arr, j, j + 1);
manager.addPair(j, j + 1);
}
}
}
Debugger points to this piece of code in "qvector.h"
Q_ASSERT_X(i >= 0 && i < d->size, "QVector<T>::operator[]", "index out of range");
return data()[i]; }
In the for-loop condition there is sense to check j value before comparing items:
for (int j = i; j >= h && less(arr, j, j-h); j -= h)
I have a University assignment whereby I have a 1D array, containing 262144 values. I've created a matrix class which places these values into an object with the datasource being the double* list of 262144 values.
I need to be able to obtain a sub-matrix (which I'm able to do) from ANOTHER set of 262144 values (which I've also placed into a matrix object).
However, I'm having serious trouble and I've been trying so hard for the last 3 days to try and replace original matrix values from a sub-matrix. I've tried passing by reference, creating Matrix*'s. I've tried everything we've been taught and even researched a few more methods, all of which I haven't understood. I'll throw my code in here to see if anyone can explain a method to me which will be able to do this.
Matrix::Matrix()
{
"Matrix::Matrix() is invoked";
}
Matrix::Matrix(const Matrix& m)
{
"Matrix::Matrix(const Matrix&) is invoked";
_M = m._M;
_N = m._N;
_data = new double[_M*_N];
for (int i = 0; i < _M*_N; i++)
{
_data[i] = m._data[i];
}
}
Matrix::Matrix(int sizeR, int sizeC, double *input_data)
{
"Matrix::Matrix(int sizeR, int sizeC, double *input_data is invoked";
_M = sizeR;
_N = sizeC;
_data = new double[_M*_N];
for (int i = 0; i < _M*_N; i++)
{
_data[i] = input_data[i];
}
}
Matrix Matrix::get_Block(int start_row, int end_row, int start_coloumn, int end_coloumn)
{
int rows = (end_row - start_row);
int columns = (end_coloumn - start_coloumn);
int ctr = 0;
double *temp_Data = new double[rows*columns];
for (int x = start_row; x < (rows + start_row); x++)
{
for (int y = start_coloumn; y < (columns + start_coloumn); y++)
{
temp_Data[ctr] = get(x, y);
ctr++;
}
}
Matrix block(rows, columns, temp_Data);
delete[] temp_Data;
return block;
}
Matrix Matrix::operator+(const Matrix & other)
{
Matrix temp;
temp._M = other._M;
temp._N = other._N;
temp._data = new double[temp._M*temp._N];
for (int x = 0; x < (temp._M*temp._N); x++)
{
temp._data[x] = this->_data[x] + other._data[x];
}
return temp;
}
Matrix Matrix::operator*(const Matrix & other)
{
Matrix temp;
temp._M = other._M;
temp._N = other._N;
temp._data = new double[temp._M*temp._N];
for (int x = 0; x < (temp._M*temp._N); x++)
{
temp._data[x] = this->_data[x] * other._data[x];
}
return temp;
}
Matrix Matrix::operator-(const Matrix & other)
{
Matrix temp;
temp._M = other._M;
temp._N = other._N;
temp._data = new double[temp._M*temp._N];
for (int x = 0; x < (temp._M*temp._N); x++)
{
temp._data[x] = this->_data[x] - other._data[x];
}
return temp;
}
void Matrix::replace_Block(Matrix& noisy, Matrix& shuffled,int k, int j, int i)
{
int val_to_replace = 0;
for (int i = 0; i < 3 * 3; i++)
{
val_to_replace = shuffled.get(i, j);
noisy.set(i, j, val_to_replace);
}
}
void Matrix::set_Block(Matrix block, Matrix& Noisy, int start_row, int end_row)
{
int ctr = 0;
int ctr2 = 0;
int ctr3 = 0;
for (int i = 0; i < 3; i++)
{
Noisy._data[(start_row*_M)+i+4] = block.get(i, ctr);
ctr++;
}
for (int j = 0; j < 3; j++)
{
Noisy._data[((start_row + 1)*_M) + j + 3] = block.get(j, ctr2);
ctr2++;
}
for (int j = 0; j < 3; j++)
{
Noisy._data[((start_row + 1)*_M) + j + 2] = block.get(j, ctr3);
ctr3++;
}
}
double Matrix::get_Sum(Matrix m)
{
double total = 0;
short row = m.get_M();
short column = m.get_N();
for (int j = 0; j < row; j++)
{
for (int i = 0; i < column; i++)
{
total += m.get(j,i);
}
}
return total;
}
double Matrix::get_Sum(Matrix* m)
{
double total = 0;
short row = m->get_M();
short column = m->get_N();
for (int j = 0; j < row; j++)
{
for (int i = 0; i < column; i++)
{
total += m->get(i, j);
}
}
return total;
}
double Matrix::get(int i, int j)
{
return _data[(i * _M) + j];
}
void Matrix::write_Block(int i, int j)
{
for (int ctr = 0; ctr < i; ctr++)
{
for (int ctr2 = 0; ctr2 < j; ctr2++)
{
std::cout << " " << this->get(ctr,ctr2);
}
std::cout << std::endl;
}
}
void Matrix::set(int i, int j, double val)
{
this->_data[(i*_M) + j] = val;
}
void Matrix::set_N(int N)
{
_N = N;
}
void Matrix::set_M(int M)
{
_M = M;
}
int Matrix::get_N()
{
return _N;
}
int Matrix::get_M()
{
return _M;
}
Matrix::~Matrix()
{
"Matrix::~Matrix() is invoked";
delete[] _data;
}
If it would be helpful to see main() I can supply that too, however all it really contains is the creation of the matrix objects using overloaded constructors.
explanation
Answer is only 4 years late . . .
Anyway. Maybe it will help somebody else. The secret is to use a std::valarray. With that it is utmost simple to work on a matrix. And, many many functions are available.
All the functions that you want to implement are already available.
And you sub-matrix coy can be a one liner . . .
Please see example code:
#include <iostream>
#include <algorithm>
#include <numeric>
#include <valarray>
#include <iomanip>
constexpr size_t NRows = 6;
constexpr size_t NCols = 8;
constexpr size_t SubNRows = 2;
constexpr size_t SubNCols = 3;
void debugPrint(std::valarray<int> &v, size_t nrows = NRows, size_t ncols = NCols)
{
for (int r = 0; r < nrows; ++r) {
for (int c = 0; c < ncols; ++c)
std::cout << std::setw(3) << v[r*ncols+c] << ' ';
std::cout << '\n';
}
std::cout << '\n';
}
int main()
{
std::valarray<int> v1(NRows * NCols); // Define array with given size
std::iota(std::begin(v1),std::end(v1),0); // Fill the array with consecutive nunbers
debugPrint (v1); // Print the result
std::cout << "\nSum = " << v1.sum() << "\n\n"; // Print the sum of all values in matrix
std::valarray<int> v2(v1); // Create a 2nd matrix as a copy to the first
v2 += 100; // Add 100 to each value in the matrix
debugPrint(v2);
std::valarray<int> v3(NCols); // Get one column
v3 = v1[std::slice(2,NRows,NCols)];
debugPrint(v3,NRows,1);
std::valarray<int> subV2(SubNRows*SubNCols); // So, now the sub array
subV2 = v2[std::gslice(12,{SubNRows, SubNCols},{NCols,1})]; // Slice it out
debugPrint(subV2, SubNRows, SubNCols);
v1[std::gslice(25,{SubNRows, SubNCols},{NCols,1})] = subV2; // And copy to the first array
debugPrint (v1);
return 0;
}
I am trying to achieve the fftshift function (from MATLAB) in c++ with for loop and it's really time-consuming. here is my code:
const int a = 3;
const int b = 4;
const int c = 5;
int i, j, k;
int aa = a / 2;
int bb = b / 2;
int cc = c / 2;
double ***te, ***tempa;
te = new double **[a];
tempa = new double **[a];
for (i = 0; i < a; i++)
{
te[i] = new double *[b];
tempa[i] = new double *[b];
for (j = 0; j < b; j++)
{
te[i][j] = new double [c];
tempa[i][j] = new double [c];
for (k = 0; k < c; k++)
{
te[i][j][k] = i + j+k;
}
}
}
/*for the row*/
if (c % 2 == 1)
{
for (i = 0; i < a; i++)
{
for (j = 0; j < b; j++)
{
for (k = 0; k < cc; k++)
{
tempa[i][j][k] = te[i][j][k + cc + 1];
tempa[i][j][k + cc] = te[i][j][k];
tempa[i][j][c - 1] = te[i][j][cc];
}
}
}
}
else
{
for (i = 0; i < a; i++)
{
for (j = 0; j < b; j++)
{
for (k = 0; k < cc; k++)
{
tempa[i][j][k] = te[i][j][k + cc];
tempa[i][j][k + cc] = te[i][j][k];
}
}
}
}
for (i = 0; i < a; i++)
{
for (j = 0; j < b; j++)
{
for (k = 0; k < c; k++)
{
te[i][j][k] = tempa[i][j][k];
}
}
}
/*for the column*/
if (b % 2 == 1)
{
for (i = 0; i < a; i++)
{
for (j = 0; j < bb; j++)
{
for (k = 0; k < c; k++)
{
tempa[i][j][k] = te[i][j + bb + 1][k];
tempa[i][j + bb][k] = te[i][j][k];
tempa[i][b - 1][k] = te[i][bb][k];
}
}
}
}
else
{
for (i = 0; i < a; i++)
{
for (j = 0; j < bb; j++)
{
for (k = 0; k < c; k++)
{
tempa[i][j][k] = te[i][j + bb][k];
tempa[i][j + bb][k] = te[i][j][k];
}
}
}
}
for (i = 0; i < a; i++)
{
for (j = 0; j < b; j++)
{
for (k = 0; k < c; k++)
{
te[i][j][k] = tempa[i][j][k];
}
}
}
/*for the third dimension*/
if (a % 2 == 1)
{
for ( i = 0; i < aa; i++)
{
for (j = 0; j < b; j++)
{
for ( k = 0; k < c; k++)
{
tempa[i][j][k] = te[i + aa + 1][j][k];
tempa[i + aa][j][k] = te[i][j][k];
tempa[a - 1][j][k] = te[aa][j][k];
}
}
}
}
else
{
for (i = 0; i < aa; i++)
{
for ( j = 0; j < b; j++)
{
for ( k = 0; k < c; k++)
{
tempa[i][j][k] = te[i + aa][j][k];
tempa[i + aa][j][k] = te[i][j][k];
}
}
}
}
for (i = 0; i < a; i++)
{
for (j = 0; j < b; j++)
{
for (k = 0; k < c; k++)
{
cout << te[i][j][k] << ' ';
}
cout << endl;
}
cout << "\n";
}
cout << "and then" << endl;
for (i = 0; i < a; i++)
{
for (j = 0; j < b; j++)
{
for (k = 0; k < c; k++)
{
cout << tempa[i][j][k] << ' ';
}
cout << endl;
}
cout << "\n";
}
now I want to rewrite it with memmove to improve the running efficiency.
For the 3rd dimension, I use:
memmove(tempa, te + aa, sizeof(double)*(a - aa));
memmove(tempa + aa+1, te, sizeof(double)* aa);
this code can works well with 1d and 2d array, but doesn't work for the 3d array. Also, I do not know how to move the column and row elements with memmove. Anyone can help me with all of these? thanks so much!!
Now I have modified the code as below:
double ***te, ***tempa1,***tempa2, ***tempa3;
te = new double **[a];
tempa1 = new double **[a];
tempa2 = new double **[a];
tempa3 = new double **[a];
for (i = 0; i < a; i++)
{
te[i] = new double *[b];
tempa1[i] = new double *[b];
tempa2[i] = new double *[b];
tempa3[i] = new double *[b];
for (j = 0; j < b; j++)
{
te[i][j] = new double [c];
tempa1[i][j] = new double [c];
tempa2[i][j] = new double [c];
tempa3[i][j] = new double [c];
for (k = 0; k < c; k++)
{
te[i][j][k] = i + j+k;
}
}
}
/*for the third dimension*/
memmove(tempa1, te + (a-aa), sizeof(double**)*aa);
memmove(tempa1 + aa, te, sizeof(double**)* (a-aa));
//memmove(te, tempa, sizeof(double)*a);
/*for the row*/
for (i = 0; i < a; i++)
{
memmove(tempa2[i], tempa1[i] + (b - bb), sizeof(double*)*bb);
memmove(tempa2[i] + bb, tempa1[i], sizeof(double*)*(b - bb));
}
/*for the column*/
for (j = 0; i < a; i++)
{
for (k = 0; j < b; j++)
{
memmove(tempa3[i][j], tempa2[i][j] + (c - cc), sizeof(double)*cc);
memmove(tempa3[i][j] + cc, tempa2[i][j], sizeof(double)*(c-cc));
}
}
but the problem is that I define too much new dynamic arrays and also the results for tempa3 are incorrect. could anyone give some suggestions?
I believe you want something like that:
memmove(tempa, te + (a - aa), sizeof(double**) * aa);
memmove(tempa + aa, te, sizeof(double**) * (a - aa));
or
memmove(tempa, te + aa, sizeof(double**) * (a - aa));
memmove(tempa + (a - aa), te, sizeof(double**) * aa);
depending on whether you want to swap the first half "rounded up or down" (I assume you want it rounded up, it's the first version then).
I don't really like your code's design though:
First and foremost, avoid dynamic allocation and use std::vector or std::array when possible.
You could argue it would prevent you from safely using memmove instead of swap for the first dimensions (well, it should work, but I'm not 100% sure it isn't implementation defined) but I don't think that would improve that much the efficiency.
Besides, if you want to have a N-dimensional array, I usually prefer avoiding "chaining pointers" (although with your algorithm, you can actually use this structure, so it's not that bad).
For instance, if you're adamant about dynamically allocating your array with new, you might use something like that instead to reduce memory usage (the difference might be neglectible though; it's also probably slightly faster but again, probably neglectible):
#include <cstddef>
#include <iostream>
typedef std::size_t index_t;
constexpr index_t width = 3;
constexpr index_t height = 4;
constexpr index_t depth = 5;
// the cells (i, j, k) and (i, j, k+1) are adjacent in memory
// the rows (i, j, _) and (i, j+1, _) are adjacent in memory
// the "slices" (i, _, _) and (i+1, _, _) are adjacent in memory
constexpr index_t cell_index(index_t i, index_t j, index_t k) {
return (i * height + j) * depth + k;
}
int main() {
int* array = new int[width * height * depth]();
for( index_t i = 0 ; i < width ; ++i )
for( index_t j = 0 ; j < height ; ++j )
for( index_t k = 0 ; k < depth ; ++k ) {
// do something on the cell (i, j, k)
array[cell_index(i, j, k)] = i + j + k;
std::cout << array[cell_index(i, j, k)] << ' ';
}
std::cout << '\n';
// alternatively you can do this:
//*
for( index_t index = 0 ; index < width * height * depth ; ++index) {
index_t i = index / (height * depth);
index_t j = (index / depth) % height;
index_t k = index % depth;
array[index] = i + j + k;
std::cout << array[index] << ' ';
}
std::cout << '\n';
//*/
delete[] array;
}
The difference is the organization in memory. Here you have a big block of 60*sizeof(int) bytes (usually 240 or 480 bytes), whereas with your method you would have:
- 1 block of 3*sizeof(int**) bytes
- 3 blocks of 4*sizeof(int*) bytes
- 12 blocks of 5*sizeof(int) bytes
(120 more bytes on a 64 bit architecture, two additional indirections for each cell access, and more code for allocating/deallocating all that memory)
Granted, you can't do array[i][j][k] anymore, but still...
The same stands with vectors (you can either make an std::vector<std::vector<std::vector<int>>> or a std::vector<int>)
There is also a bit too much code repetition: your algorithm basically swaps the two halves of your table three times (once for each dimension), but you rewrote 3 times the same thing with a few differences.
There is also too much memory allocation/copy (your algorithm works and can exploit the structure of array of pointers by simply swapping pointers to swap whole rows/slices, in that specific case, you can exploit this data structure to avoid copies with your algorithm... but you don't)
You should choose more explicit variable names, that helps. For instance use width, height, depth instead of a, b, c.
For instance, here is an implementation with vectors (I didn't know matlab's fftshift function though, but according to your code and this page, I assume it's basically "swapping the corners"):
(also, compile with -std=c++11)
#include <cstddef>
#include <iostream>
#include <vector>
#include <algorithm>
typedef std::size_t index_t;
typedef double element_t;
typedef std::vector<element_t> row_t;
typedef std::vector<row_t> slice_t;
typedef std::vector<slice_t> array_3d_t;
// for one dimension
// you might overload this for a std::vector<double>& and use memmove
// as you originally wanted to do here
template<class T>
void fftshift_dimension(std::vector<T>& row)
{
using std::swap;
const index_t size = row.size();
if(size <= 1)
return;
const index_t halved_size = size / 2;
// swap the two halves
for(index_t i = 0, j = size - halved_size ; i < halved_size ; ++i, ++j)
swap(row[i], row[j]);
// if the size is odd, rotate the right part
if(size % 2)
{
swap(row[halved_size], row[size - 1]);
const index_t n = size - 2;
for(index_t i = halved_size ; i < n ; ++i)
swap(row[i], row[i + 1]);
}
}
// base case
template<class T>
void fftshift(std::vector<T>& array) {
fftshift_dimension(array);
}
// reduce the problem for a dimension N+1 to a dimension N
template<class T>
void fftshift(std::vector<std::vector<T>>& array) {
fftshift_dimension(array);
for(auto& slice : array)
fftshift(slice);
}
// overloads operator<< to print a 3-dimensional array
std::ostream& operator<<(std::ostream& output, const array_3d_t& input) {
const index_t width = input.size();
for(index_t i = 0; i < width ; i++)
{
const index_t height = input[i].size();
for(index_t j = 0; j < height ; j++)
{
const index_t depth = input[i][j].size();
for(index_t k = 0; k < depth; k++)
output << input[i][j][k] << ' ';
output << '\n';
}
output << '\n';
}
return output;
}
int main()
{
constexpr index_t width = 3;
constexpr index_t height = 4;
constexpr index_t depth = 5;
array_3d_t input(width, slice_t(height, row_t(depth)));
// initialization
for(index_t i = 0 ; i < width ; ++i)
for(index_t j = 0 ; j < height ; ++j)
for(index_t k = 0 ; k < depth ; ++k)
input[i][j][k] = i + j + k;
std::cout << input;
// in place fftshift
fftshift(input);
std::cout << "and then" << '\n' << input;
}
live example
You could probably make a slightly more efficient algorithm by avoiding to swap multiple times the same cell and/or using memmove, but I think it's already fast enough for many uses (on my machine fftshift takes roughly 130ms for a 1000x1000x100 table).