How to improve memory management for matrix multiplication - c++

I'm trying to learn about matrix multiplication and encountered this code comparing Strassen multiplication with standard matrix multiplication, so I've tried to implement it. However, this code uses so much memory that when the matrix is big enough it kills the program. Also, because it uses so much memory, it takes longer to process.
I'm not too comfortable messing around with the code since I don't fully understand complex memory management, and I would really like to learn about this topic.
Built into the code there is a cut parameter, and I found that setting it to 320 makes the program run faster and seems to improve the memory behavior.
EDIT: I've implemented a copy constructor, a destructor, and a function to track memory usage, and that fixed the memory leaks the program had, but the big jump in time between dimensions 1900 and 2100 is still there for the Strassen multiplication.
matrix.h
#ifndef MATRIX_H
#define MATRIX_H
#include <vector>
using namespace std;

class matrix
{
public:
    matrix(int dim, bool random, bool strassen);
    matrix(const matrix& old_m);
    inline int dim() {
        return dim_;
    }
    inline int& operator()(unsigned row, unsigned col) {
        return data_[dim_ * row + col];
    }
    inline int operator()(unsigned row, unsigned col) const {
        return data_[dim_ * row + col];
    }
    void print();
    matrix operator+(matrix b);
    matrix operator-(matrix b);
    ~matrix();
private:
    int dim_;
    int* data_;
};
#endif
Matrix.cpp
#include <iostream>
#include <vector>
#include <stdlib.h>
#include <time.h>
#include "SAMmatrix.h"
using namespace std;

matrix::matrix(int dim, bool random, bool strassen) : dim_(dim) {
    if (strassen) {
        int dim2 = 2;
        while (dim2 < dim)
            dim2 *= 2;
        dim_ = dim2;
    }
    // value-initialize to zero: mult_std accumulates into c with +=
    data_ = new int[dim_ * dim_]();
    if (!random) return;
    for (int i = 0; i < dim_ * dim_; i++)
        data_[i] = rand() % 10;
}

matrix::matrix(const matrix& old_m) {
    dim_ = old_m.dim_;
    data_ = new int[dim_ * dim_];
    for (int i = 0; i < dim_ * dim_; i++)
        data_[i] = old_m.data_[i];
}

void matrix::print() {
    for (int i = 0; i < dim_; i++) {
        for (int j = 0; j < dim_; j++)
            cout << (*this)(i, j) << " ";
        cout << "\n";
    }
    cout << "\n";
}

matrix matrix::operator+(matrix b) {
    matrix c(dim_, false, false);
    for (int i = 0; i < dim_; i++)
        for (int j = 0; j < dim_; j++)
            c(i, j) = (*this)(i, j) + b(i, j);
    return c;
}

matrix matrix::operator-(matrix b) {
    matrix c(dim_, false, false);
    for (int i = 0; i < dim_; i++)
        for (int j = 0; j < dim_; j++)
            c(i, j) = (*this)(i, j) - b(i, j);
    return c;
}

matrix::~matrix()
{
    delete [] data_;
}
Matrix main
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
#include "SAMmatrix.h"
#include "stdlib.h"
#include "stdio.h"
#include "string.h"

typedef pair<matrix, long> result;
int cut = 64;

matrix mult_std(matrix a, matrix b)
{
    matrix c(a.dim(), false, false);
    for (int i = 0; i < a.dim(); i++)
        for (int k = 0; k < a.dim(); k++)
            for (int j = 0; j < a.dim(); j++)
                c(i, j) += a(i, k) * b(k, j);
    return c;
}
matrix get_part(int pi, int pj, matrix m)
{
    matrix p(m.dim() / 2, false, true);
    pi = pi * p.dim();
    pj = pj * p.dim();
    for (int i = 0; i < p.dim(); i++)
        for (int j = 0; j < p.dim(); j++)
            p(i, j) = m(i + pi, j + pj);
    return p;
}

void set_part(int pi, int pj, matrix* m, matrix p)
{
    pi = pi * p.dim();
    pj = pj * p.dim();
    for (int i = 0; i < p.dim(); i++)
        for (int j = 0; j < p.dim(); j++)
            (*m)(i + pi, j + pj) = p(i, j);
}
matrix mult_strassen(matrix a, matrix b)
{
    if (a.dim() <= cut)
        return mult_std(a, b);

    matrix a11 = get_part(0, 0, a);
    matrix a12 = get_part(0, 1, a);
    matrix a21 = get_part(1, 0, a);
    matrix a22 = get_part(1, 1, a);

    matrix b11 = get_part(0, 0, b);
    matrix b12 = get_part(0, 1, b);
    matrix b21 = get_part(1, 0, b);
    matrix b22 = get_part(1, 1, b);

    matrix m1 = mult_strassen(a11 + a22, b11 + b22);
    matrix m2 = mult_strassen(a21 + a22, b11);
    matrix m3 = mult_strassen(a11, b12 - b22);
    matrix m4 = mult_strassen(a22, b21 - b11);
    matrix m5 = mult_strassen(a11 + a12, b22);
    matrix m6 = mult_strassen(a21 - a11, b11 + b12);
    matrix m7 = mult_strassen(a12 - a22, b21 + b22);

    matrix c(a.dim(), false, true);
    set_part(0, 0, &c, m1 + m4 - m5 + m7);
    set_part(0, 1, &c, m3 + m5);
    set_part(1, 0, &c, m2 + m4);
    set_part(1, 1, &c, m1 - m2 + m3 + m6);
    return c;
}
pair<matrix, long> run(matrix (*f)(matrix, matrix), matrix a, matrix b)
{
    struct timeval start, end;
    gettimeofday(&start, NULL);
    matrix c = f(a, b);
    gettimeofday(&end, NULL);
    long e = (end.tv_sec * 1000 + end.tv_usec / 1000);
    long s = (start.tv_sec * 1000 + start.tv_usec / 1000);
    return pair<matrix, long>(c, e - s);
}
int parseLine(char* line) { /* overflow */
    // This assumes that a digit will be found and the line ends in " Kb".
    int i = strlen(line);
    const char* p = line;
    while (*p < '0' || *p > '9') p++;
    line[i-3] = '\0';
    i = atoi(p);
    return i;
}

int getValue() { //Note: this value is in KB!
    FILE* file = fopen("/proc/self/status", "r");
    int result = -1;
    char line[128];
    while (fgets(line, 128, file) != NULL) {
        if (strncmp(line, "VmSize:", 7) == 0) {
            result = parseLine(line);
            break;
        }
    }
    fclose(file);
    return result;
}
int main()
{
    /* test cut-off for strassen
    for (cut = 2; cut <= 512; cut++) {
        matrix a(512, true, true);
        matrix b(512, true, true);
        result r = run(mult_strassen, a, b);
        cout << cut << " " << r.second << "\n";
    }
    */

    /* performance test: standard and strassen, dims 1500 to 2300 in steps of 200 */
    for (int dim = 1500; dim <= 2300; dim += 200)
    {
        double space = getValue() / 1024.0; /* getValue() reports KB, convert to MB */
        cout << "Space before: " << space << "Mb" << "\n";
        matrix a(dim, true, false);
        matrix b(dim, true, false);
        result std = run(mult_std, a, b);
        matrix c(dim, true, true);
        matrix d(dim, true, true);
        result strassen = run(mult_strassen, c, d);
        cout << "Dim " << " Std " << " Strassen" << endl;
        cout << dim << " " << std.second << "ms " << strassen.second << "ms " << "\n";
        double spaceA = getValue() / 1024.0;
        cout << "Space: " << spaceA << "Mb" << "\n";
        cout << " " << endl;
    }
}
I set it to go from 1500 to 2300 by 200 and the program is "killed" before finishing:

Dim   Std (ms)  Strassen (ms)
1500  2406      4250
1700  3463      4252
1900  4819      4247
2100  6487      30023
Killed
Also, it shouldn't make a big jump in time like that when the dimension goes from 1900 to 2100.
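A note on these numbers: in the strassen constructor above, the dimension is rounded up to the next power of two, so 1500, 1700, and 1900 are all padded to 2048 (which is why their Strassen times are nearly identical), while 2100 is padded to 4096, quadrupling both the memory and the work. A minimal sketch of that padding arithmetic:

#include <iostream>

// Illustration only: reproduces the rounding done in
// matrix::matrix(int dim, bool random, bool strassen) above.
int main() {
    for (int dim : {1500, 1700, 1900, 2100}) {
        int dim2 = 2;
        while (dim2 < dim)
            dim2 *= 2;
        long long mib = 1LL * dim2 * dim2 * sizeof(int) / (1024 * 1024);
        std::cout << dim << " is padded to " << dim2
                  << " (" << mib << " MiB per matrix)\n";
    }
}

Separately, every function here takes matrix by value (mult_std(matrix a, matrix b), operator+(matrix b), get_part, set_part), so whole matrices are copied on every call; passing by const reference (which also requires const accessors) would cut peak memory use considerably.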

Related

How to output the line of the function call that created a logic error

I'm creating a matrix math library with CUDA to improve my CNN's performance (and to understand C++ better).
I would like to be able to add error handling and tell the user (me) what has gone wrong when using the matrix class.
This can be seen in my main file: in this case, I'm trying to add a 10 * 10 matrix to a 15 * 15 matrix. This is an impossible action, and I would like some output to tell the user, for example:
Error in file "Main.cu" on line: 9 (Dimensions inconsistent)
If you check inside the function, the reported line number is the line number of the check. I've looked at using macros, but I'm wondering if there is another way that doesn't require calling a macro every time I add two matrices together.
Main.cu
#include "Matrix.cuh"

int main() {
    double* init;
    cudaMallocManaged(&init, sizeof(double));
    Matrix A(10, 10, 2);
    Matrix B(15, 15, 3);
    Matrix C = A + B;
    A.printM("A");
    B.printM("B");
    C.printM("C");
    //cudaFree(init);
    return 0;
}
Matrix.cu
#include "Matrix.cuh"

__global__
void sumMatrix(Matrix* A, Matrix* B, Matrix* C)
{
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    if (x < A->ColumnCount && y < A->RowCount)
    {
        C->VALUES[y * A->ColumnCount + x] = A->VALUES[y * A->ColumnCount + x] + B->VALUES[y * A->ColumnCount + x];
    }
}

__global__
void matrixInit(Row* rows, int R, int C, double* VALUES, double val) {
    int x = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    int y = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    if (x < C && y < R)
    {
        if (x == 0)
        {
            rows[y].Count = C;
            rows[y].values = VALUES + C * y;
        }
        VALUES[y * C + x] = val;
    }
}

Matrix::Matrix(int R, int C, double val)
{
    cudaMallocManaged(&VALUES, R * C * sizeof(double));
    cudaMallocManaged(&rows, R * sizeof(Row));
    RowCount = R;
    ColumnCount = C;
    dim3 gridDim(ceil(C / (double)BLOCK_SIZE), ceil(R / (double)BLOCK_SIZE), 1);
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
    matrixInit<<<gridDim, blockDim>>>(rows, R, C, VALUES, val);
    cudaDeviceSynchronize();
    cudaCheckErrors("MATRIX INIT VAL");
}

Matrix::Matrix(int R, int C)
{
    cudaMallocManaged(&VALUES, R * C * sizeof(double));
    cudaMallocManaged(&rows, R * sizeof(Row));
    RowCount = R;
    ColumnCount = C;
    dim3 gridDim(ceil(C / (double)BLOCK_SIZE), ceil(R / (double)BLOCK_SIZE), 1);
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
    matrixInit<<<gridDim, blockDim>>>(rows, R, C, VALUES, 0);
    cudaDeviceSynchronize();
    cudaCheckErrors("MATRIX INIT VAL");
}

void Matrix::updatePointers()
{
    for (size_t i = 0; i < RowCount; i++)
    {
        rows[i].values = VALUES + (i * ColumnCount);
    }
}

void Matrix::removePointers()
{
    VALUES = nullptr;
    rows = nullptr;
}

void Matrix::printM(const char* msg)
{
    std::cout << "Matrix " << msg << ": " << RowCount << "*" << ColumnCount << std::endl;
    for (size_t i = 0; i < RowCount; i++)
    {
        for (size_t j = 0; j < ColumnCount; j++)
        {
            std::cout << rows[i][j] << " ";
        }
        std::cout << std::endl;
    }
}

Matrix Matrix::sum(Matrix B)
{
    Matrix* A_p, * B_p, * C_p;
    Matrix C(RowCount, ColumnCount);
    cudaMallocManaged(&A_p, sizeof(Matrix));
    cudaMallocManaged(&B_p, sizeof(Matrix));
    cudaMallocManaged(&C_p, sizeof(Matrix));
    memcpy(A_p, this, sizeof(Matrix));
    memcpy(B_p, &B, sizeof(Matrix));
    memcpy(C_p, &C, sizeof(Matrix));
    dim3 gridDim(ceil(ColumnCount / (double)BLOCK_SIZE), ceil(RowCount / (double)BLOCK_SIZE), 1);
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE, 1);
    sumMatrix<<<gridDim, blockDim>>>(A_p, B_p, C_p);
    cudaDeviceSynchronize();
    cudaCheckErrors("SUM");
    B.removePointers();
    C.removePointers();
    return *C_p;
}

Row& Matrix::operator[](size_t i)
{
    if (i >= RowCount)
    {
        std::cout << "OUT OF BOUNDS";
        std::exit(1);
    }
    return rows[i];
}

Matrix& Matrix::operator+(Matrix B)
{
    Matrix C = sum(B);
    Matrix* C_p;
    cudaMallocManaged(&C_p, sizeof(Matrix));
    memcpy(C_p, &C, sizeof(Matrix));
    B.removePointers();
    C.removePointers();
    return *C_p;
}

Matrix::~Matrix()
{
    if (VALUES != nullptr && rows != nullptr)
    {
        cudaFree(VALUES);
        cudaFree(rows);
    }
}
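A possible direction, sketched under the assumption of a C++20 host toolchain (not from the original post): std::source_location captures the caller's file and line when used as a defaulted argument, with no macro at the call site. One caveat: overloaded operators such as operator+ cannot declare default arguments, so the pattern fits a named function. The names below (checkedAdd, Dims) are illustrative:

#include <cstdlib>
#include <iostream>
#include <source_location>

struct Dims { int rows, cols; };

// Hypothetical helper: the defaulted argument is evaluated at the
// call site, so loc reports the caller's file and line.
Dims checkedAdd(Dims a, Dims b,
                std::source_location loc = std::source_location::current())
{
    if (a.rows != b.rows || a.cols != b.cols) {
        std::cerr << "Error in file \"" << loc.file_name()
                  << "\" on line: " << loc.line()
                  << " (Dimensions inconsistent)\n";
        std::exit(1);
    }
    return a; // real code would return the elementwise sum
}

int main() {
    Dims A{10, 10}, B{15, 15};
    checkedAdd(A, B); // reports this file and this line, then exits
}

The reported location is the line where checkedAdd is called, because the defaulted argument is evaluated at the call site; keeping operator+ itself as the entry point would still require a macro wrapping __FILE__ and __LINE__.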

C++ neural network implemented from scratch cannot get above 50% on MNIST

So I have implemented a fully connected neural network with one hidden layer in C++, using Eigen for matrix multiplication. It uses minibatch gradient descent.
However, my model cannot get above 50% accuracy on MNIST. I have tried learning rates between 0.0001 and 10. The model does overfit on training sizes < 100 (reaching ~90% accuracy, which is still pretty bad), albeit extremely slowly.
What might be causing this low accuracy and extremely slow learning? My main concern is that the backpropagation is incorrect. Furthermore, I would prefer not to add any other optimization techniques (learning rate schedules, regularization, etc.).
Feed forward and backprop code:
z1 = (w1 * mbX).colwise() + b1;
a1 = sigmoid(z1);
z2 = (w2 * a1).colwise() + b2;
a2 = sigmoid(z2);
MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
b2 = b2 - err * ones;
w2 = w2 - (err * a1.transpose());
err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
b1 = b1 - err * ones;
w1 = w1 - (err * mbX.transpose());
Full program code:
#include <iostream>
#include <fstream>
#include <math.h>
#include <cstdlib>
#include <cstdio>     // for _popen / fprintf
#include <algorithm>  // for std::random_shuffle
#include <Eigen/Dense>
#include <vector>
#include <string>
using namespace Eigen;

#define N 30
#define epsilon 0.7
#define epoch 1000

//sizes
const int minibatch_size = 10;
const int training_size = 10000;
const int val_size = 10;
unsigned int num, magic, rows, cols;
//images
unsigned int image[training_size][28][28];
unsigned int val_image[val_size][28][28];
//labels
unsigned int label[training_size];
unsigned int val_label[val_size];
//inputs
MatrixXd X(784, training_size);
MatrixXd Y = MatrixXd::Zero(10, training_size);
//minibatch
MatrixXd mbX(784, minibatch_size);
MatrixXd mbY = MatrixXd::Zero(10, minibatch_size);
//validation
MatrixXd Xv(784, val_size);
MatrixXd Yv = MatrixXd::Zero(10, val_size);
//Image processing courtesy of https://stackoverflow.com/users/11146076/%e5%bc%a0%e4%ba%91%e9%93%ad
unsigned int in(std::ifstream& icin, unsigned int size) {
    unsigned int ans = 0;
    for (int i = 0; i < size; i++) {
        unsigned char x;
        icin.read((char*)&x, 1);
        unsigned int temp = x;
        ans <<= 8;
        ans += temp;
    }
    return ans;
}

void input(std::string ipath, std::string lpath, std::string ipath2, std::string lpath2) {
    std::ifstream icin;
    //training data
    icin.open(ipath, std::ios::binary);
    magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
    for (int i = 0; i < training_size; i++) {
        int val = 0;
        for (int x = 0; x < rows; x++) {
            for (int y = 0; y < cols; y++) {
                image[i][x][y] = in(icin, 1);
                X(val, i) = image[i][x][y]/255;
                val++;
            }
        }
    }
    icin.close();
    //training labels
    icin.open(lpath, std::ios::binary);
    magic = in(icin, 4), num = in(icin, 4);
    for (int i = 0; i < training_size; i++) {
        label[i] = in(icin, 1);
        Y(label[i], i) = 1;
    }
    icin.close();
    //validation data
    icin.open(ipath2, std::ios::binary);
    magic = in(icin, 4), num = in(icin, 4), rows = in(icin, 4), cols = in(icin, 4);
    for (int i = 0; i < val_size; i++) {
        int val = 0;
        for (int x = 0; x < rows; x++) {
            for (int y = 0; y < cols; y++) {
                val_image[i][x][y] = in(icin, 1);
                Xv(val, i) = val_image[i][x][y]/255;
                val++;
            }
        }
    }
    icin.close();
    //validation labels
    icin.open(lpath2, std::ios::binary);
    magic = in(icin, 4), num = in(icin, 4);
    for (int i = 0; i < val_size; i++) {
        val_label[i] = in(icin, 1);
        Yv(val_label[i], i) = 1;
    }
    icin.close();
}
//Neural Network calculations
MatrixXd sigmoid(MatrixXd m) {
    m *= -1;
    return (1/(1 + m.array().exp())).matrix();
}

MatrixXd sigmoid_derivative(MatrixXd m) {
    return (sigmoid(m).array() * (1 - sigmoid(m).array())).matrix();
}

//Initialize weights and biases
//hidden layer
VectorXd b1 = MatrixXd::Zero(N, 1);
MatrixXd w1 = MatrixXd::Random(N, 784);
//output
VectorXd b2 = MatrixXd::Zero(10, 1);
MatrixXd w2 = MatrixXd::Random(10, N);
//Initialize intermediate values
MatrixXd z1, z2, a1, a2, z1v, z2v, a1v, a2v;
MatrixXd ones = MatrixXd::Constant(minibatch_size, 1, 1);
int main() {
    input("C:\\Users\\Aaron\\Documents\\Test\\train-images-idx3-ubyte\\train-images.idx3-ubyte", "C:\\Users\\Aaron\\Documents\\Test\\train-labels-idx1-ubyte\\train-labels.idx1-ubyte", "C:\\Users\\Aaron\\Documents\\Test\\t10k-images-idx3-ubyte\\t10k-images.idx3-ubyte", "C:\\Users\\Aaron\\Documents\\Test\\t10k-labels-idx1-ubyte\\t10k-labels.idx1-ubyte");
    std::cout << "Finished Image Processing" << std::endl;
    //std::cout << w1 << std::endl;
    std::vector<double> val_ac;
    std::vector<double> c;
    std::vector<int> order;
    for (int i = 0; i < training_size; i++) {
        order.push_back(i);
    }
    for (int i = 0; i < epoch; i++) {
        //feed forward
        std::random_shuffle(order.begin(), order.end());
        for (int j = 0; j < training_size/minibatch_size; j++) {
            for (int k = 0; k < minibatch_size; k++) {
                int index = order[j * minibatch_size + k];
                mbX.col(k) = X.col(index);
                mbY.col(k) = Y.col(index);
            }
            z1 = (w1 * mbX).colwise() + b1;
            a1 = sigmoid(z1);
            z2 = (w2 * a1).colwise() + b2;
            a2 = sigmoid(z2);
            MatrixXd err = ((double) epsilon)/((double) minibatch_size) * ((a2 - mbY).array() * sigmoid_derivative(z2).array()).matrix();
            //std::cout << err << std::endl;
            b2 = b2 - err * ones;
            w2 = w2 - (err * a1.transpose());
            err = ((w2.transpose() * err).array() * sigmoid_derivative(z1).array()).matrix();
            //std::cout << err << std::endl;
            b1 = b1 - err * ones;
            w1 = w1 - (err * mbX.transpose());
        }
        //validation
        z1 = (w1 * X).colwise() + b1;
        a1 = sigmoid(z1);
        z2 = (w2 * a1).colwise() + b2;
        a2 = sigmoid(z2);
        double cost = 1/((double) training_size) * ((a2 - Y).array() * (a2 - Y).array()).matrix().sum();
        c.push_back(cost);
        int correct = 0;
        for (int i = 0; i < training_size; i++) {
            double maxP = -1;
            int na;
            for (int j = 0; j < 10; j++) {
                if (a2(j, i) > maxP) {
                    maxP = a2(j, i);
                    na = j;
                }
            }
            if (na == label[i]) correct++;
        }
        val_ac.push_back(((double) correct) / ((double) training_size));
        std::cout << "Finished Epoch " << i + 1 << std::endl;
        std::cout << "Cost: " << cost << std::endl;
        std::cout << "Accuracy: " << ((double) correct) / ((double) training_size) << std::endl;
    }
    //plot accuracy
    FILE * gp = _popen("gnuplot", "w");
    fprintf(gp, "set terminal wxt size 600,400 \n");
    fprintf(gp, "set grid \n");
    fprintf(gp, "set title '%s' \n", "NN");
    fprintf(gp, "plot '-' w line, '-' w lines \n");
    for (int i = 0; i < epoch; i++) {
        fprintf(gp, "%f %f \n", i + 1.0, c[i]);
    }
    fprintf(gp, "e\n");
    //validation accuracy
    for (int i = 0; i < epoch; i++) {
        fprintf(gp, "%f %f \n", i + 1.0, val_ac[i]);
    }
    fprintf(gp, "e\n");
    fflush(gp);
    system("pause");
    _pclose(gp);
    return 0;
}
UPD
Here is a graph of the accuracy on the training dataset (green) and the loss (purple)
https://i.stack.imgur.com/Ya2yR.png
Here is a graph of the loss for the training data and validation data:
https://imgur.com/a/4gmFCrk
The loss of the validation data is increasing past a certain point, which shows signs of overfitting. However, the accuracy still remains abysmal even on the training data.
unsigned int val_image[val_size][28][28];
Xv(val, i) = val_image[i][x][y]/255;
Can you try again with Xv(val, i) = val_image[i][x][y] / 255.0;
There too:
X(val, i) = image[i][x][y]/255;
With the code as written, Xv is very often 0, and 1 only when the image has the value 255. With floating-point division, you'll get values between 0.0 and 1.0.
You'll need to check your code for other places where you may be dividing integers.
N.b.: In C++, 240/255 is 0.
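A minimal demonstration of the integer-division pitfall described above:

#include <iostream>

int main() {
    unsigned int pixel = 240;
    double bad  = pixel / 255;    // integer division happens first: 0
    double good = pixel / 255.0;  // floating-point division: ~0.941
    std::cout << bad << " vs " << good << "\n"; // prints "0 vs 0.941176"
}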

Gradient descent converging towards the wrong value

I'm trying to implement a gradient descent algorithm in C++. Here's the code I have so far:
#include <iostream>

double X[] {163,169,158,158,161,172,156,161,154,145};
double Y[] {52, 68, 49, 73, 71, 99, 50, 82, 56, 46 };
double m, p;
int n = sizeof(X)/sizeof(X[0]);

// forward declarations so main() compiles as pasted
double Loss_function(void);
void gradientStep(double alpha);

int main(void) {
    double alpha = 0.00004; // 0.00007;
    m = (Y[1] - Y[0]) / (X[1] - X[0]);
    p = Y[0] - m * X[0];
    for (int i = 1; i <= 8; i++) {
        gradientStep(alpha);
    }
    return 0;
}

double Loss_function(void) {
    double res = 0;
    double tmp;
    for (int i = 0; i < n; i++) {
        tmp = Y[i] - m * X[i] - p;
        res += tmp * tmp;
    }
    return res / 2.0 / (double)n;
}

void gradientStep(double alpha) {
    double pg = 0, mg = 0;
    for (int i = 0; i < n; i++) {
        pg += Y[i] - m * X[i] - p;
        mg += X[i] * (Y[i] - m * X[i] - p);
    }
    p += alpha * pg / n;
    m += alpha * mg / n;
}
This code converges towards m = 2.79822, p = -382.666, and an error of 102.88. But if I use my calculator to find out the correct linear regression model, I find that the correct values of m and p should respectively be 1.601 and -191.1.
I also noticed that the algorithm won't converge for alpha > 0.00007, which seems quite low, and the value of p barely changes during the 8 iterations (or even after 2000 iterations).
What's wrong with my code?
Here's a good overview of the algorithm I'm trying to implement. The values of theta0 and theta1 are called p and m in my program.
Other implementation in python
More about the algorithm
This link gives a comprehensive view of the algorithm; it turns out I was following a completely wrong approach. The following code does not work properly (and I have no plans to work on it further), but it should put anyone confronted with the same problem on the right track:
#include <vector>
#include <iostream>

typedef std::vector<double> vect;
std::vector<double> y, omega(2, 0), omega2(2, 0);
std::vector<std::vector<double>> X;
int n = 10;

// forward declarations so main() compiles as pasted
double f_function(const std::vector<double> &x);
void gradientStep(double alpha);
void display(void);

int main(void) {
    /* Initialize X so that each member contains (1, x_i) */
    /* Initialize y so that each member contains y_i */
    double alpha = 0.00001;
    display();
    for (int i = 1; i <= 8; i++) {
        gradientStep(alpha);
        display();
    }
    return 0;
}

double f_function(const std::vector<double> &x) {
    double c = 0; // was uninitialized in the original paste
    for (unsigned int i = 0; i < omega.size(); i++) {
        c += omega[i] * x[i];
    }
    return c;
}

void gradientStep(double alpha) {
    for (int i = 0; i < n; i++) {
        for (unsigned int j = 0; j < X[0].size(); j++) {
            omega2[j] -= alpha/(double)n * (f_function(X[i]) - y[i]) * X[i][j];
        }
    }
    omega = omega2;
}

void display(void) {
    double res = 0, tmp = 0;
    for (int i = 0; i < n; i++) {
        tmp = y[i] - f_function(X[i]);
        res += tmp * tmp; // loss function
    }
    std::cout << "omega = ";
    for (unsigned int i = 0; i < omega.size(); i++) {
        std::cout << "[" << omega[i] << "] ";
    }
    std::cout << "\tError : " << res * .5/(double)n << std::endl;
}
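One standard remedy, sketched here as an illustration rather than as the thread's accepted fix: mean-center the feature so the two gradients have comparable scale. With X ranging over 145-172 the loss surface is extremely ill-conditioned, which is consistent with p barely moving. The sketch below reuses the same update rule and should land near the calculator's m = 1.601, p = -191.1:

#include <iostream>

// A sketch, not the original program: same update rule, but X is
// mean-centered first so the gradients for m and p are balanced.
double X[] {163,169,158,158,161,172,156,161,154,145};
double Y[] {52, 68, 49, 73, 71, 99, 50, 82, 56, 46};
int n = sizeof(X)/sizeof(X[0]);
double m = 0, p = 0;

void gradientStep(double alpha) {
    double pg = 0, mg = 0;
    for (int i = 0; i < n; i++) {
        pg += Y[i] - m * X[i] - p;
        mg += X[i] * (Y[i] - m * X[i] - p);
    }
    p += alpha * pg / n;
    m += alpha * mg / n;
}

int main() {
    double xbar = 0;
    for (int i = 0; i < n; i++) xbar += X[i];
    xbar /= n;
    for (int i = 0; i < n; i++) X[i] -= xbar;  // center the feature
    for (int i = 0; i < 2000; i++) gradientStep(0.01);
    // undo the centering: y = m*(x - xbar) + p, so intercept = p - m*xbar
    std::cout << "m = " << m << ", p = " << p - m * xbar << "\n";
}

With centered data the much larger step size 0.01 is stable, and the intercept update decouples from the slope update, so both converge in a few hundred iterations instead of millions.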

keep the signed value that has minimal absolute value in two matrix in OpenCV

In OpenCV, I have two matrices One and Two of the same size. I want to find the signed value that has the minimal absolute value in the two matrices and keep it in matrix One. For this, I use the following code:
for (int i = 0; i < One.rows; ++i)
{
    p = One.ptr<float>(i);
    p_two = Two.ptr<float>(i);
    for (int j = 0; j < One.cols; ++j)
    {
        if (fabsf(p_two[j]) < fabsf(p[j]))
            p[j] = p_two[j];
    }
}
This code seems to be the bottleneck in my program. Does anyone know how to improve the performance? Thanks a lot!
Your code is not the bottleneck of your program; it's indeed very fast. You need to profile your code to see where the actual bottleneck is.
You can optimize it a little in case your matrices are continuous (which is very often the case in practice), like:
int rows = one.rows;
int cols = one.cols;
if (one.isContinuous() && two.isContinuous())
{
    cols = rows * cols;
    rows = 1;
}
for (int r = 0; r < rows; ++r)
{
    float* pone = one.ptr<float>(r);
    float* ptwo = two.ptr<float>(r);
    for (int c = 0; c < cols; ++c)
    {
        if (fabs(ptwo[c]) < fabs(pone[c]))
        {
            pone[c] = ptwo[c];
        }
    }
}
Here is a small evaluation, also against the good alternative method proposed by @s1h in the comments:
two.copyTo(one, abs(two) < abs(one));
Time (in ms)
Size: Yuanhao s1h Miki
[3 x 3] 0.000366543 0.117294 0.000366543
[10 x 10] 0.00109963 0.0157614 0.00109963
[100 x 100] 0.0964009 0.139653 0.112529
[1280 x 720] 8.70577 11.0267 8.65372
[1000 x 1000] 9.66538 13.5068 9.02026
[1920 x 1080] 16.5681 26.9706 15.7412
[4096 x 3112] 104.423 135.629 102.595
[5000 x 5000] 196.124 277.457 187.203
You see that your method is very fast. Mine is a little bit faster. @s1h's is slower, but more concise and easier to read.
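For clarity, the one-liner works because abs(two) < abs(one) produces an 8-bit mask (255 where the comparison holds), which copyTo then applies element-wise; a small self-contained illustration:

#include <opencv2/opencv.hpp>
#include <iostream>

int main()
{
    cv::Mat1f one = (cv::Mat1f(1, 3) << -5.f, 2.f, -1.f);
    cv::Mat1f two = (cv::Mat1f(1, 3) <<  3.f, -4.f, 0.5f);
    cv::Mat mask = cv::abs(two) < cv::abs(one); // CV_8U, 255 where |two| < |one|
    two.copyTo(one, mask);                      // copy only the masked elements
    std::cout << one << std::endl;              // [3, 2, 0.5]
}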
Code
You can evaluate the results on your PC with this:
#include <opencv2/opencv.hpp>
#include <iostream>
using namespace std;
using namespace cv;

int main()
{
    vector<Size> sizes{ Size(3, 3), Size(10, 10), Size(100, 100), Size(1280, 720), Size(1000, 1000), Size(1920, 1080), Size(4096, 3112), Size(5000, 5000) };
    cout << "Size: \t\tYuanhao \ts1h \t\tMiki" << endl;
    for (int is = 0; is < sizes.size(); ++is)
    {
        Size sz = sizes[is];
        cout << sz << "\t";
        Mat1f img1(sz);
        randu(img1, Scalar(-100), Scalar(100));
        Mat1f img2(sz);
        randu(img2, Scalar(-100), Scalar(100));
        {
            Mat1f one = img1.clone();
            Mat1f two = img2.clone();
            double tic = double(getTickCount());
            for (int r = 0; r < one.rows; ++r)
            {
                float* pone = one.ptr<float>(r);
                float* ptwo = two.ptr<float>(r);
                for (int c = 0; c < one.cols; ++c)
                {
                    if (fabs(ptwo[c]) < fabs(pone[c]))
                    {
                        pone[c] = ptwo[c];
                    }
                }
            }
            double toc = (double(getTickCount()) - tic) * 1000. / getTickFrequency();
            cout << toc << " \t";
        }
        {
            Mat1f one = img1.clone();
            Mat1f two = img2.clone();
            double tic = double(getTickCount());
            two.copyTo(one, abs(two) < abs(one));
            double toc = (double(getTickCount()) - tic) * 1000. / getTickFrequency();
            cout << toc << " \t";
        }
        {
            Mat1f one = img1.clone();
            Mat1f two = img2.clone();
            double tic = double(getTickCount());
            int rows = one.rows;
            int cols = one.cols;
            if (one.isContinuous() && two.isContinuous())
            {
                cols = rows * cols;
                rows = 1;
            }
            for (int r = 0; r < rows; ++r)
            {
                float* pone = one.ptr<float>(r);
                float* ptwo = two.ptr<float>(r);
                for (int c = 0; c < cols; ++c)
                {
                    if (fabs(ptwo[c]) < fabs(pone[c]))
                    {
                        pone[c] = ptwo[c];
                    }
                }
            }
            double toc = (double(getTickCount()) - tic) * 1000. / getTickFrequency();
            cout << toc << " \t";
        }
        cout << endl;
    }
    getchar();
    return 0;
}

tbb matrix multiplication stack overflow error c++

I'm trying to do a matrix multiplication using tasks in Intel TBB; the algorithm I'm using is Strassen's algorithm...
Here is my code for main():
#include "Matrix.h"
#include "tbb/tick_count.h"
using namespace tbb;
using namespace std;

//Here is how I call the MatTask class
Matrix callParallel(Matrix& A, Matrix& B, Matrix& C, int n){
    MatTask& t = *new (task::allocate_root()) MatTask(A, B, &C, n);
    task::spawn_root_and_wait(t);
    return C;
}
int main(){
    int rows, columns;
    Matrix serialC;
    cout << "*******************\n" << "If rows and columns are < 6 you will enter the matrix manually\n" << "********************\n" << endl;
    cout << "Enter rows for matrix A: ";
    cin >> rows;
    cout << "Enter columns for matrix A: ";
    cin >> columns;
    Matrix A(rows, columns);
    if(rows > 5 && columns > 5){
        A.createMatrixAutomatic();
    } else {
        A.createMatricManualy();
    }
    cout << "Enter rows for matrix B: ";
    cin >> rows;
    cout << "Enter columns for matrix B: ";
    cin >> columns;
    Matrix B(rows, columns);
    if(rows > 5 && columns > 5){
        B.createMatrixAutomatic();
    } else {
        B.createMatricManualy();
    }
    cout << "Matrix A: " << endl;
    A.printMatrix();
    cout << "Matrix B: " << endl;
    B.printMatrix();
    cout << "Matrix C: " << endl;
    tick_count start_time = tick_count::now();
    serialC.MultSerial(A, B);
    tick_count end_time = tick_count::now();
    cout << "\nTime for serial: " << (end_time - start_time).seconds() * 1000 << " ms" << endl;
    serialC.printMatrix();
    //Creating the matrix for the result and calling the parallel algorithm
    Matrix parallelC(rows, columns);
    parallelC = callParallel(A, B, parallelC, rows);
    //This here prints the result matrix
    parallelC.printMatrix();
    system("PAUSE");
}
Here is my Matrix.cpp code:
#include "Matrix.h"

Matrix::Matrix(){}

Matrix::Matrix(int rows, int columns){
    vr = rows;
    kol = columns;
}

void Matrix::createMatrixAutomatic(){
    for(int i = 0; i < vr; i++){
        for (int j = 0; j < kol; j++){
            int number = rand() % 5 + 1;
            matr[i][j] = number;
        }
    }
}

void Matrix::createMatricManualy(){
    cout << "Enter the elements: " << endl;
    for(int i = 0; i < vr; i++){
        for (int j = 0; j < kol; j++){
            cout << "Enter [" << i << "]" << "[" << j << "] element: ";
            cin >> matr[i][j];
        }
    }
}

void Matrix::printMatrix(){
    for (int i = 0; i < vr; i++){
        for (int j = 0; j < kol; j++){
            cout << matr[i][j] << " ";
        }
        cout << endl << endl;
    }
}

void Matrix::MultSerial(Matrix& A, Matrix& B){
    for(int i = 0; i < A.vr; i++){
        for(int j = 0; j < B.kol; j++){
            matr[i][j] = 0;
            for(int k = 0; k < B.vr; k++){
                matr[i][j] += (A.matr[i][k] * B.matr[k][j]);
                vr = A.vr;
                kol = B.kol;
            }
        }
    }
}

void Matrix::substract(Matrix& A, Matrix& B, int dim){
    for (int i = 0; i < dim; i++) {
        for (int j = 0; j < dim; j++) {
            matr[i][j] = A.matr[i][j] - B.matr[i][j];
        }
    }
}

void Matrix::Add(Matrix& A, Matrix& B, int dim){
    for (int i = 0; i < dim; i++) {
        for (int j = 0; j < dim; j++) {
            matr[i][j] = A.matr[i][j] + B.matr[i][j];
        }
    }
}
And here are my MatTask class and Matrix.h header:
#pragma once
#include <iostream>
#include <tbb/task.h>
using namespace tbb;
using namespace std;

class Matrix{
public:
    int vr, kol;
    int matr[100][100];
    Matrix();
    Matrix(int rows, int columns);
    void createMatrixAutomatic();
    void createMatricManualy();
    void printMatrix();
    void MultSerial(Matrix&, Matrix&);
    void Add(Matrix& A, Matrix& B, int dim);
    void substract(Matrix& A, Matrix& B, int dim);
};

class MatTask: public task{
public:
    Matrix A, B;
    Matrix* C;
    int dimension;
    MatTask(Matrix& _A, Matrix& _B, Matrix* _C, int dim):
        A(_A), B(_B), C(_C), dimension(dim){}
    task* execute(){
        if(dimension == 1){
            C->MultSerial(A, B);
        } else {
            int newDimension = dimension/2;
            task_list list;
            int count = 1;
            Matrix a11(newDimension, newDimension), a12(newDimension, newDimension), a21(newDimension, newDimension), a22(newDimension, newDimension),
                   b11(newDimension, newDimension), b12(newDimension, newDimension), b21(newDimension, newDimension), b22(newDimension, newDimension),
                   *c11, *c12, *c21, *c22,
                   p1(newDimension, newDimension), *p2, *p3, *p4, *p5, *p6, *p7,
                   aResult(newDimension, newDimension), bResult(newDimension, newDimension);
            //Split the matrices into 4 submatrices
            for(int i = 0; i < newDimension; i++){
                for(int j = 0; j < newDimension; j++){
                    a11.matr[i][j] = A.matr[i][j];
                    a12.matr[i][j] = A.matr[i][j + newDimension];
                    a21.matr[i][j] = A.matr[i + newDimension][j];
                    a22.matr[i][j] = A.matr[i + newDimension][j + newDimension];
                    b11.matr[i][j] = B.matr[i][j];
                    b12.matr[i][j] = B.matr[i][j + newDimension];
                    b21.matr[i][j] = B.matr[i + newDimension][j];
                    b22.matr[i][j] = B.matr[i + newDimension][j + newDimension];
                }
            }
            //Compute p1...p7
            //p1 = (a11 + a22) * (b11 + b22)
            aResult.Add(a11, a22, newDimension); //a11 + a22
            bResult.Add(b11, b22, newDimension); //b11 + b22
            count++;
            //MatTask& a = *new( allocate_child() ) MatTask(aResult, bResult, &p1, newDimension);
            //list.push_back(a);
            list.push_back(*new (allocate_child()) MatTask(aResult, bResult, &p1, newDimension));
            //p2 = (a21 + a22) * b11
            //aResult.Add(a21, a22, newDimension); //a21 + a22
            //count++;
            ////list.push_back(*new (allocate_child()) MatTask(aResult, b11, p2, newDimension));
            ////p3 = a11 * (b12 - b22)
            //bResult.substract(b12, b22, newDimension); // b12 - b22
            //count++;
            ////list.push_back(*new (allocate_child()) MatTask(a11, bResult, p3, newDimension));
            ////p4 = a22 * (b21 - b11)
            //bResult.substract(b21, b11, newDimension); // b21 - b11
            //count++;
            ////list.push_back(*new (allocate_child()) MatTask(a22, bResult, p4, newDimension));
            ////p5 = (a11 + a12) * b22
            //aResult.Add(a11, a12, newDimension); // a11 + a12
            //count++;
            ////list.push_back(*new (allocate_child()) MatTask(aResult, b22, p5, newDimension));
            ////p6 = (a21 - a11) * (b11 + b12)
            //bResult.Add(b11, b12, newDimension); //b11 + b12
            //aResult.substract(a21, a11, newDimension); //a21 - a11
            //count++;
            ////list.push_back(*new (allocate_child()) MatTask(aResult, bResult, p6, newDimension));
            ////p7 = (a12 - a22) * (b21 + b22)
            //bResult.Add(b21, b22, newDimension); //b21 + b22
            //aResult.substract(a12, a22, newDimension); //a12 - a22
            //count++;
            ////list.push_back(*new (allocate_child()) MatTask(aResult, bResult, p7, newDimension));
            set_ref_count(count);
            //spawn(a);
            spawn_and_wait_for_all(list);
            //spawn_and_wait_for_all(a);
            //Compute c11, c12, c21, c22
            //c11 = p1 + p4 - p5 + p7
            //aResult.Add(p1, p4, newDimension); // p1 + p4
            //bResult.Add(aResult, p7, newDimension); // p1 + p4 + p7
            //c11.substract(bResult, p5, newDimension); // c11 = p1 + p4 + p7 - p5
            //// c12 = p3 + p5
            //c12.Add(p3, p5, newDimension);
            //
            //// c21 = p2 + p4
            //c21.Add(p2, p4, newDimension);
            //// c22 = p1 + p3 - p2 + p6
            //aResult.Add(p1, p3, newDimension); //p1 + p3
            //bResult.Add(aResult, p6, newDimension); //p1 + p3 + p6
            //c22.substract(bResult, p2, newDimension); // c22 = p1 + p3 + p6 - p2
            //Grouping the results obtained in a single matrix:
            //for (int i = 0; i < newDimension; i++) {
            //    for (int j = 0; j < newDimension; j++) {
            //        C.matr[i][j] = c11.matr[i][j];
            //        C.matr[i][j + newDimension] = c12.matr[i][j];
            //        C.matr[i + newDimension][j] = c21.matr[i][j];
            //        C.matr[i + newDimension][j + newDimension] = c22.matr[i][j];
            //    }
            //}
        }
        return NULL;
    }
};
As you can see, some names of functions and variables were not originally in English, but I don't think that will be a problem, because the code is really straightforward.
I get the error:
Unhandled exception at 0x01193787 in MnozenjeMatrica.exe: 0xC00000FD:
Stack overflow.
I think the error occurs at the line spawn_and_wait_for_all(list), but I'm not sure.
Can you please take a look at my code and help me solve the problem? Maybe I'm not calling the functions correctly, I really do not know. Thank you.
It is the blocking-style parallelism plus the heavy use of the stack for matrices that results in the stack overflow. Each of your tasks reserves stack space for its data (execute() puts about a dozen Matrix objects on the stack, each holding a 100x100 int array of roughly 40 KB) and then calls spawn_and_wait_for_all, which in turn executes another instance of the same task, recursively growing the stack.
Use continuation-style programming and avoid allocating huge data on the stack (and inside the task if possible, since that reduces task allocator efficiency).
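For reference, a minimal sketch of that continuation style, written against the same legacy tbb::task API as the question (this API is deprecated and removed in oneTBB); the Strassen arithmetic is left out, since the shape of the recursion is the point:

#include <cstdio>
#include <tbb/task.h>
using namespace tbb;

// Continuation-style recursion: execute() never blocks, so the stack
// does not grow with the recursion depth.
class SplitTask : public task {
public:
    int depth;
    SplitTask(int d) : depth(d) {}
    task* execute() {
        if (depth == 0)
            return NULL;               // leaf: real work would go here
        // The continuation runs after both children finish; it holds
        // no large locals (submatrices would live on the heap).
        empty_task& cont = *new (allocate_continuation()) empty_task;
        cont.set_ref_count(2);         // two children, no extra wait
        spawn(*new (cont.allocate_child()) SplitTask(depth - 1));
        recycle_as_child_of(cont);     // reuse this task as 2nd child
        depth -= 1;
        return this;                   // scheduler runs it next
    }
};

int main() {
    SplitTask& root = *new (task::allocate_root()) SplitTask(16);
    task::spawn_root_and_wait(root);
    std::printf("done\n");
    return 0;
}

The parent hands its children to a continuation, recycles itself as one of them, and returns immediately instead of waiting, so stack use stays flat regardless of depth; the question's large Matrix locals would likewise need to move to the heap.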