Pass an Armadillo C++ matrix over MPI - c++

I need to pass a matrix or complex matrix type defined by the Armadillo C++ matrix library over MPI. What is a good way to go about this? I thought of trying to:
- write the matrix to some sort of array and then send rows/columns of that, with methods to de/re-construct the arrays on either side of an MPI_Send/MPI_Recv;
- use something like the MPI_BYTE type (a rough sketch of this idea follows below).
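Something like this minimal, untested sketch is what I had in mind for the second idea; it assumes a dense arma::cx_mat stores its elements contiguously in column-major order and exposes them through .memptr() (which it does), and that the receiver already knows the matrix dimensions:
#include <mpi.h>
#include <armadillo>

// Send a complex matrix as one block of bytes; the raw contiguous buffer
// behind the cx_mat goes straight into MPI_Send.
void sendMat(const arma::cx_mat &A, int dest, int tag)
{
    MPI_Send(const_cast<arma::cx_double *>(A.memptr()),
             static_cast<int>(A.n_elem * sizeof(arma::cx_double)),
             MPI_BYTE, dest, tag, MPI_COMM_WORLD);
}

// The receiver must already know the dimensions (or get them in a separate message).
arma::cx_mat recvMat(int nRows, int nCols, int src, int tag)
{
    arma::cx_mat A(nRows, nCols);
    MPI_Recv(A.memptr(),
             static_cast<int>(A.n_elem * sizeof(arma::cx_double)),
             MPI_BYTE, src, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    return A;
}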
Thanks
Update
So I was trying to implement the other scheme, sending and receiving, as a simple example on one node.
translate.cpp
#include <mpi.h>
#include <armadillo>
#include <vector>
#include <cstdlib>
using namespace std;
using namespace arma;
using std::vector;
class ArmadilloMPI
{
public:
ArmadilloMPI(int nRows, int nCols)
{
this->nRows = nRows;
this->nCols = nCols;
realArray = (double **)malloc(nCols * nRows * sizeof(double*));
imArray = (double **)malloc(nCols * nRows * sizeof(double*));
}
~ArmadilloMPI()
{
free(realArray[0]);
free(realArray);
free(imArray[0]);
free(imArray);
}
double **realArray;
double **imArray;
int nCols;
int nRows;
cx_mat matConstructRecv(int src, int tag)
{
cx_mat A(nRows, nCols);
MPI_Recv(&(imArray[0][0]), nRows * nCols, MPI_DOUBLE, src, tag, MPI_COMM_WORLD,0);
MPI_Recv(&(realArray[0][0]),nRows * nCols, MPI_DOUBLE, src, tag, MPI_COMM_WORLD,0);
for(int i = 0; i < nRows; ++i )
{
for(int j = 0; i < nCols; ++j)
{
real(A(i,j)) = *realArray[i * nRows + j];
imag(A(i,j)) = *imArray[i * nRows + j];
}
}
return A;
}
void matDestroySend(cx_mat &A, int dest, int tag)
{
for(int i = 0; i < nRows; ++i )
{
for(int j = 0; i < nCols; ++j)
{
realArray[i * nRows + j] = &real(A(i,j));
imArray[i * nRows + j] = &imag(A(i,j));
}
}
MPI_Send(&(realArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag, MPI_COMM_WORLD);
MPI_Send(&(imArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag, MPI_COMM_WORLD);
}
};
int main(int argc, char** argv)
{
MPI::Init(argc, argv);
int size = MPI::COMM_WORLD.Get_size();
int rank = MPI::COMM_WORLD.Get_rank();
cout << "test"<<endl;
vector<cx_mat> world;
for(int i = 0; i < size; ++i )
{
world.push_back(randu<cx_mat>(4,4));
}
cx_mat A;
A = randu<cx_mat>(4,4);
ArmadilloMPI* armaMPI = new ArmadilloMPI(4,4);
if(rank==0)
{
for(int i = 1; i < size; i++)
{
cout << "A is now " << A << endl;
A += armaMPI->matConstructRecv(i, 0);
}
}
else
{
armaMPI->matDestroySend(world[rank], 1, 0);
}
cout << A << endl;
delete armaMPI;
MPI::Finalize();
}
But we have a seg fault.
*** Process received signal ***
Signal: Segmentation fault: 11 (11)
Signal code: (0)
Failing at address: 0x0 translate(1032,0x7fff747ad310) malloc: *** error for object 0x41434d5f49504d4f: pointer being freed was not allocated
Thoughts?

There are several issues:
In C and C++, arrays and vectors are indexed from 0, not 1, so the following code will fail:
vector<cx_mat> world;
world.resize(1);
world[1] = randu<cx_mat>(4,4); //problem to come !
You can change it to:
vector<cx_mat> world;
world.push_back(randu<cx_mat>(4,4));
Dynamic allocation of a 2D array with contiguous memory: you need one new for the block of doubles and another new for the array of pointers to double, then set each pointer to the first item of its row.
double *data=new double[nCols * nRows ];
realArray = new double*[( nRows )];
for(int i=0;i<nRows;i++){
realArray[i]=&data[i*nCols];
}
You could probably have guessed this one... Why don't compilers warn about this kind of thing? Because the condition could make sense in other code, just not here:
for(int j = 0; i < nCols; ++j)
You may also want to give each message a different tag, to avoid mixing up the real and imaginary parts:
MPI_Send(&(realArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag, MPI_COMM_WORLD);
MPI_Send(&(imArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag+1, MPI_COMM_WORLD);
The code becomes:
#include <mpi.h>
#include <armadillo>
#include <vector>
#include <iostream>
#include <cstdlib>
#include <ctime>
using namespace std;
using namespace arma;
class ArmadilloMPI
{
public:
    ArmadilloMPI(int nRows, int nCols)
    {
        this->nRows = nRows;
        this->nCols = nCols;
        // Contiguous 2D arrays: one block of doubles plus an array of row pointers.
        double *data = new double[nCols * nRows];
        realArray = new double*[nRows];
        for(int i = 0; i < nRows; i++) {
            realArray[i] = &data[i * nCols];
        }
        double *datai = new double[nCols * nRows];
        imArray = new double*[nRows];
        for(int i = 0; i < nRows; i++) {
            imArray[i] = &datai[i * nCols];
        }
    }
    ~ArmadilloMPI()
    {
        delete[] realArray[0];
        delete[] realArray;
        delete[] imArray[0];
        delete[] imArray;
    }
    double **realArray;
    double **imArray;
    int nCols;
    int nRows;
    cx_mat matConstructRecv(int tag, int src)
    {
        cx_mat A(nRows, nCols);
        MPI_Recv(&(imArray[0][0]),   nRows * nCols, MPI_DOUBLE, src, tag + 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        MPI_Recv(&(realArray[0][0]), nRows * nCols, MPI_DOUBLE, src, tag,     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        for(int i = 0; i < nRows; ++i) {
            for(int j = 0; j < nCols; ++j) {
                // Assign both parts at once; real(A(i,j)) is not an lvalue with modern standard libraries.
                A(i, j) = cx_double(realArray[i][j], imArray[i][j]);
            }
        }
        return A;
    }
    void matDestroySend(cx_mat &A, int dest, int tag)
    {
        for(int i = 0; i < nRows; ++i) {
            for(int j = 0; j < nCols; ++j) {
                realArray[i][j] = real(A(i, j));
                imArray[i][j]   = imag(A(i, j));
            }
        }
        MPI_Send(&(realArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag,     MPI_COMM_WORLD);
        MPI_Send(&(imArray[0][0]),   nRows * nCols, MPI_DOUBLE, dest, tag + 1, MPI_COMM_WORLD);
    }
};
int main(int argc, char **argv)
{
    int rank;
    int size;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    srand(time(NULL) + rank);
    vector<cx_mat> world;
    world.push_back(randu<cx_mat>(4, 4));
    cx_mat A;
    ArmadilloMPI *armaMPI = new ArmadilloMPI(4, 4);
    if(rank == 0)
    {
        world[0].print("world[0] on 0:");
        armaMPI->matDestroySend(world[0], 1, 0);
    }
    if(rank == 1)
    {
        A = armaMPI->matConstructRecv(0, 0);
        A.print("A on 1:");
    }
    delete armaMPI;
    MPI_Finalize();
}
To compile:
mpiCC -O2 -o main main.cpp -larmadillo -llapack -lblas -Wall
To run:
mpiexec -np 2 main

Related

MPI_Reduce on different communicators not working as expected

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
using namespace std;
int ceil(int x, int y) {
return x / y + (x % y > 0);
}
void create_group_and_comm(MPI_Group *world_group, MPI_Group *group, MPI_Comm *comm, int size, bool is_even) {
int *ranks;
int count = is_even ? ceil(size, 2) : size / 2;
ranks = (int *)malloc(count * sizeof(int));
int i = is_even ? 0 : 1, j=0;
while(i < size) {
ranks[j] = i;
j++;
i+=2;
}
MPI_Group_incl(*world_group, j, ranks, group);
MPI_Comm_create(MPI_COMM_WORLD, *group, comm);
free(ranks);
}
int main(int argc, char *argv[])
{
int size, rank, *result_odd, *result_even;
int rank_gr;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;
MPI_Comm even_comm, odd_comm;
MPI_Group even_group, odd_group, world_group;
int *A, *Rows;
int namelen;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Get_processor_name(processor_name, &namelen);
if (rank == 0)
{
A = (int *)malloc(size * size * sizeof(int));
for (int i = 0; i < size * size; i++) {
A[i] = rand() / 1000000;
}
printf("Initial data:\n");
for (int i = 0; i < size; i++)
{
putchar('|');
for (int j = 0; j < size; j++)
printf("%.4d ", A[i*size+j]);
printf("|\n");
}
MPI_Barrier(MPI_COMM_WORLD);
}
else
MPI_Barrier(MPI_COMM_WORLD);
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
create_group_and_comm(&world_group, &even_group, &even_comm, size, true);
create_group_and_comm(&world_group, &odd_group, &odd_comm, size, false);
Rows = new int[size];
MPI_Scatter(A, size, MPI_INT, Rows, size, MPI_INT, 0, MPI_COMM_WORLD);
result_odd = new int[size];
result_even = new int[size];
if(rank % 2 == 0) {
MPI_Reduce(Rows,result_even,size,MPI_INT,MPI_MAX,0,even_comm);
} else {
MPI_Reduce(Rows,result_odd,size,MPI_INT,MPI_MIN,0,odd_comm);
}
if(rank == 0) {
printf("Max values for columns on even:\n");
for(int idx = 0; idx < size;idx++) {
printf("Column %d: %d\n", idx+1, result_even[idx]);
}
printf("Max values for columns on odd:\n");
for(int idx = 0; idx < size;idx++) {
printf("Column %d: %d\n", idx+1, result_odd[idx]);
}
}
//MPI_Comm_free(&even_comm);
//MPI_Comm_free(&odd_comm);
MPI_Group_free(&even_group);
MPI_Group_free(&odd_group);
MPI_Finalize();
return 0;
}
Hello, I'm writing an application using the MPI library (code above). I'm trying to create two groups, each with its own communicator. Basically, the group holding the even-ranked processes computes the maximum value per column using MPI_Reduce among its members, and the second group computes the minimum for each column of the matrix. For the even ranks MPI_Reduce works as expected, but for the odd-ranked processes it does not work as it should. Can someone help me figure out what I'm doing wrong? Below is a picture of the problem I described:
[image omitted]
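One detail worth illustrating with a sketch of my own (not part of the original question): the root argument of MPI_Reduce is a rank within the communicator passed to it, so rank 0 of a communicator built from the odd world ranks is world rank 1, not world rank 0.
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    int world_rank, world_size;
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);

    // Split the world into an "even" and an "odd" communicator by rank parity.
    MPI_Comm parity_comm;
    MPI_Comm_split(MPI_COMM_WORLD, world_rank % 2, world_rank, &parity_comm);

    int parity_rank;
    MPI_Comm_rank(parity_comm, &parity_rank);
    // MPI_Reduce(..., root = 0, parity_comm) delivers its result to the process
    // with parity_rank == 0, which for the odd communicator is world rank 1.
    printf("world rank %d of %d -> rank %d in its parity communicator\n",
           world_rank, world_size, parity_rank);

    MPI_Comm_free(&parity_comm);
    MPI_Finalize();
    return 0;
}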

CTPL C++ threadpooling datarace on accessing array's elements

Does accessing different elements of the same array create a data race?
I have a "Matrix" wrapper class for an array with a matrix interface, and I wrote a parallel multiply-by-a-scalar function for it.
I use the CTPL library for thread pools.
I understand that writing from a thread into an array cell passed by reference is not a data race (please correct me if I'm wrong), so I decided to pass a cell of the array to the function and write the multiplication result into the cell itself, rather than passing a reference to the array plus an index, in order to avoid a data race (a short illustration follows below).
I ran the function 10k times and the results did not differ even once, but the sanitizer I use ("-fsanitize=thread -fPIE -pie -g" in my CMake flags) still warns me of a data race on the line where I create the thread pool.
Is the sanitizer mistaken, or am I really experiencing a data race somewhere?
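A short sketch of the rule being relied on here (my own illustration, not from the original post): two threads writing to different elements of the same array do not conflict, because a data race requires unsynchronized access to the same memory location with at least one write.
#include <thread>
#include <vector>

int main()
{
    std::vector<double> data(2, 0.0);
    // Each thread writes a *different* element: no data race under the C++
    // memory model, since the accesses touch distinct memory locations.
    std::thread t1([&data] { data[0] = 1.0; });
    std::thread t2([&data] { data[1] = 2.0; });
    t1.join();
    t2.join();
    return 0;
}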
Here are the pieces of code relevant to the problem:
Wrapper:
class Matrix {
protected:
int width;
int height;
double* matrix;
public:
Matrix(int m, int n);
Matrix(int m, int n, const std::vector<double>& values);
int get_width() {
return width;
}
int get_height() {
return height;
}
double get_element(int row_num, int col_num);
void set_element(int row_num, int col_num, double el);
double* get_cell_ref(int row_num, int col_num);
};
Method implementations:
Matrix::Matrix(int m, int n) {
assert(m > 0 && n > 0);
matrix = new double[m * n]{0};
width = n;
height = m;
}
Matrix::Matrix(int m, int n, const std::vector<double>& values) {
assert(m > 0 && n > 0 && values.size() == m * n);
matrix = new double[m * n];
width = n;
height = m;
for (int i = 0; i < m * n; ++i) {
matrix[i] = values[i];
}
}
double Matrix::get_element(int row_num, int col_num) {
assert(check_valid(row_num, col_num, get_width(), get_height()));
return matrix[col_num + get_width() * row_num];
}
void Matrix::set_element(int row_num, int col_num, double el) {
assert(check_valid(row_num, col_num, get_width(), get_height()));
matrix[col_num + row_num * get_width()] = el;
}
double* Matrix::get_cell_ref(int row_num, int col_num) {
int idx = col_num + get_width() * row_num;
return &matrix[idx];
}
The function that supposedly has a data race:
Matrix* scalar_multiply_parallel(Matrix* a, double mul, int threadN) {
auto* b = new Matrix(a->get_height(), a->get_width());
ctpl::thread_pool thr_pool(threadN);
std::vector<std::future<void>> futures(a->get_height() * a->get_width());
for (int i =0; i < a->get_height(); i++) {
for (int j =0; j < a->get_width(); j++) {
int idx = j + a->get_width() * i;
auto util = [&a, &b, i, j, mul](int) {
//b->set_element(i, j, a->get_element(i, j) * mul);
double *cell;
cell = b->get_cell_ref(i, j);
*cell = a->get_element(i, j) * mul;
};
futures[idx] = thr_pool.push(util);
}
}
for (auto& f: futures) {
f.get();
}
return b;
}

Difference in speed between GSL and MKL

I have two codes that both work, yet I cannot figure out why one is so much faster than the other. To my knowledge, BLAS with MKL (Intel) should be much faster than GSL (GNU), but my code shows quite the opposite. Here are the codes themselves: I simply create two matrices at the master node and then send different rows to different "slave" processes (with OpenMPI), which compute the final matrix elements and return them to the master node.
GSL example (the fast code):
#include <iostream>
#include <stdio.h>
#include <iostream>
#include <cmath>
#include <mpi.h>
#include <gsl/gsl_blas.h>
using namespace std;
int main(int argc, char** argv){
int noprocs, nid;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &nid);
MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
int master = 0;
const int nsame = 1000; //must be same if matrices multiplied together = acols = brows
const int arows = 1000;
const int bcols = 1000;
int rowsent;
double * buff;
buff = new double [nsame];
double * b;
b = new double [nsame*bcols];
double** c = new double*[arows];
for(int i = 0; i < arows; ++i)
c[i] = new double[bcols];
double * CC;
CC = new double [1*bcols]; //here ncols corresponds to numbers of rows for matrix b
for (int i = 0; i < bcols; i++){
CC[i] = 0.;
}; //this is imply a 1-d array of zeros which will be updated and passed by processors
// Master part
if (nid == master ) {
double** a = new double*[arows];
for(int i = 0; i < arows; ++i){
a[i] = new double[nsame];}
for (int i = 0; i < arows; i++){
for (int j = 0; j < nsame; j++){
if (i == j)
a[i][j] = 1.;
else
a[i][j] = 0.;
}
}
for (int i = 0; i < (nsame*bcols); i++){
b[i] = (10.*i + 3.)/(3.*i - 2.) ;
}
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); //assumes stored as contguous block of code
// send one row to each slave tagged with row number, assume nprocs<nrows
rowsent=0;
for (int i=1; i < (noprocs); i++) { //must be equal to noprocs otherwise it will not send to 3
MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
rowsent++;
}
for (int i=0; i<arows; i++) {
MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
MPI_COMM_WORLD, &status);
int sender = status.MPI_SOURCE;
int anstype = status.MPI_TAG; //row number+1
int IND_I = 0;
while (IND_I < bcols){
c[anstype - 1][IND_I] = CC[IND_I];
IND_I++;
}
if (rowsent < arows) {
MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
rowsent++;
}
else { // tell sender no more work to do via a 0 TAG
MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
}
}
}
// Slave part
else {
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
while(status.MPI_TAG != 0) {
int crow = status.MPI_TAG;
gsl_matrix_view AAAA = gsl_matrix_view_array(buff, 1, nsame);
gsl_matrix_view BBBB = gsl_matrix_view_array(b, nsame, bcols);
gsl_matrix_view CCCC = gsl_matrix_view_array(CC, 1, bcols);
/* Compute C = A B */
gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, &AAAA.matrix, &BBBB.matrix,
0.0, &CCCC.matrix);
MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
// cout << ans << " OUTPUT \n";
}
}
MPI_Finalize();
return 0;
};
MKL example (the slow code):
#include <iostream>
#include <stdio.h>
#include <iostream>
#include <cmath>
#include <mpi.h>
#include </opt/intel/compilers_and_libraries_2017.1.126/mac/mkl/include/mkl.h>
using namespace std;
int main(int argc, char** argv){ //THE IDENTITY MATRIX ONLY WORKS IF arows = nsame!
int noprocs, nid;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &nid);
MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
int master = 0;
const int nsame = 1000;
const int arows = 1000;
const int bcols = 1000;
int rowsent;
double * buff;
buff = new double [nsame];
double * b;
b = new double [nsame*bcols];
double** c = new double*[arows];
for(int i = 0; i < arows; ++i)
c[i] = new double[bcols];
double * CC;
CC = new double [1*bcols];
for (int i = 0; i < bcols; i++){
CC[i] = 0.;
};
// Master part
if (nid == master ) {
double** a = new double*[arows];
for(int i = 0; i < arows; ++i){
a[i] = new double[nsame];}
for (int i = 0; i < arows; i++){
for (int j = 0; j < nsame; j++){
if (i == j)
a[i][j] = 1.;
else
a[i][j] = 0.;
}
}
for (int i = 0; i < (nsame*bcols); i++){
b[i] = (10.*i + 3.)/(3.*i - 2.) ; // = 1.*i as test value
}
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); //assumes stored as contguous block of code nprocs<nrows
delete[] b;
rowsent=0;
for (int i=1; i < (noprocs); i++) { //must be equal to noprocs otherwise it will not send to 3
MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
delete[] a[rowsent];
rowsent++;
}
for (int i=0; i<arows; i++) {
MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
MPI_COMM_WORLD, &status);
int sender = status.MPI_SOURCE;
int anstype = status.MPI_TAG; //row number+1
int IND_I = 0;
while (IND_I < bcols){
c[anstype - 1][IND_I] = CC[IND_I];
IND_I++;
}
if (rowsent < arows) {
MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
delete[] a[rowsent];
rowsent++;
}
else { // tell sender no more work to do via a 0 TAG
MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
}
}
}
// Slave part
else {
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
while(status.MPI_TAG != 0) {
int crow = status.MPI_TAG;
/* Compute C = A B */
cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, bcols, nsame, 1.0, buff, nsame, b, bcols,
0.0, CC, bcols);
MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
}
}
MPI_Finalize();
return 0;
};
I was thinking it might be because I do not delete any of the elements created with new, although I use essentially the same approach to initialize the arrays in both codes. I even tried deleting values in the MKL code (as shown), yet this appears to have little effect. When I increase the size of the arrays from nsame = arows = bcols = 1000 to nsame = arows = bcols = 10000, the time difference between the two codes is easy to observe (the GSL code takes approximately 45 seconds while the MKL code takes quite a few minutes). I am therefore wondering whether this is simply inherent to the way GSL and MKL are designed and used in my code, or whether something more subtle is going on.
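One way to narrow this down (my suggestion, not something from the original post) is to time a single 1-row dgemm in isolation, outside the MPI send/receive loop, so BLAS cost can be separated from communication cost. A self-contained sketch using MPI_Wtime:
#include <mpi.h>
#include <cstdio>
#include <vector>
#include <mkl.h>   // swap for <gsl/gsl_blas.h> and gsl_blas_dgemm to test the GSL variant

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    const int n = 1000;
    // One row of A times an n x n matrix B, as in the worker loop above.
    std::vector<double> row(n, 1.0), B(static_cast<size_t>(n) * n, 1.0), C(n, 0.0);

    double t0 = MPI_Wtime();
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                1, n, n, 1.0, row.data(), n, B.data(), n, 0.0, C.data(), n);
    double t1 = MPI_Wtime();

    std::printf("1x%d * %dx%d dgemm took %f s\n", n, n, n, t1 - t0);
    MPI_Finalize();
    return 0;
}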

measured runtime from c++ "time.h" is double than real

I am running this pthreads C++ program (Gaussian elimination) on my laptop to measure its runtime.
The program runs for about 10 seconds of real time, but my output shows about 20 seconds. What is wrong with this program?
I used
g++ -pthread main.c
./a.out 32 2048
to run
#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <cstdlib>
#include <pthread.h>
#include <iostream>
typedef float Type;
void mat_rand (Type**, int, int);
Type** mat_aloc (int, int);
void mat_free (Type**);
void mat_print (Type**, int, int);
void* eliminate(void*);
unsigned int n, max_threads, active_threads, thread_length;
Type** A;
int current_row;
struct args
{
int start;
int end;
};
typedef struct args argument;
void *print_message_function( void *ptr );
int main(int argc, char *argv[])
{
if (argc < 3)
{
printf ("Error!. Please Enter The Matrix Dimension and No. of Threads!\n");
return 0;
} else
{
n = atoi(argv[2]);
max_threads = atoi(argv[1]);
if (n > 4096)
{
printf ("The maximum allowed size is 4096!\n");
return 0;
}
if (max_threads > 32)
{
printf ("The maximum allowed Threads Count is 32!\n");
return 0;
}
}
A = mat_aloc(n , n+1);
mat_rand (A, n, n+1);
//mat_print (A, n, n+1);
std::clock_t start;
double exe_time;
start = std::clock();
pthread_attr_t attr;
pthread_attr_init(&attr);
argument* thread_args = new argument[max_threads];
pthread_t* thread = new pthread_t[max_threads];
for (int i=0; i<n-1; i++)
{
current_row = i;
if (max_threads >= n-i)
active_threads = n-i-1;
else
active_threads = max_threads;
thread_length = (n-i-1)/active_threads;
for (int j=0; j<active_threads-1; j++)
{
thread_args[j].start = i+1+j*thread_length;
thread_args[j].end = i+1+(j+1)*thread_length;
pthread_create( &thread[j], &attr, eliminate, (void*) &thread_args[j]);
}
thread_args[active_threads-1].start = i+1+(active_threads-1)*thread_length;
thread_args[active_threads-1].end = n-1;
pthread_create(&thread[active_threads-1], &attr, eliminate, (void*) &thread_args[active_threads-1]);
for (int j=0; j<active_threads; j++)
{
pthread_join(thread[j], NULL);
}
}
exe_time = (clock() - start) / (double) CLOCKS_PER_SEC;
printf("Execution time for Matrix of size %i: %f\n", n, exe_time);
//mat_print (A, n, n+1);
return 0;
}
void* eliminate(void* arg)
{
Type k, row_constant;
argument* info = (argument*) arg;
row_constant = A[current_row][current_row];
for (int i=info->start; i<=info->end; i++)
{
k = A[i][current_row] / row_constant;
A[i][current_row] = 0;
for (int j=current_row+1; j<n+1; j++)
{
A[i][j] -= k*A[current_row][j];
}
}
}
// matrix random values
void mat_rand (Type** matrix, int row, int column)
{
for (int i=0; i<row; i++)
for (int j=0; j<column; j++)
{
matrix[i][j] = (float)(1) + ((float)rand()/(float)RAND_MAX)*256;
}
}
// allocates a 2d matrix
Type** mat_aloc (int row, int column)
{
Type* temp = new Type [row*column];
if (temp == NULL)
{
delete [] temp;
return 0;
}
Type** mat = new Type* [row];
if (temp == NULL)
{
delete [] mat;
return 0;
}
for (int i=0; i<row; i++)
{
mat[i] = temp + i*column;
}
return mat;
}
// free memory of matrix
void mat_free (Type** matrix)
{
delete[] (*matrix);
delete[] matrix;
}
// print matrix
void mat_print (Type** matrix, int row, int column)
{
for (int i=0; i<row; i++)
{
for (int j=0; j<column; j++)
{
std::cout<< matrix[i][j] << "\t\t";
}
printf("\n");
}
printf(".................\n");
}
clock reports CPU time used, not elapsed wall-clock time. If you have 2 CPUs and run a thread on each one for 10 seconds, clock will report 20 seconds.
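A minimal sketch of measuring wall-clock time instead (my addition, not part of the original answer), using std::chrono::steady_clock:
#include <chrono>
#include <cstdio>

int main()
{
    auto start = std::chrono::steady_clock::now();
    // ... run the multithreaded elimination here ...
    auto end = std::chrono::steady_clock::now();
    // duration<double> gives elapsed (wall-clock) seconds, unaffected by how
    // many threads or CPUs did the work.
    double seconds = std::chrono::duration<double>(end - start).count();
    std::printf("Elapsed wall-clock time: %f s\n", seconds);
    return 0;
}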

Pass By Reference Multidimensional Array With Unknown Size

How do I pass a multidimensional array of unknown size by reference in C or C++?
EDIT:
For example, in main function I have:
int main(){
int x, y;
int arr[x][y];
// pass_by_ref(/* passing just arr[][] by reference */);
}
and the function:
void pass_by_ref(/* proper parameter for arr[][] */){
// int size_x_Arr = ???
// int size_y_arr = ???
}
How to implement the commented line?
Simply put, you can't. In C, you can't pass by reference, since C has no references. In C++, you can't pass arrays of unknown size, since C++ doesn't support variable-length arrays.
Alternative solutions: in C99, pass a pointer to the variable-length array; in C++, pass a reference to a std::vector<std::vector<T>>.
Demonstration for C99:
#include <stdio.h>
#include <stdlib.h>   /* for strtol() */

void foo(int n, int k, int (*arr)[n][k])
{
    int i, j;
    for (i = 0; i < n; i++) {
        for (j = 0; j < k; j++) {
            printf("%3d ", (*arr)[i][j]);
        }
        printf("\n");
    }
}

int main(int argc, char *argv[])
{
    int a = strtol(argv[1], NULL, 10);
    int b = strtol(argv[2], NULL, 10);
    int arr[a][b];
    int i, j;
    for (i = 0; i < a; i++) {
        for (j = 0; j < b; j++) {
            arr[i][j] = i * j;
        }
    }
    foo(a, b, &arr);
    return 0;
}
Demonstration for C++03:
#include <iostream>
#include <vector>
#include <cstdlib>
#include <ctime>

void foo(std::vector< std::vector<int> > &vec)
{
    for (std::vector< std::vector<int> >::iterator i = vec.begin(); i != vec.end(); i++) {
        for (std::vector<int>::iterator j = i->begin(); j != i->end(); j++) {
            std::cout << *j << " ";
        }
        std::cout << std::endl;
    }
}

int main(int argc, char *argv[])
{
    int i = strtol(argv[1], NULL, 10);
    int j = strtol(argv[2], NULL, 10);
    srand(time(NULL));
    std::vector< std::vector<int> > vec;
    vec.resize(i);
    for (std::vector< std::vector<int> >::iterator it = vec.begin(); it != vec.end(); it++) {
        it->resize(j);
        for (std::vector<int>::iterator jt = it->begin(); jt != it->end(); jt++) {
            *jt = rand() % 10;   /* rand() matches the srand() seed above */
        }
    }
    foo(vec);
    return 0;
}
H2CO3's solution will work for C99 or a C2011 compiler that supports VLAs. For C89 or a C2011 compiler that doesn't support VLAs, or (God forbid) a K&R C compiler, you'd have to do something else.
Assuming you're passing a contiguously allocated array, you can pass a pointer to the first element (&a[0][0]) along with the dimension sizes, and then treat it as a 1-D array, mapping indices like so:
void foo( int *a, size_t rows, size_t cols )
{
    size_t i, j;
    for (i = 0; i < rows; i++)
    {
        for (j = 0; j < cols; j++)
        {
            a[i * rows + j] = some_value();
        }
    }
}

int main( void )
{
    int arr[10][20];
    foo( &arr[0][0], 10, 20 );
    ...
    return 0;
}
This will work for arrays allocated on the stack:
T a[M][N];
and for dynamically allocated arrays of the form:
T (*ap)[N] = malloc( M * sizeof *ap );
since both will have contiguously allocated rows. This will not work (or at least, not be guaranteed to work) for dynamically allocated arrays of the form:
T **ap = malloc( M * sizeof *ap );
if (ap)
{
    size_t i;
    for (i = 0; i < M; i++)
    {
        ap[i] = malloc( N * sizeof *ap[i] );
    }
}
since it's not guaranteed that all the rows will be allocated contiguously to each other.
This is a sort of comment on John Bode's good answer.
This will not work (or at least, not be guaranteed to work) for
dynamically allocated arrays of the form:
But this variant will:
T **ap = malloc( M * sizeof *ap );
if (!ap) return NULL;                      /* allocation failed: handle the error */
ap[0] = malloc( M * N * sizeof **ap );
if (!ap[0]) { free(ap); return NULL; }     /* allocation failed: handle the error */
size_t i;
for (i = 1; i < M; i++)
{
    ap[i] = ap[0] + i * N;
}
After use:
free(ap[0]);
free(ap);
For T being int, you call foo exactly as you would for the array int ap[M][N]:
foo( &ap[0][0], M, N);
since you have guaranteed that all the rows are allocated contiguously to each other.
This allocation is also a little more efficient.
John Bode's explanation is very good, but there is a little mistake:
it should be
i * cols + j
instead of
i * rows + j
If you really want references, that is only possible in C++.
An example of a two-dimensional int array passed by reference:
void function_taking_an_array(int**& multi_dim_array);
But the reference doesn't have any advantage here, so simply use:
void function_taking_an_array(int** multi_dim_array);
I would advise you to use a container to hold your array.
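For completeness, a hedged sketch of my own (not from any of the answers above): when the dimensions are known at compile time, C++ can pass a genuine 2D array by reference and deduce its size through template parameters.
#include <cstddef>
#include <iostream>

template <std::size_t Rows, std::size_t Cols>
void print_array(const int (&arr)[Rows][Cols])
{
    // Rows and Cols are deduced from the argument, so the function knows the
    // full extent of the array without extra size parameters.
    for (std::size_t i = 0; i < Rows; ++i) {
        for (std::size_t j = 0; j < Cols; ++j)
            std::cout << arr[i][j] << ' ';
        std::cout << '\n';
    }
}

int main()
{
    int a[2][3] = {{1, 2, 3}, {4, 5, 6}};
    print_array(a);  // Rows = 2, Cols = 3 deduced automatically
    return 0;
}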