Difference in speed between GSL and MKL - C++

I have two codes that both work, yet I cannot figure out why one is so much faster than the other. To my knowledge, BLAS with MKL (Intel) should be much faster than GSL (GNU), but my code shows quite the opposite. Here are the codes themselves: I simply create two matrices at the master node and then send different rows to different "slave" processes (with OpenMPI), which compute the final matrix elements and return them to the master node.
GSL example (the fast code):
#include <iostream>
#include <stdio.h>
#include <cmath>
#include <mpi.h>
#include <gsl/gsl_blas.h>
using namespace std;
int main(int argc, char** argv){
int noprocs, nid;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &nid);
MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
int master = 0;
const int nsame = 1000; // inner dimension (acols = brows) must match for the multiplication
const int arows = 1000;
const int bcols = 1000;
int rowsent;
double * buff;
buff = new double [nsame];
double * b;
b = new double [nsame*bcols];
double** c = new double*[arows];
for(int i = 0; i < arows; ++i)
c[i] = new double[bcols];
double * CC;
CC = new double [1*bcols]; // holds one row of the result (length bcols)
for (int i = 0; i < bcols; i++){
CC[i] = 0.;
}; // simply a 1-D array of zeros that each worker fills in and sends back
// Master part
if (nid == master ) {
double** a = new double*[arows];
for(int i = 0; i < arows; ++i){
a[i] = new double[nsame];}
for (int i = 0; i < arows; i++){
for (int j = 0; j < nsame; j++){
if (i == j)
a[i][j] = 1.;
else
a[i][j] = 0.;
}
}
for (int i = 0; i < (nsame*bcols); i++){
b[i] = (10.*i + 3.)/(3.*i - 2.) ;
}
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); // assumes b is stored as a contiguous block of memory
// send one row to each slave tagged with row number, assume nprocs<nrows
rowsent=0;
for (int i=1; i < (noprocs); i++) { // hand one initial row to each of the noprocs-1 workers
MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
rowsent++;
}
for (int i=0; i<arows; i++) {
MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
MPI_COMM_WORLD, &status);
int sender = status.MPI_SOURCE;
int anstype = status.MPI_TAG; //row number+1
int IND_I = 0;
while (IND_I < bcols){
c[anstype - 1][IND_I] = CC[IND_I];
IND_I++;
}
if (rowsent < arows) {
MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
rowsent++;
}
else { // tell sender no more work to do via a 0 TAG
MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
}
}
}
// Slave part
else {
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
while(status.MPI_TAG != 0) {
int crow = status.MPI_TAG;
gsl_matrix_view AAAA = gsl_matrix_view_array(buff, 1, nsame);
gsl_matrix_view BBBB = gsl_matrix_view_array(b, nsame, bcols);
gsl_matrix_view CCCC = gsl_matrix_view_array(CC, 1, bcols);
/* Compute C = A B */
gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, &AAAA.matrix, &BBBB.matrix,
0.0, &CCCC.matrix);
MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
// cout << ans << " OUTPUT \n";
}
}
MPI_Finalize();
return 0;
};
MKL example (the slow code):
#include <iostream>
#include <stdio.h>
#include <cmath>
#include <mpi.h>
#include </opt/intel/compilers_and_libraries_2017.1.126/mac/mkl/include/mkl.h>
using namespace std;
int main(int argc, char** argv){ //THE IDENTITY MATRIX ONLY WORKS IF arows = nsame!
int noprocs, nid;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &nid);
MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
int master = 0;
const int nsame = 1000;
const int arows = 1000;
const int bcols = 1000;
int rowsent;
double * buff;
buff = new double [nsame];
double * b;
b = new double [nsame*bcols];
double** c = new double*[arows];
for(int i = 0; i < arows; ++i)
c[i] = new double[bcols];
double * CC;
CC = new double [1*bcols];
for (int i = 0; i < bcols; i++){
CC[i] = 0.;
};
// Master part
if (nid == master ) {
double** a = new double*[arows];
for(int i = 0; i < arows; ++i){
a[i] = new double[nsame];}
for (int i = 0; i < arows; i++){
for (int j = 0; j < nsame; j++){
if (i == j)
a[i][j] = 1.;
else
a[i][j] = 0.;
}
}
for (int i = 0; i < (nsame*bcols); i++){
b[i] = (10.*i + 3.)/(3.*i - 2.) ; // = 1.*i as test value
}
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); // assumes b is stored as a contiguous block of memory; nprocs < nrows
delete[] b;
rowsent=0;
for (int i=1; i < (noprocs); i++) { // hand one initial row to each of the noprocs-1 workers
MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
delete[] a[rowsent];
rowsent++;
}
for (int i=0; i<arows; i++) {
MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
MPI_COMM_WORLD, &status);
int sender = status.MPI_SOURCE;
int anstype = status.MPI_TAG; //row number+1
int IND_I = 0;
while (IND_I < bcols){
c[anstype - 1][IND_I] = CC[IND_I];
IND_I++;
}
if (rowsent < arows) {
MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
delete[] a[rowsent];
rowsent++;
}
else { // tell sender no more work to do via a 0 TAG
MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
}
}
}
// Slave part
else {
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
while(status.MPI_TAG != 0) {
int crow = status.MPI_TAG;
/* Compute C = A B */
cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, bcols, nsame, 1.0, buff, nsame, b, bcols,
0.0, CC, bcols);
MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
}
}
MPI_Finalize();
return 0;
};
I was thinking it might be due to my not deleting any of the newly allocated arrays, although I use essentially the same approach to initialize the arrays in both codes. I even tried deleting values in the MKL code (as shown), yet this appears to have little effect. When I increase the size of the arrays from nsame = arows = bcols = 1000 to nsame = arows = bcols = 10000, the time difference between the two codes is readily observed (the GSL code takes approximately 45 seconds while the MKL code takes several minutes). So I am wondering whether this is simply inherent to the way GSL and MKL are designed and used in my code, or whether something more subtle is going on.
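One thing that may be worth ruling out (an educated guess, not something the code above proves): GSL's BLAS is single-threaded, while MKL is multi-threaded by default, so with several MPI ranks per machine each rank may spawn its own team of MKL threads for every tiny 1 x nsame by nsame x bcols product, and the ranks end up oversubscribing the cores. A quick test is to force MKL into sequential mode in each worker, either by linking against the sequential MKL libraries (typically -lmkl_intel_lp64 -lmkl_sequential -lmkl_core instead of the threaded ones) or by capping the thread count at run time in the worker branch, before the cblas_dgemm loop:

mkl_set_num_threads(1); // declared in mkl.h; one MKL thread per MPI rank (setting MKL_NUM_THREADS=1 in the environment has the same effect)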

Related

Managing large vectors with MPI I/O in C

I am trying to compute the distance matrix for a set of spatial coordinates (x and y) with index array "a". The following is a simplified script in which I scatter only the index array into a local array local_a for each process. Instead of evaluating the entire distance matrix, I only compute the upper triangular part without the trivial diagonal entries, which are essentially zero. I am trying to use MPI I/O to compute the Euclidean distance "d", store it in an array, and then write that array to an output file. I define the offset as a function of local_a[i], which also defines the local dimension as (SIZE - local_a[i]), since the number of elements decreases as one moves down the rows of an upper triangular matrix. The code below works well for up to 8,000 elements but fails with "mca_fbtl_posix_pwritev: error in writev: Invalid argument" when I use 80,000 as the vector size. I am not sure what is causing this error and any help will be greatly appreciated.
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <math.h>
int main(int argc, char *argv[])
{
int procid, numprocs, ierr, *a, SIZE, num_rows;
SIZE = 80000;
MPI_File fh;
double *x, *y, tstart, tend;
a = (int*) malloc(SIZE*sizeof(int));
x =(double*) malloc(SIZE*sizeof(double));
y =(double*) malloc(SIZE*sizeof(double));
ierr = MPI_Init(&argc, &argv);
ierr = MPI_Comm_rank(MPI_COMM_WORLD, &procid);
ierr = MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
MPI_Status status;
MPI_Offset offset;
num_rows = SIZE/numprocs;
for ( int i = 0; i < SIZE; i ++)
{
a[i] = i+1;
x[i] = i+1;
y[i] = i+2;
}
// Define local array for each process
int *local_a;
local_a = (int*) malloc(num_rows*sizeof(int));
// Scatter only the index array (a) to each process in a cyclical manner
for (int i = 0; i < num_rows; i ++)
{
ierr = MPI_Scatter(&a[i*numprocs], 1, MPI_INT, &local_a[i], 1, MPI_INT, 0, MPI_COMM_WORLD);
}
MPI_File_open(MPI_COMM_WORLD, "data.bin", MPI_MODE_CREATE|MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
for ( int i = 0; i < num_rows ; i++)
{
int p1 = local_a[i];
int local_dim = (SIZE-local_a[i]);
double *Dist;
Dist = (double*) malloc(local_dim*sizeof(double));
if ( local_a[i]==1)
{
offset = 2*0*sizeof(int);
}
else
{
int sum =0;
for ( int i = (SIZE-1); i >local_dim; i--)
{
sum +=i;
}
offset = 2*sum*sizeof(int);
if ( local_dim == 0)
{
local_dim = 1;
}
}
for ( int j = local_a[i]; j < SIZE; j++)
{
double d;
d = pow((pow(x[p1-1]-x[j],2) + pow(y[p1 -1]-y[j],2)),0.5);
Dist[j-local_a[i]] = d;
}
ierr = MPI_File_write_at(fh,offset,Dist,local_dim,MPI_DOUBLE,&status);
ierr = MPI_Barrier(MPI_COMM_WORLD);
free(Dist);
}
MPI_File_close(&fh);
ierr = MPI_Finalize();
free(a);
free(x);
free(y);
free(local_a);
return ierr;
}
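One detail worth double-checking in the code above when SIZE reaches 80,000 (an observation about the arithmetic, not a confirmed diagnosis): sum is an int, and for rows near the bottom the number of preceding upper-triangular elements is on the order of SIZE*SIZE/2, roughly 3.2 billion, which overflows int and can turn the computed offset negative; a negative offset could explain an "Invalid argument" from pwritev. (The offset is also expressed in units of 2*sizeof(int), which only matches sizeof(double) by coincidence.) A sketch of the same offset computed in 64-bit arithmetic, using the closed form for the triangular count (names follow the code above):

/* elements stored before row p1 (rows are 1-based; row k holds SIZE-k entries) */
long long rows_before  = (long long)p1 - 1;
long long elems_before = rows_before * (long long)SIZE - rows_before * (rows_before + 1) / 2;
offset = (MPI_Offset)(elems_before * (long long)sizeof(double));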

MPI_Reduce on different communicators not working as expected

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
using namespace std;
int ceil(int x, int y) {
return x / y + (x % y > 0);
}
void create_group_and_comm(MPI_Group *world_group, MPI_Group *group, MPI_Comm *comm, int size, bool is_even) {
int *ranks;
int count = is_even ? ceil(size, 2) : size / 2;
ranks = (int *)malloc(count * sizeof(int));
int i = is_even ? 0 : 1, j=0;
while(i < size) {
ranks[j] = i;
j++;
i+=2;
}
MPI_Group_incl(*world_group, j, ranks, group);
MPI_Comm_create(MPI_COMM_WORLD, *group, comm);
free(ranks);
}
int main(int argc, char *argv[])
{
int size, rank, *result_odd, *result_even;
int rank_gr;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;
MPI_Comm even_comm, odd_comm;
MPI_Group even_group, odd_group, world_group;
int *A, *Rows;
int namelen;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Get_processor_name(processor_name, &namelen);
if (rank == 0)
{
A = (int *)malloc(size * size * sizeof(int));
for (int i = 0; i < size * size; i++) {
A[i] = rand() / 1000000;
}
printf("Initial data:\n");
for (int i = 0; i < size; i++)
{
putchar('|');
for (int j = 0; j < size; j++)
printf("%.4d ", A[i*size+j]);
printf("|\n");
}
MPI_Barrier(MPI_COMM_WORLD);
}
else
MPI_Barrier(MPI_COMM_WORLD);
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
create_group_and_comm(&world_group, &even_group, &even_comm, size, true);
create_group_and_comm(&world_group, &odd_group, &odd_comm, size, false);
Rows = new int[size];
MPI_Scatter(A, size, MPI_INT, Rows, size, MPI_INT, 0, MPI_COMM_WORLD);
result_odd = new int[size];
result_even = new int[size];
if(rank % 2 == 0) {
MPI_Reduce(Rows,result_even,size,MPI_INT,MPI_MAX,0,even_comm);
} else {
MPI_Reduce(Rows,result_odd,size,MPI_INT,MPI_MIN,0,odd_comm);
}
if(rank == 0) {
printf("Max values for columns on even:\n");
for(int idx = 0; idx < size;idx++) {
printf("Column %d: %d\n", idx+1, result_even[idx]);
}
printf("Max values for columns on odd:\n");
for(int idx = 0; idx < size;idx++) {
printf("Column %d: %d\n", idx+1, result_odd[idx]);
}
}
//MPI_Comm_free(&even_comm);
//MPI_Comm_free(&odd_comm);
MPI_Group_free(&even_group);
MPI_Group_free(&odd_group);
MPI_Finalize();
return 0;
}
Hello, I'm writing an application using the MPI library and I'm trying to create two groups, each with its own communicator. Basically, one group, which holds the processes with even rank, calculates the maximum value per column using MPI_Reduce among the processes in the group, and the second one calculates the minimum for each column of the matrix. For even ranks MPI_Reduce works as expected, but for processes with odd rank it does not work as it should. Can someone help me with what I'm doing wrong? Below is a picture of the problem I described:
image here
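Whatever the picture shows, one property of the reduction above is worth spelling out (a sketch under the assumption that ranks are split as in create_group_and_comm): the root argument of MPI_Reduce is a rank within the communicator passed to it, so root 0 of odd_comm is world rank 1, not world rank 0. result_odd is therefore only defined on world rank 1, and printing it from world rank 0 reads uninitialized memory. One way to get both results onto rank 0 for printing:

// after the two MPI_Reduce calls (assumes at least two processes; tag 99 is arbitrary)
if (rank == 1) {
    // world rank 1 is root 0 of odd_comm, so it holds the column minima
    MPI_Send(result_odd, size, MPI_INT, 0, 99, MPI_COMM_WORLD);
}
if (rank == 0) {
    MPI_Recv(result_odd, size, MPI_INT, 1, 99, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    // result_even is already valid here: world rank 0 is root 0 of even_comm
}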

Pass an Armadillo C++ matrix over MPI

I need to pass a matrix or complex matrix type defined by the Armadillo C++ matrix library over MPI. What is a good way to go about this? I thought of trying to:
Write the matrix to some sort of array and then send rows/columns of that, with methods to de/re-construct the arrays on either side of an MPI_Send/Recv.
Use something like the MPI_BYTE type?
Thanks
Update
So I was trying to implement the other scheme, by sending and receiving, for a simple example, on one node.
translate.cpp
#include <mpi.h>
#include <armadillo>
#include <vector>
#include <cstdlib>
using namespace std;
using namespace arma;
using std::vector;
class ArmadilloMPI
{
public:
ArmadilloMPI(int nRows, int nCols)
{
this->nRows = nRows;
this->nCols = nCols;
realArray = (double **)malloc(nCols * nRows * sizeof(double*));
imArray = (double **)malloc(nCols * nRows * sizeof(double*));
}
~ArmadilloMPI()
{
free(realArray[0]);
free(realArray);
free(imArray[0]);
free(imArray);
}
double **realArray;
double **imArray;
int nCols;
int nRows;
cx_mat matConstructRecv(int src, int tag)
{
cx_mat A(nRows, nCols);
MPI_Recv(&(imArray[0][0]), nRows * nCols, MPI_DOUBLE, src, tag, MPI_COMM_WORLD,0);
MPI_Recv(&(realArray[0][0]),nRows * nCols, MPI_DOUBLE, src, tag, MPI_COMM_WORLD,0);
for(int i = 0; i < nRows; ++i )
{
for(int j = 0; i < nCols; ++j)
{
real(A(i,j)) = *realArray[i * nRows + j];
imag(A(i,j)) = *imArray[i * nRows + j];
}
}
return A;
}
void matDestroySend(cx_mat &A, int dest, int tag)
{
for(int i = 0; i < nRows; ++i )
{
for(int j = 0; i < nCols; ++j)
{
realArray[i * nRows + j] = &real(A(i,j));
imArray[i * nRows + j] = &imag(A(i,j));
}
}
MPI_Send(&(realArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag, MPI_COMM_WORLD);
MPI_Send(&(imArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag, MPI_COMM_WORLD);
}
};
int main(int argc, char** argv)
{
MPI::Init(argc, argv);
int size = MPI::COMM_WORLD.Get_size();
int rank = MPI::COMM_WORLD.Get_rank();
cout << "test"<<endl;
vector<cx_mat> world;
for(int i = 0; i < size; ++i )
{
world.push_back(randu<cx_mat>(4,4));
}
cx_mat A;
A = randu<cx_mat>(4,4);
ArmadilloMPI* armaMPI = new ArmadilloMPI(4,4);
if(rank==0)
{
for(int i = 1; i < size; i++)
{
cout << "A is now " << A << endl;
A += armaMPI->matConstructRecv(i, 0);
}
}
else
{
armaMPI->matDestroySend(world[rank], 1, 0);
}
cout << A << endl;
delete armaMPI;
MPI::Finalize();
}
But we have a seg fault.
*** Process received signal ***
Signal: Segmentation fault: 11 (11)
Signal code: (0)
Failing at address: 0x0 translate(1032,0x7fff747ad310) malloc: *** error for object 0x41434d5f49504d4f: pointer being freed was not allocated
Thoughts?
There are a couple of issues:
In C and C++, arrays and vectors start at 0, not 1. So the following code will fail:
vector<cx_mat> world;
world.resize(1);
world[1] = randu<cx_mat>(4,4); //problem to come !
You may change it to:
vector<cx_mat> world;
world.push_back(randu<cx_mat>(4,4));
Dynamic allocation of a 2D array with contiguous memory: you need one new for the array of doubles, and another new for the array of pointers to double. Then set each pointer to point to the first item of its row.
double *data=new double[nCols * nRows ];
realArray = new double*[( nRows )];
for(int i=0;i<nRows;i++){
realArray[i]=&data[i*nCols];
}
You could have guessed this one... Why don't compilers warn about this kind of thing? Because it could make sense, just not here:
for(int j = 0; i < nCols; ++j)
You may add a different tag to each message to avoid mixing up the real and imaginary parts:
MPI_Send(&(realArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag, MPI_COMM_WORLD);
MPI_Send(&(imArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag+1, MPI_COMM_WORLD);
The code becomes :
#include <mpi.h>
#include <armadillo>
#include <vector>
#include <iostream>
#include <cstdlib>
using namespace std;
using namespace arma;
using std::vector;
class ArmadilloMPI
{
public:
ArmadilloMPI(int nRows, int nCols)
{
this->nRows = nRows;
this->nCols = nCols;
double *data=new double[nCols * nRows ];
realArray = new double*[( nRows )];
for(int i=0;i<nRows;i++){
realArray[i]=&data[i*nCols];
}
double *datai=new double[(nCols * nRows )];
imArray =new double*[( nRows )];
for(int i=0;i<nRows;i++){
imArray[i]=&datai[i*nCols];
}
}
~ArmadilloMPI()
{
delete[] realArray[0];
delete[] realArray;
delete[] imArray[0];
delete[] imArray;
}
double **realArray;
double **imArray;
int nCols;
int nRows;
cx_mat matConstructRecv(int tag, int src)
{
cx_mat A(nRows, nCols);
MPI_Recv(&(imArray[0][0]), nRows * nCols, MPI_DOUBLE, src, tag+1, MPI_COMM_WORLD,0);
MPI_Recv(&(realArray[0][0]),nRows * nCols, MPI_DOUBLE, src, tag, MPI_COMM_WORLD,0);
for(int i = 0; i < nRows; ++i )
{
for(int j = 0; j < nCols; ++j)
{
real(A(i,j)) = realArray[i][j];
imag(A(i,j)) = imArray[i][j];
}
}
return A;
}
void matDestroySend(cx_mat &A, int dest, int tag)
{
for(int i = 0; i < nRows; ++i )
{
for(int j = 0; j < nCols; ++j)
{
realArray[i][j] = real((A(i,j)));
imArray[i][j] = imag((A(i,j)));
}
}
MPI_Send(&(realArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag, MPI_COMM_WORLD);
MPI_Send(&(imArray[0][0]), nRows * nCols, MPI_DOUBLE, dest, tag+1, MPI_COMM_WORLD);
}
};
int main(int argc, char **argv)
{
int rank;
int size;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
srand (time(NULL)+rank);
vector<cx_mat> world;
world.push_back(randu<cx_mat>(4,4));
cx_mat A;
ArmadilloMPI* armaMPI = new ArmadilloMPI(4,4);
if(rank==0)
{
world[0].print("world[0] on 0:");
armaMPI->matDestroySend(world[0], 1, 0);
}
if(rank==1){
A = armaMPI->matConstructRecv(0, 0);
A.print("A on 1:");
}
delete armaMPI;
MPI_Finalize();
}
To compile:
mpiCC -O2 -o main main.cpp -larmadillo -llapack -lblas -Wall
To run:
mpiexec -np 2 main
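As a side note on the design choice: Armadillo stores a matrix's elements in a single contiguous (column-major) block, so when both ranks already know the dimensions you can skip the intermediate double** buffers entirely and send the matrix's own storage. A minimal sketch (dest, src and tag are placeholders; a std::complex<double> is layout-compatible with two doubles):

// sender: ship the contiguous buffer as 2*n_elem doubles
cx_mat A = randu<cx_mat>(4, 4);
MPI_Send(A.memptr(), (int)(2 * A.n_elem), MPI_DOUBLE, dest, tag, MPI_COMM_WORLD);
// receiver: allocate first, then receive straight into the matrix buffer
cx_mat B(4, 4);
MPI_Recv(B.memptr(), (int)(2 * B.n_elem), MPI_DOUBLE, src, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);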

Measured runtime from C++ "time.h" is double the real time

I am running this pthreads C++ program (Gaussian elimination) on my laptop to measure its runtime.
The program runs for about 10 seconds in real time, but my output shows about 20 seconds. What is wrong with this program?
I used
g++ -pthread main.c
./a.out 32 2048
to run
#include <stdio.h>
#include <stdlib.h>
#include <ctime>
#include <cstdlib>
#include <pthread.h>
#include <iostream>
typedef float Type;
void mat_rand (Type**, int, int);
Type** mat_aloc (int, int);
void mat_free (Type**);
void mat_print (Type**, int, int);
void* eliminate(void*);
unsigned int n, max_threads, active_threads, thread_length;
Type** A;
int current_row;
struct args
{
int start;
int end;
};
typedef struct args argument;
void *print_message_function( void *ptr );
int main(int argc, char *argv[])
{
if (argc < 3)
{
printf ("Error!. Please Enter The Matrix Dimension and No. of Threads!\n");
return 0;
} else
{
n = atoi(argv[2]);
max_threads = atoi(argv[1]);
if (n > 4096)
{
printf ("The maximum allowed size is 4096!\n");
return 0;
}
if (max_threads > 32)
{
printf ("The maximum allowed Threads Count is 32!\n");
return 0;
}
}
A = mat_aloc(n , n+1);
mat_rand (A, n, n+1);
//mat_print (A, n, n+1);
std::clock_t start;
double exe_time;
start = std::clock();
pthread_attr_t attr;
pthread_attr_init(&attr);
argument* thread_args = new argument[max_threads];
pthread_t* thread = new pthread_t[max_threads];
for (int i=0; i<n-1; i++)
{
current_row = i;
if (max_threads >= n-i)
active_threads = n-i-1;
else
active_threads = max_threads;
thread_length = (n-i-1)/active_threads;
for (int j=0; j<active_threads-1; j++)
{
thread_args[j].start = i+1+j*thread_length;
thread_args[j].end = i+1+(j+1)*thread_length;
pthread_create( &thread[j], &attr, eliminate, (void*) &thread_args[j]);
}
thread_args[active_threads-1].start = i+1+(active_threads-1)*thread_length;
thread_args[active_threads-1].end = n-1;
pthread_create(&thread[active_threads-1], &attr, eliminate, (void*) &thread_args[active_threads-1]);
for (int j=0; j<active_threads; j++)
{
pthread_join(thread[j], NULL);
}
}
exe_time = (clock() - start) / (double) CLOCKS_PER_SEC;
printf("Execution time for Matrix of size %i: %f\n", n, exe_time);
//mat_print (A, n, n+1);
return 0;
}
void* eliminate(void* arg)
{
Type k, row_constant;
argument* info = (argument*) arg;
row_constant = A[current_row][current_row];
for (int i=info->start; i<=info->end; i++)
{
k = A[i][current_row] / row_constant;
A[i][current_row] = 0;
for (int j=current_row+1; j<n+1; j++)
{
A[i][j] -= k*A[current_row][j];
}
}
}
// matrix random values
void mat_rand (Type** matrix, int row, int column)
{
for (int i=0; i<row; i++)
for (int j=0; j<column; j++)
{
matrix[i][j] = (float)(1) + ((float)rand()/(float)RAND_MAX)*256;
}
}
// allocates a 2d matrix
Type** mat_aloc (int row, int column)
{
Type* temp = new Type [row*column];
if (temp == NULL)
{
delete [] temp;
return 0;
}
Type** mat = new Type* [row];
if (temp == NULL)
{
delete [] mat;
return 0;
}
for (int i=0; i<row; i++)
{
mat[i] = temp + i*column;
}
return mat;
}
// free memory of matrix
void mat_free (Type** matrix)
{
delete[] (*matrix);
delete[] matrix;
}
// print matrix
void mat_print (Type** matrix, int row, int column)
{
for (int i=0; i<row; i++)
{
for (int j=0; j<column; j++)
{
std::cout<< matrix[i][j] << "\t\t";
}
printf("\n");
}
printf(".................\n");
}
clock reports CPU time used. If you have 2 CPUs and run a thread on each one for 10 seconds, clock will report 20 seconds.
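To report elapsed wall-clock time instead of accumulated CPU time, std::chrono::steady_clock is one option; a sketch of how the timing in the program above could be changed (only the timing calls differ, the elimination loops stay as they are):

#include <chrono>
...
auto t_start = std::chrono::steady_clock::now();
// ... run the threaded elimination loops ...
auto t_end = std::chrono::steady_clock::now();
double exe_time = std::chrono::duration<double>(t_end - t_start).count(); // seconds of wall-clock time
printf("Execution time for Matrix of size %i: %f\n", n, exe_time);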

Unhandled exception with C++ class function

I am writing a program which will perform texture synthesis. I have been away from C++ for a while and am having trouble figuring out what I am doing wrong in my class. When I run the program, I get an unhandled exception in the copyToSample function when it tries to access the arrays. It is called from the bestSampleSearch function when the unhandled exception occurs. The function has been called before and works just fine, but later on in the program it is called a second time and fails. Any ideas? Let me know if anyone needs to see more code. Thanks!
Edit1: Added the bestSampleSearch function and the compareMetaPic function
Edit2: Added a copy constructor
Edit3: Added main()
Edit4: I have gotten the program to work. However, there is now a memory leak of some kind, or I am running out of memory when I run the program. The problem seems to be in the double for loop in main that starts with "// while the output picture is unfilled". If I comment this portion out, the program finishes in a timely manner but only one small square is output. Something must be wrong with my bestSampleSearch function.
MetaPic.h
#pragma once
#include <pic.h>
#include <stdlib.h>
#include <cmath>
class MetaPic
{
public:
Pic* source;
Pixel1*** meta;
int x;
int y;
int z;
MetaPic();
MetaPic(Pic*);
MetaPic(const MetaPic&);
MetaPic& operator=(const MetaPic&);
~MetaPic();
void allocateMetaPic();
void copyPixelData();
void copyToOutput(Pic*&);
void copyToMetaOutput(MetaPic&, int, int);
void copyToSample(MetaPic&, int, int);
void freeMetaPic();
};
MetaPic.cpp
#include "MetaPic.h"
MetaPic::MetaPic()
{
source = NULL;
meta = NULL;
x = 0;
y = 0;
z = 0;
}
MetaPic::MetaPic(Pic* pic)
{
source = pic;
x = pic->nx;
y = pic->ny;
z = pic->bpp;
allocateMetaPic();
copyPixelData();
}
MetaPic::MetaPic(const MetaPic& mp)
{
source = mp.source;
x = mp.x;
y = mp.y;
z = mp.z;
allocateMetaPic();
copyPixelData();
}
MetaPic::~MetaPic()
{
freeMetaPic();
}
// create a 3 dimensional array from the original one dimensional array
void MetaPic::allocateMetaPic()
{
meta = (Pixel1***)calloc(x, sizeof(Pixel1**));
for(int i = 0; i < x; i++)
{
meta[i] = (Pixel1**)calloc(y, sizeof(Pixel1*));
for(int j = 0; j < y; j++)
{
meta[i][j] = (Pixel1*)calloc(z, sizeof(Pixel1));
}
}
}
void MetaPic::copyPixelData()
{
for(int j = 0; j < y; j++)
{
for(int i = 0; i < x; i++)
{
for(int k = 0; k < z; k++)
meta[i][j][k] = source->pix[(j*z*x)+(i*z)+k];
}
}
}
void MetaPic::copyToOutput(Pic* &output)
{
for(int j = 0; j < y; j++)
{
for(int i = 0; i < x; i++)
{
for(int k = 0; k < z; k++)
output->pix[(j*z*x)+(i*z)+k] = meta[i][j][k];
}
}
}
// copy the meta data to the final pic output starting at the top left of the picture and mapped to 'a' and 'b' coordinates in the output
void MetaPic::copyToMetaOutput(MetaPic &output, int a, int b)
{
for(int j = 0; (j < y) && ((j+b) < output.y); j++)
{
for(int i = 0; (i < x) && ((i+a) < output.x); i++)
{
for(int k = 0; k < z; k++)
output.meta[i+a][j+b][k] = meta[i][j][k];
}
}
}
// copies from a source image to a smaller sample image
// *** Must make sure that the x and y coordinates have enough buffer space ***
void MetaPic::copyToSample(MetaPic &sample, int a, int b)
{
for(int j = 0; (j < sample.y) && ((b+j) < y); j++)
{
for(int i = 0; i < (sample.x) && ((a+i) < x); i++)
{
for(int k = 0; k < sample.z; k++)
{
sample.meta[i][j][k] = meta[i+a][j+b][k]; // <-- the unhandled exception occurs here
}
}
}
}
// free the meta pic data (MetaPic.meta)
// *** Not to be used outside of class declaration ***
void MetaPic::freeMetaPic()
{
for(int j = 0; j < y; j++)
{
for(int i = 0; i < z; i++)
free(meta[i][j]);
}
for(int i = 0; i < x; i++)
free(meta[i]);
free(meta);
}
MetaPic MetaPic::operator=(MetaPic mp)
{
MetaPic newMP(mp.source);
return newMP;
}
main.cpp
#ifdef WIN32
// For VC++ you need to include this file as glut.h and gl.h refer to it
#include <windows.h>
// disable the warning for the use of strdup and friends
#pragma warning(disable:4996)
#endif
#include <stdio.h> // Standard Header For Most Programs
#include <stdlib.h> // Additional standard Functions (exit() for example)
#include <iostream>
// Interface to libpicio, provides functions to load/save jpeg files
#include <pic.h>
#include <string.h>
#include <time.h>
#include <cmath>
#include "MetaPic.h"
using namespace std;
MetaPic bestSampleSearch(MetaPic, MetaPic);
double compareMetaPics(MetaPic, MetaPic);
#define SAMPLE_SIZE 23
#define OVERLAP 9
// Texture source image (pic.h uses the Pic* data structure)
Pic *sourceImage;
Pic *outputImage;
int main(int argc, char* argv[])
{
char* pictureName = "reg1.jpg";
int outputWidth = 0;
int outputHeight = 0;
// attempt to read in the file name
sourceImage = pic_read(pictureName, NULL);
if(sourceImage == NULL)
{
cout << "Couldn't read the file" << endl;
system("pause");
exit(EXIT_FAILURE);
}
// *** For now set the output image to 3 times the original height and width ***
outputWidth = sourceImage->nx*3;
outputHeight = sourceImage->ny*3;
// allocate the output image
outputImage = pic_alloc(outputWidth, outputHeight, sourceImage->bpp, NULL);
Pic* currentImage = pic_alloc(SAMPLE_SIZE, SAMPLE_SIZE, sourceImage->bpp, NULL);
MetaPic metaSource(sourceImage);
MetaPic metaOutput(outputImage);
MetaPic metaCurrent(currentImage);
// seed the output image
int x = 0;
int y = 0;
int xupperbound = metaSource.x - SAMPLE_SIZE;
int yupperbound = metaSource.y - SAMPLE_SIZE;
int xlowerbound = 0;
int ylowerbound = 0;
// find random coordinates
srand(time(NULL));
while((x >= xupperbound) || (x <= xlowerbound))
x = rand() % metaSource.x;
while((y >= yupperbound) || (y <= ylowerbound))
y = rand() % metaSource.y;
// copy a random sample from the source to the metasample
metaSource.copyToSample(metaCurrent, x, y);
// copy the seed to the metaoutput
metaCurrent.copyToMetaOutput(metaOutput, 0, 0);
int currentOutputX = 0;
int currentOutputY = 0;
// while the output picture is unfilled...
for(int j = 0; j < yupperbound; j+=(SAMPLE_SIZE-OVERLAP))
{
for(int i = 0; i < xupperbound; i+=(SAMPLE_SIZE-OVERLAP))
{
// move the sample to correct overlap
metaSource.copyToSample(metaCurrent, i, j);
// find the best match for the sample
metaCurrent = bestSampleSearch(metaSource, metaCurrent);
// write the best match to the metaoutput
metaCurrent.copyToMetaOutput(metaOutput, i, j);
// update the values
}
}
// copy the metaOutput to the output
metaOutput.copyToOutput(outputImage);
// output the image
pic_write("reg1_output.jpg", outputImage, PIC_JPEG_FILE);
// clean up
pic_free(sourceImage);
pic_free(outputImage);
pic_free(currentImage);
// return success
cout << "Done!" << endl;
system("pause");
// return success
return 0;
}
// finds the best sample to insert into the image
// *** best must be the sample which consists of the overlap ***
MetaPic bestSampleSearch(MetaPic source, MetaPic best)
{
MetaPic metaSample(best);
double bestScore = 999999.0;
double currentScore = 0.0;
for(int j = 0; j < source.y; j++)
{
for(int i = 0; i < source.x; i++)
{
// copy the image starting at the top left of the source image
source.copyToSample(metaSample, i, j);
// compare the sample with the overlap
currentScore = compareMetaPics(best, metaSample);
// if best score is greater than current score then copy the better sample to best and continue searching
if( bestScore > currentScore)
{
metaSample.copyToSample(best, 0, 0);
bestScore = currentScore;
}
// otherwise, the score is less than current score then do nothing (a better sample has not been found)
}
}
return best;
}
// find the comparison score for the two MetaPics based on their rgb values
// *** Both of the meta pics should be the same size ***
double compareMetaPics(MetaPic pic1, MetaPic pic2)
{
float r1 = 0.0;
float g1 = 0.0;
float b1 = 0.0;
float r2 = 0.0;
float g2 = 0.0;
float b2 = 0.0;
float r = 0.0;
float g = 0.0;
float b = 0.0;
float sum = 0.0;
// take the sum of the (sqrt((r1-r2)^2 + ((g1-g2)^2 + ((b1-b2)^2))
for(int j = 0; (j < pic1.y) && (j < pic2.y); j++)
{
for(int i = 0; (i < pic1.x) && (i < pic2.x); i++)
{
r1 = PIC_PIXEL(pic1.source, i, j, 0);
r2 = PIC_PIXEL(pic2.source, i, j, 0);
g1 = PIC_PIXEL(pic1.source, i, j, 1);
g2 = PIC_PIXEL(pic2.source, i, j, 1);
b1 = PIC_PIXEL(pic1.source, i, j, 2);
b2 = PIC_PIXEL(pic2.source, i, j, 2);
r = r1 - r2;
g = g1 - g2;
b = b1 - b2;
sum += sqrt((r*r) + (g*g) + (b*b));
}
}
return sum;
}
I'm not sure if this is the root cause of the problem, but your assignment operator does not actually assign anything:
MetaPic MetaPic::operator=(MetaPic mp)
{
MetaPic newMP(mp.source);
return newMP;
}
This should probably look something like the following (based off of the code in your copy constructor):
edit: with credit to Alf P. Steinbach
MetaPic& MetaPic::operator=(MetaPic mp)
{
mp.swap(*this);
return *this;
}
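(The copy-and-swap form above assumes MetaPic has a swap member, which the class as posted does not define; a minimal sketch of one, declared in the class body as void swap(MetaPic&):)

#include <algorithm> // std::swap
void MetaPic::swap(MetaPic& other)
{
    std::swap(source, other.source);
    std::swap(meta, other.meta);
    std::swap(x, other.x);
    std::swap(y, other.y);
    std::swap(z, other.z);
}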
It turns out that the deallocate function is incorrect. It should be freeing in the same manner that it was allocating.
void MetaPic::freeMetaPic()
{
for(int i = 0; i < x; i++)
{
for(int j = 0; j < y; j++)
free(meta[i][j]);
free(meta[i]);
}
free(meta);
}