Good day. I am having trouble running an MPI program that multiplies matrices.
This is the code (it is not mine); I took it from http://dkl.cs.arizona.edu/teaching/csc522-fall16/examples/hybrid-openmp-mm.c
I will be very grateful if you can help me.
I have also looked for similar problems and solutions, but none of them solved my problem.
#include <omp.h>
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define TAG 13
int main(int argc, char* argv[]) {
double** A, ** B, ** C, * tmp;
double startTime, endTime;
int numElements, offset, stripSize, myrank, numnodes, N, i, j, k;
int numThreads, chunkSize = 10;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &numnodes);
N = atoi(argv[1]);
numThreads = atoi(argv[2]); // difference from MPI: how many threads/rank?
omp_set_num_threads(numThreads); // OpenMP call to set threads per rank
// allocate A, B, and C --- note that you want these to be
// contiguously allocated. Workers need less memory allocated.
if (myrank == 0) {
tmp = (double*)malloc(sizeof(double) * N * N);
A = (double**)malloc(sizeof(double*) * N);
for (i = 0; i < N; i++)
A[i] = &tmp[i * N];
}
else {
tmp = (double*)malloc(sizeof(double) * N * N / numnodes);
A = (double**)malloc(sizeof(double*) * N / numnodes);
for (i = 0; i < N / numnodes; i++)
A[i] = &tmp[i * N];
}
tmp = (double*)malloc(sizeof(double) * N * N);
B = (double**)malloc(sizeof(double*) * N);
for (i = 0; i < N; i++)
B[i] = &tmp[i * N];
if (myrank == 0) {
tmp = (double*)malloc(sizeof(double) * N * N);
C = (double**)malloc(sizeof(double*) * N);
for (i = 0; i < N; i++)
C[i] = &tmp[i * N];
}
else {
tmp = (double*)malloc(sizeof(double) * N * N / numnodes);
C = (double**)malloc(sizeof(double*) * N / numnodes);
for (i = 0; i < N / numnodes; i++)
C[i] = &tmp[i * N];
}
if (myrank == 0) {
// initialize A and B
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
A[i][j] = 1.0;
B[i][j] = 1.0;
}
}
}
// start timer
if (myrank == 0) {
startTime = MPI_Wtime();
}
stripSize = N / numnodes;
// send each node its piece of A -- note could be done via MPI_Scatter
if (myrank == 0) {
offset = stripSize;
numElements = stripSize * N;
for (i = 1; i < numnodes; i++) {
MPI_Send(A[offset], numElements, MPI_DOUBLE, i, TAG, MPI_COMM_WORLD);
offset += stripSize;
}
}
else { // receive my part of A
MPI_Recv(A[0], stripSize * N, MPI_DOUBLE, 0, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}
// everyone gets B
MPI_Bcast(B[0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD);
// Let each process initialize C to zero
for (i = 0; i < stripSize; i++) {
for (j = 0; j < N; j++) {
C[i][j] = 0.0;
}
}
// do the work---this is the primary difference from the pure MPI program
#pragma omp parallel for shared(A,B,C,numThreads) private(i,j,k) schedule (static, chunkSize)
for (i = 0; i < stripSize; i++) {
for (j = 0; j < N; j++) {
for (k = 0; k < N; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
// master receives from workers -- note could be done via MPI_Gather
if (myrank == 0) {
offset = stripSize;
numElements = stripSize * N;
for (i = 1; i < numnodes; i++) {
MPI_Recv(C[offset], numElements, MPI_DOUBLE, i, TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
offset += stripSize;
}
}
else { // send my contribution to C
MPI_Send(C[0], stripSize * N, MPI_DOUBLE, 0, TAG, MPI_COMM_WORLD);
}
// stop timer
if (myrank == 0) {
endTime = MPI_Wtime();
printf("Time is %f\n", endTime - startTime);
}
// print out matrix here, if I'm the master
if (myrank == 0 && N < 10) {
for (i = 0; i < N; i++) {
for (j = 0; j < N; j++) {
printf("%f ", C[i][j]);
}
printf("\n");
}
}
MPI_Finalize();
return 0;
}
And this is my issue:
You are doing an MPI_Bcast on B as if it were a contiguous block of N*N elements. However, it's not: it's an array of pointers to N separate arrays of length N. So either you need to allocate B contiguously, or you need to do N broadcasts.
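For reference, here is a minimal sketch of the contiguous-allocation option that comment describes (illustrative names, not the posted code): one flat buffer backs the whole matrix and the row pointers index into it, so a single MPI_Bcast of N*N doubles starting at the first row covers everything.
/* Contiguous storage plus row pointers: B[i][j] and Bdata[i*N + j] alias the
   same element, so one broadcast of N*N doubles moves the entire matrix. */
double *Bdata = (double*)malloc(sizeof(double) * N * N);   /* one flat block */
double **B    = (double**)malloc(sizeof(double*) * N);     /* row pointers   */
for (int i = 0; i < N; i++)
    B[i] = &Bdata[i * N];                                  /* row i starts at offset i*N */
MPI_Bcast(&B[0][0], N * N, MPI_DOUBLE, 0, MPI_COMM_WORLD); /* single collective call */
The alternative mentioned in the comment, N separate broadcasts of one row each, avoids the flat buffer but costs N collective calls.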
I am a beginner in MPI programming and I am trying to do a matrix-vector multiplication (Ax=b).
Let's say matrix A is as follows:
|3 2 5|
matrix A= |4 3 1|
|2 4 2|
I divided matrix A into two matrices A1 and A2 as follows:
|1 2 3|
matrix A1= |3 2 1|
|1 2 0|
|2 0 2|
matrix A2= |1 1 0|
|1 2 2|
The vector x is:
| 2 |
vector x= | 1 |
| 3 |
I need to calculate Ax=b in such a way that process 1 does the A1 * x multiplication and gives C1, process 2 does the A2 * x multiplication and gives C2, and at the end the sum of C1 and C2 is wrapped up in C. When I run the code from the command line it stops working, and I don't know what the problem is. I would be really grateful if you could help me find out what is wrong in the code.
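For context, the way I picture the final step (summing C1 and C2 into C) is an element-wise reduction; a rough sketch, assuming each process keeps its partial result in a plain length-3 int array (illustrative names only, not the code below):
// Every rank contributes a length-3 partial result (zeros if it computed nothing),
// and MPI_Reduce sums the contributions element-wise into sumC on rank 0.
int partC[3] = { 0, 0, 0 };   // this rank's A1*x or A2*x, or zeros otherwise
int sumC[3]  = { 0, 0, 0 };   // final result, valid on rank 0 after the reduce
// ... fill partC with the local matrix-vector product ...
MPI_Reduce(partC, sumC, 3, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);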
Here is my code:
#define _CRT_SECURE_NO_WARNINGS
#include<iostream>
#include<fstream>
#include<vector>
#include<iterator>
#include<sstream>
#include<string>
#include<cstdlib>
#include<cmath>
#include<stdio.h>
#include<conio.h>
#include<algorithm>
#include<ctime>
#include<iomanip>
#include<mpi.h>
#include<time.h>
#include<assert.h>
using namespace std;
void Initialise(int **res, int rows, int cols);
void Multiply(int **res, int **A, int **B, int aRows, int aCols, int bRows, int bCols);
void timestamp();
//**********************************************************
/* |3 2 5|
matrix A= |4 3 1|
|2 4 2|
matrix A is divided into two matrix A1,A2
|1 2 3|
matrix A1= |3 2 1|
|1 2 0|
|2 0 2|
matrix A2= |1 1 0|
|1 2 2|
| 2 |
vector x= | 1 |
| 3 |
| 23 |
C = | 14 |
| 14 |
//*********************************************************
*/
int main(int argc,char **argv)
{
int id, p;
MPI_Status status;
// p -> no. of processes
// id -> process id
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &id);
MPI_Comm_size(MPI_COMM_WORLD, &p);
cout << p << endl;
char processor_name[MPI_MAX_PROCESSOR_NAME];
int name_len;
MPI_Get_processor_name(processor_name, &name_len);
if (id == 0)// master
{
double wtime = MPI_Wtime();
int aRows = 3;
int aCols = 3;
int bRows = 3;
int bCols = 1;
int** A = new int*[aRows];
for (int i = 0; i < aRows; i++)
{
A[i] = new int[aCols];
}
int** A1 = new int*[aRows];
for (int i = 0; i < aRows; i++)
{
A1[i] = new int[aCols];
}
int** A2 = new int*[aRows];
for (int i = 0; i < aRows; i++)
{
A2[i] = new int[aCols];
}
int** B = new int*[bRows];
for (int i = 0; i < bRows; i++)
{
B[i] = new int[bCols];
}
//***************************************
A[0][0] = 3;
A[0][1] = 2;
A[0][2] = 5;
A[1][0] = 4;
A[1][1] = 3;
A[1][2] = 1;
A[2][0] = 2;
A[2][1] = 4;
A[2][2] = 2;
B[0][0] = 2;
B[1][0] = 1;
B[2][0] = 3;
//**************************************
A1[0][0] = 1;
A1[0][1] = 2;
A1[0][2] = 3;
A1[1][0] = 3;
A1[1][1] = 2;
A1[1][2] = 1;
A1[2][0] = 1;
A1[2][1] = 2;
A1[2][2] = 0;
//**************************************
A2[0][0] = 2;
A2[0][1] = 0;
A2[0][2] = 2;
A2[1][0] = 1;
A2[1][1] = 1;
A2[1][2] = 0;
A2[2][0] = 1;
A2[2][1] = 2;
A2[2][2] = 2;
//*************************************
B[0][0] = 2;
B[1][0] = 1;
B[2][0] = 3;
//*************************************
int** C;
C = new int*[aRows];
for (int i = 0; i < aRows; i++)
{
C[i] = new int[bCols];
}
//************************************
int** C1;
C1 = new int*[aRows];
for (int i = 0; i < aRows; i++)
{
C1[i] = new int[bCols];
}
//************************************
int** C2;
C2 = new int*[aRows];
for (int i = 0; i < aRows; i++)
{
C2[i] = new int[bCols];
}
//***********************************
Multiply(C, A, B, aRows, aCols, bRows, bCols);
for (int i = 0; i < aRows; i++)
{
for (int j = 0; j < bCols; j++)
{
std::cout << C[i][j] << ' ';
}
std::cout << '\n';
}
MPI_Send(&aRows, 1, MPI_INT, 1, 1, MPI_COMM_WORLD);
MPI_Send(&aCols, 1, MPI_INT, 1, 2, MPI_COMM_WORLD);
MPI_Send(&bRows, 1, MPI_INT, 1, 3, MPI_COMM_WORLD);
MPI_Send(&bCols, 1, MPI_INT, 1, 4, MPI_COMM_WORLD);
MPI_Send(&aRows, 1, MPI_INT, 2, 5, MPI_COMM_WORLD);
MPI_Send(&aCols, 1, MPI_INT, 2, 6, MPI_COMM_WORLD);
MPI_Send(&bRows, 1, MPI_INT, 2, 7, MPI_COMM_WORLD);
MPI_Send(&bCols, 1, MPI_INT, 2, 8, MPI_COMM_WORLD);
MPI_Send(&A1, aRows*aCols, MPI_INT, 1, 9, MPI_COMM_WORLD);
MPI_Send(&B , bRows*bCols, MPI_INT, 1, 10, MPI_COMM_WORLD);
MPI_Send(&A2, aRows*aCols, MPI_INT, 2, 11, MPI_COMM_WORLD);
MPI_Send(&B, bRows*bCols, MPI_INT, 2, 12, MPI_COMM_WORLD);
}
for (id=1;id<3;id++)
{
if (id == 1)
{
int aRows, aCols, bRows, bCols;
MPI_Recv(&aRows, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
printf("receive data:%d", aRows);
MPI_Recv(&aCols, 1, MPI_INT, 0, 2, MPI_COMM_WORLD, &status);
printf("receive data:%d", aCols);
MPI_Recv(&bRows, 1, MPI_INT, 0, 3, MPI_COMM_WORLD, &status);
printf("receive data:%d", bRows);
MPI_Recv(&bCols, 1, MPI_INT, 0, 4, MPI_COMM_WORLD, &status);
printf("receive data:%d", bCols);
//int s = status.MPI_SOURCE;
//int t = status.MPI_TAG;
int** A1 = new int*[aRows];
for (int i = 0; i < aRows; i++)
{
A1[i] = new int[aCols];
}
int** B = new int*[bRows];
for (int i = 0; i < bRows; i++)
{
B[i] = new int[bCols];
}
int** C1 = new int*[bRows];
for (int i = 0; i < bRows; i++)
{
C1[i] = new int[bCols];
}
//***********************************************************
MPI_Recv(&A1, aRows*aCols, MPI_INT, 0, 9, MPI_COMM_WORLD, &status);
printf("receive data:%d", A1);
MPI_Recv(&B , aRows*aCols, MPI_INT, 0, 10, MPI_COMM_WORLD, &status);
printf("receive data:%d", B);
Multiply(C1, A1, B, aRows, aCols, bRows, bCols);
for (int i = 0; i < aRows; i++)
{
for (int j = 0; j < bCols; j++)
{
cout << C1[i][j] << endl;
}
}
}
else
{
int aRows, aCols, bRows, bCols;
MPI_Recv(&aRows, 1, MPI_INT, 0, 5, MPI_COMM_WORLD, &status);
MPI_Recv(&aCols, 1, MPI_INT, 0, 6, MPI_COMM_WORLD, &status);
MPI_Recv(&bRows, 1, MPI_INT, 0, 7, MPI_COMM_WORLD, &status);
MPI_Recv(&bCols, 1, MPI_INT, 0, 8, MPI_COMM_WORLD, &status);
int** A2 = new int*[aRows];
for (int i = 0; i < aRows; i++)
{
A2[i] = new int[aCols];
}
int** B = new int*[bRows];
for (int i = 0; i < bRows; i++)
{
B[i] = new int[bCols];
}
int** C2 = new int*[bRows];
for (int i = 0; i < bRows; i++)
{
C2[i] = new int[bCols];
}
MPI_Recv(&A2, aRows*aCols, MPI_INT, 0, 11, MPI_COMM_WORLD, &status);
printf("receive data:%d", A2);
MPI_Recv(&B , aRows*aCols, MPI_INT, 0, 12, MPI_COMM_WORLD, &status);
printf("receive data:%d", B);
//**************************************************************
MPI_Status status;
Multiply(C2, A2, B, aRows, aCols, bRows, bCols);
for (int i = 0; i < aRows; i++)
{
for (int j = 0; j < bCols; j++)
{
cout << C2[i][j] << endl;
}
}
}
//MPI_Recv(&(C1[0][0]), aRows*bCols, MPI_INT, 0, tag, MPI_COMM_WORLD,&status);
}
MPI_Finalize();
return 0;
}
void Multiply(int **res, int **A, int **B, int aRows, int aCols, int bRows, int bCols)
{
if (aCols != bRows)
return;
for (int i = 0; i < aRows; i++)
{
for (int j = 0; j < bCols; j++)
{
res[i][j] = 0;
for (int k = 0; k < aCols; k++)
{
res[i][j] += A[i][k] * B[k][j];
}
}
}
}
void Initialise(int **res, int rows, int cols)
{
for (int i = 0; i < rows; i++)
{
for (int j = 0; j < cols; j++)
{
res[i][j] = 0;
}
}
}
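For reference, this is the Send/Recv pattern I was trying to follow for the matrix transfers, sketched with made-up names and under the assumption that the matrix lives in one contiguous int buffer (so the buffer pointer itself is what gets passed to MPI):
// A rows x cols matrix in a single contiguous buffer, indexed as M[i*cols + j];
// one Send/Recv of rows*cols ints then moves the whole matrix between ranks.
const int rows = 3, cols = 3;
int *M = new int[rows * cols];
for (int i = 0; i < rows; i++)
    for (int j = 0; j < cols; j++)
        M[i * cols + j] = 0;                    // element (i, j)
if (id == 0)
    MPI_Send(M, rows * cols, MPI_INT, 1, 0, MPI_COMM_WORLD);
else if (id == 1)
    MPI_Recv(M, rows * cols, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
delete[] M;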
My program is meant to take an array size and the elements of that particular array from the user.
However, I want the program to be able to distribute the array elements evenly for any number of processors used.
I think the problem is in the displs array, but even after countless attempts I don't seem to be reaching any logical conclusion.
Let's say I enter a sequence of 7 numbers -> 1,2,3,4,5,6,7
I get output like this:
processor 0
arr[0] = 1
arr[1] = 2
arr[2] = 3
processor 1
arr[0] = 4
arr[1] = 5
processor 2
arr[0] = 7
arr[1] = 32767
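For reference, the way I understand the counts and displacements are usually built is one entry per process (wsize entries, not N), with the first N % wsize ranks taking one extra element and each displacement equal to the running sum of the counts before it; a hedged sketch of that idea (not my actual code, which follows below):
/* One sendcount/displacement per rank; the send buffer arr is only read on
   the root, and every rank receives exactly sendcount[myrank] elements. */
int *sendcount = (int*)malloc(wsize * sizeof(int));
int *displs    = (int*)malloc(wsize * sizeof(int));
int offset = 0;
for (int r = 0; r < wsize; r++) {
    sendcount[r] = N / wsize + (r < N % wsize ? 1 : 0);  /* extra element for first ranks */
    displs[r]    = offset;                               /* start of rank r's slice       */
    offset      += sendcount[r];
}
MPI_Scatterv(arr, sendcount, displs, MPI_INT,
             arr_r, sendcount[myrank], MPI_INT, 0, MPI_COMM_WORLD);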
The code is the following:
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#define ARRAY_SIZE 100
int main(int argc, char **argv)
{
int myrank, wsize;
int i,N;
int *arr,*displs, *arr_r, *sendcount;
int sum1=0;
int portion,remainder,x,y;
int root;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
MPI_Comm_size(MPI_COMM_WORLD, &wsize);
if(myrank == 0)
{
printf("Enter number N of integers\n");
scanf("%d", &N);
arr = (int*)malloc(N*sizeof(int));
for(i = 0; i < N; i++)
{
printf("Enter number %d\n", i+1);
scanf("%d",&arr[i]);
}
}
MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
portion = N / wsize;
remainder = N % wsize;
x = portion;
y = portion +1;
displs = (int*)malloc(N*sizeof(int));
sendcount = (int*)malloc(N*sizeof(int));
for(i=0; i < N; i++)
{
if(myrank < remainder)
{
sendcount[i] = portion + (remainder);
displs[i] = (portion + (remainder)) * i;
}
else if(remainder == 0)
{
sendcount[i] = portion;
displs[i] = portion *i;
}
else
{
sendcount[i] = portion;
displs[i] = portion * i;
}
}
arr_r = (int*)malloc(N *sizeof(int));
MPI_Scatterv(arr, sendcount, displs, MPI_INT, arr_r, N, MPI_INT, 0, MPI_COMM_WORLD);
if(myrank < remainder)
{
printf("process %d \n",myrank);
for(i = 0; i < portion + 1; i++)
{
printf("Arr[%d] = %d\n",i,arr_r[i]);
}
}
else if(remainder == 0)
{
printf("process %d \n",myrank);
for(i = 0; i < portion; i++)
{
printf("Arr[%d] = %d\n",i,arr_r[i]);
}
}
else
{
printf("process %d \n",myrank);
for(i = 0; i < portion; i++)
{
printf("Arr[%d] = %d\n",i,arr_r[i]);
}
}
MPI_Finalize();
return 0;
}
I got a syntax error in an MPI_Send command. I wanted to send some rows and the respective columns with it. I get the error in this line, MPI_Send(&(array[ch_row][ch_col]), ch_size*col, MPI_INT, p, 1, MPI_COMM_WORLD), at ch_col. I can't understand why I am getting this error.
int tot_processes;
int process_id;
MPI_Comm_size(MPI_COMM_WORLD, &tot_processes);
MPI_Comm_rank(MPI_COMM_WORLD, &process_id);
if (process_id == 0) {
int row, col;
cout << "Enter rows and columns: ";
cin >> row >> col;
int *array = new int[row*col];
for (int i = 0; i < row; i++) {
for (int j = 0; j < col; j++) {
array[i][j] = 1;
}
}
int ch_size = row / tot_processes;
for (int p = 1; p < tot_processes; p++) {
int ch_row = ch_size * (p - 1);
int ch_col = ch_size * col;
MPI_Send(&ch_size, 1, MPI_INT, p, 0, MPI_COMM_WORLD);
MPI_Send(&(array[ch_row][ch_col]), ch_size*col, MPI_INT, p, 1, MPI_COMM_WORLD);
}
}
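For reference, since array is allocated as a single int[row*col], my understanding is that element (i, j) would be addressed as array[i*col + j], and a block of ch_size rows starting at row ch_row would begin at &array[ch_row*col]; a hedged sketch of that indexing (same variable names as above, shown as a self-contained fragment):
// Flat row-major storage: element (i, j) lives at array[i*col + j], and a
// contiguous block of ch_size rows starts at &array[ch_row * col].
int *array = new int[row * col];
for (int i = 0; i < row; i++)
    for (int j = 0; j < col; j++)
        array[i * col + j] = 1;                         // fill element (i, j)
for (int p = 1; p < tot_processes; p++) {
    int ch_row = ch_size * (p - 1);                     // first row of this chunk
    MPI_Send(&array[ch_row * col], ch_size * col, MPI_INT, p, 1, MPI_COMM_WORLD);
}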
In my parallel programming book, I came across this code, which the book says has the slaves generate the data set; however, I think the master actually generates the data set.
These lines in particular are why I believe the master generates the data set.
for (i=0; i < ARRAY_SIZE; i++)
numbers[i] = i;
Can someone confirm whether the master or the slaves generate the data set?
#include "mpi.h"
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#define TRIALS 20
#define ARRAY_SIZE 1000000
int main(int argc, char *argv[])
{
int myid, numprocs;
double startwtime, endwtime;
int namelen;
int* numbers = new int[ARRAY_SIZE];
int i, j, sum, part_sum;
int s, s0, startIndex, endIndex;
double totalTime;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Init(&argc,&argv);
MPI_Comm_size(MPI_COMM_WORLD,&numprocs);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
MPI_Get_processor_name(processor_name,&namelen);
fprintf(stderr,"Process %d on %s\n", myid, processor_name);
fflush(stderr);
for (i=0; i < ARRAY_SIZE; i++)
numbers[i] = i;
if (myid == 0)
{
s = (int) floor(ARRAY_SIZE/numprocs);
s0 = s + ARRAY_SIZE%numprocs;
//printf("s=%d , s0= %d\n", s, s0);
}
MPI_Bcast(&s, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&s0, 1, MPI_INT, 0, MPI_COMM_WORLD);
startIndex = s0 + (myid - 1)*s;
endIndex = startIndex + s;
totalTime = 0;
for (j = 1; j <= TRIALS; j++)
{
if (myid == 0)
{
startwtime = MPI_Wtime();
}
sum = 0;
part_sum = 0;
if (myid == 0) // master
{
// compute sum of master's numbers
for (i = 0; i < s0; i++)
{
part_sum += numbers[i];
}
}
else
{
for (i = startIndex; i < endIndex; i++)
{
part_sum += numbers[i];
}
}
MPI_Reduce(&part_sum, &sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
if (myid == 0)
{
double runTime;
endwtime = MPI_Wtime();
runTime = endwtime - startwtime;
printf("Trial %d : Execution time (sec) = %f\n", j, runTime);
printf("Sum = %d \n", sum);
totalTime += runTime;
}
} // end for
if (myid == 0)
printf("Average time for %d trials = %f", TRIALS, totalTime/TRIALS);
MPI_Finalize();
}
Both the master and the slaves generate the entire array. You have to remember that the same program runs on every process, and the loop in question is not guarded by a rank check, so it makes no distinction between master and slaves. The wording of your book isn't wrong, but it could be clearer. :)
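If the intent were for only the master to generate the data, the fill loop would need a rank guard and the array would then have to be distributed explicitly, for example with a broadcast; a minimal hedged sketch of that variant (not what the book's code does), reusing the variables from the program above:
// Only rank 0 fills the array, then every rank receives a copy of it.
if (myid == 0)
{
    for (i = 0; i < ARRAY_SIZE; i++)
        numbers[i] = i;                       // generated on the master only
}
MPI_Bcast(numbers, ARRAY_SIZE, MPI_INT, 0, MPI_COMM_WORLD);  // copy to all ranks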