Managing large vectors with MPI I/O in C - c++

I am trying to compute the distance matrix for a set of spatial coordinates (x and y) indexed by the array "a". The following is a simplified program in which I scatter only the index array into a local array local_a on each process. Instead of evaluating the entire distance matrix, I only compute the upper triangular part, without the trivial diagonal entries (which are zero). I am trying to use MPI I/O: each process computes the Euclidean distances "d" for one row, stores them in an array, and then writes that array to an output file. I define the file offset as a function of local_a[i], which also determines the local dimension (SIZE - local_a[i]), since the number of elements decreases as one moves down the rows of an upper triangular matrix. The code below works well for up to 8000 elements but fails with "mca_fbtl_posix_pwritev: error in writev:Invalid argument" when I use 80,000 as the vector size. I am not sure what is causing this error, and any help will be greatly appreciated.
#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
#include <string.h>
#include <stdbool.h>
#include <math.h>
int main(int argc, char *argv[])
{
int procid, numprocs, ierr, *a, SIZE, num_rows;
SIZE = 80000;
MPI_File fh;
double *x, *y, tstart, tend;
a = (int*) malloc(SIZE*sizeof(int));
x =(double*) malloc(SIZE*sizeof(double));
y =(double*) malloc(SIZE*sizeof(double));
ierr = MPI_Init(&argc, &argv);
ierr = MPI_Comm_rank(MPI_COMM_WORLD, &procid);
ierr = MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
MPI_Status status;
MPI_Offset offset;
num_rows = SIZE/numprocs;
for ( int i = 0; i < SIZE; i ++)
{
a[i] = i+1;
x[i] = i+1;
y[i] = i+2;
}
// Define local array for each process
int *local_a;
local_a = (int*) malloc(num_rows*sizeof(int));
// Scatter only the index array (a) to each process in a cyclical manner
for (int i = 0; i < num_rows; i ++)
{
ierr = MPI_Scatter(&a[i*numprocs], 1, MPI_INT, &local_a[i], 1, MPI_INT, 0, MPI_COMM_WORLD);
}
MPI_File_open(MPI_COMM_WORLD, "data.bin", MPI_MODE_CREATE|MPI_MODE_WRONLY, MPI_INFO_NULL, &fh);
for ( int i = 0; i < num_rows ; i++)
{
int p1 = local_a[i];
int local_dim = (SIZE-local_a[i]);
double *Dist;
Dist = (double*) malloc(local_dim*sizeof(double));
if ( local_a[i]==1)
{
offset = 2*0*sizeof(int);
}
else
{
int sum =0;
for ( int i = (SIZE-1); i >local_dim; i--)
{
sum +=i;
}
offset = 2*sum*sizeof(int);
if ( local_dim == 0)
{
local_dim = 1;
}
}
for ( int j = local_a[i]; j < SIZE; j++)
{
double d;
d = pow((pow(x[p1-1]-x[j],2) + pow(y[p1 -1]-y[j],2)),0.5);
Dist[j-local_a[i]] = d;
}
ierr = MPI_File_write_at(fh,offset,Dist,local_dim,MPI_DOUBLE,&status);
ierr = MPI_Barrier(MPI_COMM_WORLD);
free(Dist);
}
MPI_File_close(&fh);
ierr = MPI_Finalize();
free(a);
free(x);
free(y);
free(local_a);
return ierr;
}

Related

MPI_Reduce on different communicators not working as expected

#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
using namespace std;
// Integer division of x by y, bumped by one when the remainder is positive.
int ceil(int x, int y) {
    const int quotient = x / y;
    const int bump = (x % y > 0) ? 1 : 0;
    return quotient + bump;
}
// Builds a group holding every second rank of *world_group (0, 2, 4, ... when
// is_even is true, otherwise 1, 3, 5, ...) and creates a communicator for it
// from MPI_COMM_WORLD. The results are returned through group and comm.
void create_group_and_comm(MPI_Group *world_group, MPI_Group *group, MPI_Comm *comm, int size, bool is_even) {
    const int first_rank = is_even ? 0 : 1;
    const int capacity = is_even ? ceil(size, 2) : size / 2;
    int *members = (int *)malloc(capacity * sizeof(int));

    int filled = 0;
    for (int r = first_rank; r < size; r += 2) {
        members[filled] = r;
        ++filled;
    }

    MPI_Group_incl(*world_group, filled, members, group);
    MPI_Comm_create(MPI_COMM_WORLD, *group, comm);
    free(members);
}
int main(int argc, char *argv[])
{
int size, rank, *result_odd, *result_even;
int rank_gr;
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Status status;
MPI_Comm even_comm, odd_comm;
MPI_Group even_group, odd_group, world_group;
int *A, *Rows;
int namelen;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &size);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Get_processor_name(processor_name, &namelen);
if (rank == 0)
{
A = (int *)malloc(size * size * sizeof(int));
for (int i = 0; i < size * size; i++) {
A[i] = rand() / 1000000;
}
printf("Initial data:\n");
for (int i = 0; i < size; i++)
{
putchar('|');
for (int j = 0; j < size; j++)
printf("%.4d ", A[i*size+j]);
printf("|\n");
}
MPI_Barrier(MPI_COMM_WORLD);
}
else
MPI_Barrier(MPI_COMM_WORLD);
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
create_group_and_comm(&world_group, &even_group, &even_comm, size, true);
create_group_and_comm(&world_group, &odd_group, &odd_comm, size, false);
Rows = new int[size];
MPI_Scatter(A, size, MPI_INT, Rows, size, MPI_INT, 0, MPI_COMM_WORLD);
result_odd = new int[size];
result_even = new int[size];
if(rank % 2 == 0) {
MPI_Reduce(Rows,result_even,size,MPI_INT,MPI_MAX,0,even_comm);
} else {
MPI_Reduce(Rows,result_odd,size,MPI_INT,MPI_MIN,0,odd_comm);
}
if(rank == 0) {
printf("Max values for columns on even:\n");
for(int idx = 0; idx < size;idx++) {
printf("Column %d: %d\n", idx+1, result_even[idx]);
}
printf("Max values for columns on odd:\n");
for(int idx = 0; idx < size;idx++) {
printf("Column %d: %d\n", idx+1, result_odd[idx]);
}
}
//MPI_Comm_free(&even_comm);
//MPI_Comm_free(&odd_comm);
MPI_Group_free(&even_group);
MPI_Group_free(&odd_group);
MPI_Finalize();
return 0;
}
Hello, I'm writing an application using the MPI library, and I'm trying to create 2 groups, each with its own communicator. One group, which holds the processes with even rank, calculates the maximum value per column using MPI_Reduce among the processes in that group, and the second one calculates the minimum for each column of the matrix. For the even ranks MPI_Reduce works as expected, but for the processes with odd rank it does not work as it should. Can someone help me see what I'm doing wrong? Below is a picture of the problem I described:
image here

Harmonic Sum using mpi : having trouble getting started with mpi, not sure how to modify this to a harmonic series

I am new to using MPI, I am attempting to modify this code to do a harmonic series of n = 20 and I am not too sure how to start. How is the MPI exactly working and what do we do to modify it.
Prompt: Modify the sum.cpp program to compute sum = 1 + 1/2 + 1/3 + 1/4 + … 1/n!
let n = 20
#include "mpi.h"
#include <cstdio>
#include <cmath>
#include <cstdlib>
#define ARRAY_SIZE 1000000
int main (int argc, char *argv[]) {
int myid, numprocs;
int namelen;
int* numbers = new int[ARRAY_SIZE];
char processor_name[MPI_MAX_PROCESSOR_NAME];
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
MPI_Comm_rank(MPI_COMM_WORLD, &myid);
MPI_Get_processor_name(processor_name, &namelen);
printf("Process %d on %s\n", myid, processor_name);
for (int i=0; i<ARRAY_SIZE; i++)
numbers[i] = i; //could be randomly generated
int s = (int)floor(ARRAY_SIZE/numprocs);
int s0 = s + ARRAY_SIZE%numprocs;
int startIndex = s0 + (myid-1)*s;
int endIndex = startIndex + s;
double startwtime;
if (myid == 0) {
startwtime = MPI_Wtime();
}
int i;
int part_sum = 0;
if (myid == 0) {
// master worker - comput the master's numbers
for (i=0; i<s0; i++) {
part_sum += numbers[i];
}
printf("Process %d - startIndex 0 endIndex %d; part_sum %ld\n",
myid, s0-1, part_sum);
} else {
//slave's work
for (i= startIndex; i<endIndex; i++) {
part_sum += numbers[i];
}
printf ("Process %d - startIndex %d endIndex %d; part_sum %ld\n",
myid, startIndex, endIndex-1, part_sum);
}
int sum = 0;
MPI_Reduce(&part_sum, &sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
if (myid == 0) {
double runTime = MPI_Wtime() - startwtime;
printf("Execution time (sec) = %f sum = %ld \n",
runTime, sum);
}
delete[] numbers;
MPI_Finalize();
}

MPI - How to partition and communicate my array portions between master and worker processes

I am having a problem executing my master/worker MPI program.
The goal is to have the master pass portions of the integer array to the workers, have the workers sort their portions, and then return array portion to the master process which then combines the portions into finalArray[].
I think it has something to do with how I'm passing the portions of the array between processes, but I can't seem to think of anything new to try.
My code:
// qsort comparator for ints: returns -1, 0 or 1 for less, equal or greater.
int compare(const void * a, const void * b)
{
    const int lhs = *(const int *)a;
    const int rhs = *(const int *)b;
    return (lhs > rhs) - (lhs < rhs);
}
const int arraySize = 10000;
// Master/worker sort: rank 0 fills an array with random numbers, hands each
// worker one contiguous chunk, the workers sort their chunk with qsort and
// send it back, and rank 0 stores the chunks at their original positions in
// finalArray and prints it. Needs at least two processes.
int main(int argc, char ** argv)
{
    int rank;
    int numProcesses;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &numProcesses);

    // Rank 0 only coordinates, so at least one worker is required (this also
    // avoids a division by zero below).
    const int numWorkers = numProcesses - 1;
    if (numWorkers < 1)
    {
        if (rank == 0)
        {
            std::cerr << "This program needs at least 2 MPI processes (1 master + workers)\n";
        }
        MPI_Finalize();
        return 1;
    }

    const int PART = arraySize / numWorkers;       // chunk size per worker
    const int REMAINDER = arraySize % numWorkers;  // extra elements for the last worker
    auto start = std::chrono::high_resolution_clock::now(); //start timer
    //================================= MASTER PROCESS =================================
    if (rank == 0)
    {
        int bigArray[arraySize];
        int finalArray[arraySize];
        for (int i = 0; i < arraySize; i++) //random number generator
        {
            bigArray[i] = rand();
        }
        for (int worker = 1; worker <= numWorkers; worker++)
        {
            // Each worker gets its own slice; the last one also takes the remainder.
            const int offset = (worker - 1) * PART;
            const int count = PART + (worker == numWorkers ? REMAINDER : 0);
            MPI_Send(bigArray + offset, count, MPI_INT, worker, 0, MPI_COMM_WORLD);
        }
        for (int worker = 1; worker <= numWorkers; worker++)
        {
            // Receive the sorted chunk straight into its place in finalArray.
            const int offset = (worker - 1) * PART;
            const int count = PART + (worker == numWorkers ? REMAINDER : 0);
            MPI_Recv(finalArray + offset, count, MPI_INT, worker, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        }
        // NOTE: finalArray is a concatenation of individually sorted chunks;
        // a merge step is still required for a globally sorted result.
        for (int m = 0; m < arraySize; m++)
        {
            printf(" Sorted Array: %d \n", finalArray[m]); //print my sorted array
        }
    }
    //================================ WORKER PROCESSES ===============================
    if (rank != 0)
    {
        const int count = PART + (rank == numWorkers ? REMAINDER : 0);
        std::unique_ptr<int[]> tmpArray(new int[count]);
        // .get() yields the underlying buffer; &tmpArray would be the address
        // of the smart-pointer object itself.
        MPI_Recv(tmpArray.get(), count, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        qsort(tmpArray.get(), count, sizeof(int), compare);
        MPI_Send(tmpArray.get(), count, MPI_INT, 0, 0, MPI_COMM_WORLD); //send sorted chunk back to rank 0
    }
    MPI_Barrier(MPI_COMM_WORLD);
    auto end = std::chrono::high_resolution_clock::now(); //end timer
    std::cout << "process took: "
              << std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count() //prints timer
              << " nanoseconds\n ";
    MPI_Finalize();
    return 0;
}
I am fairly new to MPI and C++ so any advice on either subject related to this problem is extremely helpful. I realize there may be many problems with this code so thank you for all help in advance.

Difference in speed between GSL and MKL

I have two codes that are both working, yet I cannot figure out why one is so much faster than the other. To my knowledge, BLAS with MKL (Intel) should be much faster than GSL (GNU), although my code is showing quite the opposite. Here are the codes themselves where I am simply creating 2 matrices at the master node and then sending different rows to different "slave" processors (with OpenMPI) which compute the final matrices elements and then return them back to the master node.
GSL example (the fast code):
#include <iostream>
#include <stdio.h>
#include <iostream>
#include <cmath>
#include <mpi.h>
#include <gsl/gsl_blas.h>
using namespace std;
int main(int argc, char** argv){
int noprocs, nid;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &nid);
MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
int master = 0;
const int nsame = 1000; //must be same if matrices multiplied together = acols = brows
const int arows = 1000;
const int bcols = 1000;
int rowsent;
double * buff;
buff = new double [nsame];
double * b;
b = new double [nsame*bcols];
double** c = new double*[arows];
for(int i = 0; i < arows; ++i)
c[i] = new double[bcols];
double * CC;
CC = new double [1*bcols]; //here ncols corresponds to numbers of rows for matrix b
for (int i = 0; i < bcols; i++){
CC[i] = 0.;
}; //this is imply a 1-d array of zeros which will be updated and passed by processors
// Master part
if (nid == master ) {
double** a = new double*[arows];
for(int i = 0; i < arows; ++i){
a[i] = new double[nsame];}
for (int i = 0; i < arows; i++){
for (int j = 0; j < nsame; j++){
if (i == j)
a[i][j] = 1.;
else
a[i][j] = 0.;
}
}
for (int i = 0; i < (nsame*bcols); i++){
b[i] = (10.*i + 3.)/(3.*i - 2.) ;
}
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); //assumes stored as contguous block of code
// send one row to each slave tagged with row number, assume nprocs<nrows
rowsent=0;
for (int i=1; i < (noprocs); i++) { //must be equal to noprocs otherwise it will not send to 3
MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
rowsent++;
}
for (int i=0; i<arows; i++) {
MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
MPI_COMM_WORLD, &status);
int sender = status.MPI_SOURCE;
int anstype = status.MPI_TAG; //row number+1
int IND_I = 0;
while (IND_I < bcols){
c[anstype - 1][IND_I] = CC[IND_I];
IND_I++;
}
if (rowsent < arows) {
MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
rowsent++;
}
else { // tell sender no more work to do via a 0 TAG
MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
}
}
}
// Slave part
else {
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
while(status.MPI_TAG != 0) {
int crow = status.MPI_TAG;
gsl_matrix_view AAAA = gsl_matrix_view_array(buff, 1, nsame);
gsl_matrix_view BBBB = gsl_matrix_view_array(b, nsame, bcols);
gsl_matrix_view CCCC = gsl_matrix_view_array(CC, 1, bcols);
/* Compute C = A B */
gsl_blas_dgemm (CblasNoTrans, CblasNoTrans, 1.0, &AAAA.matrix, &BBBB.matrix,
0.0, &CCCC.matrix);
MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
// cout << ans << " OUTPUT \n";
}
}
MPI_Finalize();
return 0;
};
MKL example (the slow code):
#include <iostream>
#include <stdio.h>
#include <iostream>
#include <cmath>
#include <mpi.h>
#include </opt/intel/compilers_and_libraries_2017.1.126/mac/mkl/include/mkl.h>
using namespace std;
int main(int argc, char** argv){ //THE IDENTITY MATRIX ONLY WORKS IF arows = nsame!
int noprocs, nid;
MPI_Status status;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &nid);
MPI_Comm_size(MPI_COMM_WORLD, &noprocs);
int master = 0;
const int nsame = 1000;
const int arows = 1000;
const int bcols = 1000;
int rowsent;
double * buff;
buff = new double [nsame];
double * b;
b = new double [nsame*bcols];
double** c = new double*[arows];
for(int i = 0; i < arows; ++i)
c[i] = new double[bcols];
double * CC;
CC = new double [1*bcols];
for (int i = 0; i < bcols; i++){
CC[i] = 0.;
};
// Master part
if (nid == master ) {
double** a = new double*[arows];
for(int i = 0; i < arows; ++i){
a[i] = new double[nsame];}
for (int i = 0; i < arows; i++){
for (int j = 0; j < nsame; j++){
if (i == j)
a[i][j] = 1.;
else
a[i][j] = 0.;
}
}
for (int i = 0; i < (nsame*bcols); i++){
b[i] = (10.*i + 3.)/(3.*i - 2.) ; // = 1.*i as test value
}
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD); //assumes stored as contguous block of code nprocs<nrows
delete[] b;
rowsent=0;
for (int i=1; i < (noprocs); i++) { //must be equal to noprocs otherwise it will not send to 3
MPI_Send(a[rowsent], nsame, MPI_DOUBLE_PRECISION,i,rowsent+1,MPI_COMM_WORLD);
delete[] a[rowsent];
rowsent++;
}
for (int i=0; i<arows; i++) {
MPI_Recv(CC, bcols, MPI_DOUBLE_PRECISION, MPI_ANY_SOURCE, MPI_ANY_TAG,
MPI_COMM_WORLD, &status);
int sender = status.MPI_SOURCE;
int anstype = status.MPI_TAG; //row number+1
int IND_I = 0;
while (IND_I < bcols){
c[anstype - 1][IND_I] = CC[IND_I];
IND_I++;
}
if (rowsent < arows) {
MPI_Send(a[rowsent], nsame,MPI_DOUBLE_PRECISION,sender,rowsent+1,MPI_COMM_WORLD);
delete[] a[rowsent];
rowsent++;
}
else { // tell sender no more work to do via a 0 TAG
MPI_Send(MPI_BOTTOM,0,MPI_DOUBLE_PRECISION,sender,0,MPI_COMM_WORLD);
}
}
}
// Slave part
else {
MPI_Bcast(b,nsame*bcols, MPI_DOUBLE_PRECISION, master, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
while(status.MPI_TAG != 0) {
int crow = status.MPI_TAG;
/* Compute C = A B */
cblas_dgemm (CblasRowMajor, CblasNoTrans, CblasNoTrans, 1, bcols, nsame, 1.0, buff, nsame, b, bcols,
0.0, CC, bcols);
MPI_Send(CC,bcols,MPI_DOUBLE_PRECISION, master, crow, MPI_COMM_WORLD);
MPI_Recv(buff,nsame,MPI_DOUBLE_PRECISION,master,MPI_ANY_TAG,MPI_COMM_WORLD,&status);
}
}
MPI_Finalize();
return 0;
};
I was thinking it might be due to me not deleting any of the new elements created, although I use essentially the same approach to initialize the arrays in both codes. I even tried deleting values in the MKL code (as shown) yet this appears to not have much of an effect. When I increase the size of the arrays from nsame = arows = bcols = 1000 to nsame = arows = bcols = 10000, the time differences in the two codes can readily be observed (the GSL code takes approximately 45 seconds while the MKL code takes quite a few minutes). Thus I am wondering if this is simply inherent to the way GSL and MKL are designed and incorporated in my code or if there is perhaps something else more subtle going on.

What is the easier way to split an array

I have one problem at that point when I tried to split an array into some subarrays.
To be more exactly I have an array, let's say int a[10]={1,3,2,7,8,12,5,7,68,10} and I'm running my program on X process (in this moment I'm using 8 but could be more or less).
And I want to sent to each process on part of this array, for example for my array in this moment each process will receive something like process0 = {1, 3}, process2 = {2, 7} and so on.. until process7 = 68, 10.
After I've send each subarray I will do some operations on each subarray and after I want to merge all my subarrays into one back.
I've search on google a lot and I saw some example using MPI_Send and MPI_Recv or MPI_Scatter and MPI_Gather and I've tried some methods but everything I've tried... was without success and I receive errors or
null pointer...
My Code:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define N 32
int A[N];
/*
 * Master/worker skeleton: the root fills A, sends each worker a contiguous
 * chunk of `count` elements, and receives the processed chunks back into the
 * same positions of A. The root does no processing itself, so at least two
 * processes are required. Any N % (size - 1) trailing elements are not
 * distributed and stay untouched in A.
 */
int main(int argc, char *argv[]) {
    int size;
    int rank;
    const int ROOT = 0;
    const int tag = 1;         /* root -> worker: chunk to process */
    const int result_tag = 2;  /* worker -> root: processed chunk  */

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* size - 1 workers share the array; guard the division below. */
    if (size < 2) {
        if (rank == ROOT) {
            fprintf(stderr, "This program needs at least 2 MPI processes\n");
        }
        MPI_Finalize();
        return 1;
    }

    int count = N / (size - 1);
    int *localArray = malloc(count * sizeof *localArray);
    if (count > 0 && localArray == NULL) {
        fprintf(stderr, "rank %d: out of memory\n", rank);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    if (rank == ROOT) {
        for (int i = 0; i < N; i++) {
            A[i] = rand() % 10;
        }
        for (int dest = 1; dest < size; ++dest) {
            MPI_Send(&A[(dest - 1) * count], count, MPI_INT, dest, tag, MPI_COMM_WORLD);
            printf("P0 sent a %d elements to P%d.\n", count, dest);
        }
        for (int source = 1; source < size; source++) {
            /* Store each worker's result back where its chunk came from. */
            MPI_Recv(&A[(source - 1) * count], count, MPI_INT, source, result_tag,
                     MPI_COMM_WORLD, MPI_STATUS_IGNORE);
            printf("Received results from task %d\n", source);
        }
        /* A now holds the processed chunks back to back; merge them here. */
    }
    else {
        MPI_Recv(localArray, count, MPI_INT, ROOT, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //---------------SORT THE localArray-------------------
        /* The tag must match the one the root receives with, otherwise both sides block. */
        MPI_Send(localArray, count, MPI_INT, ROOT, result_tag, MPI_COMM_WORLD);
    }

    free(localArray);
    MPI_Finalize();
    return 0;
}
What ever I've tried I can't get results where I've put the comment, what I'm doing wrong
While dreamcrash already suggested you could clean up your code using scatter & gather, I would put more emphasis on this. Use the built-in collective operations wherever possible. Do not try to rebuild them on your own. Not only is the code cleaner and easier to understand, it will also be significantly faster and allow all sorts of optimizations by the MPI implementation. Your example (assuming N is divisible by size) becomes:
// Only the root fills the global array with random digits.
if (rank == ROOT) {
for (int i = 0; i < N; i++) {
A[i] = rand() % 10;
}
}
// Hand every rank (the root included) `count` consecutive elements of A.
// NOTE(review): count is not defined in this fragment; since all `size` ranks
// receive a slice it presumably has to be N / size rather than the question's
// N / (size - 1) — confirm against the surrounding code.
MPI_Scatter(A, count, MPI_INT, localArray, count, MPI_INT, ROOT, MPI_COMM_WORLD);
//---------------SORT THE localArray-------------------
// Collect the slices back into A on the root, in rank order.
MPI_Gather(localArray, count, MPI_INT, A, count, MPI_INT, ROOT, MPI_COMM_WORLD);
MPI_Finalize();
Note that the ROOT rank correctly participates in the computation and does send data to itself using scatter / gather without any additional code path.
Now since your example explicitly uses N=10, which is not divisible by size=8, here is a version that works correctly. The idea is to distribute the remainder of the integer division evenly across the first remainder ranks (each gets one additional element to work on). You have to do that regardless of whether you use send/recv or scatter/gather. With scatter/gather you use the MPI_Scatterv / MPI_Gatherv variants, which take an array of sendcounts (how many elements each rank gets) and displacements (the offset of each local part within the global one):
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define N 32
int A[N];
/*
 * Distributes the N elements of A over all ranks with MPI_Scatterv and
 * collects them again with MPI_Gatherv. When N is not a multiple of the
 * number of ranks, the lowest-numbered ranks each take one extra element.
 */
int main(int argc, char *argv[]) {
    int size;
    int rank;
    const int ROOT = 0;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Per-rank element counts and their starting offsets within A. */
    const int base = N / size;
    const int extra = N % size;
    int local_counts[size], offsets[size];
    int next_offset = 0;
    for (int r = 0; r < size; ++r) {
        local_counts[r] = base + ((r < extra) ? 1 : 0);
        offsets[r] = next_offset;
        next_offset += local_counts[r];
    }

    const int my_count = local_counts[rank];
    int localArray[my_count];

    if (rank == ROOT) {
        int i = 0;
        while (i < N) {
            A[i] = rand() % 10;
            ++i;
        }
    }

    MPI_Scatterv(A, local_counts, offsets, MPI_INT, localArray, my_count, MPI_INT, ROOT, MPI_COMM_WORLD);
    //---------------SORT THE localArray-------------------
    MPI_Gatherv(localArray, my_count, MPI_INT, A, local_counts, offsets, MPI_INT, ROOT, MPI_COMM_WORLD);

    MPI_Finalize();
    return 0;
}
Change your code for something like this:
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#define N 32
int A[N]; // this should be global
/*
 * Send/recv version in which the root also works on a chunk: A is split into
 * `size` chunks of `count` elements, chunk r goes to rank r, and every
 * processed chunk is written back to its original position in A on the root.
 * Only correct when N % size == 0; trailing elements are otherwise left
 * untouched.
 */
int main(int argc, char *argv[]) {
    int size;
    int rank;
    const int ROOT = 0;
    int tag = 1234;            /* root -> worker: chunk to process */
    const int result_tag = 2;  /* worker -> root: processed chunk  */

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int count = N / size;
    int *localArray = malloc(count * sizeof *localArray);
    if (count > 0 && localArray == NULL) {
        fprintf(stderr, "rank %d: out of memory\n", rank);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    if (rank == ROOT) {
        for (int i = 0; i < N; i++) {
            A[i] = rand() % 10;
        }
        // master local copy
        for (int i = 0; i < count; i++)
            localArray[i] = A[i];
        for (int dest = 1; dest < size; ++dest) {
            MPI_Send(&A[dest * count], count, MPI_INT, dest, tag, MPI_COMM_WORLD);
            printf("P0 sent a %d elements to P%d.\n", count, dest);
        }

        //.. do something with the master's own chunk in localArray

        /* Put the master's processed chunk back at the front of A. */
        for (int i = 0; i < count; i++)
            A[i] = localArray[i];

        for (int source = 1; source < size; source++)
        {
            /*
             * Receive each worker's chunk into its slot in A. Receiving into
             * localArray would overwrite the master's chunk and keep only the
             * last worker's data.
             */
            MPI_Recv(&A[source * count], count, MPI_INT, source, result_tag, MPI_COMM_WORLD,
                     MPI_STATUS_IGNORE);
            printf("Received results from task %d\n", source);
        }
        /* A now holds all processed chunks in rank order. */
    }
    else
    {
        MPI_Recv(localArray, count, MPI_INT, ROOT, tag, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        //.. do something
        MPI_Send(localArray, count, MPI_INT, ROOT, result_tag, MPI_COMM_WORLD);
    }

    free(localArray);
    MPI_Finalize();
    return 0;
}
Some mistakes:
Array A is global, therefore all processes will have it; you most
likely want to allocate it only on the master process;
I changed N / (size - 1) to N / size; however, be aware that this
only works when N % size == 0, so you might want to handle the opposite
case as well.
Since the master will have a sub-copy of the global array, I am performing this local copy from A to local array before sending the data to the slaves :
// master local copy
for (int i = 0; i < count; i++)
localArray[i] = A[i];
You have a small mistake on the merging part, the master and the slaves are using different tags, that was causing a deadlock. That is why I also changed this:
MPI_Send(localArray, count, MPI_INT, ROOT, tag, MPI_COMM_WORLD);
to
MPI_Send(localArray, count, MPI_INT, ROOT, 2, MPI_COMM_WORLD);
Both have now the same tag (2);
You could implement this code with scatter and gather and it would be a lot cleaner see here some examples.
Another minor issue: if you are using the C language, then instead of int *localArray = (int *) malloc(count * sizeof(int)); you should write int *localArray = malloc(count * sizeof(int)); see here why.