Matrix-vector multiplication with MPI - error when compiling C++ code

I need help resolving an error in the following code:
#include <iostream>
#include <mpi.h>
using namespace std;
//matrix in two dimension in memory!!
int main(int argc, char** argv)
{
const int WIDTH = 100;
const int HEIGHT = 100;
int id, P;
double tempValue = 0;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &P);
MPI_Comm_rank(MPI_COMM_WORLD, &id);
double A[WIDTH][HEIGHT];
double x[HEIGHT], b[WIDTH];
int upperBound, lowerBound = 0;
// Master controls worksharing..
if (id == 0)
{
// Init A & x
for (int i = 0; i < WIDTH; i++)
for (int j = 0; j < HEIGHT; j++)
A[i][j] = 1;
for (int j = 0; j < HEIGHT; j++)
x[j] = 2;
// Send to each node its portion of A to be processed
int portionSize = WIDTH / P;
for (int i = 0; i < P; i++)
{
lowerBound = i * portionSize;
upperBound = (i + 1) * portionSize;
// let the last node process the remainder
if (i == (P - 1))
upperBound += (HEIGHT - portionSize * P);
if (i > 0)// Do not send to master node!!
{
// Send to node i the lower & upper bounds the A portion
//and complete vector x
MPI_Send(&lowerBound, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
MPI_Send(&upperBound, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
MPI_Send(&A[lowerBound][0], (upperBound - lowerBound) * HEIGHT,
MPI_DOUBLE, i, 0, MPI_COMM_WORLD);
MPI_Send(&x[0], HEIGHT, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);
}
}
// master perform part of the job...
for (int i = 0; i < portionSize; i++)
{
tempValue = 0;
for (int j = 0; j < HEIGHT; j++)
tempValue += A[i][j] * x[j];
b[i] = tempValue;
}
//Get the results in order, each node would send their boundaries and data part
for (int i = 1; i < P; i++)
{
MPI_Recv(&lowerBound, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&upperBound, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&P[lowerBound], (upperBound - lowerBound), MPI_DOUBLE, i, 0,
MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
}
// Print the first 2 values to check..
cout << "b[0]=" << b[0] << " b[Width-1]=" << b[WIDTH - 1] << endl;
}
else // the rest of the workers do their parts
{
//Receive the inputs
MPI_Recv(&lowerBound, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&upperBound, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&A[lowerBound][0], (upperBound - lowerBound) * WIDTH, MPI_DOUBLE, 0, 0,
MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
MPI_Recv(&x, HEIGHT, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
cout << "Node:" << id << " Received from:" << lowerBound << " to " << upperBound - 1
<< endl;
double* result = new double[upperBound - lowerBound];
//Do the job
for (int i = lowerBound, resultCounter = 0; i < upperBound; i++, resultCounter++)
{
tempValue = 0;
for (int j = 0; j < HEIGHT; j++)
tempValue += A[i][j] * x[j];
result[resultCounter] = tempValue;
}
//send the results
MPI_Send(&lowerBound, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
MPI_Send(&upperBound, 1, MPI_INT, 0, 0, MPI_COMM_WORLD);
MPI_Send(&result[0], upperBound - lowerBound, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
delete[] result;
}
MPI_Finalize();
return 0;
}
When I compile the code in Microsoft Visual Studio 2019, I get these error messages:
Error (active) E0142 expression must have pointer-to-object type ConsoleApplication9 C:\Users\m_swe\Desktop\Assignments\Assignments\PrjP2P\MatMPI\MatMPI\Source.cpp 59
Error C2109 subscript requires array or pointer type ConsoleApplication9 C:\Users\m_swe\Desktop\Assignments\Assignments\PrjP2P\MatMPI\MatMPI\Source.cpp 59

I think the problem is on line 59:
MPI_Recv(&P[lowerBound], (upperBound - lowerBound), MPI_DOUBLE, i, 0,
MPI_Recv takes a pointer to a buffer (its first argument) into which the incoming data is received and stored. The compile error comes from P[lowerBound]: P is the int holding the number of processes, so it cannot be indexed like an array. You need to receive into an actual array, for example a buffer declared before the loop:
double receivedValues[WIDTH * HEIGHT];
for (int i = 1; i < P; i++)
{
    MPI_Recv(&lowerBound, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    MPI_Recv(&upperBound, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    MPI_Recv(&receivedValues[0], (upperBound - lowerBound), MPI_DOUBLE, i, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    // do your computation here with receivedValues
}
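Since in this program the received slices ultimately belong in b, an alternative sketch (using the same variables as the question's code) is to receive each worker's part directly into b at the offset given by lowerBound, which also removes the need for a separate buffer:
// receive each worker's portion straight into b (sketch based on the question's variables)
for (int i = 1; i < P; i++)
{
    MPI_Recv(&lowerBound, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    MPI_Recv(&upperBound, 1, MPI_INT, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    MPI_Recv(&b[lowerBound], upperBound - lowerBound, MPI_DOUBLE, i, 0,
             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
}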

Related

Problem with MPI_Gather: couldn't gather 8 arrays of 32 elements into a big array of 256 elements

I am new to MPI. I have an array of 256 integers and I want to divide each number by 16, so I scatter 32 elements to each processor, but I couldn't gather them back, as each return value is an array of 32 elements.
int globalhistogram[256];
float globalProb[256];
float* localprob = new float[32];
int localpixel[32];
MPI_Scatter(&globalhistogram, 32, MPI_INT, localpixel, 32, MPI_INT, 0, MPI_COMM_WORLD);
for (int i = 0; i < 32; i++)
{
localprob[i] = (float)localpixel[i] / 16;
}
MPI_Gather(localprob, 32, MPI_FLOAT, &globalprob, 32, MPI_FLOAT, 0, MPI_COMM_WORLD);
I don't understand the issue - the code appears to run correctly after I correct what I assume is a typo, float globalProb[256] -> float globalprob[256].
I agree with @victor-eijkhout about the &globalprob issue, but it doesn't appear to make a difference.
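That is expected: for an array, globalprob and &globalprob evaluate to the same address (they only differ in type), and MPI_Gather takes a void* receive buffer, so both forms land in the same memory. The cleaner call would simply be:
MPI_Gather(localprob, 32, MPI_FLOAT, globalprob, 32, MPI_FLOAT, 0, MPI_COMM_WORLD);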
If I compile and run the appended code I get the expected answer:
dsh#laptop$ mpicxx -o play play.cpp
dsh#laptop$ mpirun -n 8 ./play
rank 0: globalprob[0] = 0.000000
...
rank 0: globalprob[31] = 31.000000
rank 0: globalprob[32] = 64.000000
...
rank 0: globalprob[255] = 2040.000000
Here's the full code:
#include <stdio.h>
#include <mpi.h>
int main(void)
{
int rank, size, i;
MPI_Init(NULL, NULL);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int globalhistogram[256];
float globalprob[256];
float* localprob = new float[32];
int localpixel[32];
for (i=0; i < 256; i++)
{
globalhistogram[i] = i;
}
MPI_Scatter(&globalhistogram, 32, MPI_INT, localpixel, 32, MPI_INT, 0, MPI_COMM_WORLD);
for (int i = 0; i < 32; i++)
{
localprob[i] = (float)localpixel[i] *(rank+1);
}
MPI_Gather(localprob, 32, MPI_FLOAT, &globalprob, 32, MPI_FLOAT, 0, MPI_COMM_WORLD);
if (rank == 0)
{
for (i=0; i < 256; i++)
{
printf("rank %d: globalprob[%d] = %f\n", rank, i, globalprob[i]);
}
}
MPI_Finalize();
}

MPI Point to Point Communication to Collective Communication

I am learning MPI and I am trying to convert my MPI program from point-to-point communication to MPI collectives...
Below is a fragment of my code for Matrix Multiplication using MPI Point to Point communication ...
int i;
if(rank == 0) {
for(i = 1; i < size; i++){
MPI_Send(&rows, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
MPI_Send(&columns, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
}
} else {
MPI_Recv(&rows, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(&columns, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
}
int local_block_size = rows / size;
int process, column_pivot;
if(rank == 0) {
for(i = 1; i < size; i++){
MPI_Send((matrix_1D_mapped + (i * (local_block_size * rows))), (local_block_size * rows), MPI_DOUBLE, i, 0, MPI_COMM_WORLD);
MPI_Send((rhs + (i * local_block_size)), local_block_size, MPI_DOUBLE, i, 0, MPI_COMM_WORLD);
}
for(i = 0; i < local_block_size * rows; i++){
matrix_local_block[i] = matrix_1D_mapped[i];
}
for(i = 0; i < local_block_size; i++){
rhs_local_block[i] = rhs[i];
}
} else {
MPI_Recv(matrix_local_block, local_block_size * rows, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
MPI_Recv(rhs_local_block, local_block_size, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &status);
}
I am thinking about replacing MPI_Send with MPI_Bcast... would that be the correct approach?
For the first communication, the data sent to all receivers is in fact identical, so MPI_Bcast is the correct approach. The second communication distributes different chunks of a larger array to the recipients; that is done as a collective with MPI_Scatter. Note that scatter includes the root rank in the communication, so you can omit the manual local copy.
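For illustration, a sketch of the collective version, reusing the variable names from the fragment above (and assuming, as the original code does, that rows divides evenly by size):
// every rank gets the shared dimensions
MPI_Bcast(&rows, 1, MPI_INT, 0, MPI_COMM_WORLD);
MPI_Bcast(&columns, 1, MPI_INT, 0, MPI_COMM_WORLD);

int local_block_size = rows / size;

// distribute the chunks; the root keeps chunk 0 for itself,
// so the manual copies into matrix_local_block and rhs_local_block go away
MPI_Scatter(matrix_1D_mapped, local_block_size * rows, MPI_DOUBLE,
            matrix_local_block, local_block_size * rows, MPI_DOUBLE,
            0, MPI_COMM_WORLD);
MPI_Scatter(rhs, local_block_size, MPI_DOUBLE,
            rhs_local_block, local_block_size, MPI_DOUBLE,
            0, MPI_COMM_WORLD);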

MPI_Gatherv hangs when recomposing a matrix

I have to decompose and recompose a matrix in MPI (I'm using MPICH), and I'm using Scatterv and Gatherv as in the example from this question. Everything works well for small matrices, but when the matrix size increases (starting from 800x800), the program hangs when it reaches MPI_Gatherv. By printing debug messages, I can see that every process passes the call to Gatherv, except the one with rank 0 (the root process in the Gatherv call).
Any suggestion? Here's the code:
#include <iostream>
#include <cstring>
#include <fstream>
#include <cstdlib>
#include "mpi.h"
using namespace std;
#define TOP_ROW_TAG 1
#define BOTTOM_ROW_TAG 2
#define LEFT_COL_TAG 3
#define RIGHT_COL_TAG 4
int main(int argc, char ** argv) {
int me, nproc, width, height, wloc, hloc;
double k, d,c, wdouble, hdouble, discr, delta_t, t;
char* initial, end;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &me);
MPI_Comm_size(MPI_COMM_WORLD, &nproc);
MPI_Comm cart_top;
wdouble = atof(argv[1]);
hdouble = atof(argv[2]);
discr = atof(argv[3]);
k = atof(argv[4]);
d = atof(argv[5]);
c = atof(argv[6]);
delta_t = atof(argv[7]);
t = atof(argv[8]);
initial = argv[9];
end = argv[10];
double p = k/(d*c);
double dsc = delta_t/(discr*discr);
width = wdouble / discr;
height = hdouble / discr;
const int NPROWS=4; /* number of rows in _decomposition_ */
const int NPCOLS=4; /* number of cols in _decomposition_ */
const int BLOCKROWS = width/NPROWS; /* number of rows in _block_ */
const int BLOCKCOLS = height/NPCOLS;
const int dims[2] = {NPROWS, NPCOLS};
const int periods[2] = {0,0};
int* mycoords = new int[2];
int locsz = (width*height)/nproc;
double* T, *Tnew, *local, *locnew;
local = new double[BLOCKROWS*BLOCKCOLS];
locnew = new double[BLOCKROWS*BLOCKCOLS];
T = new double[width * height];
Tnew = new double[width * height];
ifstream infile;
infile.open(initial);
if(me==0) {
cout<<"BLOCKROWS: "<<BLOCKROWS;
cout<<"BLOCKCOLS: "<<BLOCKCOLS<<endl;
cout<<"width: "<<width;
cout<<"height: "<<height<<endl;
int idx, jdx, temp;
for (int i=0; i<width*height; i++) {
string currline;
getline(infile, currline);
idx = atoi(strtok(currline.c_str(), " "));
jdx = atoi(strtok(NULL, " "));
temp = atof(strtok(NULL, " "));
T[idx*height+jdx] = temp;
infile.close();
}
MPI_Datatype blocktype;
MPI_Datatype blocktype2;
MPI_Datatype coltype, coltype2;
MPI_Type_vector(BLOCKROWS, 1, BLOCKCOLS, MPI_DOUBLE, &coltype);
MPI_Type_create_resized( coltype, 0, sizeof(double), &coltype2);
MPI_Type_commit(&coltype2);
MPI_Type_vector(BLOCKROWS, BLOCKCOLS, height, MPI_DOUBLE, &blocktype2);
MPI_Type_create_resized( blocktype2, 0, sizeof(double), &blocktype);
MPI_Type_commit(&blocktype);
int disps[NPROWS*NPCOLS];
int counts[NPROWS*NPCOLS];
for (int ii=0; ii<NPROWS; ii++) {
for (int jj=0; jj<NPCOLS; jj++) {
disps[ii*NPCOLS+jj] = ii*height*BLOCKROWS+jj*BLOCKCOLS;
counts [ii*NPCOLS+jj] = 1;
}
}
int myrank, lb_i, lb_j, ub_i, ub_j;
lb_i=0;
lb_j=0;
ub_i=BLOCKROWS;
ub_j=BLOCKCOLS;
/*
0= left neighbor;
1= right neighbor;
2=top neighbor;
3=bottom neighbor;
*/
int neighs[4] = {};
double* leftcol, *rightcol, *myleftcol, *myrightcol, *toprow, *bottomrow;
leftcol = new double[BLOCKROWS];
rightcol= new double[BLOCKROWS];
myleftcol = new double[BLOCKROWS];
myrightcol= new double[BLOCKROWS];
toprow = new double[BLOCKCOLS];
bottomrow = new double[BLOCKCOLS];
//Create topology and get neighbor's rank
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart_top);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Comm_rank(cart_top, &myrank);
MPI_Cart_shift(cart_top, 0, -1, &myrank, &neighs[0]);
MPI_Cart_shift(cart_top, 0, 1, &myrank, &neighs[1]);
MPI_Cart_shift(cart_top, 1, 1, &myrank, &neighs[2]);
MPI_Cart_shift(cart_top, 1, -1, &myrank, &neighs[3]);
MPI_Scatterv(T, counts, disps, blocktype, local, BLOCKROWS*BLOCKCOLS,
MPI_DOUBLE, 0, cart_top);
double curr_t=0;
for(double curr_t = 0; curr_t < t; curr_t+=delta_t) {
MPI_Barrier(cart_top);
//Send border columns to neighbors
if(neighs[2] != MPI_PROC_NULL) {
MPI_Send(&local[BLOCKCOLS-1], 1, coltype2, neighs[2], LEFT_COL_TAG+(int)(curr_t*1000), cart_top);
}
if(neighs[3] != MPI_PROC_NULL) {
MPI_Send(local, 1, coltype2, neighs[3], RIGHT_COL_TAG+(int)(curr_t*1000), cart_top);
}
if(neighs[0] != MPI_PROC_NULL) {
MPI_Send(local, BLOCKCOLS, MPI_DOUBLE, neighs[0], TOP_ROW_TAG+(int)(curr_t*1000), cart_top);
}
if(neighs[1] != MPI_PROC_NULL) {
MPI_Send(&local[(BLOCKROWS-1)*BLOCKCOLS], BLOCKCOLS, MPI_DOUBLE, neighs[1], BOTTOM_ROW_TAG+(int)(curr_t*1000), cart_top);
}
if(neighs[3] != MPI_PROC_NULL) {
MPI_Recv(leftcol, BLOCKROWS, MPI_DOUBLE, neighs[3], LEFT_COL_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
}
if(neighs[2] != MPI_PROC_NULL) {
MPI_Recv(rightcol, BLOCKROWS, MPI_DOUBLE, neighs[2], RIGHT_COL_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
}
if(neighs[1] != MPI_PROC_NULL) {
MPI_Recv(bottomrow, BLOCKCOLS, MPI_DOUBLE, neighs[1], TOP_ROW_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
}
if(neighs[0] != MPI_PROC_NULL) {
MPI_Recv(toprow, BLOCKCOLS, MPI_DOUBLE, neighs[0], BOTTOM_ROW_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
}
MPI_Barrier(cart_top);
double* aux;
//cout<<" t in process "<<me<<" is " <<t<<endl;
int i, j;
MPI_Comm_rank(cart_top, &myrank);
MPI_Barrier(cart_top);
for(i=lb_i; i<ub_i; i++) {
for(j=lb_j; j<ub_j; j++) {
double curr,c1,c2,c3,c4;
curr = local[i*BLOCKCOLS+j];
c1 = i==0 ? toprow[j] : local[(i-1)*BLOCKCOLS+j];
c2 = i==BLOCKROWS-1 ? bottomrow[j] : local[(i+1)*BLOCKCOLS+j];
c3 = j==0 ? leftcol[i] : local[i*BLOCKCOLS+(j-1)];
c4 = j==BLOCKCOLS-1 ? rightcol[i] : local[i*BLOCKCOLS+(j+1)];
locnew[i*BLOCKCOLS+j] = curr*(1-4*dsc*p) + dsc*p*(c1+c2+c3+c4);
/*if(i==0) locnew[i*BLOCKCOLS+j] = toprow[j];
else if(i==BLOCKROWS-1) locnew[i*BLOCKCOLS+j] = bottomrow[j];
if(j==0) locnew[i*BLOCKCOLS+j] = leftcol[i];
else if(j==BLOCKCOLS-1) locnew[i*BLOCKCOLS+j] = rightcol[i];
if(i!=0 && i!=BLOCKROWS-1 && j!=0 && j!=BLOCKCOLS-1) locnew[i*BLOCKCOLS+j] = local[i*BLOCKCOLS+j];*/
/*if(i==0) locnew[i*BLOCKCOLS+j] = (double)5000;
else if(i==BLOCKROWS-1) locnew[i*BLOCKCOLS+j] = (double)5000;
if(j==0) locnew[i*BLOCKCOLS+j] = (double)5000;
else if(j==BLOCKCOLS-1) locnew[i*BLOCKCOLS+j] = (double)5000;
if(i!=0 && i!=BLOCKROWS-1 && j!=0 && j!=BLOCKCOLS-1) locnew[i*BLOCKCOLS+j] = local[i*BLOCKCOLS+j];*/
}
}
aux = local;
local = locnew;
locnew = aux;
MPI_Barrier(cart_top);
/* aux = T;
T=Tnew;
Tnew = aux;*/
}
MPI_Gatherv(local, BLOCKROWS*BLOCKCOLS, MPI_DOUBLE, Tnew, counts, disps, blocktype, 0,cart_top);
if(me == 0) {
ofstream outfile;
outfile.open(argv[10]);
for(int i=0; i<width; i++) {
for(int j=0; j<height; j++) {
outfile<< i<<" " <<j<<" "<<Tnew[i*height+j]<<endl;
}
}
outfile.close();
}
MPI_Finalize();
}

MPI_Reduce with MPI_SUM is not working

I am simply trying to sum the variables train_hr and test_hr across all 10 processors and store and print the sums on processor 0. I checked that the individual sums are NOT 0 (they are not; they are all in the 1000s), yet the reported sum is always 0, and I have no idea why. I have looked at many examples of this and have done it exactly as instructed. Any help would be appreciated.
double train_hr = 0, test_hr = 0;
double train_hr_global = 0, test_hr_global = 0;
//Master processor
if (my_rank == 0) {
// sends a task to each processor
int curr_task = 0;
for(i = 0; i < num_procs; i++) {
if (curr_task < nsamples_all) {
MPI_Send(&curr_task, 1, MPI_INT, i, 1, MPI_COMM_WORLD);
curr_task++;
}
}
int r;
MPI_Status status;
//keeps sending tasks to processors until there are no more tasks
while (curr_task < nsamples_all) {
MPI_Recv(&r, 1, MPI_INT, MPI_ANY_SOURCE, 1, MPI_COMM_WORLD, &status);
MPI_Send(&curr_task, 1, MPI_INT, status.MPI_SOURCE, 1, MPI_COMM_WORLD);
curr_task++;
}
//tell all processors to stop receiving
int a = -1;
for (i = 0; i < num_procs; i++) {
MPI_Send(&a, 1, MPI_INT, i, 1, MPI_COMM_WORLD);
}
}
//Helper processors
else {
int stop = 1;
while(stop != 0){
int i;
//Receives task OR stop alert from master
MPI_Status status;
MPI_Recv(&i, 1, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
if (i == -1) {
stop = 0;
}
//computations
else{
float r;
//unimportant computations here
train_hr += r;
test_hr += r;
//Tells master processor it is done
MPI_Send(&i, 1, MPI_INT, 0, 1, MPI_COMM_WORLD);
}
}
}
//At this point I checked the current values of train_hr and test_hr on each helper processor. They are all non-zero.
MPI_Reduce(&train_hr, &train_hr_global, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(&test_hr, &test_hr_global, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);
//at this point, the vales of train_hr_global and test_hr_global on the master processor (processor 0) are 0 when they should be the sum of all the processors values.
}
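A likely culprit, given the declarations in this fragment: train_hr and test_hr are doubles, but the MPI_Reduce calls pass MPI_INT. The datatype argument must match the type of the send and receive buffers, so the reductions would need to look like this:
MPI_Reduce(&train_hr, &train_hr_global, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
MPI_Reduce(&test_hr, &test_hr_global, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);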

MPI - no speedup with an increasing number of processes

I'm writing a program that tests whether numbers are prime. At the beginning I calculate how many numbers to assign to each process, then send that count to the processes. Next, the calculations are performed and the data is sent back to process 0, which saves the results. The code below works, but when I increase the number of processes my program doesn't speed up. It seems to me that my program doesn't actually work in parallel. What's wrong? This is my first MPI program, so any advice is welcome.
I use MPICH2 and I test my program on an Intel Core i7-950.
main.cpp:
if (rank == 0) {
int workers = (size-1);
readFromFile(path);
int elements_per_proc = (N + (workers-1)) / workers;
int rest = N % elements_per_proc;
for (int i=1; i <= workers; i++) {
if((i == workers) && (rest != 0))
MPI_Send(&rest, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
else
MPI_Send(&elements_per_proc, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
}
int it = 1;
for (int i=0; i < N; i++) {
if((i != 0) && ((i % elements_per_proc) == 0))
it++;
MPI_Isend(&input[i], 1, MPI_INT, it, 0, MPI_COMM_WORLD, &send_request);
}
}
if (rank != 0) {
int count;
MPI_Recv(&count, 1, MPI_INT, 0, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
for (int j=0; j < count; j++) {
MPI_Recv(&number, 1, MPI_INT, 0, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
result = test(number, k);
send_array[0] = number;
send_array[1] = result;
MPI_Send(send_array, 2, MPI_INT, 0, 0, MPI_COMM_WORLD);
}
}
if (rank == 0) {
for (int i=0; i < N; i++) {
MPI_Recv(rec_array, 2, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
// save results
}
}
Your implementation probably doesn't scale well to many processes, since you communicate in every step. You currently communicate the numbers and results for each single input, which incurs a large latency overhead. Instead, you should think about communicating the input in bulk (i.e., using a single message).
Using MPI collective operations (MPI_Scatter/MPI_Gather) instead of loops of MPI_Send/MPI_Recv might increase your performance further.
Additionally, you can utilize the master process to work on a chunk of the input as well.
A much more scalable implementation might then look as follows:
// tell everybody how many elements there are in total
MPI_Bcast(&N, 1, MPI_INT, 0, MPI_COMM_WORLD);
// everybody determines how many elements it will work on
// (include the master process)
int num_local_elements = N / size + (rank < N % size ? 1 : 0);
// allocate local size
int* local_input = (int*) malloc(sizeof(int)*num_local_elements);
// distribute the input from master to everybody using MPI_Scatterv
int* counts; int* displs;
if (rank == 0) {
counts = (int*)malloc(sizeof(int) * size);
displs = (int*)malloc(sizeof(int) * size);
for (int i = 0; i < size; i++) {
counts[i] = N / size + (i < N % size ? 1 : 0);
// displs[0] is 0; every later offset follows from the previous count
displs[i] = (i > 0) ? (displs[i-1] + counts[i-1]) : 0;
}
// scatter from master
MPI_Scatterv(input, counts, displs, MPI_INT, local_input, num_local_elements, MPI_INT, 0, MPI_COMM_WORLD);
} else {
// receive scattered numbers
MPI_Scatterv(NULL, NULL, NULL, MPI_DATATYPE_NULL, local_input, num_local_elements, MPI_INT, 0, MPI_COMM_WORLD);
}
// perform prime testing
int* local_results = (int*) malloc(sizeof(int)*num_local_elements);
for (int i = 0; i < num_local_elements; ++i) {
local_results[i] = test(local_input[i], k);
}
// gather results back to master process
int* results;
if (rank == 0) {
results = (int*)malloc(sizeof(int)*N);
MPI_Gatherv(local_results, num_local_elements, MPI_INT, results, counts, displs, MPI_INT, 0, MPI_COMM_WORLD);
// TODO: save results on master process
} else {
MPI_Gatherv(local_results, num_local_elements, MPI_INT, NULL, NULL, NULL, MPI_INT, 0, MPI_COMM_WORLD);
}
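Only the root's counts and displs arguments are significant to MPI_Scatterv and MPI_Gatherv, which is why the other ranks can simply pass NULL in the else branches. The program is built and launched like any other MPI program (the file name here is just a placeholder):
mpicxx -o primes main.cpp
mpirun -n 8 ./primes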