Here is my entire code first (the code is correct):
#include <iostream>
#include <mpi.h>
using namespace std;
int main(int argc, char**argv)
{
int mynode, totalnodes;
int sum,starval,endval,accum;
MPI_Status status;
MPI_Init (&argc,&argv);
MPI_Comm_size (MPI_COMM_WORLD, &totalnodes); //get totalnodes
MPI_Comm_rank (MPI_COMM_WORLD, &mynode); //get mynode
sum=0; //zero sum for accumulation
starval = 1000*mynode / totalnodes+1;
endval = 1000*(mynode+1) / totalnodes;
for (int i=starval; i <= endval; i=i+1)
sum = sum + i;
if(mynode!=0)
MPI_Send(&sum,1,MPI_INT,0,7,MPI_COMM_WORLD);
else
for(int j=1; j < totalnodes; j=j+1){
MPI_Recv(&accum,1,MPI_INT,j,7,MPI_COMM_WORLD, &status);
sum=sum+accum;
}
if(mynode ==0)
cout << "the sum from 1 to 1000 is: " << sum << endl;
MPI_Finalize();
}
We are trying to split the accumulation work across the different processors: each processor (other than processor 0) computes its partial sum and sends the result to processor 0. Processor 0 then accumulates all the outcomes received from the other processors and outputs the sum from 1 to 1000 (as you can see in the code).
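For concreteness, here is a small stand-alone sketch (not part of the program above) that simply prints the sub-range each rank would sum, assuming totalnodes = 4:
#include <iostream>

int main() {
    const int totalnodes = 4;                              // assumed process count
    for (int mynode = 0; mynode < totalnodes; ++mynode) {
        int starval = 1000 * mynode / totalnodes + 1;      // 1, 251, 501, 751
        int endval  = 1000 * (mynode + 1) / totalnodes;    // 250, 500, 750, 1000
        std::cout << "rank " << mynode << " sums " << starval << ".." << endval << "\n";
    }
    return 0;
}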
Now, my question is: what is the "accum" in the statement
sum = sum + accum;
There is no declaration of "accum". How does the operating system know what it is?
Please briefly explain the logic of
else
for(int j=1; j < totalnodes; j=j+1){
MPI_Recv(&accum,1,MPI_INT,j,7,MPI_COMM_WORLD, &status);
sum=sum+accum;}
And what does accum do in this code?
Thank you!
I am trying to write a C++ program using MPI, in which each rank will send a matrix to rank 0. When the matrix size is relatively small, the code works perfectly. However, when the matrix size becomes big, the code starts to give a strange error that only happens when I use a specific number of CPUs.
If you feel the full code is too long, please go directly to the minimal example below.
To avoid overlooking any part, I give the full source code here:
#include <iostream>
#include <mpi.h>
#include <cmath>
int world_size;
int world_rank;
MPI_Comm comm;
int m, m_small, m_small2;
int index(int row, int column)
{
return m * row + column;
}
int index3(int row, int column)
{
return m_small2 * row + column;
}
int main(int argc, char **argv) {
MPI_Init(&argc, &argv);
MPI_Status status;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
m = atoi(argv[1]); //Size
int ndims = 2;
int *dims = new int[ndims];
int *period = new int[ndims];
int *coords = new int[ndims];
for (int i=0; i<ndims; i++) dims[i] = 0;
for (int i=0; i<ndims; i++) period[i] = 0;
for (int i=0; i<ndims; i++) coords[i] = 0;
MPI_Dims_create(world_size, ndims, dims);
MPI_Cart_create(MPI_COMM_WORLD, ndims, dims, period, 0, &comm);
MPI_Cart_coords(comm, world_rank, ndims, coords);
double *a, *a_2;
if (0 == world_rank) {
a = new double [m*m];
for (int i=0; i<m; i++) {
for (int j=0; j<m; j++) {
a[index(i,j)] = 0;
}
}
}
/*m_small is along the vertical direction, m_small2 is along the horizontal direction*/
//The upper cells take the remainder of the total number of lattice points along the vertical direction divided by the number of cells along that direction
if (0 == coords[0]){
m_small = int(m / dims[0]) + m % dims[0];
}
else m_small = int(m / dims[0]);
//The left cells take the remainder of the total number of lattice points along the horizontal direction divided by the number of cells along that direction
if (0 == coords[1]) {
m_small2 = int(m / dims[1]) + m % dims[1];
}
else m_small2 = int(m / dims[1]);
double *a_small = new double [m_small * m_small2];
/*Initialization of matrix*/
for (int i=0; i<m_small; i++) {
for (int j=0; j<m_small2; j++) {
a_small[index3(i,j)] = 2.5 ;
}
}
if (0 == world_rank) {
a_2 = new double[m_small*m_small2];
for (int i=0; i<m_small; i++) {
for (int j=0; j<m_small2; j++) {
a_2[index3(i,j)] = 0;
}
}
}
int loc[2];
int m1_rec, m2_rec;
MPI_Request send_req;
MPI_Isend(coords, 2, MPI_INT, 0, 1, MPI_COMM_WORLD, &send_req);
//This Isend may have a problem!
MPI_Isend(a_small, m_small*m_small2, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &send_req);
if (0 == world_rank) {
for (int i = 0; i < world_size; i++) {
MPI_Recv(loc, 2, MPI_INT, i, 1, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
/*Determine the size of matrix for receiving the information*/
if (0 == loc[0]) {
m1_rec = int(m / dims[0]) + m % dims[0];
} else {
m1_rec = int(m / dims[0]);
}
if (0 == loc[1]) {
m2_rec = int(m / dims[1]) + m % dims[1];
} else {
m2_rec = int(m / dims[1]);
}
//This receive may have a problem!
MPI_Recv(a_2, m1_rec * m2_rec, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
}
}
delete[] a_small;
if (0 == world_rank) {
delete[] a;
delete[] a_2;
}
delete[] dims;
delete[] period;
delete[] coords;
MPI_Finalize();
return 0;
}
Basically, the code reads an input value m and then constructs a big m x m matrix. MPI creates a 2D topology according to the number of CPUs, which divides the big matrix into sub-matrices. The size of each sub-matrix is m_small x m_small2. There should be no problem in these steps.
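For concreteness, here is a small stand-alone sketch of that size arithmetic for the failing case reported below (m = 183, 2 processes); it assumes MPI_Dims_create returns dims = {2, 1}, which is the usual result for 2 processes in 2 dimensions:
#include <iostream>

// Worked example of the block sizes for m = 183, dims = {2, 1}:
// the two sub-matrices are 92 x 183 and 91 x 183 doubles
// (91 * 183 = 16653 doubles = 133224 bytes, the size in the error message below).
int main() {
    int m = 183;
    int dims[2] = {2, 1};
    for (int c0 = 0; c0 < dims[0]; ++c0) {
        int m_small  = (c0 == 0) ? m / dims[0] + m % dims[0] : m / dims[0];
        int m_small2 = m / dims[1] + m % dims[1];   // dims[1] == 1, so always 183
        std::cout << "row block " << c0 << ": " << m_small << " x " << m_small2
                  << " = " << m_small * m_small2 << " doubles\n";
    }
    return 0;
}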
The problem happens when I send the sub-matrix from each rank to rank 0 using MPI_Isend(a_small, m_small*m_small2, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &send_req); and MPI_Recv(a_2, m1_rec * m2_rec, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);.
For example, when I run the code with the command mpirun -np 2 ./a.out 183, I get the error
Read -1, expected 133224, errno = 14
*** Process received signal ***
Signal: Segmentation fault (11)
Signal code: Address not mapped (1)
Failing at address: 0x7fb23b485010
--------------------------------------------------------------------------
Primary job terminated normally, but 1 process returned
a non-zero exit code. Per user-direction, the job has been aborted.
--------------------------------------------------------------------------
--------------------------------------------------------------------------
mpirun noticed that process rank 1 with PID 0 on node dx1-500-24164 exited on signal 11 (Segmentation fault).
Strangely, if I modify the number of CPUs or decrease the value of the input argument, the problem is not there anymore. Also, if I just comment out the MPI_Isend/Recv, there is no problem either.
So I am really wondering how to solve this problem?
Edit.1
A minimal example that reproduces the problem.
When the size of the matrix is small there is no problem, but the problem appears when you increase the size of the matrix (at least for me):
#include <iostream>
#include <mpi.h>
#include <cmath>
int world_size;
int world_rank;
MPI_Comm comm;
int m, m_small, m_small2;
int main(int argc, char **argv) {
MPI_Init(&argc, &argv);
MPI_Status status;
MPI_Comm_size(MPI_COMM_WORLD, &world_size);
MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
m = atoi(argv[1]); //Size
double *a_2;
//Please increase the size of m_small and m_small2 and wait for the problem to happen
m_small = 100;
m_small2 = 200;
double *a_small = new double [m_small * m_small2];
if (0 == world_rank) {
a_2 = new double[m_small*m_small2];
}
MPI_Request send_req;
MPI_Isend(a_small, m_small*m_small2, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &send_req);
if (0 == world_rank) {
for (int i = 0; i < world_size; i++) {
MPI_Recv(a_2, m_small*m_small2, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUSES_IGNORE);
}
}
delete[] a_small;
if (0 == world_rank) {
delete[] a_2;
}
MPI_Finalize();
return 0;
}
Command to run: mpirun -np 2 ./a.out 183 (the input argument is actually not used by the code this time)
The problem is in the line
MPI_Isend(a_small, m_small*m_small2, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &send_req);
MPI_Isend is a non-blocking send (which you pair with a blocking MPI_Recv). When it returns, the library may still be using a_small until you wait for the send to complete, e.g., with MPI_Wait(&send_req, MPI_STATUS_IGNORE); only after that are you free to reuse a_small. So you delete a_small while it may still be in use by the non-blocking send, which likely causes an access to deleted memory and can lead to the segfault and crash. Try using a blocking send like this:
MPI_Send(a_small, m_small*m_small2, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
This call returns once a_small can be reused (including being deleted); the data may not yet have been received by the receiver at that point, but it is at least held in an internal temporary buffer.
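Alternatively, if you want to keep the non-blocking send, a minimal sketch along the same lines (using the variable names from the minimal example) is to complete the request before the buffer is freed:
MPI_Isend(a_small, m_small*m_small2, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD, &send_req);
if (0 == world_rank) {
    for (int i = 0; i < world_size; i++) {
        MPI_Recv(a_2, m_small*m_small2, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
}
MPI_Wait(&send_req, MPI_STATUS_IGNORE);   // only now is a_small safe to reuse or delete
delete[] a_small;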
Hello, I am trying to write a multithreaded C++ program using the POSIX thread library to count the prime numbers between 1 and 10,000,000 (10 million) and measure how many microseconds it takes...
Creating my threads and running them works completely fine; however, I feel there is an error in my Prime function when determining whether a number is prime or not...
I keep receiving 78496 as my output, but I expect 664579. Below is my code; any hints or pointers would be greatly appreciated.
#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <unistd.h>
#include <iostream>
#include <sys/time.h> //measure the execution time of the computations
using namespace std;
//The number of thread to be generated
#define NUMBER_OF_THREADS 4
void * Prime(void* index);
long numbers[4] = {250000, 500000, 750000, 1000000};
long start_numbers[4] = {1, 250001, 500001, 750001};
int thread_numbers[4] = {0, 1, 2, 3};
int main(){
pthread_t tid[NUMBER_OF_THREADS];
int tn;
long sum = 0;
timeval start_time, end_time;
double start_time_microseconds, end_time_microseconds;
gettimeofday(&start_time, NULL);
start_time_microseconds = start_time.tv_sec * 1000000 + start_time.tv_usec;
for(tn = 0; tn < NUMBER_OF_THREADS; tn++){
if (pthread_create(&tid[tn], NULL, Prime, (void *) &thread_numbers[tn]) == -1 ) {
perror("thread fail");
exit(-1);
}
}
long value[4];
for(int i = 0; i < NUMBER_OF_THREADS; i++){
if(pthread_join(tid[i],(void **) &value[i]) == 0){
sum = sum + value[i]; //add four sums together
}else{
perror("Thread join failed");
exit(-1);
}
}
//get the end time in microseconds
gettimeofday(&end_time, NULL);
end_time_microseconds = end_time.tv_sec * 1000000 + end_time.tv_usec;
//calculate the time passed
double time_passed = end_time_microseconds - start_time_microseconds;
cout << "Sum is: " << sum << endl;
cout << "Running time is: " << time_passed << " microseconds" << endl;
exit(0);
}
//Prime function
void* Prime(void* index){
int temp_index;
temp_index = *((int*)index);
long sum_t = 0;
for(long i = start_numbers[temp_index]; i <= numbers[temp_index]; i++){
for (int j=2; j*j <= i; j++)
{
if (i % j == 0)
{
break;
}
else if (j+1 > sqrt(i)) {
sum_t++;
}
}
}
cout << "Thread " << temp_index << " terminates" << endl;
pthread_exit( (void*) sum_t);
}
This is because you used 10^6 instead of 10^7.
I also added handling of the corner cases for the numbers 1, 2 and 3:
//Prime function
void* Prime(void* index){
int temp_index;
temp_index = *((int*)index);
long sum_t = 0;
for(long i = start_numbers[temp_index]; i <= numbers[temp_index]; i++){
// Corner cases
if(i<=1)continue;
if (i <= 3){
sum_t++;
continue;
}
for (int j=2; j*j <= i; j++)
{
if ((i % j == 0) || (i %( j+2))==0 )
{
break;
}
else if (j+1 > sqrt(i)) {
sum_t++;
}
}
}
cout << "Thread " << temp_index << " terminates" << endl;
pthread_exit( (void*) sum_t);
}
I tested your code with the correct limits and got the correct number of primes as output:
Thread 0 terminates
Thread 1 terminates
Thread 2 terminates
Thread 3 terminates
Sum is: 664579
Running time is: 4.69242e+07 microseconds
Thanks to @chux - Reinstate Monica for pointing this out.
Along with dividing 10^7 (instead of 10^6) among the threads, there are a number of other small errors, and a number of optimizations could be made:
First of all, the start numbers could begin from 2 itself:
long start_numbers[4] = {2, 2500001, 5000001, 7500001};
The sum_t++ in your code may not work for edge cases. It is better to use the following algorithm in the Prime function:
bool flag = false;
for(long i = start_numbers[temp_index]; i <= numbers[temp_index]; i++){
flag = false;
for (long j=2; j*j <= i; j++){
if (i % j == 0 )
{
flag = true;
break;
}
}
if(!flag)
sum_t++;
}
After these two changes I am getting the result:
Thread 0 terminates
Thread 1 terminates
Thread 2 terminates
Thread 3 terminates
Sum is: 664579
Running time is: 6.62618e+06 microseconds
Edit:
(Note: here j is declared as long, but it would work just as well with int in this example, since the tested compiler uses 32-bit int.)
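Putting the two answers together, a drop-in sketch for the question's program (only the two range arrays and Prime() change; the threading and timing code stays as it is) might look like this:
long numbers[4]       = {2500000, 5000000, 7500000, 10000000};
long start_numbers[4] = {2, 2500001, 5000001, 7500001};

//Prime function
void* Prime(void* index){
    int temp_index = *((int*)index);
    long sum_t = 0;
    for(long i = start_numbers[temp_index]; i <= numbers[temp_index]; i++){
        bool flag = false;                 // becomes true once a divisor is found
        for (long j = 2; j*j <= i; j++){
            if (i % j == 0){
                flag = true;
                break;
            }
        }
        if(!flag)
            sum_t++;
    }
    cout << "Thread " << temp_index << " terminates" << endl;
    pthread_exit( (void*) sum_t);
}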
I am new to functions and I am really trying to understand how they work. My teacher gave us a problem where we were to pass a number in the range 1-12 to a function, and the function was then meant to do the times tables of that number. So I ask the user to enter a number; if the number is less than 1 and greater than 12 I exit, otherwise I pass the number to the function, where I use a for loop to do the multiplication for me (as far as I am aware), but nothing seems to happen. No doubt I am doing something really stupid; any help is much appreciated.
#include <iostream>
using namespace std;
int TimesTables (int num);
int main(int argc, const char * argv[]) {
int number;
cout << "enter a number to multiply by, with a range of 1-12: ";
cin >> number;
if (number < 1 && number > 12)
return EXIT_FAILURE;
else {
int tables = TimesTables(number);
cout << tables;
}
return 0;
}
int TimesTables (int num) {
for ( int i = 0; num <=12; i ++)
num = num * i;
return num;
}
Running i from 0 is going to set num to 0 on the first iteration, and therefore every multiplication after that stays 0.
Your loop is also rather dubious. Why are you checking num <= 12 rather than i <= 12?
Shouldn't your loop take the form
for ( int i = 1; i <=12; i ++){
// Print num * i
cout << num * i;
}
// There's no need to return anything back to the caller
for ( int i = 0; num <=12; i ++)
num = num * i;
Here i starts from 0, so num becomes 0 on the first iteration and no multiplication you do afterwards can change the result (num). Moreover, you want to go from 1 to 12, so you should start from 0 and finish at 12 - 1, or start from 1 and finish at 12.
So change this:
for ( int i = 0; num <=12; i ++)
to this:
for ( int i = 1; i <=12; i ++)
since you want to stop when i reaches 12, not num; i is the counter of the for loop!
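Putting both answers together, a complete minimal sketch might look like this (note, as an extra observation not raised above, that the range check also needs || rather than &&, otherwise the early exit can never trigger):
#include <iostream>
#include <cstdlib>
using namespace std;

// Print the times table of num, from num x 1 to num x 12.
void TimesTables (int num) {
    for (int i = 1; i <= 12; i++)
        cout << num << " x " << i << " = " << num * i << endl;
}

int main() {
    int number;
    cout << "enter a number to multiply by, with a range of 1-12: ";
    cin >> number;
    if (number < 1 || number > 12)   // reject anything outside 1-12
        return EXIT_FAILURE;
    TimesTables(number);
    return 0;
}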
I'm trying to scatter values among processes belonging to a hypercube group (quicksort project).
Depending on the number of processes, I either create a new communicator excluding the excess processes, or I duplicate MPI_COMM_WORLD if it fits a hypercube exactly (power of 2).
In both cases, processes other than 0 receive their data, but:
- In the first scenario, process 0 throws a segmentation fault (11).
- In the second scenario, nothing faults, but the values received by process 0 are gibberish.
NOTE: If I try a regular MPI_Scatter everything works well.
//Input
vector<int> LoadFromFile();
int d; //dimension of hypercube
int p; //active processes
int idle; //idle processes
vector<int> values; //values loaded
int arraySize; //number of total values to distribute
int main(int argc, char* argv[])
{
int mpiWorldRank;
int mpiWorldSize;
int mpiRank;
int mpiSize;
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &mpiWorldRank);
MPI_Comm_size(MPI_COMM_WORLD, &mpiWorldSize);
MPI_Comm MPI_COMM_HYPERCUBE;
d = log2(mpiWorldSize);
p = pow(2, d); //Number of processes belonging to the hypercube
idle = mpiWorldSize - p; //number of processes in excess
int toExclude[idle]; //array of idle processes to exclude from communicator
int sendCounts[p]; //array of values sizes to be sent to processes
//
int i = 0;
while (i < idle)
{
toExclude[i] = mpiWorldSize - 1 - i;
++i;
}
//CREATING HYPERCUBE GROUP: Group of size of power of 2 -----------------
MPI_Group world_group;
MPI_Comm_group(MPI_COMM_WORLD, &world_group);
// Remove excessive processors if any from communicator
if (idle > 0)
{
MPI_Group newGroup;
MPI_Group_excl(world_group, 1, toExclude, &newGroup);
MPI_Comm_create(MPI_COMM_WORLD, newGroup, &MPI_COMM_HYPERCUBE);
//Abort any processor not part of the hypercube.
if (mpiWorldRank > p)
{
cout << "aborting: " << mpiWorldRank <<endl;
MPI_Finalize();
return 0;
}
}
else
{
MPI_Comm_dup(MPI_COMM_WORLD, &MPI_COMM_HYPERCUBE);
}
MPI_Comm_rank(MPI_COMM_HYPERCUBE, &mpiRank);
MPI_Comm_size(MPI_COMM_HYPERCUBE, &mpiSize);
//END OF: CREATING HYPERCUBE GROUP --------------------------
if (mpiRank == 0)
{
//STEP1: Read input
values = LoadFromFile();
arraySize = values.size();
}
//Transforming input vector into an array
int valuesArray[values.size()];
if(mpiRank == 0)
{
copy(values.begin(), values.end(), valuesArray);
}
//Broadcast input size to all processes
MPI_Bcast(&arraySize, 1, MPI_INT, 0, MPI_COMM_HYPERCUBE);
//MPI_Scatterv: determining size of arrays to be received and displacement
int nmin = arraySize / p;
int remainingData = arraySize % p;
int displs[p];
int recvCount;
int k = 0;
for (i=0; i<p; i++)
{
sendCounts[i] = i < remainingData
? nmin+1
: nmin;
displs[i] = k;
k += sendCounts[i];
}
recvCount = sendCounts[mpiRank];
int recvValues[recvCount];
//Following MPI_Scatter works well:
// MPI_Scatter(&valuesArray, 13, MPI_INT, recvValues , 13, MPI_INT, 0, MPI_COMM_HYPERCUBE);
MPI_Scatterv(&valuesArray, sendCounts, displs, MPI_INT, recvValues , recvCount, MPI_INT, 0, MPI_COMM_HYPERCUBE);
int j = 0;
while (j < recvCount)
{
cout << "rank " << mpiRank << " received: " << recvValues[j] << endl;
++j;
}
MPI_Finalize();
return 0;
}
First of all, you are supplying wrong arguments to MPI_Group_excl:
MPI_Group_excl(world_group, 1, toExclude, &newGroup);
// ^
The second argument specifies the number of entries in the exclusion list and should therefore be equal to idle. Since you are excluding a single rank only, the resulting group has mpiWorldSize-1 ranks and hence MPI_Scatterv expects that both sendCounts[] and displs[] have that many elements. Of those only p elements are properly initialised and the rest are random, therefore MPI_Scatterv crashes in the root.
Another error is the code that aborts the idle processes: it should read if (mpiWorldRank >= p).
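If you keep the group-based approach, a minimal sketch of those two corrections, reusing the variables from the question, would be:
MPI_Group newGroup;
MPI_Group_excl(world_group, idle, toExclude, &newGroup);   // exclude all idle ranks, not just one
MPI_Comm_create(MPI_COMM_WORLD, newGroup, &MPI_COMM_HYPERCUBE);
if (mpiWorldRank >= p)   // ranks p, p+1, ... are not part of the hypercube
{
    MPI_Finalize();
    return 0;
}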
I would recommend that the entire exclusion code is replaced by a single call to MPI_Comm_split instead:
MPI_Comm comm_hypercube;
int colour = mpiWorldRank >= p ? MPI_UNDEFINED : 0;
MPI_Comm_split(MPI_COMM_WORLD, colour, mpiWorldRank, &comm_hypercube);
if (comm_hypercube == MPI_COMM_NULL)
{
MPI_Finalize();
return 0;
}
When no process supplies MPI_UNDEFINED as its colour, the call is equivalent to MPI_Comm_dup.
Note that you should avoid using in your code names starting with MPI_ as those could clash with symbols from the MPI implementation.
Additional note: std::vector<T> uses contiguous storage, therefore you could do without copying the elements into a regular array and simply provide the address of the first element in the call to MPI_Scatter(v):
MPI_Scatterv(&values[0], ...);
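For example, with the variables from the question (values.data(), available since C++11, is equivalent to &values[0] on the root and is also well defined on the non-root ranks, where the vector is empty and the send buffer is ignored anyway):
MPI_Scatterv(values.data(), sendCounts, displs, MPI_INT, recvValues, recvCount, MPI_INT, 0, MPI_COMM_HYPERCUBE);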
#include <iostream>
#include <cstdlib>
#include <string>
#include <fstream>
using namespace std;
double average;
int main(int argc, char* argv[])
{
int num1 = atoi (argv[1]);
int num2 = atoi (argv[2]);
int num3 = atoi (argv[3]);
average = ((num1 + num2 + num3) / 3);
cout << average << endl;
}
I am not sure how to tackle this problem if I need to calculate the average of all of the command-line arguments. This is how I would do it with 3 command-line arguments, but I am unsure how I would do it without knowing a set number of them. Does anyone also know how to do this if you had to find the median?
Below is the simple program.
int sum = 0 ;
for ( int i = 1; i < argc; i++ )
{
sum = sum + atoi(argv[i]) ; // argv[0] (the program name) is deliberately excluded
}
Then you can do whatever you want with sum.
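For instance, a sketch of the last step (the guard just avoids dividing by zero when no arguments are passed):
double average = (argc > 1) ? double(sum) / (argc - 1) : 0.0;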
In the declaration of main, int argc is the number of arguments given. You can use a for loop to iterate through your arguments and calculate the average:
// Your includes here
// ...
int main(int argc, char* argv[])
{
int average = 0;
for(int i = 1; i < argc; i++) // argv[0] is the name of your program, so we are skipping it
{
average += atoi(argv[i]);
}
average = average / (argc - 1);
}
argc will give you the number of arguments given to main().
Note that argc will be ONE, not zero, if you provide no arguments, since argv[0] is the file name of the executable.
To get the median, the following should do the trick:
float median;
// Collect the integers into a sorted container (needs <vector> and <algorithm>)
std::vector<int> args;
for(int i = 1; i < argc; i++)
{
args.push_back(atoi(argv[i]));
}
std::sort(args.begin(), args.end());
// Extract the median from the sorted values
int middle_index = args.size() / 2;
if (args.size() % 2 == 1)
{
// For an odd number of values, the median is the middle value
median = args[middle_index];
}
else
{
// For an even number of values, the median is the average of the two middle values
median = (args[middle_index-1] + args[middle_index]) / 2.0;
}