I am trying to write collectively from a CFD code in MPI Fortran. In each process the data are divided into blocks, each with a general number of cells, and a structure var(b) holds the two variables r and p of block b. A two-level MPI derived datatype is then created to collect all the data of a process: the first type gathers all the variables of a block, and the second gathers all the block structures of the process. Each process then writes one instance of this nested derived datatype, where the offset is the total amount of data held by the previous processes (0 for rank 0, all the data of rank 0 for rank 1, and so on). The code is the following:
module var_mod
type vt
sequence
double precision,dimension(:,:,:),allocatable :: r,p
end type vt
type(vt),target,dimension(:),allocatable :: var
end module var_mod
PROGRAM main
USE MPI_F08
USE var_mod
IMPLICIT NONE
! FILES
INTEGER,PARAMETER :: NB = 4
!----------------------------------------------------------------
INTEGER :: b,i,j,k,me,np
TYPE(MPI_File) :: mpifh
INTEGER(KIND=MPI_OFFSET_KIND) :: mpidisp,sum_dim
integer,dimension(:),allocatable :: ni,nj,nk,mpiblock,mpistride
integer :: cont,mpierr
INTEGER,dimension(nb) :: Blocks
INTEGER(KIND=MPI_ADDRESS_KIND),dimension(:),allocatable :: Offsets,Pos
INTEGER(KIND=MPI_COUNT_KIND) :: lb, ext8
TYPE(MPI_Datatype),dimension(:),allocatable :: Elem_Type,Types
TYPE(MPI_Datatype) :: All_Type,mpiparflowtype
TYPE(MPI_Status) :: status
CHARACTER(LEN=MPI_MAX_ERROR_STRING) :: string
INTEGER :: resultlen
!----------------------------------------------------------------
call mpi_init
call mpi_comm_size(mpi_comm_world,np)
call mpi_comm_rank(mpi_comm_world,me)
allocate (ni(nb))
allocate (nj(nb))
allocate (nk(nb))
allocate (var(nb))
do b = 1,NB
ni(b) = b/3+1
nj(b) = b
nk(b) = b/5+1
allocate (var(b)%r(ni(b),nj(b),nk(b)))
allocate (var(b)%p(ni(b),nj(b),nk(b)))
END DO
!
! Initialize the data
!
do b = 1,nb
DO k = 1,nk(b)
DO j = 1,nj(b)
DO i = 1,ni(b)
var(b)%r(i,j,k) = 10000*me+1000*b+i*100+j*10+k
var(b)%p(i,j,k) = -var(b)%r(i,j,k)
END DO
END DO
END DO
end do
! (1) Create a separate structure datatype for each record
allocate (Offsets(2),Pos(2),Types(2),Elem_Type(nb))
DO b = 1,nb
CALL MPI_GET_ADDRESS(var(b)%r,POS(1))
CALL MPI_GET_ADDRESS(var(b)%p,POS(2))
Offsets = POS-POS(1)
Types = MPI_REAL8
Blocks = ni(b)*nj(b)*nk(b)
CALL MPI_TYPE_CREATE_STRUCT(2,Blocks,Offsets,Types,Elem_Type(b),mpierr)
END DO
deallocate (Offsets,Pos,Types)
! Create a structure of structures that describes the whole array
allocate (Offsets(nb),Pos(nb))
Blocks = 1
DO b = 1,nb
CALL MPI_GET_ADDRESS(var(b)%r,POS(b))
END DO
Offsets = POS-POS(1)
CALL MPI_TYPE_CREATE_STRUCT(nb,Blocks,Offsets,Elem_Type,All_Type)
CALL MPI_TYPE_COMMIT(All_Type,mpierr)
! Free the intermediate datatypes
DO b = 1,nb
CALL MPI_TYPE_FREE(Elem_Type(b))
END DO
deallocate(Offsets,Pos,Elem_Type)
! Set index
cont = 1
allocate(mpiblock(cont))
allocate(mpistride(cont))
mpiblock=1
mpistride=0
call MPI_TYPE_INDEXED(cont,mpiblock,mpistride,All_Type,mpiparflowtype)
call MPI_TYPE_COMMIT(mpiparflowtype)
deallocate(mpiblock,mpistride)
! Position where to write
CALL MPI_Type_get_extent(MPI_REAL8, lb, ext8)
mpidisp = 0
do b = 1,nb
mpidisp = mpidisp + (ni(b)*nj(b)*nk(b)) ! number of cells in block b
end do
mpidisp = mpidisp*2*ext8*me ! multiply by the number of variables and the size in bytes of each variable, and shift by the process rank
! Open file
call MPI_FILE_OPEN(MPI_COMM_WORLD,'MPIDATA',IOR(MPI_MODE_CREATE,MPI_MODE_WRONLY),MPI_INFO_NULL,mpifh)
! setting file view
call MPI_FILE_SET_VIEW(mpifh,mpidisp,All_Type,mpiparflowtype,'native',MPI_INFO_NULL,mpierr)
write(*,*) me,'error on file set view:',mpierr
call MPI_Error_string(mpierr, string, resultlen)
write(*,*) 'string:',trim(string),resultlen
! MPI Write file
call MPI_FILE_WRITE_ALL(mpifh,var(1)%r,1,All_Type,status)
! Close file
call MPI_FILE_CLOSE(mpifh)
! deallocations and free
CALL MPI_TYPE_FREE(All_Type)
CALL MPI_TYPE_FREE(mpiparflowtype)
do b = 1,nb
deallocate (var(b)%r,var(b)%p)
END DO
deallocate (var)
deallocate (ni,nj,nk)
! end
call mpi_finalize
END PROGRAM main
When the code is launched on, for instance, two processes (both the Intel and GNU compilers compile it without problems), the run completes, but MPI_FILE_SET_VIEW raises an MPI_ERR_TYPE error and the data file contains only the rank 0 data.
I would expect a file with data from all ranks, but I cannot understand what the problem is.
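For reference, the displacement I have in mind is the total number of bytes held by all lower ranks. A minimal sketch of how that could be computed in the general case (illustration only; my_bytes and my_disp are hypothetical names, and in this test every rank happens to hold the same amount of data, so multiplying by me is enough) would be:
! Sketch: my_bytes holds 2 variables * total cells on this rank * 8 bytes.
INTEGER(KIND=MPI_OFFSET_KIND) :: my_bytes, my_disp
CALL MPI_EXSCAN(my_bytes, my_disp, 1, MPI_OFFSET, MPI_SUM, MPI_COMM_WORLD)
IF (me == 0) my_disp = 0 ! MPI_EXSCAN leaves the result on rank 0 undefined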
I want to append elements to a 1D array mat, subject to a condition, as in the test program below. In Fortran 2003 you can append an element with
mat=[mat,i]
as mentioned in the related question Fortran array automatically growing when adding a value. Unfortunately, this is very slow for large arrays. I tried to overcome this by writing the elements to an unformatted file and reading them back afterwards, which turned out to be far faster than using mat=[mat,i]. For example, for n=2000000_ilong the run time is 5.1078133666666661 minutes, whereas storing the elements in the file drops it to 3.5234166666666665E-003 minutes.
The problem is that for large array sizes the file storage.dat can reach hundreds of GB...
Any ideas?
program test
implicit none
integer, parameter :: ndig=8
integer, parameter :: ilong=selected_int_kind(ndig)
integer (ilong), allocatable :: mat(:)
integer (ilong), parameter :: n=2000000_ilong
integer (ilong) :: i, cn
logical, parameter :: store=.false.
real(8) :: z, START_CLOCK, STOP_CLOCK
open(1, file='storage.dat',form='unformatted')
call cpu_time(START_CLOCK)
if(store) then
cn=0
do i=1,n
call random_number(z)
if (z<0.5d0) then
write(1) i
cn=cn+1
end if
end do
rewind(1); allocate(mat(cn)); mat=0
do i=1,cn
read(1) mat(i)
end do
else
allocate(mat(1)); mat=0
do i=1,n
call random_number(z)
if (z<0.5d0) then
mat=[mat,i]
end if
end do
end if
call cpu_time(STOP_CLOCK)
print *, 'run took:', (STOP_CLOCK - START_CLOCK)/60.0d0, 'minutes.'
end program test
If the data file has hundreds of gigabytes, then there may be no solution available at all, because you need that much RAM for your array anyway. Maybe you made the mistake of storing the data as text, in which case the in-memory size will be somewhat smaller, but still tens of GB.
What is often done, when you need to add elements one by one and you do not know the final size, is to grow the array geometrically in steps. That means you pre-allocate an array of size N; when it is full, you allocate a new array of size 2*N; when that is full again, you reallocate to 4*N, and so on. Either you finish, or you exhaust all your memory.
Of course, it is often best to know the size of the array beforehand, but in some algorithms you simply do not have the information.
Maybe you need a dynamic container such as C++'s std::vector, with a push_back() function.
The following is a simplified version. You probably ought to check the allocation to make sure that you don't run out of addressable memory.
Note the need for random_seed.
module container
use iso_fortran_env
implicit none
type array
integer(int64), allocatable :: A(:)
integer(int64) num
contains
procedure push_back
procedure print
end type array
interface array ! additional constructors
procedure array_constructor
end interface array
contains
!----------------------------------------------
function array_constructor() result( this ) ! performs initial allocation
type(array) this
allocate( this%A(1) )
this%num = 0
end function array_constructor
!----------------------------------------------
subroutine push_back( this, i )
class(array), intent(inout) :: this
integer(int64) i
integer(int64), allocatable :: temp(:)
if ( size(this%A) == this%num ) then ! Need to resize
allocate( temp( 2 * this%num ) ) ! <==== for example
temp(1:this%num ) = this%A
call move_alloc( temp, this%A )
! print *, "Resized to ", size( this%A ) ! debugging only!!!
end if
this%num = this%num + 1
this%A(this%num) = i
end subroutine push_back
!----------------------------------------------
subroutine print( this )
class(array), intent(in) :: this
write( *, "( *( i0, 1x ) )" ) ( this%A(1:this%num) )
end subroutine print
end module container
!=======================================================================
program test
use iso_fortran_env
use container
implicit none
type(array) mat
integer(int64) :: n = 2000000_int64
integer(int64) i
real(real64) z, START_CLOCK, STOP_CLOCK
mat = array() ! initial trivial allocation
call random_seed ! you probably need this
call cpu_time(START_CLOCK)
do i = 1, n
call random_number( z )
if ( z < 0.5_real64 ) call mat%push_back( i )
end do
call cpu_time(STOP_CLOCK)
print *, 'Run took ', ( STOP_CLOCK - START_CLOCK ) / 60.0_real64, ' minutes.'
! call mat%print ! debugging only!!!
end program test
I'm trying to send/receive a derived datatype containing allocatable arrays. I managed to follow the suggestion in MPI derived datatype for dynamically allocated structs with dynamically allocated member, and with that my information is passed correctly. However, when profiling with TAU, the memory allocated on the heap was not freed, which resulted in a memory leak.
I have tested many times by commenting different lines of code in and out. The memory leak disappears as soon as I comment out the MPI_TYPE_CREATE_STRUCT call.
I also pasted the code from that post into my code, but the problem still persists.
The MPI implementations I tried are Open MPI 4.0.0 and 3.1.0, and Intel MPI 18.0.2 and 18.0.0.
Here is a simple code I tested with.
Here is the memory-leak version:
Program memory_leak
implicit none
include "mpif.h"
TYPE Struct
INTEGER :: N
DOUBLE PRECISION :: A
DOUBLE PRECISION ,ALLOCATABLE :: B(:)
END TYPE Struct
TYPE(Struct) :: Structs(2)
integer :: i
integer :: Types(3)
integer :: Blocks(3)
integer :: Elem_Type(2), TwoElem_Type,IError
integer(kind=MPI_ADDRESS_KIND) :: POS_(3)
integer(kind=MPI_ADDRESS_KIND) :: Offsets(3)
ALLOCATE(Structs(1)%B(10))
ALLOCATE(Structs(2)%B(20))
CALL MPI_INIT(IError)
! (1) Create a separate structure datatype for each record
DO i=1,2
CALL MPI_GET_ADDRESS(Structs(i)%N, POS_(1), IError)
CALL MPI_GET_ADDRESS(Structs(i)%A, POS_(2), IError)
CALL MPI_GET_ADDRESS(Structs(i)%B(1), POS_(3), IError)
Offsets = POS_ - POS_(1)
Types(1) = MPI_INTEGER
Types(2) = MPI_DOUBLE_PRECISION
Types(3) = MPI_DOUBLE_PRECISION
Blocks(1) = 1
Blocks(2) = 1
Blocks(3) = i * 10
CALL MPI_TYPE_CREATE_STRUCT(3, Blocks, Offsets, Types, Elem_Type(i), IError)
END DO
! (2) Create a structure of structures that describes the whole array
CALL MPI_GET_ADDRESS(Structs(1)%N, POS_(1), IError)
CALL MPI_GET_ADDRESS(Structs(2)%N, POS_(2), IError)
Offsets = POS_ - POS_(1)
Types(1) = Elem_Type(1)
Types(2) = Elem_Type(2)
Blocks(1) = 1
Blocks(2) = 1
CALL MPI_TYPE_CREATE_STRUCT(2, Blocks, Offsets, Types, TwoElem_Type, IError)
CALL MPI_TYPE_COMMIT(TwoElem_Type, IError)
! (2.1) Free the intermediate datatypes
DO i=1,2
CALL MPI_TYPE_FREE(Elem_Type(i), IError)
END DO
CALL MPI_TYPE_FREE(TwoElem_Type, IError)
print *, "end"
CALL MPI_FINALIZE(IError)
end program memory_leak
Memory leak as reported by TAU (screenshot).
Here is the leak-free version:
Program memory_leak
implicit none
include "mpif.h"
TYPE Struct
INTEGER :: N
DOUBLE PRECISION :: A
DOUBLE PRECISION ,ALLOCATABLE :: B(:)
END TYPE Struct
TYPE(Struct) :: Structs(2)
integer :: i
integer :: Types(3)
integer :: Blocks(3)
integer :: Elem_Type(2), TwoElem_Type,IError
integer(kind=MPI_ADDRESS_KIND) :: POS_(3)
integer(kind=MPI_ADDRESS_KIND) :: Offsets(3)
ALLOCATE(Structs(1)%B(10))
ALLOCATE(Structs(2)%B(20))
CALL MPI_INIT(IError)
! (1) Create a separate structure datatype for each record
DO i=1,2
CALL MPI_GET_ADDRESS(Structs(i)%N, POS_(1), IError)
CALL MPI_GET_ADDRESS(Structs(i)%A, POS_(2), IError)
CALL MPI_GET_ADDRESS(Structs(i)%B(1), POS_(3), IError)
Offsets = POS_ - POS_(1)
Types(1) = MPI_INTEGER
Types(2) = MPI_DOUBLE_PRECISION
Types(3) = MPI_DOUBLE_PRECISION
Blocks(1) = 1
Blocks(2) = 1
Blocks(3) = i * 10
! CALL MPI_TYPE_CREATE_STRUCT(3, Blocks, Offsets, Types, Elem_Type(i), IError)
END DO
! (2) Create a structure of structures that describes the whole array
CALL MPI_GET_ADDRESS(Structs(1)%N, POS_(1), IError)
CALL MPI_GET_ADDRESS(Structs(2)%N, POS_(2), IError)
Offsets = POS_ - POS_(1)
Types(1) = Elem_Type(1)
Types(2) = Elem_Type(2)
Blocks(1) = 1
Blocks(2) = 1
! CALL MPI_TYPE_CREATE_STRUCT(2, Blocks, Offsets, Types, TwoElem_Type, IError)
! CALL MPI_TYPE_COMMIT(TwoElem_Type, IError)
! ! (2.1) Free the intermediate datatypes
! DO i=1,2
! CALL MPI_TYPE_FREE(Elem_Type(i), IError)
! END DO
!CALL MPI_TYPE_FREE(TwoElem_Type, IError)
print *, "end"
CALL MPI_FINALIZE(IError)
end program memory_leak
Solved. The problem lies in another part of the code, where I used the pack function on an allocatable array. When you use pack there, the memory is lost, since the pointer is gone but the array is not deallocated.
I am using persistent communication in my CFD code. The communication setup is done in a separate subroutine, and in the main subroutine, where I have the do loop, I use MPI_STARTALL() and MPI_WAITALL().
To keep it short, I am showing only the first part of the setup; the rest of the arrays are handled in exactly the same way.
My setup subroutine looks like:
Subroutine MPI_Subroutine
use Variables
use mpi
implicit none
!Starting up MPI
call MPI_INIT(ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD,npes,ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD,MyRank,ierr)
!Compute the size of local block (1D Decomposition)
Jmax = JmaxGlobal
Imax = ImaxGlobal/npes
if (MyRank.lt.(ImaxGlobal - npes*Imax)) then
Imax = Imax + 1
end if
if (MyRank.ne.0.and.MyRank.ne.(npes-1)) then
Imax = Imax + 2
Else
Imax = Imax + 1
endif
! Computing neighboars
if (MyRank.eq.0) then
Left = MPI_PROC_NULL
else
Left = MyRank - 1
end if
if (MyRank.eq.(npes -1)) then
Right = MPI_PROC_NULL
else
Right = MyRank + 1
end if
! Initializing the Arrays in each processor, according to the number of local nodes
Call InitializeArrays
!Creating the channel of communication for this computation,
!Sending and receiving the u_old (Ghost cells)
Call MPI_SEND_INIT(u_old(2,:),Jmax,MPI_DOUBLE_PRECISION,Left,tag,MPI_COMM_WORLD,req(1),ierr)
Call MPI_RECV_INIT(u_old(Imax,:),jmax,MPI_DOUBLE_PRECISION,Right,tag,MPI_COMM_WORLD,req(2),ierr)
Call MPI_SEND_INIT(u_old(Imax-1,:),Jmax,MPI_DOUBLE_PRECISION,Right,tag,MPI_COMM_WORLD,req(3),ierr)
Call MPI_RECV_INIT(u_old(1,:),jmax,MPI_DOUBLE_PRECISION,Left,tag,MPI_COMM_WORLD,req(4),ierr)
Since I am debugging the code, I am just checking these arrays. When I check, my ghost cells are full of zeroes, so I guess I am getting the calls wrong.
The main code, where I call MPI_STARTALL and MPI_WAITALL, looks like:
Program
use Variables
use mpi
implicit none
open(32, file = 'error.dat')
Call MPI_Subroutine
!kk=kk+1
DO kk=1, 2001
! A lot of calculation
! communicating the maximum error among the processes and delta t
call MPI_REDUCE(eps,epsGlobal,1,MPI_DOUBLE_PRECISION,MPI_MAX,0,MPI_COMM_WORLD,ierr)
call MPI_BCAST(epsGlobal,1,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,ierr)
call MPI_REDUCE(delta_t,delta_tGlobal,1,MPI_DOUBLE_PRECISION,MPI_MIN,0,MPI_COMM_WORLD,ierr)
if(MyRank.eq.0) delta_t = delta_tGlobal
call MPI_BCAST(delta_t,1,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,ierr)
if(MyRank.eq.0) then
write(*,*) kk,epsGlobal,(kk*delta_t)
write(32,*) kk,epsGlobal
endif
Call Swap
Call MPI_STARTALL(4,req,ierr) !
Call MPI_WAITALL(4,req,status,ierr)
enddo
The variables are set in another module; the MPI-related variables look like:
! MPI variables
INTEGER :: npes, MyRank, ierr, Left, Right, tag
INTEGER :: status(MPI_STATUS_SIZE,4)
INTEGER,dimension(4) :: req
I appreciate your time and any suggestions on this problem.
PROGRAM ShareNeighbors
IMPLICIT REAL (a-h,o-z)
INCLUDE "mpif.h"
PARAMETER (m = 500, n = 500)
DIMENSION a(m,n), b(m,n)
DIMENSION h(m,n)
INTEGER istatus(MPI_STATUS_SIZE)
INTEGER iprocs, jprocs
PARAMETER (ROOT = 0)
integer dims(2),coords(2)
logical periods(2)
data periods/2*.false./
integer status(MPI_STATUS_SIZE)
integer comm2d,req,source
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, myrank, ierr)
! Get a new communicator for a decomposition of the domain.
! Let MPI find a "good" decomposition
dims(1) = 0
dims(2) = 0
CALL MPI_DIMS_CREATE(nprocs,2,dims,ierr)
if (myrank.EQ.Root) then
print *,nprocs,'processors have been arranged into',dims(1),'X',dims(2),'grid'
endif
CALL MPI_CART_CREATE(MPI_COMM_WORLD,2,dims,periods,.true., &
comm2d,ierr)
! Get my position in this communicator
CALL MPI_COMM_RANK(comm2d,myrank,ierr)
! Get the decomposition
CALL fnd2ddecomp(comm2d,m,n,ista,iend,jsta,jend)
! print *,ista,jsta,iend,jend
ilen = iend - ista + 1
jlen = jend - jsta + 1
CALL MPI_Cart_get(comm2d,2,dims,periods,coords,ierr)
iprocs = dims(1)
jprocs = dims(2)
myranki = coords(1)
myrankj = coords(2)
DO j = jsta, jend
DO i = ista, iend
a(i,j) = myrank+1
ENDDO
ENDDO
! Send data from each processor to Root
call MPI_ISEND(ista,1,MPI_INTEGER,Root,1, &
MPI_COMM_WORLD,req,ierr)
call MPI_ISEND(iend,1,MPI_INTEGER,Root,1, &
MPI_COMM_WORLD,req,ierr)
call MPI_ISEND(jsta,1,MPI_INTEGER,Root,1, &
MPI_COMM_WORLD,req,ierr)
call MPI_ISEND(jend,1,MPI_INTEGER,Root,1, &
MPI_COMM_WORLD,req,ierr)
call MPI_ISEND(a(ista:iend,jsta:jend),(ilen)*(jlen),MPI_REAL, &
Root,1,MPI_COMM_WORLD,req,ierr )
! Receive the results from the other processors
if (myrank.EQ.Root) then
do source = 0,nprocs-1
call MPI_RECV(ista,1,MPI_INTEGER,source, &
1,MPI_COMM_WORLD,status,ierr )
call MPI_RECV(iend,1,MPI_INTEGER,source, &
1,MPI_COMM_WORLD,status,ierr )
call MPI_RECV(jsta,1,MPI_INTEGER,source, &
1,MPI_COMM_WORLD,status,ierr )
call MPI_RECV(jend,1,MPI_INTEGER,source, &
1,MPI_COMM_WORLD,status,ierr )
ilen = iend - ista + 1
jlen = jend - jsta + 1
call MPI_RECV(a(ista:iend,jsta:jend),(ilen)*(jlen),MPI_REAL, &
source,1,MPI_COMM_WORLD,status,ierr)
! print the results
call ZMINMAX(m,n,ista,iend,jsta,jend,a(:,:),amin,amax)
print *, 'myid=',source,amin,amax
call MPI_Wait(req, status, ierr)
enddo
endif
CALL MPI_FINALIZE(ierr)
END
subroutine fnd2ddecomp(comm2d,m,n,ista,iend,jsta,jend)
integer comm2d
integer m,n,ista,jsta,iend,jend
integer dims(2),coords(2),ierr
logical periods(2)
! Get (i,j) position of a processor from Cartesian topology.
CALL MPI_Cart_get(comm2d,2,dims,periods,coords,ierr)
! Decomposition in first (ie. X) direction
CALL MPE_DECOMP1D(m,dims(1),coords(1),ista,iend)
! Decomposition in second (ie. Y) direction
CALL MPE_DECOMP1D(n,dims(2),coords(2),jsta,jend)
return
end
SUBROUTINE MPE_DECOMP1D(n,numprocs,myid,s,e)
integer n,numprocs,myid,s,e,nlocal,deficit
nlocal = n / numprocs
s = myid * nlocal + 1
deficit = mod(n,numprocs)
s = s + min(myid,deficit)
! Give one more slice to processors
if (myid .lt. deficit) then
nlocal = nlocal + 1
endif
e = s + nlocal - 1
if (e .gt. n .or. myid .eq. numprocs-1) e = n
return
end
SUBROUTINE ZMINMAX(IX,JX,SX,EX,SY,EY,ZX,ZXMIN,ZXMAX)
INTEGER :: IX,JX,SX,EX,SY,EY
REAL :: ZX(IX,JX)
REAL :: ZXMIN,ZXMAX
ZXMIN=1000.
ZXMAX=-1000.
DO II=SX,EX
DO JJ=SY,EY
IF(ZX(II,JJ).LT.ZXMIN)ZXMIN=ZX(II,JJ)
IF(ZX(II,JJ).GT.ZXMAX)ZXMAX=ZX(II,JJ)
ENDDO
ENDDO
RETURN
END
When I run the above code with 4 processors, Root receives garbage values, whereas with 15 processors the data transfer is correct. How can I tackle this?
I guess it is related to buffering, a point which is not clear to me. How should I handle the buffering properly?
Problem 1
You are doing multiple sends
call MPI_ISEND(ista,1,MPI_INTEGER,Root,1, &
MPI_COMM_WORLD,req,ierr)
call MPI_ISEND(iend,1,MPI_INTEGER,Root,1, &
MPI_COMM_WORLD,req,ierr)
call MPI_ISEND(jsta,1,MPI_INTEGER,Root,1, &
MPI_COMM_WORLD,req,ierr)
call MPI_ISEND(jend,1,MPI_INTEGER,Root,1, &
MPI_COMM_WORLD,req,ierr)
call MPI_ISEND(a(ista:iend,jsta:jend),(ilen)*(jlen),MPI_REAL, &
Root,1,MPI_COMM_WORLD,req,ierr )
and all of them with the same request variable req. That can't work.
Problem 2
You are using a subarray a(ista:iend,jsta:jend) in non-blocking MPI. That is not allowed*. You need to copy the data into a temporary contiguous buffer or use an MPI derived subarray datatype (too hard for you at this stage).
The reason is that the compiler creates a temporary copy just for the call to ISend. The ISend remembers the address of that temporary but does not send anything yet. Then the temporary is deleted and its address becomes invalid, and the later MPI_Wait will try to use that address and fail.
Problem 3
Your MPI_Wait is in the wrong place. It must come after the sends, outside of any if conditions, so that it is always executed (provided you are always sending).
You must collect all the requests separately and then wait for all of them. It is best to keep them in an array and wait for all of them at once using MPI_Waitall.
Remember, ISend typically does not actually send anything if the buffer is large; the exchange often happens during the Wait operation, at least for larger arrays.
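To illustrate, a sketch of what the send side could look like (the buffer and request names sbuf, reqs, nreq and istats are mine, the rest is taken from your code): every call gets its own slot in a request array, the subarray is first copied into a contiguous buffer that stays alive until the wait, and a single MPI_Waitall, executed on every rank outside any if, completes them all.
integer, parameter :: nreq = 5
integer :: reqs(nreq), istats(MPI_STATUS_SIZE, nreq)
real, allocatable :: sbuf(:,:)
! Copy the subarray into a contiguous buffer that lives until the wait.
allocate(sbuf(ilen, jlen))
sbuf = a(ista:iend, jsta:jend)
call MPI_ISEND(ista, 1, MPI_INTEGER, Root, 1, MPI_COMM_WORLD, reqs(1), ierr)
call MPI_ISEND(iend, 1, MPI_INTEGER, Root, 1, MPI_COMM_WORLD, reqs(2), ierr)
call MPI_ISEND(jsta, 1, MPI_INTEGER, Root, 1, MPI_COMM_WORLD, reqs(3), ierr)
call MPI_ISEND(jend, 1, MPI_INTEGER, Root, 1, MPI_COMM_WORLD, reqs(4), ierr)
call MPI_ISEND(sbuf, ilen*jlen, MPI_REAL, Root, 1, MPI_COMM_WORLD, reqs(5), ierr)
! ... Root's receive loop goes here, receiving into its own variables ...
call MPI_WAITALL(nreq, reqs, istats, ierr)   ! on every rank, outside any if
deallocate(sbuf)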
Recommendation:
Take a simple example and try to exchange just two small arrays with MPI_IRecv and MPI_ISend between two processes, as simple a test problem as you can make it. Learn from it and take small steps. No offence, but your current understanding of non-blocking MPI is too weak to write full-scale programs. MPI is hard, and non-blocking MPI is even harder.
* Not allowed when using the interface available in MPI-2. MPI-3 brings a new interface, available via use mpi_f08, with which it is possible. But learn the basics first.
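For example (a sketch only, reusing the names from your code, and valid only when the implementation's mpi_f08 module reports the compile-time constant MPI_SUBARRAYS_SUPPORTED as .true.), the subarray can then be passed directly to the non-blocking call and the library handles the non-contiguous data itself:
use mpi_f08
type(MPI_Request) :: req
type(MPI_Status)  :: stat
! The subarray descriptor is passed to MPI, so no manual copy is needed.
call MPI_Isend(a(ista:iend, jsta:jend), ilen*jlen, MPI_REAL, Root, 1, MPI_COMM_WORLD, req)
call MPI_Wait(req, stat)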
I am writing a generic subroutine in Fortran 90 that reads in a column of real-valued data. The subroutine first checks that the file exists and can be opened, then determines the number of elements (Array_Size) in the column by reading the number of lines until end of file. Next it rewinds the file to the beginning, reads the data points into an array (Column1(n)), and also determines the largest element in the array (Max_Value). The hope is that this subroutine can be completely generic and not require any prior knowledge of the number of data points in the file, which is why the number of elements is determined first, so that the array Column1 can be dynamically allocated to hold Array_Size data points. Once the array is passed back to the main program, it is transferred to another array and the dynamically allocated array is deallocated, so that the routine can be repeated for multiple other input files, although this example reads only one data file.
As written below, the program compiles just fine with the Intel Fortran compiler; however, when it runs it gives a severe (174): SIGSEGV fault. I placed write(*,*) statements before and after the allocate statement in the subroutine, and it prints the first statement, "Program works here", but not the second, which indicates that the problem occurs at the ALLOCATE (Column1(Array_Size)) statement between the two write(*,*) statements. I recompiled with the -C flag and ran the executable, which fails again and states severe (408): "Attempt to fetch from allocatable variable MISC_ARRAY when it is not allocated". The variable MISC_ARRAY is the actual argument in the main program, which seems to indicate that the compiler wants the array allocated in the main program and not in the subprogram. If I statically allocate the array, the program works just fine. To make the program generic, without any prior knowledge of the size of each file, the array needs to be dynamically allocated, and this should happen in the subprogram, not the main program. Is there a way to accomplish this that I am not seeing?
PROGRAM MAIN
IMPLICIT NONE
! - variable Definitions for MAIN program
INTEGER :: n
! - Variable Definitions for EXPENSE READER Subprograms
REAL, DIMENSION(:), ALLOCATABLE :: Misc_Array,MISC_DATA
INTEGER :: Size_Misc
REAL :: Peak_Misc_Value
! REAL :: Misc_Array(365)
CHARACTER(LEN=13) :: File_Name
File_Name = "Misc.txt"
CALL One_Column(File_Name,Size_Misc,Peak_Misc_Value,Misc_Array)
ALLOCATE (MISC_DATA(Size_Misc))
DO n = 1,Size_Misc ! Transfers array data
MISC_DATA(n) = Misc_Array(n)
END DO
DEALLOCATE (Misc_Array)
END PROGRAM MAIN
SUBROUTINE One_Column(File_Name,Array_Size,Max_Value,Column1)
IMPLICIT NONE
REAL, DIMENSION(:), ALLOCATABLE,INTENT(OUT) :: Column1
! REAL :: Column1(365)
REAL, INTENT(OUT) :: Max_Value
CHARACTER,INTENT(IN) :: File_Name*13
INTEGER, INTENT(OUT) :: Array_Size
INTEGER :: Open_Status,Input_Status,n
! Open the file and check to ensure it is properly opened
OPEN(UNIT=100,FILE = File_Name,STATUS = 'old',ACTION = 'READ', &
IOSTAT = Open_Status)
IF(Open_Status > 0) THEN
WRITE(*,'(A,A)') "**** Cannot Open ",File_Name
STOP
RETURN
END IF
! Determine the size of the file
Array_Size = 0
DO 300
READ(100,*,IOSTAT = Input_Status)
IF(Input_Status < 0) EXIT
Array_Size = Array_Size + 1
300 CONTINUE
REWIND(100)
WRITE(*,*) "Program works here"
ALLOCATE (Column1(Array_Size))
WRITE(*,*) "Program stops working here"
Max_Value = 0.0
DO n = 1,Array_Size
READ(100,*) Column1(n)
IF(Column1(n) .GT. Max_Value) Max_Value = Column1(n)
END DO
END SUBROUTINE One_Column
This is an educated guess: I think that the subroutine One_Column ought to have an explicit interface. As written, the source code has two compilation units, a program (called main) and an external subroutine (called One_Column).
At compile time the compiler cannot figure out the correct way to call the subroutine from the program. In good old (emphasis on old) Fortran style it takes a leap of faith, leaves it to the linker to find a subroutine with the right name, and crosses its fingers (as it were) that the actual arguments match the dummy arguments at run time. This approach won't work for subroutines returning allocatable data structures.
For a simple fix, move end program to the end of the source file and, on the line it vacates, enter the keyword contains. The compiler will then take care of creating the necessary interface.
For a more scalable fix, put the subroutine into a module and use-associate it.
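Schematically, the simple fix rearranges the source like this (a skeleton only, with the unchanged declarations and bodies elided):
PROGRAM MAIN
IMPLICIT NONE
! ... declarations and executable statements exactly as before ...
CONTAINS ! replaces the old END PROGRAM line
SUBROUTINE One_Column(File_Name,Array_Size,Max_Value,Column1)
! ... dummy-argument declarations and body exactly as in the question ...
END SUBROUTINE One_Column
END PROGRAM MAIN ! moved to the very end of the source file
With One_Column an internal (or, with the module fix, a module) procedure, its interface is explicit and the allocatable dummy argument behaves as intended.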
I think it is important to show the corrected code so that future users can read the question and also see the solution. I broke the subroutine into a series of smaller functions and one subroutine, to keep the data as local as possible, and implemented them in a module. The main program and the module are attached. The main program calls the routines twice, just to show that they can be used modularly to open multiple files.
PROGRAM MAIN
!
! - Author: Jonathan A. Webb
! - Date: December 11, 2014
! - Purpose: This code calls subprograms in module READ_COLUMNAR_FILE
! to determine the number of elements in an input file, the
! largest element in the input file and reads in the column of
! data as an allocatable array
!***************************************************************************
!***************************************************************************
!********************* **********************
!********************* VARIABLE DEFINITIONS **********************
!********************* **********************
!***************************************************************************
!***************************************************************************
USE READ_COLUMNAR_FILE
IMPLICIT NONE
CHARACTER(LEN=13) :: File_Name
INTEGER :: Size_Misc,Size_Bar,Unit_Number
REAL :: Peak_Misc_Value,Peak_Bar_Value
REAL, DIMENSION(:), ALLOCATABLE :: Misc_Array,Bar_Array
!***************************************************************************
!***************************************************************************
!********************* **********************
!********************* FILE READER BLOCK **********************
!********************* **********************
!***************************************************************************
!***************************************************************************
! - This section reads in data from all of the columnar input decks.
! User defines the input file name and number
File_Name = "Misc.txt"; Unit_Number = 100
! Determines the number of rows in the file
Size_Misc = File_Length(File_Name,Unit_Number)
! Yields the allocatable array and the largest element in the array
CALL Read_File(File_Name,Unit_Number,Misc_Array,Peak_Misc_Value)
File_Name = "Bar.txt"; Unit_Number = 100
Size_Bar = File_Length(File_Name,Unit_Number)
CALL Read_File(File_Name,Unit_Number,Bar_Array,Peak_Bar_Value)
END PROGRAM MAIN
MODULE READ_COLUMNAR_FILE
!***********************************************************************************
!***********************************************************************************
! ***
! Author: Jonathan A. Webb ***
! Purpose: Compilation of subprograms required to read in multi-column ***
! data files ***
! Drafted: December 11, 2014 ***
! ***
!***********************************************************************************
!***********************************************************************************
!
!-----------------------------------
! Public functions and subroutines for this module
!-----------------------------------
PUBLIC :: Read_File
PUBLIC :: File_Length
!-----------------------------------
! Private functions and subroutines for this module
!-----------------------------------
PRIVATE :: Check_File
!===============================================================================
CONTAINS
!===============================================================================
SUBROUTINE Check_File(Unit_Number,Open_Status,File_Name)
INTEGER,INTENT(IN) :: Unit_Number
CHARACTER(LEN=13), INTENT(IN) :: File_Name
INTEGER,INTENT(OUT) :: Open_Status
! Check to see if the file exists
OPEN(UNIT=Unit_Number,FILE = File_Name,STATUS='old',ACTION='read', &
IOSTAT = Open_Status)
IF(Open_Status .GT. 0) THEN
WRITE(*,*) "**** Cannot Open ", File_Name," ****"
STOP
RETURN
END IF
END SUBROUTINE Check_File
!===============================================================================
FUNCTION File_Length(File_Name,Unit_Number)
INTEGER :: File_Length
INTEGER, INTENT(IN) :: Unit_Number
CHARACTER(LEN=13),INTENT(IN) :: File_Name
INTEGER :: Open_Status,Input_Status
! Calls subroutine to check on status of file
CALL Check_File(Unit_Number,Open_Status,File_Name)
IF(Open_Status .GT. 0)THEN
WRITE(*,*) "**** Cannot Read", File_Name," ****"
STOP
RETURN
END IF
! Determine File Size
File_Length = 0
DO 300
READ(Unit_Number,*,IOSTAT = Input_Status)
IF(Input_Status .LT. 0) EXIT
File_Length = File_Length + 1
300 CONTINUE
CLOSE(Unit_Number)
END FUNCTION File_Length
!===============================================================================
SUBROUTINE Read_File(File_Name,Unit_Number,Column1,Max_Value)
INTEGER, INTENT(IN) :: Unit_Number
REAL, DIMENSION(:), ALLOCATABLE,INTENT(OUT) :: Column1
CHARACTER(LEN=13),INTENT(IN) :: File_Name
REAL, INTENT(OUT) :: Max_Value
INTEGER :: Array_Size,n
! Determines the array size and allocates the array
Array_Size = File_Length(File_Name,Unit_Number)
ALLOCATE (Column1(Array_Size))
! - Reads in columnar array and determines the element with
! the largest value
Max_Value = 0.0
OPEN(UNIT= Unit_Number,File = File_Name)
DO n = 1,Array_Size
READ(Unit_Number,*) Column1(n)
IF(Column1(n) .GT. Max_Value) Max_Value = Column1(n)
END DO
CLOSE(Unit_Number)
END SUBROUTINE Read_File
!===============================================================================
END MODULE READ_COLUMNAR_FILE