MPI Write taking too much memory at the master node - fortran

I have a DNS (Direct Numerical Simulations) code that has to handle large amounts of data. While running the code, the code is able to read from a file using the MPI_File_read_all, but after a few iterations, when I want to write out the data using MPI_File_write_all I get a memory error (signal 9). When I used Valgrind to memory profile, I see that the master node uses more memory than the rest of the nodes, even though there are no data operations that are exclusive to the master node.
This is the read subroutine
subroutine read3DField(filename,vars)
! Read binary data file using parallel MPI-IO
use commons
include 'sphere.cmn'
character(len=8), intent(in) :: filename ! File name
integer :: ierr
real vars(nxm:nxp,nym:nyp,nzm:nzp,ns+6)
type(MPI_File) :: fh
type(MPI_Status) :: status
integer(kind=MPI_OFFSET_KIND) :: disp
write (*, *) 'Entered read', cartcoord(1)
call MPI_File_open(MPI_COMM_WORLD, filename, &
MPI_MODE_RDONLY, MPI_INFO_NULL, fh, ierr)
write (*, *) 'File opened by ', id
write(*,*) ierr
disp = 0
call MPI_File_set_view(fh, disp, MPI_dp, type_global, &
'native', MPI_INFO_NULL, ierr)
write (*, *) 'view set', id
write(*,*) ierr
call MPI_File_read_all(fh, vars, 1, type_local, status, ierr)
write (*, *) 'read data', id
write(*,*) ierr
call MPI_File_close(fh, ierr)
write(*,*) ierr
end subroutine read3DField
this is the write subroutine
subroutine write3DField(filename,vars)
! Write binary data file using parallel MPI-IO
use commons
character(len=8), intent(in) :: filename ! File name
integer :: ierr
! Array to write
real vars(nxm:nxp,nym:nyp,nzm:nzp,ns+6)
type(MPI_File) :: fh
type(MPI_Status) :: status
integer(kind=MPI_OFFSET_KIND) :: disp
call MPI_File_open(MPI_COMM_WORLD, filename, &
IOR(MPI_MODE_WRONLY, MPI_MODE_CREATE), MPI_INFO_NULL, fh, ierr)
disp = 0
call MPI_File_set_view(fh, disp, MPI_dp, type_global, &
'native', MPI_INFO_NULL, ierr)
call MPI_File_write_all(fh, vars, 1, type_local, status, ierr)
call MPI_File_close(fh, ierr)
end subroutine write3DField
type_global and type_local are MPI data types that are initialized previously with appropriate offsets and committed using MPI_commit.
My question is where is this "excess" memory coming from and how to remove it.

Related

Use MPI_Send and MPI_Recv between Parent and Child process created with MPI_Comm_spawn

I am trying to use MPI_Send() and MPI_Recv() to communicate between a child and its parent process, created by using MPI_Comm_spawn as can be seen below:
Parent.f90
program master
use mpi
implicit none
integer :: ierr, num_procs, my_id, intercomm, i, array(10), tag
CALL MPI_INIT(ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, my_id, ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, num_procs, ierr)
if (.not. (ierr .eq. 0)) then
print*, "S.Unable to initilaize!"
stop
endif
if (my_id .eq. 0) then
call MPI_Comm_spawn("./child.out", MPI_ARGV_NULL, 1, MPI_INFO_NULL, my_id, &
& MPI_COMM_WORLD, intercomm, MPI_ERRCODES_IGNORE, ierr)
call MPI_Send(array, 255, MPI_INTEGER, my_id, tag, intercomm, ierr)
endif
call MPI_Finalize(ierr)
end program master
Child.f90
program name
use mpi
implicit none
! type declaration statements
integer :: ierr, parent, my_id, n_procs, i, array(10), tag, intercomm
logical :: flag, high
! executable statements
call MPI_Init(ierr)
call MPI_Initialized(flag, ierr)
call MPI_Comm_get_parent(parent, ierr)
call MPI_Comm_rank(MPI_COMM_WORLD, my_id, ierr)
call MPI_Comm_size(MPI_COMM_WORLD, n_procs, ierr)
print *, "Initilaized? ", flag
print *, "My mommy is: ", parent
print *, "My rank is:", my_id
tag = 1
call MPI_Recv(array, 255, MPI_INTEGER, MPI_ANY_SOURCE, tag, parent, MPI_STATUS_IGNORE, ierr)
print *, "Client received array."
call MPI_Finalize(ierr)
end program name
When the above program is run, the Parent seem to run through fine, but the Child never prints: "Client received array.", leading me to believe that I have messed something up with the send/recv.
If it is not clear what I am trying to achieve, I want the parent to spawn a child, send an array to that child, the child to process the array and the child to send the array back to the parent. (italics is yet to be written, I want to get this basic communication working first)
At the moment, when I run: mpiexec -np 1 parent.out, the child prints:
Initilaized? T
My mommy is: 3
My rank is: 0
but not "Client received array."
I was able to solve my issue. The following code starts a parent, sends an array of size 1000000 to its child, the child squares the array and sends it back to its parent.
Parent.f90
program master
use mpi
implicit none
integer :: ierr, num_procs, my_id, intercomm, i, array(1000000), s_tag, s_dest, siffra
CALL MPI_INIT(ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, my_id, ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, num_procs, ierr)
!print *, "S.Rank =", my_id
!print *, "S.Size =", num_procs
if (.not. (ierr .eq. 0)) then
print*, "S.Unable to initilaize bös!"
stop
endif
do i=1,size(array)
array(i) = 2
enddo
if (my_id .eq. 0) then
call MPI_Comm_spawn("./client2.out", MPI_ARGV_NULL, 1, MPI_INFO_NULL, my_id, &
& MPI_COMM_WORLD, intercomm, MPI_ERRCODES_IGNORE, ierr)
s_dest = 0 !rank of destination (integer)
s_tag = 1 !message tag (integer)
call MPI_Send(array(1), 1000000, MPI_INTEGER, s_dest, s_tag, intercomm, ierr)
call MPI_Recv(array(1), 1000000, MPI_INTEGER, s_dest, s_tag, intercomm, MPI_STATUS_IGNORE, ierr)
!do i=1,10
! print *, "S.Array(",i,"): ", array(i)
!enddo
endif
call MPI_Finalize(ierr)
end program master
Child.f90
program name
use mpi
implicit none
! type declaration statements
integer :: ierr, parent, my_id, n_procs, i, array(1000000), ctag, csource, intercomm, siffra
logical :: flag
! executable statements
call MPI_Init(ierr)
call MPI_Initialized(flag, ierr)
call MPI_Comm_get_parent(parent, ierr)
call MPI_Comm_rank(MPI_COMM_WORLD, my_id, ierr)
call MPI_Comm_size(MPI_COMM_WORLD, n_procs, ierr)
csource = 0 !rank of source
ctag = 1 !message tag
call MPI_Recv(array(1), 1000000, MPI_INTEGER, csource, ctag, parent, MPI_STATUS_IGNORE, ierr)
!do i=1,10
! print *, "C.Array(",i,"): ", array(i)
!enddo
do i=1,size(array)
array(i) = array(i)**2
enddo
!do i=1,10
! print *, "C.Array(",i,"): ", array(i)
!enddo
call MPI_Send(array(1), 1000000, MPI_INTEGER, csource, ctag, parent, ierr)
call MPI_Finalize(ierr)
end program name

Creating HDF5 File and datasets with OpenMPI

I need to write my HDF5 datasets to one HDF5 file in parallel and I want to create my file just with one thread and to do this I can use if statements like:
if( currentThread == 0)
{
createHDF5File( );
}
But I don't know which thread will come first. For example, when thread 1 comes first then it tries to write a dataset to a non-existing file. Is there any way to select the first thread? Or is there any better way to do this?
It sounds like you really should be using parallel IO with HDF5. HDF5 is capable of using MPI-IO (under the hood), if you built it with parallel support.
Here's a sample program (in Fortran).
! Program to use MPI_Cart and Parallel HDF5
!
program hdf_pwrite
use, intrinsic :: iso_c_binding, only: c_double
use mpi
use hdf5
use kinds, only : r_dp
implicit none
! external interface
interface
subroutine get_walltime(t) &
bind(c, name="get_walltime")
import :: c_double
real(kind=c_double), intent(out) :: t
end subroutine get_walltime
end interface
! Local 4000x4000 with a 1x1 halo
integer, parameter :: ndims = 2
integer, parameter :: N = 4000
integer, parameter :: halo = 1
integer :: argc ! Command line args
integer :: ierr ! Error status
integer :: id ! My rank/ID
integer :: np ! Number of processors
integer :: iunit ! File descriptor
integer :: i,j ! Loop indexers
integer :: total ! Total dimension size
integer :: lcount ! Luster count size
integer :: lsize ! Lustre stripe size
character(len=1024) :: clcount, clsize ! Strings of LFS
integer :: info ! MPI IO Info
integer :: m_dims(ndims) ! MPI cart dims
integer :: coords(ndims) ! Co-ords of procs in the grid
logical :: is_periodic(ndims) ! Periodic boundary conditions
logical :: reorder ! Reorder the MPI structure
integer :: MPI_COMM_2D ! New communicator
integer(KIND=MPI_OFFSET_KIND) :: offset
character(len=1024) :: filename
integer(kind=hid_t) :: p_id, f_id, x_id, d_id
integer(kind=hid_t) :: memspace, filespace
! Local hyper slab info
integer(kind=hsize_t) :: d_size(ndims), s_size(ndims), h_size(ndims),&
stride(ndims), block(ndims)
! Global hyper slab info
integer(kind=hsize_t) :: g_size(ndims), g_start(ndims)
real(kind=r_dp), allocatable :: ld(:,:)
! Timing vars
real(kind=r_dp) :: s, e, dt, mdt
argc = 0
ierr = 0
offset = 0
m_dims = (/ 0, 0/)
is_periodic = .false. ! Non-periodic
reorder = .false. ! Not allowed to reorder
call mpi_init(ierr)
! Set up the MPI cartesian topology
call mpi_comm_size(MPI_COMM_WORLD, np, ierr)
call mpi_dims_create(np, ndims, m_dims, ierr)
call mpi_cart_create(MPI_COMM_WORLD, ndims, m_dims, is_periodic, &
reorder, MPI_COMM_2D, ierr)
call mpi_comm_rank(MPI_COMM_2D, id, ierr)
call mpi_cart_coords(MPI_COMM_2D, id, ndims, coords, ierr)
if (id .eq. 0) then
if (mod(N,np) .ne. 0) then
write(0,*) 'Must use divisiable number of procs.'
call mpi_abort(MPI_COMM_WORLD, 1, ierr)
endif
! get the filename
argc = iargc()
if (argc .lt. 1 ) then
write(0, *) 'Must supply a filename'
call exit(1)
endif
call get_command_argument(1, filename)
endif
! Broadcast the filename
call mpi_bcast(filename, len(filename), MPI_CHAR, 0, &
MPI_COMM_WORLD, ierr)
! Init the HDF5 library
call h5open_f(ierr)
! Set a stripe count of 4 and a stripe size of 4MB
lcount = 4
lsize = 4 * 1024 * 1024
write(clcount, '(I4)') lcount
write(clsize, '(I8)') lsize
call mpi_info_create(info, ierr)
call mpi_info_set(info, "striping_factor", trim(clcount), ierr)
call mpi_info_set(info, "striping_unit", trim(clsize), ierr)
! Set up the access properties
call h5pcreate_f(H5P_FILE_ACCESS_F, p_id, ierr)
call h5pset_fapl_mpio_f(p_id, MPI_COMM_2D, info, ierr)
! Open the file
call h5fcreate_f(filename, H5F_ACC_TRUNC_F, f_id, ierr, &
access_prp = p_id)
if (ierr .ne. 0) then
write(0,*) 'Unable to open: ', trim(filename), ': ', ierr
call mpi_abort(MPI_COMM_WORLD, 1, ierr)
endif
! Generate our 4000x4000 matrix with a 1x1 halo
total = N + 2 * halo
allocate(ld(0:total-1, 0:total-1))
ld = -99.99
! init the local data
do j = 1, N
do i = 1, N
ld(i,j) = (i - 1 + (j-1)*N)
enddo
enddo
! Create the local memory space and hyperslab
do i = 1, ndims
d_size(i) = total
s_size(i) = N
h_size(i) = halo
stride(i) = 1
block(i) = 1
enddo
call h5screate_simple_f(ndims, d_size, memspace, ierr)
call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, &
h_size, s_size, ierr, &
stride, block)
! Create the global file space and hyperslab
do i = 1, ndims
g_size(i) = N * m_dims(i)
g_start(i) = N * coords(i)
enddo
call h5screate_simple_f(ndims, g_size, filespace, ierr)
call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, &
g_start, s_size, ierr, &
stride, block)
! Create a data transfer property
call h5pcreate_f(H5P_DATASET_XFER_F, x_id, ierr)
call h5pset_dxpl_mpio_f(x_id, H5FD_MPIO_COLLECTIVE_F, ierr)
! Create the dataset id
call h5dcreate_f(f_id, "/data", H5T_IEEE_F64LE, filespace, d_id, &
ierr)
! Write the data
call get_walltime(s)
call h5dwrite_f(d_id, H5T_NATIVE_DOUBLE, ld, s_size, ierr, &
file_space_id=filespace, mem_space_id=memspace, &
xfer_prp=x_id)
call get_walltime(e)
dt = e - s
call mpi_reduce(dt, mdt, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_2D, ierr)
if (id .eq. 0) then
write(6,*) mdt / np
endif
if (allocated(ld)) then
deallocate(ld)
endif
! Close everything and exit
call h5dclose_f(d_id, ierr)
call h5sclose_f(filespace, ierr)
call h5sclose_f(memspace, ierr)
call h5pclose_f(x_id, ierr)
call h5pclose_f(p_id, ierr)
call h5fclose_f(f_id, ierr)
call h5close_f(ierr)
call mpi_finalize(ierr)
end program hdf_pwrite
Please note this is my teaching example that I interactively get the class to play with. So there are a few different things in it.
I introduce the iso_c_binding as we have a timing routine in C (gettimeofday) wrapper.
I use MPI topologies.
The root rank is the only one that processes the filename to write, and then we broadcast this to all ranks.
We set a stripe count and size for the lustre file system.
Use hyper slabs for the data placement.
Use MPI IO collective call.
Hope this helps.
Do you calculate the data you want to write in parallel? If so, you want to make sure that all workers have finished their processing before you write, so that your data is in fact complete.
In other words,
// Collect all the data using some form of MPI_Collect, MPI_Reduce
// or whatevs. I'll just put this here for proof-of-concept
MPI_Barrier();
// Now, all the threads have "joined", so you can write from 0 without worrying
// that some other thread got here way before
if (currentThread == 0) { createdHDF5File(); }
If not, I assume you want to write data from each thread. Why not just write it to different files?
// Calculate stuff on each thread
// Then write to different files depending on thread num
createHDF5File(currentThread); // Chooses file name that includes the thread num

Block execution until children called via MPI_Comm_spawn have finished, no access to children

I would like to spawn the child.exe from the main.f90 code, and I don't have access to modify the child.f90. A simplified code looks like:
main.f90:
program main
implicit none
include 'mpif.h'
integer, parameter:: root = 0
integer, dimension(:), allocatable:: errCodes, status
integer:: numprocs, request, ierr, taskid, INTERCOMM
character*(*), PARAMETER:: cmd = "./child.exe"
!
call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world, taskid, ierr)
call mpi_comm_size(mpi_comm_world, numprocs, ierr )
call mpi_barrier(mpi_comm_world, ierr)
!
allocate( errCodes(numprocs), status(numprocs))
print*, "from parent, CPU: ", taskid
call MPI_COMM_SPAWN(cmd, MPI_ARGV_NULL, 2, MPI_INFO_NULL, root, MPI_COMM_WORLD,INTERCOMM, errCodes, ierr)
!
call mpi_barrier(mpi_comm_world, ierr)
!
print*, "After children, from parent CPU", taskid
call mpi_finalize(ierr)
end program main
child.f90:
program child
implicit none
include 'mpif.h'
integer:: numprocs, temp, ierr, arg_count, taskid, maxproces, errCodes(2)
!
call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world, taskid, ierr)
call mpi_comm_size(mpi_comm_world, numprocs, ierr )
!
call sleep(1)
!
print*, "from child, CPU: ", taskid
call mpi_finalize(ierr)
end program child
My question is similar to this one in C++, but I would appreciate it if someone could clarify how to do the process if I don't have access to the children. I have tried to use mpi_barrier, but it is not helpful.
For me the Hristo's advice from the linked answer works quite well, just follow it closely. Pay attention to the communicators (INTERCOMM, parent). There is still some problem at the end, I have never worked with child processes before, but it should point you to the right direction.
program main
use mpi
implicit none
integer, parameter:: root = 0
integer, dimension(:), allocatable:: errCodes, status
integer:: numprocs, request, ierr, taskid, INTERCOMM
character*(*), PARAMETER:: cmd = "./child.exe"
!
call mpi_init(ierr)
call mpi_comm_rank(mpi_comm_world, taskid, ierr)
call mpi_comm_size(mpi_comm_world, numprocs, ierr )
call mpi_barrier(mpi_comm_world, ierr)
!
allocate( errCodes(numprocs), status(numprocs))
print*, "from parent, CPU: ", taskid
call MPI_COMM_SPAWN(cmd, MPI_ARGV_NULL, 1, MPI_INFO_NULL, root, MPI_COMM_WORLD, INTERCOMM, errCodes, ierr)
!
call mpi_barrier(INTERCOMM, ierr)
!
call mpi_barrier(INTERCOMM, ierr)
print*, "After children, from parent CPU", taskid
call mpi_finalize(ierr)
end program main
program child
use mpi
implicit none
integer:: numprocs, temp, ierr, arg_count, taskid, maxproces, errCodes(2), parent
!
call mpi_init(ierr)
call mpi_Comm_get_parent(parent, ierr)
call mpi_comm_rank(parent, taskid, ierr)
call mpi_comm_size(parent, numprocs, ierr )
!
call sleep(5)
call mpi_barrier(parent, ierr)
!
print*, "from child, CPU: ", taskid
call sleep(1)
call mpi_barrier(parent, ierr)
! call sleep(2)
call mpi_finalize(ierr)
end program child

Sending linked list through MPI

I have seen this question being asked many times, but didn't find an answer that could resolve my issue. I want to be able to send a linked list, in Fortran, to another process through MPI. I have done something similar where the derived data type in the linked list was as follows
type a
{
integer :: varA
type(a), pointer :: next=>null()
real :: varB
}
The way I did this was to create an MPI datatype which contained all the varA values together, and receive it as an array of integers. Then do the same for varB.
What I am trying to do now is to create the linked list, and then pack all the varA and varB values together to form the MPI datatype. I give below the code that does this.
PROGRAM TEST
USE MPI
IMPLICIT NONE
TYPE a
INTEGER:: b
REAL :: e
TYPE(a), POINTER :: nextPacketInList => NULL()
END TYPE
TYPE PacketComm
INTEGER :: numPacketsToComm
TYPE(a), POINTER :: PacketListHeadPtr => NULL()
TYPE(a), POINTER :: PacketListTailPtr => NULL()
END TYPE PacketComm
TYPE(PacketComm), DIMENSION(:), ALLOCATABLE :: PacketCommArray
INTEGER :: packPacketDataType !New data type
INTEGER :: ierr, size, rank, dest, ind
integer :: b
real :: e
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
IF(.NOT. ALLOCATED(PacketCommArray)) THEN
ALLOCATE(PacketCommArray(0:size-1), STAT=ierr)
DO ind=0, size-1
PacketCommArray(ind)%numPacketsToComm = 0
END DO
ENDIF
b = 2
e = 4
dest = 1
CALL addPacketToList(b, e, dest)
b = 3
e = 5
dest = 1
CALL addPacketToList(b, e, dest)
dest = 1
CALL packPacketList(dest)
IF(rank == 0) THEN
dest = 1
CALL sendPacketList(dest)
ELSE
CALL recvPacketList()
ENDIF
CALL MPI_FINALIZE(ierr)
CONTAINS
SUBROUTINE addPacketToList(b, e, rank)
IMPLICIT NONE
INTEGER :: b, rank, ierr
REAL :: e
TYPE(a), POINTER :: head
IF(.NOT. ASSOCIATED(PacketCommArray(rank)%PacketListHeadPtr)) THEN
ALLOCATE(PacketCommArray(rank)%PacketListHeadPtr, STAT=ierr)
PacketCommArray(rank)%PacketListHeadPtr%b = b
PacketCommArray(rank)%PacketListHeadPtr%e = e
PacketCommArray(rank)%PacketListHeadPtr%nextPacketInList => NULL()
PacketCommArray(rank)%PacketListTailPtr => PacketCommArray(rank)%PacketListHeadPtr
PacketCommArray(rank)%numPacketsToComm = PacketCommArray(rank)%numPacketsToComm+1
ELSE
ALLOCATE(PacketCommArray(rank)%PacketListTailPtr%nextPacketInList, STAT=ierr)
PacketCommArray(rank)%PacketListTailPtr => PacketCommArray(rank)%PacketListTailPtr%nextPacketInList
PacketCommArray(rank)%PacketListTailPtr%b = b
PacketCommArray(rank)%PacketListTailPtr%e = e
PacketCommArray(rank)%PacketListTailPtr%nextPacketInList => NULL()
PacketCommArray(rank)%numPacketsToComm = PacketCommArray(rank)%numPacketsToComm+1
ENDIF
END SUBROUTINE addPacketToList
SUBROUTINE packPacketList(rank)
IMPLICIT NONE
INTEGER :: rank
INTEGER :: numListNodes
INTEGER(kind=MPI_ADDRESS_KIND), DIMENSION(:), ALLOCATABLE :: listNodeAddr
INTEGER(kind=MPI_ADDRESS_KIND), DIMENSION(:), ALLOCATABLE :: listNodeDispl
INTEGER, DIMENSION(:), ALLOCATABLE :: listNodeTypes
INTEGER, DIMENSION(:), ALLOCATABLE :: listNodeCount
TYPE(a), POINTER :: head
INTEGER :: numNode
head => PacketCommArray(rank)%PacketListHeadPtr
numListNodes = PacketCommArray(rank)%numPacketsToComm
PRINT *, ' Number of nodes to allocate for rank ', rank , ' is ', numListNodes
ALLOCATE(listNodeTypes(2*numListNodes), stat=ierr)
ALLOCATE(listNodeCount(2*numListNodes), stat=ierr)
DO numNode=1, 2*numListNodes, 2
listNodeTypes(numNode) = MPI_INTEGER
listNodeTypes(numNode+1) = MPI_REAL
END DO
DO numNode=1, 2*numListNodes, 2
listNodeCount(numNode) = 1
listNodeCount(numNode+1) = 1
END DO
ALLOCATE(listNodeAddr(2*numListNodes), stat=ierr)
ALLOCATE(listNodeDispl(2*numListNodes), stat=ierr)
numNode = 1
DO WHILE(ASSOCIATED(head))
CALL MPI_GET_ADDRESS(head%b, listNodeAddr(numNode), ierr)
CALL MPI_GET_ADDRESS(head%e, listNodeAddr(numNode+1), ierr)
numNode = numNode + 2
head => head%nextPacketInList
END DO
DO numNode=1, UBOUND(listNodeAddr,1)
listNodeDispl(numNode) = listNodeAddr(numNode) - listNodeAddr(1)
END DO
CALL MPI_TYPE_CREATE_STRUCT(UBOUND(listNodeAddr,1), listNodeCount, listNodeDispl, listNodeTypes, packPacketDataType, ierr)
CALL MPI_TYPE_COMMIT(packPacketDataType, ierr)
END SUBROUTINE packPacketList
SUBROUTINE sendPacketList(rank)
IMPLICIT NONE
INTEGER :: rank, ierr, numNodes
TYPE(a), POINTER :: head
head => PacketCommArray(rank)%PacketListHeadPtr
numNodes = PacketCommArray(rank)%numPacketsToComm
CALL MPI_SSEND(head%b, 1, packPacketDataType, rank, 0, MPI_COMM_WORLD, ierr)
END SUBROUTINE sendPacketList
SUBROUTINE recvPacketList
IMPLICIT NONE
TYPE(a), POINTER :: head
TYPE(a), DIMENSION(:), ALLOCATABLE :: RecvPacketCommArray
INTEGER, DIMENSION(:), ALLOCATABLE :: recvB
INTEGER :: numNodes, ierr, numNode
INTEGER, DIMENSION(MPI_STATUS_SIZE):: status
head => PacketCommArray(rank)%PacketListHeadPtr
numNodes = PacketCommArray(rank)%numPacketsToComm
ALLOCATE(RecvPacketCommArray(numNodes), stat=ierr)
ALLOCATE(recvB(numNodes), stat=ierr)
CALL MPI_RECV(RecvPacketCommArray, 1, packPacketDataType, 0, 0, MPI_COMM_WORLD, status, ierr)
DO numNode=1, numNodes
PRINT *, ' value in b', RecvPacketCommArray(numNode)%b
PRINT *, ' value in e', RecvPacketCommArray(numNode)%e
END DO
END SUBROUTINE recvPacketList
END PROGRAM TEST
So basically I create a linked list with two nodes containing the following data
Node 1
b = 2, e = 4
Node 2
b = 3, e = 5
When I run this code on two cores, the results I get on core 1 are
value in b 2
value in e 4.000000
value in b 0
value in e 0.0000000E+00
So my code seems to send the data in the first node of the linked list correctly, but not the second one. Please could someone let me know if what I am trying to do is feasible, and what is wrong with the code. I know I can send the values of b in all nodes together and then the values of e together. But my derived data type will probably contain more variables (including arrays) and I want to be able to send all the data in one go instead of using multiple sends.
Thanks
It's not easy for me to read that code, but it seems like you are expecting the receiving buffer to get contiguous data, which is not the case. The strange type you construct by computing address offsets is not going to match the receiving buffer. To illustrate this, I though I might present this simple example (it's quickly written, don't take it as a good code example):
program example
use mpi
integer :: nprocs, myrank
integer :: buf(4)
integer :: n_elements
integer :: len_element(2)
integer(MPI_ADDRESS_KIND) :: disp_element(2)
integer :: type_element(2)
integer :: newtype
integer :: istat(MPI_STATUS_SIZE)
integer :: ierr
call mpi_init(ierr)
call mpi_comm_size(mpi_comm_world, nprocs, ierr)
call mpi_comm_rank(mpi_comm_world, myrank, ierr)
! simple example illustrating mpi_type_create_struct
! take an integer array buf(4):
! on rank 0: [ 7, 2, 6, 4 ]
! on rank 1: [ 1, 1, 1, 1 ]
! and we create a struct to send only elements 1 and 3
! so that on rank 1 we'll get [7, 1, 6, 1]
if (myrank == 0) then
buf = [7, 2, 6, 4]
else
buf = 1
end if
n_elements = 2
len_element = 1
disp_element(1) = 0
disp_element(2) = 8
type_element = MPI_INTEGER
call mpi_type_create_struct(n_elements, len_element, disp_element, type_element, newtype, ierr)
call mpi_type_commit(newtype, ierr)
write(6,'(1x,a,i2,1x,a,4i2)') 'SEND| rank ', myrank, 'buf = ', buf
if (myrank == 0) then
call mpi_send (buf, 1, newtype, 1, 13, MPI_COMM_WORLD, ierr)
else
call mpi_recv (buf, 1, newtype, 0, 13, MPI_COMM_WORLD, istat, ierr)
!the below call does not scatter the received integers, try to see the difference
!call mpi_recv (buf, 2, MPI_INTEGER, 0, 13, MPI_COMM_WORLD, istat, ierr)
end if
write(6,'(1x,a,i2,1x,a,4i2)') 'RECV| rank ', myrank, 'buf = ', buf
end program
I hope this clearly shows that the receiving buffer will have to accommodate any offsets in the constructed type, and does not receive any contiguous data.
EDIT: updated the code to illustrate a different receive type that does not scatter the data.

MPI_ALLGATHER error parallelizing code

I'm trying to parallelize the following code.
subroutine log_likelihood(y, theta, lli, ll)
doubleprecision, allocatable, intent(in) :: y(:)
doubleprecision, intent(in) :: theta(2)
doubleprecision, allocatable, intent(out) :: lli(:)
doubleprecision, intent(out) :: ll
integer :: i
ALLOCATE (lli(size(y)))
lli = 0.0d0
ll = 0.0d0
do i = 1, size(y)
lli(i) = -log(sqrt(theta(2))) - 0.5*log(2.0d0*pi) &
- (1.0d0/(2.0d0*theta(2)))*((y(i)-theta(1))**2)
end do
ll = sum(lli)
end subroutine log_likelihood
To do this, I'm trying to use MPI_ALLGATHER. This is the code I wrote
subroutine log_likelihood(y, theta, lli, ll)
doubleprecision, allocatable, intent(in) :: y(:)
doubleprecision, intent(in) :: theta(2)
doubleprecision, allocatable, intent(out) :: lli(:)
doubleprecision, intent(out) :: ll
integer :: i, size_y, diff
size_y=size(y)
ALLOCATE (lli(size_y))
!Broadcasting
call MPI_BCAST(theta, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
call MPI_BCAST(y, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
! Determine how many points to handle with each proc
points_per_proc = (size_y + numprocs - 1)/numprocs
! Determine start and end index for this proc's points
istart = proc_num * points_per_proc + 1
iend = min((proc_num + 1)*points_per_proc, size_y)
diff = iend-istart+1
allocate(proc_contrib(istart:iend))
do i = istart, iend
proc_contrib(i) = -log(sqrt(theta(2))) - 0.5*log(2.0d0*pi) &
- (1.0d0/(2.0d0*theta(2)))*((y(i)-theta(1))**2)
end do
call MPI_ALLGATHER(proc_contrib, diff, MPI_DOUBLE_PRECISION, &
lli, diff, MPI_DOUBLE_PRECISION, &
MPI_COMM_WORLD, ierr)
ll = sum(lli)
end subroutine log_likelihood
When I try to run my program, I get the following error.
$ mpiexec -n 2 ./mle.X
Fatal error in PMPI_Allgather: Internal MPI error!, error stack:
PMPI_Allgather(961)......: MPI_Allgather(sbuf=0x7ff2f251b860, scount=1500000, MPI_DOUBLE_PRECISION, rbuf=0x7ff2f2ad5650, rcount=3000000, MPI_DOUBLE_PRECISION, MPI_COMM_WORLD) failed
MPIR_Allgather_impl(807).:
MPIR_Allgather(766)......:
MPIR_Allgather_intra(560):
MPIR_Localcopy(357)......: memcpy arguments alias each other, dst=0x7ff2f2ad5650 src=0x7ff2f251b860 len=12000000
===================================================================================
= BAD TERMINATION OF ONE OF YOUR APPLICATION PROCESSES
= EXIT CODE: 1
= CLEANING UP REMAINING PROCESSES
= YOU CAN IGNORE THE BELOW CLEANUP MESSAGES
===================================================================================
Can somebody please explain to me what I'm doing wrong?
Thanks!
I finally was able to solve my problem. The serial and parallel versions of my code are available at https://bitbucket.org/ignacio82/bhhh
subroutine log_likelihood(y, n, theta, lli, ll)
integer, intent(in) :: n
doubleprecision, intent(in) :: y(n)
doubleprecision, intent(in) :: theta(2)
doubleprecision, intent(out) :: lli(n)
doubleprecision, intent(out) :: ll
integer :: i
do i = istart, iend
proc_contrib(i-istart+1) = -log(sqrt(theta(2))) - 0.5*log(2.0d0*pi) &
- (1.0d0/(2.0d0*theta(2)))*((y(i)-theta(1))**2)
end do
if ( mod(n,numprocs)==0 ) then
call MPI_ALLGATHER(proc_contrib, points_per_proc, MPI_DOUBLE_PRECISION, &
lli, points_per_proc, MPI_DOUBLE_PRECISION, &
MPI_COMM_WORLD, ierr)
else if (numprocs-1 == proc_num ) then
recvcounts(numprocs) = iend-istart+1
call MPI_ALLGATHERV(proc_contrib, points_per_proc, MPI_DOUBLE_PRECISION, &
lli, recvcounts, displs, MPI_DOUBLE_PRECISION, &
MPI_COMM_WORLD, ierr)
end if
ll = sum(lli)
end subroutine log_likelihood