I need to write my HDF5 datasets to one HDF5 file in parallel and I want to create my file just with one thread and to do this I can use if statements like:
if( currentThread == 0)
{
createHDF5File( );
}
But I don't know which thread will come first. For example, when thread 1 comes first then it tries to write a dataset to a non-existing file. Is there any way to select the first thread? Or is there any better way to do this?
It sounds like you really should be using parallel IO with HDF5. HDF5 is capable of using MPI-IO (under the hood), if you built it with parallel support.
Here's a sample program (in Fortran).
! Program to use MPI_Cart and Parallel HDF5
!
program hdf_pwrite
use, intrinsic :: iso_c_binding, only: c_double
use mpi
use hdf5
use kinds, only : r_dp
implicit none
! external interface
interface
subroutine get_walltime(t) &
bind(c, name="get_walltime")
import :: c_double
real(kind=c_double), intent(out) :: t
end subroutine get_walltime
end interface
! Local 4000x4000 with a 1x1 halo
integer, parameter :: ndims = 2
integer, parameter :: N = 4000
integer, parameter :: halo = 1
integer :: argc ! Command line args
integer :: ierr ! Error status
integer :: id ! My rank/ID
integer :: np ! Number of processors
integer :: iunit ! File descriptor
integer :: i,j ! Loop indexers
integer :: total ! Total dimension size
integer :: lcount ! Luster count size
integer :: lsize ! Lustre stripe size
character(len=1024) :: clcount, clsize ! Strings of LFS
integer :: info ! MPI IO Info
integer :: m_dims(ndims) ! MPI cart dims
integer :: coords(ndims) ! Co-ords of procs in the grid
logical :: is_periodic(ndims) ! Periodic boundary conditions
logical :: reorder ! Reorder the MPI structure
integer :: MPI_COMM_2D ! New communicator
integer(KIND=MPI_OFFSET_KIND) :: offset
character(len=1024) :: filename
integer(kind=hid_t) :: p_id, f_id, x_id, d_id
integer(kind=hid_t) :: memspace, filespace
! Local hyper slab info
integer(kind=hsize_t) :: d_size(ndims), s_size(ndims), h_size(ndims),&
stride(ndims), block(ndims)
! Global hyper slab info
integer(kind=hsize_t) :: g_size(ndims), g_start(ndims)
real(kind=r_dp), allocatable :: ld(:,:)
! Timing vars
real(kind=r_dp) :: s, e, dt, mdt
argc = 0
ierr = 0
offset = 0
m_dims = (/ 0, 0/)
is_periodic = .false. ! Non-periodic
reorder = .false. ! Not allowed to reorder
call mpi_init(ierr)
! Set up the MPI cartesian topology
call mpi_comm_size(MPI_COMM_WORLD, np, ierr)
call mpi_dims_create(np, ndims, m_dims, ierr)
call mpi_cart_create(MPI_COMM_WORLD, ndims, m_dims, is_periodic, &
reorder, MPI_COMM_2D, ierr)
call mpi_comm_rank(MPI_COMM_2D, id, ierr)
call mpi_cart_coords(MPI_COMM_2D, id, ndims, coords, ierr)
if (id .eq. 0) then
if (mod(N,np) .ne. 0) then
write(0,*) 'Must use divisiable number of procs.'
call mpi_abort(MPI_COMM_WORLD, 1, ierr)
endif
! get the filename
argc = iargc()
if (argc .lt. 1 ) then
write(0, *) 'Must supply a filename'
call exit(1)
endif
call get_command_argument(1, filename)
endif
! Broadcast the filename
call mpi_bcast(filename, len(filename), MPI_CHAR, 0, &
MPI_COMM_WORLD, ierr)
! Init the HDF5 library
call h5open_f(ierr)
! Set a stripe count of 4 and a stripe size of 4MB
lcount = 4
lsize = 4 * 1024 * 1024
write(clcount, '(I4)') lcount
write(clsize, '(I8)') lsize
call mpi_info_create(info, ierr)
call mpi_info_set(info, "striping_factor", trim(clcount), ierr)
call mpi_info_set(info, "striping_unit", trim(clsize), ierr)
! Set up the access properties
call h5pcreate_f(H5P_FILE_ACCESS_F, p_id, ierr)
call h5pset_fapl_mpio_f(p_id, MPI_COMM_2D, info, ierr)
! Open the file
call h5fcreate_f(filename, H5F_ACC_TRUNC_F, f_id, ierr, &
access_prp = p_id)
if (ierr .ne. 0) then
write(0,*) 'Unable to open: ', trim(filename), ': ', ierr
call mpi_abort(MPI_COMM_WORLD, 1, ierr)
endif
! Generate our 4000x4000 matrix with a 1x1 halo
total = N + 2 * halo
allocate(ld(0:total-1, 0:total-1))
ld = -99.99
! init the local data
do j = 1, N
do i = 1, N
ld(i,j) = (i - 1 + (j-1)*N)
enddo
enddo
! Create the local memory space and hyperslab
do i = 1, ndims
d_size(i) = total
s_size(i) = N
h_size(i) = halo
stride(i) = 1
block(i) = 1
enddo
call h5screate_simple_f(ndims, d_size, memspace, ierr)
call h5sselect_hyperslab_f(memspace, H5S_SELECT_SET_F, &
h_size, s_size, ierr, &
stride, block)
! Create the global file space and hyperslab
do i = 1, ndims
g_size(i) = N * m_dims(i)
g_start(i) = N * coords(i)
enddo
call h5screate_simple_f(ndims, g_size, filespace, ierr)
call h5sselect_hyperslab_f(filespace, H5S_SELECT_SET_F, &
g_start, s_size, ierr, &
stride, block)
! Create a data transfer property
call h5pcreate_f(H5P_DATASET_XFER_F, x_id, ierr)
call h5pset_dxpl_mpio_f(x_id, H5FD_MPIO_COLLECTIVE_F, ierr)
! Create the dataset id
call h5dcreate_f(f_id, "/data", H5T_IEEE_F64LE, filespace, d_id, &
ierr)
! Write the data
call get_walltime(s)
call h5dwrite_f(d_id, H5T_NATIVE_DOUBLE, ld, s_size, ierr, &
file_space_id=filespace, mem_space_id=memspace, &
xfer_prp=x_id)
call get_walltime(e)
dt = e - s
call mpi_reduce(dt, mdt, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_2D, ierr)
if (id .eq. 0) then
write(6,*) mdt / np
endif
if (allocated(ld)) then
deallocate(ld)
endif
! Close everything and exit
call h5dclose_f(d_id, ierr)
call h5sclose_f(filespace, ierr)
call h5sclose_f(memspace, ierr)
call h5pclose_f(x_id, ierr)
call h5pclose_f(p_id, ierr)
call h5fclose_f(f_id, ierr)
call h5close_f(ierr)
call mpi_finalize(ierr)
end program hdf_pwrite
Please note this is my teaching example that I interactively get the class to play with. So there are a few different things in it.
I introduce the iso_c_binding as we have a timing routine in C (gettimeofday) wrapper.
I use MPI topologies.
The root rank is the only one that processes the filename to write, and then we broadcast this to all ranks.
We set a stripe count and size for the lustre file system.
Use hyper slabs for the data placement.
Use MPI IO collective call.
Hope this helps.
Do you calculate the data you want to write in parallel? If so, you want to make sure that all workers have finished their processing before you write, so that your data is in fact complete.
In other words,
// Collect all the data using some form of MPI_Collect, MPI_Reduce
// or whatevs. I'll just put this here for proof-of-concept
MPI_Barrier();
// Now, all the threads have "joined", so you can write from 0 without worrying
// that some other thread got here way before
if (currentThread == 0) { createdHDF5File(); }
If not, I assume you want to write data from each thread. Why not just write it to different files?
// Calculate stuff on each thread
// Then write to different files depending on thread num
createHDF5File(currentThread); // Chooses file name that includes the thread num
Related
I have a DNS (Direct Numerical Simulations) code that has to handle large amounts of data. While running the code, the code is able to read from a file using the MPI_File_read_all, but after a few iterations, when I want to write out the data using MPI_File_write_all I get a memory error (signal 9). When I used Valgrind to memory profile, I see that the master node uses more memory than the rest of the nodes, even though there are no data operations that are exclusive to the master node.
This is the read subroutine
subroutine read3DField(filename,vars)
! Read binary data file using parallel MPI-IO
use commons
include 'sphere.cmn'
character(len=8), intent(in) :: filename ! File name
integer :: ierr
real vars(nxm:nxp,nym:nyp,nzm:nzp,ns+6)
type(MPI_File) :: fh
type(MPI_Status) :: status
integer(kind=MPI_OFFSET_KIND) :: disp
write (*, *) 'Entered read', cartcoord(1)
call MPI_File_open(MPI_COMM_WORLD, filename, &
MPI_MODE_RDONLY, MPI_INFO_NULL, fh, ierr)
write (*, *) 'File opened by ', id
write(*,*) ierr
disp = 0
call MPI_File_set_view(fh, disp, MPI_dp, type_global, &
'native', MPI_INFO_NULL, ierr)
write (*, *) 'view set', id
write(*,*) ierr
call MPI_File_read_all(fh, vars, 1, type_local, status, ierr)
write (*, *) 'read data', id
write(*,*) ierr
call MPI_File_close(fh, ierr)
write(*,*) ierr
end subroutine read3DField
this is the write subroutine
subroutine write3DField(filename,vars)
! Write binary data file using parallel MPI-IO
use commons
character(len=8), intent(in) :: filename ! File name
integer :: ierr
! Array to write
real vars(nxm:nxp,nym:nyp,nzm:nzp,ns+6)
type(MPI_File) :: fh
type(MPI_Status) :: status
integer(kind=MPI_OFFSET_KIND) :: disp
call MPI_File_open(MPI_COMM_WORLD, filename, &
IOR(MPI_MODE_WRONLY, MPI_MODE_CREATE), MPI_INFO_NULL, fh, ierr)
disp = 0
call MPI_File_set_view(fh, disp, MPI_dp, type_global, &
'native', MPI_INFO_NULL, ierr)
call MPI_File_write_all(fh, vars, 1, type_local, status, ierr)
call MPI_File_close(fh, ierr)
end subroutine write3DField
type_global and type_local are MPI data types that are initialized previously with appropriate offsets and committed using MPI_commit.
My question is where is this "excess" memory coming from and how to remove it.
I'm new to fortran and MPI and currently processing a very large matrix row by row on different processors. I gather all the results on all the processors as it is needed. The following is the sample code which has the same structure as my real code.
I keep running into SIGBUS problems at MPI_Allgather, Line 49 of mod_test.f in the bounded function iter. How could I resolve this?
Compiler details:
$ mpifort --version
ifort (IFORT) 19.0.1.144 20181018
Copyright (C) 1985-2018 Intel Corporation. All rights reserved.
The code is compiled as follows:
mpifort mod_test.f main.f -o main -traceback -g -debug
mod_test.f
module TEST
include "mpif.h"
type A
real ,allocatable:: pf(:,:)
integer :: nx=100, ny=10
contains
procedure :: init
procedure :: iter
end type A
type(A) :: A_obj
contains
integer function init(this, x, y)
implicit none
class(A) , intent(inout):: this
integer , intent(in):: x, y
this% nx = x
this% ny = y
allocate( this% pf(x, y) )
this% pf = 0.0
init = 1
return
end function init
integer function iter(this, y_iter)
implicit none
class(A) , intent(inout):: this
integer , intent(in):: y_iter
integer :: i
real ,target :: a(this%nx+1), ar(this%nx+1)
real , dimension(:), pointer :: abuff, arbuff
a = 0.0
ar = 0.0
do i = 1, this% nx
this%pf(i, y_iter) = i * y_iter
enddo
a(1:this%nx) = this% pf(:, y_iter)
a(this%nx+1) = y_iter
call MPI_Allgather(a, this%nx+1, MPI_REAL, ar,
& this%nx+1, MPI_REAL,
& MPI_COMM_WORLD)
write(*,*) "Reached after MPI_Allgather"
do i = 1, this%nx + 1
write(*,*)ar(i)
enddo
this% pf(:, ar(this%nx+1)) = ar(1:this%nx)
write(*,*) "Got the solution from another processor"
iter = 1
end function iter
subroutine INIT_A
integer :: j, rank, ierr, size
! - Allocate
ierr= A_obj% init(100, 10)
! - Iterate
call MPI_COMM_RANK( MPI_COMM_WORLD, rank, ierr)
call MPI_COMM_SIZE( MPI_COMM_WORLD, size, ierr)
do j = 1, A_obj % ny
if ( rank == mod(j, size) ) then
ierr = A_obj % iter( j )
endif
enddo
end subroutine INIT_A
end module TEST
main.f
PROGRAM MAIN
use TEST
implicit none
integer :: ierr
call MPI_INIT(ierr)
call INIT_A
write(*,*) "Done with test"
call MPI_FINALIZE(ierr)
end PROGRAM MAIN
My simulation (written in Fortran 90) produces an array (either 1D, 2D, or 3D) at each time step. I would like to output these arrays into a single HDF5 file that contains the arrays produced for all the time steps. Note that since the output array at each time step has the same rank and dimensions, it is possible to combine these arrays together by adding an extra time dimension.
For now, I just create a buffer array to combine the output array at each time step, and then write the buffer array to a dataset in a HDF5 file at the end of simulation. But if the output array at each time step gets larger, the buffer can only hold the data for a few time steps. So I need to flush the data to HDF5 every a few time steps.
I looked at many posts and documentation and found that they mentioned some techniques like chunked dataset and hyperslab selection for efficient output to HDF5. But I am still not sure how I can apply these to my case. Could someone give me an example with Fortran 90?
After reading some of the documentation on hyperslab, I got the procedure to write to hdf5 file at each time step working. The following is the demo code. Hope you find it is useful.
program test_hyperslab
use HDF5
implicit none
integer :: error ! error flag
character(len=9), parameter :: filename = "subset.h5"
character(len=8), parameter :: dsetname = "IntArray"
integer(HID_T) :: file_id ! file identifier
integer(HID_T) :: dset_id ! dataset identifier
integer(HID_T) :: dataspace ! dataspace identifier
integer(HID_T) :: memspace ! memspace identifier
integer(HSIZE_T), dimension(2) :: dimsm = (/3, 1/)
integer, dimension(3) :: sdata ! subset buffer
integer :: dim0_sub = 3
integer :: dim1_sub = 1
integer(HSIZE_T), dimension(2) :: count = (/3, 1/)
integer(HSIZE_T), dimension(2) :: offset
integer(HSIZE_T), dimension(2) :: stride = (/1, 1/)
integer(HSIZE_T), dimension(2) :: block = (/1, 1/)
integer(HSIZE_T), dimension(2) :: dimsf = (/3, 10/)
integer, dimension(3, 10) :: rdata ! data to read
integer :: rank = 2
integer :: dim0 = 3
integer :: dim1 = 10
integer :: i
! initialize fortran interface
call h5open_f(error)
! create a new file with default properties
call h5fcreate_f(filename, H5F_ACC_TRUNC_F, file_id, error)
! create the data space for the dataset
call h5screate_simple_f(rank, dimsf, dataspace, error)
! create the dataset with default properties
call h5dcreate_f(file_id, dsetname, H5T_NATIVE_INTEGER, dataspace, &
dset_id, error)
! create memory dataspace
call h5screate_simple_f(rank, dimsm, memspace, error)
offset(1) = 0
do i = 1, 10
offset(2) = i - 1
sdata = i
! select subset
call h5sselect_hyperslab_f(dataspace, H5S_SELECT_SET_F, &
offset, count, error, stride, block)
! write subset to dataset
call h5dwrite_f(dset_id, H5T_NATIVE_INTEGER, sdata, dimsm, error, &
memspace, dataspace)
enddo
! read entire dataset back
call h5dread_f(dset_id, H5T_NATIVE_INTEGER, rdata, dimsf, error)
write(*, '(A)') "Data in file after subset written: "
do i = 1, dim0
write(*, '(100(1X,I0,1X))') rdata(i, 1:dim1)
enddo
! close everything opened
call h5sclose_f(dataspace, error)
call h5sclose_f(memspace, error)
call h5dclose_f(dset_id, error)
call h5fclose_f(file_id, error)
! close fortran interface
call h5close_f(error)
end program test_hyperslab
I think you need additional library for that.
Please look at the link to the Fortran library here: https://support.hdfgroup.org/HDF5/doc/fortran/index.html and some examples here: https://support.hdfgroup.org/HDF5/examples/f-src.html
I'm trying to modify a Fortran 90 code which writes a 2D array to the output in a NetCDF classic format. I would like the variable to have an extra dimension for time (i.e., it will be a 3D variable), printing it every corresponding time step during integration time of the model.
I'm not sure how it is being done; I appreciate any suggestion for doing it as efficiently as possible (also in a minimum file size).
subroutine writenetcdffile(array,argtitle)
use netcdf
implicit none
real, intent(IN), dimension(:,:) :: array
character*(*),intent(IN) :: argtitle
integer :: file_id, xdim_id, ydim_id
integer :: array_id
integer, dimension(2) :: arrdims
! character(len=*) :: argtitle = Flag_in
integer :: i, j
integer :: ierr
i = size(array,1)
j = size(array,2)
! create the file
ierr = nf90_create(path='test.nc', cmode=NF90_CLOBBER, ncid=file_id)
! define the dimensions
ierr = nf90_def_dim(file_id, 'X', i, xdim_id)
ierr = nf90_def_dim(file_id, 'Y', j, ydim_id)
! now that the dimensions are defined, we can define variables on them,...
arrdims = (/ xdim_id, ydim_id /)
ierr = nf90_def_var(file_id, 'Array', NF90_REAL, arrdims, array_id)
! ...and assign units to them as an attribute
ierr = nf90_put_att(file_id, array_id, "title", argtitle)
! done defining
ierr = nf90_enddef(file_id)
! Write out the values
ierr = nf90_put_var(file_id, array_id, array)
! close; done
ierr = nf90_close(file_id)
return
end subroutine writenetcdffile
MODULE Module_NetCDF
use netcdf
IMPLICIT NONE
integer :: file_id, xdim_id, ydim_id, tdim_id
integer :: array_id(5)
integer, dimension(3) :: arrdims
integer :: i, j
integer :: ierr
CONTAINS
SUBROUTINE NetCDF_Init(ICase)
IMPLICIT NONE
INTEGER :: ICase
SELECT CASE(ICase)
Case(1)
! create the file
ierr = nf90_create(path='test.nc', cmode = NF90_CLOBBER, ncid = file_id)
Case(2)
! Reopen the file for writing
ierr = nf90_open(path = "test.nc", mode = nf90_write, ncid = file_id)
if (ierr /= nf90_noerr) call check(ierr)
Case(3)
! close; done
ierr = nf90_close(file_id)
END SELECT
RETURN
END SUBROUTINE NetCDF_Init
SUBROUTINE NetCDF_Def(Array,ArrayTitle,ArrayUnits)
IMPLICIT NONE
real, intent(IN), dimension(:,:) :: Array
character(*),intent(IN) :: ArrayTitle(5)
character(*),intent(IN) :: ArrayUnits(5)
! Locals
integer :: k
i = size(Array,1)
j = size(Array,2)
! CALL NetCDF_Init(1)
! define the dimensions
ierr = nf90_def_dim(file_id, 'X', i, xdim_id)
ierr = nf90_def_dim(file_id, 'Y', j, ydim_id)
ierr = nf90_def_dim(file_id, 'Time', nf90_unlimited, tdim_id)
! now that the dimensions are defined, we can define variables on them,...
arrdims = (/ xdim_id, ydim_id, tdim_id /)
do k = 1,size(ArrayTitle)
ierr = nf90_def_var(file_id, ArrayTitle(k), NF90_REAL, arrdims, array_id(k))
! ...and assign units to them as an attribute
ierr = nf90_put_att(file_id, array_id(k), "Units", ArrayUnits(k))
enddo
! done defining
ierr = nf90_enddef(file_id)
RETURN
END SUBROUTINE NetCDF_Def
SUBROUTINE NetCDF_Write(Array,FlagTitle,NTime)
IMPLICIT NONE
real, intent(IN), dimension(:,:) :: Array
integer,intent(IN) :: NTime
character(*),intent(in) :: FlagTitle
! Locals
integer :: J_id
IF(FlagTitle.EQ.'ONECOND')THEN
J_id = 1
ELSEIF(FlagTitle.EQ.'MELTING')THEN
J_id = 2
ELSEIF(FlagTitle.EQ.'FREEZ_NEW')THEN
J_id = 3
ELSEIF(FlagTitle.EQ.'TFREEZ')THEN
J_id = 4
ELSEIF(FlagTitle.EQ.'DFREEZ')THEN
J_id = 5
ENDIF
CALL NetCDF_Init(2)
ierr = nf90_put_var(file_id, array_id(j_id), Array, start=[1,1,ntime], count=[i,j,1])
CALL NetCDF_Init(3)
RETURN
END SUBROUTINE
SUBROUTINE check(status)
IMPLICIT NONE
integer, intent ( in) :: status
IF(status /= nf90_noerr) THEN
PRINT *, trim(nf90_strerror(status))
STOP 2
ENDIF
END SUBROUTINE check
END MODULE Module_NetCDF
What you need to do is define the time dimension of nf90_unlimited length. This will allow you to write a 2-d array one slice at a time into a 3-d array, and makes the length of this array unspecified. Use start and count optional dummy arguments to a nf90_put_var call to specify where to write the 2-d slice.
! create the file
ierr = nf90_create(path='test.nc', cmode=NF90_CLOBBER, ncid=file_id)
! define the dimensions
ierr = nf90_def_dim(file_id, 'X', i, xdim_id)
ierr = nf90_def_dim(file_id, 'Y', j, ydim_id)
ierr = nf90_def_dim(file_id, 'Time', nf90_unlimited, tdim_id)
! now that the dimensions are defined, we can define variables on them,...
arrdims = (/ xdim_id, ydim_id, tdim_id /)
ierr = nf90_def_var(file_id, 'Array', NF90_REAL, arrdims, array_id)
! done defining
ierr = nf90_enddef(file_id)
! Time loop
do n = 1,nm
! Calculations go here
! Write out the values
ierr = nf90_put_var(file_id, array_id, array, start=[1,1,n], count=[i,j,1])
enddo
What I do in most of my programs is create the file and define dimensions and variables at the beginning, and write the fields in a loop afterward. If your simulations take a long time and you want to be able to look at the output during the simulation in progress, do the open/write/close steps inside of the model solver do-loop.
I have seen this question being asked many times, but didn't find an answer that could resolve my issue. I want to be able to send a linked list, in Fortran, to another process through MPI. I have done something similar where the derived data type in the linked list was as follows
type a
{
integer :: varA
type(a), pointer :: next=>null()
real :: varB
}
The way I did this was to create an MPI datatype which contained all the varA values together, and receive it as an array of integers. Then do the same for varB.
What I am trying to do now is to create the linked list, and then pack all the varA and varB values together to form the MPI datatype. I give below the code that does this.
PROGRAM TEST
USE MPI
IMPLICIT NONE
TYPE a
INTEGER:: b
REAL :: e
TYPE(a), POINTER :: nextPacketInList => NULL()
END TYPE
TYPE PacketComm
INTEGER :: numPacketsToComm
TYPE(a), POINTER :: PacketListHeadPtr => NULL()
TYPE(a), POINTER :: PacketListTailPtr => NULL()
END TYPE PacketComm
TYPE(PacketComm), DIMENSION(:), ALLOCATABLE :: PacketCommArray
INTEGER :: packPacketDataType !New data type
INTEGER :: ierr, size, rank, dest, ind
integer :: b
real :: e
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, rank, ierr)
IF(.NOT. ALLOCATED(PacketCommArray)) THEN
ALLOCATE(PacketCommArray(0:size-1), STAT=ierr)
DO ind=0, size-1
PacketCommArray(ind)%numPacketsToComm = 0
END DO
ENDIF
b = 2
e = 4
dest = 1
CALL addPacketToList(b, e, dest)
b = 3
e = 5
dest = 1
CALL addPacketToList(b, e, dest)
dest = 1
CALL packPacketList(dest)
IF(rank == 0) THEN
dest = 1
CALL sendPacketList(dest)
ELSE
CALL recvPacketList()
ENDIF
CALL MPI_FINALIZE(ierr)
CONTAINS
SUBROUTINE addPacketToList(b, e, rank)
IMPLICIT NONE
INTEGER :: b, rank, ierr
REAL :: e
TYPE(a), POINTER :: head
IF(.NOT. ASSOCIATED(PacketCommArray(rank)%PacketListHeadPtr)) THEN
ALLOCATE(PacketCommArray(rank)%PacketListHeadPtr, STAT=ierr)
PacketCommArray(rank)%PacketListHeadPtr%b = b
PacketCommArray(rank)%PacketListHeadPtr%e = e
PacketCommArray(rank)%PacketListHeadPtr%nextPacketInList => NULL()
PacketCommArray(rank)%PacketListTailPtr => PacketCommArray(rank)%PacketListHeadPtr
PacketCommArray(rank)%numPacketsToComm = PacketCommArray(rank)%numPacketsToComm+1
ELSE
ALLOCATE(PacketCommArray(rank)%PacketListTailPtr%nextPacketInList, STAT=ierr)
PacketCommArray(rank)%PacketListTailPtr => PacketCommArray(rank)%PacketListTailPtr%nextPacketInList
PacketCommArray(rank)%PacketListTailPtr%b = b
PacketCommArray(rank)%PacketListTailPtr%e = e
PacketCommArray(rank)%PacketListTailPtr%nextPacketInList => NULL()
PacketCommArray(rank)%numPacketsToComm = PacketCommArray(rank)%numPacketsToComm+1
ENDIF
END SUBROUTINE addPacketToList
SUBROUTINE packPacketList(rank)
IMPLICIT NONE
INTEGER :: rank
INTEGER :: numListNodes
INTEGER(kind=MPI_ADDRESS_KIND), DIMENSION(:), ALLOCATABLE :: listNodeAddr
INTEGER(kind=MPI_ADDRESS_KIND), DIMENSION(:), ALLOCATABLE :: listNodeDispl
INTEGER, DIMENSION(:), ALLOCATABLE :: listNodeTypes
INTEGER, DIMENSION(:), ALLOCATABLE :: listNodeCount
TYPE(a), POINTER :: head
INTEGER :: numNode
head => PacketCommArray(rank)%PacketListHeadPtr
numListNodes = PacketCommArray(rank)%numPacketsToComm
PRINT *, ' Number of nodes to allocate for rank ', rank , ' is ', numListNodes
ALLOCATE(listNodeTypes(2*numListNodes), stat=ierr)
ALLOCATE(listNodeCount(2*numListNodes), stat=ierr)
DO numNode=1, 2*numListNodes, 2
listNodeTypes(numNode) = MPI_INTEGER
listNodeTypes(numNode+1) = MPI_REAL
END DO
DO numNode=1, 2*numListNodes, 2
listNodeCount(numNode) = 1
listNodeCount(numNode+1) = 1
END DO
ALLOCATE(listNodeAddr(2*numListNodes), stat=ierr)
ALLOCATE(listNodeDispl(2*numListNodes), stat=ierr)
numNode = 1
DO WHILE(ASSOCIATED(head))
CALL MPI_GET_ADDRESS(head%b, listNodeAddr(numNode), ierr)
CALL MPI_GET_ADDRESS(head%e, listNodeAddr(numNode+1), ierr)
numNode = numNode + 2
head => head%nextPacketInList
END DO
DO numNode=1, UBOUND(listNodeAddr,1)
listNodeDispl(numNode) = listNodeAddr(numNode) - listNodeAddr(1)
END DO
CALL MPI_TYPE_CREATE_STRUCT(UBOUND(listNodeAddr,1), listNodeCount, listNodeDispl, listNodeTypes, packPacketDataType, ierr)
CALL MPI_TYPE_COMMIT(packPacketDataType, ierr)
END SUBROUTINE packPacketList
SUBROUTINE sendPacketList(rank)
IMPLICIT NONE
INTEGER :: rank, ierr, numNodes
TYPE(a), POINTER :: head
head => PacketCommArray(rank)%PacketListHeadPtr
numNodes = PacketCommArray(rank)%numPacketsToComm
CALL MPI_SSEND(head%b, 1, packPacketDataType, rank, 0, MPI_COMM_WORLD, ierr)
END SUBROUTINE sendPacketList
SUBROUTINE recvPacketList
IMPLICIT NONE
TYPE(a), POINTER :: head
TYPE(a), DIMENSION(:), ALLOCATABLE :: RecvPacketCommArray
INTEGER, DIMENSION(:), ALLOCATABLE :: recvB
INTEGER :: numNodes, ierr, numNode
INTEGER, DIMENSION(MPI_STATUS_SIZE):: status
head => PacketCommArray(rank)%PacketListHeadPtr
numNodes = PacketCommArray(rank)%numPacketsToComm
ALLOCATE(RecvPacketCommArray(numNodes), stat=ierr)
ALLOCATE(recvB(numNodes), stat=ierr)
CALL MPI_RECV(RecvPacketCommArray, 1, packPacketDataType, 0, 0, MPI_COMM_WORLD, status, ierr)
DO numNode=1, numNodes
PRINT *, ' value in b', RecvPacketCommArray(numNode)%b
PRINT *, ' value in e', RecvPacketCommArray(numNode)%e
END DO
END SUBROUTINE recvPacketList
END PROGRAM TEST
So basically I create a linked list with two nodes containing the following data
Node 1
b = 2, e = 4
Node 2
b = 3, e = 5
When I run this code on two cores, the results I get on core 1 are
value in b 2
value in e 4.000000
value in b 0
value in e 0.0000000E+00
So my code seems to send the data in the first node of the linked list correctly, but not the second one. Please could someone let me know if what I am trying to do is feasible, and what is wrong with the code. I know I can send the values of b in all nodes together and then the values of e together. But my derived data type will probably contain more variables (including arrays) and I want to be able to send all the data in one go instead of using multiple sends.
Thanks
It's not easy for me to read that code, but it seems like you are expecting the receiving buffer to get contiguous data, which is not the case. The strange type you construct by computing address offsets is not going to match the receiving buffer. To illustrate this, I though I might present this simple example (it's quickly written, don't take it as a good code example):
program example
use mpi
integer :: nprocs, myrank
integer :: buf(4)
integer :: n_elements
integer :: len_element(2)
integer(MPI_ADDRESS_KIND) :: disp_element(2)
integer :: type_element(2)
integer :: newtype
integer :: istat(MPI_STATUS_SIZE)
integer :: ierr
call mpi_init(ierr)
call mpi_comm_size(mpi_comm_world, nprocs, ierr)
call mpi_comm_rank(mpi_comm_world, myrank, ierr)
! simple example illustrating mpi_type_create_struct
! take an integer array buf(4):
! on rank 0: [ 7, 2, 6, 4 ]
! on rank 1: [ 1, 1, 1, 1 ]
! and we create a struct to send only elements 1 and 3
! so that on rank 1 we'll get [7, 1, 6, 1]
if (myrank == 0) then
buf = [7, 2, 6, 4]
else
buf = 1
end if
n_elements = 2
len_element = 1
disp_element(1) = 0
disp_element(2) = 8
type_element = MPI_INTEGER
call mpi_type_create_struct(n_elements, len_element, disp_element, type_element, newtype, ierr)
call mpi_type_commit(newtype, ierr)
write(6,'(1x,a,i2,1x,a,4i2)') 'SEND| rank ', myrank, 'buf = ', buf
if (myrank == 0) then
call mpi_send (buf, 1, newtype, 1, 13, MPI_COMM_WORLD, ierr)
else
call mpi_recv (buf, 1, newtype, 0, 13, MPI_COMM_WORLD, istat, ierr)
!the below call does not scatter the received integers, try to see the difference
!call mpi_recv (buf, 2, MPI_INTEGER, 0, 13, MPI_COMM_WORLD, istat, ierr)
end if
write(6,'(1x,a,i2,1x,a,4i2)') 'RECV| rank ', myrank, 'buf = ', buf
end program
I hope this clearly shows that the receiving buffer will have to accommodate any offsets in the constructed type, and does not receive any contiguous data.
EDIT: updated the code to illustrate a different receive type that does not scatter the data.