I have write a master-slave IO function with Fortran. First, I read the file with 0 process, put the data in the array read_buffer, and then I call subroutine"scatter_data".
I have created some communicators to scatter data from process 0 to process 0-8.
These communicators are like this:
2, 5, 8 :sub_io_communicator (In this sub_io_communicator , sub_iorank is /0, 1, 2/ )
1, 4, 7 :sub_io_communicator (In this sub_io_communicator , sub_iorank is /0, 1, 2/ )
0, 3, 6 :sub_io_communicator (In this sub_io_communicator , sub_iorank is /0, 1, 2/ )
2, 1, 0 :master_communicator
DATA
||
0 process read
||
0 call MPI_scatterv in communicator "master_communicator"
/ | \
0 1 2 call MPI_scatterv in communicator "sub_io_communicator"
/ | \ / | \ / | \
0 3 6 1 4 7 2 5 8
but when I call MPI_Scatterv, it crashes. I use "print" to debug it , and find the bug is in "call MPI_Scatterv". SO, I write a very simple MPI_Scatterv in this subroutine to see whether it will work, but it does not.
My code is like this :
SUBROUTINE scatter_data(read_buffer,ne_in)
use naqpms_nest, only : nest, nxlo, nylo, ratio, nx, ny
implicit none
include 'mpif.h'
integer, INTENT(IN) :: ne_in
integer :: ierr
integer :: location
integer :: ii, jj, kk, zz, dd, send_size,receive_size
real, INTENT(IN), dimension(nx(ne_in)*ny(ne_in)) :: read_buffer
integer, dimension(nx(ne_in)*ny(ne_in)) :: read_buffer_int
real, allocatable :: rerange_buffer(:)
integer, allocatable :: receive_buffer_int(:)
integer , allocatable :: counts_recv(:),displacements(:)
integer :: distance,left_bdy,left_bdy2,rigth_bdy,rigth_bdy2
integer :: tmp1(3), tmp2(3)
IF( sub_iorank.EQ.0 ) THEN
if(allocated(counts_recv))deallocate(counts_recv)
if(allocated(displacements))deallocate(displacements)
if(allocated(receive_buffer))deallocate(receive_buffer)
if(allocated(rerange_buffer))deallocate(rerange_buffer)
IF( master_iorank.EQ.0) THEN
allocate(counts_recv(dims(2,ne_in)))
allocate(displacements(dims(2,ne_in)))
ENDIF
print*,"location",my_rank,nx(ne_in), ey(ne_in), sy(ne_in)
receive_size = nx(ne_in) * (ey(ne_in)-sy(ne_in)+1)
allocate(receive_buffer(receive_size))
allocate(receive_buffer_int(receive_size))
allocate(rerange_buffer(receive_size))
CALL MPI_Gather(receive_size, 1, MPI_INTEGER, counts_recv, 1, MPI_INTEGER,&
0, master_communicator ,ierr )
tmp1(1:3)= (/1,1,1/)
tmp2(1:3)= (/0,1,2/)
IF(my_rank==0)print*,"counts_recv",counts_recv
CALL mpi_scatterv (counts_recv, tmp1, tmp2, MPI_INT,& ! this is just for test
receive_size, 1, mpi_int, 0, master_communicator, ierr )
print*,my_rank,receive_size
IF(my_rank.EQ.0) THEN
displacements(1)=0
do ii=2, dims(2,ne_in)
displacements(ii) = displacements(ii-1) + counts_recv(ii-1)
enddo
ENDIF
IF(my_rank==0)print*,displacements,counts_recv
CALL mpi_scatterv (read_buffer, counts_recv, displacements, mpi_real,&
receive_buffer, receive_size, mpi_real, 0, master_communicator, ierr )
IF(my_rank==0)print*,"mpi_scatterv one ok",my_rank
ENDIF!sub_iorank =0
IF(sub_iorank .EQ. 0) THEN
if(allocated(counts_recv))deallocate(counts_recv)
if(allocated(displacements))deallocate(displacements)
allocate(counts_recv(dims(1,ne_in)))
allocate(displacements(dims(1,ne_in)))
ENDIF
receive_size =(ex(ne_in)-sx(ne_in)+1)*(ey(ne_in)-sy(ne_in)+1)
CALL MPI_Gather(receive_size, 1, MPI_INTEGER, counts_recv, 1, MPI_INTEGER, 0, sub_io_communicator ,ierr )
IF(sub_iorank .EQ. 0) THEN
displacements(1)=0
do ii=2,dims(1,ne_in)
displacements(ii) = displacements(ii-1) + counts_recv(ii-1)
enddo
ENDIF
IF(sub_iorank .EQ. 0) THEN
DO dd = 1, dims(1,ne_in)
DO jj = 1, bdy_gather(4,dd,ne_in)-bdy_gather(3,dd,ne_in)+1
distance = bdy_gather(2,dd,ne_in)-bdy_gather(1,dd,ne_in)+1
left_bdy = (jj-1)*nx(ne_in) + bdy_gather(1,dd,ne_in)
rigth_bdy = (jj-1)*nx(ne_in) + bdy_gather(2,dd,ne_in)
left_bdy2 = displacements(dd) + (jj-1)*distance + 1
rigth_bdy2 = displacements(dd) + (jj-1)*distance + distance
rerange_buffer( left_bdy2 : rigth_bdy2) = receive_buffer(left_bdy : rigth_bdy )
ENDDO
ENDDO
ENDIF
if(allocated(receive_buffer))deallocate(receive_buffer)
allocate(receive_buffer(receive_size))
IF(sub_iorank .EQ. 0) print*, my_rank, counts_recv, displacements
CALL mpi_scatterv( rerange_buffer, counts_recv, displacements, mpi_real,&
receive_buffer, receive_size, mpi_real, 0, sub_io_communicator,ierr)
IF(sub_iorank .EQ. 0) print*,"mpi_scatterv ok"
END SUBROUTINE scatter_data
I run this code : mpirun -np 9 ./gnaqpms.v1.6.0_jx0307.exe
then, the error in the log file is like this :
location 0 88 26 1
location 1 88 52 27
location 2 88 77 53
counts_recv 2288 2288 2200
1 2288
2 2200
*** Error in forrtl: error (76): Abort trap signal
Image PC Routine Line Source
gnaqpms.v1.6.0_jx 00000000007A5F3A Unknown Unknown Unknown
libpthread-2.17.s 00002BA00DADA5D0 Unknown Unknown Unknown
libc-2.17.so 00002BA00E01F207 gsignal Unknown Unknown
libc-2.17.so 00002BA00E0208F8 abort Unknown Unknown
libc-2.17.so 00002BA00E061D27 Unknown Unknown Unknown
libc-2.17.so 00002BA00E06A489 Unknown Unknown Unknown
libmpi.so.12.0 00002BA00CAC2AED Unknown Unknown Unknown
libmpi.so.12.0 00002BA00CAC4A54 Unknown Unknown Unknown
libmpi.so.12 00002BA00CAC3188 MPI_Scatterv Unknown Unknown
libmpifort.so.12. 00002BA00D445A7A mpi_scatterv Unknown Unknown
gnaqpms.v1.6.0_jx 0000000000475EA4 naqpms_parallel_m 1269 naqpms_parallel.f90
gnaqpms.v1.6.0_jx 00000000005953BF rd_met_pyramid_ 151 rd_met_pyramid.f90
gnaqpms.v1.6.0_jx 0000000000617709 read_data_ 61 naqpms_readdata.f90
gnaqpms.v1.6.0_jx 0000000000647BA4 naqpms_calc_mp_ca 141 naqpms_calc.f90
gnaqpms.v1.6.0_jx 000000000065A835 MAIN__ 86 main.f90
gnaqpms.v1.6.0_jx 000000000040B45E Unknown Unknown Unknown
libc-2.17.so 00002BA00E00B3D5 __libc_start_main Unknown Unknown
gnaqpms.v1.6.0_jx 000000000040B369 Unknown Unknown Unknown
I am trying to learn how to perform inter-gpu data communication using the following toy code. The task of the program is to send array 'a' data in gpu-0 in to gpu-1's memory. I took the following root to do so, which involved four steps:
After initializing array 'a' on gpu0,
step1: send data from gpu0 to cpu0 (using !acc update self() clause)
step2: send data from cpu0 to cpu1 (using MPI_SEND())
step3: receive data into cpu1 from cpu0 (using MPI_RECV())
step4: update gpu1 device memory (using !$acc update device() clause)
This works perfectly fine, but this looks like a very long route and I think there is a better way of doing this. I tried to read up on !$acc host_data use_device clause suggested in the following post, but not able to implement it:
Getting started with OpenACC + MPI Fortran program
I would like to know how !$acc host_data use_device can be used, to perform the task shown below in an efficient manner.
PROGRAM TOY_MPI_OpenACC
implicit none
include 'mpif.h'
integer :: rank, nprocs, ierr, i, dest_rank, tag, from
integer :: status(MPI_STATUS_SIZE)
integer, parameter :: N = 10000
double precision, dimension(N) :: a
call MPI_INIT(ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD,rank,ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs,ierr)
print*, 'Process ', rank, ' of', nprocs, ' is alive'
!$acc data create(a)
! initialize 'a' on gpu0 (not cpu0)
IF (rank == 0) THEN
!$acc parallel loop default(present)
DO i = 1,N
a(i) = 1
ENDDO
ENDIF
! step1: send data from gpu0 to cpu0
!$acc update self(a)
print*, 'a in rank', rank, ' before communication is ', a(N/2)
IF (rank == 0) THEN
! step2: send from cpu0
dest_rank = 1; tag = 1999
call MPI_SEND(a, N, MPI_DOUBLE_PRECISION, dest_rank, tag, MPI_COMM_WORLD, ierr)
ELSEIF (rank == 1) THEN
! step3: recieve into cpu1
from = MPI_ANY_SOURCE; tag = MPI_ANY_TAG;
call MPI_RECV(a, N, MPI_DOUBLE_PRECISION, from, tag, MPI_COMM_WORLD, status, ierr)
! step4: send data in to gpu1 from cpu1
!$acc update device(a)
ENDIF
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
print*, 'a in rank', rank, ' after communication is ', a(N/2)
!$acc end data
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
END
compilation: mpif90 -acc -ta=tesla toycode.f90 (mpif90 from nvidia hpc-sdk 21.9)
execution : mpirun -np 2 ./a.out
Here's an example. Note that I also added some boiler-plate code to do the local node rank to device assignment. I also prefer to use unstructured data regions since they're better for more complex codes, but here they would be semantically equivalent to the structured data region that you used above. I have guarded the host_data constructs under a CUDA_AWARE_MPI macro since not all MPI have CUDA Aware support enabled. For these, you'd need to revert back to copying the data between the host and device before/after the MPI calls.
% cat mpi_acc.F90
PROGRAM TOY_MPI_OpenACC
use mpi
#ifdef _OPENACC
use openacc
#endif
implicit none
integer :: rank, nprocs, ierr, i, dest_rank, tag, from
integer :: status(MPI_STATUS_SIZE)
integer, parameter :: N = 10000
double precision, dimension(N) :: a
#ifdef _OPENACC
integer :: dev, devNum, local_rank, local_comm
integer(acc_device_kind) :: devtype
#endif
call MPI_INIT(ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD,rank,ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD,nprocs,ierr)
print*, 'Process ', rank, ' of', nprocs, ' is alive'
#ifdef _OPENACC
! set the MPI rank to device mapping
! 1) Get the local node's rank number
! 2) Get the number of devices on the node
! 3) Round-Robin assignment of rank to device
call MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, &
MPI_INFO_NULL, local_comm,ierr)
call MPI_Comm_rank(local_comm, local_rank,ierr)
devtype = acc_get_device_type()
devNum = acc_get_num_devices(devtype)
dev = mod(local_rank,devNum)
call acc_set_device_num(dev, devtype)
print*, "Process ",rank," Using device ",dev
#endif
a = 0
!$acc enter data copyin(a)
! initialize 'a' on gpu0 (not cpu0)
IF (rank == 0) THEN
!$acc parallel loop default(present)
DO i = 1,N
a(i) = 1
ENDDO
!$acc update self(a)
ENDIF
! step1: send data from gpu0 to cpu0
print*, 'a in rank', rank, ' before communication is ', a(N/2)
IF (rank == 0) THEN
! step2: send from cpu0
dest_rank = 1; tag = 1999
#ifdef CUDA_AWARE_MPI
!$acc host_data use_device(a)
#endif
call MPI_SEND(a, N, MPI_DOUBLE_PRECISION, dest_rank, tag, MPI_COMM_WORLD, ierr)
#ifdef CUDA_AWARE_MPI
!$acc end host_data
#endif
ELSEIF (rank == 1) THEN
! step3: recieve into cpu1
from = MPI_ANY_SOURCE; tag = MPI_ANY_TAG;
#ifdef CUDA_AWARE_MPI
!$acc host_data use_device(a)
#endif
call MPI_RECV(a, N, MPI_DOUBLE_PRECISION, from, tag, MPI_COMM_WORLD, status, ierr)
#ifdef CUDA_AWARE_MPI
!$acc end host_data
#else
! step4: send data in to gpu1 from cpu1
!$acc update device(a)
#endif
ENDIF
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
!$acc update self(a)
print*, 'a in rank', rank, ' after communication is ', a(N/2)
!$acc exit data delete(a)
call MPI_BARRIER(MPI_COMM_WORLD, ierr)
END
% which mpif90
/proj/nv/Linux_x86_64/21.9/comm_libs/mpi/bin//mpif90
% mpif90 -V
nvfortran 21.9-0 64-bit target on x86-64 Linux -tp skylake
NVIDIA Compilers and Tools
Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
% mpif90 -acc -Minfo=accel mpi_acc.F90
toy_mpi_openacc:
38, Generating enter data copyin(a(:))
42, Generating Tesla code
43, !$acc loop gang, vector(128) ! blockidx%x threadidx%x
42, Generating default present(a(:))
46, Generating update self(a(:))
76, Generating update device(a(:))
82, Generating update self(a(:))
85, Generating exit data delete(a(:))
% mpirun -np 2 ./a.out
Process 1 of 2 is alive
Process 0 of 2 is alive
Process 0 Using device 0
Process 1 Using device 1
a in rank 1 before communication is 0.000000000000000
a in rank 0 before communication is 1.000000000000000
a in rank 0 after communication is 1.000000000000000
a in rank 1 after communication is 1.000000000000000
% mpif90 -acc -Minfo=accel mpi_acc.F90 -DCUDA_AWARE_MPI=1
toy_mpi_openacc:
38, Generating enter data copyin(a(:))
42, Generating Tesla code
43, !$acc loop gang, vector(128) ! blockidx%x threadidx%x
42, Generating default present(a(:))
46, Generating update self(a(:))
82, Generating update self(a(:))
85, Generating exit data delete(a(:))
% mpirun -np 2 ./a.out
Process 0 of 2 is alive
Process 1 of 2 is alive
Process 1 Using device 1
Process 0 Using device 0
a in rank 1 before communication is 0.000000000000000
a in rank 0 before communication is 1.000000000000000
a in rank 1 after communication is 1.000000000000000
a in rank 0 after communication is 1.000000000000000
What s wrong with the following code? It works fine when I run it with 1,2 and 5 cpus,
while with 3 cpus, it breaks at (1), and with 4 cpus, it breaks at (2). In both the cases the error reads:
An error occurred in MPI_Scatterv
reported by process [3248685057,3]
on communicator MPI_COMM_WORLD
MPI_ERR_TRUNCATE: message truncated.
Probably the problem is correlated with memory allocation, but I cannot understand it deeply...(I am not totally confident about how I allocated the distributed variables (3)) Some suggestion is warmly accepted
program import_and_divide
implicit none
include 'mpif.h'
integer :: i, k, io, n ,nnz
integer,allocatable, dimension(:) :: Ai, Aj
real*8, allocatable, dimension(:) :: Aa, Ab, x
integer*4 :: rank, mpi_stat, size
integer, allocatable, dimension(:) :: sendcounts, displ, sendcounts1, displ1
integer :: n_distro
integer, allocatable, dimension(:) :: ia, ja
real*8, allocatable, dimension(:) :: a, b
n = 5
nnz = 13
!.. Initialize MPI.
call MPI_INIT(mpi_stat)
call MPI_COMM_SIZE(MPI_COMM_WORLD, size, mpi_stat)
call MPI_COMM_RANK(MPI_COMM_WORLD, rank, mpi_stat)
if (rank.eq.0) then
allocate(Ai(n+1), Aj(nnz), Aa(nnz), Ab(n), x(n))
Ai = (/0, 3, 5, 8, 11, 13/)
Aj = (/0, 1, 3, 0, 1, 2, 3, 4, 0, 2, 3, 1, 4/)
Aa = (/1, -1, -3, -2 , 5, 4, 6, 4, -4, 2, 7, 8, -5/)
Ab(:) = ([1, 2, 3, 4, 5])
x(:) = 0.d0
print*, 'Number of cpus: ', size
print*, 'n, nnz: ', n, nnz
endif
call MPI_BARRIER(MPI_COMM_WORLD, mpi_stat)
allocate(sendcounts(size), displ(size), sendcounts1(size), displ1(size))
n_distro=(n+1)/size
k = 0
do i=1,size
if (i<size) then
sendcounts(i) = n_distro
else
sendcounts(i) = (n+0)-(size-1)*n_distro
endif
displ(i) = k
k = k + sendcounts(i)
end do
if (rank.eq.0) then
displ1 = Ai(displ+1)
do i=1,size-1
sendcounts1(i) = displ1(i+1)-displ1(i)
end do
sendcounts1(size) = nnz-displ1(size)
endif
call MPI_BARRIER(MPI_COMM_WORLD,mpi_stat)
if (rank.eq.0) then
print*, 'Sendcounts: ', sendcounts
print*, 'Displ: ', displ
endif
call MPI_BCAST(displ1,shape(displ1), MPI_INT,0,MPI_COMM_WORLD,mpi_stat)
call MPI_BCAST(sendcounts1, shape(sendcounts1), MPI_INT,0,MPI_COMM_WORLD,mpi_stat)
do i=0,size-1
if (i.eq.rank) then
allocate(b(sendcounts(i+1)), ia(sendcounts(i+1)+1) ,&
a(sendcounts1(i+1)), ja(sendcounts1(i+1))) !(3)
ia(:) = rank
ja(:) = 0
b(:) = 0.d0
a(:) = 0.d0
end if
call MPI_BARRIER(MPI_COMM_WORLD,mpi_stat)
end do
call MPI_scatterv(Ab, sendcounts, displ, MPI_DOUBLE,&
b, sendcounts, MPI_DOUBLE,0, MPI_COMM_WORLD,mpi_stat) ! (2) breaks here with mpirun -np 4
call MPI_scatterv(Ai, sendcounts+1, displ, MPI_INT,&
ia, sendcounts+1, MPI_INT,0, MPI_COMM_WORLD,mpi_stat)
call MPI_scatterv(Aa, sendcounts1, displ1, MPI_DOUBLE,&
a, sendcounts1, MPI_DOUBLE,0, MPI_COMM_WORLD,mpi_stat)
call MPI_scatterv(Aj, sendcounts1, displ1, MPI_INT,&
ja, sendcounts1, MPI_INT,0, MPI_COMM_WORLD,mpi_stat) ! (1) breaks here with mpirun -np 3
call MPI_FINALIZE(mpi_stat)
end
When I replace include "mpif.h" with use mpi and I got the following errors:
There is no specific subroutine for the generic ‘mpi_bcast’
There is no specific subroutine for the generic ‘mpi_scatterv’.
I also add -fcheck=all and this option provides this additional info (when compiled with mpif.h):
Allocatable actual argument 'ab' is not allocated
but this do not clarify my idea.
If the arrays Ai, Aj, Ab, Aa are allocated on all the processors, I got the same behaviour (works with 1,2 and 5 cpus and breaks with 3 and 4 cpus) and disappear the message Allocatable actual argument 'ab' is not allocated
If I replace sendcounts and sendcounts1 in the second half of MPI_scatterv with sendcounts(rank+1) and sendcounts1(rank+1), the code seems to work fine only if I remove the compiler flag -fcheck=all/-fsanitize=address(if the arrays Ai,Aj,Aa,Ab are allocated on all the processes as well as if allocated only on one process (rank=0)). With one of these two options I got the following error:
==25041==ERROR: LeakSanitizer: detected memory leaks
I am solving a problem for unstructured meshes and the source and destination for communication are therefore saved in arrays. My question is how can I change this layout using MPI_BSend and/or using MPI_SendRecv?
integer:: status(MPI_STATUS_SIZE, (5*(total_send+total_recv))), &
req(5*(total_send+total_recv))
do i = 1, total_recv
j = (i-1) * 5 + 1 !Note that 5 comes from the 5 arrays: A, B, C, D, and E
k = index_element_recv(i) ; source = index_node_recv(i)
call MPI_IRECV (A(k), 1, MPI_DOUBLE_PRECISION, source, tag1, MPI_COMM_WORLD, req(j), ierr)
call MPI_IRECV (B(k), 1, MPI_DOUBLE_PRECISION, source, tag2, MPI_COMM_WORLD, req(j+1), ierr)
call MPI_IRECV (C(k), 1, MPI_DOUBLE_PRECISION, source, tag3, MPI_COMM_WORLD, req(j+2), ierr)
call MPI_IRECV (D(k), 1, MPI_DOUBLE_PRECISION, source, tag4, MPI_COMM_WORLD, req(j+3), ierr)
call MPI_IRECV (E(k), 1, MPI_DOUBLE_PRECISION, source, tag5, MPI_COMM_WORLD, req(j+4), ierr)
end do
l = 5 * total_recv !Note that 5 comes from the 5 arrays: A, B, C, D, and E
do i = 1, total_send
j = (i-1) * 5 + 1 !Note that 5 comes from the 5 arrays: A, B, C, D, and E
k = index_element_send(i) ; dest = index_node_send(i)
call MPI_ISEND (A(k), 1, MPI_DOUBLE_PRECISION, dest, tag1, MPI_COMM_WORLD, req(j+l), ierr)
call MPI_ISEND (B(k), 1, MPI_DOUBLE_PRECISION, dest, tag2, MPI_COMM_WORLD, req(j+l+1), ierr)
call MPI_ISEND (C(k), 1, MPI_DOUBLE_PRECISION, dest, tag3, MPI_COMM_WORLD, req(j+l+2), ierr)
call MPI_ISEND (D(k), 1, MPI_DOUBLE_PRECISION, dest, tag4, MPI_COMM_WORLD, req(j+l+3), ierr)
call MPI_ISEND (E(k), 1, MPI_DOUBLE_PRECISION, dest, tag5, MPI_COMM_WORLD, req(j+l+4), ierr)
end do
call MPI_WAITALL ( (5*(total_send+total_recv)), req, status, ierr)
PS:
Why I want to know this is because I want to test the performance of some communication styles in my code.
index_element_send is NOT same with index_element_recv, thus NOT overwriting each other.
total_send and total_recv are NOT ALWAYS same.
Note that tag1=1, tag2=2, ...
If you have any idea how to optimize it better, please let me know. Thanks.
Extra: I have applied the layout below for MPI_Bsend but it gave an error when executed. Do you have any suggestion?
integer, parameter :: dp = selected_real_kind(15,307)
real(kind = dp):: buffer(5*total_send*8)
do i = 1, total_send
j = index_element_send(i) ; dest = index_node_send(i)
call MPI_BUFFER_ATTACH (buffer, 5*total_send*8+MPI_BSEND_OVERHEAD, ierr)
call MPI_BSEND (A(j), 1, MPI_DOUBLE_PRECISION, dest, tag1, MPI_COMM_WORLD, ierr)
call MPI_BSEND (B(j), 1, MPI_DOUBLE_PRECISION, dest, tag2, MPI_COMM_WORLD, ierr)
call MPI_BSEND (C(j), 1, MPI_DOUBLE_PRECISION, dest, tag3, MPI_COMM_WORLD, ierr)
call MPI_BSEND (D(j), 1, MPI_DOUBLE_PRECISION, dest, tag4, MPI_COMM_WORLD, ierr)
call MPI_BSEND (E(j), 1, MPI_DOUBLE_PRECISION, dest, tag5, MPI_COMM_WORLD, ierr)
end do
do i = 1, total_recv
j = index_element_recv(i) ; source = index_node_recv(i)
call MPI_RECV (A(j), 1, MPI_DOUBLE_PRECISION, source, tag1, MPI_COMM_WORLD, ierr)
call MPI_RECV (B(j), 1, MPI_DOUBLE_PRECISION, source, tag2, MPI_COMM_WORLD, ierr)
call MPI_RECV (C(j), 1, MPI_DOUBLE_PRECISION, source, tag3, MPI_COMM_WORLD, ierr)
call MPI_RECV (D(j), 1, MPI_DOUBLE_PRECISION, source, tag4, MPI_COMM_WORLD, ierr)
call MPI_RECV (E(j), 1, MPI_DOUBLE_PRECISION, source, tag5, MPI_COMM_WORLD, ierr)
if ( i <= tot_comm_cells_edges_Ns ) &
call MPI_BUFFER_DETACH (buffer, 5*total_send*8, ierr)
end do
I want to send 2d chunks of data using MPI_GATHER. For example: I have 2x3 arrays on each node and I want 8x3 array on root, if I have 4 nodes. For 1d arrays, MPI_GATHER sorts data according to MPI ranks, but for 2d data it creates a mess!
What is the clean way to put chunks in order?
I expected the output of this code:
program testmpi
use mpi
implicit none
integer :: send (2,3)
integer :: rec (4,3)
integer :: ierror,my_rank,i,j
call MPI_Init(ierror)
MPI_DATA_TYPE type_col
! find out process rank
call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierror)
if (my_rank==0) then
send=1
do i=1,2
print*,(send(i,j),j=1,3)
enddo
endif
if (my_rank==1) then
send=5
! do 1,2
! print*,(send(i,j),j=1,3)
! enddo
endif
call MPI_GATHER(send,6,MPI_INTEGER,rec,6,MPI_INTEGER,0,MPI_COMM_WORLD,ierror)
if (my_rank==0) then
print*,'<><><><><>rec'
do i=1,4
print*,(rec(i,j),j=1,3)
enddo
endif
call MPI_Finalize(ierror)
end program testmpi
to be something like this :
1 1 1
1 1 1
5 5 5
5 5 5
but it looks like this:
1 1 5
1 1 5
1 5 5
1 5 5
The following a literal Fortran translation of this answer. I had thought this was unnecessary, but the multiple differences in array indexing and memory layout might mean that it is worth doing a Fortran version.
Let me start by saying that you generally don't really want to do this - scatter and gather huge chunks of data from some "master" process. Normally you want each task to be chugging away at its own piece of the puzzle, and you should aim to never have one processor need a "global view" of the whole data; as soon as you require that, you limit scalability and the problem size. If you're doing this for I/O - one process reads the data, then scatters it, then gathers it back for writing, you'll want eventually to look into MPI-IO.
Getting to your question, though, MPI has very nice ways of pulling arbitrary data out of memory, and scatter/gathering it to and from a set of processors. Unfortunately that requires a fair number of MPI concepts - MPI Types, extents, and collective operations. A lot of the basic ideas are discussed in the answer to this question -- MPI_Type_create_subarray and MPI_Gather .
Consider a 1d integer global array that task 0 has that you want to distribute to a number of MPI tasks, so that they each get a piece in their local array. Say you have 4 tasks, and the global array is [0,1,2,3,4,5,6,7]. You could have task 0 send four messages (including one to itself) to distribute this, and when it's time to re-assemble, receive four messages to bundle it back together; but that obviously gets very time consuming at large numbers of processes. There are optimized routines for these sorts of operations - scatter/gather operations. So in this 1d case you'd do something like this:
integer, dimension(8) :: global ! only root has this
integer, dimension(2) :: local ! everyone has this
integer, parameter :: root = 0
integer :: rank, comsize
integer :: i, ierr
call MPI_Init(ierr)
call MPI_Comm_size(MPI_COMM_WORLD, comsize, ierr)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
if (rank == root) then
global = [ (i, i=1,8) ]
endif
call MPI_Scatter(global, 2, MPI_INTEGER, & ! send everyone 2 ints from global
local, 2, MPI_INTEGER, & ! each proc recieves 2 into
root, & ! sending process is root,
MPI_COMM_WORLD, ierr) ! all procs in COMM_WORLD participate
After this, the processors' data would look like
task 0: local:[1,2] global: [1,2,3,4,5,6,7,8]
task 1: local:[3,4] global: [garbage]
task 2: local:[5,6] global: [garbage]
task 3: local:[7,8] global: [garbage]
That is, the scatter operation takes the global array and sends contiguous 2-int chunks to all the processors.
To re-assemble the array, we use the MPI_Gather() operation, which works exactly the same but in reverse:
local = local + rank
call MPI_Gather (local, 2, MPI_INTEGER, & ! everyone sends 2 ints from local
global, 2, MPI_INTEGER, & ! root receives 2 ints each proc into global
root, & ! receiving process is root,
MPI_COMM_WORLD, ierr) ! all procs in COMM_WORLD participate
And now the arrays look like:
task 0: local:[1,2] global: [1,2,4,5,7,8,10,11]
task 1: local:[4,5] global: [garbage-]
task 2: local:[7,8] global: [garbage-]
task 3: local:[10,11] global: [garbage-]
Gather brings all the data back.
What happens if the number of data points doesn't evenly divide the number of processes, and we need to send different numbers of items to each process? Then you need a generalized version of scatter, MPI_Scatterv, which lets you specify the counts for each processor, and displacements -- where in the global array that piece of data starts. So let's say with the same 4 tasks you had an array of characters [a,b,c,d,e,f,g,h,i] with 9 characters, and you were going to assign every process two characters except the last, that got three. Then you'd need
character, dimension(9) :: global
character, dimension(3) :: local
integer, dimension(4) :: counts
integer, dimension(4) :: displs
if (rank == root) then
global = [ (achar(i+ichar('a')), i=0,8) ]
endif
local = ['-','-','-']
counts = [2,2,2,3]
displs = [0,2,4,6]
mycounts = counts(rank+1)
call MPI_Scatterv(global, counts, displs, & ! proc i gets counts(i) chars from displs(i)
MPI_CHARACTER, &
local, mycounts, MPI_CHARACTER, & ! I get mycounts chars into
root, & ! root rank does sending
MPI_COMM_WORLD, ierr) ! all procs in COMM_WORLD participate
Now the data looks like
task 0: local:"ab-" global: "abcdefghi"
task 1: local:"cd-" global: *garbage*
task 2: local:"ef-" global: *garbage*
task 3: local:"ghi" global: *garbage*
You've now used scatterv to distribute the irregular amounts of data. The displacement in each case is two*rank (measured in characters; the displacement is in unit of the types being sent for a scatter or received for a gather; it's not generally in bytes or something) from the start of the array, and the counts are [2,2,2,3]. If it had been the first processor we wanted to have 3 characters, we would have set counts=[3,2,2,2] and displacements would have been [0,3,5,7]. Gatherv again works exactly the same but reverse; the counts and displs arrays would remain the same.
Now, for 2D, this is a bit trickier. If we want to send 2d sublocks of a 2d array, the data we're sending now no longer is contiguous. If we're sending (say) 3x3 subblocks of a 6x6 array to 4 processors, the data we're sending has holes in it:
2D Array
---------
|000|222|
|000|222|
|000|222|
|---+---|
|111|333|
|111|333|
|111|333|
---------
Actual layout in memory
[000111000111000111222333222333222333]
(Note that all high-performance computing comes down to understanding the layout of data in memory.)
If we want to send the data that is marked "1" to task 1, we need to skip three values, send three values, skip three values, send three values, skip three values, send three values. A second complication is where the subregions stop and start; note that region "1" doesn't start where region "0" stops; after the last element of region "0", the next location in memory is partway-way through region "1".
Let's tackle the first layout problem first - how to pull out just the data we want to send. We could always just copy out all the "0" region data to another, contiguous array, and send that; if we planned it out carefully enough, we could even do that in such a way that we could call MPI_Scatter on the results. But we'd rather not have to transpose our entire main data structure that way.
So far, all the MPI data types we've used are simple ones - MPI_INTEGER specifies (say) 4 bytes in a row. However, MPI lets you create your own data types that describe arbitrarily complex data layouts in memory. And this case -- rectangular subregions of an array -- is common enough that there's a specific call for that. For the 2-dimensional case we're describing above,
integer :: newtype;
integer, dimension(2) :: sizes, subsizes, starts
sizes = [6,6] ! size of global array
subsizes = [3,3] ! size of sub-region
starts = [0,0] ! let's say we're looking at region "0"
! which begins at offset [0,0]
call MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_FORTRAN, MPI_INTEGER, newtype, ierr)
call MPI_Type_commit(newtype, ierr)
This creates a type which picks out just the region "0" from the global array. Note that even in Fortran, the start parameter is given as an offset (eg, 0-based) from the start of the array, not an index (eg, 1-based).
We could send just that piece of data now to another processor
call MPI_Send(global, 1, newtype, dest, tag, MPI_COMM_WORLD, ierr) ! send region "0"
and the receiving process could receive it into a local array. Note that the receiving process, if it's only receiving it into a 3x3 array, can not describe what it's receiving as a type of newtype; that no longer describes the memory layout, because there aren't big skips between the end of one row and the start of the next. Instead, it's just receiving a block of 3*3 = 9 integers:
call MPI_Recv(local, 3*3, MPI_INTEGER, 0, tag, MPI_COMM_WORLD, ierr)
Note that we could do this for other sub-regions, too, either by creating a different type (with different start array) for the other blocks, or just by sending starting from the first location of the particular block:
if (rank == root) then
call MPI_Send(global(4,1), 1, newtype, 1, tag, MPI_COMM_WORLD, ierr)
call MPI_Send(global(1,4), 1, newtype, 2, tag, MPI_COMM_WORLD, ierr)
call MPI_Send(global(4,4), 1, newtype, 3, tag, MPI_COMM_WORLD, ierr)
local = global(1:3, 1:3)
else
call MPI_Recv(local, 3*3, MPI_INTEGER, 0, tag, MPI_COMM_WORLD, rstatus, ierr)
endif
Now that we understand how to specify subregions, there's only one more thing to discuss before using scatter/gather operations, and that's the "size" of these types. We couldn't just use MPI_Scatter() (or even scatterv) with these types yet, because these types have an extent of 15 integers; that is, where they end is 15 integers after they start -- and where they end doesn't line up nicely with where the next block begins, so we can't just use scatter - it would pick the wrong place to start sending data to the next processor.
Of course, we could use MPI_Scatterv() and specify the displacements ourselves, and that's what we'll do - except the displacements are in units of the send-type size, and that doesn't help us either; the blocks start at offsets of (0,3,18,21) integers from the start of the global array, and the fact that a block ends 15 integers from where it starts doesn't let us express those displacements in integer multiples at all.
To deal with this, MPI lets you set the extent of the type for the purposes of these calculations. It doesn't truncate the type; it's just used for figuring out where the next element starts given the last element. For types like these with holes in them, it's frequently handy to set the extent to be something smaller than the distance in memory to the actual end of the type.
We can set the extent to be anything that's convenient to us. We could just make the extent 1 integer, and then set the displacements in units of integers. In this case, though, I like to set the extent to be 3 integers - the size of a sub-column - that way, block "1" starts immediately after block "0", and block "3" starts immediately after block "2". Unfortunately, it doesn't quite work as nicely when jumping from block "2" to block "3", but that can't be helped.
So to scatter the subblocks in this case, we'd do the following:
integer(kind=MPI_ADDRESS_KIND) :: extent
starts = [0,0]
sizes = [6, 6]
subsizes = [3, 3]
call MPI_Type_create_subarray(2, sizes, subsizes, starts, &
MPI_ORDER_FORTRAN, MPI_INTEGER, &
newtype, ierr)
call MPI_Type_size(MPI_INTEGER, intsize, ierr)
extent = 3*intsize
call MPI_Type_create_resized(newtype, 0, extent, resizedtype, ierr)
call MPI_Type_commit(resizedtype, ierr)
Here we've created the same block type as before, but we've resized it; we haven't changed where the type "starts" (the 0) but we've changed where it "ends" (3 integers). We didn't mention this before, but the MPI_Type_commit is required to be able to use the type; but you only need to commit the final type you actually use, not any intermediate steps. You use MPI_Type_free to free the committed type when you're done.
So now, finally, we can scatterv the blocks: the data manipulations above are a little complicated, but once it's done, the scatterv looks just like before:
counts = 1 ! we will send one of these new types to everyone
displs = [0,1,6,7] ! the starting point of everyone's data
! in the global array, in block extents
call MPI_Scatterv(global, counts, displs, & ! proc i gets counts(i) types from displs(i)
resizedtype, &
local, 3*3, MPI_INTEGER, & ! I'm receiving 3*3 int
root, MPI_COMM_WORLD, ierr) !... from (root, MPI_COMM_WORLD)
And now we're done, after a little tour of scatter, gather, and MPI derived types.
An example code which shows both the gather and the scatter operation, with character arrays, follows. Running the program:
$ mpirun -np 4 ./scatter2d
global array is:
000222
000222
000222
111333
111333
111333
Rank 0 received:
000
000
000
Rank 1 received:
111
111
111
Rank 2 received:
222
222
222
Rank 3 received:
333
333
333
Rank 0 sending:
111
111
111
Rank 1 sending:
222
222
222
Rank 2 sending:
333
333
333
Rank 3 sending:
444
444
444
Root received:
111333
111333
111333
222444
222444
222444
and the code follows:
program scatter
use mpi
implicit none
integer, parameter :: gridsize = 6 ! size of array
integer, parameter :: procgridsize = 2 ! size of process grid
character, allocatable, dimension (:,:) :: global, local
integer, dimension(procgridsize**2) :: counts, displs
integer, parameter :: root = 0
integer :: rank, comsize
integer :: localsize
integer :: i, j, row, col, ierr, p, charsize
integer, dimension(2) :: sizes, subsizes, starts
integer :: newtype, resizedtype
integer, parameter :: tag = 1
integer, dimension(MPI_STATUS_SIZE) :: rstatus
integer(kind=MPI_ADDRESS_KIND) :: extent, begin
call MPI_Init(ierr)
call MPI_Comm_size(MPI_COMM_WORLD, comsize, ierr)
call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
if (comsize /= procgridsize**2) then
if (rank == root) then
print *, 'Only works with np = ', procgridsize**2, ' for now.'
endif
call MPI_Finalize(ierr)
stop
endif
localsize = gridsize/procgridsize
allocate( local(localsize, localsize) )
if (rank == root) then
allocate( global(gridsize, gridsize) )
forall( col=1:procgridsize, row=1:procgridsize )
global((row-1)*localsize+1:row*localsize, &
(col-1)*localsize+1:col*localsize) = &
achar(ichar('0')+(row-1)+(col-1)*procgridsize)
end forall
print *, 'global array is: '
do i=1,gridsize
print *, global(i,:)
enddo
endif
starts = [0,0]
sizes = [gridsize, gridsize]
subsizes = [localsize, localsize]
call MPI_Type_create_subarray(2, sizes, subsizes, starts, &
MPI_ORDER_FORTRAN, MPI_CHARACTER, &
newtype, ierr)
call MPI_Type_size(MPI_CHARACTER, charsize, ierr)
extent = localsize*charsize
begin = 0
call MPI_Type_create_resized(newtype, begin, extent, resizedtype, ierr)
call MPI_Type_commit(resizedtype, ierr)
counts = 1 ! we will send one of these new types to everyone
forall( col=1:procgridsize, row=1:procgridsize )
displs(1+(row-1)+procgridsize*(col-1)) = (row-1) + localsize*procgridsize*(col-1)
endforall
call MPI_Scatterv(global, counts, displs, & ! proc i gets counts(i) types from displs(i)
resizedtype, &
local, localsize**2, MPI_CHARACTER, & ! I'm receiving localsize**2 chars
root, MPI_COMM_WORLD, ierr) !... from (root, MPI_COMM_WORLD)
do p=1, comsize
if (rank == p-1) then
print *, 'Rank ', rank, ' received: '
do i=1, localsize
print *, local(i,:)
enddo
endif
call MPI_Barrier(MPI_COMM_WORLD, ierr)
enddo
local = achar( ichar(local) + 1 )
do p=1, comsize
if (rank == p-1) then
print *, 'Rank ', rank, ' sending: '
do i=1, localsize
print *, local(i,:)
enddo
endif
call MPI_Barrier(MPI_COMM_WORLD, ierr)
enddo
call MPI_Gatherv( local, localsize**2, MPI_CHARACTER, & ! I'm sending localsize**2 chars
global, counts, displs, resizedtype,&
root, MPI_COMM_WORLD, ierr)
if (rank == root) then
print *, ' Root received: '
do i=1,gridsize
print *, global(i,:)
enddo
endif
call MPI_Type_free(newtype,ierr)
if (rank == root) deallocate(global)
deallocate(local)
call MPI_Finalize(ierr)
end program scatter
So that's the general solution. For your particular case, where we are just appending by rows, we don't need a Gatherv, we can just use a gather, because in this case, all of the displacements are the same -- before, in the 2d block case we had one displacement going 'down', and then jumps in that displacement as you went 'across' to the next column of blocks. Here, the displacement is always one extent from the previous one, so we don't need to give displacements explicitly. So a final code looks like:
program testmpi
use mpi
implicit none
integer, dimension(:,:), allocatable :: send, recv
integer, parameter :: nsendrows = 2, nsendcols = 3
integer, parameter :: root = 0
integer :: ierror, my_rank, comsize, i, j, ierr
integer :: blocktype, resizedtype
integer, dimension(2) :: starts, sizes, subsizes
integer (kind=MPI_Address_kind) :: start, extent
integer :: intsize
call MPI_Init(ierror)
call MPI_Comm_rank(MPI_COMM_WORLD, my_rank, ierror)
call MPI_Comm_size(MPI_COMM_WORLD, comsize, ierror)
allocate( send(nsendrows, nsendcols) )
send = my_rank
if (my_rank==root) then
! we're going to append the local arrays
! as groups of send rows
allocate( recv(nsendrows*comsize, nsendcols) )
endif
! describe what these subblocks look like inside the full concatenated array
sizes = [ nsendrows*comsize, nsendcols ]
subsizes = [ nsendrows, nsendcols ]
starts = [ 0, 0 ]
call MPI_Type_create_subarray( 2, sizes, subsizes, starts, &
MPI_ORDER_FORTRAN, MPI_INTEGER, &
blocktype, ierr)
start = 0
call MPI_Type_size(MPI_INTEGER, intsize, ierr)
extent = intsize * nsendrows
call MPI_Type_create_resized(blocktype, start, extent, resizedtype, ierr)
call MPI_Type_commit(resizedtype, ierr)
call MPI_Gather( send, nsendrows*nsendcols, MPI_INTEGER, & ! everyone send 3*2 ints
recv, 1, resizedtype, & ! root gets 1 resized type from everyone
root, MPI_COMM_WORLD, ierr)
if (my_rank==0) then
print*,'<><><><><>recv'
do i=1,nsendrows*comsize
print*,(recv(i,j),j=1,nsendcols)
enddo
endif
call MPI_Finalize(ierror)
end program testmpi
Running this with 3 processes gives:
$ mpirun -np 3 ./testmpi
<><><><><>recv
0 0 0
0 0 0
1 1 1
1 1 1
2 2 2
2 2 2
Here's another code block for any other struggling Fortran beginners out there like myself. It shows two different ways to achieve a MPI_Gather on a nx * ny array divided into M * N blocks, with one block on each process.
One way uses MPI derived datatypes, the other simply sends 1D raw data and sorts if afterward on the main node.
Both produce an ordered M * N 2D array. For the example of 3 x 2 sub-arrays in x * y, each having 4 x 5 elements:
n mpi ranks: 6
rank 0 has topology coords 0,0
rank 1 has topology coords 0,1
rank 2 has topology coords 1,0
rank 3 has topology coords 1,1
rank 4 has topology coords 2,0
rank 5 has topology coords 2,1
1 1 1 1 3 3 3 3 5 5 5 5
1 1 1 1 3 3 3 3 5 5 5 5
1 1 1 1 3 3 3 3 5 5 5 5
1 1 1 1 3 3 3 3 5 5 5 5
1 1 1 1 3 3 3 3 5 5 5 5
0 0 0 0 2 2 2 2 4 4 4 4
0 0 0 0 2 2 2 2 4 4 4 4
0 0 0 0 2 2 2 2 4 4 4 4
0 0 0 0 2 2 2 2 4 4 4 4
0 0 0 0 2 2 2 2 4 4 4 4
Note that if the raw data is sent and not re-arranged it would be ordered by default like so:
5 5 5 5 5 5 5 5 5 5 5 5
4 4 4 4 5 5 5 5 5 5 5 5
4 4 4 4 4 4 4 4 4 4 4 4
3 3 3 3 3 3 3 3 4 4 4 4
3 3 3 3 3 3 3 3 3 3 3 3
2 2 2 2 2 2 2 2 2 2 2 2
1 1 1 1 2 2 2 2 2 2 2 2
1 1 1 1 1 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0 1 1 1 1
0 0 0 0 0 0 0 0 0 0 0 0
The following code can be compiled with
mpif90 -Wall test.F90 -o test.out
and run with
mpirun -n 6 test.out
program main
use mpi
use, intrinsic :: iso_fortran_env
use iso_c_binding
implicit none
! ===
integer(int64) i, j
integer(int32) nx , ny
integer(int32) nxr , nyr
integer(int32) npx , npy
integer(int32) ri , rj
integer, dimension(2) :: mpi_coords, mpi_coords2
integer rank, n_ranks, ierror
integer rank_cart
integer comm, comm2d
real(real64), dimension(:,:), allocatable :: A, A_global
real(real64), dimension(:), allocatable :: B_global
integer(int32), dimension(:), allocatable :: lengths, displacements
integer(int32) subarraytype, resizedtype
integer(int32) dblsize
integer(kind=MPI_ADDRESS_KIND) :: start, extent
! === MPI interface initialization
call MPI_INIT(ierror)
comm = MPI_COMM_WORLD
call MPI_COMM_SIZE(comm, n_ranks, ierror)
call MPI_COMM_RANK(comm, rank, ierror)
if (rank .eq. 0) then
print '(a, i0)', 'n mpi ranks: ', n_ranks
end if
npx = 3 !! n processes in x
nxr = 4 !! n pts per rank in x
nx = npx*nxr !! n pts x total
npy = 2 !! n processes in y
nyr = 5 !! n pts per rank in y
ny = npy*nyr !! n pts y total
! === check that n_ranks are equal to the hardcoded number of processes in [x,y]
if (n_ranks .ne. npx*npy) then
if (rank .eq. 0) then
print '(a)','n_ranks != npx*npy'
end if
call MPI_Abort(MPI_COMM_WORLD, -1, ierror)
call MPI_Finalize(ierror)
end if
call MPI_BARRIER(comm, ierror)
! === create 2D Cartesian grid
call MPI_Cart_create(comm, 2, (/npx,npy/), (/.true.,.false./), .true., comm2d, ierror)
! === get rank in 2D communicator
call MPI_Comm_rank(comm2d, rank_cart, ierror)
! === get this rank ID and coordinates within the 2D topology
call MPI_Cart_coords(comm2d, rank_cart, 2, mpi_coords, ierror)
! === print topology
if (.true.) then
print '(a,i0,a,i0,a,i0)', 'rank ', rank_cart, ' has topology coords ', mpi_coords(1), ',', mpi_coords(2)
end if
call MPI_BARRIER(comm, ierror)
call flush(6)
! === populate data
allocate( A(nxr,nyr) )
A(:,:) = real(rank,real64)
! ===
if (.true.) then !! use MPI derived types
allocate (lengths(n_ranks))
allocate (displacements(n_ranks))
call MPI_Type_create_subarray(2, (/nx,ny/), (/nxr,nyr/), (/0,0/), &
MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, subarraytype, ierror)
call MPI_Type_size(MPI_DOUBLE_PRECISION, dblsize, ierror)
start = 0
extent = nxr*dblsize
call MPI_Type_create_resized(subarraytype, start, extent, resizedtype, ierror)
call MPI_Type_commit(resizedtype, ierror)
lengths = 1
!! displacements = [ npx*0*nyr+0, npx*1*nyr+0, &
!! npx*0*nyr+1, npx*1*nyr+1, &
!! npx*0*nyr+2, npx*1*nyr+2 ] !! for 3x2
!! displacements = [ npx*0*nyr+0, npx*1*nyr+0, npx*2*nyr+0, &
!! npx*0*nyr+1, npx*1*nyr+1, npx*2*nyr+1 ] !! for 2x3
do i=0,n_ranks-1
call MPI_Cart_coords(comm2d, int(i,int32), 2, mpi_coords2, ierror)
ri = mpi_coords2(1)
rj = mpi_coords2(2)
displacements(i+1) = npx*rj*nyr + ri
end do
if (rank .eq. 0) then
allocate( A_global(nx,ny) )
end if
call MPI_Gatherv( A , nxr*nyr, MPI_DOUBLE_PRECISION, &
A_global, lengths, displacements, resizedtype, &
0, comm2d, ierror )
else !! send raw data in 1D, then re-arrange
if (rank .eq. 0) then
allocate( A_global(nx,ny) )
allocate( B_global(nx*ny) )
end if
call MPI_Gather( A , nxr*nyr, MPI_DOUBLE_PRECISION, &
B_global, nxr*nyr, MPI_DOUBLE_PRECISION, &
0, comm2d, ierror )
if (rank .eq. 0) then
if (.true.) then !! re-arrange data
do i=0,n_ranks-1
call MPI_Cart_coords(comm2d, int(i,int32), 2, mpi_coords2, ierror)
ri = mpi_coords2(1)
rj = mpi_coords2(2)
A_global( (ri+0)*nxr+1:(ri+1)*nxr+1 , &
(rj+0)*nyr+1:(rj+1)*nyr+1 ) &
= &
reshape( B_global( (i+0)*nxr*nyr+1 : (i+1)*nxr*nyr+1 ) , (/nxr,nyr/) )
end do
else !! dont do re-arrange
A_global(:,:) = reshape( B_global , (/nx,ny/) )
end if
end if
if (rank .eq. 0) then
deallocate( B_global )
end if
end if
! ===
!! print the array to the terminal
if (rank .eq. 0) then
do j=ny, 1, -1
do i=1, nx
write(*,'(i0,a)',advance='no') int(A_global(i,j)) , " "
end do
write (*,*) ''
end do
end if
!! save data to binary file
if (rank==0) then
open(3, file=trim("A.dat"), access="stream")
write(3) reshape( A_global , (/nx,ny/) )
close(3)
end if
if (rank .eq. 0) then
deallocate( A_global )
end if
! ===
call MPI_BARRIER(comm, ierror)
call MPI_FINALIZE(ierror)
end program main
A final note: pulling all data to one process (for I/O purposes or otherwise) is typically very bad practice for parallel programming. The moment this is done, the code will lose potential scalability. The method shown here is really only fit for smaller codes or for debugging. For I/O purposes, one should seriously look into collective MPI-I/O or libraries such as HDF5 which allow for collective I/O.