MPI_WIN_ALLOCATE_SHARED and synchronization - Fortran

I am trying to write an MPI shared-memory example, but every time I run it I get some weird values.
It is a 1D stencil that just sums the elements at positions i-1, i and i+1.
I run the program on 2 nodes with 32 MPI processes each; with the global domain size nx=64, the local domain of each rank holds only 1 element.
The exchange between nodes is done with MPI_SENDRECV through ghost cells.
program mpishared
  USE MPI_F08
  use ISO_C_BINDING
  implicit none
  integer :: rank, rankNode, rankW, rankE
  integer :: nbp, nbNode
  integer :: key
  TYPE(MPI_Comm) :: commNode ! shared node
  integer :: nx ! area global
  integer :: sx,ex ! area local
  integer :: rsx,rex ! real bound of local array with halo
  integer(kind=MPI_ADDRESS_KIND) :: size
  TYPE(C_PTR) :: baseptr
  TYPE(MPI_Win) :: win
  integer, parameter :: dp = kind(1.d0)
  real(kind=dp), dimension(:), contiguous, pointer :: ushared
  real(kind=dp), dimension(:), allocatable :: u
  integer :: iterx,iter,iterp
  !! Init MPI
  CALL MPI_INIT()
  !! Info WORLD
  CALL MPI_COMM_RANK(MPI_COMM_WORLD,rank)
  CALL MPI_COMM_SIZE(MPI_COMM_WORLD,nbp)
  ! Comm 4 Node
  key = 0
  CALL MPI_COMM_SPLIT_TYPE(MPI_COMM_WORLD,MPI_COMM_TYPE_SHARED,key,MPI_INFO_NULL,commNode)
  CALL MPI_COMM_RANK(commNode, rankNode)
  CALL MPI_COMM_SIZE(commNode, nbNode)
  ! Neighbours
  rankW = rank-1
  rankE = rank+1
  if (rank == 0) rankW=MPI_PROC_NULL
  if (rank == nbp-1) rankE=MPI_PROC_NULL
  ! Size of global domain
  nx = 64
  ! Size of local domain
  sx = 1+(rank*nx)/nbp
  ex = ((rank+1)*nx)/nbp
  rsx = sx ! real size only different for first
  rex = ex ! and last rank in node
  if (rankNode == 0) rsx = rsx-1
  if (rankNode == nbNode-1) rex=rex+1
  ! Allocate Shared domain
  size = (rex-rsx+1)
  allocate(u(rex-rsx+1))
  CALL MPI_WIN_ALLOCATE_SHARED(size,1,MPI_INFO_NULL,commNode,baseptr,win)
  CALL C_F_POINTER(baseptr,ushared)
  ! Init local domain
  do iterx=1,rex-rsx+1
    u(iterx) = 0.0_dp
  end do
  if (rank == nbp-1) then
    u(rex-rsx+1) = rex
  end if
  if (rank == 0) then
    u(1) = -1.0_dp
  end if
  ! Main Loop
  CALL MPI_WIN_LOCK_ALL(0,win)
  do iter=1,10
    ! Update sharedold
    do iterx=1,rex-rsx+1
      ushared(iterx)=u(iterx)
    end do
    ! Update bound between node
    if (rankNode == 0) then
      CALL MPI_SENDRECV(ushared(2),nx,MPI_DOUBLE_PRECISION,rankW,100, &
                        ushared(1),nx,MPI_DOUBLE_PRECISION,rankW,100,&
                        MPI_COMM_WORLD,MPI_STATUS_IGNORE)
    end if
    if (rankNode == nbNode-1) then
      CALL MPI_SENDRECV(ushared(ex-rsx+1),nx,MPI_DOUBLE_PRECISION,rankE,100, &
                        ushared(rex-rsx+1),nx,MPI_DOUBLE_PRECISION,rankE,100,&
                        MPI_COMM_WORLD,MPI_STATUS_IGNORE)
    end if
    call MPI_WIN_SYNC(win)
    call MPI_BARRIER(MPI_COMM_WORLD)
    ! Compute
    do iterx=sx-rsx+1,ex-rsx+1
      u(iterx)=(ushared(iterx-1)+ushared(iterx)+ushared(iterx+1))/3.0_dp
      !print *, rank, iterx, u(iterx), ushared(iterx-1), ushared(iterx), ushared(iterx+1)
    end do
    call MPI_BARRIER(MPI_COMM_WORLD)
  end do
  call MPI_WIN_UNLOCK_ALL(win)
  do iterp=0, nbp-1
    if (iterp == rank) then
      do iterx=1,rex-rsx+1
        print * , iter,"u", rank, iterx, u(iterx)
      end do
    end if
    call MPI_BARRIER(MPI_COMM_WORLD)
  end do
  CALL MPI_FINALIZE()
end program
After a lot of iterations the values should be equal to the rank.
But when I run it, wrong values start to appear (like -6.018996517484083E+196).
Since I am new to MPI RMA, I don't know whether it is a bug in the MPI implementation I use or whether I am doing something wrong.
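For reference, two details of that allocation are worth double-checking: MPI_WIN_ALLOCATE_SHARED takes the window size in bytes, and C_F_POINTER needs an explicit shape when the Fortran pointer is an array. A minimal sketch of how those two calls could look for this layout, reusing the variables declared above (the disp_unit of 8 is an assumption, chosen to match the element size):
  size = (rex-rsx+1) * storage_size(1.0_dp) / 8           ! window size in bytes, not elements
  CALL MPI_WIN_ALLOCATE_SHARED(size, 8, MPI_INFO_NULL, commNode, baseptr, win)
  CALL C_F_POINTER(baseptr, ushared, [rex-rsx+1])          ! shape is required for an array pointer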

Related

I got a message with this error: Function ‘areacircle’ requires an argument list. Why?

This is the program, and I get an error. I don't know why the whole output doesn't appear; I am trying to determine the area and volume for a random number.
program exercise2
!
  integer :: N,i
  type :: Values
    double precision :: radius,area,volume
  end type
!
!
  type(Values),allocatable, dimension(:) :: s
  integer :: bi
!
!Read the data to create the random number
  write(6,*) 'write your number '
  read(5,*) N
  allocate(s(N))
  bi = 3.14
!create the random number
  call random_seed()
  do i=1,N
    call random_number(s(i)%radius)
    s(i)%area=areacircle(s(i)%radius)
    s(i)%volume=volumesphere(s(i)%radius)
  end do
!
  open(15,file='radius.out',status='new')
  write(15,*) s(i)%radius
  open(16,file='output2.out',status='new')
  r = real(s(i)%radius)
!Two function
contains
  double precision function areacircle (s)
    implicit none
    double precision :: s
    do i=1 , N
      areacircle=bi*r**2
    end do
    return
  end function areacircle
!
!
  double precision function volumesphere (s)
    implicit none
    double precision :: s
    do i=1,N
      volumesphere=4/3*bi*r**3
    end do
    return
    write(16,*) r , areacircle , volumesphere
  end function volumesphere
end program exercise2
Does anyone know why?
This likely does what you want. As the computation of area and volume involves a single input that does not change, I've changed your functions to be elemental. This allows an array argument, where the function is executed for each element of the array. I also changed double precision to use the Fortran kind type parameter mechanism, because typing real(dp) is much shorter.
Finally, never write a Fortran program without the implicit none statement.
program exercise2
  implicit none ! Never write a program without this statement
  integer, parameter :: dp = kind(1.d0) ! kind type parameter
  integer n, i
  type values
    real(dp) radius, area, volume
  end type
  type(values), allocatable :: s(:)
  real(dp) bi ! integer :: bi?
  ! Read the data to create the random number
  write(6,*) 'write your number '
  read(5,*) n
  ! Validate n
  if (n < 1) stop 'Invalid number'
  allocate(s(n))
  bi = 4 * atan(1.d0) ! bi = 3.14? correctly determine pi
  call random_seed() ! Use default seeding
  call random_number(s%radius) ! Fill radii with random numbers
  s%area = areacircle(s%radius) ! Compute area
  s%volume = volumesphere(s%radius) ! Compute volume
  write(*,'(A)') '    Radii     Area   Volume'
  do i = 1, n
    write(*, '(3F9.5)') s(i)
  end do
contains
  elemental function areacircle(s) result(area)
    real(dp) area
    real(dp), intent(in) :: s
    area = bi * s**2
  end function areacircle
  elemental function volumesphere(s) result(volume)
    real(dp) volume
    real(dp), intent(in) :: s
    volume = (4 * bi / 3) * s**3
  end function volumesphere
end program exercise2
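As a quick illustration of what elemental buys you (an added example, not part of the original answer), the same function can be referenced with either a scalar or an array from inside that program:
  print *, areacircle(1.0_dp)              ! scalar argument -> scalar result
  print *, areacircle([1.0_dp, 2.0_dp])    ! array argument  -> element-wise results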

There is no specific subroutine for the generic ‘mpi_send’

subroutine collect(rank, nprocs, n_local, n_global, u_initial_local)
  use mpi
  implicit none
  integer*8 :: i_local_low, i_local_high
  integer*8 :: i_global_low, i_global_high
  integer*8 :: i_local, i_global
  integer*8 :: n_local, n_global
  real*8 :: u_initial_local(n_local)
  real*8, dimension(:), allocatable :: u_global
  integer :: procs
  integer*8 :: n_local_procs
  ! Data declarations for MPI
  integer :: ierr ! error signal variable, Standard value - 0
  integer :: rank ! process ID (pid) / Number
  integer :: nprocs ! number of processors
  ! MPI send/ receive arguments
  integer :: buffer(2)
  integer, parameter :: collect1 = 10
  integer, parameter :: collect2 = 20
  ! status variable - tells the status of send/ received calls
  ! Needed for receive subroutine
  integer, dimension(MPI_STATUS_SIZE) :: status1
  i_global_low = (rank *(n_global-1))/nprocs
  i_global_high = ((rank+1) *(n_global-1))/nprocs
  if (rank > 0) then
    i_global_low = i_global_low - 1
  end if
  i_local_low = 0
  i_local_high = i_global_high - i_global_low
  if (rank == 0) then
    allocate(u_global(1:n_global))
    do i_local = i_local_low, i_local_high
      i_global = i_global_low + i_local - i_local_low
      u_global(i_global) = u_initial_local(i_local)
    end do
    do procs = 1,nprocs-1
      call MPI_RECV(buffer, 2, MPI_INTEGER, procs, collect1, MPI_COMM_WORLD, status1, ierr)
      i_global_low = buffer(1)
      n_local_procs = buffer(2)
      call MPI_RECV(u_global(i_global_low+1), n_local_procs, MPI_DOUBLE_PRECISION, procs, collect2, MPI_COMM_WORLD, status1, ierr)
    end do
    print *, u_global
  else
    buffer(1) = i_global_low
    buffer(2) = n_local
    call MPI_SEND(buffer, 2, MPI_INTEGER, 0, collect1, MPI_COMM_WORLD, ierr)
    call MPI_SEND(u_initial_local, n_local, MPI_DOUBLE_PRECISION, 0, collect2, MPI_COMM_WORLD, ierr)
  end if
  return
end subroutine collect
I am getting the error for the MPI_SEND and MPI_RECV calls that use the collect2 tag: "There is no specific subroutine for the generic ‘mpi_recv’ at (1)", where (1) points at the end of the call (.......ierr). The MPI_SEND with the collect2 tag sends an array and the corresponding MPI_RECV receives that array.
This does not happen for the collect1 tag.
Your n_local is integer*8 but it must be integer (see How to debug Fortran 90 compile error "There is no specific subroutine for the generic 'foo' at (1)"?).
There are many articles (like https://blogs.cisco.com/performance/can-i-mpi_send-and-mpi_recv-with-a-count-larger-than-2-billion) about the problem of large arrays (more than maxint elements) and MPI. If you do have problems with n_local being too large for a default integer, you can use derived types (like MPI_Type_contiguous) to lower the number of elements passed to MPI procedures so that the count fits into a 4-byte integer.
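A minimal sketch of that workaround (my own illustration, not from the linked article), reusing n_local, u_initial_local, collect2 and ierr from the subroutine above and assuming n_local is a multiple of the block length:
  integer, parameter :: block_len = 1024     ! elements per derived-type "unit" (illustrative)
  integer :: block_type, nblocks
  call MPI_TYPE_CONTIGUOUS(block_len, MPI_DOUBLE_PRECISION, block_type, ierr)
  call MPI_TYPE_COMMIT(block_type, ierr)
  nblocks = int(n_local / block_len)         ! this count now fits in a default integer
  call MPI_SEND(u_initial_local, nblocks, block_type, 0, collect2, MPI_COMM_WORLD, ierr)
  call MPI_TYPE_FREE(block_type, ierr)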

Storing a Variable with a Multi-Dimensional Index in Fortran

Question
Consider the following code:
program example
  implicit none
  integer, parameter :: n_coeffs = 1000
  integer, parameter :: n_indices = 5
  integer :: i
  real(8), dimension(n_coeffs) :: coeff
  integer, dimension(n_coeffs,n_indices) :: index
  do i = 1, n_coeffs
    coeff(i) = real(i*3,8)
    index(i,:) = [2,4,8,16,32]*i
  end do
end
For any 5 dimensional index I need to obtain the associated coefficient, without knowing or calculating i. For instance, given [2,4,8,16,32] I need to obtain 3.0 without computing i.
Is there a reasonable solution, perhaps using sparse matrices, that would work for n_indices in the order of 100 (though n_coeffs still in the order of 1000)?
A Bad Solution
One solution would be to define a 5 dimensional array as in
real(8), dimension(2000,4000,8000,16000,32000) :: coeff2
do i = 1, n_coeffs
  coeff2(index(i,1),index(i,2),index(i,3),index(i,4),index(i,5)) = coeff(i)
end do
then, to get the coefficient associated with [2,4,8,16,32], call
coeff2(2,4,8,16,32)
However, besides being very wasteful of memory, this solution would not allow n_indices to be set to a number higher than 7 given the limit of 7 dimensions to an array.
Note: this question is a spin-off of this one. I tried to ask the question more precisely after failing in the first attempt, an effort that greatly benefited from the answer of @Rodrigo_Rodrigues.
Actual Code
In case it helps, here is the code for the actual problem I am trying to solve. It is an adaptive sparse grid method for approximating a function. The main goal is to make the interpolation at the end as fast as possible:
MODULE MOD_PARAMETERS
IMPLICIT NONE
SAVE
INTEGER, PARAMETER :: d = 2 ! number of dimensions
INTEGER, PARAMETER :: L_0 = 4 ! after this adaptive grid kicks in, for L <= L_0 usual sparse grid
INTEGER, PARAMETER :: L_max = 9 ! maximum level
INTEGER, PARAMETER :: bound = 0 ! 0 -> for f = 0 at boundary
! 1 -> adding grid points at boundary
! 2 -> extrapolating close to boundary
INTEGER, PARAMETER :: max_error = 1
INTEGER, PARAMETER :: L2_error = 1
INTEGER, PARAMETER :: testing_sample = 1000000
REAL(8), PARAMETER :: eps = 0.01D0 ! epsilon for adaptive grid
END MODULE MOD_PARAMETERS
PROGRAM MAIN
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER, DIMENSION(d,d) :: ident
REAL(8), DIMENSION(d) :: xd
INTEGER, DIMENSION(2*d) :: temp
INTEGER, DIMENSION(:,:), ALLOCATABLE :: grid_index, temp_grid_index, grid_index_new, J_index
REAL(8), DIMENSION(:), ALLOCATABLE :: coeff, temp_coeff, J_coeff
REAL(8) :: temp_min, temp_max, V, T, B, F, x1
INTEGER :: k, k_1, k_2, h, i, j, L, n, dd, L1, L2, dsize, count, first, repeated, add, ind
INTEGER :: time1, time2, clock_rate, clock_max
REAL(8), DIMENSION(L_max,L_max,2**(L_max),2**(L_max)) :: coeff_grid
INTEGER, DIMENSION(d) :: level, LL, ii
REAL(8), DIMENSION(testing_sample,d) :: x_rand
REAL(8), DIMENSION(testing_sample) :: interp1, interp2
! ============================================================================
! EXECUTABLE
! ============================================================================
ident = 0
DO i = 1,d
ident(i,i) = 1
ENDDO
! Initial grid point
dsize = 1
ALLOCATE(grid_index(dsize,2*d),grid_index_new(dsize,2*d))
grid_index(1,:) = 1
grid_index_new = grid_index
ALLOCATE(coeff(dsize))
xd = (/ 0.5D0, 0.5D0 /)
CALL FF(xd,coeff(1))
CALL FF(xd,coeff_grid(1,1,1,1))
L = 1
n = SIZE(grid_index_new,1)
ALLOCATE(J_index(n*2*d,2*d))
ALLOCATE(J_coeff(n*2*d))
CALL SYSTEM_CLOCK (time1,clock_rate,clock_max)
DO WHILE (L .LT. L_max)
L = L+1
n = SIZE(grid_index_new,1)
count = 0
first = 1
DEALLOCATE(J_index,J_coeff)
ALLOCATE(J_index(n*2*d,2*d))
ALLOCATE(J_coeff(n*2*d))
J_index = 0
J_coeff = 0.0D0
DO k = 1,n
DO i = 1,d
DO j = 1,2
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(grid_index_new(k,d+i)-(-1)**j)/)
ELSEIF (bound .EQ. 1) THEN
IF (grid_index_new(k,i) .EQ. 1) THEN
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(-(-1)**j)/)
ELSE
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(grid_index_new(k,d+i)-(-1)**j)/)
ENDIF
ENDIF
CALL XX(d,temp(1:d),temp(d+1:2*d),xd)
temp_min = MINVAL(xd)
temp_max = MAXVAL(xd)
IF ((temp_min .GE. 0.0D0) .AND. (temp_max .LE. 1.0D0)) THEN
IF (first .EQ. 1) THEN
first = 0
count = count+1
J_index(count,:) = temp
V = 0.0D0
DO k_1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k_2 = 1,d
CALL XX(1,temp(k_2),temp(d+k_2),x1)
CALL BASE(x1,grid_index(k_1,k_2),grid_index(k_1,k_2+d),B)
T = T*B
ENDDO
V = V+coeff(k_1)*T
ENDDO
CALL FF(xd,F)
J_coeff(count) = F-V
ELSE
repeated = 0
DO h = 1,count
IF (SUM(ABS(J_index(h,:)-temp)) .EQ. 0) THEN
repeated = 1
ENDIF
ENDDO
IF (repeated .EQ. 0) THEN
count = count+1
J_index(count,:) = temp
V = 0.0D0
DO k_1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k_2 = 1,d
CALL XX(1,temp(k_2),temp(d+k_2),x1)
CALL BASE(x1,grid_index(k_1,k_2),grid_index(k_1,k_2+d),B)
T = T*B
ENDDO
V = V+coeff(k_1)*T
ENDDO
CALL FF(xd,F)
J_coeff(count) = F-V
ENDIF
ENDIF
ENDIF
ENDDO
ENDDO
ENDDO
ALLOCATE(temp_grid_index(dsize,2*d))
ALLOCATE(temp_coeff(dsize))
temp_grid_index = grid_index
temp_coeff = coeff
DEALLOCATE(grid_index,coeff)
ALLOCATE(grid_index(dsize+count,2*d))
ALLOCATE(coeff(dsize+count))
grid_index(1:dsize,:) = temp_grid_index
coeff(1:dsize) = temp_coeff
DEALLOCATE(temp_grid_index,temp_coeff)
grid_index(dsize+1:dsize+count,:) = J_index(1:count,:)
coeff(dsize+1:dsize+count) = J_coeff(1:count)
dsize = dsize + count
DO i = 1,count
coeff_grid(J_index(i,1),J_index(i,2),J_index(i,3),J_index(i,4)) = J_coeff(i)
ENDDO
IF (L .LE. L_0) THEN
DEALLOCATE(grid_index_new)
ALLOCATE(grid_index_new(count,2*d))
grid_index_new = J_index(1:count,:)
ELSE
add = 0
DO h = 1,count
IF (ABS(J_coeff(h)) .GT. eps) THEN
add = add + 1
J_index(add,:) = J_index(h,:)
ENDIF
ENDDO
DEALLOCATE(grid_index_new)
ALLOCATE(grid_index_new(add,2*d))
grid_index_new = J_index(1:add,:)
ENDIF
ENDDO
CALL SYSTEM_CLOCK (time2,clock_rate,clock_max)
PRINT *, 'Elapsed real time1 = ', DBLE(time2-time1)/DBLE(clock_rate)
PRINT *, 'Grid Points = ', SIZE(grid_index,1)
! ============================================================================
! Compute interpolated values:
! ============================================================================
CALL RANDOM_NUMBER(x_rand)
CALL SYSTEM_CLOCK (time1,clock_rate,clock_max)
DO i = 1,testing_sample
V = 0.0D0
DO L1=1,L_max
DO L2=1,L_max
IF (L1+L2 .LE. L_max+1) THEN
level = (/L1,L2/)
T = 1.0D0
DO dd = 1,d
T = T*(1.0D0-ABS(x_rand(i,dd)/2.0D0**(-DBLE(level(dd)))-DBLE(2*FLOOR(x_rand(i,dd)*2.0D0**DBLE(level(dd)-1))+1)))
ENDDO
V = V + coeff_grid(L1,L2,2*FLOOR(x_rand(i,1)*2.0D0**DBLE(L1-1))+1,2*FLOOR(x_rand(i,2)*2.0D0**DBLE(L2-1))+1)*T
ENDIF
ENDDO
ENDDO
interp2(i) = V
ENDDO
CALL SYSTEM_CLOCK (time2,clock_rate,clock_max)
PRINT *, 'Elapsed real time2 = ', DBLE(time2-time1)/DBLE(clock_rate)
END PROGRAM
For any 5 dimensional index I need to obtain the associated
coefficient, without knowing or calculating i. For instance, given
[2,4,8,16,32] I need to obtain 3.0 without computing i.
function findloc_vector(matrix, vector) result(out)
  integer, intent(in) :: matrix(:, :)
  integer, intent(in) :: vector(size(matrix, dim=2))
  integer :: out, i
  do i = 1, size(matrix, dim=1)
    if (all(matrix(i, :) == vector)) then
      out = i
      return
    end if
  end do
  stop "No match for this vector"
end
And that's how you use it:
print*, coeff(findloc_vector(index, [2,4,8,16,32])) ! outputs 3.0
I must confess I was reluctant to post this code because, even though it answers your question, I honestly think this is not what you really want/need, but you didn't provide enough information for me to know what you really do want/need.
Edit (After actual code from OP):
If I decrypted your code correctly (and considering what you said in your previous question), you are declaring:
REAL(8), DIMENSION(L_max,L_max,2**(L_max),2**(L_max)) :: coeff_grid
(where L_max = 9, so size(coeff_grid) = 21233664 =~160MB) and then populating it with:
DO i = 1,count
coeff_grid(J_index(i,1),J_index(i,2),J_index(i,3),J_index(i,4)) = J_coeff(i)
ENDDO
(where count is of the order of 1000, i.e. 0.005% of its elements), so this way you can fetch the values by its 4 indices with the array notation.
Please, don't do that. You don't need a sparse matrix in this case either. The new approach you proposed is much better: store the indices in each row of a smaller array, and fetch from the array of coefficients at the location where those indices occur in their own array. This is way faster (avoiding the large allocation) and much more memory-efficient.
PS: Is it mandatory for you to stick to Fortran 90? It's a very old version of the standard, and chances are that the compiler you're using implements a more recent one. You could improve the quality of your code a lot with the intrinsic move_alloc (for fewer array copies), the kind constants from the intrinsic module iso_fortran_env (for portability), the [], >, <, <=,... notation (for readability)...
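For example, here is a minimal sketch of the move_alloc idea applied to the kind of grow-and-copy that grid_index goes through above (the helper name and the zero-fill of the new rows are my own, purely illustrative):
subroutine grow_rows(a, extra)
  implicit none
  integer, allocatable, intent(inout) :: a(:,:)
  integer, intent(in) :: extra
  integer, allocatable :: tmp(:,:)
  allocate(tmp(size(a,1) + extra, size(a,2)))
  tmp(1:size(a,1), :) = a
  tmp(size(a,1)+1:, :) = 0
  call move_alloc(tmp, a)   ! a takes over tmp's storage; tmp is deallocated, no copy back
end subroutine grow_rows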

Fortran character format string as subroutine argument

I am struggling with reading a text string in. I am using gfortran 4.9.2.
Below I have written a little subroutine to which I would like to pass the write format as an argument.
Ideally I'd like to be able to call it with
call printarray(mat1, "F8.3")
to print out a matrix mat1 in that format, for example. The number of columns should be determined automatically inside the subroutine.
subroutine printarray(x, udf_temp)
  implicit none
  real, dimension(:,:), intent(in) :: x ! array to be printed
  integer, dimension(2) :: dims ! array for shape of x
  integer :: i, j
  character(len=10) :: udf_temp ! user defined format, eg "F8.3, ...
  character(len = :), allocatable :: udf ! trimmed udf_temp
  character(len = 10) :: udf2
  character(len = 10) :: txt1, txt2
  integer :: ncols ! no. of columns of array
  integer :: udf_temp_length
  udf_temp_length = len_trim(udf_temp)
  allocate(character(len=udf_temp_length) :: udf)
  dims = shape(x)
  ncols = dims(2)
  write (txt1, '(I5)') ncols
  udf2 = trim(txt1)//adjustl(udf)
  txt2 = "("//trim(udf2)//")"
  do i = 1, dims(1)
    write (*, txt2) (x(i, j), j = 1, dims(2)) ! this is line 38
  end do
end subroutine printarray
When I set len = 10:
character(len=10) :: udf_temp
I get this compile-time warning:
call printarray(mat1, "F8.3")
1
Warning: Character length of actual argument shorter than of dummy argument 'udf_temp' (4/10) at (1)
When I set len = *
character(len=*) :: udf_temp
it compiles, but at runtime I get:
At line 38 of file where2.f95 (unit = 6, file = 'stdout')
Fortran runtime error: Unexpected element '( 8
What am I doing wrong?
Is there a neater way to do this?
Here's a summary of your question that I will try to address: You want to have a subroutine that will print a specified two-dimensional array with a specified format, such that each row is printed on a single line. For example, assume we have the real array:
real, dimension(2,8) :: x
x = reshape([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16], shape=[2,8], order=[2,1])
! Then the array is:
! 1.000 2.000 3.000 4.000 5.000 6.000 7.000 8.000
! 9.000 10.000 11.000 12.000 13.000 14.000 15.000 16.000
We want to use the format "F8.3", which prints floating point values (reals) with a field width of 8 and 3 decimal places.
Now, you are making a couple of mistakes when creating the format within your subroutine. First, you try to use udf to create the udf2 string. This is a problem because although you have allocated the size of udf, nothing has been assigned to it (pointed out in a comment by @francescalus). Thus, you see the error message you reported: Fortran runtime error: Unexpected element '( 8.
In the following, I make a couple of simplifying changes and demonstrate a few (slightly) different techniques. As shown, I suggest the use of * to indicate that the format can be applied an unlimited number of times, until all elements of the output list have been visited. Of course, explicitly stating the number of times to apply the format (i.e., "(8F8.3)" instead of "(*(F8.3))") is fine, but the latter is slightly less work.
program main
  implicit none
  real, dimension(2,8) :: x
  character(len=:), allocatable :: udf_in
  x = reshape([1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16], shape=[2,8], order=[2,1])
  udf_in = "F8.3"
  call printarray(x, udf_in)
contains
  subroutine printarray(x, udf_in)
    implicit none
    real, dimension(:,:), intent(in) :: x
    character(len=*), intent(in) :: udf_in
    integer :: ncols ! size(x,dim=2)
    character(len=10) :: ncols_str ! ncols, stringified
    integer, dimension(2) :: dims ! shape of x
    character(len=:), allocatable :: udf0, udf1 ! format codes
    integer :: i, j ! index counters
    dims = shape(x) ! or just use: ncols = size(x, dim=2)
    ncols = dims(2)
    write (ncols_str, '(i0)') ncols ! use 'i0' for min. size
    udf0 = "(" // ncols_str // udf_in // ")" ! create string: "(8F8.3)"
    udf1 = "(*(" // udf_in // "))" ! create string: "(*(F8.3))"
    print *, "Version 1:"
    do i = 1, dims(1)
      write (*, udf0) (x(i, j), j = 1,ncols) ! implied do-loop over j.
    end do
    print *, "Version 2:"
    do i = 1, dims(1)
      ! udf1: "(*(F8.3))"
      write (*, udf1) (x(i, j), j = 1,ncols) ! implied do-loop over j
    end do
    print *, "Version 3:"
    do i = 1, size(x,dim=1) ! no need to create nrows/ncols vars.
      write(*, udf1) x(i,:) ! let the compiler handle the extents.
    enddo
  end subroutine printarray
end program main
Observe: the final do-loop ("Version 3") is very simple. It does not need an explicit count of ncols because the * takes care of it automatically. Due to its simplicity, there is really no need for a subroutine at all.
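For instance, at the call site the whole routine could collapse to something like this (a sketch reusing x, i and the format literal from the program above):
  ! Print each row of x on its own line; * repeats the edit descriptor as often as needed
  do i = 1, size(x, dim=1)
    write (*, "(*(F8.3))") x(i, :)
  end do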
Besides the actual error (not using the input argument), this whole thing can be done much more simply:
subroutine printarray(m,f)
implicit none
character(len=*)f
real m(:,:)
character*10 n
write(n,'(i0)')size(m(1,:))
write(*,'('//n//f//')')transpose(m)
end subroutine
end
Note that there is no need for the loop constructs, as Fortran will automatically write the whole array, wrapping to a new line each time the format is exhausted (format reversion).
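As a toy illustration of that format reversion (my own example, not part of the answer):
program reversion_demo
  implicit none
  real :: m(2,3)
  m = reshape([1.,2.,3.,4.,5.,6.], [2,3])
  write (*, '(3F8.3)') transpose(m)   ! two records: 1.000 3.000 5.000 then 2.000 4.000 6.000
end program reversion_demo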
Alternatively, you can use a loop construct; then a '*' repeat count in the format obviates the need for the internal write that constructs the format string.
subroutine printarray(m,f)
implicit none
character(len=*)f
real m(:,:)
integer :: i
do i=1,size(m(:,1))
write(*,'(*('//f//'))')m(i,:)
enddo
end subroutine
end
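One practical note to add (not part of either answer): because m is an assumed-shape dummy argument, the caller needs an explicit interface for printarray, for example by putting the routine in a module. A minimal sketch of such a wrapper, with a hypothetical module name:
module print_utils
  implicit none
contains
  subroutine printarray(m, f)
    character(len=*), intent(in) :: f
    real, intent(in) :: m(:,:)
    character(len=10) :: n
    write (n, '(i0)') size(m, dim=2)                ! number of columns, e.g. "8"
    write (*, '(' // trim(n) // f // ')') transpose(m)  ! format becomes "(8F8.3)"
  end subroutine printarray
end module print_utils
A caller would then simply use print_utils and invoke call printarray(mat1, "F8.3") as in the question.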

Using mpi_scatterv with a 4D Fortran array

I'm trying to break up a 4D array over the third dimension, and send to each node using MPI. Basically, I'm computing derivatives of a matrix, Cpq, with respect to atom positions in each of the three Cartesian directions. Cpq is of size nat_sl x nat_sl, so dCpqdR is of size nat_sl x nat_sl x nat x 3. At the end of the day, for every s,i pair, I have to compute the matrix product of dCpqdR between the transpose of the eigenvectors of Cpq and the eigenvectors of Cpq, like so:
temp = MATMUL(TRANSPOSE(Cpq), MATMUL(dCpqdR(:, :, s, i), Cpq))
This is fine, but as it turns out, the loop over s and i is now by far the slowest part of my code. Because each (s, i) can be done independently, I was hoping that I could break up dCpqdR and give each task its own s, i to compute the derivative for. That is, I'd like task 1 to get dCpqdR(:,:,1,1), task 2 to get dCpqdR(:,:,1,2), etc.
I've got this working in some sense by using a buffered send/recv pair of calls. The root node allocates a temporary array, fills it, sends to the relevant nodes, and the relevant nodes do their computations as they wish. This is fine, but can be slow and memory inefficient. I'd ideally like to break it up in a more memory efficient way.
The logical thing to do, then, is to use mpi_scatterv, but here is where I start running into trouble, as I'm having trouble figuring out the memory layout for this. I've written this, so far:
call mpi_type_create_subarray(4, (/ nat_sl, nat_sl, nat, 3 /), (/nat_sl, nat_sl, n_pairs(me_image+1), 3/),&
(/0, 0, 0, 0/), mpi_order_fortran, mpi_double_precision, subarr_typ, ierr)
call mpi_type_commit(subarr_typ, ierr)
call mpi_scatterv(dCpqdR, n_pairs(me_image+1), f_displs, subarr_typ,&
my_dCpqdR, 3*nat_sl*3*nat_sl*3*n_pairs(me_image+1), subarr_typ,&
root_image, intra_image_comm, ierr)
I've computed n_pairs using this subroutine:
subroutine mbdvdw_para_init_int_forces()
implicit none
integer :: p, s, i, counter, k, cpu_ind
integer :: num_unique_rpq, n_pairs_per_proc, cpu
real(dp) :: Rpq(3), Rpq_norm, current_val
num_pairs = nat
if(.not.allocated(f_cpu_id)) allocate(f_cpu_id(nat, 3))
n_pairs_per_proc = floor(dble(num_pairs)/nproc_image)
cpu = 0
n_pairs = 0
counter = 1
p = 1
do counter = 0, num_pairs-1, 1
n_pairs(modulo(counter, nproc_image)+1) = n_pairs(modulo(counter, nproc_image)+1) + 1
end do
do s = 1, nat, 1
f_cpu_id(s) = cpu
if((counter.lt.num_pairs)) then
if(p.eq.n_pairs(cpu+1)) then
cpu = cpu + 1
p = 0
end if
end if
p = p + 1
end do
call mp_set_displs( n_pairs, f_displs, num_pairs, nproc_image)
f_displs = f_displs*nat_sl*nat_sl*3
end subroutine mbdvdw_para_init_int_forces
and the full method for the matrix multiplication is
subroutine mbdvdw_interacting_energy(energy, forcedR, forcedh, forcedV)
implicit none
real(dp), intent(out) :: energy
real(dp), dimension(nat, 3), intent(out) :: forcedR
real(dp), dimension(3,3), intent(out) :: forcedh
real(dp), dimension(nat), intent(out) :: forcedV
real(dp), dimension(3*nat_sl, 3*nat_sl) :: temp
real(dp), dimension(:,:,:,:), allocatable :: my_dCpqdR
integer :: num_negative, i_atom, s, i, j, counter
integer, parameter :: eigs_check = 200
integer :: subarr_typ, ierr
! lapack work variables
integer :: LWORK, errorflag
real(dp) :: WORK((3*nat_sl)*(3+(3*nat_sl)/2)), eigenvalues(3*nat_sl)
call start_clock('mbd_int_energy')
call mp_sum(Cpq, intra_image_comm)
eigenvalues = 0.0_DP
forcedR = 0.0_DP
energy = 0.0_DP
num_negative = 0
forcedV = 0.0_DP
errorflag=0
LWORK=3*nat_sl*(3+(3*nat_sl)/2)
call DSYEV('V', 'U', 3*nat_sl, Cpq, 3*nat_sl, eigenvalues, WORK, LWORK, errorflag)
if(errorflag.eq.0) then
do i_atom=1, 3*nat_sl, 1
!open (unit=eigs_check, file="eigs.tmp",action="write",status="unknown",position="append")
! write(eigs_check, *) eigenvalues(i_atom)
!close(eigs_check)
if(eigenvalues(i_atom).ge.0.0_DP) then
energy = energy + dsqrt(eigenvalues(i_atom))
else
num_negative = num_negative + 1
end if
end do
if(num_negative.ge.1) then
write(stdout, '(3X," WARNING: Found ", I3, " Negative Eigenvalues.")'), num_negative
end if
else
end if
energy = energy*nat/nat_sl
!!!!!!!!!!!!!!!!!!!!
! Forces below here. There's going to be some long parallelization business.
!!!!!!!!!!!!!!!!!!!!
call start_clock('mbd_int_forces')
if(.not.allocated(my_dCpqdR)) allocate(my_dCpqdR(nat_sl, nat_sl, n_pairs(me_image+1), 3)), my_dCpqdR = 0.0_DP
if(mbd_vdw_forces) then
do s=1,nat,1
if(me_image.eq.(f_cpu_id(s)+1)) then
do i=1,3,1
temp = MATMUL(TRANSPOSE(Cpq), MATMUL(my_dCpqdR(:, :, counter, i), Cpq))
do j=1,3*nat_sl,1
if(eigenvalues(j).ge.0.0_DP) then
forcedR(s, i) = forcedR(s, i) + 1.0_DP/(2.0_DP*dsqrt(eigenvalues(j)))*temp(j,j)
end if
end do
end do
counter = counter + 1
end if
end do
forcedR = forcedR*nat/nat_sl
do s=1,3,1
do i=1,3,1
temp = MATMUL(TRANSPOSE(Cpq), MATMUL(dCpqdh(:, :, s, i), Cpq))
do j=1,3*nat_sl,1
if(eigenvalues(j).ge.0.0_DP) then
forcedh(s, i) = forcedh(s, i) + 1.0_DP/(2.0_DP*dsqrt(eigenvalues(j)))*temp(j,j)
end if
end do
end do
end do
forcedh = forcedh*nat/nat_sl
call mp_sum(forcedR, intra_image_comm)
call mp_sum(forcedh, intra_image_comm)
end if
call stop_clock('mbd_int_forces')
call stop_clock('mbd_int_energy')
return
end subroutine mbdvdw_interacting_energy
But when run, it's complaining that
[MathBook Pro:58100] *** An error occurred in MPI_Type_create_subarray
[MathBook Pro:58100] *** reported by process [2560884737,2314885530279477248]
[MathBook Pro:58100] *** on communicator MPI_COMM_WORLD
[MathBook Pro:58100] *** MPI_ERR_ARG: invalid argument of some other kind
[MathBook Pro:58100] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[MathBook Pro:58100] *** and potentially your MPI job)
So something is going wrong, but I have no idea what. I know my description is somewhat sparse to start with, so please let me know what information would be necessary to help.
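For reference (an aside rather than a diagnosis of this particular crash): with MPI_Scatterv, sendcounts and displs are per-rank arrays, and the displacements are measured in units of the extent of the send datatype. A type built with MPI_Type_create_subarray keeps the extent of the whole array, so it normally has to be shrunk with MPI_Type_create_resized before per-rank displacements can step through the buffer. A toy sketch of that pattern on a 2D array (purely illustrative, not a fix for the 4D code above; it assumes the job is run with exactly n ranks):
program scatter_rows
  use mpi
  implicit none
  integer, parameter :: n = 4, m = 5          ! assumes nprocs == n
  real(8) :: a(n, m), row(m)
  integer :: rank, nprocs, ierr, row_t, row_resized_t
  integer(kind=MPI_ADDRESS_KIND) :: lb, real_extent
  integer :: counts(n), displs(n), i
  call MPI_Init(ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, rank, ierr)
  call MPI_Comm_size(MPI_COMM_WORLD, nprocs, ierr)
  if (rank == 0) a = reshape([(dble(i), i = 1, n*m)], [n, m])
  ! One (strided) row of the n x m array...
  call MPI_Type_create_subarray(2, [n, m], [1, m], [0, 0], &
       MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, row_t, ierr)
  ! ...resized so its extent is one real: displs(i) = i-1 then selects row i
  call MPI_Type_get_extent(MPI_DOUBLE_PRECISION, lb, real_extent, ierr)
  call MPI_Type_create_resized(row_t, 0_MPI_ADDRESS_KIND, real_extent, row_resized_t, ierr)
  call MPI_Type_commit(row_resized_t, ierr)
  counts = 1                                  ! one row per rank
  displs = [(i - 1, i = 1, n)]                ! row i starts i-1 reals into the array
  call MPI_Scatterv(a, counts, displs, row_resized_t, &
       row, m, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
  print *, 'rank', rank, 'received row:', row
  call MPI_Type_free(row_resized_t, ierr)
  call MPI_Type_free(row_t, ierr)
  call MPI_Finalize(ierr)
end program scatter_rows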