Invalid buffer pointer in MPI Fortran code - fortran

I got my parallel code (conductivityMAINp.f90 and conductivityCALp.f90) to work and have updated it below. May I ask a few more questions?
I find that my serial and parallel codes give almost the same value, but the low-order decimal digits differ. I paste one of the test results below. Is this difference in the decimal digits normal, or might there still be something wrong with the code? Should the results from the serial and parallel codes be exactly the same or not?
serial version
(-50979.1014624820,-8.548064305026142E-013)
parallel version
(-50979.0937138954,-6.321723719222822E-013)
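For what it's worth, a parallel run adds the per-rank partial sums in a different order than the serial loop does, and floating-point addition is not associative. Below is a minimal self-contained sketch of that effect (purely illustrative, not tied to this code or its data):
PROGRAM sum_order
IMPLICIT NONE
INTEGER :: i
DOUBLE PRECISION :: s_fwd, s_bwd
!Summing the same numbers forward and backward gives slightly different
!double precision results; reordering a parallel reduction does the same.
s_fwd = 0.0d0
s_bwd = 0.0d0
DO i = 1, 1000000, 1
   s_fwd = s_fwd + 1.0d0 / DBLE(i)
END DO
DO i = 1000000, 1, -1
   s_bwd = s_bwd + 1.0d0 / DBLE(i)
END DO
WRITE (*,*) s_fwd, s_bwd, s_fwd - s_bwd
END PROGRAM sum_order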
I also compared the files generated by the serial and parallel versions and found that some of them have different sizes, like the files below.
serial version
par.dat 26600
con.dat 3730147
parallel version
par.dat 266
con.dat 37623
I understand that the different processes each open these files and write their own data into them, so the contents are erased and overwritten by different processes. This is why the data in these files differ between the serial and parallel versions. Is there a way to keep the data from all processes in one file?
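For illustration, one common way to avoid the overwriting is to give every rank its own copy of each debug file and concatenate them afterwards. A minimal sketch, assuming the unit numbers used below and a world_rank obtained from MPI_Comm_rank (collective MPI-IO, e.g. MPI_File_write_at, is another option but needs larger changes):
!Sketch: rank-tagged file names, so each process writes to its own file
CHARACTER (LEN=20) :: rank_tag
WRITE (rank_tag, '(I0)') world_rank
OPEN (UNIT=23, FILE='con.'//TRIM(rank_tag)//'.dat', STATUS='UNKNOWN')
OPEN (UNIT=24, FILE='par.'//TRIM(rank_tag)//'.dat', STATUS='UNKNOWN')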
Would you please recommend some textbooks on MPI? I want to gain a better understanding of parallelization.
The conductivityMAINp.f90 source code
PROGRAM MAIN
USE MPI
USE CAL
IMPLICIT NONE
!Variables for setting up the parameters in INPUT.dat file
CHARACTER (LEN=50) :: na(2) !Array to store the names of Hamiltonian files from wannier90
INTEGER :: km(2) !k point mesh
INTEGER :: vd !Velocity direction of the Hamiltonian matrix
DOUBLE PRECISION :: fermi !Fermi energy value
DOUBLE PRECISION :: bv !Broadening value
!
!Variables for parameters in '.wout' file
INTEGER :: sta !Status of files
DOUBLE PRECISION :: rea_c(3,3) !Lattice constant of unit cell in real space
DOUBLE PRECISION :: rec_c(3,3) !Vectors of unit cell in the reciprocal space
!
!Variables for parameters in Hamiltonian ('_hr.dat') file from wannier90
INTEGER :: nu_wa !Number of wannier function
INTEGER :: nu_nr !Number of Wigner-Seitz grid point
INTEGER, ALLOCATABLE :: nd(:) !Degeneracy of each Wigner-Seitz grid point
DOUBLE PRECISION, ALLOCATABLE :: hr(:,:) !Array to store the Hamiltonian matrix information in '_hr.dat' file
!
!Internal variables
INTEGER :: i, j, k, l, n !Integer for loop
CHARACTER (LEN=100) :: str !String for transferring data
DOUBLE PRECISION :: tr(3) !Array for transferring data
DOUBLE PRECISION, ALLOCATABLE :: kp(:,:) !Array to store the Cartesian coordinate of k-point mesh
DOUBLE PRECISION, ALLOCATABLE :: ka(:,:,:) !Array to store the Cartesian coordinates of all k points
DOUBLE COMPLEX, ALLOCATABLE :: tb(:,:) !Array to store the extracted tight binding Hamiltonian matrix
DOUBLE COMPLEX, ALLOCATABLE :: ec(:,:) !Array to store the Eigen vector matrix
DOUBLE PRECISION, ALLOCATABLE :: ev(:,:) !Array to store the Eigen value on single k point
DOUBLE COMPLEX, ALLOCATABLE :: vh(:,:) !Array to store the velocity of Hamiltonian matrix
DOUBLE PRECISION :: dk(2) !Array to store the Delta kx and ky
DOUBLE COMPLEX :: sc !Sum of conductivity on all km(1) k points
DOUBLE COMPLEX, ALLOCATABLE :: ct_all(:) !Array of ct
DOUBLE COMPLEX :: ct !Sum of conductivity on all k points
DOUBLE COMPLEX :: ct_total !Sum of conductivity
!
!Parameters for timer
INTEGER :: cr, t00, t0, t !Timer variables
DOUBLE PRECISION :: ra !Timer rate
!Parameters for MPI
INTEGER :: world_size !MPI
INTEGER :: world_rank, ierr !MPI
INTEGER :: irank, j0 !MPI
!
!Initializing MPI
CALL MPI_Init(ierr)
CALL MPI_Comm_size(MPI_COMM_WORLD, world_size, ierr)
CALL MPI_Comm_rank(MPI_COMM_WORLD, world_rank, ierr)
!
!Initializing timer
IF (world_rank .EQ. 0) THEN
CALL system_clock(count_rate=cr)
ra = REAL(cr)
END IF
!
!Starting timer for reading and broadcasting all input parameters
IF (world_rank .EQ. 0) THEN
CALL system_clock(t00)
CALL system_clock(t0)
END IF
!
!Reading the parameters in the INPUT.dat file
IF (world_rank .EQ. 0) THEN
!Opening INPUT.dat file
OPEN (UNIT=3, FILE='INPUT.dat', STATUS='OLD')
!
READ (UNIT=3, FMT=*)
READ (UNIT=3, FMT='(a)') na(1)
READ (UNIT=3, FMT=*)
READ (UNIT=3, FMT='(a)') na(2)
DO i = 1, 8, 1
READ (UNIT=3, FMT=*)
END DO
READ (UNIT=3, FMT=*) km
READ (UNIT=3, FMT=*)
READ (UNIT=3, FMT=*) vd
READ (UNIT=3, FMT=*)
READ (UNIT=3, FMT=*) fermi
READ (UNIT=3, FMT=*)
READ (UNIT=3, FMT=*)
READ (UNIT=3, FMT=*)
READ (UNIT=3, FMT=*) bv
!
!Closing INPUT.dat file
CLOSE(UNIT=3)
!
!Opening files with magnetization along z axis
OPEN (UNIT=4, FILE=TRIM(ADJUSTL(na(2))), STATUS='OLD', IOSTAT=sta)
OPEN (UNIT=6, FILE=TRIM(ADJUSTL(na(1))), STATUS='OLD')
!
END IF
!
!Broadcasting parameters from rank 0 to all other ranks
CALL MPI_Bcast(na, 50*2, MPI_char, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Bcast(km, 2, MPI_int, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Bcast(vd, 1, MPI_int, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Bcast(fermi, 1, MPI_double, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Bcast(bv, 1, MPI_double, 0, MPI_COMM_WORLD, ierr)
!
!Allocating array to store Cartesian coordinates of all k points
ALLOCATE (ka(km(2),km(1),3))
!
!Initialising the array to store the Cartesian coordinates of all k points
ka = 0.0d0
!
!Reading the '.wout' file, generating coordinates of all k points and computing delta kx and ky
IF (world_rank .EQ. 0) THEN
!Reading Lattice constant in real space
DO WHILE (sta .EQ. 0)
READ (UNIT=4, FMT='(a)', IOSTAT=sta) str
IF (TRIM(ADJUSTL(str)) .EQ. 'Lattice Vectors (Ang)') THEN
DO i = 1, 3, 1
READ (UNIT=4, FMT='(a)', IOSTAT=sta) str
str = ADJUSTL(str)
READ (UNIT=str(4:), FMT=*) rea_c(i,:)
END DO
EXIT
END IF
END DO
!
!Reading Vectors of unit cell in the reciprocal space
DO WHILE (sta .EQ. 0)
READ (UNIT=4, FMT='(a)', IOSTAT=sta) str
IF (TRIM(ADJUSTL(str)) .EQ. 'Reciprocal-Space Vectors (Ang^-1)') THEN
DO i = 1, 3, 1
READ (UNIT=4, FMT='(a)', IOSTAT=sta) str
str = ADJUSTL(str)
READ (UNIT=str(4:), FMT=*) rec_c(i,:)
END DO
EXIT
END IF
END DO
!
!Closing the output file with magnetization along z axis
CLOSE (UNIT=4)
!
!Generating the Cartesian coordinates for Monkhorst k-point mesh
OPEN (UNIT=5, FILE='k_cartesian.dat', STATUS='UNKNOWN')
WRITE (UNIT=5, FMT='(I10)') km(1) * km(2)
DO i = 1, km(2), 1
DO j = 1, km(1), 1
tr(1) = 0.0d0 + 1.0d0 / DBLE(km(1)) * DBLE(j - 1)
tr(2) = 0.0d0 + 1.0d0 / DBLE(km(2)) * DBLE(i - 1)
tr(3) = 0.0d0
ka(i,j,1) = tr(1) * rec_c(1,1) + tr(2) * rec_c(2,1) +&
tr(3) * rec_c(3,1)
ka(i,j,2) = tr(1) * rec_c(1,2) + tr(2) * rec_c(2,2) +&
tr(3) * rec_c(3,2)
ka(i,j,3) = tr(1) * rec_c(1,3) + tr(2) * rec_c(2,3) +&
tr(3) * rec_c(3,3)
WRITE (UNIT=5, FMT='(F15.8,3X,F15.8,3X,F15.8)') ka(i,j,1:3)
END DO
END DO
CLOSE (UNIT=5)
!
!Computing Delta kx and ky
dk(1) = DSQRT(rec_c(1,1) ** 2 + rec_c(1,2) ** 2 + rec_c(1,3) ** 2) / DBLE(km(1))
dk(2) = DSQRT(rec_c(2,1) ** 2 + rec_c(2,2) ** 2 + rec_c(2,3) ** 2) / DBLE(km(2))
!
END IF
!
!Broadcasting lattice constants in both real and reciprocal spaces, the Cartesian coordinates of all k points and
!delta kx and ky from rank 0 to all ranks
CALL MPI_Bcast(rea_c, 3*3, MPI_double, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Bcast(rec_c, 3*3, MPI_double, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Bcast(ka, km(2)*km(1)*3, MPI_double, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Bcast(dk, 2, MPI_double, 0, MPI_COMM_WORLD, ierr)
!
!Stopping timer for reading and broadcasting all input parameters
IF (world_rank .EQ. 0) THEN
CALL system_clock(t)
WRITE (*,'(A,F10.3)') "Time for INIT (seconds):", (t - t0) / ra
END IF
!
!Starting timer for computing conductivity
IF (world_rank .EQ. 0) THEN
CALL system_clock(t0)
END IF
!
!Reading number of wannier function
IF (world_rank .EQ. 0) THEN
READ (UNIT=6, FMT=*)
READ (UNIT=6, FMT=*) nu_wa
!Reading number of Wigner-Seitz grid points in Hamiltonian file
READ (UNIT=6, FMT=*) nu_nr
!
END IF
!
!Broadcasting number of wannier function and the degeneracy of each Wigner-Seitz grid point from rank 0 to all other ranks
CALL MPI_Bcast(nu_wa, 1, MPI_int, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Bcast(nu_nr, 1, MPI_int, 0, MPI_COMM_WORLD, ierr)
!
!Allocating the array to store the degeneracy of each Wigner-Seitz grid point
ALLOCATE (nd(nu_nr))
!
!Allocating array to store k point, Hamiltonian matrix, eigen vector matrix and eigen value
!Allocating the array to store the Cartesian coordinates of k-point mesh
ALLOCATE (kp(km(1),3))
!
!Allocating the array to store the extracted tight binding Hamiltonian matrix
ALLOCATE (tb(nu_wa*km(1),nu_wa))
!
!Allocating the array to store the tight binding Eigen vector matrix
ALLOCATE (ec(nu_wa*km(1),nu_wa))
!
!Allocating the array to store the tight binding Eigen value
ALLOCATE (ev(km(1),nu_wa))
!
!Allocating array to store the velocity of Hamiltonian matrix
ALLOCATE (vh(nu_wa*km(1),nu_wa*2))
!
!Allocating the array to store Hamiltonian matrix information in '_hr.dat' file from wannier90
ALLOCATE (hr(nu_wa**2*nu_nr,7))
!
!Reading relevant information in Hamiltonian matrix
!Reading the degeneracy of each Wigner-Seitz grid point
IF (world_rank .EQ. 0) THEN
IF (MOD(nu_nr, 15) .EQ. 0) THEN
DO i = 1, nu_nr / 15, 1
READ (UNIT=6, FMT=*) nd(1+(i-1)*15:i*15)
END DO
ELSE
DO i = 1, nu_nr / 15, 1
READ (UNIT=6, FMT=*) nd(1+(i-1)*15:i*15)
END DO
READ (UNIT=6, FMT=*) nd(1+(nu_nr/15)*15:nu_nr)
END IF
!
!Reading the Hamiltonian matrix information in '_hr.dat' file
DO i = 1, nu_wa**2*nu_nr, 1
READ (UNIT=6, FMT=*) hr(i,:)
END DO
!Converting the unit number into coordinate for R in exponent term of phase factor in
!tight binding Hamiltonian matrix for magnetic moment along z axis case
DO i = 1, nu_wa**2*nu_nr, 1
tr(1) = hr(i,1) * rea_c(1,1) + hr(i,2) * rea_c(2,1) + hr(i,3) * rea_c(3,1)
tr(2) = hr(i,1) * rea_c(1,2) + hr(i,2) * rea_c(2,2) + hr(i,3) * rea_c(3,2)
tr(3) = hr(i,1) * rea_c(1,3) + hr(i,2) * rea_c(2,3) + hr(i,3) * rea_c(3,3)
hr(i,1:3) = tr
END DO
!
END IF
!Broadcasting Hamiltonian from rank 0 to all other ranks
CALL MPI_Bcast(nd, nu_nr, MPI_int, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Bcast(hr, nu_wa**2*nu_nr*7, MPI_double, 0, MPI_COMM_WORLD, ierr)
!
!Opening file that stores the total conductivity value
OPEN (UNIT=7, FILE='Conductivity.dat', STATUS='UNKNOWN')
!
!!Building up the Hamiltonian
!Initialising array used to store the total conductivity
ct = CMPLX(0.0d0, 0.0d0)
!
!opening test files
open (unit=21,file='normalisedprefactor.dat',status='unknown')
open (unit=22,file='gd.dat',status='unknown')
open (unit=23,file='con.dat',status='unknown')
open (unit=24,file='par.dat',status='unknown')
open (unit=25,file='grga.dat',status='unknown')
open (unit=26,file='nfdk.dat',status='unknown')
!Reading the Cartesian coordinates of k-point mesh
DO j = 1, km(2), 1
IF (mod(j-1, world_size) .NE. world_rank) CYCLE
DO k = 1, km(1), 1
kp(k,:) = ka(j,k,:)
END DO
!Building up Hamiltonian matrix on k points and diagonalising the matrix to obtain Eigen vectors and values
CALL HAMCON(vd,kp,nu_wa,nu_nr,km(1),nd,hr,tb,ec,ev,vh,fermi,bv,dk,sc)
!
ct = ct + sc
END DO
!
CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
IF (world_rank .EQ. 0) THEN
ALLOCATE (ct_all(world_size))
END IF
ct_all = CMPLX(0.0d0, 0.0d0)
CALL MPI_Gather(ct, 1, MPI_double_complex, ct_all, 1, MPI_double_complex, 0, MPI_COMM_WORLD, ierr)
!Writing total conductivity value into the file
IF (world_rank .EQ. 0) THEN
ct_total = CMPLX(0.0d0, 0.0d0)
DO i = 1, world_size, 1
ct_total = ct_total + ct_all(i)
END DO
WRITE (UNIT=7, FMT='(A33,$)') 'Conductivity without coefficient:'
WRITE (UNIT=7, FMT=*) ct_total
WRITE (UNIT=7, FMT='(A30,$)') 'Conductivity with coefficient:'
WRITE (UNIT=7, FMT=*) !ct_total /
END IF
!
IF (world_rank .EQ. 0) THEN
DEALLOCATE (ct_all)
END IF
!Stopping timer for computing conductivity
IF (world_rank .EQ. 0) THEN
CALL system_clock(t)
WRITE (*,'(A,f10.3)') "Time for HAM&CON (seconds):", (t-t0)/ra
WRITE (*,'(A,f10.3)') "Time for ALL (seconds):", (t-t00)/ra
END IF
!
!Finalising MPI
CALL MPI_Finalize(ierr)
!
!Deallocating array that stores the degeneracy of each Wigner-Seitz grid point
DEALLOCATE (nd)
!
!Deallocating array that stores the Hamiltonian matrix information in '_hr.dat' file
DEALLOCATE (hr)
!
!Deallocating the array to store the Cartesian coordinates of k-point mesh
DEALLOCATE (kp)
!
!Deallocating the array to store the extracted tight binding Hamiltonian matrix
DEALLOCATE (tb)
!
!Deallocating array that stores the tight binding Eigen vector matrix
DEALLOCATE (ec)
!
!Deallocating array that stores the tight binding Eigen value
DEALLOCATE (ev)
!
!Deallocating array to store the velocity of Hamiltonian matrix
DEALLOCATE (vh)
!
!Closing files with magnetization along z axis
CLOSE (UNIT=6)
!
!Closing file that stores the total conductivity
CLOSE (UNIT=7)
!
close(unit=21)
close(unit=22)
close(unit=23)
close(unit=24)
close(unit=25)
close(unit=26)
STOP
END PROGRAM MAIN
The conductivityCALp.f90 source code
MODULE CAL
!USE MKL!LAPACK
IMPLICIT NONE
CONTAINS
!Building up tight binding Hamiltonian matrix and computing eigen vector matrix and eigen value
SUBROUTINE HAMCON(vd,kp,nu_wa,nu_ws,nu_kp,nd,hr,tb,ec,ev,vh,fermi,bv,dk,sc)
!External variables
INTEGER :: vd !Velocity direction of the Hamiltonian matrix
DOUBLE PRECISION :: kp(:,:) !Array to store the Cartesian coordinate of k-point mesh
INTEGER :: nu_wa !Number of wannier function
INTEGER :: nu_ws !Number of Wigner-Seitz grid point for different magnetic moment direction cases
INTEGER, ALLOCATABLE :: nd(:) !Degeneracy of each Wigner-Seitz grid point
DOUBLE PRECISION, ALLOCATABLE :: hr(:,:) !Array to store the Hamiltonian matrix information in '_hr.dat' file
DOUBLE COMPLEX, ALLOCATABLE :: tb(:,:) !Array to store the extracted tight binding Hamiltonian matrix
DOUBLE COMPLEX, ALLOCATABLE :: ec(:,:) !Array to store the Eigen vector matrix
DOUBLE PRECISION :: ev(:,:) !Array to store the Eigen value
DOUBLE COMPLEX, ALLOCATABLE :: vh(:,:) !Array to store the velocity of Hamiltonian matrix
DOUBLE PRECISION :: fermi !Fermi energy value
DOUBLE PRECISION :: bv !Broadening value
DOUBLE PRECISION :: dk(2) !Array to store the Delta kx and ky
DOUBLE COMPLEX :: sc !Sum of conductivity on all km(1) k points
!
!Internal variables
INTEGER :: nu_kp !Number of k point passed by the main code
INTEGER :: i, j, k, l, m !Integer for loop
DOUBLE COMPLEX :: dc(3) !Array to store complex number i
DOUBLE COMPLEX, ALLOCATABLE :: tr1(:,:) !Array for transferring Hamiltonian matrix
DOUBLE COMPLEX, ALLOCATABLE :: tr2(:,:) !Array for transferring Hamiltonian matrix
DOUBLE COMPLEX, ALLOCATABLE :: tr3(:,:) !Array for transferring Hamiltonian matrix
DOUBLE COMPLEX, ALLOCATABLE :: tr4(:,:) !Array for transferring Hamiltonian matrix
!
!Variables for ZHEEV subroutine
DOUBLE COMPLEX, ALLOCATABLE :: a(:,:) !Array for transferring the Eigen vector matrix
DOUBLE PRECISION, ALLOCATABLE :: w(:) !Array for transferring the Eigen value
INTEGER :: n, lda, lwork, info !Parameters in ZHEEV subroutine
DOUBLE PRECISION, ALLOCATABLE :: rwork(:) !Parameters in ZHEEV subroutine
DOUBLE COMPLEX, ALLOCATABLE :: work(:) !Parameters in ZHEEV subroutine
!
!Variables for computing conductivity
DOUBLE COMPLEX :: gr(2) !Array to store the retarded Green functions
DOUBLE COMPLEX :: ga(2) !Array to store the advanced Green functions
DOUBLE COMPLEX :: gd(2) !Array to store the Gr - Ga
DOUBLE COMPLEX, ALLOCATABLE :: mt1(:,:) !Array for storing conjugate eigen vectors
DOUBLE COMPLEX, ALLOCATABLE :: mt2(:,:) !Array for storing eigen vectors
DOUBLE COMPLEX, ALLOCATABLE :: mt3(:,:) !Array for storing conjugate eigen vectors
DOUBLE COMPLEX, ALLOCATABLE :: mt4(:,:) !Array for storing eigen vectors
DOUBLE COMPLEX, ALLOCATABLE :: mt5(:,:) !Array for storing velocity of Hamiltonian
DOUBLE PRECISION, ALLOCATABLE :: nm(:) !Array for storage of normalising prefactor
DOUBLE COMPLEX :: oc(2) !Conductivity value on single k point
!
write(unit=24,fmt=*)vd,nu_wa,nu_ws,fermi,bv,dk(1),dk(2)
tb = CMPLX(0.0d0, 0.0d0)
dc(1) = CMPLX(0.0d0, 1.0d0)
!Allocating array to transit Hamiltonian matrix
ALLOCATE (tr1(nu_wa,nu_wa))
ALLOCATE (tr2(nu_wa,nu_wa))
ALLOCATE (tr3(nu_wa,nu_wa))
ALLOCATE (tr4(nu_wa,nu_wa))
!
!Building up Hamiltonian matrix
DO i = 1, nu_kp, 1
tr1 = CMPLX(0.0d0, 0.0d0)
DO j = 1, nu_ws, 1
tr2 = CMPLX(0.0d0, 0.0d0)
DO k = 1, nu_wa**2, 1
l = hr(k+(j-1)*nu_wa**2,4)
m = hr(k+(j-1)*nu_wa**2,5)
dc(2) = CMPLX(hr(k+(j-1)*nu_wa**2,6), hr(k+(j-1)*nu_wa**2,7))
tr2(l,m) = EXP(dc(1) * (kp(i,1)*hr(k+(j-1)*nu_wa**2,1)&
+kp(i,2)*hr(k+(j-1)*nu_wa**2,2)&
+kp(i,3)*hr(k+(j-1)*nu_wa**2,3)))&
* dc(2)
END DO
tr2 = tr2 / DBLE(nd(j))
tr1 = tr1 + tr2
END DO
DO j = 1, nu_wa, 1
l = j + (i-1) * nu_wa
DO k = 1, nu_wa, 1
tb(l,k) = tb(l,k) + tr1(j,k)
END DO
END DO
END DO
!
!Initialising the array to store the Eigen vector matrix
ec = CMPLX(0.0d0, 0.0d0)
!
!Initialising the array to store the Eigen value
ev = 0.0d0
!
!Setting up all parameters used by ZHEEV subroutine
n = nu_wa
lda = nu_wa
ALLOCATE (a(nu_wa,nu_wa)) !Transferring Eigen vector matrix
ALLOCATE (w(nu_wa)) !Transferring Eigen value
ALLOCATE (work(2*nu_wa-1))
lwork = 2 * nu_wa - 1
ALLOCATE (rwork(3*nu_wa-2))
!
!Computing Hamiltonian matrix, Eigen vector matrix and Eigen value on each k point
DO i = 1, nu_kp, 1
!Initialising parameters used by ZHEEV subroutine
a = CMPLX(0.0d0, 0.0d0)
w = 0.0d0
work = CMPLX(0.0d0, 0.0d0)
rwork = 0.0d0
!
DO j = 1, nu_wa, 1
a(j,:) = tb(j+(i-1)*nu_wa,:)
END DO
CALL ZHEEV('V','L',n,a,lda,w,work,lwork,rwork,info)
DO j = 1, nu_wa, 1
ec(1+(i-1)*nu_wa:i*nu_wa,j) = a(:,j)
END DO
ev(i,:) = w
END DO
!
!Computing the velocity of the Hamiltonian matrix
vh = CMPLX(0.0d0, 0.0d0)
DO i = 1, nu_kp, 1
tr1 = CMPLX(0.0d0, 0.0d0)
tr2 = CMPLX(0.0d0, 0.0d0)
DO j = 1, nu_ws, 1
tr3 = CMPLX(0.0d0, 0.0d0)
tr4 = CMPLX(0.0d0, 0.0d0)
DO k = 1, nu_wa**2, 1
l = hr(k+(j-1)*nu_wa**2,4)
m = hr(k+(j-1)*nu_wa**2,5)
dc(2) = CMPLX(hr(k+(j-1)*nu_wa**2,6), hr(k+(j-1)*nu_wa**2,7))
!vx
dc(3) = CMPLX(hr(k+(j-1)*nu_wa**2,1), 0.0d0)
tr3(l,m) = EXP(dc(1) * (kp(i,1)*hr(k+(j-1)*nu_wa**2,1)&
+kp(i,2)*hr(k+(j-1)*nu_wa**2,2)&
+kp(i,3)*hr(k+(j-1)*nu_wa**2,3)))&
* dc(2) * dc(1) * dc(3)
!
!Vy
dc(3) = CMPLX(hr(k+(j-1)*nu_wa**2,2), 0.0d0)
tr4(l,m) = EXP(dc(1) * (kp(i,1)*hr(k+(j-1)*nu_wa**2,1)&
+kp(i,2)*hr(k+(j-1)*nu_wa**2,2)&
+kp(i,3)*hr(k+(j-1)*nu_wa**2,3)))&
* dc(2) * dc(1) * dc(3)
!
END DO
tr3 = tr3 / DBLE(nd(j))
tr4 = tr4 / DBLE(nd(j))
!Vx
tr1 = tr1 + tr3
!
!Vy
tr2 = tr2 + tr4
!
END DO
DO j = 1, nu_wa, 1
l = j + (i-1) * nu_wa
DO k = 1, nu_wa, 1
vh(l,k) = vh(l,k) + tr1(j,k)
vh(l,k+nu_wa) = vh(l,k+nu_wa) + tr2(j,k)
END DO
END DO
END DO
!
!Computing the conductivity
!
!Allocating the arrays that store the eigen vector, velocity of Hamiltonian and normalising prefactor
ALLOCATE (mt1(1,nu_wa))
ALLOCATE (mt2(nu_wa,1))
ALLOCATE (mt3(1,nu_wa))
ALLOCATE (mt4(nu_wa,1))
ALLOCATE (mt5(nu_wa,nu_wa))
ALLOCATE (nm(nu_wa))
!
!Initialising the array that stores the conductivity values on all km(1) k points
sc = CMPLX(0.0d0, 0.0d0)
!
!Computing the conductivity
DO i = 1, nu_kp, 1
!Normalized factor part
DO j = 1, nu_wa, 1
mt1(1,:) = DCONJG(ec(1+(i-1)*nu_wa:i*nu_wa,j))
mt2(:,1) = ec(1+(i-1)*nu_wa:i*nu_wa,j)
nm(j) = REAL(SUM(MATMUL(mt1,mt2)))
WRITE (UNIT=21, FMT=*) SUM(MATMUL(mt1,mt2))
nm(j) = 1.0d0 / DSQRT(nm(j))
END DO
!
!Velocity of Hamiltonian
IF (vd .EQ. 0) THEN
mt5 = vh(1+(i-1)*nu_wa:i*nu_wa,1:nu_wa)
ELSE
mt5 = vh(1+(i-1)*nu_wa:i*nu_wa,1+nu_wa:2*nu_wa)
END IF
!
!Conductivity part
oc = CMPLX(0.0d0, 0.0d0)
DO j = 1, nu_wa, 1
gr(1) = CMPLX (1.0d0, 0.0d0) / CMPLX(fermi - ev(i,j), bv)
ga(1) = CMPLX (1.0d0, 0.0d0) / CMPLX(fermi - ev(i,j), 0.0d0 - bv)
gd(1) = gr(1) - ga(1)
mt1(1,:) = DCONJG(ec(1+(i-1)*nu_wa:i*nu_wa,j))
mt2(:,1) = ec(1+(i-1)*nu_wa:i*nu_wa,j)
DO k = 1, nu_wa, 1
gr(2) = CMPLX (1.0d0, 0.0d0) / CMPLX(fermi - ev(i,k), bv)
ga(2) = CMPLX (1.0d0, 0.0d0) / CMPLX(fermi - ev(i,k), 0.0d0 - bv)
gd(2) = gr(2) - ga(2)
mt3(1,:) = DCONJG(ec(1+(i-1)*nu_wa:i*nu_wa,k))
mt4(:,1) = ec(1+(i-1)*nu_wa:i*nu_wa,k)
oc(1) = SUM(MATMUL(mt1,MATMUL(mt5,mt4)))*SUM(MATMUL(mt3,MATMUL(mt5,mt2)))*&
gd(1)*gd(2)*nm(j)*nm(j)*nm(k)*nm(k)*dk(1)*dk(2)
write(unit=22,fmt=*) SUM(MATMUL(mt1,MATMUL(mt5,mt4))), SUM(MATMUL(mt3,MATMUL(mt5,mt2)))
write(unit=25,fmt=*) gr(1),ga(1),gr(2),ga(2)
write(unit=26,fmt=*) nm(j), nm(k), dk(1), dk(2)
oc(2) = oc(2) + oc(1)
END DO
END DO
sc = sc + oc(2)
write(unit=23,fmt=*) oc(2),sc
!
END DO
!
!Deallocating arrays used for transferring Hamiltonian
DEALLOCATE (tr1)
DEALLOCATE (tr2)
DEALLOCATE (tr3)
DEALLOCATE (tr4)
!
!Deallocating arrays used by ZHEEV subroutine
DEALLOCATE (a)
DEALLOCATE (w)
DEALLOCATE (rwork)
DEALLOCATE (work)
!
!Deallocating arrays used to transfer eigen vectors
DEALLOCATE (mt1)
DEALLOCATE (mt2)
DEALLOCATE (mt3)
DEALLOCATE (mt4)
!
!Deallocating array used to transfer velocity of Hamiltonian
DEALLOCATE (mt5)
!
!Deallocating array used to store the normalising prefactor
DEALLOCATE (nm)
!
RETURN
END SUBROUTINE HAMCON
!
END MODULE CAL

Related

Question about the MPI_SEND and MPI_RECV commands

I modified my code and put MPI_RECV before MPI_SEND. This time I did not receive any error message; however, it seems the code still encounters a deadlock. I say this because I open some files (UNIT=11, 12, 13, 14) before the MPI_RECV and MPI_SEND calls, collect data through these two calls and write the data into those files, yet no data is ever written into them. I paste my modified code below. Would you please have a look at it and give me some suggestions? Thank you so much.
PROGRAM MAIN
USE MPI
USE CAL
IMPLICIT NONE
INTEGER :: nb !Number of valence band
DOUBLE PRECISION :: me !Minimum eigen value
DOUBLE COMPLEX, ALLOCATABLE :: u_s1(:,:) !Array to store the contribution of each eigen state to the total spin orbit torque
DOUBLE COMPLEX, ALLOCATABLE :: u_s2(:,:) !Array to store the contribution of each eigen state to the total spin orbit torque
DOUBLE COMPLEX, ALLOCATABLE :: u_t1(:,:) !Array to collect the contribution of each eigen state to the total spin orbit torque from all processors
DOUBLE COMPLEX, ALLOCATABLE :: u_t2(:,:) !Array to collect the contribution of each eigen state to the total spin orbit torque from all processors
DOUBLE COMPLEX :: sr1 !Sum of Fermi surface part for spin orbit torque on all km(1) k points
DOUBLE COMPLEX :: sr2 !Sum of Fermi surface part for spin orbit torque on all km(1) k points
DOUBLE PRECISION, ALLOCATABLE, TARGET :: nme(:) !Array to store the minimum eigen value
INTEGER, ALLOCATABLE, TARGET :: nnb(:) !Array to store the number of valence band
INTEGER :: world_size !MPI
INTEGER :: world_rank, ierr !MPI
INTEGER :: irank, j0 !MPI
!
!Initializing MPI
CALL MPI_Init(ierr)
CALL MPI_Comm_size(MPI_COMM_WORLD, world_size, ierr)
CALL MPI_Comm_rank(MPI_COMM_WORLD, world_rank, ierr)
!
!Opening file that stores the total spin orbit torque value for the Fermi surface part
OPEN (UNIT=11, FILE='SOT_Surface.dat', STATUS='UNKNOWN')
!
!Opening file that stores the spin orbit torque for the Fermi surface part versus energy
OPEN (UNIT=12, FILE='SOT_Surface_sve_xz.dat', STATUS='UNKNOWN')
OPEN (UNIT=13, FILE='SOT_Surface_sve_yz.dat', STATUS='UNKNOWN')
!
!Opening file that stores the minimum eigen value and number of valence band
OPEN (UNIT=14, FILE='SOT_mineig_numval.dat', STATUS='UNKNOWN')
!
!Allocating the array used to store the contribution of each eigen state to the total spin orbit torque
ALLOCATE (u_s1(2,nu_wa*km(1)))
ALLOCATE (u_s2(2,nu_wa*km(1)))
!
!Allocating array to collect the contribution of each eigen state to the total spin orbit torque from all processors
IF (world_rank .EQ. 0) THEN
ALLOCATE (u_t1(2,nu_wa*km(1)*km(2)))
ALLOCATE (u_t2(2,nu_wa*km(1)*km(2)))
END IF
u_t1 = CMPLX(0.0d0, 0.0d0)
u_t2 = CMPLX(0.0d0, 0.0d0)
!
!Allocating array to collect the number of valence band and the minimum eigen value
IF (world_rank .EQ. 0) THEN
ALLOCATE (nme(km(2)))
ALLOCATE (nnb(km(2)))
END IF
nme = 0.0d0
nnb = 0
!
!opening test files
open (unit=21,file='normalisedprefactor.dat',status='unknown')
open (unit=22,file='gd.dat',status='unknown')
open (unit=23,file='con.dat',status='unknown')
open (unit=24,file='par.dat',status='unknown')
open (unit=25,file='grga.dat',status='unknown')
open (unit=26,file='nfdk.dat',status='unknown')
!Reading the Cartesian coordinates of k-point mesh
DO j = 1, km(2), 1
IF (mod(j-1, world_size) .NE. world_rank) CYCLE
DO k = 1, km(1), 1
kp(k,:) = ka(j,k,:)
END DO
!Building up Hamiltonian matrix on k points and diagonalising the matrix to obtain Eigen vectors and values
CALL HAMSUR(vd,kp,nu_wa,nu_nr,km(1),nd1,nd2,nd3,nd4,nd5,hr1,hr2,hr3,hr4,hr5,tb,ec,ev,fermi,an,wf,bv,dk,u_s1,u_s2,sr1,sr2,nb,me)
!
CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
IF (WORLD_RANK .EQ. 0) THEN
u_t1(1:2,1+nu_wa*km(1)*(j-1):nu_wa*km(1)*j) = u_s1
u_t2(1:2,1+nu_wa*km(1)*(j-1):nu_wa*km(1)*j) = u_s2
DO k = 1, WORLD_SIZE-1, 1
IF (j-1+k .EQ. km(2)) EXIT
l = k + 101
n = k + 102
CALL MPI_RECV(u_t1(1,1+nu_wa*km(1)*(j-1+k)), 2*nu_wa*km(1), MPI_DOUBLE_COMPLEX, k,l, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
CALL MPI_RECV(u_t2(1,1+nu_wa*km(1)*(j-1+k)), 2*nu_wa*km(1), MPI_DOUBLE_COMPLEX, k,n, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
END DO
ELSE
l = WORLD_RANK + 101
n = WORLD_RANK + 102
CALL MPI_SEND(u_s1,2*nu_wa*km(1), MPI_DOUBLE_COMPLEX, 0, l, MPI_COMM_WORLD, ierr)
CALL MPI_SEND(u_s2,2*nu_wa*km(1), MPI_DOUBLE_COMPLEX, 0, n, MPI_COMM_WORLD, ierr)
END IF
crr1 = crr1 + sr1
crr2 = crr2 + sr2
IF (WORLD_RANK .EQ. 0) THEN
nme(j-1) = me
nnb(j-1) = nb
DO k = 1, WORLD_SIZE-1, 1
IF (j-1+k .EQ. km(2)) EXIT
l = k + 103
n = k + 104
CALL MPI_RECV(nme(j-1+k), 1, MPI_DOUBLE, k,l, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
CALL MPI_RECV(nnb(j-1+k), 1, MPI_INT, k,n, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
END DO
ELSE
l = WORLD_RANK + 103
n = WORLD_RANK + 104
CALL MPI_SEND(me, 1, MPI_DOUBLE, 0, l, MPI_COMM_WORLD, ierr)
CALL MPI_SEND(nb, 1, MPI_INT, 0, n, MPI_COMM_WORLD, ierr)
END IF
END DO
!
CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
IF (world_rank .EQ. 0) THEN
ALLOCATE (crr1_all(world_size))
ALLOCATE (crr2_all(world_size))
END IF
crr1_all = CMPLX(0.0d0, 0.0d0)
crr2_all = CMPLX(0.0d0, 0.0d0)
CALL MPI_Gather(crr1, 1, MPI_double_complex, crr1_all, 1, MPI_double_complex, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Gather(crr2, 1, MPI_double_complex, crr2_all, 1, MPI_double_complex, 0, MPI_COMM_WORLD, ierr)
!Writing total conductivity value into the file
IF (world_rank .EQ. 0) THEN
crr1_total = CMPLX(0.0d0, 0.0d0)
crr2_total = CMPLX(0.0d0, 0.0d0)
DO i = 1, world_size, 1
crr1_total = crr1_total + crr1_all(i)
crr2_total = crr2_total + crr2_all(i)
END DO
!Finding the minimum eigen value
NULLIFY (p1, p2)
p1 => nme(1)
p2 => nnb(1)
DO i = 2, km(2), 1
IF (p1 .GE. nme(i)) THEN
p1 => nme(i)
END IF
IF (p2 .LE. nnb(i)) THEN
p2 => nnb(i)
END IF
END DO
WRITE (UNIT=14, FMT='(A27,$)') 'The minimum eigen value is:'
WRITE (UNIT=14, FMT=*) p1
WRITE (UNIT=14, FMT='(A30,$)') 'The number of valence band is:'
WRITE (UNIT=14, FMT=*) p2
!
!Constant for the coefficient
pi = DACOS(-1.0d0)
hb = 1.054571817d-34 !(unit - J)
es = 1.602176634d-19 !(unit - J*s)
!
WRITE (UNIT=11, FMT='(A55,$)') 'Spin Orbit Torque without coefficient within x-z plane:'
WRITE (UNIT=11, FMT=*) crr1_total
WRITE (UNIT=11, FMT='(A52,$)') 'Spin Orbit Torque with coefficient within x-z plane:'
WRITE (UNIT=11, FMT=*) crr1_total * es ** 2 * hb / 4.0d0 / pi
WRITE (UNIT=11, FMT='(A55,$)') 'Spin Orbit Torque without coefficient within y-z plane:'
WRITE (UNIT=11, FMT=*) crr2_total
WRITE (UNIT=11, FMT='(A52,$)') 'Spin Orbit Torque with coefficient within y-z plane:'
WRITE (UNIT=11, FMT=*) crr2_total * es ** 2 * hb / 4.0d0 / pi
DO i = 1, nu_wa*km(1)*km(2), 1
WRITE (UNIT=12, FMT=*) u_t1(1:2,i)
WRITE (UNIT=13, FMT=*) u_t2(1:2,i)
END DO
END IF
!
!Finalising MPI
CALL MPI_Finalize(ierr)
!
!Deallocating array that stores and collects the fermi-surface-part contribution of each eigen state to the total spin orbit torque
DEALLOCATE (u_s1)
DEALLOCATE (u_s2)
DEALLOCATE (u_t1)
DEALLOCATE (u_t2)
!
STOP
END PROGRAM MAIN
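For reference, here is a minimal self-contained pattern in which the sends and receives match up (hypothetical buffer and tag, not this code's actual data layout): every non-root rank posts exactly one MPI_SEND, and rank 0 posts one matching MPI_RECV per source rank with the same tag.
!Sketch: one message per non-root rank, received by rank 0 with a matching source/tag pair
PROGRAM sendrecv_sketch
USE MPI
IMPLICIT NONE
INTEGER :: ierr, world_rank, world_size, k
DOUBLE PRECISION :: buf
CALL MPI_Init(ierr)
CALL MPI_Comm_size(MPI_COMM_WORLD, world_size, ierr)
CALL MPI_Comm_rank(MPI_COMM_WORLD, world_rank, ierr)
buf = DBLE(world_rank)
IF (world_rank .EQ. 0) THEN
   DO k = 1, world_size-1, 1
      CALL MPI_Recv(buf, 1, MPI_DOUBLE_PRECISION, k, 101, MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
      WRITE (*,*) 'received from rank', k, ':', buf
   END DO
ELSE
   CALL MPI_Send(buf, 1, MPI_DOUBLE_PRECISION, 0, 101, MPI_COMM_WORLD, ierr)
END IF
CALL MPI_Finalize(ierr)
END PROGRAM sendrecv_sketch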

Gathering data through MPI

I use the MPI_Gather command to collect data from each processor but got the following error message (the 523rd line in MAINp.f90 is the one with the error).
forrtl: severe (174): SIGSEGV, segmentation fault occurred
Image PC Routine Line Source
sot 0000000000427FD3 Unknown Unknown Unknown
libpthread-2.26.s 00002AAAB0D1C2F0 Unknown Unknown Unknown
sot 000000000041D2AE MAIN__ 523 MAINp.f90
sot 0000000000409B92 Unknown Unknown Unknown
libc-2.26.so 00002AAAB115034A __libc_start_main Unknown Unknown
sot 0000000000409AAA Unknown Unknown Unknown
srun: error: nid01236: task 19: Exited with exit code 174
srun: Terminating job step 14213926.0
slurmstepd: error: *** STEP 14213926.0 ON nid01236 CANCELLED AT 2020-04-23T06:53:35 ***
I do not know what is wrong. I just want to collect data from each processor. I put only part of my MAINp.f90 below, and the error line follows the label (!THIS IS THE ERROR LINE). Would anyone please give me some suggestions? Thank you.
PROGRAM MAIN
USE MPI
USE CAL
IMPLICIT NONE
!Variables for setting up the parameters in INPUT.dat file
CHARACTER (LEN=50) :: na(6) !Array to store the names of Hamiltonian files from wannier90
DOUBLE PRECISION :: an !Angle interval
INTEGER :: km(2) !k point mesh
INTEGER :: vd !Velocity direction of the Hamiltonian matrix
DOUBLE PRECISION :: fermi !Fermi energy value
DOUBLE PRECISION :: wf !Energy window
DOUBLE PRECISION :: bv !Broadening value
DOUBLE PRECISION :: pi !pi
DOUBLE PRECISION :: hb !h_bar
DOUBLE PRECISION :: es !Electron volt
!
!Variables for parameters in '.wout' file
INTEGER :: sta !Status of files
DOUBLE PRECISION :: rea_c(3,3) !Lattice constant of unit cell in real space
DOUBLE PRECISION :: rec_c(3,3) !Vectors of unit cell in the reciprocal space
!
!Variables for parameters in Hamiltonian ('_hr.dat') file from wannier90
INTEGER :: nu_wa !Number of wannier function
INTEGER :: nu_nr(5) !Number of Wigner-Seitz grid point
INTEGER, ALLOCATABLE :: nd1(:) !Degeneracy of each Wigner-Seitz grid point with magnetization along z axis
INTEGER, ALLOCATABLE :: nd2(:) !Degeneracy of each Wigner-Seitz grid point with magnetization along different axes
INTEGER, ALLOCATABLE :: nd3(:) !Degeneracy of each Wigner-Seitz grid point with magnetization along different axes
INTEGER, ALLOCATABLE :: nd4(:) !Degeneracy of each Wigner-Seitz grid point with magnetization along different axes
INTEGER, ALLOCATABLE :: nd5(:) !Degeneracy of each Wigner-Seitz grid point with magnetization along different axes
DOUBLE PRECISION, ALLOCATABLE :: hr1(:,:) !Array to store the Hamiltonian matrix information in '_hr.dat' file, magnetization along z axis
DOUBLE PRECISION, ALLOCATABLE :: hr2(:,:) !Array to store the Hamiltonian matrix information in '_hr.dat' file, magnetization along other axes
DOUBLE PRECISION, ALLOCATABLE :: hr3(:,:) !Array to store the Hamiltonian matrix information in '_hr.dat' file, magnetization along other axes
DOUBLE PRECISION, ALLOCATABLE :: hr4(:,:) !Array to store the Hamiltonian matrix information in '_hr.dat' file, magnetization along other axes
DOUBLE PRECISION, ALLOCATABLE :: hr5(:,:) !Array to store the Hamiltonian matrix information in '_hr.dat' file, magnetization along other axes
!
!Internal variables
INTEGER :: i, j, k, l, n !Integer for loop
CHARACTER (LEN=100) :: str !String for transferring data
DOUBLE PRECISION :: tr(3) !Array for transferring data
DOUBLE PRECISION, ALLOCATABLE :: kp(:,:) !Array to store the Cartesian coordinate of k-point mesh
DOUBLE PRECISION, ALLOCATABLE :: ka(:,:,:) !Array to store the Cartesian coordinates of all k points
DOUBLE COMPLEX, ALLOCATABLE :: tb(:,:) !Array to store the extracted tight binding Hamiltonian matrix
DOUBLE COMPLEX, ALLOCATABLE :: ec(:,:) !Array to store the Eigen vector matrix
DOUBLE PRECISION, ALLOCATABLE :: ev(:,:) !Array to store the Eigen value on single k point
DOUBLE PRECISION :: dk(2) !Array to store the Delta kx and ky
INTEGER :: nb !Number of valence band
DOUBLE PRECISION :: me !Minimum eigen value
DOUBLE COMPLEX, ALLOCATABLE :: u_s1(:,:) !Array to store the contribution of each eigen state to the total spin orbit torque
DOUBLE COMPLEX, ALLOCATABLE :: u_s2(:,:) !Array to store the contribution of each eigen state to the total spin orbit torque
DOUBLE COMPLEX, ALLOCATABLE :: u_t1(:,:) !Array to collect the contribution of each eigen state to the total spin orbit torque from all processors
DOUBLE COMPLEX, ALLOCATABLE :: u_t2(:,:) !Array to collect the contribution of each eigen state to the total spin orbit torque from all processors
DOUBLE COMPLEX :: sr1 !Sum of Fermi surface part for spin orbit torque on all km(1) k points
DOUBLE COMPLEX :: sr2 !Sum of Fermi surface part for spin orbit torque on all km(1) k points
DOUBLE COMPLEX, ALLOCATABLE :: crr1_all(:) !Array of ct
DOUBLE COMPLEX, ALLOCATABLE :: crr2_all(:) !Array of ct
DOUBLE COMPLEX :: crr1 !Sum of conductivity on all k points
DOUBLE COMPLEX :: crr2 !Sum of conductivity on all k points
DOUBLE COMPLEX :: crr1_total !Sum of conductivity
DOUBLE COMPLEX :: crr2_total !Sum of conductivity
DOUBLE PRECISION, ALLOCATABLE, TARGET :: nme(:) !Array to store the minimum eigen value
INTEGER, ALLOCATABLE, TARGET :: nnb(:) !Array to store the number of valence band
DOUBLE PRECISION, POINTER :: p1 !Pointer used to find the minimum eigen value
INTEGER, POINTER :: p2 !Pointer used to find the number of valence band
!
!Parameters for timer
INTEGER :: cr, t00, t0, t !Timer variables
DOUBLE PRECISION :: ra !Timer rate
!Parameters for MPI
INTEGER :: world_size !MPI
INTEGER :: world_rank, ierr !MPI
INTEGER :: irank, j0 !MPI
!
!Initializing MPI
CALL MPI_Init(ierr)
CALL MPI_Comm_size(MPI_COMM_WORLD, world_size, ierr)
CALL MPI_Comm_rank(MPI_COMM_WORLD, world_rank, ierr)
!
!Allocating the array used to store the contribution of each eigen state to the total spin orbit torque
ALLOCATE (u_s1(2,nu_wa*km(1)))
ALLOCATE (u_s2(2,nu_wa*km(1)))
!
!Initialising array used to store the total conductivity
cr = CMPLX(0.0d0, 0.0d0)
!
!Allocating array to collect the contribution of each eigen state to the total spin orbit torque from all processors
IF (world_rank .EQ. 0) THEN
ALLOCATE (u_t1(2,nu_wa*km(1)*km(2)))
ALLOCATE (u_t2(2,nu_wa*km(1)*km(2)))
END IF
u_t1 = CMPLX(0.0d0, 0.0d0)
u_t2 = CMPLX(0.0d0, 0.0d0)
!
!Allocating array to collect the number of valence band and the minimum eigen value
IF (world_rank .EQ. 0) THEN
ALLOCATE (nme(km(2)))
ALLOCATE (nnb(km(2)))
END IF
nme = 0.0d0
nnb = 0
!
!Reading the Cartesian coordinates of k-point mesh
DO j = 1, km(2), 1
IF (mod(j-1, world_size) .NE. world_rank) CYCLE
DO k = 1, km(1), 1
kp(k,:) = ka(j,k,:)
END DO
!Building up Hamiltonian matrix on k points and diagonalising the matrix to obtain Eigen vectors and values
CALL HAMSUR(vd,kp,nu_wa,nu_nr,km(1),nd1,nd2,nd3,nd4,nd5,hr1,hr2,hr3,hr4,hr5,tb,ec,ev,fermi,an,wf,bv,dk,u_s1,u_s2,sr1,sr2,nb,me)
!
!THIS IS THE ERROR LINE
CALL MPI_Gather(u_s1, 2*nu_wa*km(1), MPI_DOUBLE_COMPLEX, u_t1(1:2,1+nu_wa*km(1)*(j-1):nu_wa*km(1)*j),&
2*nu_wa*km(1), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Gather(u_s2, 2*nu_wa*km(1), MPI_DOUBLE_COMPLEX, u_t2(1:2,1+nu_wa*km(1)*(j-1):nu_wa*km(1)*j),&
2*nu_wa*km(1), MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD, ierr)
crr1 = crr1 + sr1
crr2 = crr2 + sr2
CALL MPI_Gather(me, 1, MPI_DOUBLE, nme(j), 1, MPI_INT, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Gather(nb, 1, MPI_INT, nnb(j), 1, MPI_INT, 0, MPI_COMM_WORLD, ierr)
END DO
!
CALL MPI_Barrier(MPI_COMM_WORLD, ierr)
IF (world_rank .EQ. 0) THEN
ALLOCATE (crr1_all(world_size))
ALLOCATE (crr2_all(world_size))
END IF
crr1_all = CMPLX(0.0d0, 0.0d0)
crr2_all = CMPLX(0.0d0, 0.0d0)
CALL MPI_Gather(crr1, 1, MPI_double_complex, crr1_all, 1, MPI_double_complex, 0, MPI_COMM_WORLD, ierr)
CALL MPI_Gather(crr2, 1, MPI_double_complex, crr2_all, 1, MPI_double_complex, 0, MPI_COMM_WORLD, ierr)
!Writing total conductivity value into the file
IF (world_rank .EQ. 0) THEN
crr1_total = CMPLX(0.0d0, 0.0d0)
crr2_total = CMPLX(0.0d0, 0.0d0)
DO i = 1, world_size, 1
crr1_total = crr1_total + crr1_all(i)
crr2_total = crr2_total + crr2_all(i)
END DO
!Finding the minimum eigen value
NULLIFY (p1, p2)
p1 => nme(1)
p2 => nnb(1)
DO i = 2, km(2), 1
IF (p1 .GE. nme(i)) THEN
p1 => nme(i)
END IF
IF (p2 .LE. nnb(i)) THEN
p2 => nnb(i)
END IF
END DO
WRITE (UNIT=14, FMT='(A27,$)') 'The minimum eigen value is:'
WRITE (UNIT=14, FMT=*) p1
WRITE (UNIT=14, FMT='(A30,$)') 'The number of valence band is:'
WRITE (UNIT=14, FMT=*) p2
!
!Constant for the coefficient
pi = DACOS(-1.0d0)
hb = 1.054571817d-34 !(unit - J)
es = 1.602176634d-19 !(unit - J*s)
!
END IF
!
IF (world_rank .EQ. 0) THEN
DEALLOCATE (crr1_all)
DEALLOCATE (crr2_all)
END IF
!Finalising MPI
CALL MPI_Finalize(ierr)
!
!Deallocating array that stores and collects the fermi-surface-part contribution of each eigen state to the total spin orbit torque
DEALLOCATE (u_s1)
DEALLOCATE (u_s2)
DEALLOCATE (u_t1)
DEALLOCATE (u_t2)
!
STOP
END PROGRAM MAIN
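As a point of reference, here is a minimal self-contained MPI_Gather that runs cleanly (hypothetical names: every rank contributes nloc values, and the receive buffer is a plain contiguous array that is allocated, and only referenced, on the root):
!Sketch: gather nloc double-complex values per rank into one contiguous buffer on the root
PROGRAM gather_sketch
USE MPI
IMPLICIT NONE
INTEGER :: ierr, world_rank, world_size, nloc
DOUBLE COMPLEX, ALLOCATABLE :: sendbuf(:), recvbuf(:)
CALL MPI_Init(ierr)
CALL MPI_Comm_size(MPI_COMM_WORLD, world_size, ierr)
CALL MPI_Comm_rank(MPI_COMM_WORLD, world_rank, ierr)
nloc = 4
ALLOCATE (sendbuf(nloc))
sendbuf = CMPLX(DBLE(world_rank), 0.0d0)
IF (world_rank .EQ. 0) THEN
   ALLOCATE (recvbuf(nloc*world_size))
ELSE
   ALLOCATE (recvbuf(1)) !Dummy; the receive buffer is only significant on the root
END IF
CALL MPI_Gather(sendbuf, nloc, MPI_DOUBLE_COMPLEX, recvbuf, nloc, MPI_DOUBLE_COMPLEX, 0, MPI_COMM_WORLD, ierr)
IF (world_rank .EQ. 0) WRITE (*,*) recvbuf
CALL MPI_Finalize(ierr)
END PROGRAM gather_sketch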

Storing a Variable with a Multi-Dimensional Index in Fortran

Question
Consider the following code:
program example
implicit none
integer, parameter :: n_coeffs = 1000
integer, parameter :: n_indices = 5
integer :: i
real(8), dimension(n_coeffs) :: coeff
integer, dimension(n_coeffs,n_indices) :: index
do i = 1, n_coeffs
coeff(i) = real(i*3,8)
index(i,:) = [2,4,8,16,32]*i
end do
end
For any 5 dimensional index I need to obtain the associated coefficient, without knowing or calculating i. For instance, given [2,4,8,16,32] I need to obtain 3.0 without computing i.
Is there a reasonable solution, perhaps using sparse matrices, that would work for n_indices in the order of 100 (though n_coeffs still in the order of 1000)?
A Bad Solution
One solution would be to define a 5 dimensional array as in
real(8), dimension(2000,4000,8000,16000,32000) :: coeff2
do i = 1, n_coeffs
coeff2(index(i,1),index(i,2),index(i,3),index(i,4),index(i,5)) = coeff(i)
end do
then, to get the coefficient associated with [2,4,8,16,32], call
coeff2(2,4,8,16,32)
However, besides being very wasteful of memory, this solution would not allow n_indices to be set to a number higher than 7 given the limit of 7 dimensions to an array.
OBS: This question is a spin-off of this one. Having failed in my first attempt, I have tried to ask the question more precisely, an effort that greatly benefited from the answer of #Rodrigo_Rodrigues.
Actual Code
In case it helps, here is the code for the actual problem I am trying to solve. It is an adaptive sparse grid method for approximating a function. The main goal is to make the interpolation at the end as fast as possible:
MODULE MOD_PARAMETERS
IMPLICIT NONE
SAVE
INTEGER, PARAMETER :: d = 2 ! number of dimensions
INTEGER, PARAMETER :: L_0 = 4 ! after this adaptive grid kicks in, for L <= L_0 usual sparse grid
INTEGER, PARAMETER :: L_max = 9 ! maximum level
INTEGER, PARAMETER :: bound = 0 ! 0 -> for f = 0 at boundary
! 1 -> adding grid points at boundary
! 2 -> extrapolating close to boundary
INTEGER, PARAMETER :: max_error = 1
INTEGER, PARAMETER :: L2_error = 1
INTEGER, PARAMETER :: testing_sample = 1000000
REAL(8), PARAMETER :: eps = 0.01D0 ! epsilon for adaptive grid
END MODULE MOD_PARAMETERS
PROGRAM MAIN
USE MOD_PARAMETERS
IMPLICIT NONE
INTEGER, DIMENSION(d,d) :: ident
REAL(8), DIMENSION(d) :: xd
INTEGER, DIMENSION(2*d) :: temp
INTEGER, DIMENSION(:,:), ALLOCATABLE :: grid_index, temp_grid_index, grid_index_new, J_index
REAL(8), DIMENSION(:), ALLOCATABLE :: coeff, temp_coeff, J_coeff
REAL(8) :: temp_min, temp_max, V, T, B, F, x1
INTEGER :: k, k_1, k_2, h, i, j, L, n, dd, L1, L2, dsize, count, first, repeated, add, ind
INTEGER :: time1, time2, clock_rate, clock_max
REAL(8), DIMENSION(L_max,L_max,2**(L_max),2**(L_max)) :: coeff_grid
INTEGER, DIMENSION(d) :: level, LL, ii
REAL(8), DIMENSION(testing_sample,d) :: x_rand
REAL(8), DIMENSION(testing_sample) :: interp1, interp2
! ============================================================================
! EXECUTABLE
! ============================================================================
ident = 0
DO i = 1,d
ident(i,i) = 1
ENDDO
! Initial grid point
dsize = 1
ALLOCATE(grid_index(dsize,2*d),grid_index_new(dsize,2*d))
grid_index(1,:) = 1
grid_index_new = grid_index
ALLOCATE(coeff(dsize))
xd = (/ 0.5D0, 0.5D0 /)
CALL FF(xd,coeff(1))
CALL FF(xd,coeff_grid(1,1,1,1))
L = 1
n = SIZE(grid_index_new,1)
ALLOCATE(J_index(n*2*d,2*d))
ALLOCATE(J_coeff(n*2*d))
CALL SYSTEM_CLOCK (time1,clock_rate,clock_max)
DO WHILE (L .LT. L_max)
L = L+1
n = SIZE(grid_index_new,1)
count = 0
first = 1
DEALLOCATE(J_index,J_coeff)
ALLOCATE(J_index(n*2*d,2*d))
ALLOCATE(J_coeff(n*2*d))
J_index = 0
J_coeff = 0.0D0
DO k = 1,n
DO i = 1,d
DO j = 1,2
IF ((bound .EQ. 0) .OR. (bound .EQ. 2)) THEN
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(grid_index_new(k,d+i)-(-1)**j)/)
ELSEIF (bound .EQ. 1) THEN
IF (grid_index_new(k,i) .EQ. 1) THEN
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(-(-1)**j)/)
ELSE
temp = grid_index_new(k,:)+(/ident(i,:),ident(i,:)*(grid_index_new(k,d+i)-(-1)**j)/)
ENDIF
ENDIF
CALL XX(d,temp(1:d),temp(d+1:2*d),xd)
temp_min = MINVAL(xd)
temp_max = MAXVAL(xd)
IF ((temp_min .GE. 0.0D0) .AND. (temp_max .LE. 1.0D0)) THEN
IF (first .EQ. 1) THEN
first = 0
count = count+1
J_index(count,:) = temp
V = 0.0D0
DO k_1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k_2 = 1,d
CALL XX(1,temp(k_2),temp(d+k_2),x1)
CALL BASE(x1,grid_index(k_1,k_2),grid_index(k_1,k_2+d),B)
T = T*B
ENDDO
V = V+coeff(k_1)*T
ENDDO
CALL FF(xd,F)
J_coeff(count) = F-V
ELSE
repeated = 0
DO h = 1,count
IF (SUM(ABS(J_index(h,:)-temp)) .EQ. 0) THEN
repeated = 1
ENDIF
ENDDO
IF (repeated .EQ. 0) THEN
count = count+1
J_index(count,:) = temp
V = 0.0D0
DO k_1 = 1,SIZE(grid_index,1)
T = 1.0D0
DO k_2 = 1,d
CALL XX(1,temp(k_2),temp(d+k_2),x1)
CALL BASE(x1,grid_index(k_1,k_2),grid_index(k_1,k_2+d),B)
T = T*B
ENDDO
V = V+coeff(k_1)*T
ENDDO
CALL FF(xd,F)
J_coeff(count) = F-V
ENDIF
ENDIF
ENDIF
ENDDO
ENDDO
ENDDO
ALLOCATE(temp_grid_index(dsize,2*d))
ALLOCATE(temp_coeff(dsize))
temp_grid_index = grid_index
temp_coeff = coeff
DEALLOCATE(grid_index,coeff)
ALLOCATE(grid_index(dsize+count,2*d))
ALLOCATE(coeff(dsize+count))
grid_index(1:dsize,:) = temp_grid_index
coeff(1:dsize) = temp_coeff
DEALLOCATE(temp_grid_index,temp_coeff)
grid_index(dsize+1:dsize+count,:) = J_index(1:count,:)
coeff(dsize+1:dsize+count) = J_coeff(1:count)
dsize = dsize + count
DO i = 1,count
coeff_grid(J_index(i,1),J_index(i,2),J_index(i,3),J_index(i,4)) = J_coeff(i)
ENDDO
IF (L .LE. L_0) THEN
DEALLOCATE(grid_index_new)
ALLOCATE(grid_index_new(count,2*d))
grid_index_new = J_index(1:count,:)
ELSE
add = 0
DO h = 1,count
IF (ABS(J_coeff(h)) .GT. eps) THEN
add = add + 1
J_index(add,:) = J_index(h,:)
ENDIF
ENDDO
DEALLOCATE(grid_index_new)
ALLOCATE(grid_index_new(add,2*d))
grid_index_new = J_index(1:add,:)
ENDIF
ENDDO
CALL SYSTEM_CLOCK (time2,clock_rate,clock_max)
PRINT *, 'Elapsed real time1 = ', DBLE(time2-time1)/DBLE(clock_rate)
PRINT *, 'Grid Points = ', SIZE(grid_index,1)
! ============================================================================
! Compute interpolated values:
! ============================================================================
CALL RANDOM_NUMBER(x_rand)
CALL SYSTEM_CLOCK (time1,clock_rate,clock_max)
DO i = 1,testing_sample
V = 0.0D0
DO L1=1,L_max
DO L2=1,L_max
IF (L1+L2 .LE. L_max+1) THEN
level = (/L1,L2/)
T = 1.0D0
DO dd = 1,d
T = T*(1.0D0-ABS(x_rand(i,dd)/2.0D0**(-DBLE(level(dd)))-DBLE(2*FLOOR(x_rand(i,dd)*2.0D0**DBLE(level(dd)-1))+1)))
ENDDO
V = V + coeff_grid(L1,L2,2*FLOOR(x_rand(i,1)*2.0D0**DBLE(L1-1))+1,2*FLOOR(x_rand(i,2)*2.0D0**DBLE(L2-1))+1)*T
ENDIF
ENDDO
ENDDO
interp2(i) = V
ENDDO
CALL SYSTEM_CLOCK (time2,clock_rate,clock_max)
PRINT *, 'Elapsed real time2 = ', DBLE(time2-time1)/DBLE(clock_rate)
END PROGRAM
For any 5 dimensional index I need to obtain the associated
coefficient, without knowing or calculating i. For instance, given
[2,4,8,16,32] I need to obtain 3.0 without computing i.
function findloc_vector(matrix, vector) result(out)
integer, intent(in) :: matrix(:, :)
integer, intent(in) :: vector(size(matrix, dim=2))
integer :: out, i
do i = 1, size(matrix, dim=1)
if (all(matrix(i, :) == vector)) then
out = i
return
end if
end do
stop "No match for this vector"
end
And that's how you use it:
print*, coeff(findloc_vector(index, [2,4,8,16,32])) ! outputs 3.0
I must confess I was reluctant to post this code because, even though this answers your question, I honestly think this is not what you really want/need, but you didn't provide enough information for me to know what you really do want/need.
Edit (After actual code from OP):
If I decrypted your code correctly (and considering what you said in your previous question), you are declaring:
REAL(8), DIMENSION(L_max,L_max,2**(L_max),2**(L_max)) :: coeff_grid
(where L_max = 9, so size(coeff_grid) = 21233664 =~160MB) and then populating it with:
DO i = 1,count
coeff_grid(J_index(i,1),J_index(i,2),J_index(i,3),J_index(i,4)) = J_coeff(i)
ENDDO
(where count is of the order of 1000, i.e. 0.005% of its elements), so this way you can fetch the values by its 4 indices with the array notation.
Please, don't do that. You don't need a sparse matrix in this case either. The new approach you proposed is much better: storing the indices in each row of a smaller array, and fetching from the array of coefficients by the corresponding location of those indices in their own array. This is way faster (avoiding the large allocation) and much more memory-efficient.
PS: Is it mandatory for you to stick to Fortran 90? It's a very old version of the standard and chances are that the compiler you're using implements a more recent version. You could improve the quality of your code a lot with the intrinsic move_alloc (for fewer array copies), the kind constants from the intrinsic module iso_fortran_env (for portability), the [], >, <, <=,... notation (for readability)...
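For instance, here is a minimal sketch of move_alloc growing an allocatable array in place of the copy-out/copy-back pair done with temp_grid_index/temp_coeff (hypothetical sizes, not your actual data):
program move_alloc_demo
implicit none
real(8), allocatable :: coeff(:), work(:)
integer :: old_size, extra
old_size = 4
extra = 2
allocate(coeff(old_size))
coeff = 1.0d0
! grow coeff by 'extra' elements with a single copy
allocate(work(old_size + extra))
work(1:old_size) = coeff
work(old_size+1:) = 0.0d0
call move_alloc(work, coeff) ! coeff now has size old_size+extra; work is deallocated
print *, size(coeff)
end program move_alloc_demo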

Fortran error: size of variable is too large

I have a long program whose goal is to solve the matrix system Ax=b. When I run it, it reports "error: size of variable is too large".
program ddm
integer :: i,j,k
integer, parameter :: FN=1,FML=80,FMH=80
integer, parameter :: NBE=1*80*80 !NBE=FN*FML*FMH
double precision, dimension(1:3*NBE,1:3*NBE) :: AA
double precision, dimension(1:3*NBE) :: BB
double precision :: XX(3*NBE)
double precision, dimension(1:NBE) :: DSL,DSH,DNN
double precision, dimension(1:FML,1:FMH) :: DSL1,DSH1,DNN1
! Construct a block matrix
AA(1:NBE,1:NBE) = SLSL
AA(1:NBE,NBE+1:2*NBE) = SLSH
AA(1:NBE,2*NBE+1:3*NBE) = SLNN
AA(NBE+1:2*NBE,1:NBE) = SHSL
AA(NBE+1:2*NBE,NBE+1:2*NBE) = SHSH
AA(NBE+1:2*NBE,2*NBE+1:3*NBE) = SHNN
AA(2*NBE+1:3*NBE,1:NBE) = NNSL
AA(2*NBE+1:3*NBE,NBE+1:2*NBE) = NNSH
AA(2*NBE+1:3*NBE,2*NBE+1:3*NBE) = NNNN
! Construct a block matrix for boundary condition
BB(1:NBE) = SLBC
BB(NBE+1:2*NBE) = SHBC
BB(2*NBE+1:3*NBE) = NNBC
call GE(AA,BB,XX,3*NBE)
DSL = XX(1:NBE)
DSH = XX(NBE+1:2*NBE)
DNN = XX(2*NBE+1:3*NBE)
DSL1 = reshape(DSL,(/FML,FMH/))
DSH1 = reshape(DSH,(/FML,FMH/))
DNN1 = reshape(DNN,(/FML,FMH/))
open(unit=2, file='DNN2.txt', ACTION="write", STATUS="replace")
do i=1,80
write(2,'(*(F14.7))') real(DNN1(i,:))
end do
end program ddm
Note: GE(AA,BB,XX,3*NBE) is the subroutine that solves the matrix system. Below is the GE subroutine.
subroutine GE(a,b,x,n)
!===========================================================
! Solutions to a system of linear equations A*x=b
! Method: Gauss elimination (with scaling and pivoting)
!-----------------------------------------------------------
! input ...
! a(n,n) - array of coefficients for matrix A
! b(n) - array of the right hand coefficients b
! n - number of equations (size of matrix A)
! output ...
! x(n) - solutions
! comments ...
! the original arrays a(n,n) and b(n) will be destroyed
! during the calculation
!===========================================================
implicit none
integer n
double precision a(n,n),b(n),x(n)
double precision s(n)
double precision c, pivot, store
integer i, j, k, l
! step 1: begin forward elimination
do k=1, n-1
! step 2: "scaling"
! s(i) will have the largest element from row i
do i=k,n ! loop over rows
s(i) = 0.0
do j=k,n ! loop over elements of row i
s(i) = max(s(i),abs(a(i,j)))
end do
end do
! step 3: "pivoting 1"
! find a row with the largest pivoting element
pivot = abs(a(k,k)/s(k))
l = k
do j=k+1,n
if(abs(a(j,k)/s(j)) > pivot) then
pivot = abs(a(j,k)/s(j))
l = j
end if
end do
! Check if the system has a singular matrix
if(pivot == 0.0) then
write(*,*) "The matrix is singular"
return
end if
! step 4: "pivoting 2" interchange rows k and l (if needed)
if (l /= k) then
do j=k,n
store = a(k,j)
a(k,j) = a(l,j)
a(l,j) = store
end do
store = b(k)
b(k) = b(l)
b(l) = store
end if
! step 5: the elimination (after scaling and pivoting)
do i=k+1,n
c=a(i,k)/a(k,k)
a(i,k) = 0.0
b(i)=b(i)- c*b(k)
do j=k+1,n
a(i,j) = a(i,j)-c*a(k,j)
end do
end do
end do
! step 6: back substitution
x(n) = b(n)/a(n,n)
do i=n-1,1,-1
c=0.0
do j=i+1,n
c= c + a(i,j)*x(j)
end do
x(i) = (b(i)- c)/a(i,i)
end do
end subroutine GE
Turn your arrays (at least AA, BB, XX) into allocatable arrays and allocate them yourself in the code. You are hitting the memory limit for statically allocated arrays. There is a limit of 2GB on some systems if I remember correctly (experts will confirm or give the right numbers).
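A minimal sketch of that change for the three big arrays (same names as in the program; note that AA alone is 3*NBE x 3*NBE = 19200 x 19200 doubles, roughly 3 GB, so the machine still needs that much memory):
program ddm_alloc_sketch
implicit none
integer, parameter :: NBE = 1*80*80
! declare the large arrays allocatable so they live on the heap
double precision, dimension(:,:), allocatable :: AA
double precision, dimension(:), allocatable :: BB, XX
allocate(AA(3*NBE,3*NBE), BB(3*NBE), XX(3*NBE))
! ... fill AA and BB as in the original program, then: call GE(AA,BB,XX,3*NBE)
deallocate(AA, BB, XX)
end program ddm_alloc_sketch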

Using mpi_scatterv with 4D fortran array

I'm trying to break up a 4D array over the third dimension and send the pieces to each node using MPI. Basically, I'm computing derivatives of a matrix, Cpq, with respect to atom positions in each of the three cartesian directions. Cpq is of size nat_sl x nat_sl, so dCpqdR is of size nat_sl x nat_sl x nat x 3. At the end of the day, for every s,i pair, I have to compute the matrix product of dCpqdR between the transpose of the eigenvectors of Cpq and the eigenvectors of Cpq, like so:
temp = MATMUL(TRANSPOSE(Cpq), MATMUL(dCpqdR(:, :, s, i), Cpq))
This is fine, but as it turns out, the loop over s and i is now by far the slowest part of my code. Because each pair can be done independently, I was hoping that I could break up dCpqdR and give each task its own s, i to compute the derivative for. That is, I'd like task 1 to get dCpqdR(:,:,1,1), task 2 to get dCpqdR(:,:,1,2), etc.
I've got this working in some sense by using a buffered send/recv pair of calls. The root node allocates a temporary array, fills it, sends it to the relevant nodes, and the relevant nodes do their computations as they wish. This is fine, but it can be slow and memory-inefficient. I'd ideally like to break it up in a more memory-efficient way.
The logical thing to do, then, is to use mpi_scatterv, but this is where I start running into trouble, as I'm having difficulty figuring out the memory layout for it. I've written this so far:
call mpi_type_create_subarray(4, (/ nat_sl, nat_sl, nat, 3 /), (/nat_sl, nat_sl, n_pairs(me_image+1), 3/),&
(/0, 0, 0, 0/), mpi_order_fortran, mpi_double_precision, subarr_typ, ierr)
call mpi_type_commit(subarr_typ, ierr)
call mpi_scatterv(dCpqdR, n_pairs(me_image+1), f_displs, subarr_typ,&
my_dCpqdR, 3*nat_sl*3*nat_sl*3*n_pairs(me_image+1), subarr_typ,&
root_image, intra_image_comm, ierr)
I've computed n_pairs using this subroutine:
subroutine mbdvdw_para_init_int_forces()
implicit none
integer :: p, s, i, counter, k, cpu_ind
integer :: num_unique_rpq, n_pairs_per_proc, cpu
real(dp) :: Rpq(3), Rpq_norm, current_val
num_pairs = nat
if(.not.allocated(f_cpu_id)) allocate(f_cpu_id(nat, 3))
n_pairs_per_proc = floor(dble(num_pairs)/nproc_image)
cpu = 0
n_pairs = 0
counter = 1
p = 1
do counter = 0, num_pairs-1, 1
n_pairs(modulo(counter, nproc_image)+1) = n_pairs(modulo(counter, nproc_image)+1) + 1
end do
do s = 1, nat, 1
f_cpu_id(s) = cpu
if((counter.lt.num_pairs)) then
if(p.eq.n_pairs(cpu+1)) then
cpu = cpu + 1
p = 0
end if
end if
p = p + 1
end do
call mp_set_displs( n_pairs, f_displs, num_pairs, nproc_image)
f_displs = f_displs*nat_sl*nat_sl*3
end subroutine mbdvdw_para_init_int_forces
and the full method for the matrix multiplication is
subroutine mbdvdw_interacting_energy(energy, forcedR, forcedh, forcedV)
implicit none
real(dp), intent(out) :: energy
real(dp), dimension(nat, 3), intent(out) :: forcedR
real(dp), dimension(3,3), intent(out) :: forcedh
real(dp), dimension(nat), intent(out) :: forcedV
real(dp), dimension(3*nat_sl, 3*nat_sl) :: temp
real(dp), dimension(:,:,:,:), allocatable :: my_dCpqdR
integer :: num_negative, i_atom, s, i, j, counter
integer, parameter :: eigs_check = 200
integer :: subarr_typ, ierr
! lapack work variables
integer :: LWORK, errorflag
real(dp) :: WORK((3*nat_sl)*(3+(3*nat_sl)/2)), eigenvalues(3*nat_sl)
call start_clock('mbd_int_energy')
call mp_sum(Cpq, intra_image_comm)
eigenvalues = 0.0_DP
forcedR = 0.0_DP
energy = 0.0_DP
num_negative = 0
forcedV = 0.0_DP
errorflag=0
LWORK=3*nat_sl*(3+(3*nat_sl)/2)
call DSYEV('V', 'U', 3*nat_sl, Cpq, 3*nat_sl, eigenvalues, WORK, LWORK, errorflag)
if(errorflag.eq.0) then
do i_atom=1, 3*nat_sl, 1
!open (unit=eigs_check, file="eigs.tmp",action="write",status="unknown",position="append")
! write(eigs_check, *) eigenvalues(i_atom)
!close(eigs_check)
if(eigenvalues(i_atom).ge.0.0_DP) then
energy = energy + dsqrt(eigenvalues(i_atom))
else
num_negative = num_negative + 1
end if
end do
if(num_negative.ge.1) then
write(stdout, '(3X," WARNING: Found ", I3, " Negative Eigenvalues.")'), num_negative
end if
else
end if
energy = energy*nat/nat_sl
!!!!!!!!!!!!!!!!!!!!
! Forces below here. There's going to be some long parallelization business.
!!!!!!!!!!!!!!!!!!!!
call start_clock('mbd_int_forces')
if(.not.allocated(my_dCpqdR)) allocate(my_dCpqdR(nat_sl, nat_sl, n_pairs(me_image+1), 3)); my_dCpqdR = 0.0_DP
if(mbd_vdw_forces) then
do s=1,nat,1
if(me_image.eq.(f_cpu_id(s)+1)) then
do i=1,3,1
temp = MATMUL(TRANSPOSE(Cpq), MATMUL(my_dCpqdR(:, :, counter, i), Cpq))
do j=1,3*nat_sl,1
if(eigenvalues(j).ge.0.0_DP) then
forcedR(s, i) = forcedR(s, i) + 1.0_DP/(2.0_DP*dsqrt(eigenvalues(j)))*temp(j,j)
end if
end do
end do
counter = counter + 1
end if
end do
forcedR = forcedR*nat/nat_sl
do s=1,3,1
do i=1,3,1
temp = MATMUL(TRANSPOSE(Cpq), MATMUL(dCpqdh(:, :, s, i), Cpq))
do j=1,3*nat_sl,1
if(eigenvalues(j).ge.0.0_DP) then
forcedh(s, i) = forcedh(s, i) + 1.0_DP/(2.0_DP*dsqrt(eigenvalues(j)))*temp(j,j)
end if
end do
end do
end do
forcedh = forcedh*nat/nat_sl
call mp_sum(forcedR, intra_image_comm)
call mp_sum(forcedh, intra_image_comm)
end if
call stop_clock('mbd_int_forces')
call stop_clock('mbd_int_energy')
return
end subroutine mbdvdw_interacting_energy
But when I run it, it complains:
[MathBook Pro:58100] *** An error occurred in MPI_Type_create_subarray
[MathBook Pro:58100] *** reported by process [2560884737,2314885530279477248]
[MathBook Pro:58100] *** on communicator MPI_COMM_WORLD
[MathBook Pro:58100] *** MPI_ERR_ARG: invalid argument of some other kind
[MathBook Pro:58100] *** MPI_ERRORS_ARE_FATAL (processes in this communicator will now abort,
[MathBook Pro:58100] *** and potentially your MPI job)
so something is going wrong, but I have no idea what. I know my description is somewhat sparse to start with, so please let me know what information would be necessary to help.
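For reference, the Fortran interface is mpi_type_create_subarray(ndims, array_of_sizes, array_of_subsizes, array_of_starts, order, oldtype, newtype, ierror), where every subsize must lie between 1 and the corresponding size, and every start must satisfy 0 <= start <= size - subsize. Below is a minimal self-contained call with made-up dimensions (not this code's actual layout) that just builds, commits and frees such a type:
! Sketch (made-up sizes): a committed subarray type selecting one slab of a 4-D array
program subarray_sketch
use mpi
implicit none
integer :: ierr, subarr_typ
integer, parameter :: n1 = 4, n2 = 4, n3 = 6, n4 = 3
call mpi_init(ierr)
! subsizes must be >= 1 and starts must fit inside the full sizes
call mpi_type_create_subarray(4, (/n1, n2, n3, n4/), (/n1, n2, 1, n4/), (/0, 0, 0, 0/), &
     mpi_order_fortran, mpi_double_precision, subarr_typ, ierr)
call mpi_type_commit(subarr_typ, ierr)
call mpi_type_free(subarr_typ, ierr)
call mpi_finalize(ierr)
end program subarray_sketch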