I want to run a Fortran program which calls a subroutine that I want to parallelize with MPI. I know this sounds complicated, but I want to be able to specify the number of processes for each call. What I would want to use is a structure like this:
program my_program
implicit none
!Define variables
nprocs = !formula for calculating number of processes.
call my_subroutine(output,nprocs,other input vars)
end my_program
I want to run my_subroutine with the same effect as this:
mpirun -n nprocs my_subroutine.o
where my_subroutine has been compiled with 'other input vars.'
Is this possible?
Here is a simple example. I try compiling as follows:
$ mpif90 -o my_program WAVE_2D_FP_TUNER_mpi.f90 randgen.f SIMPLE_ROUTINE.f90
I try to run it like this:
$ mpirun -np (1 or 2) my_program
PROGRAM WAVE_2D_FP_TUNER_mpi
USE MPI
IMPLICIT NONE
REAL(KIND=8) :: T,PARAM(1:3),Z,ZBQLU01
REAL(KIND=8) :: ERRORS,COSTS,CMAX,CMAX_V(1:1000),THRESHOLD,Z_MIN,Z_MAX
REAL(KIND=8) :: U,S,R(1:6),MATRIX(1:15)
INTEGER :: EN,INC,I,J,M,P
INTEGER :: NPROCS,IERR
!0.8,-0.4,0.4,10,4,4,7 -- [0.003,0.534]
!0.8,-0.2,0.2,10,4,4,7 -- [0.190,0.588]
CALL MPI_INIT(IERR)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD,NPROCS,IERR)
THRESHOLD = 0.D0
EN = 81
INC = 1
Z_MIN = -2.D-1; Z_MAX = 2.D-1
T = 1.D0
PARAM(1) = 10.D0; PARAM(2) = 4.D0; PARAM(3) = 4.D0
CMAX = 7.D0 !Max that wave speed could possibly be.
CALL ZBQLINI(0.D0)
OPEN(UNIT = 1, FILE = "TUNER_F.txt")
WRITE(1,*) 'Grid Size: '
WRITE(1,*) T/(EN-1)
DO P = 1,15
S = 0
Z = Z_MIN + (1.d0/(15-1))*dble((P-1))*(Z_MAX - Z_MIN)
WRITE(1,*) 'Z: ',Z
DO I = 1,1000
DO J = 1,6
R(J) = ZBQLU01(0.D0)
END DO
!CALL PDE_WAVE_F_mpi(T,PARAM,R,Z,CMAX,EN,INC,NPROCS,U)
CALL SIMPLE_ROUTINE(T,PARAM,R,Z,CMAX,EN,INC,NPROCS,U)
IF (U<=threshold) THEN
S = S + 1.D0
ELSE
S = S + 0.D0
END IF
END DO
MATRIX(P) = (1.D0/1000)*S
END DO
DO I = 1,15
WRITE(1,*) MATRIX(I)
END DO
PRINT *,MINVAL(MATRIX)
PRINT *,MAXVAL(MATRIX)
CLOSE(1)
CALL MPI_FINALIZE(IERR)
END PROGRAM WAVE_2D_FP_TUNER_mpi
Here is the subroutine that I wish to parallelize with mpi.
SUBROUTINE SIMPLE_ROUTINE(T,PARAM,R,Z,CMAX,EN,INC,NPROCS,U)
! Outputs scalar U = T*Z*CMAX*INC*SUM(PARAM)*SUM(R)*SUM(Y)
USE MPI
IMPLICIT NONE
REAL(KIND=8), INTENT(IN) :: T,PARAM(1:3),R(1:6),Z,CMAX
INTEGER, INTENT(IN) :: EN,INC
INTEGER, INTENT(IN) :: NPROCS
REAL(KIND=8), INTENT(OUT) :: U
REAL(KIND=8) :: H,LOCAL_SUM,SUM_OF_X
REAL(KIND=8), DIMENSION(:), ALLOCATABLE :: X
INTEGER :: PX,PX_MAX,NXL,REMX,IX_OFF,P_LEFT,P_RIGHT
INTEGER :: J
INTEGER :: IERR,MYID
! Broadcast nprocs handle to all processes in MPRI_COMM_WORLD
CALL MPI_BCAST(&NPROCS, NPROCS, MPI_INT, 0, MPI_COMM_WORLD,IERR)
! Create subcommunicator SUBCOMM (Do not know how to define WORLD_GROUP?)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,WORLD_GROUP,SUBCOMM,IERR)
! Assign IDs to processes in SUBCOMM
CALL MPI_COMM_RANK(SUBCOMM,MYID,IERR)
! Give NPROCS - 1 to SUBCOMM
CALL MPI_COMM_SIZE(SUBCOMM,NPROCS-1,IERR)
H = 2.D0/(EN-1)
! LABEL THE PROCESSES FROM 1 TO PX_MAX.
PX = MYID + 1
PX_MAX = NPROCS
! SPLIT UP THE GRID IN THE X-DIRECTION.
NXL = EN/PX_MAX !nxl = 10/3 = 3
REMX = EN-NXL*PX_MAX !remx = 10-3*3 = 1
IF (PX .LE. REMX) THEN !for px = 1,nxl = 3
NXL = NXL+1 !nxl = 4
IX_OFF = (PX-1)*NXL !ix_off = 0
ELSE
IX_OFF = REMX*(NXL+1)+(PX-(REMX+1))*NXL !for px = 2 and px = 3, ix_off = 1*(3+1)+(2-(1+1))*3 = 4, ix_off = 1*(3+1)+(3-(1+1))*3 = 7
END IF
! ALLOCATE MEMORY FOR VARIOUS ARRAYS.
ALLOCATE(X(0:NXL+1))
X(:) = (/(-1.D0+DBLE(J-1+IX_OFF)*H, J=1,EN)/)
LOCAL_SUM = SUM(X(1:NXL))
CALL MPI_REDUCE(LOCAL_SUM,SUM_OF_X,1,&
MPI_DOUBLE_PRECISION,MPI_SUM,&
0,MPI_COMM_WORLD,IERR)
U = T*Z*CMAX*INC*SUM(PARAM)*SUM(R)*SUM_OF_X
DEALLOCATE(X)
CALL MPI_COMM_FREE(SUBCOMM,IERR)
CALL MPI_BARRIER(MPI_COMM_WORLD,IERR)
END SUBROUTINE SIMPLE_ROUTINE
Ultimately, I want to be able to change the number of processors used in the subroutine, where I want nprocs to be calculated from the value of EN.
A simple approach is to start the MPI app with the maximum number of processes.
Then my_subroutine will first MPI_Bcast(&nprocs, ...) and MPI_COMM_SPLIT(MPI_COMM_WORLD, ..., &subcomm) in order to create a sub communicator subcomm with nprocs
(you can use MPI_UNDEFINED so the "other" communicator will be MPI_COMM_NULL.
Then the MPI tasks that are part of subcomm will perform the computation.
Finally, MPI_Comm_free(&subcomm) and MPI_Barrier(MPI_COMM_WORLD)
From a performance point of view, note sub-communicator creation can be expensive, but hopefully not significant compared to the computation time.
If not, you'd rather revamp your algorithm so it can have nprocs tasks do the job, and the other ones waiting.
An other approach would be to start your app with one MPI task, MPI_Comm_spawn() nprocs-1 tasks, merge the inter-communicator, perform the computation, and terminates the spawned tasks.
The overhead of task creation is way more important, and this might not be fully supported by your resource manager, so I would not advise this option.
Related
Let's say that you separated your domain according to the number of ranks (with mpi_cart_create) then you have the indices of each domain. Then I want to use gatherv to put back a global array in order to write the results. This code seems to work for 2 or 4 processes but doesn't for 8. I feel like I am missing something. There should be no error message.
Edit : each rank contains its local array. Its elements are equal to the rank of the process (from 0 to n_procs). What I am trying to archieve is to put back the global array with gatherv. This is for a finite volume code. My aim is just to reproduce the domain decomposition and what I desire with gatherv. Here, there should be no -10 in the print and the block should be clearly dissociable with the decomposition done by mpi.
The code works as follow : I separate my domain, compute the delimitation/indices associated to each domain, definite my new type that will be received in the global array, then calculate the displacement and use gatherv.
program scatter
use mpi
implicit none
integer,parameter :: nrows = 8,ncols = 8
integer, dimension(8,8) :: global
integer, dimension(2) :: dims_orig,dims,coo,sizes,subsizes,coo_init
logical,dimension(2) :: periods
integer, dimension(:,:),allocatable :: local
integer, dimension(:),allocatable :: displ,counts
integer :: info,code,rank,n_procs,comm2d
integer :: ndi,ndj,nni,nnj,nni_last,nnj_last
integer :: i1,i2,j1,j2,ii1,ii2,jj1,jj2,row,col
integer :: type0,type_subarray,sizeofint,i,j
integer :: imax,jmax,loc_size
integer(kind=mpi_address_kind) :: start,extend
logical :: reorder
! dimensions of the global array
imax = nrows
jmax = ncols
! default value of the global array
global = -10
call mpi_init(code)
call mpi_comm_rank(mpi_comm_world,rank,code)
call mpi_comm_size(mpi_comm_world,n_procs,code)
dims_orig = 0
CALL MPI_DIMS_CREATE(n_procs ,2,dims_orig,code)
! number of domain in each direction
ndi = dims_orig(1)
ndj = dims_orig(2)
! nni/nnj = sizes of each domain
nni = nrows/ndi
nnj = ncols/ndj
nni_last = nrows-(ndi-1)*nrows/ndi
nnj_last = ncols-(ndj-1)*ncols/ndj
dims(1) = ndi
dims(2) = ndj
periods = .false.
reorder = .true.
call mpi_cart_create(MPI_COMM_WORLD,2,dims,periods,reorder,comm2d,code)
call mpi_comm_rank(comm2d,rank,code)
call mpi_cart_get(comm2d,2,dims,periods,coo,code)
if(coo(1)==ndi-1) then
nni = nni_last
i2 = imax
i1 = i2-nni+1
else
i1 = rank/ndj*nni+1
i2 = i1+nni-1
endif
if(coo(2)==ndj-1) then
nnj = nnj_last
j2 = jmax
j1 = j2-nnj+1
else
j1 = MOD(rank,ndj)*nnj+1
j2 = j1+nnj-1
endif
print*,rank,"|",i1,i2,j1,j2
call mpi_barrier(mpi_comm_world,code)
! create new types
sizes = [imax, jmax]
subsizes = [i2-i1+1,j2-j1+1]
coo_init = [i1-1 ,j1-1 ]
call mpi_type_create_subarray(2,sizes,subsizes,coo_init,&
mpi_order_fortran,MPI_integer,type0,code)
call mpi_type_size(MPI_INTEGER,sizeofint,code)
start = 0
extend = sizeofint*nnj
! call mpi_type_get_extent(type0,start,extend,code)
call MPI_TYPE_CREATE_RESIZED(TYPE0,start,extend,TYPE_SUBARRAY,info)
!type_subarray = type0
call MPI_TYPE_COMMIT(TYPE_SUBARRAY,code)
allocate(displ(ndi*ndj))
! forall(col=1:ndj,row=1:ndi)
! displ(1+(row-1)+(col-1)*ndi) = (row-1)+(col-1)*imax
! ! displ(1+(row-1)*ndi+(col-1)) = (row-1)*ndi+(col-1)
! endforall
! computing the displacement
do i =1,ndi
do j = 1,ndj
displ(1+(i-1)+(j-1)*ndj) = (i-1) + (j-1)*imax
enddo
enddo
allocate(local(i1:i2,j1:j2))
allocate(counts(n_procs))
counts = 1
local = rank
loc_size = (i2-i1+1)*(j2-j1+1)!(i2-i1+1)*(j2-j1+1)
call mpi_gatherv(local,loc_size,MPI_integer,&
global,counts,displ,type_subarray,0,&
MPI_COMM_WORLD,code)
call mpi_barrier(mpi_comm_world,code)
if(rank==0) then
do i = 1,nrows
print*,global(i,:)
enddo
endif
call mpi_finalize(code)
endprogram scatter
I parallelized three nested-loops with MPI. When I ran the code, an error popped up, saying 'srun: error: Unable to create step for job 20258899: More processors requested than permitted'
Here is the script that I used to submit job.
#!/bin/bash
#SBATCH --partition=workq
#SBATCH --job-name="code"
#SBATCH --nodes=2
#SBATCH --time=1:00:00
#SBATCH --exclusive
#SBATCH --err=std.err
#SBATCH --output=std.out
#---#
module switch PrgEnv-cray PrgEnv-intel
export OMP_NUM_THREADS=1
#---#
echo "The job "${SLURM_JOB_ID}" is running on "${SLURM_JOB_NODELIST}
#---#
srun --ntasks=1000 --cpus-per-task=${OMP_NUM_THREADS} --hint=nomultithread ./example_parallel
I paste my code below. Would anyone please tell me what problem is with my code? Is the MPI that I used wrong or not? Thank you very much.
PROGRAM THREEDIMENSION
USE MPI
IMPLICIT NONE
INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(p=15,r=14)
INTEGER :: i, j, k, le(3)
REAL (KIND=dp), ALLOCATABLE :: kp(:,:,:,:), kpt(:,:), col1(:), col2(:)
REAL (KIND=dp) :: su, co, tot
INTEGER :: world_size, world_rank, ierr
INTEGER :: world_comm_1st, world_comm_2nd, world_comm_3rd
INTEGER :: th3_dimension_size, th3_dimension_size_max, th3_dimension_rank
INTEGER :: th2_dimension_size, th2_dimension_size_max, th2_dimension_rank
INTEGER :: th1_dimension_size, th1_dimension_size_max, th1_dimension_rank
INTEGER :: proc_1st_dimension_len, proc_2nd_dimension_len, proc_3rd_last_len, proc_i, proc_j, proc_k
REAL (KIND=dp) :: t0, t1
CALL MPI_INIT(ierr)
CALL MPI_COMM_SIZE(MPI_COMM_WORLD, world_size, ierr)
CALL MPI_COMM_RANK(MPI_COMM_WORLD, world_rank, ierr)
IF (world_rank == 0) THEN
t0 = MPI_WTIME()
END IF
le(1) = 1000
le(2) = 600
le(3) = 900
ALLOCATE (kp(le(1),le(2),le(3),3))
ALLOCATE (kpt(le(3),3))
ALLOCATE (col1(le(1)))
ALLOCATE (col2(le(2)))
DO i = 1, le(1), 1
DO j = 1, le(2), 1
DO k = 1, le(3), 1
kp(i,j,k,1) = DBLE(i+j+j+1)
kp(i,j,k,2) = DBLE(i+j+k+2)
kp(i,j,k,3) = DBLE(i+j+k+3)
END DO
END DO
END DO
proc_1st_dimension_len = (world_size - 1) / le(1) + 1
proc_2nd_dimension_len = (world_size - 1 / (le(1) + le(2))) + 1
proc_3rd_last_len = MOD(world_size - 1, le(1)+le(2)) + 1
IF (world_rank <= proc_3rd_last_len*proc_2nd_dimension_len*proc_1st_dimension_len) THEN
proc_i = MOD(world_rank,proc_1st_dimension_len)
proc_j = world_rank / proc_1st_dimension_len
proc_k = world_rank / (proc_1st_dimension_len*proc_2nd_dimension_len)
ELSE
proc_i = MOD(world_rank-proc_3rd_last_len,proc_1st_dimension_len-1)
proc_j = (world_rank-proc_3rd_last_len) / proc_1st_dimension_len-1
proc_k = (world_rank-proc_3rd_last_len) / (proc_2nd_dimension_len*proc_2nd_dimension_len-1)
END IF
CALL MPI_BARRIER(MPI_COMM_WORLD,ierr)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,proc_i,world_rank,world_comm_1st,ierr)
CALL MPI_COMM_SIZE(world_comm_1st,th1_dimension_size,ierr)
CALL MPI_COMM_RANK(world_comm_1st,th1_dimension_rank,ierr)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,proc_j,world_rank,world_comm_2nd,ierr)
CALL MPI_COMM_SIZE(world_comm_2nd,th2_dimension_size,ierr)
CALL MPI_COMM_RANK(world_comm_2nd,th2_dimension_rank,ierr)
CALL MPI_COMM_SPLIT(MPI_COMM_WORLD,proc_k,world_rank,world_comm_3rd,ierr)
CALL MPI_COMM_SIZE(world_comm_3rd,th3_dimension_size,ierr)
CALL MPI_COMM_RANK(world_comm_3rd,th3_dimension_rank,ierr)
CALL MPI_BARRIER(MPI_COMM_WORLD,ierr)
CALL MPI_ALLREDUCE(th1_dimension_size,th1_dimension_size_max,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD,ierr)
CALL MPI_ALLREDUCE(th2_dimension_size,th2_dimension_size_max,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD,ierr)
IF (world_rank == 0) THEN
OPEN (UNIT=3, FILE='out.dat', STATUS='UNKNOWN')
END IF
col1 = 0.0
DO i = 1, le(1), 1
IF (MOD(i-1,th1_dimension_size_max) /= th1_dimension_rank) CYCLE
col2 = 0.0
DO j = 1, le(2), 1
IF (MOD(j-1,th2_dimension_size_max) /= th2_dimension_rank) CYCLE
kpt = kp(i,j,:,:)
su = 0.0
DO k = 1, le(3), 1
IF(MOD(k-1,th1_dimension_size*th2_dimension_size) /= th3_dimension_rank) CYCLE
CALL CAL(kpt(k,3),co)
su = su + co
END DO
CALL MPI_BARRIER(world_comm_3rd,ierr)
CALL MPI_REDUCE(su,col2(j),1,MPI_DOUBLE,MPI_SUM,0,world_comm_3rd,ierr)
END DO
CALL MPI_BARRIER(world_comm_2nd,ierr)
CALL MPI_REDUCE(col2,col1(i),le(2),MPI_DOUBLE,MPI_SUM,0,world_comm_2nd,ierr)
END DO
CALL MPI_BARRIER(world_comm_1st,ierr)
tot = 0.0
IF (th1_dimension_rank == 0) THEN
CALL MPI_REDUCE(col1,tot,le(1),MPI_DOUBLE,MPI_SUM,0,world_comm_1st,ierr)
WRITE (UNIT=3, FMT=*) tot
CLOSE (UNIT=3)
END IF
DEALLOCATE (kp)
DEALLOCATE (kpt)
DEALLOCATE (col1)
DEALLOCATE (col2)
IF (world_rank == 0) THEN
t1 = MPI_WTIME()
WRITE (UNIT=3, FMT=*) 'Total time:', t1 - t0, 'seconds'
END IF
CALL MPI_FINALIZE (ierr)
STOP
END PROGRAM THREEDIMENSION
SUBROUTINE CAL(arr,co)
IMPLICIT NONE
INTEGER, PARAMETER :: dp=SELECTED_REAL_KIND(p=15,r=14)
INTEGER :: i
REAL (KIND=dp) :: arr(3), co
co = 0.0d0
co = co + (arr(1) ** 2 + arr(2) * 3.1d1) / (arr(3) + 5.0d-1)
RETURN
END SUBROUTINE CAL
With the #SBATCH directives in the header of the file, you request two nodes explicitly, and, as you do not specify --ntasks, you get the default of one task per node, so you implicitly request two tasks.
Then, when the job starts, your srun line tries to "use" 1000 tasks. You should have a line
#SBATCH --ntasks=1000
in the header as suggested per #Gilles. The srun command will inherit from that 1000 tasks by default so there is no need to specify it there in this case.
Also, if ${OMP_NUM_THREADS} were not 1, you would have to specify the --cpu-per-tasks in the header as a SBATCH directive otherwise you will face the same error.
I have a fortran code that computes the solution vector using the thomas algorithm subroutine.
I want the solution vector to run in a loop for a certain number of time.
How do i call this subroutine in the loop?
my subroutine is the thomas algorithm subroutine.
It returns the solution vector u but I want it to use the vectors NN times in a loop. So the old u becomes the new u to use in the subroutine.
How do I do this?
Below is the what i tried
program thomasalg2
implicit double precision(A-H,O-Z)
real*8, dimension(9,1) :: a,b,c,r,u,uold!the dimension is subject to change depending on the size of the new matrix
!real*8, dimension(9,50) :: W
real*8 :: pi
real*8 :: h,k,lm,l,T
integer :: i,j,al,NN,n
l = 1!right endpoint on the X-axis
n = 9 !number of rols/cols of the coefficient matrix with boundaries included
T = 0.5 !maximum number of the time variable
NN = 50!number of time steps
np = n
h = l/n
k = T/NN
al = 1.0D0 !alpha
pi = dacos(-1.0D0)
lm = (al**2)*(k/(h**2)) !lambda
do i = 1,n
r(i,1) = sin(pi*i*h) !this is W_0
end do
a(1,1) = 0.0D0
do i = 2,n
a(i,1) = -lm
end do
do i = 1,n
b(i,1) = 1 + (2*lm)
end do
c(9,1) = 0.0D0
do i = 1,n-1
c(i,1) = -lm
end do
!the 3 diagonals are stored in the 1st, 2nd, 3rd & 4th files respectively.
open(10, file = 'thom1.txt')
open(11, file = 'thom2.txt')
open(12, file = 'thom3.txt')
open(13, file = 'thom4.txt')
write(10,*)
do i = 1,n
write(10,*) a(i,1)
end do
write(11,*)
do i = 1,n
write(11,*) b(i,1)
end do
write(12,*)
do i = 1,n
write(12,*) c(i,1)
end do
write(13,*)
do i = 1,n
write(13,*) r(i,1)
end do
open(14, file = 'tridag2.txt')
write(14,*)
n = 9
do i = 1,n
write(14,*) a(i,1),b(i,1),c(i,1),r(i,1) !write the given vectors in the file in the form of a column vector
end do
call tridag(a,b,c,r,u,n)
!solve the given system and return the solution vector u
do i = 1,NN
call tridag(a,b,c,r,u,n)
!write(15,*) u
r = u
end do
open(15, file = 'tridag2u.txt')
write(15,*)
!write the solution vector in the form of a column vector
do i = 1,n
write(15,*) u(i,1)
end do
!print *, "Your data has been written in 'tridag2.txt'"
end program thomasalg2
subroutine tridag(a,b,c,r,u,n)
implicit double precision (A-H, O-Z)
integer n, NMAX
real*8 a(n), b(n), c(n), r(n), u(n)
parameter (NMAX = 500)
integer j
real*8 bet, gam(NMAX)
if(b(1).eq.0.) stop "tridag: rewrite equations"
bet = b(1)
u(1)=r(1)/bet
do j = 2,n
gam(j) = c(j-1)/bet
bet = b(j)-a(j)*gam(j)
if (bet.eq.0.) stop "tridag failed"
u(j) = (r(j)-a(j)*u(j-1))/bet
end do
do j = n-1,1,-1
u(j) = u(j)-gam(j+1)*u(j+1)
end do
!print *, "The solution is", u
return
end subroutine
I have 4 .mtx files that I am reading the values from. Two of them run perfectly when read from with no issues and produce the correct outputs into a .DAT file. However, the last 2 are extremely large files; it appears the code correctly reads from the files and runs, but I get no outputs and no errors when reading from these 2...not even the code timer prints the time. Any help is much appreciated! Here is the code:
program proj2matrixC40
implicit none
integer,parameter::dp=selected_real_kind(15,307)
! Set Global Variables
real(kind=dp), allocatable::Ax(:,:),A(:,:),Iglobal(:,:)
integer::At(1,3)
integer::nnz,w,n,k,ii,ff,kk
real(kind=dp)::t1,t2
call cpu_time(t1)
open(unit=78,file="e40r5000.mtx",status='old')
read(78,*) At
close(unit=78)
nnz = At(1,3)
n = At(1,1)
k = 40
kk = 35
allocate(Ax(nnz+1,3),A(nnz,3),Iglobal(k,k))
open(unit=61,file="e40r5000.mtx",status='old')
do w=1,nnz+1
read(61,*) Ax(w,:)
end do
open (unit = 53, file = "proj2matrixC40points.dat")
do ff=1,k
do ii=1,k
Iglobal(ii,ff) = (ii/ff)*(ff/ii)
end do
end do
A(1:nnz,:) = Ax(2:nnz+1,:)
call Arno(A)
call cpu_time(t2)
print '("Time elapsed = ",f10.8," seconds")', (t2 - t1)
contains
subroutine Arno(a)
real(kind=dp), intent(in)::a(:,:)
real(kind=dp),dimension(k,k)::H
real(kind=dp),dimension(k,k+1)::u,q,qconj
real(kind=dp),dimension(k,1)::x0
integer::j,f
call random_number(x0)
q(:,1) = x0(:,1)/norm2(x0(:,1))
do f=1,k
call spmat(a,q(:,f),u(:,f))
do j=1,f
qconj(j,:) = (q(:,j))
H(j,f) = dot_product(qconj(j,:),u(:,f))
u(:,f) = u(:,f) - H(j,f)*q(:,j)
end do
if (f.lt.k) then
H(f+1,f) = norm2(u(:,f))
if (H(f+1,f)==0) then
print *, "Matrix is reducible"
stop
end if
q(:,f+1) = u(:,f)/H(f+1,f)
end if
if (f==k) then
call qrit(H)
end if
end do
end subroutine
! QR Iteration with Shifts Subroutine
subroutine qrit(a)
real(kind=dp), intent(in)::a(:,:)
real(kind=dp)::sigmak
real(kind=dp),dimension(kk,k)::dia
real(kind=dp),dimension(k,k)::Qfinal,Rfinal,HH
real(kind=dp),dimension(k,k,kk)::H0,needQR
integer::v,z
HH = a
H0(:,:,1) = HH
do v=1,kk
sigmak = H0(k,k,v)
if (v-1==0) then
needQR(:,:,v) = HH - sigmak*Iglobal
else
needQR(:,:,v) = H0(:,:,v-1) - sigmak*Iglobal
end if
call givens2(needQR(:,:,v),Rfinal,Qfinal)
H0(:,:,v) = matmul(Rfinal,Qfinal) + sigmak*Iglobal
do z = 1,k
dia(v,z) = H0(z,z,v)
write(53,*) v," ", dia(v,z) ! Write values to .DAT file
end do
end do
end subroutine
! Sparse Matrix Vector Multiplication Subroutine
subroutine spmat(a,b,c)
real(kind=dp), intent(in)::a(:,:)
real(kind=dp), intent(in), dimension(k,1)::b
real(kind=dp), intent(out), dimension(k,1)::c
integer::m,rowi,columni
real(kind=dp), dimension(k,1)::x,y
x = b
y(:,1) = 0
do m = 1,nnz
rowi = a(m,1)
columni = a(m,2)
y(rowi,1) = y(rowi,1) + a(m,3)*x(columni,1)
end do
c(:,1) = y(:,1)
end subroutine
! QR Factorization Givens Rotations Subroutine
subroutine givens2(a,Rfinal,Qfinal)
real(kind=dp), intent(in)::a(:,:)
real(kind=dp), dimension(k,k,(k*k))::G,QQ
real(kind=dp), dimension(k,k), intent(out)::Rfinal,Qfinal
real(kind=dp), dimension(k,k)::I2,y,aa
real(kind=dp), dimension(1,k)::ek1,ek2
real(kind=dp)::c,s
integer::kt,m,nn,j,i,l,p
m = size(a,1)
nn = size(a,2)
aa = a
i = 1
do kt=1,nn-1
do j=m,kt+1,-1
if (aa(j,kt).eq.0) then
continue
else
ek1(1,:) = 0
ek2(1,:) = 0
do p=1,m
do l=1,m
I2(l,p) = (l/p)*(p/l)
end do
end do
c = aa(kt,kt)/sqrt(aa(kt,kt)**2 + aa(j,kt)**2)
s = aa(j,kt)/sqrt(aa(kt,kt)**2 + aa(j,kt)**2)
ek1(1,kt) = c
ek1(1,j) = s
ek2(1,kt) = -s
ek2(1,j) = c
I2(kt,:) = ek1(1,:)
I2(j,:) = ek2(1,:)
G(:,:,i) = I2
if (i.eq.1) then
QQ(:,:,i) = G(:,:,i)
else
QQ(:,:,i) = matmul(G(:,:,i),QQ(:,:,i-1))
end if
y = matmul(G(:,:,i),aa)
aa = y
if (kt.eq.nn-1) then
if (j.eq.kt+1) then
Qfinal = transpose(QQ(:,:,i))
Rfinal = aa
end if
end if
i = i + 1
end if
end do
end do
end subroutine
end program proj2matrixC40
A couple notes. The line which I put asterisks around (for this question) call mat_print('H',H) can't be deleted otherwise I get the wrong answers (this is strange...thoughts?). Also so your computer won't freeze opening the big files, their names are 'e40r5000.mtx' and 's3dkt3m2.mtx' (these are the two I have issues with). I am using gfortran version 8.1.0
Here is the link to the files
https://1drv.ms/f/s!AjG0dE43DVddaJfY62ABE8Yq3CI
When you need to add a call to a subroutine that shouldn't actually change anything in order to get things working, you probably have a memory corruption. This happens most often when you access arrays outside of their boundaries.
I have compiled it with some run time checks:
gfortran -o p2m -g -O0 -fbacktrace -fcheck=all -Wall proj2mat.f90
And it's already giving me some issues:
It's warning me about implicit type conversions. That shouldn't be too much of an issue if you trust your data.
In line 46 you have an array length mismatch (x0(:, 1) has length 40, q(:,1) is 41)
Similarly on line 108 (x=b) x is really large, but b is only 41 long.
I have stopped now, but I implore you to go through your code and clean it up. Use the compiler options above which will let you know when and where there is an array bound violation.
I want to read data from some .dat files to a Fortran code for postprocessing. As a test case, I am just using one processor for MPI and trying to read single data file to my code. The content of the data file is as follows:
qout0050.dat : 1 1 1
However, the matrix (Vn in this case) which is supposed to store the content of this data file shows all 0 values. The relevant part of the code which reads from data file and store to the matrix is as follows:
subroutine postproc()
use precision_mod
use mpicomms_mod
implicit none
integer(kind=MPI_Offset_kind) :: i, j , igrid, k, l, disp, iproc, info, lwork
integer :: rst, numvar, ifile, number, num, step, ntot
integer :: Nx_max, Ny_max, Nz_max
integer :: Nxp, Nzp, Nyp, Ngrid
integer :: Ifirst, Ilast, Jfirst, Jlast, Kfirst, Klast
character*(64) :: fname, buffer, ffname
integer :: tmp, N1, N2, N3, tmpp
real(WP), allocatable :: qout(:,:,:,:), phi_xyz(:,:,:,:,:)
real(WP), allocatable :: Vn(:,:), Vntmp(:,:), TAU(:,:)
real(WP), allocatable :: Rprime(:,:), Rtmp(:,:), Rend(:,:), Q(:,:), tmpL(:), tmpG(:)
real(WP), allocatable :: s(:), vt(:,:), u(:,:), utmp(:,:)
real(WP), allocatable :: tmp1(:,:), tmp2(:,:), tmp3(:,:), phi(:,:)
integer, dimension(2) :: view
integer :: view1
integer, dimension(3) :: lsizes, gsizes, start
real(WP) :: tmpr
real(WP), allocatable :: work(:), mu(:), eigY(:,:), wr(:), wi(:), beta(:)
integer(kind=MPI_Offset_kind) :: SP_MOK, Nx_MOK, Ny_MOK, Nz_MOK, WP_MOK
open(unit=110,file='postparameters.dat',form="formatted")
read (110,*) Nx_max
read (110,*) Ny_max
read (110,*) Nz_max
read (110,*) numvar
read (110,*) step
close(110)
! Define the size of grid on each processor
if (mod(Nx_max,px).ne.0) then
write(*,*) 'Error in preproc: Nx_max is not devisable by px'
call MPI_ABORT(MPI_COMM_WORLD,0,ierr)
end if
Nxp = Nx_max/px
if (mod(Ny_max,py).ne.0) then
write(*,*) 'Error in preproc: Nx_max is not devisable by px'
call MPI_ABORT(MPI_COMM_WORLD,0,ierr)
end if
Nyp = Ny_max/py
if (mod(Nz_max,pz).ne.0) then
write(*,*) 'Error in preproc: Nx_max is not devisable by px'
call MPI_ABORT(MPI_COMM_WORLD,0,ierr)
end if
Nzp = Nz_max/pz
Ifirst = irank*Nxp + 1
Ilast = Ifirst + Nxp - 1
Jfirst = jrank*Nyp + 1
Jlast = Jfirst + Nyp - 1
Kfirst = krank*Nzp + 1
Klast = Kfirst + Nzp - 1
! Setting the view for phi
gsizes(1) = Nx_max
gsizes(2) = Ny_max
gsizes(3) = Nz_max
lsizes(1) = Nxp
lsizes(2) = Nyp
lsizes(3) = Nzp
start(1) = Ifirst - 1
start(2) = Jfirst - 1
start(3) = Kfirst - 1
call MPI_TYPE_CREATE_SUBARRAY(3,gsizes,lsizes,start,&
MPI_ORDER_FORTRAN,MPI_REAL_SP,view,ierr)
call MPI_TYPE_COMMIT(view,ierr)
call MPI_TYPE_CREATE_SUBARRAY(3,gsizes,lsizes,start,&
MPI_ORDER_FORTRAN,MPI_REAL_WP,view1,ierr)
call MPI_TYPE_COMMIT(view1,ierr)
WP_MOK = int(8, MPI_Offset_kind)
Nx_MOK = int(Nx_max, MPI_Offset_kind)
Ny_MOK = int(Ny_max, MPI_Offset_kind)
Nz_MOK = int(Nz_max, MPI_Offset_kind)
! Reading the qout file
ffname = 'qout'
allocate(qout(Nxp,Nyp,Nzp,numvar))
allocate(Vn(Nxp*Nyp*Nzp*numvar,step))
do rst = 1,step
if (myrank == 0) print*, 'Step = ', 50 + rst -1
write(buffer,"(i4.4)") 50 + rst -1
fname = trim(ffname)//trim(buffer)
fname = trim('ufs')//":"// trim(fname)
fname = trim(adjustl(fname))//'.dat'
call MPI_FILE_OPEN(MPI_COMM_WORLD,fname,MPI_MODE_RDONLY,MPI_INFO_NULL,ifile,ierr)
call MPI_FILE_READ(ifile,Ngrid,1,MPI_INTEGER,status,ierr)
if (1 /= Ngrid) then
if (myrank == 0 ) write(*,*) Ngrid
endif
call MPI_FILE_READ(ifile,tmp,1,MPI_INTEGER,status,ierr)
if (tmp /= Nx_max) write(*,*) tmp
call MPI_FILE_READ(ifile,tmp,1,MPI_INTEGER,status,ierr)
if (tmp /= Ny_max) write(*,*) tmp
call MPI_FILE_READ(ifile,tmp,1,MPI_INTEGER,status,ierr)
if (tmp /= Nz_max) write(*,*) tmp
call MPI_FILE_READ(ifile,tmpr,1,MPI_REAL_WP,status,ierr)
call MPI_FILE_READ(ifile,tmpr,1,MPI_REAL_WP,status,ierr)
call MPI_FILE_READ(ifile,tmpr,1,MPI_REAL_WP,status,ierr)
call MPI_FILE_READ(ifile,tmpr,1,MPI_REAL_WP,status,ierr)
do l=1,numvar
disp = 4*4 + 4*WP_MOK + Nx_MOK*Ny_MOK*Nz_MOK*WP_MOK*(l-1)
call MPI_FILE_SET_VIEW(ifile,disp,MPI_REAL_WP,view1,"native",MPI_INFO_NULL,ierr)
call MPI_FILE_READ_ALL(ifile,qout(1:Nxp,1:Nyp,1:Nzp,l),Nxp*Nzp*Nyp, MPI_REAL_WP,status,ierr)
end do
call MPI_FILE_CLOSE(ifile,ierr)
!-----------------------------------------------
! Bluiding the snapshot matrix Vn --------------
!-----------------------------------------------
do i=1,numvar
do k=1,Nzp
do j=1,Nyp
Vn((1 + Nxp*(j-1) + Nxp*Nyp*(k-1) + Nxp*Nyp*Nzp*(i-1)):(Nxp*j + Nxp*Nyp*(k-1) + Nxp*Nyp*Nzp*(i-1)),rst) = qout(1:Nxp,j,k,i)
end do
end do
end do
end do
call MPI_BARRIER(MPI_COMM_WORLD,ierr)
deallocate(qout)