I am trying to parallelize my code using openmp, but am unsure what is private and what is shared. I've used MPI a lot, but not openmpi.
For now, I'm using gnu compiler. I include at the bottom the copy of my program which I am compiling with gfortran -fopenmp. The code runs a serial version of the calculation, and then an attempt at a parallelized version.
In this code I have the following nested set of do loops I want to parallelize. It is a loop over my calculation domain and for each (ix,iz) I am performing a vectorized integration over the whole domain. You may recognize this as a numerical integration of the Biot-Savart law, which uses a Green's function in the integrand.
Here is the loop I want to parallelize
DO ix=1,nx
DO iz=1,nz
rpx = x(ix)-x2D
rpz = z(iz)-z2D
magrp = sqrt(rpx**2.0_num+rpz**2.0_num)
integrand1 = Jy*rpz/(magrp**2.0_num)
integrand2 =-Jy*rpx/(magrp**2.0_num)
integrand1(ix,iz) = 0.0_num
integrand2(ix,iz) = 0.0_num
bx_BS(ix,iz) = dS*0.25_num*sum(integrand1*weight)/(2.0_num*pi)
bz_BS(ix,iz) = dS*0.25_num*sum(integrand2*weight)/(2.0_num*pi)
END DO
END DO
print*, 'bx test', bx_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
print*, 'bz test', bz_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
where x2D,y2D,magrp,integrand1,integrand2, Jy are the 2D arrays that go into the sum for each Bx_BS(ix,iz) and Bz_BS(ix,iz).
My first attempt at parallelizing looks like
!$omp parallel default(shared) private(ix,iz,tid)
tid = OMP_GET_THREAD_NUM()
nthreads = OMP_GET_NUM_THREADS()
seconds = omp_get_wtime()
!$omp do
DO ix=1,nx
DO iz=1,nz
rpx = x(ix)-x2D
rpz = z(iz)-z2D
magrp = sqrt(rpx**2.0_num+rpz**2.0_num)
integrand1 = Jy*rpz/(magrp**2.0_num)
integrand2 =-Jy*rpx/(magrp**2.0_num)
integrand1(ix,iz) = 0.0_num
integrand2(ix,iz) = 0.0_num
bx_BS(ix,iz) = dS*0.25_num*sum(integrand1*weight)/(2.0_num*pi)
bz_BS(ix,iz) = dS*0.25_num*sum(integrand2*weight)/(2.0_num*pi)
END DO
END DO
!$omp end do
!$omp end parallel
print*, 'bx test', bx_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
print*, 'bz test', bz_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
However, the printed test results from the serial and parallel integration are not the same, and moreover, the value of Bx_BS varies for different executions, so something is wrong.
I assumed that any array that has to be read from by a thread should be shared, and also the target array (Bx_BS, Bz_BS) should be shared too, and so I set the only private variables as the indices of the nested do loops which I assume openmp is threading over.
Can anyone spot where I'm going wrong?
Thanks!
PROGRAM internal_2D
USE omp_lib
IMPLICIT NONE
INTEGER, PARAMETER :: num=kind(1.0d0)
INTEGER, PARAMETER :: nx=32, nz=32
REAL(num), PARAMETER :: pi = 3.14159265358979323_num
REAL(num) :: L, dx, dz, dS, wtube, seconds
INTEGER :: ix, iz, m, out_unit, tid, nthreads
REAL(num), DIMENSION(:), ALLOCATABLE :: x, z
REAL(num), DIMENSION(:,:), ALLOCATABLE :: x2D, z2D, r
REAL(num), DIMENSION(:,:), ALLOCATABLE :: bx, by, bz,jx, jy, jz, Ax, Ay, Az
REAL(num), DIMENSION(:,:), ALLOCATABLE :: rpx, rpy, rpz, magrp, integrand1, integrand2, integrand3
REAL(num), DIMENSION(:,:), ALLOCATABLE :: bx_BS, by_BS, bz_BS, Ax_BS, Ay_BS, Az_BS
REAL(num), DIMENSION(:,:), ALLOCATABLE :: curlBx_BS, curlBy_BS, curlBz_BS, curlAx_BS, curlAy_BS, curlAz_BS
REAL(num), DIMENSION(:,:), ALLOCATABLE :: temp1, temp2, divB, curlBx, curlBy, curlBz, curlAx, curlAy, curlAz
REAL(num), DIMENSION(:,:), ALLOCATABLE :: del2Ay, weight, del2Ay_BS, curl_diff_B_BS
ALLOCATE(x(1:nx), z(1:nz))
ALLOCATE(x2D(1:nx,1:nz), z2D(1:nx,1:nz), r(1:nx,1:nz))
ALLOCATE(jx(1:nx,1:nz), jy(1:nx,1:nz), jz(1:nx,1:nz))
ALLOCATE(bx(1:nx,1:nz), by(1:nx,1:nz), bz(1:nx,1:nz))
ALLOCATE(Ax(1:nx,1:nz), Ay(1:nx,1:nz), Az(1:nx,1:nz))
ALLOCATE(temp1(1:nx,1:nz),temp2(1:nx,1:nz),divB(1:nx,1:nz))
ALLOCATE(curlBx(1:nx,1:nz),curlBy(1:nx,1:nz),curlBz(1:nx,1:nz))
ALLOCATE(curlAx(1:nx,1:nz),curlAy(1:nx,1:nz),curlAz(1:nx,1:nz))
ALLOCATE(del2Ay(1:nx,1:nz),weight(1:nx,1:nz))
ALLOCATE(magrp(1:nx,1:nz), integrand1(1:nx,1:nz), integrand2(1:nx,1:nz))
ALLOCATE(bx_BS(1:nx,1:nz),by_BS(1:nx,1:nz),bz_BS(1:nx,1:nz))
ALLOCATE(Ax_BS(1:nx,1:nz),Ay_BS(1:nx,1:nz),Az_BS(1:nx,1:nz))
ALLOCATE(curlAx_BS(1:nx,1:nz),curlAz_BS(1:nx,1:nz),del2Ay_BS(1:nx,1:nz))
ALLOCATE(curl_diff_B_BS(1:nx,1:nz))
L = 1.0_num
print*, '----------------------------------------------------------'
PRINT*, 'Setting up arrays'
DO ix=1,nx
x(ix) = (-L/2.0_num) + L*DBLE(ix-1)/DBLE(nx-1)
END DO
dx = x(2)-x(1)
DO iz=1,nz
z(iz) = (-L/2.0_num) + L*DBLE(iz-1)/DBLE(nz-1)
END DO
dz = z(2)-z(1)
dS = dx * dz
DO ix=1,nx
DO iz=1,nz
x2D(ix,iz) = (-L/2.0_num) + L*DBLE(ix-1)/DBLE(nx-1)
z2D(ix,iz) = (-L/2.0_num) + L*DBLE(iz-1)/DBLE(nz-1)
END DO
END DO
print*, '----------------------------------------------------------'
print*, 'defining J and B and A for test'
wtube = 0.5d0
r = sqrt(x2d**2.0_num+z2d**2.0_num)
bx = 0.0_num
bz = 0.0_num
jy = 0.0_num
where (r .le. wtube)
bx = z2d * ( (0.25_num*r**2.0_num/(1.0_num*wtube**2.0_num)) - &
(2.00_num*r**3.0_num/(5.0_num*wtube**3.0_num)) + &
(1.00_num*r**4.0_num/(6.0_num*wtube**4.0_num)) )
bz =-x2d * ( (0.25_num*r**2.0_num/(1.0_num*wtube**2.0_num)) - &
(2.00_num*r**3.0_num/(5.0_num*wtube**3.0_num)) + &
(1.00_num*r**4.0_num/(6.0_num*wtube**4.0_num)) )
jy = (1.0_num-(r/wtube))**2.0_num * (r/wtube)**2.0_num
elsewhere
bx = z2d*wtube**2.0_num / (60.0_num*r**2.0_num)
bz =-x2d*wtube**2.0_num / (60.0_num*r**2.0_num)
end where
print*, '----------------------------------------------------------'
print*, 'checking divB=0'
temp1 = 0.0_num
temp2 = 0.0_num
call pdiv_2D(bx,temp1, nx, 1, dx)
call pdiv_2D(bz,temp2, nz, 2, dz)
divB = temp1+temp2
print*, 'maxval(abs(div.B)) ', maxval(abs(divB))
print*, 'total(abs(div.B)) ', sum(abs(divB))
print*, '----------------------------------------------------------'
print*, 'checking curlB=J'
temp1 = 0.0_num
temp2 = 0.0_num
call pdiv_2D(bx,temp1, nx, 2, dz)
call pdiv_2D(bz,temp2, nz, 1, dx)
curlBy = temp1-temp2
print*, 'maxval(abs(curlBy-Jy)) ', maxval(abs(curlBy-Jy))
print*, 'total(abs(curlBy-Jy)) ', sum(abs(curlBy-jy))
print*, '----------------------------------------------------------'
print*, '2D vectorized B-S integration'
weight = 4.0_num
weight(1 ,1 ) = 1.0_num
weight(1 ,nz) = 1.0_num
weight(nx,1 ) = 1.0_num
weight(nx,nz) = 1.0_num
weight(2:nx-1,1 ) = 2.0_num
weight(2:nx-1,nz) = 2.0_num
weight(1 ,2:nz-1) = 2.0_num
weight(nx,2:nz-1) = 2.0_num
seconds = omp_get_wtime()
DO ix=1,nx
DO iz=1,nz
rpx = x(ix)-x2D
rpz = z(iz)-z2D
magrp = sqrt(rpx**2.0_num+rpz**2.0_num)
integrand1 = Jy*rpz/(magrp**2.0_num)
integrand2 =-Jy*rpx/(magrp**2.0_num)
integrand1(ix,iz) = 0.0_num
integrand2(ix,iz) = 0.0_num
bx_BS(ix,iz) = dS*0.25_num*sum(integrand1*weight)/(2.0_num*pi)
bz_BS(ix,iz) = dS*0.25_num*sum(integrand2*weight)/(2.0_num*pi)
integrand2 =-Jy*log(magrp)
integrand2(ix,iz) = 0.0_num
Ay_BS(ix,iz) = dS*0.25*sum(weight*integrand2)/(2.0_num*pi)
END DO
END DO
seconds = omp_get_wtime()-seconds
write(*,*) ' Time for serial calc = ', seconds
print*, 'bx test', bx_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
print*, 'bz test', bz_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
bx_BS = 0.0_num
bz_BS = 0.0_num
Ay_BS = 0.0_num
!$omp parallel default(shared) private(ix,iz,tid)
tid = OMP_GET_THREAD_NUM()
nthreads = OMP_GET_NUM_THREADS()
seconds = omp_get_wtime()
!$omp do
DO ix=1,nx
DO iz=1,nz
rpx = x(ix)-x2D
rpz = z(iz)-z2D
magrp = sqrt(rpx**2.0_num+rpz**2.0_num)
integrand1 = Jy*rpz/(magrp**2.0_num)
integrand2 =-Jy*rpx/(magrp**2.0_num)
integrand1(ix,iz) = 0.0_num
integrand2(ix,iz) = 0.0_num
bx_BS(ix,iz) = dS*0.25_num*sum(integrand1*weight)/(2.0_num*pi)
bz_BS(ix,iz) = dS*0.25_num*sum(integrand2*weight)/(2.0_num*pi)
integrand2 =-Jy*log(magrp)
integrand2(ix,iz) = 0.0_num
Ay_BS(ix,iz) = dS*0.25*sum(weight*integrand2)/(2.0_num*pi)
END DO
END DO
!$omp end do
!$omp end parallel
seconds = omp_get_wtime()-seconds
write(*,*) ' Time for parallel calc = ', seconds
print*, 'bx test', bx_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
print*, 'bz test', bz_BS(floor(nx/2.0_num+1.0_num),floor(nz/2.0_num+1.0_num))
!!$
!!$ print*, '----------------------------------------------------------'
!!$ print*, 'checking curlA_BS vs B_BS'
!!$
!!$ print*, '----------------------------------------------------------'
!!$ print*, 'checking del2Ay vs curlB_BS'
!!$
!!$ print*, '----------------------------------------------------------'
!!$ print*, 'checking A_BS vs A'
!!$
!!$ print*, '----------------------------------------------------------'
!!$ print*, 'checking curlA_BS vs curlA'
!!$
!!$ print*, '----------------------------------------------------------'
!!$ print*, 'checking curlcurlA_BS vs curlcurlA'
!!$
print*, '----------------------------------------------------------'
print*, 'checking B_BS vs B'
print*, 'maxval(abs(Bx_BS-Bx)) ', maxval(abs(Bx_BS-Bx))
print*, 'total(abs(Bx_BS-Bx)) ', sum(abs(Bx_BS-Bx))
print*, 'sqrt(total(|Bx_BS-Bx|^2)/total(|Bx|^2) ', &
sqrt(sum((Bx_BS-Bx)**2)/sum(Bx**2))
print*, 'maxval(abs(Bz_BS-Bz)) ', maxval(abs(Bz_BS-Bz))
print*, 'total(abs(Bz_BS-Bz)) ', sum(abs(Bz_BS-Bz))
print*, 'sqrt(total(|Bz_BS-Bz|^2)/total(|Bz|^2) ', &
sqrt(sum((Bz_BS-Bz)**2)/sum(Bz**2))
print*, '----------------------------------------------------------'
print*, 'checking curl(B-BS)'
temp1 = 0.0_num
temp2 = 0.0_num
call pdiv_2D(bx_BS-bx,temp1, nx, 2, dz)
call pdiv_2D(bz_BS-bz,temp2, nz, 1, dx)
curl_diff_B_BS = temp1-temp2
print*, 'maxval(abs(curl_diff_By_BS))', maxval(abs(curl_diff_B_BS))
print*, 'total(abs(curl_diff_By_BS))', sum(abs(curl_diff_B_BS))
print*, '----------------------------------------------------------'
print*, 'checking curlB_BS vs curlB'
temp1 = 0.0_num
temp2 = 0.0_num
call pdiv_2D(bx_BS,temp1, nx, 2, dz)
call pdiv_2D(bz_BS,temp2, nz, 1, dx)
curlBy_BS = temp1-temp2
print*, 'maxval(abs(curlBy_BS-curlBy)) ', maxval(abs(curlBy_BS-curlBy))
print*, 'total(abs(curlBy_BS-curlBy)) ', sum(abs(curlBy_BS-curlBy))
print*, 'sqrt(total(|curlBy_BS-curlBy|^2)/total(|curlBy|^2) ', &
sqrt(sum((curlBy_BS-curlBy)**2)/sum(curlBy**2))
out_unit = 3
open(out_unit, file="output_internal_2D.dat", status="replace", form="unformatted")
write(out_unit) x
write(out_unit) z
write(out_unit) jy
write(out_unit) bx
write(out_unit) bz
write(out_unit) curlBy
!!$ write(out_unit) Ay
!!$ write(out_unit) curlAx
!!$ write(out_unit) curlAz
!!$ write(out_unit) del2Ay
!!$ write(out_unit) Ay_BS
!!$ write(out_unit) curlAx_BS
!!$ write(out_unit) curlAz_BS
write(out_unit) Bx_BS
write(out_unit) Bz_BS
write(out_unit) curlBy_BS
write(out_unit) curl_diff_B_BS
close(3)
print*, '----------------------------------------------------------'
CONTAINS
SUBROUTINE pdiv_2D(array_in, array_out, n, dir, res)
INTEGER, INTENT(IN) :: dir, n
REAL(num), INTENT(IN) :: array_in(n,n)
REAL(num), INTENT(OUT) :: array_out(n,n)
REAL(num), INTENT(IN) :: res
INTEGER :: ix, iz
REAL(num), DIMENSION(n) :: grad
IF (dir .eq. 1) THEN
DO iz=1,n
CALL deriv(array_in(:,iz), array_out(:,iz), n, res)
END DO
ELSE IF (dir .eq. 2) THEN
DO ix=1,n
CALL deriv(array_in(ix,:), array_out(ix,:), n, res)
END DO
ENDIF
END SUBROUTINE pdiv_2D
SUBROUTINE deriv(a_in, a_out, n, res) ! one dimensional derivative - start with three point stencil - carfeul at ends!
INTEGER, INTENT(IN) :: n
REAL(num), INTENT(IN) :: a_in(n)
REAL(num), INTENT(OUT) :: a_out(n)
REAL(num), INTENT(IN) :: res
INTEGER :: ix
!3 point stencil - fix end points to be one directional
!a_out = (cshift(a_in,1)-cshift(a_in,-1))/(2.0_num*res)
!a_out(1) = (1.0_num/res) * (-3.0_num*a_in(1) + 4.0_num*a_in(2 ) - a_in(3 )) / 2.0_num
!a_out(n) = (1.0_num/res) * ( 3.0_num*a_in(n) - 4.0_num*a_in(n-1) + a_in(n-2)) / 2.0_num
!d = (shift(x,-1) - shift(x,1))/2.D0
!d[0] = (-3.D0*x[0] + 4.D0*x[1] - x[2])/2.D0
!d[n-1] = (3.D0*x[n-1] - 4.D0*x[n-2] + x[n-3])/2.D0
!7 point stencil - fix end points to be one directional
a_out = -(-cshift(a_in,3)+9.0_num*cshift(a_in,2)-45.0_num*cshift(a_in,1)+45.0_num*&
cshift(a_in,-1)-9.0_num*cshift(a_in,-2)+cshift(a_in,-3))/(60.0_num*res)
ix=1
a_out(ix) = (1.0_num/res) * (-147.0_num*a_in(ix+0)+360.0_num*a_in(ix+1)-450.0_num*&
a_in(ix+2)+400.0_num*a_in(ix+3)-225.0_num*a_in(ix+4)+72.0_num*a_in(ix+5)-10.0_num*a_in(ix+6))/(60.0_num)
ix = 2
a_out(ix) = (1.0_num/res) * (-10.0_num*a_in(ix-1)-77.0_num*a_in(ix+0)+150.0_num*&
a_in(ix+1)-100.0_num*a_in(ix+2)+50.0_num*a_in(ix+3)-15.0_num*a_in(ix+4)+2.0_num*a_in(ix+5))/(60.0_num)
ix = 3
a_out(ix) = (1.0_num/res) * (2.0_num*a_in(ix-2)-24.0_num*a_in(ix-1)-35.0_num*&
a_in(ix+0)+80.0_num*a_in(ix+1)-30.0_num*a_in(ix+2)+8.0_num*a_in(ix+3)-1.0_num*a_in(ix+4))/(60.0_num)
ix = n-2
a_out(ix) = (1.0_num/res) * (1.0_num*a_in(ix-4)-8.0_num*a_in(ix-3)+30.0_num*&
a_in(ix-2)-80.0_num*a_in(ix-1)+35.0_num*a_in(ix+0)+24.0_num*a_in(ix+1)-2.0_num*a_in(ix+2))/(60.0_num)
ix=n-1
a_out(ix) = (1.0_num/res) * (-2.0_num*a_in(ix-5)+15.0_num*a_in(ix-4)-50.0_num*&
a_in(ix-3)+100.0_num*a_in(ix-2)-150.0_num*a_in(ix-1)+77.0_num*a_in(ix+0)+10.0_num*a_in(ix+1))/(60.0_num)
ix=n
a_out(ix)= (1.0_num/res) * (10.0_num*a_in(ix-6)-72.0_num*a_in(ix-5)+225.0_num*&
a_in(ix-4)-400.0_num*a_in(ix-3)+450.0_num*a_in(ix-2)-360.0_num*a_in(ix-1)+147.0_num*a_in(ix+0))/(60.0_num)
!d = (-shift(x,3)+9.d0*shift(x,2)-45.d0*shift(x,1)+45.d0*shift(x,-1)-9.d0*shift(x,-2)+shift(x,-3))/60.D0
! I=0L
! d[I] =(-147.d0*x[i+0]+360.d0*x[i+1]-450.d0*x[i+2]+400.d0*x[i+3]-225.d0*x[i+4]+72.d0*x[i+5]-10.d0*x[i+6])/(60.d0)
! I=1L
! d[I] =(-10.d0*x[i-1]-77.d0*x[i+0]+150.d0*x[i+1]-100.d0*x[i+2]+50.d0*x[i+3]-15.d0*x[i+4]+2.d0*x[i+5])/(60.d0)
! I=2L
! d[I] =(2.d0*x[i-2]-24.d0*x[i-1]-35.d0*x[i+0]+80.d0*x[i+1]-30.d0*x[i+2]+8.d0*x[i+3]-1.d0*x[i+4])/(60.d0)
! I=N-3
! d[I]= (1.d0*x[i-4]-8.d0*x[i-3]+30.d0*x[i-2]-80.d0*x[i-1]+35.d0*x[i+0]+24.d0*x[i+1]-2.d0*x[i+2])/(60.d0)
! I=N-2
! d[I]=(-2.d0*x[i-5]+15.d0*x[i-4]-50.d0*x[i-3]+100.d0*x[i-2]-150.d0*x[i-1]+77.d0*x[i+0]+10.d0*x[i+1])/(60.d0)
! I=N-1
! d[I]= (10.d0*x[i-6]-72.d0*x[i-5]+225.d0*x[i-4]-400.d0*x[i-3]+450.d0*x[i-2]-360.d0*x[i-1]+147.d0*x[i+0])/(60.d0)
END SUBROUTINE deriv
END PROGRAM internal_2D
Related
A Fortran code has two definitions of a subroutine within an if defined block, as shown below. If I manually remove of the definitions, the code can be compiled, but that's not what the author intended. Compiling with gfortran -c -cpp does not work. What is the right way to compile it?
#:if defined('SLICOT')
subroutine dlyap(TT, RQR, P0, ns, info)
! Computes the solution to the discrete Lyapunov equation,
! P0 = TT*P0*TT' + RQR
! where (inputs) TT, RQR and (output) P0 are ns x ns (real) matrices.
!--------------------------------------------------------------------------------
integer, intent(in) :: ns
real(wp), intent(in) :: TT(ns,ns), RQR(ns,ns)
integer, intent(out) :: info
real(wp), intent(out) :: P0(ns,ns)
! for slicot
real(wp) :: scale, U(ns,ns), UH(ns, ns), rcond, ferr, wr(ns), wi(ns), dwork(14*ns*ns*ns), sepd
integer :: iwork(ns*ns), ldwork
integer :: t
UH = TT
P0 = -1.0_wp*RQR
!call sb03md('D','X', 'N', 'T', ns, UH, ns, U, ns, P0, ns, &
! scale, sepd, ferr, wr, wi, iwork, dwork, 14*ns*ns*ns, info)
!if (ferr > 0.000001_wp) call dlyap_symm(TT, RQR, P0, ns, info)
if (info .ne. 0) then
print*,'SB03MD failed. (info = ', info, ')'
P0 = 0.0_wp
info = 1
do t = 1,ns
P0(t,t)=1.0_wp
end do
return
else
! P0 = 0.5_wp*P0 + 0.5_wp*transpose(P0)
info = 0
end if
end subroutine dlyap
#:else
! from elmar
SUBROUTINE DLYAP(A, QQ, Sigma, nx, status)
! doubling, calling DSYMM and DGEMM
! Sigma = A * Sigma * A' + B * B'
! output Sigma is symmetric
IMPLICIT NONE
integer, intent(in) :: nx
integer, intent(out) :: status
real(wp), intent(in) :: QQ(nx,nx), A(nx,nx)
real(wp), intent(out) :: Sigma(nx,nx)
INTEGER, PARAMETER :: maxiter = 100
DOUBLE PRECISION, PARAMETER :: tol = 1.0d-8
INTEGER :: iter, i
LOGICAL :: converged
DOUBLE PRECISION, DIMENSION(Nx,Nx) :: AA, AAA, AASigma, Sigma0
Sigma0 = QQ
! Sigma0 = B B'
! Sigma0 = 0.0d0
! call DSYRK('U','N',Nx,Nw,1.0d0,B,Nx,0.0d0,Sigma0,Nx)
! ! fill up lower triangular -- necessary for DGEMM below
! FORALL (i=2:Nx) Sigma0(i,1:i-1) = Sigma0(1:i-1,i)
converged = .false.
iter = 0
AA = A
DO
iter = iter + 1
! call sandwichplus(Sigma, AA, Nx, Sigma0, Nx)
! MANUAL SANDWICHPLUS: Sigma = AA * Sigma0 * AA' + Sigma
call DSYMM('R','U',Nx,Nx,1.0d0,Sigma0,Nx,AA,Nx,0.0d0,AASigma,Nx)
Sigma = Sigma0 ! this line requires Sigma0 to
call DGEMM('N','T',Nx,Nx,Nx,1.0d0,AASigma,Nx,AA,Nx,1.0d0,Sigma,Nx)
! balance for symmetry
Sigma = 0.5d0 * (Sigma + transpose(Sigma))
IF (abs(maxval(Sigma - Sigma0)) < tol) converged = .true.
! print *, iter, abs(maxval(Sigma - Sigma0)), tol
! Sigma = (Sigma + transpose(Sigma)) / dble(2)
IF (converged .OR. (iter > maxiter)) EXIT
! AAA = AA * AA
call DGEMM('N','N',Nx,Nx,Nx,1.0d0,AA,Nx,AA,Nx,0.0d0,AAA,Nx)
AA = AAA
Sigma0 = Sigma
END DO
IF (converged) THEN
status = 0
ELSE
status = -1
END IF
END SUBROUTINE DLYAP
#:endif
Thank you guys, all your advices are relevant, but I gave up of Fortran and translated my code to Python. There, with just 1 or 2 little bugs my code worked perfect
I wrote a program to solve a linear system using the Gauss method. I wrote all the algorithms, the forward elimination and the back substitution and I made a lot of others subroutines and I don't know anymore what's wrong, I don't if is something wrong with my code or if some problem programming in Fortran because I'm new in this language. I'll put my code below and the linear system that I should find a solution
PROGRAM metodo_Gauss
IMPLICIT NONE
REAL :: det_a_piv
INTEGER :: n, i, j
REAL, DIMENSION(:,:), ALLOCATABLE :: a, a_piv
INTEGER, DIMENSION(:), ALLOCATABLE :: p
REAL, DIMENSION(:), ALLOCATABLE :: b, x
PRINT*, "Entre com a dimensão n do sistema a ser resolvido"
READ*, n
! allocate memory
ALLOCATE(a(n, n))
ALLOCATE(a_piv(n, n))
ALLOCATE(p(n))
ALLOCATE(b(n))
ALLOCATE(x(n))
CALL matriz_a(n, a)
CALL vetor_b(n, b)
a_piv(1:n, 1:n) = a(1:n, 1:n)
DO i = 1, n
x(i) = 0
END DO
CALL eliminacao(n, a, a_piv, p)
det_a_piv = (-1) ** n
DO j = 1, n
det_a_piv = det_a_piv * a_piv(j, j)
END DO
IF (det_a_piv == 0) THEN
PRINT*, "O sistema linear é indeterminado"
ELSE IF (abs(det_a_piv) <= 1) THEN
PRINT*, "O sistema linear é mal-condicionado"
ELSE
CALL substituicao(n, a_piv, p, b, x)
PRINT*, "A solução do sistema é:"
PRINT*, x
END IF
END PROGRAM metodo_Gauss
SUBROUTINE matriz_a(n, a)
IMPLICIT NONE
INTEGER, INTENT(in) :: n
REAL, DIMENSION(n,n), INTENT(inout) :: a
INTEGER :: i, j !Indícios usados em loops para percorrer os arrays
PRINT*, "Por favor digite os valores do elementos da matriz sistema linear seguindo pela ordem das linhas até o final:"
DO i = 1, n
DO j = 1, n
READ*, a(i,j)
END DO
END DO
END SUBROUTINE matriz_a
SUBROUTINE vetor_b(n, b)
IMPLICIT NONE
INTEGER, INTENT(in) :: n
REAL, DIMENSION(n), INTENT(inout) :: b
INTEGER :: i
PRINT*, "Por favor entre com os elementos do vetor b:"
DO i = 1, n
READ*, b(i)
END DO
END SUBROUTINE vetor_b
SUBROUTINE eliminacao(n, a, a_piv, p)
IMPLICIT NONE
INTEGER, INTENT(in) :: n
REAL, DIMENSION(n, n), INTENT(in) :: a
REAL, DIMENSION(n, n), INTENT(out) :: a_piv
INTEGER, DIMENSION(n), INTENT(out) :: p
INTEGER :: i, j, local, dim
REAL :: mult
DO i = 1, (n - 1)
dim = n - 1
CALL local_pivo(dim, a(i:n, i), local)
a_piv(i, i:n) = a(local, i:n)
a_piv(local, i:n) = a(i, i:n)
p(i) = local
DO j = (i + 1), n
mult = (-1) * (a_piv(j,i) / a_piv(local,i))
a_piv(j,i) = mult
a_piv(j, j:n) = a_piv(j, j:n) + mult * a_piv(i, j:n)
END DO
END DO
END SUBROUTINE eliminacao
SUBROUTINE local_pivo(n, a, local)
IMPLICIT NONE
INTEGER, INTENT(in) :: n
REAL, DIMENSION(n), INTENT(in) :: a
INTEGER, INTENT(inout) :: local
INTEGER :: i
local = 1
DO i = 2, n
IF ((ABS(a(i))) > ABS(a(local))) THEN
local = i
END IF
END DO
END SUBROUTINE local_pivo
SUBROUTINE substituicao(n, a_piv, p, b, x)
IMPLICIT NONE
INTEGER, INTENT(in) :: n
REAL, DIMENSION(n, n), INTENT(in) :: a_piv
REAL, DIMENSION(n), INTENT(out) :: b, x
INTEGER, DIMENSION(n), INTENT(in) :: p
INTEGER :: i, j, k, l, pivo
REAL :: aux
DO i = 1, (n - 1)
pivo = p(i)
IF (pivo /= i) THEN
aux = b(i)
b(i) = b(pivo)
b(pivo) = aux
END IF
DO j = (i + 1), n
b(j) = a_piv(j, i) * b(j) + b(i)
END DO
END DO
DO k = n, 1, -1
IF (k == n) THEN
x(n) = b(n) / a_piv(n, n)
ELSE
x(k) = (b(k) + a_piv(k, n) * x(n)) / a_piv(k, k)
DO l = n, k, -1
x(l) = x(l) + (a_piv(k, l) * x(l)) / a_piv(k, k)
END DO
END IF
END DO
END SUBROUTINE substituicao
Here it is the system that I'm trying to solve
My input is:
4
4
3
2
2
2
1
1
2
2
2
2
4
6
1
1
2
5
8
3
1
My output is:
-40.5000000 -40.2500000 -3.75000000 -37.5000000
But the output should be:
6.500000
-44.000000
72.000000
-16.500000
I'm trying to figure out how to properly normalize the results of a DFT using FFTW. The FFTW tutorial states that the forward (FFTW_FORWARD) discrete Fourier transform of a 1d complex array X of size n computes an array Y, where
Y_k = \sum\limits_{j=0}^{n-1} X_j e^{-2\pi j k \sqrt{-1}/n}
The backward DFT computes:
Y_k = \sum\limits_{j=0}^{n-1} X_j e^{+2\pi j k \sqrt{-1}/n}
These definitions are the same as for real-to-complex transformations.
Furthermore, the tutorial specifies that "FFTW computes an unnormalized transform, in that there is no coefficient in front of the summation in the DFT. In other words, applying the forward and then the backward transform will multiply the input by n." However, it doesn't specify where exactly this re-scaling needs to be done. I suppose this may be application dependant, but am not sure how to use it properly. This answer states that it should be normalized in the forward direction, but I have my doubts, which I will elaborate.
My goal is to figure out how to properly normalize the FFT results in order to get what I expect. So I did a simple 1D transformation first, where I know what to expect exactly: Using the same convention as FFTW (normalisation factor=1, oscillatory factor=-2*pi for the forward fourier transform), when I transform
1/2 (δ(1 + x) - δ(1 - x))
with δ being the dirac delta function, I expect to get:
integral_(-∞)^∞ (1/2 (δ(1 + x) - δ(1 - x))) e^(-2 π i ω x) dx = i sin(2π ω)
the same holds for when I do an IFFT on i sin(2π ω), only now I need to normalize by dividing by n.
Here is the code I use to demonstrate this behaviour:
program use_fftw
use,intrinsic :: iso_c_binding
implicit none
include 'fftw3.f03'
integer, parameter :: N = 1000
integer, parameter :: dp = kind(1.d0)
real(dp), parameter :: pi = 3.1415926d0
real(dp), parameter :: physical_length = 500
real(dp), parameter :: dx = physical_length/real(N)
real(dp), parameter :: dk = 1.d0 / physical_length
integer :: i, ind1, ind2
! for double precision: use double complex & call dfftw_plan_dft_1d
complex(C_DOUBLE_COMPLEX), allocatable, dimension(:) :: arr_out
real(C_DOUBLE), allocatable, dimension(:) :: arr_in
type(C_PTR) :: plan_forward, plan_backward
allocate(arr_in(1:N))
allocate(arr_out(1:N/2+1))
plan_forward = fftw_plan_dft_r2c_1d(N, arr_in, arr_out, FFTW_ESTIMATE)
plan_backward = fftw_plan_dft_c2r_1d(N, arr_out, arr_in, FFTW_ESTIMATE)
!----------------------
! Setup
!----------------------
! add +1: index = 1 corresponds to x=0
ind1 = int(1.d0/dx)+1 ! index where x=1
ind2 = int((physical_length-1.d0)/dx)+1 ! index where x=-1
arr_in = 0
arr_in(ind1) = -0.5d0
arr_in(ind2) = 0.5d0
!----------------------
! Forward
!----------------------
call fftw_execute_dft_r2c(plan_forward, arr_in, arr_out)
write(*,*) "Verification: Max real part of arr_out:", maxval(real(arr_out))
open(unit=666,file='./fftw_output_norm1d_fft.txt', form='formatted')
do i = 1, N/2+1
write(666, '(2E14.5,x)') (i-1)*dk, aimag(arr_out(i))
enddo
close(666)
write(*,*) "Finished! Written results to fftw_output_norm1d_fft.txt"
!----------------------
! Backward
!----------------------
call fftw_execute_dft_c2r(plan_backward, arr_out, arr_in)
arr_in = arr_in/N
open(unit=666,file='./fftw_output_norm1d_real.txt', form='formatted')
do i = 1, N
write(666, '(2E14.5,x)') (i-1)*dx, arr_in(i)
enddo
close(666)
write(*,*) "Finished! Written results to fftw_output_norm1d_real.txt"
deallocate(arr_in, arr_out)
call fftw_destroy_plan(plan_forward)
call fftw_destroy_plan(plan_backward)
end program use_fftw
And the results, perfectly according to what I'd expect:
So in this case, I only normalized (division by n) when going from Fourier space back to real space and obtained what I wanted.
But I ran into problems when I tried to do the same for multiple dimensions.
This time, I'm trying to transform
sqrt(π/2) ((δ(-1 + x) - δ(1 + x)) δ(y) + δ(x) (δ(-1 + y) - δ(1 + y)))
which should give
integral_(-∞)^∞ (sqrt(π/2) ((δ(-1 + x) - δ(1 + x)) δ(y) + δ(x) (δ(-1 + y) - δ(1 + y)))) e^(-2 π i {x, y} {a, b}) d{x, y} = +i sin(a) + i sin(b)
I plot the results for x=0 (k_x = 0, respectively):
which seems completely wrong, both in frequency of the sinus wave and the amplitude.
However, transforming back and normalising by dividing by n^2 gives the expected initial conditions, in both x and y direction. Here is the plot for x=0:
I have no idea what I am doing wrong...
Here is the 2d code:
program use_fftw
use,intrinsic :: iso_c_binding
implicit none
include 'fftw3.f03'
integer, parameter :: N = 1000
integer, parameter :: dp = kind(1.d0)
real(dp), parameter :: pi = 3.1415926d0
real(dp), parameter :: physical_length = 500
real(dp), parameter :: dx = physical_length/real(N)
real(dp), parameter :: dk = 1.d0 / physical_length
integer :: i, ind1, ind2
! for double precision: use double complex & call dfftw_plan_dft_1d
complex(C_DOUBLE_COMPLEX), allocatable, dimension(:,:) :: arr_out
real(C_DOUBLE), allocatable, dimension(:,:) :: arr_in
type(C_PTR) :: plan_forward, plan_backward
allocate(arr_in(1:N, 1:N))
allocate(arr_out(1:N/2+1, 1:N))
plan_forward = fftw_plan_dft_r2c_2d(N, N, arr_in, arr_out, FFTW_ESTIMATE)
plan_backward = fftw_plan_dft_c2r_2d(N, N, arr_out, arr_in, FFTW_ESTIMATE)
!----------------------
! Setup
!----------------------
! add +1: index = 1 corresponds to x=0
ind1 = int(1.d0/dx)+1 ! get index where x = 1
ind2 = int((physical_length-1.d0)/dx)+1 ! get index where x = -1
arr_in = 0
! y=0:
arr_in(ind1, 1) = sqrt(pi/2)
arr_in(ind2, 1) = -sqrt(pi/2)
! x=0:
arr_in(1, ind1) = sqrt(pi/2)
arr_in(1, ind2) = -sqrt(pi/2)
!----------------------
! Forward
!----------------------
call fftw_execute_dft_r2c(plan_forward, arr_in, arr_out)
write(*,*) "Verification: Max real part of arr_out:", maxval(real(arr_out))
open(unit=666,file='./fftw_output_norm2d_fft_x=0.txt', form='formatted')
open(unit=667,file='./fftw_output_norm2d_fft_y=0.txt', form='formatted')
do i = 1, N
write(666, '(2E14.5,x)') (i-1)*dk, aimag(arr_out(1,i))
enddo
do i = 1, N/2+1
write(667, '(2E14.5,x)') (i-1)*dk, aimag(arr_out(i,1))
enddo
close(666)
close(667)
write(*,*) "Finished! Written results to fftw_output_normalisation_fft_x.txt and fftw_output_normalisation_fft_y.txt"
!----------------------
! Backward
!----------------------
call fftw_execute_dft_c2r(plan_backward, arr_out, arr_in)
! Normalisation happens here!
arr_in = arr_in/N**2
open(unit=666,file='./fftw_output_norm2d_real_x=0.txt', form='formatted')
open(unit=667,file='./fftw_output_norm2d_real_y=0.txt', form='formatted')
do i = 1, N
write(666, '(2E14.5,x)') (i-1)*dx, arr_in(1, i)
write(667, '(2E14.5,x)') (i-1)*dx, arr_in(i, 1)
enddo
close(666)
close(667)
write(*,*) "Finished! Written results to fftw_output_norm2d_real_x=0.txt and fftw_output_norm2d_real_y=0.txt"
deallocate(arr_in, arr_out)
call fftw_destroy_plan(plan_forward)
call fftw_destroy_plan( plan_backward)
end program use_fftw
and a python plotting tool:
#!/usr/bin/python3
#====================================
# Plots the results of the FFTW
# example programs.
#====================================
import numpy as np
import matplotlib.pyplot as plt
from sys import argv
from time import sleep
errormessage="""
I require an argument: Which output file to plot.
Usage: ./plot_fftw.py <case>
options for case:
1 fftw_output_norm1d_fft.txt
2 fftw_output_norm1d_real.txt
3 fftw_output_norm2d_fft_x=0.txt
4 fftw_output_norm2d_real_x=0.txt
5 fftw_output_norm2d_fft_y=0.txt
6 fftw_output_norm2d_real_y=0.txt
Please select a case: """
#----------------------
# Hardcoded stuff
#----------------------
file_dict={}
file_dict['1'] = ('fftw_output_norm1d_fft.txt', '1d Fourier transform')
file_dict['2'] = ('fftw_output_norm1d_real.txt', '1d Full circle')
file_dict['3'] = ('fftw_output_norm2d_fft_x=0.txt', '2d Fourier transform, x=0')
file_dict['4'] = ('fftw_output_norm2d_real_x=0.txt', '2d Full circle, x=0')
file_dict['5'] = ('fftw_output_norm2d_fft_y=0.txt', '2d Fourier transform, y=0')
file_dict['6'] = ('fftw_output_norm2d_real_y=0.txt', '2d Full circle, y=0')
#------------------------
# Get case from cmdline
#------------------------
case = ''
def enforce_integer():
global case
while True:
case = input(errormessage)
try:
int(case)
break
except ValueError:
print("\n\n!!! Error: Case must be an integer !!!\n\n")
sleep(2)
if len(argv) != 2:
enforce_integer()
else:
try:
int(argv[1])
case = argv[1]
except ValueError:
enforce_integer()
filename,title=file_dict[case]
#-------------------------------
# Read and plot data
#-------------------------------
k, Pk = np.loadtxt(filename, dtype=float, unpack=True)
fig = plt.figure()
ax = fig.add_subplot(111)
# ax.plot(k, Pk, label='power spectrum')
if case in ['1', '3', '5']:
ax.plot(k, Pk, label='recovered wave', lw=3) # ignore negative k
x = np.linspace(k.min(), k.max(), 1000)
if case=='1':
ax.plot(x, np.sin(2*np.pi*x), ':', label='expected wave', lw=3)
if case in ['3', '5']:
ax.plot(x, np.sin(x), ':', label='expected wave', lw=3)
ax.set_title(title)
ax.set_xlabel("k")
ax.set_ylabel("F(k)")
if case in ['2', '4', '6']:
# in this case: k=x, Pk=f(x)
ax.plot(k, Pk, label='recovered original', lw=3) # ignore negative k
N=1000
plen=500
dx=plen/N
x = np.linspace(k.min(), k.max(), 1000)
y = np.zeros(1000)
ind = int(1.0/dx)
if case=='2':
y[ind] = -0.5
y[-ind] = 0.5
if case in ['4', '6']:
y[ind] = np.sqrt(np.pi/2)
y[-ind] = -np.sqrt(np.pi/2)
ax.plot(x, y, ':', label='expected original', lw=3)
ax.set_title(title)
ax.set_xlabel("x")
ax.set_ylabel("f(x)")
ax.legend()
plt.show()
Blow is a main file
PROGRAM SPHEROID
USE nrtype
USE SUB_INFO
INCLUDE "/usr/local/include/fftw3.f"
INTEGER(I8B) :: plan_forward, plan_backward
INTEGER(I4B) :: i, t, int_N
REAL(DP) :: cth_i, sth_i, real_i, perturbation
REAL(DP) :: PolarEffect, dummy, x1, x2, x3
REAL(DP), DIMENSION(4096) :: dummy1, dummy2, gam, th, ph
REAL(DP), DIMENSION(4096) :: k1, k2, k3, k4, l1, l2, l3, l4, f_in
COMPLEX(DPC), DIMENSION(2049) :: output1, output2, f_out
CHARACTER(1024) :: baseOutputFilename
CHARACTER(1024) :: outputFile, format_string
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
int_N = 4096
! File Open Section
format_string = '(I5.5)'
! Write the coodinates at t = 0
do i = 1, N
real_i = real(i)
gam(i) = 2d0*pi/real_N
perturbation = 0.01d0*dsin(2d0*pi*real_i/real_N)
ph(i) = 2d0*pi*real_i/real_N + perturbation
th(i) = pi/3d0 + perturbation
end do
! Initialization Section for FFTW PLANS
call dfftw_plan_dft_r2c_1d(plan_forward, int_N, f_in, f_out, FFTW_ESTIMATE)
call dfftw_plan_dft_c2r_1d(plan_backward, int_N, f_out, f_in, FFTW_ESTIMATE)
! Runge-Kutta 4th Order Method Section
do t = 1, Iter_N
call integration(th, ph, gam, k1, l1)
do i = 1, N
dummy1(i) = th(i) + 0.5d0*dt*k1(i)
end do
do i = 1, N
dummy2(i) = ph(i) + 0.5d0*dt*l1(i)
end do
call integration(dummy1, dummy2, gam, k2, l2)
do i = 1, N
dummy1(i) = th(i) + 0.5d0*dt*k2(i)
end do
do i = 1, N
dummy2(i) = ph(i) + 0.5d0*dt*l2(i)
end do
call integration(dummy1, dummy2, gam, k3, l3)
do i = 1, N
dummy1(i) = th(i) + dt*k3(i)
end do
do i = 1, N
dummy2(i) = ph(i) + dt*l3(i)
end do
call integration(dummy1, dummy2, gam, k4, l4)
do i = 1, N
cth_i = dcos(th(i))
sth_i = dsin(th(i))
PolarEffect = (nv-sv)*dsqrt(1d0+a*sth_i**2) + (nv+sv)*cth_i
PolarEffect = PolarEffect/(sth_i**2)
th(i) = th(i) + dt*(k1(i) + 2d0*k2(i) + 2d0*k3(i) + k4(i))/6d0
ph(i) = ph(i) + dt*(l1(i) + 2d0*l2(i) + 2d0*l3(i) + l4(i))/6d0
ph(i) = ph(i) + dt*0.25d0*PolarEffect/pi
end do
!! Fourier Filtering Section
call dfftw_execute_dft_r2c(plan_forward, th, output1)
do i = 1, N/2+1
dummy = abs(output1(i))
if (dummy.lt.threshhold) then
output1(i) = dcmplx(0.0d0)
end if
end do
call dfftw_execute_dft_c2r(plan_backward, output1, th)
do i = 1, N
th(i) = th(i)/real_N
end do
call dfftw_execute_dft_r2c(plan_forward, ph, output2)
do i = 1, N/2+1
dummy = abs(output2(i))
if (dummy.lt.threshhold) then
output2(i) = dcmplx(0.0d0)
end if
end do
call dfftw_execute_dft_c2r(plan_backward, output2, ph)
do i = 1, N
ph(i) = ph(i)/real_N
end do
!! Data Writing Section
write(baseOutputFilename, format_string) t
outputFile = "xyz" // baseOutputFilename
open(unit=7, file=outputFile)
outputFile = "Fsptrm" // baseOutputFilename
open(unit=8, file=outputFile)
do i = 1, N
x1 = dsin(th(i))*dcos(ph(i))
x2 = dsin(th(i))*dsin(ph(i))
x3 = dsqrt(1d0+a)*dcos(th(i))
write(7,*) x1, x2, x3
end do
do i = 1, N/2+1
write(8,*) abs(output1(i)), abs(output2(i))
end do
close(7)
close(8)
do i = 1, N/2+1
output1(i) = dcmplx(0.0d0)
end do
do i = 1, N/2+1
output2(i) = dcmplx(0.0d0)
end do
end do
! Destroying Process for FFTW PLANS
call dfftw_destroy_plan(plan_forward)
call dfftw_destroy_plan(plan_backward)
END PROGRAM
Below is a subroutine file for integration
! We implemented Shelly's spectrally accurate convergence method
SUBROUTINE integration(in1,in2,in3,out1,out2)
USE nrtype
USE SUB_INFO
INTEGER(I4B) :: i, j
REAL(DP) :: th_i, th_j, gi, ph_i, ph_j, gam_j, v1, v2
REAL(DP), DIMENSION(N), INTENT(INOUT) :: in1, in2, in3, out1, out2
REAL(DP) :: ui, uj, part1, part2, gj, cph, sph
REAL(DP) :: denom, numer, temp
do i = 1, N
out1(i) = 0d0
end do
do i = 1, N
out2(i) = 0d0
end do
do i = 1, N
th_i = in1(i)
ph_i = in2(i)
ui = dcos(th_i)
part1 = dsqrt(1d0+a)/(dsqrt(-a)*ui+dsqrt(1d0+a-a*ui*ui))
part1 = part1**(dsqrt(-a))
part2 = (dsqrt(1d0+a-a*ui*ui)+ui)/(dsqrt(1d0+a-a*ui*ui)-ui)
part2 = dsqrt(part2)
gi = dsqrt(1d0-ui*ui)*part1*part2
do j = 1, N
if (mod(i+j,2).eq.1) then
th_j = in1(j)
ph_j = in2(j)
gam_j = in3(j)
uj = dcos(th_j)
part1 = dsqrt(1d0+a)/(dsqrt(-a)*uj+dsqrt(1d0+a-a*uj*uj))
part1 = part1**(dsqrt(-a))
part2 = (dsqrt(1d0+a-a*uj*uj)+uj)/(dsqrt(1d0+a-a*uj*uj)-uj)
part2 = dsqrt(part2)
gj = dsqrt(1d0-ui*ui)*part1*part2
cph = dcos(ph_i-ph_j)
sph = dsin(ph_i-ph_j)
numer = dsqrt(1d0-uj*uj)*sph
denom = (gj/gi*(1d0-ui*ui) + gi/gj*(1d0-uj*uj))*0.5d0
denom = denom - dsqrt((1d0-ui*ui)*(1d0-uj*uj))*cph
denom = denom + krasny_delta
v1 = -0.25d0*gam_j*numer/denom/pi
temp = dsqrt(1d0+(1d0-ui*ui)*a)
numer = -(gj/gi)*(temp+ui)
numer = numer + (gi/gj)*((1d0-uj*uj)/(1d0-ui*ui))*(temp-ui)
numer = numer + 2d0*ui*dsqrt((1d0-uj*uj)/(1d0-ui*ui))*cph
numer = 0.5d0*numer
v2 = -0.25d0*gam_j*numer/denom/pi
out1(i) = out1(i) + 2d0*v1
out2(i) = out2(i) + 2d0*v2
end if
end do
end do
END
Below is a module file
module nrtype
Implicit none
!integer
INTEGER, PARAMETER :: I8B = SELECTED_INT_KIND(20)
INTEGER, PARAMETER :: I4B = SELECTED_INT_KIND(9)
INTEGER, PARAMETER :: I2B = SELECTED_INT_KIND(4)
INTEGER, PARAMETER :: I1B = SELECTED_INT_KIND(2)
!real
INTEGER, PARAMETER :: SP = KIND(1.0)
INTEGER, PARAMETER :: DP = KIND(1.0D0)
!complex
INTEGER, PARAMETER :: SPC = KIND((1.0,1.0))
INTEGER, PARAMETER :: DPC = KIND((1.0D0,1.0D0))
!defualt logical
INTEGER, PARAMETER :: LGT = KIND(.true.)
!mathematical constants
REAL(DP), PARAMETER :: pi = 3.141592653589793238462643383279502884197_dp
!derived data type s for sparse matrices,single and double precision
!User-Defined Constants
INTEGER(I4B), PARAMETER :: N = 4096, Iter_N = 20000
REAL(DP), PARAMETER :: real_N = 4096d0
REAL(DP), PARAMETER :: a = -0.1d0, dt = 0.001d0, krasny_delta = 0.01d0
REAL(DP), PARAMETER :: nv = 0d0, sv = 0d0, threshhold = 0.00000000001d0
!N : The Number of Point Vortices, Iter_N * dt = Total time, dt : Time Step
!krasny_delta : Smoothing Parameter introduced by R.Krasny
!nv : Northern Vortex Strength, sv : Southern Vortex Strength
!a : The Eccentricity in the direction of z , threshhold : Filtering Threshhold
end module nrtype
Below is a subroutine info file
MODULE SUB_INFO
INTERFACE
SUBROUTINE integration(in1,in2,in3,out1,out2)
USE nrtype
INTEGER(I4B) :: i, j
REAL(DP) :: th_i, th_j, gi, ph_i, ph_j, gam_j, v1, v2
REAL(DP), DIMENSION(N), INTENT(INOUT) :: in1, in2, in3, out1, out2
REAL(DP) :: ui, uj, part1, part2, gj, cph, sph
REAL(DP) :: denom, numer, temp
END SUBROUTINE
END INTERFACE
END MODULE
I compiled them using the below command
gfortran -o p0 -fbounds-check nrtype.f90 spheroid_sub_info.f90 spheroid_sub_integration.f90 spheroid_main.f90 -lfftw3 -lm -Wall -pedantic -pg
nohup ./p0 &
Note that 2049 = 4096 / 2 + 1
When making plan_backward, isn't it correct that we use 2049 instead of 4096 since the dimension of output is 2049?
But when I do that, it blows up. (Blowing up means NAN error)
If I use 4096 in making plan_backward, Everything is fine except that some Fourier coefficients are abnormally big which should not happen.
Please help me use FFTW in Fortran correctly. This issue has discouraged me for a long time.
First, although you claim your example is minimal, it is still pretty large, I have no time to study it.
But I updated my gist code https://gist.github.com/LadaF/73eb430682ef527eea9972ceb96116c5 to show also the backward transform and to answer the title question about the transform dimensions.
The logical size of the transform is the size of the real array (Real-data DFT Array Format) but the complex part is smaller due to inherent symmetries.
But when you make first r2c transform from real array of size n to complex array of size n/2+1. and then an opposite transform back, the real array should be again of size n.
This is my minimal example from the gist:
module FFTW3
use, intrinsic :: iso_c_binding
include "fftw3.f03"
end module
use FFTW3
implicit none
integer, parameter :: n = 100
real(c_double), allocatable :: data_in(:)
complex(c_double_complex), allocatable :: data_out(:)
type(c_ptr) :: planf, planb
allocate(data_in(n))
allocate(data_out(n/2+1))
call random_number(data_in)
planf = fftw_plan_dft_r2c_1d(size(data_in), data_in, data_out, FFTW_ESTIMATE+FFTW_UNALIGNED)
planb = fftw_plan_dft_c2r_1d(size(data_in), data_out, data_in, FFTW_ESTIMATE+FFTW_UNALIGNED)
print *, "real input:", real(data_in)
call fftw_execute_dft_r2c(planf, data_in, data_out)
print *, "result real part:", real(data_out)
print *, "result imaginary part:", aimag(data_out)
call fftw_execute_dft_c2r(planb, data_out, data_in)
print *, "real output:", real(data_in)/n
call fftw_destroy_plan(planf)
call fftw_destroy_plan(planb)
end
Note that I am using the modern Fortran interface. I don't like using the old one.
One issue may be that dfftw_execute_dft_c2r can destroy the content of the input array, as described in this page. The key excerpt is
FFTW_PRESERVE_INPUT specifies that an out-of-place transform must not change its input array. This is ordinarily the default, except for c2r and hc2r (i.e. complex-to-real) transforms for which FFTW_DESTROY_INPUTis the default...
We can verify this, for example, by modifying the sample code by #VladimirF such that it saves data_out to data_save right after the first FFT(r2c) call, and then calculating their difference after the second FFT (c2r) call. So, in the case of OP's code, it seems safer to save output1 and output2 to different arrays before entering the second FFT (c2r).
With the following program I experience errors.
Program COM
!Input
!No of Atoms
!No of Iterations
!Respective Positions.
!As of now for homogeneous clusters.
Implicit None
Real, Parameter :: R8B=selected_real_kind(10)
Real, Parameter :: R4B=selected_real_kind(4)
Integer, Parameter :: I1B=selected_int_kind(2)
Integer, Parameter :: I2B=selected_int_kind(4)
Integer, Parameter :: I4B=selected_int_kind(9)
Integer, Parameter :: I8B=selected_int_kind(18)
Real (R8B), Dimension (:,:), Allocatable :: Posx, Posy, Posz
Real (R8B), Dimension (:), Allocatable :: Posx_n, Posy_n, Posz_n
Real (R8B), Dimension (:), Allocatable :: dist_com, avj_dist_com
Integer (I4B), Dimension (:), Allocatable :: bin_array
Real (R8B) :: comx, comy, comz
Integer (I8B) :: nIter, nAtom, dist
Integer (I8B) :: I,J,ii,k
Integer (I1B) :: xyz_format, FlagR, FlagM, Flag_com
Integer (I8B) :: bin
Integer (R8B) :: max_dist
Character (50) POS_file, COM_file,Bin_file
Character (2) jj
Read (*,*) POS_file
Read (*,*) COM_file
Read (*,*) Bin_file
Read (*,*) nAtom
Read (*,*) nIter
Read (*,*) xyz_format
Read (*,*) max_dist, bin
! if Flag_com == 1 then compute dist from COM
! if its 0 then specify the atom no and g(r) will be computed..
! i.e. no of atoms from that atom between dist r and r + dr
Allocate (Posx(nAtom,nIter))
Allocate (Posy(nAtom,nIter))
Allocate (Posz(nAtom,nIter))
! xyz_format = 0 ==> old_ks
! xyz_format = 1 ==> xmakemol
! xyz_format = 2 ==> Envision
write(*,*)POS_file
Open (unit=99, file=POS_file)
if (xyz_format == 0 ) then
do i = 1,nIter
read(99,*)
do j = 1,nAtom
read(99,*)ii,Posx(j,i),Posy(j,i),Posz(j,i),ii
enddo
enddo
elseif (xyz_format == 1 ) then
do i = 1,nIter
read(99,*)ii
read(99,*)
do j = 1,nAtom
read(99,*)jj,Posx(j,i),Posy(j,i),Posz(j,i)
enddo
enddo
elseif (xyz_format == 2 ) then
read(99,*)
read(99,*)
read(99,*)
read(99,*)
do i = 1,nIter
do j = 1,nAtom
read(99,*)
read(99,*)Posx(j,i),Posy(j,i),Posz(j,i)
enddo
enddo
endif
Close (99)
Write (*,'(\1x,"Reading Complete")')
allocate (avj_dist_com (nIter))
allocate (dist_com (nAtom))
avj_dist_com = 0.0d0
dist_com = 0.0d0
Allocate (Posx_n(nAtom))
Allocate (Posy_n(nAtom))
Allocate (Posz_n(nAtom))
Allocate (Bin_Array(bin))
Posx_n = 0.0d0
Posy_n = 0.0d0
Posz_n = 0.0d0
bin_array = 0.0d0
Open (unit=2, file=COM_file)
Do I = 1, nIter
comx = 0.0d0
comy = 0.0d0
comz = 0.0d0
Do J = 1, nAtom
comx = comx + Posx(j,i)
comy = comy + Posy(j,i)
comz = comz + Posz(j,i)
Enddo
comx = comx/nAtom
comy = comy/nAtom
comz = comz/nAtom
Write (*,*) i, comx, comy, comz
Do J = 1, nAtom
Posx_n (j) = Posx(j,i) - comx
Posy_n (j) = Posy(j,i) - comy
Posz_n (j) = Posz(j,i) - comz
dist_com (j) = dsqrt ( Posx_n(j)*Posx_n(j) &
+ Posy_n(j)*Posy_n(j) &
+ Posz_n(j)*Posz_n(j) )
avj_dist_com (i) = avj_dist_com(i) + dist_com(j)
Enddo
avj_dist_com(i) = avj_dist_com(i)/nAtom
Do j = 1, nAtom
dist = dist_com (j) * dfloat((bin/max_dist))
bin_array(dist) = bin_array(dist) + 1
Enddo
write (2,'(2x,i6,143(2x,f10.7))') I, avj_dist_com(i),(dist_com(k),k=1,nAtom)
write(*,*) i
Enddo
close (2)
Open (unit=3, file=Bin_file)
do i = 1, bin
write (3,'(2x,i6,4x,i8)') i , bin_array(i)
enddo
close (3)
deAllocate (Posx)
deAllocate (Posy)
deAllocate (Posz)
deAllocate (Posx_n)
deAllocate (Posy_n)
deAllocate (Posz_n)
deallocate (avj_dist_com)
deallocate (dist_com)
deallocate (bin_array)
Stop
End Program COM
The errors look like
Real(KIND=r8b), Dimension (:), Allocatable :: Posx, Posy, Posz
1
Error: Integer expression required at (1)
and there are many more
How can I rectify these?
The kind parameter for a type must be an integer constant expression. You have the latter part down, as you are using named constants R8B and R4B.
However, and this is what the error message says, you have not used an integer constant expression. You should notice that selected_real_kind returns an integer value even as the kind for a selected real type. So, you can correct your code with
Integer, Parameter :: R8B=selected_real_kind(10)
Integer, Parameter :: R4B=selected_real_kind(4)