Understand the generalized eigenvalue solvers in LAPACK in Fortran - fortran

I tried to solve a generalized eigenvalue problem for both eigenvalue and eigenvectors, at least for the lowest one.
A x = lambda B x
where A and B are two real symmetric matrices. B is positive semi definite.(in practice, positive definite)
Looks like dgeev/dsygv is the right tool, http://www.netlib.org/lapack/explore-html/d9/d8e/group__double_g_eeigen_ga66e19253344358f5dee1e60502b9e96f.html http://www.netlib.org/lapack/explore-html/d2/d8a/group__double_s_yeigen_ga007d33bcdcc697e17c6d15432f159b73.html
As an initial step, I would to see if dgeev/dsygv can reproduce the problem for standard eigenvalue problem, namely when B is a unit matrix, by comparing with dsyev. However, I got different eigenvectors. Here is my fortran code
Program eigen_test
integer, parameter :: dp = selected_real_kind(15, 307)
real(dp) :: A(2,2), B(2,2), work(2), w(2), wr(2), wi(2), vl(2,2), vr(2)
integer :: i,j,N, info, lwork, LDA, LDB, LDVL, LDVR, ITYPE
character :: jobz, uplo, JOBVL, JOBVR
external dsyev, dgeev, DSYGV
N = 2
jobz = "V"
uplo = "U"
lwork = 6 * N
A(1,1) = 1.0_dp
A(1,2) = 0.2_dp
A(2,2) = 1.0_dp
A(2,1) = A(1,2)
B(1,1) = 1.0_dp
B(1,2) = 0.0_dp
B(2,2) = 1.0_dp
B(2,1) = B(1,2)
JOBVL = 'N'
JOBVR = 'V'
LDA = N
LDB = N
LDVL = N
LDVR = N
write (*,*) 'before dsyev', A
call dsyev(jobz, uplo, N, A, N, w, work, lwork, info)
write (*,*) 'after A dsyev', A
A(1,1) = 1.0_dp
A(1,2) = 0.2_dp
A(2,2) = 1.0_dp
A(2,1) = A(1,2)
write (*,*) 'A before dgeev', A
call dgeev(JOBVL, JOBVR, N, A, LDA, WR, WI, VL, LDVL, VR, LDVR, WORK, LWORK, INFO)
write (*,*) 'A after dgeev', A
write (*,*) 'VR',VR
A(1,1) = 1.0_dp
A(1,2) = 0.2_dp
A(2,2) = 1.0_dp
A(2,1) = A(1,2)
B(1,1) = 1.0_dp
B(1,2) = 0.0_dp
B(2,2) = 1.0_dp
B(2,1) = B(1,2)
write (*,*) 'A before DSYGV', A
ITYPE = 1
call DSYGV(ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK, LWORK, INFO)
write (*,*) 'A after DSYGV', A
end program eigen_test
By gfortran eigen.f90 -L/opt/OpenBLAS/lib -lopenblas, (I tried to call it from lopenblas), I got
after A dsyev 6.4512101403162237E-003 -0.99997919072734975 -0.99997919072734975 -6.4512101403162237E-003
A before dgeev 1.0000000000000000 0.20000000000000001 0.20000000000000001 1.0000000000000000
A after dgeev 1.2000000000000000 0.0000000000000000 0.0000000000000000 0.79999999999999993
VR 0.70710678118654746 0.70710678118654746
A before DSYGV 1.0000000000000000 0.20000000000000001 0.20000000000000001 1.0000000000000000
A after DSYGV 6.4512101403162237E-003 -0.99997919072734975 -0.99997919072734975 -6.4512101403162237E-003
I expect the eigenvector to be -0.70710678118654746 0.70710678118654746 or 0.70710678118654746 0.70710678118654746, which can be verified from https://www.emathhelp.net/calculators/linear-algebra/eigenvalue-and-eigenvector-calculator/?i=%5B%5B1%2C0.2%5D%2C%5B0.2%2C1%5D%5D
but it is similar to the result from dgeev, not dsyev or DSYGV. My question is, why dsyev and DSYGV provide different results? Should I somehow input in an upper triangle type of array A?

Related

Getting error, not sure why. Rank mismatch in argument

Every time I compile this program I get these errors...
(This error occurs every time I try to call the subroutine.)
103 | call ColumnInsert(M(n,n), b, n, col, MatOut(n,n))
| 1
Error: Explicit interface required for ‘columninsert’ at (1): assumed-shape argument
(This error also occurs every time I run the function)
107 | detA = Determinant (MatOut(:,:), n)
| 1
Error: Type mismatch in argument ‘m’ at (1); passed INTEGER(4) to REAL(8)
Here is the main program:
program CramersRule
! System of equations. 2x2, 3x3
implicit none
! Declare varialble
integer :: n, row, col, i
real*8, allocatable :: Matrix1(:,:), b(:), x(:)
real*8 :: detA, detM, determinant
logical :: Success
! Open the input and output files.
open(42,file='Data2.txt')
open(43,file='Data2Out.txt')
! Solve each system in the input files.
do
! Read in size of first system.
read(42,*) n
if (n .eq. 0) exit ! Quit if zero.
! Allocate memory for system, right hand side, and solution vector.
allocate(Matrix1(n,n), b(n), x(n))
! Read in the system. Ask if you do not understand how this works!
do row = 1, n
read(42,*) (Matrix1(row, col), col = 1, n), b(row)
print*, Matrix1
enddo
! Use cramers rule to get solution.
call Cramer(Matrix1, b, n, x, Success)
if (Success) then
! Write solution to file
do row = 1, n
write(43,*) x(row)
enddo
write(43,*)
else ! This happens when there is no unique solution.
write(43,*) 'No Solution'
write(43,*)
endif
! clean up memory and go back up to top for next system.
deallocate(Matrix1, b, x)
enddo
! close files
close(42)
close(43)
end program CramersRule
subroutine Cramer(M, b, n, x, Success)
! This subroutine does Cramer's Rule
implicit none
! Declare and initialize your variables first.
real*8, allocatable :: M(:,:), b(:), x(:)
integer :: n, row, col, i
integer :: MatOut(n,n)
real*8 :: detA, detM, x1, x2, x3, Determinant, solution1, solution2, solution3
logical :: Success
! Find the determinant of M first. print it to screen.
detM = Determinant(M, n)
print*, "The determinant of this matrix is = ", detM
! If it is zero, set the Success logical variable and quit.
if (detM .eq. 0) then
Success = .false.
return
end if
! Allocate memory for a working matrix for column substituion. Then, for each
! column, i, substitute column i with vector b and get that determinant.
! Compute the ith solution.
if (n .eq. 2)then
col = 1
call ColumnInsert(M(n,n), b, n, col, MatOut(n,n))
print*, MatOut(:,:)
detA = Determinant (MatOut(:,:), n)
x1 = detA/detM
solution1 = x1
col = col + 1
call ColumnInsert(M, b, n, col, MatOut)
print*, MatOut(:,:)
detA = Determinant (MatOut(:,:), n)
x2 = detA/detM
solution2 = x2
success = .true.
return
else
col = 1
call ColumnInsert(M, b, n, col, MatOut)
print*, MatOut(:,:)
detA = Determinant (MatOut(:,:), n)
x1 = detA/detM
solution1 = x1
col = col + 1
call ColumnInsert(M, b, n, col, MatOut)
print*, MatOut(:,:)
detA = Determinant (MatOut(:,:), n)
x2 = detA/detM
solution2 = x2
col = col +1
call ColumnInsert(M, b, n, col, MatOut)
print*, MatOut(:,:)
detA = Determinant (MatOut(:,:), n)
x3 = detA/detM
solution3 = x3
success = .true.
return
end if
! deallocate memory for the working matrix.
deallocate(M, b, x)
end subroutine Cramer
subroutine ColumnInsert(M, b, n, col, MatOut)
! This subroutine takes vector b and inserts in into matrix M at column col.
implicit none
integer :: n
integer, intent(out) :: col, MatOut(:,:)
real :: a, b1, c, d, e, f, g, h, j, k, l, m1
double precision :: M(n,n), b(1,n)
if (n .eq. 2)then
a = M(1,1)
b1 = M(1,2)
c = M(2,1)
d = M(2,2)
e = b(1,1)
f = b(1,2)
!the next if statement substitutes based on which column the main program asks for
if (col .eq. 1)then
M(1,1) = e
M(1,2) = f
M(2,1) = c
M(2,2) = d
MatOut(:,:) = M(:,:)
print*, MatOut(:,:)
return
else
M(1,1) = a
M(1,2) = b1
M(2,1) = e
M(2,2) = f
MatOut(:,:) = M(:,:)
print*, MatOut(:,:)
return
endif
!this is for 3x3 matricies
else
a = M(1,1)
b1 = M(1,2)
c = M(1,3)
d = M(2,1)
e = M(2,2)
f = M(2,3)
g = M(3,1)
h = M(3,2)
j = M(3,3)
k = b(1,1)
l = b(1,2)
m1 = b(1,3)
if (col .eq. 1) then
M(1,1) = k
M(1,2) = l
M(1,3) = m1
M(2,1) = d
M(2,2) = e
M(2,3) = f
M(3,1) = g
M(3,2) = h
M(3,3) = j
MatOut(:,:) = M(:,:)
print*, MatOut(:,:)
return
else if (col .eq. 2)then
M(1,1) = a
M(1,2) = b1
M(1,3) = c
M(2,1) = k
M(2,2) = l
M(2,3) = m1
M(3,1) = g
M(3,2) = h
M(3,3) = j
MatOut(:,:) = M(:,:)
print*, MatOut(:,:)
return
else
M(1,1) = a
M(1,2) = b1
M(1,3) = c
M(2,1) = d
M(2,2) = e
M(2,3) = f
M(3,1) = k
M(3,2) = l
M(3,3) = m1
MatOut(:,:) = M(:,:)
print*, MatOut(:,:)
return
endif
endif
end subroutine ColumnInsert
function Determinant(M, n) result(Det)
!pulled straight from lab 2 in week 4
integer :: n
real*8 :: M(n,n), Det, a, b, c, d, e, f, g, h, j
if (n .eq. 2) then
a = M(1,1)
b = M(1,2)
c = M(2,1)
d = M(2,2)
Det = (a*d)-(b*c)
else
a = M(1,1)
b = M(1,2)
c = M(1,3)
d = M(2,1)
e = M(2,2)
f = M(2,3)
g = M(3,1)
h = M(3,2)
j = M(3,3)
Det = (a*e*j)+(b*f*g)+(c*d*h)-(c*e*g)-(b*d*j)-(a*f*h)
endif
end function Determinant
I know this might seem like a dumb question to be asking, but I cannot for the life of me find where I need to change something. Any help or guidance is greatly appreciated. Thanks!

Why does a manually programmed matrix multiplication combined with matrix addition give better performance than the intrinsic functions?

I have some legacy code which performs the matrix operation of B = B + A*E as
DO I = 1,N
DO L = 1,N
DO K = 1,N
B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
end do
end do
end do
To improve readability as well as take advantage of modern fortran intrinsic functions, I would like to write the above code as
B = B + matmul( A, E(:, 1:N, J-1) )
I noticed that the improved readability comes at the cost of performance. I determined that the problem is not with the intrinsic function matmul - the left figure shows that matmul performs just as well as the manually written operation for all values of N.
When matrix multiplication is combined with matrix addition, then for small values of N the manually written operation performs better than the intrinsic functions. For my uses, usually N < 10; I would like to improve the readability without losing the performance. Might there be a suggestion for that?
The code I am using is below. I am using Mac OS 10.14.6 with gfortran 8.2.0 and compiling with the -O3 optimization option.
program test
implicit none
integer :: loop_max = 1000
integer :: j ! loop index
integer :: i ! loop index
real :: t1, t2 ! start and end times
real :: t_manual, t_intrinsic, t_man_add, t_intrn_add
integer :: N ! matrix dimension
integer, parameter :: NJ = 12
real, dimension(:, :), allocatable :: A, B ! matrices
real, dimension(:, :), allocatable :: D
real, dimension(:), allocatable :: G
real, dimension(:, :, :), allocatable :: E
open(1, file = 'Delete.txt', status = 'unknown')
do N = 1, 40
allocate(A(N,N), B(N,N), G(N), D(N, 2*N+1), E(N, N+1, NJ))
! ##########################################################################
! manual matrix multiplication vs matmul
call rand_fill
call CPU_time(t1)
do i = 1, loop_max
do j = 2, 12
call matmul_manual(j, N, NJ, A, B, D, G, E)
end do
end do
call CPU_time(t2)
t_manual = t2 - t1
write(1, *) A, B, D, G, E
call rand_fill
call CPU_time(t1)
do i = 1, loop_max
do j = 2, 12
B = matmul( A, E(:, 1:N, j-1) )
end do
end do
call CPU_time(t2)
t_intrinsic = t2 - t1
write(1, *) A, B, D, G, E
! --------------------------------------------------------------------------
! ##########################################################################
! manual matrix multiplication with matrix addition
call rand_fill
call CPU_time(t1)
do i = 1, loop_max
do j = 2, 12
call manual_matmul_add(j, N, NJ, A, B, D, G, E)
end do
end do
call CPU_time(t2)
t_man_add = t2 - t1
write(1, *) A, B, D, G, E
! --------------------------------------------------------------------------
! ##########################################################################
! intrinsic matrix multiplication (matmul) with matrix addition
call rand_fill
call CPU_time(t1)
do i = 1, loop_max
do j = 2, 12
call intrinsic_matmul_add(j, N, NJ, A, B, D, G, E)
end do
end do
call CPU_time(t2)
t_intrn_add = t2 - t1
write(1, *) A, B, D, G, E
! --------------------------------------------------------------------------
deallocate(A, B, D, G, E)
print*, N, t_manual, t_intrinsic, t_man_add, t_intrn_add
end do
contains
subroutine rand_fill
! fill the matrices with random numbers
call random_number(A)
call random_number(B)
call random_number(D)
call random_number(G)
call random_number(E)
end subroutine
end program test
subroutine matmul_manual(j, N, NJ, A, B, D, G, E)
implicit none
integer, intent(in) :: j
integer, intent(in) :: N, NJ
real, dimension(N, N), intent(in out) :: A, B
real, dimension(N, 2*N+1), intent(in out) :: D
real, dimension(N), intent(in out) :: G
real, dimension(N, N+1, NJ), intent(in out) :: E
integer :: I, L, K ! loop indices
B = 0.0
DO I = 1,N
DO L = 1,N
DO K = 1,N
B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
end do
end do
end do
end subroutine matmul_manual
subroutine manual_matmul_add(j, N, NJ, A, B, D, G, E)
implicit none
integer, intent(in) :: j
integer, intent(in) :: N, NJ
real, dimension(N, N), intent(in out) :: A, B
real, dimension(N, 2*N+1), intent(in out) :: D
real, dimension(N), intent(in out) :: G
real, dimension(N, N+1, NJ), intent(in out) :: E
integer :: I, L, K ! loop indices
DO I = 1,N
D(I,N+1) = -G(I)
DO L = 1,N
D(I,N+1) = D(I,N+1)+A(I,L)*E(L,N+1,J-1)
DO K = 1,N
B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
end do
end do
end do
end subroutine manual_matmul_add
subroutine intrinsic_matmul_add(j, N, NJ, A, B, D, G, E)
implicit none
integer, intent(in) :: j
integer, intent(in) :: N, NJ
real, dimension(N, N), intent(in out) :: A, B
real, dimension(N, 2*N+1), intent(in out) :: D
real, dimension(N), intent(in out) :: G
real, dimension(N, N+1, NJ), intent(in out) :: E
real, dimension(N, N+1) :: temp1
real, dimension(N, N) :: temp2
D(:, N+1) = -G + matmul( A, E(:, N+1, j-1) )
B = B + matmul( A, E(:, 1:N, j-1) )
end subroutine intrinsic_matmul_add
subroutine mat_sub_new(j, N, NJ, A, B, D, G, E)
implicit none
integer, intent(in) :: j
integer, intent(in) :: N, NJ
real, dimension(N, N), intent(in out) :: A, B
real, dimension(N, 2*N+1), intent(in out) :: D
real, dimension(N), intent(in out) :: G
real, dimension(N, N+1, NJ), intent(in out) :: E
if (N == 1) then ! matmul seems to be inefficient when N = 1
D(N,N+1) = -G(N) + A(N,N)*E(N, N+1, J-1)
B(N,N) = B(N,N) + A(N,N)*E(N, N, J-1)
else
D(:, N+1) = -G + matmul( A, E(:, N+1, j-1) )
B = B + matmul( A, E(:, 1:N, j-1) )
end if
end subroutine mat_sub_new
I suspect this has to do with two issues:
How the compiler resolves the MATMUL generic interface to a proper call to the correct routine (REAL vs. COMPLEX vs. INTEGER, or matrix times vector vs. matrix times matrix): I have no idea whether this is done systematically at compilation or at runtime, or whether this choice is made based on the optimization level; (this would justify the additional overhead especially for the low-size cases)
The internal "general purpose" algorithm may not be in general the best suited for your problem, as it looks like in some cases, brute-force compiler optimization does a better job. Is gfortran's MATMUL intrinsics based on BLAS, for example? If so, that may not be the fastest.
I've done a similar test with NORM2 on my PC (Windows, i7, gfortran 9.2.0, compiled with -O3 -march=core-avx2): Turns out that, for NORM2:
The BLAS implementation is always slowest, despite having been slightly refactored to feed the compiler a PURE version
Usage of the intrinsics (both NORM2 or SQRT(SUM(X**2))) is always slow, regardless of the array size
The fastest cases are when either using a simple loop, or the intrinsics with a fixed-size array:
ARRAY SIZE assumed-shape fixed-size intrinsic NORM2 LOOP BLAS
N [ms/N] [ms/N] [ms/N] [ms/N] [ms/N]
2 5.93750E-09 4.06250E-09 8.43750E-09 4.06250E-09 1.03125E-08
12 1.03125E-08 7.81250E-09 3.12500E-08 7.81250E-09 5.09375E-08
22 1.65625E-08 1.40625E-08 5.50000E-08 1.43750E-08 9.15625E-08
32 2.25000E-08 2.00000E-08 7.81250E-08 2.00000E-08 1.29375E-07
BTW The code is pasted here below (beware of the large memory footprint!):
program test_norm
use iso_fortran_env
implicit none
integer :: xsize,i,iunit,icase
integer, parameter :: testSize = 50000000
real(real64), allocatable :: set(:,:),setNorm(:)
real(real64) :: t0,t1,setSum(5),timeTable(5,35)
intrinsic :: norm2
open(newunit=iunit,file='garbage.txt',action='write')
print '(6(1x,a15))' ,'ARRAY SIZE','assumed-shape','fixed-size','intrinsic NORM2','LOOP','BLAS'
print '(6(1x,a15))' ,'N',('[ms/N]',i=1,5)
icase = 0
do xsize = 2,32,10
! Initialize test set
icase = icase+1
allocate(set(xsize,testSize),setNorm(testSize))
call random_number(set)
! Test 1: intrinsic SQRT/SUM, assumed-shape array
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v1(set(:,i)); call cpu_time(t1)
setSum(1) = sum(setNorm); timeTable(1,icase) = t1-t0
! Test 2: intrinsic SQRT/SUM, fixed-size array
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v2(xsize,set(:,i)); call cpu_time(t1)
setSum(2) = sum(setNorm); timeTable(2,icase) = t1-t0
! Test 3: intrinsic NORM2
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm2(set(:,i)); call cpu_time(t1)
setSum(3) = sum(setNorm); timeTable(3,icase) = t1-t0
! Test 4: LOOP
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v3(xsize,set(:,i)); call cpu_time(t1)
setSum(4) = sum(setNorm); timeTable(4,icase) = t1-t0
! Test 5: BLAS
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = DNRM2(xsize,set(:,i),1); call cpu_time(t1)
setSum(5) = sum(setNorm); timeTable(5,icase) = t1-t0
! Print output
print '(7x,i2,9x,5(2x,1pe13.5e2,1x))', xsize,timeTable(:,icase)/testSize
write (iunit,*) 'setSum = ',setSum
deallocate(set,setNorm)
end do
close(iunit)
contains
pure real(real64) function norm_v1(x) result(L2)
real(real64), intent(in) :: x(:)
L2 = sqrt(sum(x**2))
end function norm_v1
pure real(real64) function norm_v2(n,x) result(L2)
integer, intent(in) :: n
real(real64), intent(in) :: x(n)
L2 = sqrt(sum(x**2))
end function norm_v2
pure real(real64) function norm_v3(n,x) result(L2)
integer, intent(in) :: n
real(real64), intent(in) :: x(n)
integer :: i
L2 = 0.0_real64
do i=1,n
L2 = L2 + x(i)**2
end do
L2 = sqrt(L2)
end function norm_v3
PURE REAL(REAL64) FUNCTION DNRM2 ( N, X, INCX )
INTEGER, INTENT(IN) :: N,INCX
REAL(REAL64), INTENT(IN) :: X( * )
REAL(REAL64), PARAMETER :: ONE = 1.0_REAL64
REAL(REAL64), PARAMETER :: ZERO = 0.0_REAL64
INTEGER :: IX
REAL(REAL64) :: ABSXI, NORM, SCALE, SSQ
INTRINSIC :: ABS, SQRT
IF( N<1 .OR. INCX<1 )THEN
NORM = ZERO
ELSE IF( N==1 )THEN
NORM = ABS( X( 1 ) )
ELSE
SCALE = ZERO
SSQ = ONE
DO IX = 1, 1 + ( N - 1 )*INCX, INCX
IF( X( IX )/=ZERO )THEN
ABSXI = ABS( X( IX ) )
IF( SCALE<ABSXI )THEN
SSQ = ONE + SSQ*( SCALE/ABSXI )**2
SCALE = ABSXI
ELSE
SSQ = SSQ + ( ABSXI/SCALE )**2
END IF
END IF
END DO
NORM = SCALE * SQRT( SSQ )
END IF
DNRM2 = NORM
RETURN
END FUNCTION DNRM2
end program test_norm

How can I reduce floating-point error in a cubic equation solver?

I'm using the following algorithm to solve a cubic polynomial equation (x^3 + ax^2 + bx + c = 0):
function find_roots(a, b, c, lower_bound, upper_bound)
implicit none
real*8, intent(in) :: a, b, c, lower_bound, upper_bound
real*8 :: find_roots
real*8 :: Q, R, theta, x, Au, Bu
integer :: i, iter
Q = (a**2 - 3.D0*b)/9.D0
R = (2.D0*a**3 - 9.D0*a*b + 27.D0*c)/54.D0
!If roots are all real, get root in range
if (R**2.lt.Q**3) then
iter = 0
theta = acos(R/sqrt(Q**3))
!print *, "theta = ", theta
do i=-1,1
iter = iter+1
x = -2.D0*sqrt(Q)*cos((theta + dble(i)*PI*2.D0)/3.D0)-a/3.D0
!print *, "iter = ", iter, "root = ", x
if ((x.ge.lower_bound).and.(x.le.upper_bound)) then
find_roots = x
return
end if
end do
!Otherwise, two imaginary roots and one real root, return real root
else
Au = -sign(1.D0, R)*(abs(R)+sqrt(R**2-Q**3))**(1.D0/3.D0)
if (Au.eq.0.D0) then
Bu = 0.D0
else
Bu = Q/Au
end if
find_roots = (Au+Bu)-a/3.D0
return
end if
end function find_roots
Now it turns out that it can be shown analytically that a cubic equation with the following inputs:
Q0 = 1.D0
alpha = 1.D-2
dt = 0.00001D0
Y = 1000000.D0
find_roots(-(2.D0*Q0+Y), &
-(alpha-Q0**2-2.D0*Y*Q0+dt/2.D0*alpha), &
(dt/2.D0*alpha*Q0+Y*alpha-Y*Q0**2), &
Q0-sqrt(alpha), &
Q0+sqrt(alpha)))
MUST have a root between Q0+sqrt(alpha) and Q0-sqrt(alpha). This is a mathematical certainty. However, the function as called above will return 0, not the correct root, due to floating-point error, since the required result is very close to Q0+sqrt(alpha). I've confirmed this by creating a new function which uses quadruple precision. Unfortunately, I can't just always use quadruple precision since this function will be called billions of times and is a performance bottleneck.
So my question is, are there any general ways I could re-write this code to reduce these precision errors, while also maintaining the performance? I tried using the algorithm suggested by wikipedia, but the problem actually got worse.
https://www.cliffsnotes.com/study-guides/algebra/algebra-ii/factoring-polynomials/sum-or-difference-of-cubes
This should reduce rounding error.
Likewise, you should be able to find a much better grouping of terms, where you don't make the compiler guess what you want,
https://en.wikipedia.org/wiki/Horner%27s_method
alpha-Q0**2-2.D0*Y*Q0+dt/2.D0*alpha /= (alpha+alpha*.5*dt)-Q0*(Q0+2*Y)
You might argue that any good optimizer should know what to do with .5dt vs. dt/2. ifort considers that a part of -no-prec-div even though it can't change roundoff.
It's up to you whether you choose single precision constants for readability after checking to make sure that the promotion rules cause them to promote exactly to double. It seems particularly bad style to depend on f77 D0 suffix to choose the same data type as the never-standard real*8; no doubt it does if your compiler doesn't complain.
There is something wrong with the accuracy of your calculations, either the calculation of a,b,c or the find_roots function estimates.
I used the a,b,c that are calculated and found that your lower_bound and upper_bound were better estimates of the roots.
I then modified the bounds to be +/- sqrt(alpha)*1.1 so that the range test would work for 64-bit.
I also simplified constants that promote exactly to double.
Finally I compared your estimate of the root to the fn (0.9d0) and fn (1.1d0), which shows the find_roots function does not work for the a,b,c provided.
You should check your references for the error or it may just be the approach fails when acos (+/- 1.0 ) is used.
The program I used to test this with lots of prints is:
real*8 function find_roots (a, b, c, lower_bound, upper_bound)
implicit none
real*8, intent(in) :: a, b, c, lower_bound, upper_bound
real*8 :: Q, R, theta, x, Au, Bu, thi
integer :: i, iter
real*8 :: two_pi ! = 8 * atan (1.0d0)
Q = (a**2 - 3.*b)/9.
R = (2.*a**3 - 9.*a*b + 27.*c)/54.
two_pi = 8 * atan (1.0d0)
!If roots are all real, get root in range
if (R**2 < Q**3) then
iter = 0
x = R/sqrt(Q**3)
theta = acos(x)
print *, "theta = ", theta, x
do i=-1,1
iter = iter+1
!! x = -2.D0*sqrt(Q)* cos((theta + dble(i)*PI*2.D0)/3.D0) - a/3.D0
thi = (theta + i*two_pi)/3.
x = -2.*sqrt(Q) * cos (thi) - a/3.
!print *, "iter = ", iter, "root = ", x
if ( (x >= lower_bound) .and. (x <= upper_bound) ) then
find_roots = x
print *, "find_roots = ", x
! return
end if
end do
!Otherwise, two imaginary roots and one real root, return real root
else
Au = -sign(1.D0, R)*(abs(R)+sqrt(R**2-Q**3))**(1.D0/3.D0)
if (Au.eq.0.D0) then
Bu = 0.D0
else
Bu = Q/Au
end if
find_roots = (Au+Bu)-a/3.D0
return
end if
end function find_roots
real*8 function get_cubic (x, a, b, c)
implicit none
real*8, intent(in) :: x, a, b, c
get_cubic = ( ( x + a) * x + b ) * x + c
end function get_cubic
! Now it turns out that it can be shown analytically that a cubic equation with the following inputs:
real*8 Q0, alpha, dt, Y, a, b, c, lower_bound, upper_bound, val, fn
real*8, external :: find_roots, get_cubic
!
Q0 = 1.D0
alpha = 1.0D-2
dt = 0.00001D0
Y = 1000000.0D0
!
a = -(2.*Q0 + Y)
b = -(alpha - Q0**2 - 2.*Y*Q0 + dt/2.*alpha)
c = (dt/2.*alpha*Q0 + Y*alpha - Y*Q0**2)
write (*,*) a,b,c
!
lower_bound = Q0-sqrt(alpha)*1.1
upper_bound = Q0+sqrt(alpha)*1.1
write (*,*) lower_bound, upper_bound
!
val = find_roots (a, b, c, lower_bound, upper_bound)
!
fn = get_cubic ( val, a,b,c )
write (*,*) val, fn
!
! Test the better root values
val = 0.9d0
fn = get_cubic ( val, a,b,c )
write (*,*) val, fn
!
val = 1.1d0
fn = get_cubic ( val, a,b,c )
write (*,*) val, fn
end

Eigenvector discontinuities in hermitian matrix diagonalization

I need to diagonalize a 2x2 Hermitian matrix that depends on a parameter x, which varies continuously. For diagonalization I use EISPACK. When I plot the real and imaginary components of eigenvectors as a function of x, I notice that they have discontinuities. The eigenvalues calculation is OK. When I plot the eigenvectors in Maxima, the solutions appear continuous. I need the continuous eigenvectors since in next step I will need to calculate their derivatives.
Below the f77 code I use as test (compiling with gfortran on mingw).
program Eigenvalue
implicit none
integer n, m
parameter (n=2)
integer ierr, matz, i, j
double precision x, dx, xf, amp, xin
double precision w(n)
double precision Ar(n,n), Ai(n,n)
double precision xr(n,n), xi(n,n)
double precision fm1(2,n) ! f77
double precision fv1(n) ! f77
double precision fv2(n) ! f77
double complex psi1a, psi1b, psi2a, psi2b
m = 51
xf = 10.d0
xin = 0.0d0
amp = 2.d0
dx = (xf - xin)/(m-1)
do i = 1, m
x = dx*(i-m) + xf
Ar(1,1) = dsin(x)**2
Ar(1,2) = amp*dcos(x)
Ar(2,1) = amp*dcos(x)
Ar(2,2) = dcos(x)**2
Ai(1,1) = 0.0d0
Ai(1,2) = amp*dsin(x)
Ai(2,1) = -amp*dsin(x)
Ai(2,2) = 0.0d0
matz = 1
call ch ( n, n, ar, ai, w, matz, xr, xi, fv1, fv2, fm1, ierr ) !f77
write(20,*) x, w(1), w(2)
write(21,*) x, xr(1,1), xi(1,1)
write(22,*) x, xr(2,1), xi(2,1)
write(23,*) x, xr(1,2), xi(1,2)
write(24,*) x, xr(2,2), xi(2,2)
! autovetor 1
psi1a = cmplx(xr(1,1),xi(1,1))
psi1b = cmplx(xr(1,2),xi(1,2))
! autovetor 2
psi2a = cmplx(xr(2,1),xi(2,1))
psi2b = cmplx(xr(2,2),xi(2,2))
end do
end
While not really an answer, what follows is the code I used with LAPACK.
I used the latest versions of LAPACK and BLAS, with the following compiler options:
gfortran -Og -std=f2008 -Wall -Wextra {location_of_lapack}/liblapack.a {location_of_blas}/blas_LINUX.a main.f90 -o main
I'm compiling on Mac OS X with gfortran 6.3.0 from homebrew.
As Ian mentioned above, things like dcos are replaced with cos and I have used the KIND= formulation to ensure the same precision.
Ian also mentioned above about the arbitrary phase.
This problem is answered here; I have translated this solution into my code below.
The "magic" happens after the call to ZHEEV.
With this fix, I see no discontinuities.
program Eigenvalue
!> This can be used with the f2008 call
use, intrinsic :: iso_fortran_env
implicit none
!> dp contains the kind value for double precision.
!> Use below if compiling to f2008
integer, parameter :: dp = REAL64
!> Use below if compiling with f95 up and comment out iso_fortran_env
!>integer, parameter :: dp = SELECTED_REAL_KIND(15, 300)
!> Set wp to the desired precision.
integer, parameter :: wp = dp
integer, parameter :: n = 2
integer :: i, j, k, m
real(kind=wp) x, dx, xf, amp, xin
real(kind=wp), dimension(n) :: w
real(kind=wp), dimension(n, n) :: Ar, Ai
complex(kind=wp), dimension(n, n) :: A
complex(kind=wp), dimension(max(1,2*n-1)) :: WORK
integer, parameter :: lwork = max(1,2*n-1)
real(kind=wp), dimension(max(1, 3*n-2)) :: RWORK
integer :: info
complex(kind=wp) :: psi1a, psi1b, psi2a, psi2b
real(kind=wp) :: mag
m = 51
xf = 10.0_wp
xin = 0.0_wp
amp = 2.0_wp
if (m .eq. 1) then
dx = 0.0_wp
else
dx = (xf - xin)/(m-1)
end if
do i = 1, m
x = dx*(i-m) + xf
Ar(1,1) = sin(x)**2
Ar(1,2) = amp*cos(x)
Ar(2,1) = amp*cos(x)
Ar(2,2) = cos(x)**2
Ai(1,1) = 0.0_wp
Ai(1,2) = amp*sin(x)
Ai(2,1) = -amp*sin(x)
Ai(2,2) = 0.0_wp
do j = 1, n
do k = 1, n
A(j, k) = cmplx(Ar(j, k), Ai(j, k), kind=wp)
end do
end do
call ZHEEV('V', 'U', N, A, N, W, WORK, LWORK, RWORK, INFO)
do j = 1, n
A(:, j) = A(:, j) / A(1, j)
mag = sqrt(real(A(1, j)*conjg(A(1, j)))+ real(A(2, j)*conjg(A(2, j))))
A(:, j) = A(:, j)/mag
end do
psi1a = A(1, 1)
psi1b = A(1, 2)
psi2a = A(2, 1)
psi2b = A(2, 2)
end do
end program Eigenvalue

Check bounds changes variables

I'm porting a program that I use in a chemistry classroom from Matlab (very forgiving) to Fortran (err, not so much). The problem I see is that if I include print statements in 1 subroutine, my code returns significantly different values than if I don't (the ones with the print statement included are correct).
After reading stack overflow, I removed the print statement, recompiled with gfortran and fcheck='bounds', and my program returned the correct results, and no errors during compile.
The subroutines stored in a module Basis_Subs, and called from the main program, which I've posted below. The problem appears in the 4 dimensional matrix Gabcd(nb,nb,nb,nb) which is constructed using the subroutine Build_Electron_Repulsion from the Basis_Subs module. That subroutine calculates the matrix elements of Gabcd, and uses 1 internal helper functions, Rntuv, and 1 internal subroutine Gprod_1D, both of which are also stored in the Basis_Subs module.
These functions/routines are used in another section of the program, and that portion of the program doesn't show any errors or funny array behavior. That leads me to think the problem must either be in Build_Electron_Repulsion, how I'm calling Build_Electron_Repulsion or how I'm calling the the helper functions from inside Build_Electron_Repulsion.
I've posted the main program, and the subroutines for Build_Electron_Repulsion, gprod_1D, and the function Rntuv. What I'm really wondering is if you have any tips on tracking down where the error might be.
I'm using a pico style editor and gfortran.
Main Program, Z.f08
program HF
use typedefs
use Basis_Subs
use SCF_Mod
implicit none
real(dp) :: output, start, finish
integer (kind=4) :: IFLAG , i, N, nb,j,k,l,natom
integer, allocatable, dimension(:) :: Z
real(dp), allocatable, dimension(:,:) :: AL, S,T, VAB, H0
real(dp), allocatable, dimension(:,:,:,:) :: Gabcd
real(dp), dimension(maxl) :: Ex=0
real(dp) :: Energy, Nuc
type(primitive) :: g1, Build_Primitive
type(Basis) :: b1
type(Basis), dimension(100) :: bases
character(LEN=20) :: fname
print *, 'Input the filename'
read (*,*), fname
open(unit=12, file=fname)
read(12,*) natom
allocate(Z(natom))
allocate(AL(natom,3))
read(12,*) Z
do i=1, natom
read(12,*) AL(i,1), AL(i,2), AL(i,3)
end do
print *, 'Atomic Coorinates = ', AL
print *, 'Z in the main routine = ', Z
call cpu_time(start)
%Calculate the energies that don't depend on electrons
call Nuclear_Repulsion(natom, Z, AL, Nuc)
N=Sum(Z)
%Build the atom specific basis set
call Build_Bases(Z, AL, nb, bases)
%Using nb, from Build_Basis, allocate matrices
allocate(S(nb,nb))
allocate(T(nb,nb))
allocate(VAB(nb,nb))
allocate(Gabcd(nb,nb,nb,nb))
call Build_Overlap(bases, nb, S)
call Build_Kinetic(bases, nb, T)
call Build_Nuclear_Attraction(Z, AL, bases, nb, VAB)
H0 = T+VAB
call Build_Electron_Repulsion(bases, nb, Gabcd)
call cpu_time(finish)
print *, 'Total time for Matrix Elements= ', finish - start
call SCF(N, nb, H0, S, Gabcd, Nuc, Energy)
end program HF
Build_Electron_Repulsion is located inside the module Basis_Subs:
subroutine Build_Electron_Repulsion(bases, nbases, Gabcd)
!!Calculate the 4 centered electron repulsion integrals. Loop over array of !!basis sets 1:nb 4 times. Each element of basis set is a defined type that !!includes and array of gaussian functions and contraction coefficients !!basis(a)%g(1:nga) and basis(a)%c(1:nga). For each gaussian in each basis set,
!!Calculate int(int(basis(a1)*basis(b1)*basis(c2)*basis(d2)*1/r12 dr1)dr2).
!!Uses helper function Rntuv listed below
implicit none
type(basis), dimension(100), intent(in) :: bases
integer, intent(in) :: nbases
real(dp), dimension(nbases, nbases,nbases,nbases), intent(out) :: Gabcd
integer :: a, b,c,d, nga, ngb, ngc, ngd, index, lx, ly, lz, llx, lly,llz
integer :: llxmax, llymax, llzmax, lxmax, lymax, lzmax, xmax, ymax, zmax
integer :: x, y, z
real(dp) :: p, q, midpoint, PX, PY, PZ, output
real(dp) :: pp, qq, midpoint2, PPX, PPY, PPZ, tmp
real(dp) :: alpha_a, alpha_b, alpha_c, alpha_d, alpha
real(dp) :: ax, ay, az, bx, by, bz, cx,cy,cz, dx,dy,dz
real(dp), dimension(maxl) ::EabX, EabY, EabZ, EcdX, EcdY, EcdZ
real(dp), dimension(2*maxl, 2*maxl, 2*maxl) :: R
R=0
Gabcd=0.0D0
print *, 'Calculating 4 centered integrals'
do a=1, nbases
do b=1, nbases
do c=1, nbases
do d=1, nbases
do nga = 1, bases(a)%n
do ngb = 1, bases(b)%n
alpha_a=bases(a)%g(nga)%alpha
alpha_b=bases(b)%g(ngb)%alpha
p=alpha_a + alpha_b
ax=bases(a)%g(nga)%x
ay=bases(a)%g(nga)%y
az=bases(a)%g(nga)%z
bx=bases(b)%g(ngb)%x
by=bases(b)%g(ngb)%y
bz=bases(b)%g(ngb)%z
PX=(alpha_a*ax + alpha_b*bx)/p
PY=(alpha_a*ay + alpha_b*by)/p
PZ=(alpha_a*az + alpha_b*bz)/p
call gprod_1D(ax, alpha_a, bases(a)%g(nga)%lx, bx, alpha_b, bases(b)%g(ngb)%lx, EabX)
call gprod_1D(ay, alpha_a, bases(a)%g(nga)%ly, by, alpha_b, bases(b)%g(ngb)%ly, EabY)
call gprod_1D(az, alpha_a, bases(a)%g(nga)%lz, bz, alpha_b, bases(b)%g(ngb)%lz, EabZ)
lxmax=bases(a)%g(nga)%lx + bases(b)%g(ngb)%lx
lymax=bases(a)%g(nga)%ly + bases(b)%g(ngb)%ly
lzmax=bases(a)%g(nga)%lz + bases(b)%g(ngb)%lz
do ngc= 1, bases(c)%n
do ngd = 1, bases(d)%n
alpha_c=bases(c)%g(ngc)%alpha
alpha_d=bases(d)%g(ngd)%alpha
pp=alpha_c + alpha_d
cx=bases(c)%g(ngc)%x
cy=bases(c)%g(ngc)%y
cz=bases(c)%g(ngc)%z
dx=bases(d)%g(ngd)%x
dx=bases(d)%g(ngd)%y
dz=bases(d)%g(ngd)%z
PPX=(alpha_c*cx + alpha_d*dx)/pp
PPY=(alpha_c*cy + alpha_d*dy)/pp
PPZ=(alpha_c*cz + alpha_d*dz)/pp
llxmax=bases(c)%g(ngc)%lx + bases(d)%g(ngd)%lx
llymax=bases(c)%g(ngc)%ly + bases(d)%g(ngd)%ly
llzmax=bases(c)%g(ngc)%lz + bases(d)%g(ngd)%lz
call gprod_1D(cx, alpha_c, bases(c)%g(ngc)%lx, dx, alpha_d, bases(d)%g(ngd)%lx, EcdX)
call gprod_1D(cy, alpha_c, bases(c)%g(ngc)%ly, dy, alpha_d, bases(d)%g(ngd)%ly, EcdY)
call gprod_1D(cz, alpha_c, bases(c)%g(ngc)%lz, dz, alpha_d, bases(d)%g(ngd)%lz, EcdZ)
alpha=p*pp/(p+pp)
tmp=0
xmax= lxmax + llxmax
ymax = lymax + llymax
zmax = lzmax + llzmax
do x = 0, xmax
do y =0, ymax
do z=0, zmax
R(x+1,y+1,z+1)=Rntuv(0,x,y,z,alpha, PX, PY, PZ, PPX, PPY, PPZ)
end do
end do
end do
!if (a ==1 .and. b==1 .and. c ==1 .and. d==1) then
! print *,' R = ', R(1,1,1)
!print *, xmax, ymax, zmax
!print *,a,b,c,d,nga,ngb,ngc,ngd, 'R = ', R(1,1,1)
!end if
! if (PZ ==PPZ) then
! ! print *, R(1,1,1)
! output = Rntuv(0,0,0,0,alpha, PX, PY, PZ, PPX, PPY, PPZ)
! print *, output
! print *, a,b,c,d , PY, PPY
!
! end if
do lx = 0, lxmax
do ly = 0, lymax
do lz = 0, lzmax
do llx= 0, llxmax
do lly= 0, llymax
do llz= 0, llzmax
tmp = tmp + EabX(lx+1)*EabY(ly+1)*EabZ(lz+1)*(-1.0D0)**(llx + lly + llz) * &
EcdX(llx+1)*EcdY(lly+1)*EcdZ(llz+1)*R(lx+ llx+1, ly+lly+1, lz+llz+1)
end do
end do
end do
end do
end do
end do
Gabcd(a,b,c,d) = Gabcd(a,b,c,d) + 2.0D0*pi**2.5D0/(p*pp*sqrt(p + pp))*tmp*bases(a)%g(nga)%N &
* bases(b)%g(ngb)%N * bases(c)%g(ngc)%N * bases(d)%g(ngd)%N * bases(a)%c(nga) &
* bases(b)%c(ngb) * bases(c)%c(ngc) * bases(d)%c(ngd)
end do
end do
end do
end do
end do
end do
end do
end do
end subroutine Build_Electron_Repulsion
real(dp) function Rntuv(n, tmax, umax, vmax, p, Px, Py, Pz, Ax, Ay, Az) result(out)
!Rntuv(n, t,u,v,p,P,A)Determine the helper integral Rntuv for the coulomb
!integral of order n, the t,u,v th Hermite polynomial with exponent p
!centered at [Px Py Pz] and charge centered at location [Ax Ay Az];
implicit none
integer, intent(in) :: n, tmax, umax, vmax
real(dp), intent(in) :: Px, Py, Pz, Ax, Ay, Az, p
real(dp) :: PA2, output
real(dp), dimension(n+tmax+umax+vmax+2, tmax+1, umax+1, vmax+1) :: R
integer :: nmax, t, u, v
integer :: i, IFLAG
R=0
nmax = n+ tmax + umax + vmax + 2
PA2 = (Px-Ax)**2.0D0 + (Py-Ay)**2.0D0 + (Pz-Az)**2.0D0
do i = 0, nmax-1
output=Boys(i, p*PA2)
R(i+1,1,1,1)= (-2*p)**(1.0D0*i)*Boys(i, p*PA2)
end do
do t=1, tmax
if (t==1) then
do i=1,nmax-1
R(i,2,1,1)=(Px - Ax)*R(i+1,1,1,1)
end do
else
do i=1,nmax-1
R(i,t+1,1,1)=(t-1)*R(i+1,t-1,1,1)+ (Px-Ax)*R(i+1,t,1,1)
end do
end if
end do
do u = 1,umax
if (u==1) then
do i = 1,nmax-1
R(i,tmax+1,2,1)=(Py-Ay)*R(i+1,tmax+1,1,1)
end do
else
do i = 1,nmax-1
R(i,tmax+1,u+1,1)=(u-1)*R(i+1,tmax+1,u-1,1) + (Py-Ay)*R(i+1,tmax+1,u,1)
end do
end if
end do
do v=1,vmax
if (v==1) then
do i = 1, nmax-1
R(i,tmax+1,umax+1,2)=(Pz-Az)*R(i+1,tmax+1,umax+1,1)
end do
else
do i = 1, nmax-1
R(i,tmax+1,umax+1,v+1)=(v-1)*R(i+1,tmax+1,umax+1,v-1) + (Pz-Az)*R(i+1,tmax+1,umax+1,v)
end do
end if
end do
out = R(n+1,tmax+1,umax+1,vmax+1)
end function Rntuv
subroutine gprod_1D(x1, alpha1, lx1, x2, alpha2, lx2, Ex)
real(dp), intent(in) :: x1, alpha1, x2, alpha2
integer, intent(in) :: lx1, lx2
integer :: tmax, i, j ,t, qint
real(dp) :: p, q, midpoint, weighted_middle, KAB
real(dp), dimension(maxl), intent(inout) :: Ex
real(dp), dimension(maxl, maxl, 2*maxl) ::coefficients
coefficients=0.0D0
tmax=lx1 + lx2
Ex=0
p=alpha1 + alpha2
q=alpha1*alpha2/p
midpoint = x1 - x2
weighted_middle=(alpha1*x1 + alpha2*x2)/p
KAB= e**(-q*midpoint**2.0D0)
coefficients(1,1,1) = KAB
i=0
j=0
do while (i < lx1)
do t= 0, i+j+1
if (t==0) then
coefficients(i+2,j+1,t+1)=(weighted_middle - x1)*coefficients(i+1,j+1,t+1) + (t+1)*coefficients(i+1,j+1,t+2)
else
coefficients(i+2,j+1,t+1)=1/(2*p)*coefficients(i+1,j+1,t) + (weighted_middle-x1)*coefficients(i+1,j+1,t+1) + &
(t+1)*coefficients(i+1,j+1,t+2)
end if
end do
i=i+1
end do
do while (j < lx2)
do t=0, i+j+1
if (t==0) then
coefficients(i+1,j+2,t+1) = (weighted_middle - x2)*coefficients(i+1,j+1,t+1) + (dble(t)+1.0d0)*coefficients(i+1,j+1,t+2)
else
coefficients(i+1,j+2,t+1)=1/(2*p)*coefficients(i+1,j+1,t) + (weighted_middle - x2)*coefficients(i+1,j+1,t+1) + &
(t+1)*coefficients(i+1,j+1,t+2)
end if
end do
j=j+1
end do
do qint=1, i+j+1
Ex(qint) = coefficients(i+1,j+1,qint)
end do
end subroutine gprod_1D