I have some legacy code that performs the matrix operation B = B + A*E as
DO I = 1,N
DO L = 1,N
DO K = 1,N
B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
end do
end do
end do
To improve readability and take advantage of modern Fortran intrinsic functions, I would like to write the above code as
B = B + matmul( A, E(:, 1:N, J-1) )
I noticed that the improved readability comes at the cost of performance. I determined that the problem is not with the intrinsic function matmul itself - the left figure shows that matmul performs just as well as the manually written loops for all values of N.
When matrix multiplication is combined with matrix addition, however, the manually written loops outperform the intrinsic functions for small values of N. For my use cases N is usually less than 10; I would like to improve the readability without losing the performance. Are there any suggestions for achieving that?
The code I am using is below. I am using Mac OS 10.14.6 with gfortran 8.2.0 and compiling with the -O3 optimization option.
program test
implicit none
integer :: loop_max = 1000
integer :: j ! loop index
integer :: i ! loop index
real :: t1, t2 ! start and end times
real :: t_manual, t_intrinsic, t_man_add, t_intrn_add
integer :: N ! matrix dimension
integer, parameter :: NJ = 12
real, dimension(:, :), allocatable :: A, B ! matrices
real, dimension(:, :), allocatable :: D
real, dimension(:), allocatable :: G
real, dimension(:, :, :), allocatable :: E
open(1, file = 'Delete.txt', status = 'unknown')
do N = 1, 40
allocate(A(N,N), B(N,N), G(N), D(N, 2*N+1), E(N, N+1, NJ))
! ##########################################################################
! manual matrix multiplication vs matmul
call rand_fill
call CPU_time(t1)
do i = 1, loop_max
do j = 2, 12
call matmul_manual(j, N, NJ, A, B, D, G, E)
end do
end do
call CPU_time(t2)
t_manual = t2 - t1
write(1, *) A, B, D, G, E
call rand_fill
call CPU_time(t1)
do i = 1, loop_max
do j = 2, 12
B = matmul( A, E(:, 1:N, j-1) )
end do
end do
call CPU_time(t2)
t_intrinsic = t2 - t1
write(1, *) A, B, D, G, E
! --------------------------------------------------------------------------
! ##########################################################################
! manual matrix multiplication with matrix addition
call rand_fill
call CPU_time(t1)
do i = 1, loop_max
do j = 2, 12
call manual_matmul_add(j, N, NJ, A, B, D, G, E)
end do
end do
call CPU_time(t2)
t_man_add = t2 - t1
write(1, *) A, B, D, G, E
! --------------------------------------------------------------------------
! ##########################################################################
! intrinsic matrix multiplication (matmul) with matrix addition
call rand_fill
call CPU_time(t1)
do i = 1, loop_max
do j = 2, 12
call intrinsic_matmul_add(j, N, NJ, A, B, D, G, E)
end do
end do
call CPU_time(t2)
t_intrn_add = t2 - t1
write(1, *) A, B, D, G, E
! --------------------------------------------------------------------------
deallocate(A, B, D, G, E)
print*, N, t_manual, t_intrinsic, t_man_add, t_intrn_add
end do
contains
subroutine rand_fill
! fill the matrices with random numbers
call random_number(A)
call random_number(B)
call random_number(D)
call random_number(G)
call random_number(E)
end subroutine
end program test
subroutine matmul_manual(j, N, NJ, A, B, D, G, E)
implicit none
integer, intent(in) :: j
integer, intent(in) :: N, NJ
real, dimension(N, N), intent(in out) :: A, B
real, dimension(N, 2*N+1), intent(in out) :: D
real, dimension(N), intent(in out) :: G
real, dimension(N, N+1, NJ), intent(in out) :: E
integer :: I, L, K ! loop indices
B = 0.0
DO I = 1,N
DO L = 1,N
DO K = 1,N
B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
end do
end do
end do
end subroutine matmul_manual
subroutine manual_matmul_add(j, N, NJ, A, B, D, G, E)
implicit none
integer, intent(in) :: j
integer, intent(in) :: N, NJ
real, dimension(N, N), intent(in out) :: A, B
real, dimension(N, 2*N+1), intent(in out) :: D
real, dimension(N), intent(in out) :: G
real, dimension(N, N+1, NJ), intent(in out) :: E
integer :: I, L, K ! loop indices
DO I = 1,N
D(I,N+1) = -G(I)
DO L = 1,N
D(I,N+1) = D(I,N+1)+A(I,L)*E(L,N+1,J-1)
DO K = 1,N
B(I,K) = B(I,K) + A(I,L)*E(L,K,J-1)
end do
end do
end do
end subroutine manual_matmul_add
subroutine intrinsic_matmul_add(j, N, NJ, A, B, D, G, E)
implicit none
integer, intent(in) :: j
integer, intent(in) :: N, NJ
real, dimension(N, N), intent(in out) :: A, B
real, dimension(N, 2*N+1), intent(in out) :: D
real, dimension(N), intent(in out) :: G
real, dimension(N, N+1, NJ), intent(in out) :: E
real, dimension(N, N+1) :: temp1
real, dimension(N, N) :: temp2
D(:, N+1) = -G + matmul( A, E(:, N+1, j-1) )
B = B + matmul( A, E(:, 1:N, j-1) )
end subroutine intrinsic_matmul_add
subroutine mat_sub_new(j, N, NJ, A, B, D, G, E)
implicit none
integer, intent(in) :: j
integer, intent(in) :: N, NJ
real, dimension(N, N), intent(in out) :: A, B
real, dimension(N, 2*N+1), intent(in out) :: D
real, dimension(N), intent(in out) :: G
real, dimension(N, N+1, NJ), intent(in out) :: E
if (N == 1) then ! matmul seems to be inefficient when N = 1
D(N,N+1) = -G(N) + A(N,N)*E(N, N+1, J-1)
B(N,N) = B(N,N) + A(N,N)*E(N, N, J-1)
else
D(:, N+1) = -G + matmul( A, E(:, N+1, j-1) )
B = B + matmul( A, E(:, 1:N, j-1) )
end if
end subroutine mat_sub_new
I suspect this has to do with two issues:
How the compiler resolves the MATMUL generic interface to a call to the correct specific routine (REAL vs. COMPLEX vs. INTEGER, or matrix times vector vs. matrix times matrix): I have no idea whether this is done at compile time or at run time, or whether the choice depends on the optimization level; runtime dispatch would explain the additional overhead, especially for the small-size cases.
The internal "general purpose" algorithm may not, in general, be the best suited for your problem; in some cases, brute-force compiler optimization of a plain loop appears to do a better job. Is gfortran's MATMUL intrinsic based on BLAS, for example? If so, that may not be the fastest option.
I've done a similar test with NORM2 on my PC (Windows, i7, gfortran 9.2.0, compiled with -O3 -march=core-avx2). It turns out that, for NORM2:
The BLAS implementation is always the slowest, despite having been slightly refactored to feed the compiler a PURE version.
Usage of the intrinsics (whether NORM2 or SQRT(SUM(X**2))) is always slow, regardless of the array size.
The fastest cases are either a simple loop or the intrinsics applied to a fixed-size array:
ARRAY SIZE   assumed-shape    fixed-size       intrinsic NORM2   LOOP             BLAS
N            [ms/N]           [ms/N]           [ms/N]            [ms/N]           [ms/N]
 2           5.93750E-09      4.06250E-09      8.43750E-09       4.06250E-09      1.03125E-08
12           1.03125E-08      7.81250E-09      3.12500E-08       7.81250E-09      5.09375E-08
22           1.65625E-08      1.40625E-08      5.50000E-08       1.43750E-08      9.15625E-08
32           2.25000E-08      2.00000E-08      7.81250E-08       2.00000E-08      1.29375E-07
By the way, the code is pasted below (beware of the large memory footprint!):
program test_norm
use iso_fortran_env
implicit none
integer :: xsize,i,iunit,icase
integer, parameter :: testSize = 50000000
real(real64), allocatable :: set(:,:),setNorm(:)
real(real64) :: t0,t1,setSum(5),timeTable(5,35)
intrinsic :: norm2
open(newunit=iunit,file='garbage.txt',action='write')
print '(6(1x,a15))' ,'ARRAY SIZE','assumed-shape','fixed-size','intrinsic NORM2','LOOP','BLAS'
print '(6(1x,a15))' ,'N',('[ms/N]',i=1,5)
icase = 0
do xsize = 2,32,10
! Initialize test set
icase = icase+1
allocate(set(xsize,testSize),setNorm(testSize))
call random_number(set)
! Test 1: intrinsic SQRT/SUM, assumed-shape array
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v1(set(:,i)); call cpu_time(t1)
setSum(1) = sum(setNorm); timeTable(1,icase) = t1-t0
! Test 2: intrinsic SQRT/SUM, fixed-size array
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v2(xsize,set(:,i)); call cpu_time(t1)
setSum(2) = sum(setNorm); timeTable(2,icase) = t1-t0
! Test 3: intrinsic NORM2
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm2(set(:,i)); call cpu_time(t1)
setSum(3) = sum(setNorm); timeTable(3,icase) = t1-t0
! Test 4: LOOP
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = norm_v3(xsize,set(:,i)); call cpu_time(t1)
setSum(4) = sum(setNorm); timeTable(4,icase) = t1-t0
! Test 5: BLAS
call cpu_time(t0); forall(i=1:testSize) setNorm(i) = DNRM2(xsize,set(:,i),1); call cpu_time(t1)
setSum(5) = sum(setNorm); timeTable(5,icase) = t1-t0
! Print output
print '(7x,i2,9x,5(2x,1pe13.5e2,1x))', xsize,timeTable(:,icase)/testSize
write (iunit,*) 'setSum = ',setSum
deallocate(set,setNorm)
end do
close(iunit)
contains
pure real(real64) function norm_v1(x) result(L2)
real(real64), intent(in) :: x(:)
L2 = sqrt(sum(x**2))
end function norm_v1
pure real(real64) function norm_v2(n,x) result(L2)
integer, intent(in) :: n
real(real64), intent(in) :: x(n)
L2 = sqrt(sum(x**2))
end function norm_v2
pure real(real64) function norm_v3(n,x) result(L2)
integer, intent(in) :: n
real(real64), intent(in) :: x(n)
integer :: i
L2 = 0.0_real64
do i=1,n
L2 = L2 + x(i)**2
end do
L2 = sqrt(L2)
end function norm_v3
PURE REAL(REAL64) FUNCTION DNRM2 ( N, X, INCX )
INTEGER, INTENT(IN) :: N,INCX
REAL(REAL64), INTENT(IN) :: X( * )
REAL(REAL64), PARAMETER :: ONE = 1.0_REAL64
REAL(REAL64), PARAMETER :: ZERO = 0.0_REAL64
INTEGER :: IX
REAL(REAL64) :: ABSXI, NORM, SCALE, SSQ
INTRINSIC :: ABS, SQRT
IF( N<1 .OR. INCX<1 )THEN
NORM = ZERO
ELSE IF( N==1 )THEN
NORM = ABS( X( 1 ) )
ELSE
SCALE = ZERO
SSQ = ONE
DO IX = 1, 1 + ( N - 1 )*INCX, INCX
IF( X( IX )/=ZERO )THEN
ABSXI = ABS( X( IX ) )
IF( SCALE<ABSXI )THEN
SSQ = ONE + SSQ*( SCALE/ABSXI )**2
SCALE = ABSXI
ELSE
SSQ = SSQ + ( ABSXI/SCALE )**2
END IF
END IF
END DO
NORM = SCALE * SQRT( SSQ )
END IF
DNRM2 = NORM
RETURN
END FUNCTION DNRM2
end program test_norm
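Transferring the fixed-size-array observation back to the original B = B + A*E update, one option worth trying is a small wrapper with explicit-shape dummy arguments, so the compiler sees the array sizes at the point of the multiply-add. This is a hedged, unbenchmarked sketch; the subroutine name and interface are mine, not from the question:
subroutine bmatmul_add(n, A, Eslice, B)
   ! Hedged sketch: explicit-shape dummies for the multiply-add update.
   implicit none
   integer, intent(in) :: n
   real, dimension(n, n), intent(in) :: A, Eslice
   real, dimension(n, n), intent(in out) :: B
   B = B + matmul(A, Eslice)
end subroutine bmatmul_add
! Assumed usage at the call site: call bmatmul_add(N, A, E(:, 1:N, j-1), B)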
I'd like to use Minpack (Fortran) to estimate the D parameter in the following generalized form of the S-curve: y = (A - D) / (1 + (x**B/C)) + D
The idea is that in this application the user provides A [which is always 0, to force passage through (0,0)], B, and C, and from there Minpack will find a value for D that forces passage through (1, y), where y is also supplied by the user but must be <= 1. I was able to accomplish this task with the code below; however, Minpack claims it hasn't converged when in fact it appears that it has. For example, when running this code and entering the value 1 (at the first prompt) and 0 4 0.1 (at the second prompt), Minpack returns info = 2, which according to the comments in lmdif means:
relative error between two consecutive iterates is at most xtol.
I'm tempted to comment out line 63, but am worried that's playing with fire... are there any seasoned Minpack users out there who could comment on this? (A sketch of a more lenient check follows the code below.) Line 63 is the one that reads:
if (info /= 1) stop "failed to converge"
Am I mis-using Minpack even though it appears to converge (based on my verifying the value in pars)?
module types
implicit none
private
public dp
integer, parameter :: dp=kind(0d0)
end module
module f_vals
DOUBLE PRECISION, SAVE, DIMENSION(:), POINTER:: fixed_vals
end module
module find_fit_module
! This module contains a general function find_fit() for a nonlinear least
! squares fitting. The function can fit any nonlinear expression to any data.
use minpack, only: lmdif1
use types, only: dp
implicit none
private
public find_fit
contains
subroutine find_fit(data_x, data_y, expr, pars)
! Fits the (data_x, data_y) arrays with the function expr(x, pars).
! The user can provide any nonlinear function 'expr' depending on any number of
! parameters 'pars' and it must return the evaluated expression on the
! array 'x'. The arrays 'data_x' and 'data_y' must have the same
! length.
real(dp), intent(in) :: data_x(:), data_y(:)
interface
function expr(x, pars) result(y)
use types, only: dp
implicit none
real(dp), intent(in) :: x(:), pars(:)
real(dp) :: y(size(x))
end function
end interface
real(dp), intent(inout) :: pars(:)
real(dp) :: tol, fvec(size(data_x))
integer :: iwa(size(pars)), info, m, n
real(dp), allocatable :: wa(:)
tol = sqrt(epsilon(1._dp))
!tol = 0.001
m = size(fvec)
n = size(pars)
allocate(wa(m*n + 5*n + m))
call lmdif1(fcn, m, n, pars, fvec, tol, info, iwa, wa, size(wa))
open(222, FILE='D_Value.txt')
write(222,4) pars(1)
4 format(E20.12)
close(222)
if (info /= 1) stop "failed to converge"
contains
subroutine fcn(m, n, x, fvec, iflag)
integer, intent(in) :: m, n, iflag
real(dp), intent(in) :: x(n)
real(dp), intent(out) :: fvec(m)
! Suppress compiler warning:
fvec(1) = iflag
fvec = data_y - expr(data_x, x)
end subroutine
end subroutine
end module
program snwdeplcrv
! Find a nonlinear fit of the form y = (A - D) / (1 + (x**B/C)) + D.
use find_fit_module, only: find_fit
use types, only: dp
use f_vals
implicit none
real(dp) :: pars(1), y_int_at_1
real(dp) :: y(1) = 1.0 ! Initialization of value to be reset by user (y: value of S-curve # x=1)
real(dp) :: A, B, C
integer :: i
allocate(fixed_vals(3)) ! A, B, C parameters
pars = [1._dp] ! D parameter in S-curve function
! Read PEST-specified parameters
write(*,*) ' Enter value that S-curve should equal when SWE=1 (must be <= 1)'
read(*,*) y_int_at_1
if(y_int_at_1 > 1.0) y_int_at_1 = 1
y = y_int_at_1
! Read PEST-specified parameters
write(*,*) ' Enter S-curve parameters: A, B, & C. D parameter to be estimated '
read(*,*) A, B, C
fixed_vals(1) = A
fixed_vals(2) = B
fixed_vals(3) = C
call find_fit([(real(i, dp), i=1,size(y))], y, expression, pars)
print *, pars
contains
function expression(x, pars) result(y)
use f_vals
real(dp), intent(in) :: x(:), pars(:)
real(dp) :: y(size(x))
real(dp) :: A, B, C, D
A = fixed_vals(1)
B = fixed_vals(2)
C = fixed_vals(3)
D = pars(1)
y = (A - D) / (1 + (x**B / C)) + D
end function
end program
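For reference, here is a hedged sketch of the more lenient check discussed above (as an alternative to line 63), assuming the standard lmdif/lmdif1 info codes, in which 1, 2, and 3 all indicate convergence (the ftol condition, the xtol condition, or both); the helper name lmdif_converged is mine:
pure logical function lmdif_converged(info)
   ! Hedged sketch: treat the documented lmdif convergence codes 1-3 as success.
   implicit none
   integer, intent(in) :: info
   lmdif_converged = (info >= 1 .and. info <= 3)
end function lmdif_converged
! Assumed usage in find_fit: if (.not. lmdif_converged(info)) stop "failed to converge"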
This question already has an answer here: Procedure with assumed-shape dummy argument must have an explicit interface [duplicate] (1 answer). Closed 4 years ago.
I'm trying to write a simple program that first asks for a number n, then creates an n×n matrix with 3s on its main diagonal, 1s above it, and 0s below it, and a length-n vector with 3 in the odd positions and 2 in the even positions. It must then include a subroutine that multiplies the two without using matmul().
program P13
implicit none
integer(4) :: n, i, j
integer, dimension(:), allocatable:: v
integer, dimension(:,:), allocatable :: m
integer, dimension(:), allocatable :: r
write(*,*) "Insert n"
read(*,*) n
allocate (v(1:n))
allocate (m(1:n,1:n))
v(1:n:2) = 3
v(2:n:2) = 2
m = 0
DO i=1,n,1
m (i,i:n)=1
END DO
Do i=1,n,1
m (i,i)=3
End do
allocate (r(1:n))
call matrmul(n, v, m, r)
end program
subroutine matrmul(n, b, o, t)
implicit none
integer(4), intent(in) :: n
integer(4) :: i, j
integer, dimension(:), intent(in) :: b
integer, dimension(:,:),intent(in) :: o
integer, dimension(:), intent(out) :: t
DO i=1,n,1
t(i) = sum(b*o(:,i))
END DO
write(*,'(I2)') t
end subroutine
I get the error message "Explicit interface required for 'matrmul' at (1): assumed-shape argument".
How do I fix this? Thanks.
There are plenty of examples here on Stack Overflow that show how to create explicit interfaces. However, since you allocate all your arrays in the main program and pass the size into the subroutine, you can simply declare the dummy arrays in the subroutine with explicit shape n (a sketch of the explicit-interface alternative follows the snippet):
subroutine matrmul(n, b, o, t)
implicit none
integer(4), intent(in) :: n
integer(4) :: i, j
integer, dimension(n), intent(in) :: b
integer, dimension(n,n),intent(in) :: o
integer, dimension(n), intent(out) :: t
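For completeness, a hedged sketch of the explicit-interface route mentioned above: placing matrmul in a module (or under the program's contains) gives it an explicit interface, so the original assumed-shape dummy arguments work as written. Argument names follow the question; the module name is mine:
module matops
   ! Hedged sketch: a module procedure automatically has an explicit interface,
   ! so the assumed-shape dummies from the question are fine here.
   implicit none
contains
   subroutine matrmul(n, b, o, t)
      integer(4), intent(in) :: n
      integer, dimension(:), intent(in) :: b
      integer, dimension(:,:), intent(in) :: o
      integer, dimension(:), intent(out) :: t
      integer(4) :: i
      do i = 1, n
         t(i) = sum(b*o(:,i))
      end do
      write(*,'(I2)') t
   end subroutine matrmul
end module matops
! In the main program, add "use matops" directly after "program P13".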
I'm struggling with LAPACK's dgetrf and dgetri routines. Below is a subroutine I've created (the variable fit_coeffs is defined externally and is allocatable; it's not the problem). When I run, I get memory allocation errors that appear when I assign fit_coeffs, due to the matmul(ATA,AT) line; I know this from inserting a number of print statements. Also, both error-checking statements after the calls to the LAPACK subroutines are triggered, suggesting an error.
Does anyone understand where this comes from? I'm compiling using the command:
gfortran -Wall -cpp -std=f2003 -ffree-form -L/home/binningtont/lapack-3.4.0/ read_grib.f -llapack -lrefblas.
Thanks in advance!
subroutine polynomial_fit(x_array, y_array, D)
integer, intent(in) :: D
real, intent(in), dimension(:) :: x_array, y_array
real, allocatable, dimension(:,:) :: A, AT, ATA
real, allocatable, dimension(:) :: work
integer, dimension(:), allocatable :: pivot
integer :: l, m, n, lda, lwork, ok
l = D + 1
lda = l
lwork = l
allocate(fit_coeffs(l))
allocate(pivot(l))
allocate(work(l))
allocate(A(size(x_array),l))
allocate(AT(l,size(x_array)))
allocate(ATA(l,l))
do m = 1,size(x_array),1
do n = 1,l,1
A(m,n) = x_array(m)**(n-1)
end do
end do
AT = transpose(A)
ATA = matmul(AT,A)
call dgetrf(l, l, ATA, lda, pivot, ok)
! ATA is now represented as PLU (permutation, lower, upper)
if (ok /= 0) then
write(6,*) "HERE"
end if
call dgetri(l, ATA, lda, pivot, work, lwork, ok)
! ATA now contains the inverse of the matrix ATA
if (ok /= 0) then
write(6,*) "HERE"
end if
fit_coeffs = matmul(matmul(ATA,AT),y_array)
deallocate(pivot)
deallocate(fit_coeffs)
deallocate(work)
deallocate(A)
deallocate(AT)
deallocate(ATA)
end subroutine polynomial_fit
1) Where is fit_coeffs declared? I can't see how the above can even compile
1b) Implicit None is your friend!
2) You do have an interface in scope at the calling point, don't you?
3) dgetrf and dgetri want "double precision" arguments while you have single precision, so you need sgetrf and sgetri.
"Fixing" all these and completing the program, I get
Program testit
Implicit None
Real, Dimension( 1:100 ) :: x, y
Integer :: D
Interface
subroutine polynomial_fit(x_array, y_array, D)
Implicit None ! Always use this!!
integer, intent(in) :: D
real, intent(in), dimension(:) :: x_array, y_array
End subroutine polynomial_fit
End Interface
Call Random_number( x )
Call Random_number( y )
D = 6
Call polynomial_fit( x, y, D )
End Program testit
subroutine polynomial_fit(x_array, y_array, D)
Implicit None ! Always use this!!
integer, intent(in) :: D
real, intent(in), dimension(:) :: x_array, y_array
real, allocatable, dimension(:,:) :: A, AT, ATA
real, allocatable, dimension(:) :: work, fit_coeffs
integer, dimension(:), allocatable :: pivot
integer :: l, m, n, lda, lwork, ok
l = D + 1
lda = l
lwork = l
allocate(fit_coeffs(l))
allocate(pivot(l))
allocate(work(l))
allocate(A(size(x_array),l))
allocate(AT(l,size(x_array)))
allocate(ATA(l,l))
do m = 1,size(x_array),1
do n = 1,l,1
A(m,n) = x_array(m)**(n-1)
end do
end do
AT = transpose(A)
ATA = matmul(AT,A)
call sgetrf(l, l, ATA, lda, pivot, ok)
! ATA is now represented as PLU (permutation, lower, upper)
if (ok /= 0) then
write(6,*) "HERE"
end if
call sgetri(l, ATA, lda, pivot, work, lwork, ok)
! ATA now contains the inverse of the matrix ATA
if (ok /= 0) then
write(6,*) "HERE"
end if
fit_coeffs = matmul(matmul(ATA,AT),y_array)
deallocate(pivot)
deallocate(fit_coeffs)
deallocate(work)
deallocate(A)
deallocate(AT)
deallocate(ATA)
end subroutine polynomial_fit
This runs to completion. If I omit the interface I get "HERE" printed twice. If I use the d versions I get seg faults.
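If the double-precision routines are actually what's wanted, the arrays passed to them need to be double precision as well (and the caller must pass matching arrays). A hedged sketch of the corresponding changes, untested here:
subroutine polynomial_fit_dp(x_array, y_array, D)
   ! Hedged sketch: double-precision working arrays so dgetrf/dgetri receive
   ! arguments of the kind they expect; the caller must pass real64 arrays too.
   use iso_fortran_env, only: dp => real64
   implicit none
   integer, intent(in) :: D
   real(dp), intent(in), dimension(:) :: x_array, y_array
   real(dp), allocatable :: A(:,:), AT(:,:), ATA(:,:), fit_coeffs(:), work(:)
   integer, allocatable :: pivot(:)
   integer :: l, m, n, lda, lwork, ok
   l = D + 1
   lda = l
   lwork = l
   allocate(fit_coeffs(l), pivot(l), work(l))
   allocate(A(size(x_array), l), AT(l, size(x_array)), ATA(l, l))
   do m = 1, size(x_array)
      do n = 1, l
         A(m, n) = x_array(m)**(n-1)
      end do
   end do
   AT = transpose(A)
   ATA = matmul(AT, A)
   call dgetrf(l, l, ATA, lda, pivot, ok)
   call dgetri(l, ATA, lda, pivot, work, lwork, ok)
   fit_coeffs = matmul(matmul(ATA, AT), y_array)
end subroutine polynomial_fit_dp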
I am trying to get the ZGEEV routine in LAPACK to work for a test problem and am having some difficulties. I just started coding in Fortran a week ago, so I think it is probably something very trivial that I am missing.
I have to diagonalize rather large complex symmetric matrices. To get started, I created a 200 by 200 matrix using Matlab, which I have verified is diagonalizable. When I run the code, it brings up no errors and INFO = 0, suggesting success. However, all the eigenvalues are (0,0), which I know is wrong.
Attached is my code.
PROGRAM speed_zgeev
IMPLICIT NONE
INTEGER(8) :: N
COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: MAT
INTEGER(8) :: INFO, I, J
COMPLEX*16, DIMENSION(:), ALLOCATABLE :: RWORK
COMPLEX*16, DIMENSION(:), ALLOCATABLE :: D
COMPLEX*16, DIMENSION(1,1) :: VR, VL
INTEGER(8) :: LWORK = -1
COMPLEX*16, DIMENSION(:), ALLOCATABLE :: WORK
DOUBLE PRECISION :: RPART, IPART
EXTERNAL ZGEEV
N = 200
ALLOCATE(D(N))
ALLOCATE(RWORK(2*N))
ALLOCATE(WORK(N))
ALLOCATE(MAT(N,N))
OPEN(UNIT = 31, FILE = "newmat.txt")
OPEN(UNIT = 32, FILE = "newmati.txt")
DO J = 1,N
DO I = 1,N
READ(31,*) RPART
READ(32,*) IPART
MAT(I,J) = CMPLX(RPART, IPART)
END DO
END DO
CLOSE(31)
CLOSE(32)
CALL ZGEEV('N','N', N, MAT, N, D, VL, 1, VR, 1, WORK, LWORK, RWORK, INFO)
INFO = WORK(1)
DEALLOCATE(WORK)
ALLOCATE(WORK(INFO))
CALL ZGEEV('N','N', N, MAT, N, D, VL, 1, VR, 1, WORK, LWORK, RWORK, INFO)
IF (INFO .EQ. 0) THEN
PRINT*, D(1:10)
ELSE
PRINT*, INFO
END IF
DEALLOCATE(MAT)
DEALLOCATE(D)
DEALLOCATE(RWORK)
DEALLOCATE(WORK)
END PROGRAM speed_zgeev
I have tried the same code on smaller matrices, of size 30 by 30, and it works fine. Any help would be appreciated! Thanks.
I forgot to mention that I am loading the matrix from a test file which I have verified to be correct.
Maybe LWORK = WORK(1) instead of INFO = WORK(1)? Also change ALLOCATE(WORK(INFO)) to ALLOCATE(WORK(LWORK)). As written, LWORK is still -1 at the second call, so ZGEEV just performs another workspace query and never computes the eigenvalues, which is why D is all zeros even though INFO = 0.
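A minimal sketch of the corrected sequence implied by that comment (workspace query first, then the actual computation), keeping the variable names from the question; the INT() conversion is mine:
LWORK = -1
CALL ZGEEV('N', 'N', N, MAT, N, D, VL, 1, VR, 1, WORK, LWORK, RWORK, INFO)
LWORK = INT(WORK(1))   ! optimal workspace size is returned in WORK(1)
DEALLOCATE(WORK)
ALLOCATE(WORK(LWORK))
CALL ZGEEV('N', 'N', N, MAT, N, D, VL, 1, VR, 1, WORK, LWORK, RWORK, INFO)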