Unclassifiable OpenMP directive in a Fortran program

I was trying to parallelize a Fortran code using OpenMP:
program pigreco
!----------------------------------------!
use OMP_LIB
implicit none
!----------------------------------------!
integer :: i
integer, parameter :: N = 100000
integer, parameter :: NCPU = 4
real*8 :: t0, t1
real :: h, totale, x, f
!----------------------------------------!
print '(a,2x,i15)', ' Number of intervals: ', N
totale = 0.0
h = 1. / N
call OMP_SET_NUM_THREADS(NCPU)
write(*, '(a,i10)') 'Total number of processors: ', NCPU
t0 = OMP_GET_WTIME()
!----------------------------------------!
#ifdef PARALLEL
!
print '(a)', "Parallel version selected."
!
!$OMP PARALLEL DO PRIVATE(x, f) REDUCTION(+:totale)
!
do i = 1, N
x = (i - 0.5) * h
f = (4 * h) / (1 + x**2)
totale = totale + f
enddo
!$OMP END PARALLEL DO
!
#endif
!
t1 = OMP_GET_WTIME()
!
PRINT '(a,2x,f30.25)', ' Computed PI =', totale
PRINT '(a,2x,f30.25)', ' Total computational time =', t1 - t0
!
end program pigreco
When I then try to compile with gfortran prova.F90 -fopenmp -D PARALLEL, it gives me an error saying "unclassifiable OpenMP directive at (1)".

The problem is that you defined PARALLEL with the preprocessor, so instead of reading OMP PARALLEL DO the compiler reads OMP 1 DO, which of course makes no sense. Change #ifdef PARALLEL to #ifdef RUNPARALLEL and -DPARALLEL to -DRUNPARALLEL, and the compiler raises no error.
Alternatively, you can use the fact that the macro _OPENMP is defined automatically whenever you compile with OpenMP support, so you could use #ifdef _OPENMP and drop the -D flag entirely.
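For illustration, here is a minimal sketch of the same loop guarded by the automatic macro (assuming the file keeps its .F90 extension so gfortran runs the preprocessor):
#ifdef _OPENMP
!$OMP PARALLEL DO PRIVATE(x, f) REDUCTION(+:totale)
#endif
do i = 1, N
   x = (i - 0.5) * h
   f = (4 * h) / (1 + x**2)
   totale = totale + f
enddo
#ifdef _OPENMP
!$OMP END PARALLEL DO
#endif
Guarding only the directives, rather than the whole loop as the original does, also means the serial build still computes the sum.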

Parallelizing DO loop with nvfortran on gpu

I am trying to parallelize a do loop in Fortran. Using an OpenMP parallel do it works nicely, as it does when converted to a standard do concurrent loop (with both gfortran and nvfortran), but when I compile it with nvfortran -stdpar=gpu it compiles, yet at runtime it crashes with:
0 Current file: xxx/pi.f90
function: pi
line: 15
This file was compiled: -acc=gpu -gpu=cc35 -gpu=cc50 -gpu=cc60 -gpu=cc60 -gpu=cc70 -gpu=cc75 -gpu=cc80 -
Here is the code:
program pi
implicit none
integer :: count, n, i
real :: r
real, dimension(10000) :: x,y
logical , dimension(10000) :: c
c = .false.
n=size(x,1)
print*,count(c)
call RANDOM_SEED
call random_number(x)
call random_number(y)
do concurrent (i = 1: n)
if (x(i)**2 + y(i)**2 <1.0) c(i)=.true.
end do
r = 4 * real(count(c))/n
print *, r
end program pi
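For reference, the OpenMP form of the loop that reportedly works with both compilers would look something like the following (a sketch, not taken from the original post):
!$omp parallel do
do i = 1, n
   if (x(i)**2 + y(i)**2 < 1.0) c(i) = .true.
end do
!$omp end parallel do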

Speedup of calculation for symmetric matrix using OMP

My matrix calculation is: C=C-A*B
Here C is a symmetric matrix, so I want to speed up the calculation by computing just the upper triangle and then mirroring each value to the opposite element. I used OMP and found that my implementation is slower than the normal calculation over the entire matrix C.
I also see that the calculation C=C-A*B is slower than C=C+A*B.
My program is attached. Please advise me!
Program testspeed
implicit none
integer nstate,nmeas,i,j,l
integer(kind=8) :: tclock1, tclock2, clock_rate
real(kind=8) :: elapsed_time
double precision, allocatable, dimension(:,:):: B,C,A
nstate =20000
nmeas=10000
allocate (B(nmeas,nstate),C(nstate,nstate),A(nstate,nmeas))
A=1d0
B=1d0
call system_clock(tclock1)
write(*,*) "1"
!$omp parallel do
do j = 1, nstate
do l = 1,nmeas
do i = 1, j
C(j,i) = C(j,i) - A(j,l)*B(l,i)
C(i,j)=C(j,i)
end do
end do
end do
!$omp end parallel do
write(*,*) "2"
call system_clock(tclock2, clock_rate)
elapsed_time = float(tclock2 - tclock1) / float(clock_rate)
write(*,*) elapsed_time
end Program testspeed
One of the basic rules I have taught my students is that nobody should be writing dense matrix multiplies themselves nowadays - and should not have been for 30+ years. You should use the BLAS library instead. Below I compare the BLAS library against your loop ordering and a better loop ordering, and also against the Fortran intrinsic function matmul, which I use as a reference to check that the results are correct. BLAS and matmul don't take advantage of the symmetry of C, yet they are still the fastest routines - BLAS is about 200-300 times quicker than the loop ordering you have written. Note I have also cut the matrix size down somewhat, as I got bored waiting for the original to run for larger cases:
ijb@ijb-Latitude-5410:~/work/stack$ cat mm.f90
Program testspeed
Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
Use omp_lib, Only : omp_get_max_threads
Implicit None
Integer nstate,nmeas,i,j,l
Integer(li) :: tclock1, tclock2, clock_rate
Real(wp) :: elapsed_time
Real( wp ), Allocatable, Dimension(:,:):: B,C,A
Real( wp ), Allocatable, Dimension(:,:):: C_test
Real( wp ), Allocatable, Dimension(:,:):: C_start
Write( *, * ) 'Using ', omp_get_max_threads(), ' threads'
!!$ nstate =2000
!!$ nmeas=1000
nstate = 5000
nmeas = 2500
Allocate (B(nmeas,nstate),C(nstate,nstate),A(nstate,nmeas))
Allocate( C_test, Mold = C )
Allocate( C_start, Mold = C )
!!$ A=1.0_wp
!!$ B=1.0_wp
! Random numbers are a much better test
Call Random_number( A )
B = Transpose( A ) ! make sure result is symmetric
Call Random_number( C_start )
! Make Initial C Symmetric
C_start = 0.5_wp * ( C_start + Transpose( C_start ) )
Write( *, * ) 'Matix sizes ', Shape( A ), Shape( B ), Shape( C )
C_test = C_start
Call system_Clock(tclock1)
C_test = C_test - Matmul( A, B )
Call system_Clock(tclock2, clock_rate)
elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
Write( *,'( a, t20, f8.3 )' ) 'Matmul', elapsed_time
C = C_start
Call system_Clock(tclock1)
!$omp parallel do
Do j = 1, nstate
Do l = 1,nmeas
Do i = 1, j
C(j,i) = C(j,i) - A(j,l)*B(l,i)
C(i,j)=C(j,i)
End Do
End Do
End Do
!$omp end parallel do
Call system_Clock(tclock2, clock_rate)
elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
Write(*,'( a, t20, f8.3, t30, "Max error ", g20.12 )' ) &
'Orig loops', elapsed_time, Maxval( Abs( C_test - C ) )
C = C_start
Call system_Clock(tclock1)
!$omp parallel default( none ) shared ( nstate, nmeas, A, B, C ), private( i, j, l )
!$omp do
Do i = 1, nstate
Do l = 1,nmeas
Do j = 1, i
C(j,i) = C(j,i) - A(j,l)*B(l,i)
End Do
End Do
End Do
!$omp end do
!$omp do
Do i = 1, nstate
Do j = 1, i
C( i, j ) = C( j, i )
End Do
End Do
!$omp end do
!$omp end parallel
Call system_Clock(tclock2, clock_rate)
elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
Write(*,'( a, t20, f8.3, t30, "Max error ", g20.12 )' ) &
'Sensible loops', elapsed_time, Maxval( Abs( C_test - C ) )
C = C_start
Call system_Clock(tclock1)
Call dgemm( 'N', 'N', nstate, nstate, nmeas, -1.0_wp, A, Size( A, Dim = 1 ), &
B, Size( B, Dim = 1 ), &
+1.0_wp, C, Size( C, Dim = 1 ) )
Call system_Clock(tclock2, clock_rate)
elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
Write(*,'( a, t20, f8.3, t30, "Max error ", g20.12 )' ) &
'BLAS ', elapsed_time, Maxval( Abs( C_test - C ) )
End Program testspeed
ijb@ijb-Latitude-5410:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
ijb@ijb-Latitude-5410:~/work/stack$ gfortran -fopenmp -Wall -Wextra -std=f2018 -O3 mm.f90 -lopenblas
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=1
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Using 1 threads
Matix sizes 5000 2500 2500 5000 5000 5000
Matmul 4.793
Orig loops 421.564 Max error 0.488853402203E-11
Sensible loops 20.742 Max error 0.488853402203E-11
BLAS 2.185 Max error 0.682121026330E-12
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=2
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Using 2 threads
Matix sizes 5000 2500 2500 5000 5000 5000
Matmul 4.968
Orig loops 324.319 Max error 0.466116034659E-11
Sensible loops 17.656 Max error 0.466116034659E-11
BLAS 1.161 Max error 0.682121026330E-12
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=3
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Using 3 threads
Matix sizes 5000 2500 2500 5000 5000 5000
Matmul 4.852
Orig loops 243.268 Max error 0.500222085975E-11
Sensible loops 15.802 Max error 0.500222085975E-11
BLAS 0.852 Max error 0.682121026330E-12
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=4
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Using 4 threads
Matix sizes 5000 2500 2500 5000 5000 5000
Matmul 4.994
Orig loops 189.189 Max error 0.477484718431E-11
Sensible loops 14.245 Max error 0.477484718431E-11
BLAS 0.707 Max error 0.682121026330E-12
For BLAS I have used OpenBLAS, which is freely available. On a Linux system a simple apt-get or similar should be enough.
Please also note:
If you have to write your own loops, your innermost loop should, if possible, run over the first index of your array. This is because Fortran orders its arrays column-major. This is what I have done in the "sensible" loop ordering.
Real( 8 ) is not portable, is not guaranteed to be supported by your compiler, and is not guaranteed to do what you expect, so it shouldn't be used. Similarly for Integer( 8 ). Please see what I have done for a better way that should work with all compilers.
Float is not a standard intrinsic - use Real as I have done.
Benchmarks are meaningless if the results are incorrect, so you should always include a way to test them. Here I use the Fortran intrinsic matmul to provide a reference version. Your original code does not initialise C, so its results cannot be trusted - but since you don't check that you get the correct values for C, you can't know this.
I personally dislike !$omp parallel do intensely; I think it a mistake that such shortcuts are in OpenMP. Instead, separate them into !$omp parallel and !$omp do - it is very important to understand that thread creation and work sharing are different things, and conflating them on one line is not a good way to learn OpenMP (see the sketch below).
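As a minimal sketch of that separation (illustrative only; the array a and bound n are placeholders, not from the code above):
!$omp parallel default( none ) shared( n, a ) private( i )
! the parallel construct creates the team of threads once ...
!$omp do
Do i = 1, n
   a( i ) = 2 * a( i )
End Do
!$omp end do
! ... and any further worksharing constructs reuse the same team
!$omp end parallel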

fortran arrays same shape and size parallel in OpenMP

I would like to ask whether OpenMP is capable of parallelizing operations on Fortran arrays of the same shape and size written in simple notation. I did some research but could not find or figure out whether it is possible.
By simple notation I mean the following form:
a = b + c * 1.1
Find below a full example:
PROGRAM Parallel_Hello_World
USE OMP_LIB
implicit none
integer, parameter :: ILEN = 1000
integer :: a(ILEN,ILEN), b(ILEN,ILEN), c(ILEN,ILEN), d(ILEN,ILEN)
integer :: i, j
a = 1
b = 2
!$OMP PARALLEL SHARED(a, b, c, d)
!$OMP DO
DO i=1,ILEN
DO j=1, ILEN
c(j,i) = a(j,i) + b(j,i) * 1.1
ENDDO
END DO
!$OMP END DO
! is this loop parallel?
d = a + b * 1.1
!$OMP END PARALLEL
write (*,*) "Total C: ", c(1:5, 1)
write (*,*) "Total D: ", d(1:5, 1)
write (*,*) "C same D? ", all(c == d)
END
Is the d assignment parallelized by OpenMP with the current notation?
As commented by @Gilles, the answer is to wrap it in the workshare construct; without it, every thread in the parallel region redundantly executes the full array assignment:
!$OMP WORKSHARE
d = a + b * 1.1
!$OMP END WORKSHARE
Find more info in the OpenMP specification's description of the workshare construct.
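In the full example above, the construct sits inside the already-open parallel region (a minimal sketch; the declarations are unchanged):
!$OMP PARALLEL SHARED(a, b, d)
!$OMP WORKSHARE
d = a + b * 1.1
!$OMP END WORKSHARE
!$OMP END PARALLEL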

Calculations on vectors become slower after better optimization flag and OpenMP

Consider the following Fortran code
program example
implicit none
integer, parameter :: ik = selected_int_kind(15)
integer, parameter :: rk = selected_real_kind(15,307)
integer(ik) :: N, i, j, pc, time_rate, start_time, end_time, M
real(rk), allocatable:: K(:,:), desc(:,:)
real(rk) :: kij, dij
integer :: omp_get_num_threads, nth
N = 2000
M = 400
allocate(K(N,N))
allocate(desc(N,M))
pc=10
do i = 1, N
desc(i,:) = real(i,rk)
if (i==int(N*pc)/100) then
print * ,"desc % complete: ",pc
pc=pc+10
endif
enddo
call system_clock(start_time)
!$OMP PARALLEL PRIVATE(nth)
nth = omp_get_num_threads()
print *,"omp threads", nth
!$OMP END PARALLEL
!$OMP PARALLEL DO &
!$OMP DEFAULT(SHARED) &
!$OMP PRIVATE(i,j,dij,kij)
do i = 1, N
do j = i, N
dij = sum(abs(desc(i,:) - desc(j,:)))
kij = dexp(-dij)
K(i,j) = kij
K(j,i) = kij
enddo
K(i,i) = K(i,i) + 0.1
enddo
!$OMP END PARALLEL DO
call system_clock(end_time, time_rate)
print* , "Time taken for Matrix:", real(end_time - start_time, rk)/real(time_rate, rk)
end program example
I compiled it using gfortran-6 on Mac OS X 10.11 using the following flags:
gfortran example.f90 -fopenmp -O0
gfortran example.f90 -fopenmp -O3
gfortran example.f90 -fopenmp -mtune=native
following which I ran it with one and two threads using the OMP_NUM_THREADS variable. I can see that it is utilizing two cores. However, the -O3 flag, which should enable vectorization, does not help performance at all; if anything it degrades it a bit. Timings in seconds, averaged over 10 runs, are given below:
|Opt \ Thrds |   1    |   2   |
|------------|--------|-------|
|-O0         | 10.962 | 9.183 |
|-O3         | 11.581 | 9.250 |
|-mtune      | 11.211 | 9.084 |
What is wrong in my program?
First of all, if you want good performance from -O3, you should give it something that can actually be optimised further. The bulk of the work happens in the sum intrinsic, which already operates on a whole-array expression, so it gets no faster when you switch from -O0 to -O3.
Also, if you want better performance, transpose desc, because desc(i,:) is non-contiguous in memory while desc(:,i) is contiguous. That's Fortran - its arrays are column-major.
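A minimal sketch of that change, assuming desc is instead allocated transposed as desc(M,N) and filled column-wise, so the inner difference runs down contiguous columns (the OpenMP clauses are unchanged):
allocate(desc(M,N))
do i = 1, N
   desc(:,i) = real(i,rk)
enddo
!$OMP PARALLEL DO DEFAULT(SHARED) PRIVATE(i,j,dij,kij)
do i = 1, N
   do j = i, N
      dij = sum(abs(desc(:,i) - desc(:,j)))  ! contiguous column accesses
      kij = exp(-dij)
      K(i,j) = kij
      K(j,i) = kij
   enddo
   K(i,i) = K(i,i) + 0.1_rk
enddo
!$OMP END PARALLEL DO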

Is there a straightforward way to do concurrent calculations with a random number generator in Fortran?

I have some finite element code written in Fortran 95 that I have optimised so that I can now get well over 16 million elements working in under a 2 GB memory footprint.
The source function for my code is not smooth, so I am using a (stratified) Monte Carlo method to integrate, which requires a random number generator to select sample locations.
I have tried compiling with gfortran-9 using -fopenmp -Ofast -ftree-parallelize-loops=4, but the loop with the random number generator won't go parallel. I tried do concurrent, but that didn't work because random_number isn't pure (see https://stackoverflow.com/a/32637737/2372254).
I also tried blocking my loop but that didn't work either.
Here is the code I am talking about:
do k=1,n_els ! total elements is n_els**2; this is the block loop
do i=1+ (k-1)*n_els ,k*n_els
supp_vec = 0
integ_vec = 0.0_wp
! in this subroutine I call random_number
call do_element(ind, n_els, i, num_points_per_strat, &
strat_rows, strat_cols, supp_vec, integ_vec)
do j=1, 4
sc_vec(supp_vec(j) ) = integ_vec(j)
end do
! give some info about progress
if (mod( i , (n_els**2)/10) == 0) print*, i*10/((n_els**2)/10), "% done"
end do
end do
It seems I could write blocks to a file and call n different instances of the routine, but I figure there must be a cleaner way to do that. Any tips on how to get this going faster?
I was considering writing a block's worth of points (depending on memory limits) to an array first and supplying that in the subroutine call. Before I try that, I thought I would see if anybody has advice about a better way. It would be good to keep the memory footprint down where possible.
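That idea, drawing all the samples serially up front and parallelising only the work that reads them, might look roughly like this (an illustrative sketch; none of these names come from the original code):
program block_rng_sketch
   implicit none
   integer, parameter :: wp = selected_real_kind(15)
   integer, parameter :: n_pts = 1000000
   real(wp), allocatable :: points(:,:)
   integer :: i
   allocate(points(2, n_pts))
   ! draw every sample location up front, in serial
   call random_number(points)
   ! the expensive per-sample work can then run in parallel,
   ! since each iteration only reads its own column of points
   !$omp parallel do
   do i = 1, n_pts
      ! ... evaluate the integrand at points(:, i) here ...
   end do
   !$omp end parallel do
end program block_rng_sketch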
As of version 7, GFortran has a parallel random number generator. When implementing it, here is the OpenMP code I used to verify that performance indeed scales with increasing numbers of threads (from https://gcc.gnu.org/ml/gcc-patches/2015-12/msg02110.html):
! Benchmark generating random numbers
! Janne Blomqvist 2015
program randbench
#ifdef _OPENMP
use omp_lib
#endif
implicit none
integer, parameter :: dp=kind(0.d0) ! double precision
integer, parameter :: i64 = selected_int_kind(18) ! At least 64-bit integer
#ifdef _OPENMP
print *, "Using up to ", omp_get_max_threads(), " threads."
#endif
call genr4
call genr8
contains
subroutine genr4
integer, parameter :: n = int(1e7)
real, save :: r(n)
integer :: i
integer(i64) :: t1, t2, td
#ifdef _OPENMP
integer :: blocks, blocksize, l, h
#endif
Print *, "Generate default real random variables"
call system_clock (t1)
!$omp parallel do private(i)
do i = 1, n
call random_number(r(i))
end do
!$omp end parallel do
call system_clock (t2)
td = t2 - t1
print *, "Generating ", n, " default reals individually took ", td, " ticks."
call system_clock (t1)
#ifdef _OPENMP
blocks = omp_get_max_threads()
blocksize = n / blocks
!$omp parallel do private(l,h,i)
do i = 0, blocks - 1
l = i * blocksize + 1
h = l + blocksize - 1
!print *, "Low: ", l, " High: ", h
call random_number(r(l:h))
end do
#else
call random_number(r)
#endif
Call system_clock (t2)
print *, "Generating ", n, " default reals as an array took ", t2-t1, &
" ticks. => ind/arr = ", real(td, dp) / (t2-t1)
end subroutine genr4
subroutine genr8
integer, parameter :: n = int(1e7)
real(dp), save :: r(n)
integer :: i
integer(i64) :: t1, t2, td
#ifdef _OPENMP
integer :: blocks, blocksize, l, h
#endif
print *, "Generate double real random variables"
call system_clock (t1)
!$omp parallel do
do i = 1, n
call random_number(r(i))
end do
call system_clock (t2)
td = t2 - t1
print *, "Generating ", n, " double reals individually took ", td, " ticks."
call system_clock (t1)
#ifdef _OPENMP
blocks = omp_get_max_threads()
blocksize = n / blocks
!$omp parallel do private(l,h,i)
do i = 0, blocks - 1
l = i * blocksize + 1
h = l + blocksize - 1
!print *, "Low: ", l, " High: ", h
call random_number(r(l:h))
end do
#else
call random_number(r)
#endif
call system_clock (t2)
print *, "Generating ", n, " double reals as an array took ", t2-t1, &
" ticks. => ind/arr = ", real(td, dp) / (t2 -t1)
end subroutine genr8
end program
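A typical build and run of this benchmark might look like the following (assuming the source keeps an uppercase .F90 extension, or is compiled with -cpp, so the #ifdef lines are preprocessed):
gfortran -fopenmp -O2 randbench.F90 -o randbench
OMP_NUM_THREADS=4 ./randbench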