Speedup of calculation for symmetric matrix using OMP - fortran

My matrix calculation is C = C - A*B.
Here C is a symmetric matrix, so I want to speed up the calculation by computing just the upper triangle and then copying each value to the mirrored element. I used OpenMP and see that my implementation is slower than the plain calculation over the entire matrix C.
I also see that the calculation C = C - A*B is slower than C = C + A*B.
My program is attached. Please advise me!
Program testspeed
   implicit none
   integer nstate, nmeas, i, j, l
   integer(kind=8) :: tclock1, tclock2, clock_rate
   real(kind=8) :: elapsed_time
   double precision, allocatable, dimension(:,:) :: B, C, A
   nstate = 20000
   nmeas = 10000
   allocate (B(nmeas,nstate), C(nstate,nstate), A(nstate,nmeas))
   A = 1d0
   B = 1d0
   call system_clock(tclock1)
   write(*,*) "1"
   !$omp parallel do
   do j = 1, nstate
      do l = 1, nmeas
         do i = 1, j
            C(j,i) = C(j,i) - A(j,l)*B(l,i)
            C(i,j) = C(j,i)
         end do
      end do
   end do
   !$omp end parallel do
   write(*,*) "2"
   call system_clock(tclock2, clock_rate)
   elapsed_time = float(tclock2 - tclock1) / float(clock_rate)
   write(*,*) elapsed_time
end Program testspeed

One of the basic rules I have taught my students is that nobody should be writing dense matrix multiplies themselves nowadays - and should not have been for 30+ years. You should use the BLAS library instead. Below I compare the BLAS library against your loop ordering and a better loop ordering, and also against the Fortran intrinsic function matmul, which I use as a reference to check that the results are correct. BLAS and matmul don't take advantage of the symmetry of C, yet they are still the fastest routines - BLAS is about 200-300 times quicker than the loop ordering you have written. Note I have also cut the matrix size down somewhat, as I got bored waiting for the original to run for the larger cases:
ijb@ijb-Latitude-5410:~/work/stack$ cat mm.f90
Program testspeed
   Use, Intrinsic :: iso_fortran_env, Only : wp => real64, li => int64
   Use omp_lib, Only : omp_get_max_threads
   Implicit None
   Integer nstate, nmeas, i, j, l
   Integer(li) :: tclock1, tclock2, clock_rate
   Real(wp) :: elapsed_time
   Real( wp ), Allocatable, Dimension(:,:) :: B, C, A
   Real( wp ), Allocatable, Dimension(:,:) :: C_test
   Real( wp ), Allocatable, Dimension(:,:) :: C_start
   Write( *, * ) 'Using ', omp_get_max_threads(), ' threads'
!!$ nstate =2000
!!$ nmeas=1000
   nstate = 5000
   nmeas = 2500
   Allocate (B(nmeas,nstate), C(nstate,nstate), A(nstate,nmeas))
   Allocate( C_test, Mold = C )
   Allocate( C_start, Mold = C )
!!$ A=1.0_wp
!!$ B=1.0_wp
   ! Random numbers are a much better test
   Call Random_number( A )
   B = Transpose( A ) ! make sure result is symmetric
   Call Random_number( C_start )
   ! Make initial C symmetric
   C_start = 0.5_wp * ( C_start + Transpose( C_start ) )
   Write( *, * ) 'Matrix sizes ', Shape( A ), Shape( B ), Shape( C )
   C_test = C_start
   Call system_clock(tclock1)
   C_test = C_test - Matmul( A, B )
   Call system_clock(tclock2, clock_rate)
   elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
   Write( *,'( a, t20, f8.3 )' ) 'Matmul', elapsed_time
   C = C_start
   Call system_clock(tclock1)
   !$omp parallel do
   Do j = 1, nstate
      Do l = 1, nmeas
         Do i = 1, j
            C(j,i) = C(j,i) - A(j,l)*B(l,i)
            C(i,j) = C(j,i)
         End Do
      End Do
   End Do
   !$omp end parallel do
   Call system_clock(tclock2, clock_rate)
   elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
   Write(*,'( a, t20, f8.3, t30, "Max error ", g20.12 )' ) &
        'Orig loops', elapsed_time, Maxval( Abs( C_test - C ) )
   C = C_start
   Call system_clock(tclock1)
   !$omp parallel default( none ) shared ( nstate, nmeas, A, B, C ), private( i, j, l )
   !$omp do
   Do i = 1, nstate
      Do l = 1, nmeas
         Do j = 1, i
            C(j,i) = C(j,i) - A(j,l)*B(l,i)
         End Do
      End Do
   End Do
   !$omp end do
   !$omp do
   Do i = 1, nstate
      Do j = 1, i
         C( i, j ) = C( j, i )
      End Do
   End Do
   !$omp end do
   !$omp end parallel
   Call system_clock(tclock2, clock_rate)
   elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
   Write(*,'( a, t20, f8.3, t30, "Max error ", g20.12 )' ) &
        'Sensible loops', elapsed_time, Maxval( Abs( C_test - C ) )
   C = C_start
   Call system_clock(tclock1)
   Call dgemm( 'N', 'N', nstate, nstate, nmeas, -1.0_wp, A, Size( A, Dim = 1 ), &
                                                         B, Size( B, Dim = 1 ), &
                                                +1.0_wp, C, Size( C, Dim = 1 ) )
   Call system_clock(tclock2, clock_rate)
   elapsed_time = Real(tclock2 - tclock1,wp) / Real(clock_rate,wp)
   Write(*,'( a, t20, f8.3, t30, "Max error ", g20.12 )' ) &
        'BLAS ', elapsed_time, Maxval( Abs( C_test - C ) )
End Program testspeed
ijb@ijb-Latitude-5410:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
ijb@ijb-Latitude-5410:~/work/stack$ gfortran -fopenmp -Wall -Wextra -std=f2018 -O3 mm.f90 -lopenblas
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=1
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Using 1 threads
Matrix sizes 5000 2500 2500 5000 5000 5000
Matmul 4.793
Orig loops 421.564 Max error 0.488853402203E-11
Sensible loops 20.742 Max error 0.488853402203E-11
BLAS 2.185 Max error 0.682121026330E-12
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=2
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Using 2 threads
Matrix sizes 5000 2500 2500 5000 5000 5000
Matmul 4.968
Orig loops 324.319 Max error 0.466116034659E-11
Sensible loops 17.656 Max error 0.466116034659E-11
BLAS 1.161 Max error 0.682121026330E-12
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=3
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Using 3 threads
Matrix sizes 5000 2500 2500 5000 5000 5000
Matmul 4.852
Orig loops 243.268 Max error 0.500222085975E-11
Sensible loops 15.802 Max error 0.500222085975E-11
BLAS 0.852 Max error 0.682121026330E-12
ijb@ijb-Latitude-5410:~/work/stack$ export OMP_NUM_THREADS=4
ijb@ijb-Latitude-5410:~/work/stack$ ./a.out
Using 4 threads
Matrix sizes 5000 2500 2500 5000 5000 5000
Matmul 4.994
Orig loops 189.189 Max error 0.477484718431E-11
Sensible loops 14.245 Max error 0.477484718431E-11
BLAS 0.707 Max error 0.682121026330E-12
For BLAS I have used OpenBLAS, which is freely available. On a Linux system a simple apt-get or similar should be enough.
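For example, on Debian/Ubuntu the following should install both the library and the development files (the package name may differ on other distributions):
sudo apt install libopenblas-dev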
Please also note:
If you have to write your own loops, your innermost loop should, if possible, run over the first index of your array. This is because Fortran stores its arrays in column-major order. This is what I have done in the "sensible" loop ordering.
Real( 8 ) is not portable, is not guaranteed to be supported by your compiler, is not guaranteed to do what you expect, and shouldn't be used. Similarly for Integer( 8 ). Please see what I have done for a better way that should work with all compilers.
Float is not a standard intrinsic for non-default integer kinds such as your clock variables - use Real as I have done.
As benchmarks are meaningless if the results are incorrect, you should always include a way to test them. Here I use the Fortran intrinsic matmul to provide a reference version. Your original code does not initialise C, so the results cannot be trusted - but as you don't check that you get the correct values for C, you can't know this.
I personally dislike !$omp parallel do intensely; I think it a mistake that such shortcuts are in OpenMP. Instead, separate them into !$omp parallel and !$omp do - it is very important to understand that thread creation and work sharing are different things, and conflating them on one line is not a good way to learn OpenMP. A minimal sketch of the separated form is shown below.
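For illustration, here is a minimal, self-contained sketch of the separated style (a hypothetical toy array a; !$omp parallel creates the thread team, and the separate !$omp do shares the loop iterations among it):
Program omp_split_example
   Implicit None
   Integer, Parameter :: n = 1000
   Real :: a( n )
   Integer :: i
   a = 1.0
   ! Create the team of threads first ...
   !$omp parallel default( none ) shared( a ) private( i )
   ! ... then share the iterations of the loop among them
   !$omp do
   Do i = 1, n
      a( i ) = 2.0 * a( i )
   End Do
   !$omp end do
   !$omp end parallel
   Write( *, * ) a( 1 ), a( n )
End Program omp_split_example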

Related

fortran arrays same shape and size parallel in OpenMP

I would like to ask whether OpenMP is capable of parallelizing operations on Fortran arrays of the same shape and size written in simple whole-array notation. I did some research, but I could not find out whether it is possible.
By simple notation I mean the following form:
a = b + c * 1.1
Find below a full example:
PROGRAM Parallel_Hello_World
   USE OMP_LIB
   implicit none
   integer, parameter :: ILEN = 1000
   integer :: a(ILEN,ILEN), b(ILEN,ILEN), c(ILEN,ILEN), d(ILEN,ILEN)
   integer :: i, j
   a = 1
   b = 2
   !$OMP PARALLEL SHARED(a, b, c, d)
   !$OMP DO
   DO i = 1, ILEN
      DO j = 1, ILEN
         c(j,i) = a(j,i) + b(j,i) * 1.1
      ENDDO
   END DO
   !$OMP END DO
   ! is this loop parallel?
   d = a + b * 1.1
   !$OMP END PARALLEL
   write (*,*) "Total C: ", c(1:5, 1)
   write (*,*) "Total D: ", d(1:5, 1)
   write (*,*) "C same D? ", all(c == d)
END
Is the assignment to d parallelized by OpenMP with the current notation?
As commented by @Gilles, the answer is to wrap it in a WORKSHARE construct:
!$OMP WORKSHARE
d = a + b * 1.1
!$OMP END WORKSHARE
Find more info here
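In context, the parallel region of the example program would then look something like this sketch:
!$OMP PARALLEL SHARED(a, b, c, d)
!$OMP DO
DO i = 1, ILEN
   DO j = 1, ILEN
      c(j,i) = a(j,i) + b(j,i) * 1.1
   ENDDO
END DO
!$OMP END DO
! Without WORKSHARE the assignment below would be executed
! redundantly by every thread; with it, the work is divided up.
!$OMP WORKSHARE
d = a + b * 1.1
!$OMP END WORKSHARE
!$OMP END PARALLEL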

How can I take the difference between a result and the preceding one in a do loop?

This subroutine implements the composite trapezoid rule.
I want to take the difference between the current result (integration) and the previous one, and use that difference as a stopping criterion while I keep doubling the number of intervals.
Subroutine Trapezoid(a,b,n,integration)
   real, external :: f
   real :: h, a, b, summ, p
   real, intent(out) :: integration
   integer :: n
   integer :: i, j
   ! The whole formula is (h/2)*[f(a)+f(b)+2*sum f(Xi)],
   ! so we calculate the (h/2)*[f(a)+f(b)] part first and then the other part
   do i = 1, n
      n = 2**i               ! double the number of intervals
      h = (b-a)/n            ! calculate the delta x
      p = (h/2.)*(f(a)+f(b))
      summ = 0
      do j = 1, n-1
         summ = summ + h*f(a+j*h)   ! h/2 * 2 * sum[f(Xi)]
      enddo
      if (n == 256) then     ! put a limit on the number of intervals
         Stop
      end if
      integration = p + summ ! sum both parts
      print*, n, ' ', integration
   enddo
end Subroutine
So instead of the fixed limit of 256 intervals, I want to compute the difference and stop when it is smaller than 1e-8.
I tried a lot, but I didn't get what I want.
I would do it something like the below (very quickly hacked together). Note that with default-kind reals 1e-8 is an unrealistic accuracy to expect - hence the lower tolerance. If you want higher accuracy you will need to use a higher-precision real kind.
Note also I have turned this into a complete program. In questions, please do this yourself; in purely selfish terms, you will be much more likely to get a useful answer.
Anyway, here is the code:
Program integ
   Implicit None
   Real, Parameter :: pi = 3.1415927
   Real :: value, delta
   Integer :: n_used
   Intrinsic :: sin
   Call Trapezoid( sin, 0.0, pi / 2.0, 20, n_used, value, delta )
   Write( *, * ) 'final result', value, ' with ', 2 ** n_used, ' intervals'
Contains
   Subroutine Trapezoid(f,a,b,n_max,n_used,integration,delta)
      Implicit None
      Real, Parameter :: tol = 1e-4
      Interface
         Function f( x ) Result( r )
            Real :: r
            Real, Intent( In ) :: x
         End Function f
      End Interface
      Real   , Intent( In    ) :: a
      Real   , Intent( In    ) :: b
      Integer, Intent( In    ) :: n_max
      Integer, Intent(   Out ) :: n_used
      Real   , Intent(   Out ) :: integration
      Real   , Intent(   Out ) :: delta
      Real :: h, summ, p
      Real :: integration_old
      Integer :: n
      Integer :: i, j
      ! The whole formula is (h/2)*[f(a)+f(b)+2*sum f(Xi)],
      ! so we calculate the (h/2)*[f(a)+f(b)] part first and then the other part
      delta = - Huge( delta )
      integration_old = Huge( integration_old )
      Do i = 1, n_max
         n = 2**i               ! double the number of intervals
         h = (b-a)/n            ! calculate the delta x
         p = (h/2.)*(f(a)+f(b))
         summ = 0
         Do j = 1, n-1
            summ = summ + h*f(a+j*h)   ! h/2 * 2 * sum[f(Xi)]
         Enddo
         integration = p + summ ! sum both parts
         If( i /= 1 ) Then
            delta = integration - integration_old
            Write( *, * ) n, ' ', integration, delta
            If( Abs( delta ) < tol ) Exit
         End If
         integration_old = integration
      Enddo
      n_used = i
   End Subroutine Trapezoid
End Program
ian@eris:~/work/stack$ gfortran --version
GNU Fortran (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
ian@eris:~/work/stack$ gfortran -Wall -Wextra -fcheck=all -O -std=f2008 integ.f90
ian@eris:~/work/stack$ ./a.out
4 0.987115800 3.90563607E-02
8 0.996785223 9.66942310E-03
16 0.999196708 2.41148472E-03
32 0.999799252 6.02543354E-04
64 0.999949872 1.50620937E-04
128 0.999987483 3.76105309E-05
final result 0.999987483 with 128 intervals

Calculations on vectors become slower after better optimization flag and OpenMP

Consider the following Fortran code
program example
   implicit none
   integer, parameter :: ik = selected_int_kind(15)
   integer, parameter :: rk = selected_real_kind(15,307)
   integer(ik) :: N, i, j, pc, time_rate, start_time, end_time, M
   real(rk), allocatable :: K(:,:), desc(:,:)
   real(rk) :: kij, dij
   integer :: omp_get_num_threads, nth
   N = 2000
   M = 400
   allocate(K(N,N))
   allocate(desc(N,M))
   pc = 10
   do i = 1, N
      desc(i,:) = real(i,rk)
      if (i==int(N*pc)/100) then
         print *, "desc % complete: ", pc
         pc = pc + 10
      endif
   enddo
   call system_clock(start_time)
   !$OMP PARALLEL PRIVATE(nth)
   nth = omp_get_num_threads()
   print *, "omp threads", nth
   !$OMP END PARALLEL
   !$OMP PARALLEL DO &
   !$OMP DEFAULT(SHARED) &
   !$OMP PRIVATE(i,j,dij,kij)
   do i = 1, N
      do j = i, N
         dij = sum(abs(desc(i,:) - desc(j,:)))
         kij = dexp(-dij)
         K(i,j) = kij
         K(j,i) = kij
      enddo
      K(i,i) = K(i,i) + 0.1
   enddo
   !$OMP END PARALLEL DO
   call system_clock(end_time, time_rate)
   print *, "Time taken for Matrix:", real(end_time - start_time, rk)/real(time_rate, rk)
end program example
I compiled it using gfortran-6 on Mac OS X 10.11 using the following flags:
gfortran example.f90 -fopenmp -O0
gfortran example.f90 -fopenmp -O3
gfortran example.f90 -fopenmp -mtune=native
following which I ran it with one and two threads using the OMP_NUM_THREADS variable. I can see that it is utilizing two cores. However, the -O3 flag, which should enable vectorization, does not help performance at all; if anything it degrades it a bit. Timings are given below (in seconds, averaged over 10 runs):
| Opt \ Threads |      1 |     2 |
|---------------|--------|-------|
| -O0           | 10.962 | 9.183 |
| -O3           | 11.581 | 9.250 |
| -mtune        | 11.211 | 9.084 |
What is wrong with my program?
First of all, if you want good performance from -O3, you should give it something that can actually be optimised. The bulk of the work happens in the sum intrinsic, which works on a vectorised expression. It doesn't get any more optimised when you switch from -O0 to -O3.
Also, if you want better performance, transpose desc, because desc(i,:) is non-contiguous in memory while desc(:,i) is contiguous. That's Fortran - its arrays are column-major.
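A sketch of that change (descT is a hypothetical transposed copy of desc, built once before the timed loop):
real(rk), allocatable :: descT(:,:)
allocate(descT(M,N))
descT = transpose(desc)   ! descriptor i is now the contiguous column descT(:,i)
! ... then, inside the parallel loop:
dij = sum(abs(descT(:,i) - descT(:,j)))   ! both slices are contiguous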

Unclassifiable OpenMP directive in a Fortran program

I was trying to parallelize some Fortran code using OpenMP:
program pigreco
!----------------------------------------!
   use OMP_LIB
   implicit none
!----------------------------------------!
   integer :: i
   integer, parameter :: N = 100000
   integer, parameter :: NCPU = 4
   real*8 :: t0, t1
   real :: h, totale, x, f
!----------------------------------------!
   print '(a,2x,i15)', ' Number of intervals: ', N
   totale = 0.0
   h = 1. / N
   call OMP_SET_NUM_THREADS(NCPU)
   write(*, '(a,i10)') 'Total number of processors: ', NCPU
   t0 = OMP_GET_WTIME()
!----------------------------------------!
#ifdef PARALLEL
   !
   print '(a)', "Parallel version chosen."
   !
   !$OMP PARALLEL DO PRIVATE(x, f) REDUCTION(+:totale)
   !
   do i = 1, N
      x = (i - 0.5) * h
      f = (4 * h) / (1 + x**2)
      totale = totale + f
   enddo
   !$OMP END PARALLEL DO
   !
#endif
   !
   t1 = OMP_GET_WTIME()
   !
   PRINT '(a,2x,f30.25)', ' Computed PI =', totale
   PRINT '(a,2x,f30.25)', ' Total computational time =', t1 - t0
   !
end program pigreco
When I then try to compile with the line gfortran prova.F90 -fopenmp -D PARALLEL, it gives me an error saying "unclassifiable OpenMP directive at (1)".
The problem is that you defined PARALLEL with the preprocessor: -D PARALLEL defines a macro named PARALLEL that expands to 1, and the preprocessor substitutes it even inside the !$OMP sentinel line. So instead of reading OMP PARALLEL DO, the compiler reads OMP 1 DO, which of course doesn't make sense. Change #ifdef PARALLEL to #ifdef RUNPARALLEL and -DPARALLEL to -DRUNPARALLEL, and then the compiler gives no error.
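To make the failure concrete, here is the directive as written, followed by what the compiler effectively receives after preprocessing with -D PARALLEL; the second line is not a valid OpenMP directive, hence the error:
!$OMP PARALLEL DO PRIVATE(x, f) REDUCTION(+:totale)
!$OMP 1 DO PRIVATE(x, f) REDUCTION(+:totale)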
Alternatively, you can use the fact that when compiling with OpenMP support the macro _OPENMP is defined automatically, so you could use #ifdef _OPENMP and no -D flag at all.
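A sketch of that variant, keeping the structure of the original program (no -D flag is needed, because _OPENMP is defined automatically whenever -fopenmp is given):
#ifdef _OPENMP
   print '(a)', "Parallel version chosen."
   !$OMP PARALLEL DO PRIVATE(x, f) REDUCTION(+:totale)
#endif
   do i = 1, N
      x = (i - 0.5) * h
      f = (4 * h) / (1 + x**2)
      totale = totale + f
   enddo
#ifdef _OPENMP
   !$OMP END PARALLEL DO
#endif
Note the loop itself now sits outside the conditional blocks, so the sum is computed whether or not the directives are active.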

Parallelizing formatted writes in Fortran with OpenMP

I'm trying to parallelize a Fortran code which at one point writes tons of numbers to formatted output. Some simple profiling showed that most CPU time is spent in the format conversion, so I had the idea of doing the formatting in parallel into character buffers, and later writing the unformatted buffers to the file.
My proof of concept looks like this:
program parawrite
   implicit none
   integer (kind = 4) :: i, j, tstart, tstop, rate
   integer (kind = 4), parameter :: bufsize = 100000, n = 10000000, llen = 22
   character (kind=1, len=:), allocatable :: buf
   real (kind=8), dimension(n) :: a
   ! some input
   do i = 1, n
      a(i) = dble(i) * dble(i)
   enddo
   ! formatted writes for reference
   open(unit=10, file="out1.txt", form="formatted")
   call system_clock(tstart, rate);
   do i = 1, n
      write(10,"(E21.15)") a(i)
   end do
   call system_clock(tstop, rate);
   print *, 'Formatted write: ', dble(tstop - tstart) / dble(rate), 's'
   close(10)
   ! parallel stuff
   open(unit=10, file="out2.txt", access="stream", form="unformatted")
   call system_clock(tstart, rate);
   !$omp parallel private(buf, j)
   allocate(character(bufsize * llen) :: buf)
   j = 0;
   !$omp do ordered schedule(dynamic,bufsize)
   do i = 1, n
      write (buf(j*llen+1:(j+1)*llen),"(E21.15,A1)") a(i), char(10)
      j = j + 1
      if (mod(i, bufsize) == 0) then
         !$omp ordered
         write (10) buf
         !$omp end ordered
         j = 0
      end if
   end do
   deallocate(buf)
   !$omp end parallel
   close(10)
   call system_clock(tstop, rate);
   print *, 'Parallel write: ', dble(tstop - tstart) / dble(rate), 's'
end program parawrite
When I run it, however, not only is the parallel version much slower with a single thread, it also doesn't scale too well...
$ gfortran -O2 -fopenmp writetest.f90
$ OMP_NUM_THREADS=1 ./a.out
Formatted write: 11.330000000000000 s
Parallel write: 15.625999999999999 s
$ OMP_NUM_THREADS=6 ./a.out
Formatted write: 11.331000000000000 s
Parallel write: 6.1799999999999997 s
My first question is how to make it the same speed with a single thread. The time spent writing the buffer to the file is negligible, so why are the writes to the buffer slower than the writes directly to the file?
My second question is why the scaling is so bad. I have an equivalent C code which uses sprintf and fwrite, and there I can get almost perfect linear scaling (I can post the code if needed); with Fortran, however, I can only reduce the runtime to around 40% at 6 threads (with C I can reduce it to 18% at the same number of threads). It is still faster than the serial version, but I hope this can be improved.
From some experiments, it seems that an internal file is rather slow if array elements are converted one at a time. This is also the case for an external file, but the degree of slowdown is much greater for internal files (for some reason...). So I've modified the code such that a set of array elements is converted at once and then written to an external file via stream output. Below, four patterns are compared:
Sequential (1): The original code (which writes each element via do-loop)
Sequential (2): Write an array at once (or via implied loop) to an external file
Parallel (1): Make an internal file for many elements and then write to an external file
Parallel (2): Simplest parallel code, with a formatted write or sprintf for each element
Among these, Parallel (2) + sprintf (marked with *2 in the code) was the fastest, while Parallel (2) + a write for each element (marked with *1) was the slowest (its timing is shown as Parallel (*) in the table, and it does not scale with OpenMP for some reason). I guess sprintf is the fastest probably because of the least amount of internal checks and overhead etc. (just a guess!).
Results (please see the bottom for the modified codes)
$ gcc -O3 -c conv.c && gfortran -O3 -fopenmp test.f90 conv.o
# Machine: Core i7-8550U (1.8GHz), 4-core/8-thread, Ubuntu18.04 (GCC7.3.0)
# Note: The amount of data has been reduced to 1/5 of the
# original code, n = bufsize * 20, but the relative
# timing results remain the same even for larger data.
$ OMP_NUM_THREADS=1 ./a.out
Sequential (1): 2.0080000000000000 s
Sequential (2): 1.6510000000000000 s
Parallel (1): 1.6960000000000000 s
Parallel (2): 1.2640000000000000 s
Parallel (*): 3.1480000000000001 s
$ OMP_NUM_THREADS=2 ./a.out
Sequential (1): 1.9990000000000001 s
Sequential (2): 1.6479999999999999 s
Parallel (1): 0.98599999999999999 s
Parallel (2): 0.72999999999999998 s
Parallel (*): 1.8600000000000001 s
$ OMP_NUM_THREADS=4 ./a.out
Sequential (1): 2.0289999999999999 s
Sequential (2): 1.6499999999999999 s
Parallel (1): 0.61199999999999999 s
Parallel (2): 0.49399999999999999 s
Parallel (*): 1.4470000000000001 s
$ OMP_NUM_THREADS=8 ./a.out
Sequential (1): 2.0059999999999998 s
Sequential (2): 1.6499999999999999 s
Parallel (1): 0.56200000000000006 s
Parallel (2): 0.41299999999999998 s
Parallel (*): 1.7689999999999999 s
test.f90:
program main
   implicit none
   integer :: i, j, k, tstart, tstop, rate, idiv, ind1, ind2
   integer, parameter :: bufsize = 100000, n = bufsize * 20, llen = 22, ndiv = 8
   character(len=:), allocatable :: buf(:), words(:)
   character(llen + 1) :: word
   real(8), allocatable :: a(:)

   allocate( a( n ) )

   ! Some input
   do i = 1, n
      a(i) = dble(i)**2
   enddo

   !.........................................................
   ! Formatted writes (1).
   open(unit=10, file="dat_seq1.txt", form="formatted")
   call system_clock(tstart, rate);
   do i = 1, n
      write(10,"(ES21.15)") a(i)
   end do
   call system_clock(tstop, rate);
   print *, 'Sequential (1):', dble(tstop - tstart) / dble(rate), 's'
   close(10)

   !.........................................................
   ! Formatted writes (2).
   open(unit=10, file="dat_seq2.txt", form="formatted")
   call system_clock(tstart, rate);
   write( 10, "(ES21.15)" ) a
   ! write( 10, "(ES21.15)" ) ( a( k ), k = 1, n )
   call system_clock(tstop, rate);
   print *, 'Sequential (2):', dble(tstop - tstart) / dble(rate), 's'
   close(10)

   !.........................................................
   ! Parallel writes (1): make a formatted string for many elements at once
   allocate( character( llen * bufsize / ndiv ) :: buf( ndiv ) )
   open(unit=10, file="dat_par1.txt", access="stream", form="unformatted")
   call system_clock(tstart, rate);
   do i = 1, n, bufsize
      !$omp parallel do private( idiv, ind1, ind2, k ) shared( i, buf, a )
      do idiv = 1, ndiv
         ind1 = i + (idiv - 1) * bufsize / ndiv
         ind2 = ind1 + bufsize / ndiv - 1
         write( buf( idiv ), "(*(ES21.15, A1))" ) &
              ( a( k ), char(10), k = ind1, ind2 )
      enddo
      !$omp end parallel do
      write(10) buf
   end do
   call system_clock(tstop, rate);
   print *, 'Parallel (1):', dble(tstop - tstart) / dble(rate), 's'
   deallocate(buf)
   close(10)

   !.........................................................
   ! Parallel writes (2): sprintf vs write for each element
   allocate( character( llen ) :: words( n ) )
   open(unit=10, file="dat_par2.txt", access="stream", form="unformatted")
   call system_clock(tstart, rate);
   !$omp parallel do private( i, word ) shared( a, words )
   do i = 1, n
      ! write( word, "(ES21.15, A1)" ) a( i ), char(10) !! slow (*1)
      call conv( word, a( i ) ) !! sprintf (*2)
      words( i ) = word( 1 : llen )
   enddo
   !$omp end parallel do
   write( 10 ) words
   call system_clock(tstop, rate);
   print *, 'Parallel (2):', dble(tstop - tstart) / dble(rate), 's'
   close(10)
end program
conv.c:
#include <stdio.h>

void conv_( char *buf, double *val )
{
    sprintf( buf, "%21.15E\n", *val );
}