Array expression vs explicit loop in parallel and serial - Fortran

I tried to parallelize the following minimal example with OpenMP and to compare array expressions with explicit loops, but I am surprised that I get no speedup.
Could you explain to me the problem?
program time_routines
use iso_fortran_env, only: real64, int64
USE OMP_LIB
integer, parameter :: wp = real64
integer(kind=int64) :: start, finish, rate
integer, parameter :: problem_size = 2 * 10**4
integer :: i, j
real(wp), allocatable :: X(:, :), Y(:, :)
allocate(X(problem_size, problem_size), Y(problem_size, problem_size))
call random_number(X)
call system_clock(count_rate=rate)
!$omp parallel
write(6, *) "Hello from thread: ", OMP_GET_THREAD_NUM()
!$OMP END PARALLEL
! fill cache lines
Y = 2._wp * X + 3._wp
call system_clock(start)
Y = 2._wp * X + 3._wp
call system_clock(finish)
write(6, *) 'serial array expression'
write(6, *) 'Elapsed Time in seconds:', real(finish - start, kind=wp) / real(rate, kind=wp)
call system_clock(start)
!$omp workshare
Y = 2._wp * X + 3._wp
!$omp end workshare
call system_clock(finish)
write(6, *) 'parallel array expression'
write(6, *) 'Elapsed Time in seconds:', real(finish - start, kind=wp) / real(rate, kind=wp)
call system_clock(start)
do j = 1, size(X, 2)
do i = 1, size(X, 1)
Y(i, j) = 2._wp * X(i, j) + 3._wp
end do
end do
call system_clock(finish)
write(6, *) 'Explicit serial loop'
write(6, *) 'Elapsed Time in seconds:', real(finish - start, kind=wp) / real(rate, kind=wp)
call system_clock(start)
!$omp do
do j = 1, size(X, 2)
do i = 1, size(X, 1)
Y(i, j) = 2._wp * X(i, j) + 3._wp
end do
end do
!$omp end do
call system_clock(finish)
write(6, *) 'Explicit parallel loop'
write(6, *) 'Elapsed Time in seconds:', real(finish - start, kind=wp) / real(rate, kind=wp)
end program
If I compile and run with
gfortran -fopenmp -O2 -Wall openmp_basics.f90 -o test.exe && ./test.exe
my timings are the same for all possible ways of executing the calculation.
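Note that the workshare and do directives above are not enclosed in a parallel region; a worksharing construct encountered outside of a parallel region is executed by a single thread. A minimal sketch of the enclosed form (shown here for the workshare case) would be:
!$omp parallel
!$omp workshare
Y = 2._wp * X + 3._wp
!$omp end workshare
!$omp end parallel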

Related

Program received signal SIGSEGV: Segmentation fault - invalid memory reference in arrays with big sizes

I am getting an error when I run this code. With small values of L like L=16 or L=32 I get no error, but with L=128 or L=96, after 7000-8000 steps, I get the following error:
Program received signal SIGSEGV: Segmentation fault - invalid memory reference.
Backtrace for this error:
#0 0x7FBA5CAC3E08
#1 0x7FBA5CAC2F90
#2 0x7FBA5C1E84AF
#3 0x402769 in MAIN__ at newhys.f90:?
Segmentation fault (core dumped)
This is the full code :
SUBROUTINE init_random_seed()
implicit none
INTEGER :: i, n, clock
INTEGER, DIMENSION(:), ALLOCATABLE :: seed
CALL RANDOM_SEED(size = n)
ALLOCATE(seed(n))
CALL SYSTEM_CLOCK(COUNT=clock)
seed = clock + 37 * (/ (i - 1, i = 1, n) /)
CALL RANDOM_SEED(PUT = seed)
DEALLOCATE(seed)
END SUBROUTINE
!end module
Program Activ_mater
USE OMP_LIB
Implicit none
Integer,parameter :: time=1000000000, L=128, N = L**2*2
Integer,parameter:: n_thread = 8
Real(8),parameter :: pi = 3.14159265359
Real(8),parameter :: v0 = 0.50, alpha = 1.0/36.0
real(8)START,END_1 ,eta
type block_p
Integer :: partical_N
Integer :: particle_ad(10*L)
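! note: 10*L = 1280 for L = 128, which is the upper bound reported in the runtime error below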
end type
Type(block_p) ,pointer,dimension(:,:) :: C
Real(8),allocatable :: x(:), y(:) , phi,angle_new(:),angle(:)
Real(8) :: sum_a, sum_b,x_in, y_in, x_out, y_out, avrage_t, r,ra,eta1(5)
Integer :: i,j,t,n_p,I_b,J_b,b_l,neighbor_i(9),neighbor_j(9),A,n_p_b,ne = 1,stateta=0,ot=0,op=175
character(len=10)::name1
call omp_set_num_threads(n_thread)
call init_random_seed()
eta1=(/2.100,2.116,2.120,2.126,2.128/) ! The value of ETA
allocate(x(2*n), y(2*n) , phi,angle_new(2*N),angle(2*N))
allocate(C(2*L,2*L))
C(:,:)%partical_N=0
do i =1,N
call random_number(ra)
x(i)=ra*L
I_b = int(x(i))+1
call random_number(ra)
y(i)=ra*L
J_b = int(y(i))+1
call random_number(ra)
angle(i)=ra*2.0*pi
C(I_b,J_b)%partical_N = C(I_b,J_b)%partical_N + 1 !Number of particle in block C(I_b,J_b)
C(I_b,J_b)%particle_ad( C(I_b,J_b)%partical_N ) = i ! The particle number in block C(I_b,J_b)
end do
! loop for eta
eta= 0.0
write(name1,'(f5.3)')eta
open(unit=10, file='Hysteresis,'//trim(name1)//'.dat')
!=====================explanation of system====================================
print*,'==========================================================================='
print*, 'eta = ', etA ,' ',' alpha = ',alpha
print*,'L=',L ,' ', 'Particle Number=', N,' ','Density=', N/L**2
print*,'==========================================================================='
!==============================================================================
START = omp_get_wtime()
do t =1,time
if (ot == 300000 )then
stateta = 0
ot = 0
op = op + 1
end if
if (stateta == 0 )then
eta = eta + ((1.0/3.0) * (10E-6))
endif
if (int(eta * 100) == op) then
stateta = 1
end if
angle_new(:)=0
!$OMP PARALLEL DEFAULT(PRIVATE) SHARED(x,y,angle,angle_new,c) firstprivate(eta)
!$OMP DO schedule(runtime)
do i =1, N
sum_a=0; sum_b=0;n_p=0
I_b = int(x(i))+1; J_b = int(y(i))+1 ! The block of particle i
! Now I should find nine neighbor of particle i-----------------------------------------------
neighbor_i=(/I_b+1, I_b, I_b-1, I_b, I_b+1, I_b-1, I_b-1, I_b+1, I_b/)
neighbor_j=(/J_b, J_b+1, J_b, J_b-1, J_b+1, J_b+1, J_b-1, J_b-1, J_b/)
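! i.e. the 3x3 neighbourhood of block (I_b, J_b), including the block itself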
do b_l = 1, 9
I_b = neighbor_i(b_l) ; J_b=neighbor_j(b_l)
if (I_b >L )I_b=1
if (I_b <1 )I_b=L
if (J_b >L )J_b=1
if (J_b <1 )J_b=L
!neighbor_i(b_l)=I_b; neighbor_j(b_l)=J_b
A = C( I_b, J_b )%partical_N ! number of particle in block C( neighbor_i(b_l), neighbor_j(b_l) )
!=============================================================================================
do n_p_b =1, A
j = C( I_b, J_b)%particle_ad(n_p_b) !particle j in the block C
if (i /= j )then
X_in = abs(max(x(i),x(j)) - min(x(i),x(j)));
Y_in = abs(max(y(i),y(j)) - min(y(i),y(j)));
X_out =L-X_in
Y_out =L-Y_in
r = sqrt(min(X_in,X_out)**2 + min(Y_in,Y_out)**2)
else
r=0.0
end if
if ( r < 1 )then
if ( j <= i )then
sum_A = sum_A + sin(angle(j));
sum_B = sum_B + cos(angle(j));
else
sum_A = sum_A + alpha*sin(angle(j));
sum_B = sum_B + alpha*cos(angle(j));
endif
n_p = n_p + 1;
endif
enddo
enddo
sum_A = sum_A/n_p; sum_B = sum_B/n_p
!if (int(sum_A*1e10) ==0 .and. int(sum_B*1e10) ==0 )print*,'zerrooo'
avrage_t=atan2(sum_A,sum_B);
if (avrage_t<0.0) then
avrage_t=avrage_t+2.0*pi;
endif
call random_number(ra)
angle_new(i)=avrage_t+eta*(ra-0.50)
if( angle_new(i)>=2*pi) angle_new(i)= angle_new(i)-2*pi
if( angle_new(i)<0) angle_new(i)= 2*pi+angle_new(i)
end do
!$OMP END DO
!$OMP END PARALLEL
angle = angle_new
C(:,:)%partical_N=0
! phi=0.0
do i=1, N
x(i) = x(i) + v0*sin(angle(i));
if (x(i)<1) x(i)=L+x(i)
if (x(i)>L) x(i)=x(i)-L
I_b = int(x(i))+1
y(i) = y(i)+ v0*cos(angle(i));
if (y(i)<1) y(i)=L+y(i)
if (y(i)>L) y(i)=y(i)-L
J_b = int(y(i))+1
C(I_b,J_b)%partical_N = C(I_b,J_b)%partical_N + 1
C(I_b,J_b)%particle_ad( C(I_b,J_b)%partical_N ) = i
end do
if (stateta == 1 )then
phi= sqrt((sum(sin(angle))**2+sum(cos(angle))**2))/N;
ot = ot + 1
end if
write(10,*)phi
if (mod(t,10)==0)then
! ave4_phi=sum(phi**4)/t;
! ave2_phi =sum(phi**2)/t;
!print* ,ave4_phi,ave2_phi
print*,'Time=',t,' ==== Eta : ',eta,"Ot : " , ot
end if
end do
END_1 = omp_get_wtime()
print*,'Run Time = ',end_1-start
End Program
P.S.(1): I use the OpenMP library to run my program in parallel
P.S.(2): I use gfortran to compile the code
P.S.(3): Compiling the code with -g -fcheck=all gives me this error:
At line 155 of file z.f90 Fortran runtime error: Index '1281' of dimension 1 of array 'c%particle_ad' above upper bound of 1280
Thanks to you all
Your particle_ad arrays only have space for 10*L particles, but you appear to be trying to store up to N=2*L**2 particles in them (depending on how the random numbers fall). On average, each will have enough space, but your code will fail if too many particles fall into a single block, and (if I'm remembering the probabilities right) the chances of this happening increase as L increases.
You could solve this problem by replacing Integer :: particle_ad(10*L) with Integer :: particle_ad(N), but this will waste an awful lot of space.
A better solution would be to resize the particle_ad arrays on the fly, making them bigger every time they get full. For convenience, you could wrap this behaviour in a type-bound procedure of block_p (which requires the type to live in a module).
For example,
module block_mod
implicit none
type block_p
Integer :: partical_N = 0
Integer, allocatable :: particle_ad(:)
contains
procedure :: add_particle
end type
contains
subroutine add_particle(this, index)
class(block_p), intent(inout) :: this
integer, intent(in) :: index
integer, allocatable :: temp(:)
! Update partical_N.
this%partical_N = this%partical_N + 1
! Resize particle_ad if it is full.
if (size(this%particle_ad) < this%partical_N) then
temp = this%particle_ad
deallocate(this%particle_ad)
allocate(this%particle_ad(2*this%partical_N))
this%particle_ad(:size(temp)) = temp
endif
! Add the new index to particle_ad.
this%particle_ad(this%partical_N) = index
end subroutine
end module
You could then replace the lines
C(I_b,J_b)%partical_N = C(I_b,J_b)%partical_N + 1
C(I_b,J_b)%particle_ad( C(I_b,J_b)%partical_N ) = i
with
call C(I_b,J_b)%add_particle(i)
Note that as each particle_ad array is now allocatable, you will need to initialise each array before you can call add_particle.
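For example, a minimal initialization sketch (the starting capacity of 16 is arbitrary, since add_particle grows the array on demand):
allocate(C(2*L,2*L))
do J_b = 1, 2*L
do I_b = 1, 2*L
C(I_b,J_b)%partical_N = 0
allocate(C(I_b,J_b)%particle_ad(16)) ! arbitrary initial capacity
end do
end do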

error: reduction variable ‘v1’ is private in outer context

I have the following Fortran code
program hello
use omp_lib
implicit none
integer :: num_threads = 2
print*, "Display Hello world!"
print*, "Number of threads used = ", num_threads
call loop()
end program hello
subroutine loop()
integer :: i,j,k,n
real :: c0
real, allocatable :: v1(:,:)
n = 3
c0 = 0.
if (.not. allocated (v1)) allocate(v1(n,n))
v1 = c0
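! note: the !$omp do below has no enclosing !$omp parallel region (see the corrected version further down)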
!$omp do private(i, j, k) schedule(dynamic) reduction(+: v1)
do i = 1, n
do j = 1, n
do k = 1, n
v1(i,j) = v1(i,j) + k
end do
write (*,*) i, j, v1(i,j)
end do
end do
!$omp end do
end subroutine
gfortran -fopenmp leads to
error: reduction variable ‘v1’ is private in outer context
!$omp do private(i, j, k) schedule(dynamic) reduction(+: v1)
I checked the question reduction variable is private in outer context, but I am still unsure of the reason for my issue. v1 is only used inside the loop.
What is the reason for the error message reduction variable ‘v1’ is private in outer context?
[Solved by adding !$omp parallel and !$omp end parallel]
Thanks to Ian Bush's comment. By adding !$omp parallel and !$omp end parallel, i.e.,
program hello
use omp_lib
implicit none
integer :: num_threads = 2
print*, "Display Hello world!"
print*, "Number of threads used = ", num_threads
call loop()
end program hello
subroutine loop()
integer :: i,j,k,n
real :: c0
real, allocatable :: v1(:,:)
n = 3
c0 = 0.
if (.not. allocated (v1)) allocate(v1(n,n))
v1 = c0 ! initialise the reduction target before entering the parallel region
!$omp parallel
!$omp do private(i, j, k) schedule(dynamic) reduction(+: v1)
do i = 1, n
do j = 1, n
v1(i,j) = c0
do k = 1, n
v1(i,j) = v1(i,j) + k
end do
write (*,*) i, j, v1(i,j)
end do
end do
!$omp end do
!$omp end parallel
end subroutine
the code runs normally.
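As a side note (a sketch, not from the original post), the same thing can be written with the combined parallel do construct:
!$omp parallel do private(j, k) schedule(dynamic) reduction(+: v1)
do i = 1, n
do j = 1, n
v1(i,j) = c0
do k = 1, n
v1(i,j) = v1(i,j) + k
end do
write (*,*) i, j, v1(i,j)
end do
end do
!$omp end parallel do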

Eigen is much slower than Fortran in matrix multiplication using an explicit loop

I tried to rewrite code from Fortran to C++, with a 2000*2000 matrix multiplication implemented through the Eigen library. I found that the explicit loop in Eigen is much slower (>3x) than the do loop in Fortran. The codes are listed below:
test.f90
program main
implicit none
integer :: n,i,j,k
integer :: tic,toc
real(8),ALLOCATABLE ::a(:,:),b(:,:),c(:,:)
real(8) :: s
n = 2000
allocate(a(n,n),b(n,n),c(n,n))
do i=1,n
do j =1,n
a(j,i) = i * 1.0
b(j,i) = i * 1.0
enddo
enddo
call system_clock(tic)
do j=1,n
do i=1,n
s = 0.0
do k=1,n
s = s + a(i,k) * b(k,j)
enddo
c(i,j) = s
enddo
enddo
call system_clock(toc)
print*,'Fortran with loop:', (toc - tic) / 1000.0
call system_clock(tic)
c = matmul(a,b)
call system_clock(toc)
print*,'Fortran with matmul:', (toc - tic) / 1000.0
DEALLOCATE(a,b,c)
end
test.cpp
#include<Eigen/Core>
#include<time.h>
#include<iostream>
using Eigen::MatrixXd;
int main(){
int n = 2000;
MatrixXd a(n,n),b(n,n),c(n,n);
for(int i=0;i<n;i++){
for(int j=0;j<n;j++){
a(i,j) = i * 1.0;
b(i,j) = j * 1.0;
}
}
clock_t tic,toc;
tic = clock();
for(int j=0;j<n;j++){
for(int i=0;i<n;i++){
double s= 0.0;
for(int k=0;k<n;k++){
s += a(i,k) * b(k,j);
}
c(i,j) = s;
}
}
toc = clock();
std::cout << (double)((toc - tic)) / CLOCKS_PER_SEC << std::endl;
tic = clock();
c= a * b;
toc = clock();
std::cout << (double)((toc - tic)) / CLOCKS_PER_SEC << std::endl;
}
Compiled by(with gcc-8.4, in Ubuntu-18.04)
gfortran test.f90 -O3 -march=native -o testf
g++ test.cpp -O3 -march=native -I/path/to/eigen -o testcpp
And I get results:
Fortran with loop: 10.9700003
Fortran with matmul: 0.834999979
Eigen with loop: 38.2188
Eigen with *: 0.40625
The built-in implementations (matmul and Eigen's operator*) are of comparable speed, but why is Eigen so much slower for the loop implementation?
The biggest problem with the loops is that they are not done in the proper order for either C++ (which should be row-major), or Fortran (which should be column-major). This gives you a large performance hit, especially for large matrices.
The nativemul implementation by John Alexiou (with dot_product) has the same problem, so I am very surprised that he claims it's faster. (And I find that it isn't; see below. Maybe his (intel?) compiler rewrites the code to use matmul internally.)
This is the correct loop order for Fortran:
c = 0
do j=1,n
do k=1,n
do i=1,n
c(i,j) = c(i,j) + a(i,k) * b(k,j)
enddo
enddo
enddo
With gfortran version 10.2.0, and compiled with -O3, I get
Fortran with original OP's loop: 53.5190010
Fortran with John Alexiou's nativemul: 53.4309998
Fortran with correct loop: 11.0679998
Fortran with matmul: 2.3699999
A correct loop in C++ should give you similar performance.
Obviously matmul/BLAS are much faster for large matrices.
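For reference, a direct BLAS call would look like this (a sketch; it assumes a BLAS library is linked, e.g. with -lblas):
! C = A*B with the standard double-precision BLAS routine:
! no transposition, square n-by-n matrices, alpha = 1, beta = 0
call dgemm('N', 'N', n, n, n, 1.0d0, a, n, b, n, 0.0d0, c, n)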
In the Fortran code I saw the same problem, but then I moved the matrix multiplication into a subroutine and the resulting speed was almost as good as matmul. I also compared it to the BLAS Level 3 function.
Fortran with loop: 9.220000
Fortran with matmul: 8.450000
Fortran with blas3: 2.050000
and the code to produce it
program ConsoleMatMul
use BLAS95
implicit none
integer :: n,i,j
integer :: tic,toc
real(8),ALLOCATABLE :: a(:,:),b(:,:),c(:,:),xe(:,:)
n = 2000
allocate(a(n,n),b(n,n),c(n,n),xe(n,n))
do i=1,n
do j =1,n
a(j,i) = i * 1.0
b(j,i) = i * 1.0
enddo
enddo
call system_clock(tic)
call nativemul(a,b,c)
call system_clock(toc)
print*,'Fortran with loop:', (toc - tic) / 1000.0
call system_clock(tic)
c = matmul(a,b)
call system_clock(toc)
print*,'Fortran with matmul:', (toc - tic) / 1000.0
c = b
xe = 0d0
call system_clock(tic)
call gemm(a,c,xe) ! BLAS MATRIX/MATRIX MUL
call system_clock(toc)
print*,'Fortran with blas3:', (toc - tic) / 1000.0
DEALLOCATE(a,b,c)
contains
pure subroutine nativemul(a,b,c)
real(8), intent(in), allocatable :: a(:,:), b(:,:)
real(8), intent(out), allocatable :: c(:,:)
real(8) :: s
integer :: n, i,j,k
n = size(a,1)
if (.not. allocated(c)) allocate(c(n,n))
do j=1,n
do i=1,n
s = 0.0d0
do k=1,n
s = s + a(i,k) * b(k,j)
end do
c(i,j) = s
end do
end do
end subroutine
end program ConsoleMatMul
Before I moved the code into a subroutine I got:
Fortran with loop: 85.450000
Update: the native multiplication reaches matmul levels (or exceeds them) when the inner loop is replaced by dot_product().
pure subroutine nativemul(a,b,c)
real(8), intent(in) :: a(:,:), b(:,:)
real(8), intent(out) :: c(:,:)
integer :: n, i,j
n = size(a,1)
do j=1,n
do i=1,n
c(i,j) = dot_product(a(i,:),b(:,j))
! or = sum(a(i,:)*b(:,j))
end do
end do
end subroutine
C++ pre-increment can be faster than post-increment (though for plain integer loop counters most compilers generate identical code):
for(int j=0;j<n;++j){
for(int i=0;i<n;++i){
double s= 0.0;
for(int k=0;k<n;++k){
s += a(i,k) * b(k,j);
}
c(i,j) = s;
}
}

Parallelizing formatted writes in Fortran with OpenMP

I'm trying to parallelize a Fortran code which at one point writes tons of numbers to formatted output. Some simple profiling showed that most CPU time is spent in the format conversion, so I had the idea to do the formatting in parallel into character buffers and later write the unformatted buffers to the file.
My proof of concept looks like this:
program parawrite
implicit none
integer (kind = 4) :: i, j, tstart, tstop, rate
integer (kind = 4), parameter :: bufsize = 100000, n = 10000000, llen = 22
character (kind=1, len=:), allocatable :: buf
real (kind=8), dimension(n) :: a
! some input
do i = 1, n
a(i) = dble(i) * dble(i)
enddo
! formated writes for reference
open(unit=10, file="out1.txt", form="formatted")
call system_clock(tstart, rate);
do i = 1, n
write(10,"(E21.15)") a(i)
end do
call system_clock(tstop, rate);
print *, 'Formated write: ', dble(tstop - tstart) / dble(rate), 's'
close(10)
! parallel stuff
open(unit=10, file="out2.txt", access="stream", form="unformatted")
call system_clock(tstart, rate);
!$omp parallel private(buf, j)
allocate(character(bufsize * llen) :: buf)
j = 0;
!$omp do ordered schedule(dynamic,bufsize)
do i = 1, n
write (buf(j*llen+1:(j+1)*llen),"(E21.15,A1)") a(i), char(10)
j = j + 1
if (mod(i, bufsize) == 0) then
!$omp ordered
write (10) buf
!$omp end ordered
j = 0
end if
end do
deallocate(buf)
!$omp end parallel
close(10)
call system_clock(tstop, rate);
print *, 'Parallel write: ', dble(tstop - tstart) / dble(rate), 's'
end program parawrite
When I run it, however, not only is the parallel version much slower with a single thread, it also doesn't scale too well...
$ gfortran -O2 -fopenmp writetest.f90
$ OMP_NUM_THREADS=1 ./a.out
Formated write: 11.330000000000000 s
Parallel write: 15.625999999999999 s
$ OMP_NUM_THREADS=6 ./a.out
Formated write: 11.331000000000000 s
Parallel write: 6.1799999999999997 s
My first question would be how to make it the same speed with a single thread? The time spent writing the buffer to the file is negligible, so why is formatting into the buffer slower than writing formatted output directly to the file?
My second question is about why the scaling is so bad. I have an equivalent C code which uses sprintf and fwrite, and there I can get almost perfect linear scaling (I can post the code if needed); however, with Fortran I can only reduce the runtime to around 40% at 6 threads (with C I can reduce it to 18% at the same number of threads). It is still faster than the serial version, but I hope this can be improved.
From some experiments, it seems that an internal file is rather slow if array elements are converted to an internal file one at a time. This is also the case for an external file, but the degree of slowdown seems much greater for internal files (for some reason...). So I've modified the code so that a set of array elements is converted at once and then written to an external file via stream output. Below, four patterns are compared:
Sequential (1): The original code (which writes each element via do-loop)
Sequential (2): Write an array at once (or via implied loop) to an external file
Parallel (1): Make an internal file for many elements and then write to an external file
Parallel (2): Simplest parallel code with a formatted write or sprintf for each element
Among these, Parallel (2) + sprintf (marked with *2 in the code) was the fastest, while Parallel (2) + write for each element (marked with *1) was the slowest (its timing is shown as Parallel (*) in the table; it does not scale with OpenMP for some reason). I guess sprintf is the fastest probably because of the smaller amount of internal checks and overhead (just a guess!).
Results (please see the bottom for the modified codes)
$ gcc -O3 -c conv.c && gfortran -O3 -fopenmp test.f90 conv.o
# Machine: Core i7-8550U (1.8GHz), 4-core/8-thread, Ubuntu18.04 (GCC7.3.0)
# Note: The amount of data has been reduced to 1/5 of the
# original code, n = bufsize * 20, but the relative
# timing results remain the same even for larger data.
$ OMP_NUM_THREADS=1 ./a.out
Sequential (1): 2.0080000000000000 s
Sequential (2): 1.6510000000000000 s
Parallel (1): 1.6960000000000000 s
Parallel (2): 1.2640000000000000 s
Parallel (*): 3.1480000000000001 s
$ OMP_NUM_THREADS=2 ./a.out
Sequential (1): 1.9990000000000001 s
Sequential (2): 1.6479999999999999 s
Parallel (1): 0.98599999999999999 s
Parallel (2): 0.72999999999999998 s
Parallel (*): 1.8600000000000001 s
$ OMP_NUM_THREADS=4 ./a.out
Sequential (1): 2.0289999999999999 s
Sequential (2): 1.6499999999999999 s
Parallel (1): 0.61199999999999999 s
Parallel (2): 0.49399999999999999 s
Parallel (*): 1.4470000000000001 s
$ OMP_NUM_THREADS=8 ./a.out
Sequential (1): 2.0059999999999998 s
Sequential (2): 1.6499999999999999 s
Parallel (1): 0.56200000000000006 s
Parallel (2): 0.41299999999999998 s
Parallel (*): 1.7689999999999999 s
test.f90:
program main
implicit none
integer :: i, j, k, tstart, tstop, rate, idiv, ind1, ind2
integer, parameter :: bufsize = 100000, n = bufsize * 20, llen = 22, ndiv = 8
character(len=:), allocatable :: buf(:), words(:)
character(llen + 1) :: word
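! llen characters per value (21 for the number + 1 for the newline); the extra character leaves room for sprintf's terminating NUL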
real(8), allocatable :: a(:)
allocate( a( n ) )
! Some input
do i = 1, n
a(i) = dble(i)**2
enddo
!.........................................................
! Formatted writes (1).
open(unit=10, file="dat_seq1.txt", form="formatted")
call system_clock(tstart, rate);
do i = 1, n
write(10,"(ES21.15)") a(i)
end do
call system_clock(tstop, rate);
print *, 'Sequential (1):', dble(tstop - tstart) / dble(rate), 's'
close(10)
!.........................................................
! Formatted writes (2).
open(unit=10, file="dat_seq2.txt", form="formatted")
call system_clock(tstart, rate);
write( 10, "(ES21.15)" ) a
! write( 10, "(ES21.15)" ) ( a( k ), k = 1, n )
call system_clock(tstop, rate);
print *, 'Sequential (2):', dble(tstop - tstart) / dble(rate), 's'
close(10)
!.........................................................
! Parallel writes (1): make a formatted string for many elements at once
allocate( character( llen * bufsize / ndiv ) :: buf( ndiv ) )
open(unit=10, file="dat_par1.txt", access="stream", form="unformatted")
call system_clock(tstart, rate);
do i = 1, n, bufsize
!$omp parallel do private( idiv, ind1, ind2, k ) shared( i, buf, a )
do idiv = 1, ndiv
ind1 = i + (idiv - 1) * bufsize / ndiv
ind2 = ind1 + bufsize / ndiv - 1
write( buf( idiv ),"(*(ES21.15, A1))") &
( a( k ), char(10), k = ind1, ind2 )
enddo
!$omp end parallel do
write(10) buf
end do
call system_clock(tstop, rate);
print *, 'Parallel (1):', dble(tstop - tstart) / dble(rate), 's'
deallocate(buf)
close(10)
!.........................................................
! Parallel writes (2): sprintf vs write for each element
allocate( character( llen ) :: words( n ) )
open(unit=10, file="dat_par2.txt", access="stream", form="unformatted")
call system_clock(tstart, rate);
!$omp parallel do private( i, word ) shared( a, words )
do i = 1, n
! write( word, "(ES21.15, A1)" ) a( i ), char(10) !! slow (*1)
call conv( word, a( i ) ) !! sprintf (*2)
words( i ) = word( 1 : llen )
enddo
!$omp end parallel do
write( 10 ) words
call system_clock(tstop, rate);
print *, 'Parallel (2):', dble(tstop - tstart) / dble(rate), 's'
close(10)
end program
conv.c:
#include <stdio.h>
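/* Called from Fortran as "call conv( word, a(i) )"; the trailing underscore in
   conv_ matches gfortran's default name mangling for external procedures. */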
void conv_( char *buf, double *val )
{
sprintf( buf, "%21.15E\n", *val );
}
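As a more robust alternative (a sketch, not part of the original answer), the C helper could be given an explicit interface via ISO_C_BINDING so that the Fortran call no longer relies on an implicit interface; binding to the name "conv_" keeps it link-compatible with the conv.c shown above:
interface
subroutine conv(buf, val) bind(C, name="conv_")
use iso_c_binding, only: c_char, c_double
character(kind=c_char), intent(inout) :: buf(*)
real(c_double), intent(in) :: val
end subroutine conv
end interface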