I tried to parallelize a piece of code with OpenMP, but as I increase the number of threads, the code runs slower:
call OMP_set_num_threads(1) --> 16.7 sec
call OMP_set_num_threads(4) --> 17.7 sec
call OMP_set_num_threads(8) --> 19 sec
System spec: Intel Core i7-3610QM, 2.3 GHz (up to 3.2 GHz), 4 cores / 8 threads, 8 GB DDR3 RAM.
call OMP_set_num_threads(8)
!$omp parallel
!$omp do private(k, i, j, r, epsilonxx, epsilonyy, epsilonxy, epsilonzz, &
!$omp            epsilonxz, epsilonyz) reduction(+:dr)
do k = 1, niac
  i = pair_i(k)
  j = pair_j(k)
  dx(1) = x(1,j) - x(1,i)
  dr = dx(1)*dx(1)
  do d = 2, dim
    dx(d) = x(d,j) - x(d,i)
    dr = dr + dx(d)*dx(d)
  enddo
  r = sqrt(dr)
  do d = 1, dim
    dvx(d) = vx(d,j) - vx(d,i)
  enddo
  if (dim.eq.3) then
    if (abs(itype(i)).gt.1000 .and. abs(itype(j)).gt.1000) then
      epsilonxx = dvx(1)*dwdx(1,k)
      epsilonyy = dvx(2)*dwdx(2,k)
      epsilonxy = (1/2.)*(dvx(1)*dwdx(2,k) + dvx(2)*dwdx(1,k))
      epsilonzz = dvx(dim)*dwdx(dim,k)
      epsilonxz = (1/2.)*(dvx(1)*dwdx(dim,k) + dvx(dim)*dwdx(1,k))
      epsilonyz = (1/2.)*(dvx(2)*dwdx(dim,k) + dvx(dim)*dwdx(2,k))
      epsxx(i) = epsxx(i) + mass(j)*epsilonxx/rho(j)
      epsxx(j) = epsxx(j) + mass(i)*epsilonxx/rho(i)
      epsyy(i) = epsyy(i) + mass(j)*epsilonyy/rho(j)
      epsyy(j) = epsyy(j) + mass(i)*epsilonyy/rho(i)
      epsxy(i) = epsxy(i) + mass(j)*epsilonxy/rho(j)
      epsxy(j) = epsxy(j) + mass(i)*epsilonxy/rho(i)
      epszz(i) = epszz(i) + mass(j)*epsilonzz/rho(j)
      epszz(j) = epszz(j) + mass(i)*epsilonzz/rho(i)
      epsxz(i) = epsxz(i) + mass(j)*epsilonxz/rho(j)
      epsxz(j) = epsxz(j) + mass(i)*epsilonxz/rho(i)
      epsyz(i) = epsyz(i) + mass(j)*epsilonyz/rho(j)
      epsyz(j) = epsyz(j) + mass(i)*epsilonyz/rho(i)
    elseif (abs(itype(i)).lt.1000 .and. abs(itype(j)).gt.1000) then
      epsilonxx_interface(i) = (2/3.)*(2.e0*dvx(1)*dwdx(1,k) &
                             - dvx(2)*dwdx(2,k) - dvx(dim)*dwdx(dim,k))
      epsilonxx_interface(j) = dvx(1)*dwdx(1,k)
      epsilonyy_interface(i) = (2/3.)*(2.e0*dvx(2)*dwdx(2,k) &
                             - dvx(1)*dwdx(1,k) - dvx(dim)*dwdx(dim,k))
      epsilonyy_interface(j) = dvx(2)*dwdx(2,k)
      epsilonxy_interface(i) = dvx(1)*dwdx(2,k) + dvx(2)*dwdx(1,k)
      epsilonxy_interface(j) = (1/2.)*(dvx(1)*dwdx(2,k) + dvx(2)*dwdx(1,k))
      epsilonzz_interface(i) = (2/3.)*(2.e0*dvx(dim)*dwdx(dim,k) &
                             - dvx(1)*dwdx(1,k) - dvx(2)*dwdx(2,k))
      epsilonzz_interface(j) = dvx(dim)*dwdx(dim,k)
      epsilonxz_interface(i) = dvx(1)*dwdx(dim,k) + dvx(dim)*dwdx(1,k)
      epsilonxz_interface(j) = (1/2.)*(dvx(1)*dwdx(dim,k) + dvx(dim)*dwdx(1,k))
      epsilonyz_interface(i) = dvx(2)*dwdx(dim,k) + dvx(dim)*dwdx(2,k)
      epsilonyz_interface(j) = (1/2.)*(dvx(2)*dwdx(dim,k) + dvx(dim)*dwdx(2,k))
      epsxx(i) = epsxx(i) + mass(j)*epsilonxx_interface(i)/rho(j)
      epsxx(j) = epsxx(j) + mass(i)*epsilonxx_interface(j)/rho(i)
      epsyy(i) = epsyy(i) + mass(j)*epsilonyy_interface(i)/rho(j)
      epsyy(j) = epsyy(j) + mass(i)*epsilonyy_interface(j)/rho(i)
      epsxy(i) = epsxy(i) + mass(j)*epsilonxy_interface(i)/rho(j)
      epsxy(j) = epsxy(j) + mass(i)*epsilonxy_interface(j)/rho(i)
      epszz(i) = epszz(i) + mass(j)*epsilonzz_interface(i)/rho(j)
      epszz(j) = epszz(j) + mass(i)*epsilonzz_interface(j)/rho(i)
      epsxz(i) = epsxz(i) + mass(j)*epsilonxz_interface(i)/rho(j)
      epsxz(j) = epsxz(j) + mass(i)*epsilonxz_interface(j)/rho(i)
      epsyz(i) = epsyz(i) + mass(j)*epsilonyz_interface(i)/rho(j)
      epsyz(j) = epsyz(j) + mass(i)*epsilonyz_interface(j)/rho(i)
    elseif (abs(itype(i)).gt.1000 .and. abs(itype(j)).lt.1000) then
      epsilonxx_interface(j) = (2/3.)*(2.e0*dvx(1)*dwdx(1,k) &
                             - dvx(2)*dwdx(2,k) - dvx(dim)*dwdx(dim,k))
      epsilonxx_interface(i) = dvx(1)*dwdx(1,k)
      epsilonyy_interface(j) = (2/3.)*(2.e0*dvx(2)*dwdx(2,k) &
                             - dvx(1)*dwdx(1,k) - dvx(dim)*dwdx(dim,k))
      epsilonyy_interface(i) = dvx(2)*dwdx(2,k)
      epsilonxy_interface(j) = dvx(1)*dwdx(2,k) + dvx(2)*dwdx(1,k)
      epsilonxy_interface(i) = (1/2.)*(dvx(1)*dwdx(2,k) + dvx(2)*dwdx(1,k))
      epsilonzz_interface(j) = (2/3.)*(2.e0*dvx(dim)*dwdx(dim,k) &
                             - dvx(1)*dwdx(1,k) - dvx(2)*dwdx(2,k))
      epsilonzz_interface(i) = dvx(dim)*dwdx(dim,k)
      epsilonxz_interface(j) = dvx(1)*dwdx(dim,k) + dvx(dim)*dwdx(1,k)
      epsilonxz_interface(i) = (1/2.)*(dvx(1)*dwdx(dim,k) + dvx(dim)*dwdx(1,k))
      epsilonyz_interface(j) = dvx(2)*dwdx(dim,k) + dvx(dim)*dwdx(2,k)
      epsilonyz_interface(i) = (1/2.)*(dvx(2)*dwdx(dim,k) + dvx(dim)*dwdx(2,k))
      epsxx(i) = epsxx(i) + mass(j)*epsilonxx_interface(i)/rho(j)
      epsxx(j) = epsxx(j) + mass(i)*epsilonxx_interface(j)/rho(i)
      epsyy(i) = epsyy(i) + mass(j)*epsilonyy_interface(i)/rho(j)
      epsyy(j) = epsyy(j) + mass(i)*epsilonyy_interface(j)/rho(i)
      epsxy(i) = epsxy(i) + mass(j)*epsilonxy_interface(i)/rho(j)
      epsxy(j) = epsxy(j) + mass(i)*epsilonxy_interface(j)/rho(i)
      epszz(i) = epszz(i) + mass(j)*epsilonzz_interface(i)/rho(j)
      epszz(j) = epszz(j) + mass(i)*epsilonzz_interface(j)/rho(i)
      epsxz(i) = epsxz(i) + mass(j)*epsilonxz_interface(i)/rho(j)
      epsxz(j) = epsxz(j) + mass(i)*epsilonxz_interface(j)/rho(i)
      epsyz(i) = epsyz(i) + mass(j)*epsilonyz_interface(i)/rho(j)
      epsyz(j) = epsyz(j) + mass(i)*epsilonyz_interface(j)/rho(i)
    endif
  endif
enddo
!$omp end do nowait
!$omp end parallel
The performance problem you observe comes from the very foundation of the algorithm you use. Each thread picks a pair of particles, computes some values, and then modifies eps?? (where ?? is xx, yy, zz, etc.) for both particles. Depending on how the pair list is built, many threads may try to modify the values of neighbouring particles, or even of the same particle, concurrently. The former case results in false sharing, which shows up as a huge slowdown caused by cache lines being constantly invalidated and reloaded from higher-level caches or from main memory. The latter results in completely wrong values for the array elements being computed.
While the latter problem can easily be fixed by using atomic updates (note that an ATOMIC directive protects only the single update statement that follows it, so each eps?? update needs its own directive), e.g.
!$OMP ATOMIC UPDATE
epszz(i) = epszz(i) + mass(j)*epsilonzz_interface(i)/rho(j)
or CRITICAL constructs, e.g.
!$OMP CRITICAL
epsxx(i) = epsxx(i) + mass(j)*epsilonxx_interface(i)/rho(j)
epsxx(j) = epsxx(j) + mass(i)*epsilonxx_interface(j)/rho(i)
epsyy(i) = epsyy(i) + mass(j)*epsilonyy_interface(i)/rho(j)
epsyy(j) = epsyy(j) + mass(i)*epsilonyy_interface(j)/rho(i)
epsxy(i) = epsxy(i) + mass(j)*epsilonxy_interface(i)/rho(j)
epsxy(j) = epsxy(j) + mass(i)*epsilonxy_interface(j)/rho(i)
epszz(i) = epszz(i) + mass(j)*epsilonzz_interface(i)/rho(j)
epszz(j) = epszz(j) + mass(i)*epsilonzz_interface(j)/rho(i)
epsxz(i) = epsxz(i) + mass(j)*epsilonxz_interface(i)/rho(j)
epsxz(j) = epsxz(j) + mass(i)*epsilonxz_interface(j)/rho(i)
epsyz(i) = epsyz(i) + mass(j)*epsilonyz_interface(i)/rho(j)
epsyz(j) = epsyz(j) + mass(i)*epsilonyz_interface(j)/rho(i)
!$OMP END CRITICAL
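Note that an unnamed CRITICAL serializes against every other unnamed CRITICAL in the entire program; naming the construct limits the serialization to just these updates, e.g.

!$OMP CRITICAL (eps_update)
epsxx(i) = epsxx(i) + mass(j)*epsilonxx_interface(i)/rho(j)
epsxx(j) = epsxx(j) + mass(i)*epsilonxx_interface(j)/rho(i)
!$OMP END CRITICAL (eps_update)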
or even array reductions, which give every thread a private copy of each listed array and combine the copies at the end (at the cost of the extra memory), e.g.
!$OMP PARALLEL REDUCTION(+:epsxx,epsyy,epsxy,epszz,...)
the former problem requires that you change the algorithm. For example, you could switch to a different pair-list structure, e.g. an array of lists, where the array index is the particle number and each list contains the neighbours of that particle; each thread then writes only to the particle it owns. Sorting the neighbour lists will (somewhat) reduce the false sharing. Depending on the geometry of the particle distribution, you might also end up with a severely unbalanced problem, so you should think about using dynamic loop scheduling, as in the sketch below.
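A minimal sketch of that restructuring for one strain component. The CSR-style arrays nstart/nlist, the particle count ntotal, the chunk size, and the per-neighbour indexing of dwdx are assumptions for illustration, not your code:

! Sketch only: neighbours of particle i occupy nlist(nstart(i)) ..
! nlist(nstart(i+1)-1).  Each iteration writes only to epsxx(i), so no
! atomics are needed, and dynamic scheduling absorbs the varying
! neighbour counts.  acc is a private accumulator.
!$omp parallel do private(i, m, j, acc) schedule(dynamic, 64)
do i = 1, ntotal
   acc = 0.d0
   do m = nstart(i), nstart(i+1) - 1
      j = nlist(m)
      acc = acc + mass(j)*(vx(1,j) - vx(1,i))*dwdx(1,m)/rho(j)
   enddo
   epsxx(i) = epsxx(i) + acc   ! xx component shown; the others are analogous
enddo
!$omp end parallel do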
Before I present my problem, I want to apologize to those of you who feel this is more of a math post than a programming post. Neural networks are both math- and programming-heavy, and I felt my problem was on the programming side. I have created a CNN from scratch in C++ (one that works), so I believe the functions I use to compute a convolution and a full convolution are correct. Below is how I do the basic forward and backward passes of a CNN, with the convolutional-layer forward being:
Matrix<float> cnn_forward(Matrix<float> weight, Matrix<float> prev){
    Matrix<float> output = prev.convolute(weight);
    return output;
}
And the backward pass (I am not using a bias or activation function in this case):
cnn_back cnn_backward(Matrix<float> a_prev, Matrix<float> dz, Matrix<float> kernel){
    Matrix<float> rotated = kernel.rotate_180();
    Matrix<float> dx = dz.convolute_full(rotated);
    Matrix<float> dw = a_prev.convolute(dz);
    cnn_back output;
    output.dw = std::move(dw);
    output.dx = std::move(dx);
    return output;
}
Everything I have seen online says that the transposed convolutional layer is just the reverse of the convolutional layer. So I have tried implementing the following as the forward and backward passes of a transposed convolutional layer.
//forward
Matrix<float> fcn_forward(Matrix<float> weight, Matrix<float> prev){
    Matrix<float> output = prev.convolute_full(weight.rotate_180());
    return output;
}
//backward
fcn_back fcn_backward(Matrix<float> a_prev, Matrix<float> dz, Matrix<float> kernel){
    Matrix<float> dx = dz.convolute(kernel);
    Matrix<float> dw = dz.convolute(a_prev);
    fcn_back output;
    output.dw = std::move(dw);
    output.dx = std::move(dx);
    return output;
}
//again, not using a bias or activation function
My goal is basically to implement torch.nn.ConvTranspose2d from PyTorch for 2-dimensional matrices, in parallel with the basic convolution functions I have above.
~EDIT~
This is the translation into Python using NumPy arrays; it is pretty much an exact replica of my C++ code.
def convolute(X, W, strides=(1, 1)):
    new_row = int((X.shape[0] - W.shape[0])/strides[0] + 1)
    new_col = int((X.shape[1] - W.shape[1])/strides[1] + 1)
    out = np.zeros((new_row, new_col), dtype=float)
    x_last = 0
    y_last = 0
    for x in range(0, X.shape[0] - (W.shape[0] - 1), strides[0]):
        for y in range(0, X.shape[1] - (W.shape[1] - 1), strides[1]):
            amt = 0.0
            for i in range(0, W.shape[0]):
                for j in range(0, W.shape[1]):
                    amt += W[i][j] * X[x+i][y+j]
            out[x_last][y_last] = amt
            y_last += 1
        x_last += 1
        y_last = 0
    return out
def convolute_full(X, W, strides=(1, 1)):
    row_num = (X.shape[0] - 1) * strides[0] + W.shape[0]
    col_num = (X.shape[1] - 1) * strides[1] + W.shape[1]
    output = np.zeros([row_num, col_num])
    for i in range(0, X.shape[0]):
        i_prime = i * strides[0]
        for j in range(0, X.shape[1]):
            j_prime = j * strides[1]
            for k_row in range(W.shape[0]):
                for k_col in range(W.shape[1]):
                    output[i_prime+k_row, j_prime+k_col] += W[k_row, k_col] * X[i, j]
    return output
def get_errors(predicted, label):
    return label - predicted

def fcn_forward(weight, prev):
    rotated = np.rot90(np.rot90(weight))
    output = convolute_full(prev, rotated)
    return output

def fcn_backward(a_prev, dz, kernel):
    dx = convolute(dz, kernel)
    dw = convolute(dz, a_prev)
    dx = np.clip(dx, -10, 10)   # clip bounds are (min, max)
    return dx, dw
def forward(weights, X_init):
    values = []
    values.append(X_init)
    predicted = fcn_forward(weights[0], X_init)
    values.append(predicted)
    predicted = fcn_forward(weights[1], predicted)
    values.append(predicted)
    return values

def backward(weights, values, label, learningRate=0.001):
    dz = get_errors(values[-1], label)
    dx, dw = fcn_backward(values[-2], dz, weights[-1])
    weights[-1] = weights[-1] - learningRate*dw
    dz = dx
    dx, dw = fcn_backward(values[-3], dz, weights[-2])
    weights[-2] = weights[-2] - learningRate*dw
    return weights
def train_example():
    epoch = int(input("enter epoch: "))
    # creating a random input
    inp = np.random.randn(10, 10)
    # creating the weight matrices
    weights = [np.random.randn(3, 3), np.random.randn(3, 3)]
    # creating the wanted output
    label = np.random.randn(14, 14)
    for i in range(0, epoch):
        values = forward(weights, inp)
        if i == 0 or i == 1:
            errors = get_errors(values[-1], label)
            print("errors:")
            print(errors)
            print("error sum: ", np.sum(errors))
        weights = backward(weights, values, label)
    print("current prediction:")
    print(values[-1])
    print("label: ")
    print(label)
    errors = get_errors(values[-1], label)
    print("errors:")
    print(errors)
    print("error sum at end of training: ", np.sum(errors))
Basically, this does not work. The weights are not corrected in the right direction: the errors only get larger (the opposite of the wanted result). What is the correct way to forward- and backward-propagate a transposed convolutional layer?
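For what it's worth, the behaviour I am after can be cross-checked against PyTorch itself. This is only a sanity-check sketch for the single-channel, stride-1, no-padding case, using the fcn_forward above; the tolerance is arbitrary:

import numpy as np
import torch
import torch.nn.functional as F

X = np.random.randn(10, 10).astype(np.float32)
W = np.random.randn(3, 3).astype(np.float32)

# conv_transpose2d scatters X[i, j]*W over the output, which is exactly what
# convolute_full(X, W) computes, so fcn_forward(W, X) should match
# conv_transpose2d called with the kernel rotated by 180 degrees.
mine = fcn_forward(W, X)
ref = F.conv_transpose2d(
    torch.from_numpy(X)[None, None],                      # shape (1, 1, 10, 10)
    torch.from_numpy(np.rot90(W, 2).copy())[None, None],  # shape (1, 1, 3, 3)
)[0, 0].numpy()
print(np.allclose(mine, ref, atol=1e-5))  # expect True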
EDIT
This is the corrected code for anyone wondering how it relates to my code above, thanks to @Bob's answer below:
def convolute(X, W, strides=(1, 1)):
    new_row = int((X.shape[0] - W.shape[0])/strides[0] + 1)
    new_col = int((X.shape[1] - W.shape[1])/strides[1] + 1)
    out = np.zeros((new_row, new_col), dtype=float)
    x_last = 0
    y_last = 0
    for x in range(0, X.shape[0] - (W.shape[0] - 1), strides[0]):
        for y in range(0, X.shape[1] - (W.shape[1] - 1), strides[1]):
            amt = 0.0
            for i in range(0, W.shape[0]):
                for j in range(0, W.shape[1]):
                    amt += W[i][j] * X[x+i][y+j]
            out[x_last][y_last] = amt
            y_last += 1
        x_last += 1
        y_last = 0
    return out
# this is the same result as scipy.signal.convolve2d
def convolute_full(X, W, strides=(1, 1)):
    row_num = (X.shape[0] - 1) * strides[0] + W.shape[0]
    col_num = (X.shape[1] - 1) * strides[1] + W.shape[1]
    output = np.zeros([row_num, col_num])
    for i in range(0, X.shape[0]):
        i_prime = i * strides[0]
        for j in range(0, X.shape[1]):
            j_prime = j * strides[1]
            for k_row in range(W.shape[0]):
                for k_col in range(W.shape[1]):
                    output[i_prime+k_row, j_prime+k_col] += W[k_row, k_col] * X[i, j]
    return output
def convolute_full_backward(X, dZ, dW, strides=(1, 1)):
    for i in range(0, X.shape[0]):
        i_prime = i * strides[0]
        for j in range(0, X.shape[1]):
            j_prime = j * strides[1]
            for k_row in range(dW.shape[0]):
                for k_col in range(dW.shape[1]):
                    dW[k_row, k_col] += dZ[i_prime+k_row, j_prime+k_col] * X[i, j]
    return dW
def get_errors(predicted, label):
    return label - predicted

def fcn_forward(W, X):
    rotated = np.rot90(np.rot90(W))
    output = convolute_full(X, rotated)
    return output

def fcn_backward(X, dZ, kernel):
    dw = np.zeros(kernel.shape)
    dw = convolute_full_backward(X, dZ, dw)
    dw = np.rot90(np.rot90(dw))
    dx = convolute(dZ, np.rot90(np.rot90(kernel)))
    dx = np.clip(dx, -10, 10)   # np.clip returns the result; bounds are (min, max)
    return dx, dw
def forward(weights, X):
    values = []
    values.append(X)
    predicted = fcn_forward(weights[0], X)
    values.append(predicted)
    predicted = fcn_forward(weights[1], predicted)
    values.append(predicted)
    return values

def backward(weights, values, label, learningRate=0.001):
    dz = get_errors(values[-1], label)
    dx, dw = fcn_backward(values[-2], dz, weights[-1])
    weights[-1] = weights[-1] + learningRate*dw
    dz = dx
    dx, dw = fcn_backward(values[-3], dz, weights[-2])
    # now apply dw
    weights[-2] = weights[-2] + learningRate*dw
    return weights
def train_example():
    epoch = int(input("please enter epoch: "))
    inp = np.random.randn(10, 10)
    weights = [np.random.randn(3, 3), np.random.randn(3, 3)]
    label = np.random.randn(14, 14)
    for i in range(0, epoch):
        values = forward(weights, inp)
        errors = get_errors(values[-1], label)
        print("error sum at {} is: {}".format(i, np.sum(errors)))
        weights = backward(weights, values, label)
    errors = get_errors(values[-1], label)
    print("error sum at end of training: ", np.sum(errors))
Since your implementation performs every scalar multiplication explicitly, the backward step can be written very clearly: you keep all the loops unchanged, and wherever you see an update to your accumulator, you compute the gradient of that update.
import numpy as np

def convolute_full(X, W, strides=(1, 1)):
    row_num = (X.shape[0] - 1) * strides[0] + W.shape[0]
    col_num = (X.shape[1] - 1) * strides[1] + W.shape[1]
    output = np.zeros([row_num, col_num])
    for i in range(0, X.shape[0]):
        i_prime = i * strides[0]
        for j in range(0, X.shape[1]):
            j_prime = j * strides[1]
            for k_row in range(W.shape[0]):
                for k_col in range(W.shape[1]):
                    output[i_prime+k_row, j_prime+k_col] += W[k_row, k_col] * X[i, j]
    return output
def convolute_full_backward(X, dZ, dW, strides=(1, 1)):
    for i in range(0, X.shape[0]):
        i_prime = i * strides[0]
        for j in range(0, X.shape[1]):
            j_prime = j * strides[1]
            for k_row in range(dW.shape[0]):
                for k_col in range(dW.shape[1]):
                    # Only this line changed compared to the forward pass
                    dW[k_row, k_col] += dZ[i_prime+k_row, j_prime+k_col] * X[i, j]
def fcn_forward(X, W):
    output = convolute_full(X, W[::-1, ::-1])
    return output

def fcn_backward(X, dZ, kernel_shape):
    dW = np.zeros(kernel_shape)
    convolute_full_backward(X, dZ, dW[::-1, ::-1])
    return dW
To validate, I created a simple example with a linear loss function:
X = np.random.randn(20, 20)
W = np.random.randn(5, 5)
Z = fcn_forward(X, W)
# pick a random loss with known gradient
dZ = np.random.randn(*Z.shape)
F = np.sum(Z * dZ)
dW = fcn_backward(X, dZ, W.shape)
# random perturbation
W_ = W + np.random.randn(*W.shape)
# expected change to the loss function
dF = np.sum(dW * (W_ - W))
Z_ = fcn_forward(X, W_)
F_ = np.sum(Z_ * dZ)
print('Predicted loss change: %f' % dF)
print('Actual loss change: %f' % (F_ - F))
Run and see. Since the output is linear in W and the loss is linear in the output, the predicted and actual loss changes should agree to machine precision.
I have a degree-6 multivariate polynomial in x and y written in SymPy, e.g.
eqn = a*x**6 + b*x**5*y + c*x**4*y + d*x**3*y + e*x**3*y**2 + ...
Is there a way to collect (x**2+y**2) and rearrange them into the following format?
eqn2 = A*(x**2+y**2)**3 + B*(x**2+y**2)**2 + C*(x**2+y**2) + D
A, B, C, D may themselves contain x and y.
So far I have only tried collect(eqn, x**2 + y**2) and it returned the original equation.
Thank you!
Consider using a temporary symbol z = x**2 + y**2: replace x**2 with z - y**2, then expand and restore:
>>> ex
A*x**6 + 3*A*x**4*y**2 + 3*A*x**2*y**4 + A*y**6 + B*x**4 + 2*B*x**2*y**2 +
B*y**4 + C*x**2 + C*y**2 + D
>>> ex.subs(x**2, z - y**2).expand().subs(z, x**2 + y**2)
A*(x**2 + y**2)**3 + B*(x**2 + y**2)**2 + C*(x**2 + y**2) + D
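A self-contained version of the same trick (the expression here is just the expanded target, for demonstration):

from sympy import symbols, expand

x, y, z = symbols('x y z')
A, B, C, D = symbols('A B C D')

# build the expanded degree-6 polynomial used above
ex = expand(A*(x**2 + y**2)**3 + B*(x**2 + y**2)**2 + C*(x**2 + y**2) + D)

# substitute x**2 -> z - y**2, expand, then restore z
eq2 = ex.subs(x**2, z - y**2).expand().subs(z, x**2 + y**2)
print(eq2)  # A*(x**2 + y**2)**3 + B*(x**2 + y**2)**2 + C*(x**2 + y**2) + D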
Although that works, perhaps a more direct approach is to separate the expression by the coefficients A-D and then factor each collection of terms:
def separatevars_additively(expr, symbols=[]):
    free = set(symbols) or expr.free_symbols
    d = {}
    while free:
        f = free.pop()
        expr, dep = expr.as_independent(f, as_Add=True)
        if dep.has(*free):
            return None
        d[f] = dep
    if expr:
        d[0] = expr
    return d
>>> coeff = var("A:D")
>>> separatevars_additively(ex, coeff)
{B: B*x**4 + 2*B*x**2*y**2 + B*y**4, A: A*x**6 + 3*A*x**4*y**2 + 3*A*x**2*y**4 + A*y**6, D: D, C: C*x**2 + C*y**2}
>>> Add(*[factor(i) for i in _.values()])
A*(x**2 + y**2)**3 + B*(x**2 + y**2)**2 + C*(x**2 + y**2) + D
This is my code, which I want to parallelize with OpenMP. There is one main loop to parallelize and several inner loops.
Are the indices of the inner loops, like p, i or Li, private or shared?
What happens if I do not declare the variables as private or shared?
Do you suggest using allocatable variables for this parallel loop?
!$OMP PARALLEL DO
do l = 1, n_rep
  do p = 1, n_l - 1
    do q = 1, n_l - 1
      do r = 1, n_l - 1
        Li = (p - 1)*(n_l - 1)**2 + (q - 1)*(n_l - 1) + r
        alpha(Li) = pi*rand()
        gamma(Li) = pi*rand()
        beta(Li) = pi/2*rand()
        R_x(1,1) = 1.d0
        R_x(1,2) = 0.d0
        R_x(1,3) = 0.d0
        R_x(2,1) = 0.d0
        R_x(2,2) = cos(alpha(Li))
        R_x(2,3) = sin(alpha(Li))
        R_x(3,1) = 0.d0
        R_x(3,2) = -sin(alpha(Li))
        R_x(3,3) = cos(alpha(Li))
        R_y(1,1) = cos(beta(Li))
        R_y(1,2) = 0.d0
        R_y(1,3) = -sin(beta(Li))
        R_y(2,1) = 0.d0
        R_y(2,2) = 1.d0
        R_y(2,3) = 0.d0
        R_y(3,1) = sin(beta(Li))
        R_y(3,2) = 0.d0
        R_y(3,3) = cos(beta(Li))
        R_z(1,1) = cos(gamma(Li))
        R_z(1,2) = sin(gamma(Li))
        R_z(1,3) = 0.d0
        R_z(2,1) = -sin(gamma(Li))
        R_z(2,2) = cos(gamma(Li))
        R_z(2,3) = 0.d0
        R_z(3,1) = 0.d0
        R_z(3,2) = 0.d0
        R_z(3,3) = 1.d0
        R_xy = matmul(R_x,R_y)
        R_xyz = matmul(R_xy,R_z)
        do i = 1, n_f - 1
          do j = 1, n_f - 1
            do k = 1, n_f - 1
              Li = (i - 1)*(n_f - 1)**2 + (j - 1)*(n_f - 1) + k
              cf_x(i) = x_f(i) + (p - 1)*d_l - x_c(p)
              cf_y(j) = y_f(j) + (q - 1)*d_l - y_c(q)
              cf_z(k) = z_f(k) + (r - 1)*d_l - z_c(r)
              x_rotated = R_xyz(1,1)*cf_x(i) + R_xyz(1,2)*cf_y(j) &
                        + R_xyz(1,3)*cf_z(k)
              y_rotated = R_xyz(2,1)*cf_x(i) + R_xyz(2,2)*cf_y(j) &
                        + R_xyz(2,3)*cf_z(k)
              z_rotated = R_xyz(3,1)*cf_x(i) + R_xyz(3,2)*cf_y(j) &
                        + R_xyz(3,3)*cf_z(k)
            enddo
          enddo
        enddo
      enddo
    enddo
  enddo
enddo
!$OMP END PARALLEL DO
Personally I would break this problem up a bit.
Size_of_Array = n_l * n_l * n_l
IF (ALLOCATED(Li))    DEALLOCATE(Li)
ALLOCATE(Li(Size_of_Array))
IF (ALLOCATED(Alpha)) DEALLOCATE(Alpha)
ALLOCATE(Alpha(Size_of_Array))
IF (ALLOCATED(Beta))  DEALLOCATE(Beta)
ALLOCATE(Beta(Size_of_Array))
IF (ALLOCATED(Gamma)) DEALLOCATE(Gamma)
ALLOCATE(Gamma(Size_of_Array))

indexer = 0
do l = 1, n_rep
  do p = 1, n_l - 1
    do q = 1, n_l - 1
      do r = 1, n_l - 1
        indexer = indexer + 1
        Li(indexer) = (p - 1)*(n_l - 1)**2 + (q - 1)*(n_l - 1) + r
      ENDDO
    ENDDO
  ENDDO
ENDDO

alpha = pi*rand()
gamma = pi*rand()
beta = pi/2*rand()

!$OMP PARALLEL DO
DO I = 1, SIZE(Li)
  CALL Make_Array(Alpha(I), Beta(I), Gamma(I), MyArray(:,:,I))
ENDDO
!etc
Basically, move the array construction inside either an ELEMENTAL FUNCTION or a PURE SUBROUTINE, then see what that does for speed with inlining and a single parallel do of some sort (OMP or other).
PURE SUBROUTINE Make_Array(Alpha, Beta, Gamma, MyArray)
  IMPLICIT NONE
  DOUBLE PRECISION, INTENT(IN)                    :: Alpha
  DOUBLE PRECISION, INTENT(IN)                    :: Beta
  DOUBLE PRECISION, INTENT(IN)                    :: Gamma
  DOUBLE PRECISION, DIMENSION(3,3), INTENT(INOUT) :: MyArray ! Maybe just INTENT(OUT)?
  DOUBLE PRECISION, DIMENSION(3,3)                :: R_x, R_y, R_z
  R_x(:,:) = 0.d0
  R_x(1,1) = 1.d0
  R_x(2,2) = cos(Alpha)
  R_x(2,3) = sin(Alpha)
  R_x(3,2) = -sin(Alpha)
  R_x(3,3) = cos(Alpha)
  R_y(:,:) = 0.d0
  R_y(1,1) = cos(Beta)
  R_y(1,3) = -sin(Beta)
  R_y(2,2) = 1.d0
  R_y(3,1) = sin(Beta)
  R_y(3,3) = cos(Beta)
  R_z(:,:) = 0.d0
  R_z(1,1) = cos(Gamma)
  R_z(1,2) = sin(Gamma)
  R_z(2,1) = -sin(Gamma)
  R_z(2,2) = cos(Gamma)
  R_z(3,3) = 1.d0
  MyArray = matmul(matmul(R_x, R_y), R_z)
END SUBROUTINE Make_Array
Etc... For other elemental functions or pure subroutines
R_xy = matmul(R_x,R_y)
R_xyz = matmul(R_xy,R_ ...
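As for the data-sharing questions themselves: in Fortran, the index variables of DO loops inside the parallel construct are made private automatically, but temporaries such as Li, R_x, R_y, R_z, R_xy, R_xyz, cf_x, cf_y, cf_z and the *_rotated scalars default to SHARED, so every thread would overwrite them (a race). A hedged sketch of an explicit clause list for the original loop; default(none) forces every variable to be classified:

! Sketch only: explicit data-sharing for the original nested loop.
! default(none) makes the compiler reject anything left unclassified.
!$omp parallel do default(none) &
!$omp    shared(n_rep, n_l, n_f, d_l, pi, x_f, y_f, z_f, x_c, y_c, z_c, &
!$omp           alpha, beta, gamma) &
!$omp    private(l, p, q, r, i, j, k, Li, R_x, R_y, R_z, R_xy, R_xyz, &
!$omp            cf_x, cf_y, cf_z, x_rotated, y_rotated, z_rotated)

Note also that rand() is generally not thread-safe (and not reproducible) inside a parallel region, which is another reason to generate the random angles serially, as done above.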
I am trying to update my function arguments after each iteration but have failed to do so. Kindly check my code, because I am new to the Python language. My task is to calculate xps (a collection of positions) and v2ps (a collection of velocities) after each iteration, and to plot them against each other. Basically, this program represents the collision of objects moving vertically downward, where one of the objects also collides with the plane above which they are moving.
acc_grav = 10
m1 = float(input(" Input mass of ball one, m1: "))
m2 = float(input(" Input mass of ball two, m2: "))
time_steps = 10000
num_coll_bounce = 0
num_ball_coll = 0
eps = 1.e-6

def ball_coll(x1_old, v1_old, x2_old, v2_old, time_ball_coll):
    v1 = v1_old - acc_grav*time_ball_coll
    v2 = v2_old - acc_grav*time_ball_coll
    x1 = x1_old + time_ball_coll*v1_old - 0.5*acc_grav*(time_ball_coll)**2
    x2 = x2_old + time_ball_coll*v2_old - 0.5*acc_grav*(time_ball_coll)**2
    v1_ball_coll = (v1*(m1-m2) + (2*m2*v2))/(m1+m2)
    v2_ball_coll = (v2*(m2-m1) + (2*m1*v1))/(m1+m2)
    cumlv2 = v2
    return [v1, v2, x1, x2, v1_ball_coll, v2_ball_coll]

def floor_coll(x1_old, v1_old, x2_old, v2_old, time_floor_coll):
    v1 = v1_old - acc_grav*time_floor_coll
    v2 = v2_old - acc_grav*time_floor_coll
    x1 = 0  # at the time of bounce
    x2 = x2_old + time_floor_coll*v2_old - 0.5*acc_grav*time_floor_coll**2
    # update velocities following rules for collision with walls
    v1_bounce = -v1
    v2_bounce = v2
    return [v1, v2, x1, x2, v1_bounce, v2_bounce]
for i in range(0, 10):
    x1_0 = 1
    x2_0 = 3 - (i-1)*0.1
    v1_0 = 2
    v2_0 = 2*v1_0
    xps = []
    v2ps = []
    for n in range(time_steps - 1):
        time_ball_coll = (x2_0 - x1_0)/(v1_0 - v2_0)
        time_floor_coll = (v1_0 + (v1_0**2 + 2*acc_grav*x1_0)**1/2)/acc_grav
        if (time_ball_coll - time_floor_coll) < eps and v1_0 - v2_0 > 0:
            num_coll_bounce = num_coll_bounce + 1
            num_ball_coll = num_ball_coll + 1
            ball_coll(x1_0, v1_0, x2_0, v2_0, time_ball_coll)
            #xps[n] = x2_0
            #v2ps(n,num_ballcoll) = v2ini
            xps.append(x2_0)
            v2ps.append(v2_0)
        else:
            num_coll_bounce = num_coll_bounce + 1
            floor_coll(x1_0, v1_0, x2_0, v2_0, time_floor_coll)
            #x1_old,v1_old,x2_old,v2_old,time_floor_coll = dd2
            x_1.append(x1_0)
            x_2.append(x2_0)
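One note on the pattern being attempted here: calling ball_coll(x1_0, v1_0, ...) does not modify x1_0 and the other arguments in place; in Python the returned list has to be unpacked and reassigned, along the lines of this sketch (variable names chosen for illustration):

# sketch: capture and reassign the state returned by the collision routine,
# otherwise x1_0, v1_0, x2_0, v2_0 keep their old values
v1, v2, x1, x2, v1_after, v2_after = ball_coll(x1_0, v1_0, x2_0, v2_0, time_ball_coll)
x1_0, x2_0 = x1, x2
v1_0, v2_0 = v1_after, v2_after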
I know that people ask a LOT of segmentation-fault questions here, but I've put more than three hours into this problem and still wasn't able to solve it. :/ So here is my code:
c single event analysis
      implicit real(a-h,o-z)
      real day(12), nmonth(12), year(12), clas(12),
     $     hour(12), nmin(12)
      integer mark(12)
      real tst(12), D(12), avgP(12,6), avgA(12,6)
      integer k, m, n, g
      real time(2054904), proa(2054904), prob(2054904), w1(2054904),
     $     w2(2054904), w3(2054904), w4(2054904)

      D(1) = 31; D(2) = 28; D(3) = 31; D(4) = 30; D(5) = 31
      D(6) = 30; D(7) = 31; D(8) = 31; D(9) = 30; D(10) = 31
      D(11) = 30; D(12) = 31

      open(100, file='singleE.dat')
      do i = 1, 12
         tst(i) = 0
      enddo

 900  do i = 1, 12
         read(100, 1150) day(i), nmonth(i), year(i),
     $        hour(i), nmin(i), clas(i)
         do j = 1, 12
            if (int(nmonth(i)).EQ.(13-j)) then
               tst(i) = tst(i) + D(12-j)
               nmonth(i) = nmonth(i) - 1
            endif
         enddo
         tst(i) = tst(i) + day(i) + (year(i) - 2010)*365
     $          + (hour(i) + nmin(i)/60)/24
         if (year(i) > real(2011)) then
            tst(i) = tst(i) + 1
         endif
      enddo

      open(200, file='hole.dat', status='OLD')
      k = 0
      do i = 1, 2054904
         read(200, 950) time(i), proa(i), prob(i),
     $        w1(i), w2(i), w3(i), w4(i)
      enddo

      mark = 0
      do i = 1, 12
         do j = 1, 2054904
            k = k + 1
            if (abs(tst(i) - time(j)) < 0.0001) then
               mark(i) = k
            endif
         enddo
      enddo

      n = 5
      do i = 1, 12
         do j = 1, 6
            avgP(i,j) = 0
            avgA(i,j) = 0
         enddo
      enddo

      do i = 1, 12
         if (mark(i).EQ.0) then
            go to 750
         endif
         do j = (mark(i)-(n+1)*1440), (mark(i)-n*1440)
            avgP(i,1) = avgP(i,1) + proa(j)
            avgP(i,2) = avgP(i,2) + prob(j)
            avgP(i,3) = avgP(i,3) + w1(j)
            avgP(i,4) = avgP(i,4) + w2(j)
            avgP(i,5) = avgP(i,5) + w3(j)
            avgP(i,6) = avgP(i,6) + w4(j)
         enddo
         do g = (mark(i)+n*1440), (mark(i)+(n+1)*1440)
            avgA(i,1) = avgA(i,1) + proa(g)
            avgA(i,2) = avgA(i,2) + prob(g)
            avgA(i,3) = avgA(i,3) + w1(g)
            avgA(i,4) = avgA(i,4) + w2(g)
            avgA(i,5) = avgA(i,5) + w3(g)
            avgA(i,6) = avgA(i,6) + w4(g)
         enddo
 750     print *, avgP(i,1), avgP(i,2), avgP(i,3), avgP(i,4),
     $        avgP(i,5), avgP(i,6)
      enddo

 850  close(i)
 950  FORMAT(F12.7,2x,E10.3,2x,E10.3,2x,E10.3,2x,E10.3,
     $       2x,E10.3,2x,E10.3)
 1150 FORMAT(F2.0,1x,F2.0,1x,F4.0,1x,F2.0,1x,F2.0,4x,F3.1)
      end
The part that is causing me trouble is the loop here:
do i=1, 12
   if (mark(i).EQ.0) then
      go to 750
   endif
   do j = (mark(i)-(n+1)*1440), (mark(i)-n*1440)
      avgP(i,1) = avgP(i,1) + proa(j)
      avgP(i,2) = avgP(i,2) + prob(j)
      avgP(i,3) = avgP(i,3) + w1(j)
      avgP(i,4) = avgP(i,4) + w2(j)
      avgP(i,5) = avgP(i,5) + w3(j)
      avgP(i,6) = avgP(i,6) + w4(j)
   enddo
   do g = (mark(i)+n*1440), (mark(i)+(n+1)*1440)
      avgA(i,1) = avgA(i,1) + proa(g)
      avgA(i,2) = avgA(i,2) + prob(g)
      avgA(i,3) = avgA(i,3) + w1(g)
      avgA(i,4) = avgA(i,4) + w2(g)
      avgA(i,5) = avgA(i,5) + w3(g)
      avgA(i,6) = avgA(i,6) + w4(g)
   enddo
enddo
Using gdb, I found out that the 'j' loop is causing trouble. All of the parameters are fine, but every time I execute the program, the 'j' loop runs only once. The peculiar thing is that as 'i' increases, the statements start to fail one by one: at i = 1 the loop executes fine; at i = 2, avgP(i,6) = avgP(i,6) + w4(j) causes the seg fault; at i = 3, avgP(i,5) = avgP(i,5) + w3(j) causes the seg fault; and by i = 7 the entire loop fails. What a strange error! I hope I can get some help with this.
The loop
do j=1, 12
   if (int(nmonth(i)).EQ.(13-j)) then
      tst(i) = tst(i) + D(12-j)
      nmonth(i) = nmonth(i)-1
   endif
enddo
potentially tries to access D(0) when j = 12; however, D is dimensioned 1:12, so who knows what value is being read and added into tst, and what the consequences are.
This should be caught by turning on bounds checking.
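With gfortran, for example, that would be (the source-file name here is hypothetical; Intel Fortran has the equivalent -check bounds):

gfortran -g -fcheck=bounds single_event.f -o single_event

The program will then abort with the offending array name, index, and source line instead of a bare segmentation fault.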