How to parallelise a nested loop - C++

This is part of a C++ code for solving a problem in computational mathematics of large dimension, say more than 100000 variables. I'd like to parallelise it using OpenMP. What is the best way of parallelising the following nested loop with OpenMP?
e = 0;
// m and n are big numbers, 200000 - 10000000
int i,k,r,s,t;
// hpk,hqk,pk_x0,n2pk_x0,dk,sk are double and declared before.
for (k=0; k<m; k++)
{
hpk = 0;
hqk = 0;
n2pk_x0 = 0;
dk = 0;
sk = 0;
for (int i=0; i<n; i++)
{
if (lamb[i] <= lam[k])
{
if (h[i]<0)
{
pk[i] = xu[i];
}
else if (h[i]>0)
{
pk[i] = xl[i];
}
qk[i] = 0;
}
else
{
pk[i] = x0[i];
qk[i] = -h[i];
}
hpk += h[i]*pk[i];
hqk += h[i]*qk[i];
pk_x0 = pk[i]-x0[i];
n2pk_x0 += pk_x0*pk_x0;
dk += pk_x0*qk[i];
sk += qk[i]*qk[i];
}
//}//p
/* ------- Compute ak, bk, ck, dk and sk to construct e(lam) -------- */
ak = - (gamma + hpk);
bk = - hqk;
ck = q0 + 0.5 * n2pk_x0;
sk = 0.5 * sk;
// some calculation based on index k
} // end of first for
I followed some of the advice and made the local variables in the nested loop private. The CPU time decreased by a factor of 2, but the output is not correct! Is there any way to improve the code so that it gives the correct result with less CPU time? (In the nested loop, if we set m=1, the output is correct, but for m>1 the output is incorrect.)
This is the whole code:
static void subboconcpp(
double u[],
double *Egh,
double h[],
double gamma,
double x0[],
double q0,
double xl[],
double xu[],
int dim
)
{
int n, m, infinity = INT_MAX, i, k, r, s, t;
double e;
double hpk, hqk, dk1, sk1, n2pk_x0;
double ak, bk, ck, dk, sk;
double lam_hat, phik, ek1, ek2;
double *pk = new double[dim];
double *qk = new double[dim];
double *lamb = new double[dim];
double *lamb1 = new double[dim];
double *lam = new double[dim];
/* ------------------ Computing lambl(i) and lambu(i) ------------------ */
/* n is the length of x0 */
n = dim;
#pragma omp parallel for shared(n,h,x0,xl,xu)//num_threads(8)
for (int i=0; i<n; i++)
{
double lamb_flag;
if (h[i] > 0)
{
lamb_flag = (x0[i] - xl[i])/h[i];
lamb[i] = lamb_flag;
lamb1[i] = lamb_flag;
}
else if (h[i] < 0)
{
lamb_flag = (x0[i] - xu[i])/h[i];
lamb[i] = lamb_flag;
lamb1[i] = lamb_flag;
}
//cout << "lamb:" << lamb[i];
}
/* --------------------------------------------------------------------- */
/* ----------------- Sorting lamb and constructing lam ----------------- */
/* lamb = sort(lamb,1); */
sort(lamb1, lamb1+n);
int q = 0;
double lam_flag = 0;
#pragma omp parallel for shared(n) firstprivate(q) lastprivate(m)
for (int j=0; j<n; j++)
{
if (lamb1[j] > lam_flag)
{
lam_flag = lamb1[j];
q = q + 1;
lam[q] = lam_flag;
//cout << "lam: \n" << lam[q];
}
if (j == n-1)
{
if (lam_flag < infinity)
{
m = q+1;
lam[m] = + infinity;
}
else
{
m = q;
}
}
//cout << "q: \n" << q;
}
/* --------------------------------------------------------------------- */
/* -- Finding the global maximizer of e(lam) for lam in[-inf, + inf] -- */
e = 0;
#pragma omp parallel shared(m,n,h,x0,xl,xu,lamb,lam) \
private(i,r,s,t,hpk, hqk, dk1, sk1, n2pk_x0,ak, bk, ck, dk, sk,lam_hat, phik, ek1, ek2)
{
#pragma omp for nowait
for (k=0; k<1; k++)
{
/*double hpk=0, hqk=0, dk1=0, sk1=0, n2pk_x0=0;
double ak, bk, ck, dk, sk;
double lam_hat, phik, ek1, ek2;
double *pk = new double[dim];
double *qk = new double[dim];*/
hpk = 0;
hqk = 0;
n2pk_x0 = 0;
dk1 = 0;
sk1 = 0;
for (int i=0; i<n; i++)
{
double pk_x0;
if (lamb[i] <= lam[k])
{
if (h[i]<0)
{
pk[i] = xu[i];
}
else if (h[i]>0)
{
pk[i] = xl[i];
}
qk[i] = 0;
}
else
{
pk[i] = x0[i];
qk[i] = -h[i];
}
hpk += h[i]*pk[i];
hqk += h[i]*qk[i];
pk_x0 = pk[i]-x0[i];
n2pk_x0 += pk_x0*pk_x0;
dk1 += pk_x0*qk[i];
sk1 += qk[i]*qk[i];
}
/* ------- Compute ak, bk, ck, dk and sk to construct e(lam) -------- */
ak = - (gamma + hpk);
bk = - hqk;
ck = q0 + 0.5 * n2pk_x0;
dk = dk1;
sk = 0.5 * sk1;
/* ----------------------------------------------------------------- */
/* - Finding the global maximizer of e(lam) for [lam(k), lam(k+1)] - */
/* --------------------- using Proposition 4 ----------------------- */
if (bk != 0)
{
double w = ak*ak - bk*(ak*dk - bk*ck)/sk;
if (w == 0)
{
lam_hat = -ak / bk;
phik = 0;
}
else
{
double w = ak*ak - bk*(ak*dk - bk*ck)/sk;
lam_hat = (-ak + sqrt(w))/bk;
phik = bk / (2*sk*lam_hat + dk);
}
}
else
{
if (ak > 0)
{
lam_hat = -dk / (2 * sk);
phik = 4*ak*sk / (4*ck*sk + (sk - 2)*(dk*dk));
}
else
{
lam_hat = + infinity;
phik = 0;
}
}
/* ----------------------------------------------------------------- */
/* --- Checking the feasibility of the solution of Proposition 4 --- */
if (lam[k] <= lam_hat && lam_hat <= lam[k + 1])
{
if (phik > e)
{
for (r=0; r<n; r++)
{
u[r] = pk[r] + lam_hat * qk[r];
}
e = phik;
}
}
else
{
ek1 = (ak+bk*lam[k])/(ck+(dk+sk*lam[k])*lam[k]);
ek2 = (ak+bk*lam[k+1])/(ck+(dk+sk*lam[k+1])*lam[k+1]);
if (ek1 >= ek2)
{
lam_hat = lam[k];
if (ek1 > e)
{
for (s=0; s<n;s++)
{
u[s] = pk[s] + lam_hat * qk[s];
}
e = ek1;
}
}
else
{
lam_hat = lam[k + 1];
if (ek2 > e)
{
for (t=0; t<n;t++)
{
u[t] = pk[t] + lam_hat * qk[t];
}
e = ek2;
}
}
}
/* ------------------------------------------------------------------ */
}/* ------------------------- End of for (k) --------------------------- */
}//p
/* --------- The global maximizer by searching all m intervals --------- */
*Egh = e;
delete[] pk;
delete[] qk;
delete[] lamb1;
delete[] lamb;
delete[] lam;
return;
/* --------------------------------------------------------------------- */
}
Please note that the first two parallel loops work well; it is just the output of the nested loop that is incorrect.
Any suggestion or comment is appreciated.

The outermost loop: I do not know all the code, but it looks like the variables hpk, hqk, n2pk_x0, dk, sk should be private. If you do not specify them as private, it will break correctness.
OpenMP is not always very good at nested parallelism. Depending on the OpenMP settings, nested loops can create p*p threads, where p is the default concurrency of your machine, and such heavy oversubscription may lead to significant performance degradation. In most cases it is OK to parallelise the outermost loop and leave the nested loops serial.
One of the reasons for parallelising nested loops is to achieve better work balancing. But your case seems to have balanced work, so you should not face a work-balancing problem if you parallelise only the outermost loop.
But if you still want to parallelise both loops, may I suggest using Intel TBB instead of OpenMP? You can use tbb::parallel_for for the outermost loop and tbb::parallel_reduce for the nested one. Intel TBB uses one thread pool for all its algorithms, so it will not lead your application into oversubscription.
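For illustration, a rough sketch of that structure (this is my outline, not something tested against the question's data; process_intervals, the placeholder inner computation and the per-k handling of pk are assumptions):
#include <functional>
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <tbb/parallel_reduce.h>

// Sketch: outer parallel_for over k, inner parallel_reduce over i.
// In the real code pk/qk would have to be per-k (or thread-local) buffers,
// since every k rewrites them.
void process_intervals(int m, int n, const double* h, const double* pk)
{
    tbb::parallel_for(tbb::blocked_range<int>(0, m),
        [&](const tbb::blocked_range<int>& rk) {
            for (int k = rk.begin(); k != rk.end(); ++k) {
                double hpk = tbb::parallel_reduce(
                    tbb::blocked_range<int>(0, n), 0.0,
                    [&](const tbb::blocked_range<int>& ri, double acc) {
                        for (int i = ri.begin(); i != ri.end(); ++i)
                            acc += h[i] * pk[i];   // stands in for the real inner body
                        return acc;
                    },
                    std::plus<double>());
                (void)hpk; // ... the per-k computation would use hpk here ...
            }
        });
}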
[updated] Some parallelisation advice:
Until you achieve correctness, the execution time does not mean anything, since a correctness fix can change it significantly (even for the better in some cases);
Do not try to parallelise everything at once: parallelise loop by loop. It will be easier to see where correctness breaks;
Do not modify shared variables concurrently. If you really need to, rethink your algorithm and use special constructs such as reductions, atomic operations, locks/mutexes/semaphores and so on (see the reduction sketch after this list).
Be careful when writing to shared arrays with privately computed indices, since different threads may end up with the same index.
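To illustrate the reduction point: if the inner loop's running sums are ever computed by multiple threads, they must appear in a reduction clause rather than be updated as plain shared variables. A sketch based on the question's inner loop (treating the surrounding arrays and lam[k] as given; the writes to pk[i]/qk[i] are safe because each i is handled by exactly one thread):
double hpk = 0, hqk = 0, n2pk_x0 = 0, dk1 = 0, sk1 = 0;
#pragma omp parallel for reduction(+:hpk,hqk,n2pk_x0,dk1,sk1)
for (int i = 0; i < n; i++)
{
    if (lamb[i] <= lam[k])
    {
        if (h[i] < 0)      pk[i] = xu[i];
        else if (h[i] > 0) pk[i] = xl[i];
        qk[i] = 0;
    }
    else
    {
        pk[i] = x0[i];
        qk[i] = -h[i];
    }
    double pk_x0 = pk[i] - x0[i];
    hpk     += h[i] * pk[i];      // each sum is a private copy per thread,
    hqk     += h[i] * qk[i];      // combined by OpenMP at the end of the loop
    n2pk_x0 += pk_x0 * pk_x0;
    dk1     += pk_x0 * qk[i];
    sk1     += qk[i] * qk[i];
}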

I think your idea of nested parallelisation does not fit the OpenMP mindset very well. Although nested parallelism can be achieved in OpenMP, it brings more complications than necessary. Typically in OpenMP you only parallelise a single loop at a time.
Parallelisation should be done at the level with the fewest interleaving dependencies. Often this turns out to be the top level. In your particular case this is true as well, as the steps in the outer loop are not strongly coupled.
I don't know what the rest of your code does, especially what happens to the values of hpk, hqk, n2pk_x0, dk and sk. In principle, all you have to do is add #pragma omp parallel for to the outer loop.
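To make that concrete, here is a minimal, self-contained pattern for the outer loop, simplified away from the question's formulas (so the arithmetic is only a stand-in): every per-k accumulator lives inside the loop body, each thread gets its own pk/qk work buffers, and the only genuinely shared state - the running best score and the output vector u - is updated inside a critical section. Applying the same structure to the posted code means privatising hpk, hqk, n2pk_x0, dk1, sk1, ak, bk, ck, dk, sk, lam_hat, phik, ek1, ek2, giving each thread its own pk/qk, and guarding every place that writes e and u.
#include <vector>
#include <omp.h>

// Pattern only: each outer iteration k computes a score and a candidate vector;
// the best candidate wins. Per-iteration state is private, the running best is
// guarded by a critical section.
void best_candidate(int m, int n, const double* h, double* u)
{
    double best_score = 0.0;                       // shared: the running maximum
    #pragma omp parallel
    {
        std::vector<double> pk(n), qk(n);          // per-thread work buffers
        #pragma omp for
        for (int k = 0; k < m; ++k)
        {
            double hpk = 0.0;                      // per-iteration accumulator
            for (int i = 0; i < n; ++i)            // inner loop stays serial
            {
                pk[i] = h[i] * (k + 1);            // stand-in for the real update
                qk[i] = -h[i];
                hpk  += h[i] * pk[i];
            }
            double score = hpk;                    // stands in for phik/ek1/ek2

            #pragma omp critical                   // compare-and-update must be atomic
            {
                if (score > best_score)
                {
                    best_score = score;
                    for (int i = 0; i < n; ++i)
                        u[i] = pk[i] + qk[i];
                }
            }
        }
    }
}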

Related

Loops optimization

I have a loop with an inner loop inside it. How can I optimise it to reduce execution time, for example by avoiding repeated memory accesses to the same data and by avoiding as many additions and multiplications as possible?
int n,m,x1,y1,x2,y2,cnst;
int N = 9600;
int M = 1800;
int temp11,temp12,temp13,temp14;
int temp21,temp22,temp23,temp24;
int *arr1 = new int [32000]; // suppose it's already filled
int *arr2 = new int [32000];// suppose it's already filled
int sumFirst = 0;
int maxFirst = 0;
int indexFirst = 0;
int sumSecond = 0;
int maxSecond = 0;
int indexSecond = 0;
int jump = 2400;
for( n = 0; n < N; n++)
{
temp14 = 0;
temp24 = 0;
for( m = 0; m < M; m++)
{
x1 = m + cnst;
y1 = m + n + cnst;
temp11 = arr1[x1];
temp12 = arr2[y1];
temp13 = temp11 * temp12;
temp14+= temp13;
x2 = m + cnst + jump;
y2 = m + n + cnst + jump;
temp21 = arr1[x2];
temp22 = arr2[y2];
temp23 = temp21 * temp22;
temp24+= temp23;
}
sumFirst += temp14;
if (temp14 > maxFirst)
{
maxFirst = temp14;
indexFirst = m;
}
sumSecond += temp24;
if (temp24 > maxSecond)
{
maxSecond = temp24;
indexSecond = n;
}
}
// At the end we use sum , index and max for first and second;
You are multiplying array elements and accumulating the result.
This can be optimized by:
SIMD (doing multiple operations in a single CPU step)
Parallel execution (using multiple physical/logical CPUs at once)
Look for a CPU-specific SIMD way of doing this; for example, _mm_mul_epi32 from SSE4.1 can possibly be used on x86-64. Before trying to write your own SIMD version with compiler intrinsics, make sure the compiler doesn't already do it for you.
As for parallel execution, look into OpenMP, or the C++17 parallel algorithms (e.g. std::reduce / std::transform_reduce with an execution policy).
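As a sketch of the OpenMP route for the code above (reusing the question's variables; the reduction clause makes the per-thread partial sums explicit, and hoisting the common index expressions removes the duplicated additions):
int temp14 = 0, temp24 = 0;
#pragma omp parallel for reduction(+:temp14,temp24)
for (int m = 0; m < M; m++)
{
    // m + cnst and m + n + cnst are shared by both streams, so compute them once
    const int base1 = m + cnst;
    const int base2 = m + n + cnst;
    temp14 += arr1[base1]        * arr2[base2];
    temp24 += arr1[base1 + jump] * arr2[base2 + jump];
}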

Dot product between 2 Mx3 matrices in Eigen C++ (similar to np.dot(x.transpose, y))

I'm trying to find the fastest way to compute an MxN matrix of dot products in C++ using Eigen.
3rd Update: This is part of a Maya 2018 plugin (for which I'm using Visual Studio 2017 with the v141 toolset, which is apparently what they recommend to work properly with their devkit).
Let's say we have 2 matrices:
A(3,5812) and B(3,23686)
What I'm looking for is achieved in NumPy using:
result = np.dot(A.T, B) which creates a result matrix of 5812 x 23686
np.dot takes 394 ms
So my progress so far:
Loop in loop method:
for (int i = 0; i < (int)A.cols(); i++) {
for (int j = 0; j < (int)B.cols(); j++) {
result(i, j) = A.col(i).cwiseProduct(B.col(j)).sum();
}
}
Process completed in: 891 ms
and using Eigen broadcasting:
for (int i = 0; i < (int)A.cols(); i++) {
result.row(i) = A.col(i).replicate(1, B.cols()).cwiseProduct(B).colwise().sum();
}
Process completed in: 844 ms.
Using Eigen's matrix product:
result = A.transpose() * B
Process completed in: 690 ms (Update 3: 170 ms).
I used OpenMP to multithread the loops, but I removed that from the code above for readability.
There has to be a faster way to do this properly, maybe without using loops at all? I've been searching for a couple of days now... So, in the end, I decided to post here...
Thanks!
Second Update:
Managed to reduce the time to 170 ms using A.transpose() * B with OpenBLAS, and using MatrixXf instead of MatrixXd, since for this particular implementation that precision is fine... still working on it, trying various optimizations...
Third Update:
So for the sake of making things more clear I will post the function I'm currently working on...
Python version:
def getCorrespondences(X, Y, Cx, Cy, Rx):
    timer = time.time()
    X_ = np.dot(Rx, X - Cx)
    Y_ = Y - Cy
    ab = np.dot(X_.T, Y_)  # each cell is X_i dot Y_j
    xx = np.sum(X_*X_, 0)
    yy = np.sum(Y_*Y_, 0)
    D = (xx[:, np.newaxis] + yy[np.newaxis, :]) - 2*ab
    idx = np.argmin(D, 1)
    elapsed_time = time.time() - timer
    print (elapsed_time)
    return idx
Python computes this in 1.377s
And my current C++ implementation:
DWORD tcStart1 = GetTickCount();
X.transposeInPlace();
Y.transposeInPlace();
Cx.transposeInPlace();
Cy.transposeInPlace();
Rx.transposeInPlace();
int vtxCountX = (int)X.cols();
int vtxCountY = (int)Y.cols();
MatrixXf X_ = Rx * (X - Cx.col(0).replicate(1, vtxCountX));
MatrixXf Y_ = Y - Cy.col(0).replicate(1, vtxCountY);
MatrixXf ab = X_.transpose() * Y_;
MatrixXf xx = X_.cwiseProduct(X_).colwise().sum();
MatrixXf yy = Y_.cwiseProduct(Y_).colwise().sum();
MatrixXf D(vtxCountX, vtxCountY);
MatrixXf::Index index;
VectorXi idx(D.rows());
#ifdef _OPENMP
int a, b;
if (vtxCountX > vtxCountY) { b = threads / 4; a = threads - b; }
else { a = threads / 4; b = threads - a; }
#pragma omp parallel for num_threads(a)
#endif
for (int i = 0; i < vtxCountX; i++) {
#ifdef _OPENMP
#pragma omp parallel for num_threads(b)
#endif
for (int j = 0; j < vtxCountY; j++) {
D(i, j) = xx(i) + yy(j) - 2 * ab(i,j);
}
D.row(i).minCoeff(&index);
idx(i) = (int)index;
}
DWORD tcTotal1 = GetTickCount() - tcStart1;
//cout << idx.transpose() << endl;
cout << "Correspondence computed in: " << tcTotal1 << endl;
return idx;
This seems to do the job in 906 ms with MatrixXf, and 1297 ms with MatrixXd.
I still need to figure out how I can squeeze more speed out of this, since this function will end up getting fired hundreds of times.
I will try other things proposed in the comments... or if you guys have any other suggestions, I'm all ears :)... I ended up casting to float since the result is a bunch of indices, so I don't need precision... the function basically computes distances and figures out point correspondences, for Procrustes alignment.
I ended up using loops at a certain point instead of Eigen replicate and cwise broadcasting, since that seemed to be slower :/ Anyway, I hope this gives more context. Cheers
Update 4...
int vtxCountX = (int)X.rows(); int vtxCountY = (int)Y.rows();
MatrixXf X_ = (X - Cx.row(0).replicate(vtxCountX, 1)) * Rx.transpose();
MatrixXf Y_ = Y - Cy.row(0).replicate(vtxCountY, 1);
MatrixXf ab = Y_* X_.transpose();
MatrixXf xx = X_.cwiseProduct(X_).rowwise().sum();
MatrixXf yy = Y_.cwiseProduct(Y_).rowwise().sum();
MatrixXf D(vtxCountX, vtxCountY);
MatrixXf::Index index;
VectorXi idx(D.rows());
#ifdef _OPENMP
int a, b;
if (vtxCountX > vtxCountY) { b = threads / 4; a = threads - b; }
else { a = threads / 4; b = threads - a; }
#pragma omp parallel for num_threads(a)
#endif
for (int i = 0; i < vtxCountX; i++) {
#ifdef _OPENMP
#pragma omp parallel for num_threads(4)
#endif
for (int j = 0; j < vtxCountY; j++) {
D(i, j) = xx(i) + yy(j) - 2 * ab(j,i);
}
D.row(i).minCoeff(&index);
idx(i) = (int)index;
}
This does the job in 469 ms... 17× faster than my original implementation and about 3 times faster than NumPy...
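One further observation, offered as a sketch rather than a measured result: since only the argmin of each row of D is needed, and the xx(i) term is constant within a row, D never has to be stored at all. The function below is hypothetical (the name and the nx×3 / ny×3 layout of Update 4 are assumptions):
#include <Eigen/Dense>
using Eigen::MatrixXf;
using Eigen::RowVectorXf;
using Eigen::VectorXi;

// For every row of X_ (nx x 3), find the index of the nearest row of Y_ (ny x 3)
// without materialising the full nx x ny distance matrix D.
VectorXi nearestIndices(const MatrixXf& X_, const MatrixXf& Y_)
{
    const int nx = static_cast<int>(X_.rows());
    const MatrixXf ab = X_ * Y_.transpose();                        // nx x ny, one big GEMM
    const RowVectorXf yy = Y_.rowwise().squaredNorm().transpose();  // 1 x ny
    VectorXi idx(nx);
    #pragma omp parallel for
    for (int i = 0; i < nx; ++i) {
        Eigen::Index j;
        // D(i,:) = xx(i) + yy - 2*ab(i,:); xx(i) is the same for the whole row,
        // so it does not affect the argmin and can be dropped.
        (yy - 2.0f * ab.row(i)).minCoeff(&j);
        idx(i) = static_cast<int>(j);
    }
    return idx;
}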

parallel for with OpenMP gets stuck

I have a problem with the following code:
int *chosen_pts = new int[k];
std::pair<float, int> *dist2 = new std::pair<float, int>[x.n];
// initialize dist2
for (int i = 0; i < x.n; ++i) {
dist2[i].first = std::numeric_limits<float>::max();
dist2[i].second = i;
}
// choose the first point randomly
int ndx = 1;
chosen_pts[ndx - 1] = rand() % x.n;
double begin, end;
double elapsed_secs;
while (ndx < k) {
float sum_distribution = 0.0;
// look for the point that is furthest from any center
begin = omp_get_wtime();
#pragma omp parallel for reduction(+:sum_distribution)
for (int i = 0; i < x.n; ++i) {
int example = dist2[i].second;
float d2 = 0.0, diff;
for (int j = 0; j < x.d; ++j) {
diff = x(example,j) - x(chosen_pts[ndx - 1],j);
d2 += diff * diff;
}
if (d2 < dist2[i].first) {
dist2[i].first = d2;
}
sum_distribution += dist2[i].first;
}
end = omp_get_wtime() - begin;
std::cout << "center assigning -- "
<< ndx << " of " << k << " = "
<< (float)ndx / k * 100
<< "% is done. Elasped time: "<< (float)end <<"\n";
/**/
bool unique = true;
do {
// choose a random interval according to the new distribution
float r = sum_distribution * (float)rand() / (float)RAND_MAX;
float sum_cdf = dist2[0].first;
int cdf_ndx = 0;
while (sum_cdf < r) {
sum_cdf += dist2[++cdf_ndx].first;
}
chosen_pts[ndx] = cdf_ndx;
for (int i = 0; i < ndx; ++i) {
unique = unique && (chosen_pts[ndx] != chosen_pts[i]);
}
} while (! unique);
++ndx;
}
As you can see, I use OpenMP to parallelise the for loop. It works fine and I can achieve a significant speed-up. However, if I increase the value of x.n above 20000000, the function stops working after 8-10 loops:
It doesn't produce any output (std::cout)
Only one core works
No error whatsoever
If I comment out the do-while loop, it works again as expected: all cores are busy, there is output after each iteration, and I can increase x.n above 100 million, just as I need.
It's not the OpenMP parallel for that is getting stuck; the problem is obviously in your serial do-while loop.
One particular issue that I see is that there are no array boundary checks in the inner while loop accessing dist2. In theory, an out-of-bounds access should never happen; but in practice it may - see below for why. So first of all I would rewrite the calculation of cdf_ndx to guarantee that the loop ends once all elements have been inspected:
float sum_cdf = 0;
int cdf_ndx = 0;
while (sum_cdf < r && cdf_ndx < x.n ) {
sum_cdf += dist2[cdf_ndx].first;
++cdf_ndx;
}
Now, how can it happen that sum_cdf does not reach r? It is due to the specifics of floating-point arithmetic and the fact that sum_distribution was computed in parallel, while sum_cdf is computed serially. The problem is that the contribution of one element to the sum can be below the accuracy of floats; in other words, when you sum two float values that differ by more than ~8 orders of magnitude, the smaller one does not affect the sum.
So, with 20M floats, after some point it might happen that the next value to add is so small compared to the accumulated sum_cdf that adding it does not change the sum at all! On the other hand, sum_distribution was essentially computed as several independent partial sums (one per thread) that were then combined. Thus it is more accurate, and possibly bigger than sum_cdf can ever reach.
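The effect is easy to reproduce in isolation (a tiny standalone demonstration, not part of the question's code):
#include <iostream>

int main()
{
    float sum  = 5e8f;   // a large accumulated value
    float tiny = 10.0f;  // a contribution ~8 orders of magnitude smaller
    // The addition is rounded away: tiny is below half an ULP of sum.
    std::cout << (sum + tiny == sum) << "\n";  // prints 1
}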
A solution can be to compute sum_cdf in portions, having two nested loops. For example:
float sum_cdf = 0;
int cdf_ndx = 0;
while (sum_cdf < r && cdf_ndx < x.n ) {
float block_sum = 0;
int block_end = std::min(cdf_ndx + 10000, x.n); // 10000 is an arbitrarily selected block size
for (int i=cdf_ndx; i<block_end; ++i ) {
block_sum += dist2[i].first;
if( sum_cdf+block_sum >=r ) {
block_end = i; // adjust to correctly compute cdf_ndx
break;
}
}
sum_cdf += block_sum;
cdf_ndx = block_end;
}
And after the loop you need to check that cdf_ndx < x.n, otherwise repeat with a new random interval.

OpenMP generates a segfault in Rcpp code for the SEIR model

I wrote some (probably inefficient, but anyway...) Rcpp code using inline to simulate a stochastic SEIR model.
The serial version compiles and works perfectly, but since I need to simulate from it a large number of times, and since it seems to me like an embarrassingly parallel problem (I just need to simulate again for other parameter values and return a matrix with the results), I tried to add #pragma omp parallel for and to compile with -fopenmp -lgomp, but... boom!
I get a segfault even for very small examples!
I tried adding setenv("OMP_STACKSIZE","24M",1); and values well over 24M, but the segfault still happens.
I'll briefly explain the code, since it's a bit long (I tried to shorten it, but then the result changes and I can't reproduce the problem...):
I have two nested loops; the inner one executes the model for a given parameter set and the outer one changes the parameters.
The only way a race condition could happen is if the code were trying to execute the set of instructions inside the inner loop in parallel (which cannot be done because of the model structure: iteration t depends on iteration t-1) rather than parallelising the outer loop; but if I'm not mistaken, parallelising only the outer loop is what the parallel for construct does by default when placed just outside it...
This is basically the form of the code I'm trying to run:
mat result(n_param, T_MAX);
#pragma omp parallel for
for (int i = 0; i < n_param_set; i++) {
    int t = 0;
    rowvec jnk(T_MAX);
    while (t < T_MAX) {
        ...
        jnk(t) = something(jnk(t-1));
        ...
        t++;
    }
    result.row(i) = jnk;
}
return wrap(result);
And my question is: how do I tell the compiler that I only want to compute the outer loop in parallel (even distributing the iterations statically, like n_loops/n_threads per thread), and not the inner one (which is actually non-parallelisable)?
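For what it's worth, #pragma omp parallel for only distributes the iterations of the loop that immediately follows it; whatever runs inside one iteration (including the while loop) stays serial within the thread that owns that iteration. A minimal sketch of the intended structure, with the static split mentioned above (initial_value and something are placeholders, as in the pseudo-code):
#pragma omp parallel for schedule(static)   // even n_loops/n_threads split
for (int i = 0; i < n_param_set; i++) {
    rowvec jnk(T_MAX);                      // declared inside => private per iteration
    jnk(0) = initial_value(i);              // hypothetical initialisation
    for (int t = 1; t < T_MAX; t++)         // runs serially inside this iteration
        jnk(t) = something(jnk(t - 1));
    result.row(i) = jnk;                    // each i writes a distinct row => no race
}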
The real code is a bit more involved, and I'll present it here for the sake of reproducibility if you're really willing to dig in, but I'm only asking about the behaviour of OpenMP. Please note that the only OpenMP directive appears at line 122.
library(Rcpp);library(RcppArmadillo);library(inline)
misc='
#include <math.h>
#define _USE_MATH_DEFINES
#include <omp.h>
using namespace arma;
template <typename T> int sgn(T val) {
return (T(0) < val) - (val < T(0));
}
uvec rmultinomial(int n,vec prob)
{
int K = prob.n_elem;
uvec rN = zeros<uvec>(K);
double p_tot = sum(prob);
double pp;
for(int k = 0; k < K-1; k++) {
if(prob(k)>0) {
pp = prob[k] / p_tot;
rN(k) = ((pp < 1.) ? (rbinom(1,(double) n, pp))(0) : n);
n -= rN[k];
} else
rN[k] = 0;
if(n <= 0) /* we have all*/
return rN;
p_tot -= prob[k]; /* i.e. = sum(prob[(k+1):K]) */
}
rN[K-1] = n;
return rN;
}
'
model_and_summary='
mat SEIR_sim_plus_summaries()
{
vec alpha;
alpha << 0.002 << 0.0045;
vec beta;
beta << 0.01 << 0.01;
vec gamma;
gamma << 1.0/14.0 << 1.0/14.0;
vec sigma;
sigma << 1.0/(3.5) << 1.0/(3.5);
vec phi;
phi << 0.8 << 0.8;
int S_0 = 800;
int E_0 = 100;
int I_0 = 100;
int R_0 = 0;
int pop = 1000;
double tau = 0.01;
double t_0 = 0;
vec obs_time;
obs_time << 1 << 2 << 3 << 4 << 5 << 6 << 7 << 8 << 9 << 10 << 11 << 12 << 13 << 14 << 15 << 16 << 17 << 18 << 19 << 20 << 21 << 22 << 23 << 24;
const int n_obs = obs_time.n_elem;
const int n_part = alpha.n_elem;
mat stat(n_part,6);
//#pragma omp parallel for
for(int k=0;k<n_part;k++) {
ivec INC_i(n_obs);
ivec INC_o(n_obs);
// Event variables
double alpha_t;
int nX; //current number of people moving
vec rates(8);
uvec trans(4); // current transitions, e.g. from S to E,I,R,Universe
vec r(4); // rates e.g. from S to E, I, R, Univ.
/*********************** Initialize **********************/
int S_curr = S_0;
int S_prev = S_0;
int E_curr = E_0;
int E_prev = E_0;
int I_curr = I_0;
int I_prev = I_0;
int R_curr = R_0;
int R_prev = R_0;
int IncI_curr = 0;
int IncI_prev = 0;
int IncO_curr = 0;
int IncO_prev = 0;
double t_curr = t_0;
int t_idx =0;
while( t_idx < n_obs ) {
// next time preparation
t_curr += tau;
S_prev = S_curr;
E_prev = E_curr;
I_prev = I_curr;
R_prev = R_curr;
IncI_prev = IncI_curr;
IncO_prev = IncO_curr;
/*********************** description (rates) of the events **********************/
alpha_t = alpha(k)*(1+phi(k)*sin(2*M_PI*(t_curr+0)/52)); //real contact rate, time expressed in weeks
rates(0) = (alpha_t * ((double)I_curr / (double)pop ) * ((double)S_curr)); //e+1, s-1, r,i one S gets infected (goes to E, not yet infectious)
rates(1) = (sigma(k) * E_curr); //e-1, i+1, r,s one exposed becomes infectious (goes to I) INCIDENCE!!
rates(2) = (gamma(k) * I_curr); //i-1, s,e, r+1 one I recovers
rates(3) = (beta(k) * I_curr); //i-1, s, r,e one i dies
rates(4) = (beta(k) * R_curr); //i,e, s, r-1 one r dies
rates(5) = (beta(k) * E_curr); //e-1, s, r,i one e dies
rates(6) = (beta(k) * S_curr); //s-1 e, i ,r one s dies
rates(7) = (beta(k) * pop); //s+1 one susc is born
// Let the events occur
/*********************** S compartement **********************/
if((rates(0)+rates(6))>0){
nX = rbinom(1,S_prev,1-exp(-(rates(0)+rates(6))*tau))(0);
r(0) = rates(0)/(rates(0)+rates(6)); r(1) = 0.0; r(2) = 0; r(3) = rates(6)/(rates(0)+rates(6));
trans = rmultinomial(nX, r);
S_curr -= nX;
E_curr += trans(0);
I_curr += trans(1);
R_curr += trans(2);
//trans(3) contains dead individual, who disappear...we could avoid this using sequential conditional binomial
}
/*********************** E compartement **********************/
if((rates(1)+rates(5))>0){
nX = rbinom(1,E_prev,1-exp(-(rates(1)+rates(5))*tau))(0);
r(0) = 0.0; r(1) = rates(1)/(rates(1)+rates(5)); r(2) = 0.0; r(3) = rates(5)/(rates(1)+rates(5));
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr -= nX;
I_curr += trans(1);
R_curr += trans(2);
IncI_curr += trans(1);
}
/*********************** I compartement **********************/
if((rates(2)+rates(3))>0){
nX = rbinom(1,I_prev,1-exp(-(rates(2)+rates(3))*tau))(0);
r(0) = 0.0; r(1) = 0.0; r(2) = rates(2)/(rates(2)+rates(3)); r(3) = rates(3)/(rates(2)+rates(3));
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr += trans(1);
I_curr -= nX;
R_curr += trans(2);
IncO_curr += trans(2);
}
/*********************** R compartement **********************/
if(rates(4)>0){
nX = rbinom(1,R_prev,1-exp(-rates(4)*tau))(0);
r(0) = 0.0; r(1) = 0.0; r(2) = 0.0; r(3) = rates(4)/rates(4);
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr += trans(1);
I_curr += trans(2);
R_curr -= nX;
}
/*********************** Universe **********************/
S_curr += pop - (S_curr+E_curr+I_curr+R_curr); //it should be poisson, but since the pop is fixed...
/*********************** Save & Continue **********************/
// Check if the time is interesting for us
if(t_curr > obs_time[t_idx]){
INC_i(t_idx) = IncI_curr;
INC_o(t_idx) = IncO_curr;
IncI_curr = IncI_prev = 0;
IncO_curr = IncO_prev = 0;
t_idx++;
}
//else just go on...
}
/*********************** Finished - Starting w/ stats **********************/
// INC_i is the useful variable; how can I change its reference without copying it?
ivec incidence = INC_i; //just so if I want to use INC_o i have to change just this...
//Scan the epidemics to recover the summary stats (naively divide the data each 52 weeks)
double n_years = ceil((double)obs_time(n_obs-1)/52.0);
vec mu_attack(n_years);
vec ratio_attack(n_years-1);
vec peak(n_years);
vec atk(52);
peak(0)=0.0;
vec tmpExplo(52); //explosiveness
vec explo(n_years);
int year=0;
int week;
for(week=0 ; week<n_obs ; week++){
if(week - 52*year > 51){
mu_attack(year) = sum( atk )/(double)pop;
if(year>0)
ratio_attack(year-1) = mu_attack(year)/mu_attack(year-1);
for(int i=0;i<52;i++){
if(atk(i)>(peak(year)/2.0)){
tmpExplo(i) = 1.0;
} else {
tmpExplo(i) = 0.0;
}
}
explo(year) = sum(tmpExplo);
year++;
peak(year)=0.0;
}
atk(week-52*year) = incidence(week);
if( peak(year) < incidence(week) )
peak(year)=incidence(week);
}
if(week - 52*year > 51){
mu_attack(year) = sum( atk )/(double)pop;
} else {
ivec idx(52);
for(int i=0;i<52;i++)
{ idx(i) = i; } //take just the updated ones...
vec tmp = atk.elem(find(idx<(week - 52*year)));
mu_attack(year) = sum( tmp )/((double)pop * (tmp.n_elem/52.0));
ratio_attack(year-1) = mu_attack(year)/mu_attack(year-1);
for(int i=0;i<tmp.n_elem;i++){
if(tmp(i)>(peak(year)/2.0)){
tmpExplo(i) = 1.0;
} else {
tmpExplo(i) = 0.0;
}
}
for(int i=tmp.n_elem;i<52;i++)
tmpExplo(i) = 0.0; //to reset the others
explo(year) = sum(tmpExplo);
}
double correlation2;
double correlation4;
vec autocorr = acf(peak);
/***** ACF *****/
if(n_years<3){
correlation2=0.0;
correlation4=0.0;
} else {
if(n_years<5){
correlation2 = autocorr(1);
correlation4 = 0.0;
} else {
correlation2 = autocorr(1);
correlation4 = autocorr(3);
}
}
rowvec jnk(6);
jnk << sum(mu_attack)/(year+1.0)
<< (sum( log(ratio_attack)%log(ratio_attack) )/(n_years-1)) - (pow(sum( log(ratio_attack) )/(n_years-1),2))
<< correlation2 << correlation4 << max(peak) << sum(explo)/n_years;
stat.row(k) = jnk;
}
return stat;
}
'
main='
std::cout << "max_num_threads " << omp_get_max_threads() << std::endl;
RNGScope scope;
mat summaries = SEIR_sim_plus_summaries();
return wrap(summaries);
'
plug = getPlugin("RcppArmadillo")
## modify the plugin for Rcpp to support OpenMP
plug$env$PKG_CXXFLAGS <- paste('-fopenmp', plug$env$PKG_CXXFLAGS)
plug$env$PKG_LIBS <- paste('-fopenmp -lgomp', plug$env$PKG_LIBS)
SEIR_sim_summary = cxxfunction(sig=signature(),main,settings=plug,inc = paste(misc,model_and_summary),verbose=TRUE)
SEIR_sim_summary()
Thanks for the help!
NB: before you ask, I slightly modified the Rcpp multinomial sampling function just because I liked that way more than the one using pointer...not any other particular reason! :)
The core pseudo-random number generators (PRNGs) in R are not designed to be used in multithreaded environments. That is, their state is stored in a static array (dummy from src/main/PRNG.c) and is therefore shared among all threads. Moreover, several other static structures are used to store state for the higher-level interfaces to the core PRNGs.
A possible solution is to put each call to rnorm() or the other sampling functions inside named critical sections, all with the same name, e.g.:
...
#pragma omp critical(random)
rN(k) = ((pp < 1.) ? (rbinom(1,(double) n, pp))(0) : n);
...
if((rates(0)+rates(6))>0){
#pragma omp critical(random)
nX = rbinom(1,S_prev,1-exp(-(rates(0)+rates(6))*tau))(0);
...
Note that the critical construct operates on the structured block following it and therefore locks the entire statement. If a random number is being drawn inline inside a call to a time-consuming function, e.g.
#pragma omp critical(random)
x = slow_computation(rbinom(...));
this is better transformed to:
#pragma omp critical(random)
rb = rbinom(...);
x = slow_computation(rb);
That way only the rb = rbinom(...); statement will be protected.
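An alternative worth mentioning, though it goes beyond this answer: skip R's PRNG inside the parallel region entirely and give each thread its own C++ engine from <random>. A sketch only (it is a fragment reusing the question's variables, and the results are no longer reproducible via R's set.seed()):
// Needs <random>, <cmath> and <omp.h>.
#pragma omp parallel
{
    // One independently seeded engine per thread, so no draw needs a lock.
    std::mt19937 gen(12345u + 1000u * (unsigned)omp_get_thread_num());
    #pragma omp for
    for (int k = 0; k < n_part; k++) {
        // e.g. the S-compartment draw, replacing rbinom(1, S_prev, p)(0):
        double p = 1.0 - std::exp(-(rates(0) + rates(6)) * tau);
        std::binomial_distribution<int> bin(S_prev, p);
        int nX = bin(gen);
        // ... rest of the iteration body unchanged ...
    }
}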

Red-Black Gauss-Seidel and OpenMP

I was trying to prove a point with OpenMP compared to MPICH, and I cooked up the following example to demonstrate how easy it is to get high performance with OpenMP.
The Gauss-Seidel iteration is split into two separate sweeps, such that within each sweep every operation can be performed in any order, and there should be no dependency between tasks. So in theory no processor should ever have to wait for another process to perform any kind of synchronization.
The problem I am encountering is that, independent of problem size, I find there is only a weak speed-up with 2 processors, and with more than 2 processors it might even be slower.
For many other linear parallel routines I can obtain very good scaling, but this one is tricky.
My fear is that I am unable to "explain" to the compiler that the operation I perform on the array is thread-safe, so that it cannot be really effective.
See the example below.
Does anyone have any clue how to make this more effective with OpenMP?
void redBlackSmooth(std::vector<double> const & b,
std::vector<double> & x,
double h)
{
// Setup relevant constants.
double const invh2 = 1.0/(h*h);
double const h2 = (h*h);
int const N = static_cast<int>(x.size());
double sigma = 0;
// Setup some boundary conditions.
x[0] = 0.0;
x[N-1] = 0.0;
// Red sweep.
#pragma omp parallel for shared(b, x) private(sigma)
for (int i = 1; i < N-1; i+=2)
{
sigma = -invh2*(x[i-1] + x[i+1]);
x[i] = (h2/2.0)*(b[i] - sigma);
}
// Black sweep.
#pragma omp parallel for shared(b, x) private(sigma)
for (int i = 2; i < N-1; i+=2)
{
sigma = -invh2*(x[i-1] + x[i+1]);
x[i] = (h2/2.0)*(b[i] - sigma);
}
}
Addition:
I have now also tried with a raw-pointer implementation, and it has the same behavior as using the STL container, so it can be ruled out that this is some pseudo-critical behavior coming from the STL.
First of all, make sure that the x vector is aligned to cache boundaries. I did some tests, and I get something like a 100% improvement with your code on my machine (Core Duo) if I force the memory alignment:
double * x;
const size_t CACHE_LINE_SIZE = 256;
posix_memalign( reinterpret_cast<void**>(&x), CACHE_LINE_SIZE, sizeof(double) * N);
Second, you can try to assign more computation to each thread (in this way you can keep cache lines separated), but I suspect that OpenMP already does something like this under the hood, so it may be worthless with large N.
In my case this implementation is much faster when x is not cache-aligned.
const int workGroupSize = CACHE_LINE_SIZE / sizeof(double);
assert(N % workGroupSize == 0); //Need to tweak the code a bit to let it work with any N
const int workgroups = N / workGroupSize;
int j, base , k, i;
#pragma omp parallel for shared(b, x) private(sigma, j, base, k, i)
for ( j = 0; j < workgroups; j++ ) {
base = j * workGroupSize;
for (int k = 0; k < workGroupSize; k+=2)
{
i = base + k + (redSweep ? 1 : 0);
if ( i == 0 || i+1 == N) continue;
sigma = -invh2* ( x[i-1] + x[i+1] );
x[i] = ( h2/2.0 ) * ( b[i] - sigma );
}
}
In conclusion, you definitely have a cache-fighting (false sharing) problem, but given the way OpenMP works (sadly I am not familiar with it), it should be enough to work with properly allocated buffers.
I think the main problem is the type of array structure you are using. Let's try comparing results with vectors and arrays (arrays = C arrays allocated with the new operator).
Vector and array sizes are N = 10000000. I force the smoothing function to repeat in order to keep the runtime > 0.1 s.
Vector Time: 0.121007 Repeat: 1 MLUPS: 82.6399
Array Time: 0.164009 Repeat: 2 MLUPS: 121.945
MLUPS = ((N-2)*repeat/runtime)/1000000 (Million Lattice Point Updates per second)
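Plugging in the numbers above: the vector run gives ((10000000 - 2) * 1 / 0.121007) / 1000000 ≈ 82.64 MLUPS, and the array run gives ((10000000 - 2) * 2 / 0.164009) / 1000000 ≈ 121.95 MLUPS, matching the figures listed.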
MFLOPS is misleading when it comes to grid calculations: a few changes in the basic equation can make the performance look high for the same runtime.
The modified code:
double my_redBlackSmooth(double *b, double* x, double h, int N)
{
// Setup relevant constants.
double const invh2 = 1.0/(h*h);
double const h2 = (h*h);
double sigma = 0;
// Setup some boundary conditions.
x[0] = 0.0;
x[N-1] = 0.0;
double runtime(0.0), wcs, wce;
int repeat = 1;
timing(&wcs);
for(; runtime < 0.1; repeat*=2)
{
for(int r = 0; r < repeat; ++r)
{
// Red sweep.
#pragma omp parallel for shared(b, x) private(sigma)
for (int i = 1; i < N-1; i+=2)
{
sigma = -invh2*(x[i-1] + x[i+1]);
x[i] = (h2*0.5)*(b[i] - sigma);
}
// Black sweep.
#pragma omp parallel for shared(b, x) private(sigma)
for (int i = 2; i < N-1; i+=2)
{
sigma = -invh2*(x[i-1] + x[i+1]);
x[i] = (h2*0.5)*(b[i] - sigma);
}
// cout << "In Array: " << r << endl;
}
if(x[0] != 0) dummy(x[0]);
timing(&wce);
runtime = (wce-wcs);
}
// cout << "Before division: " << repeat << endl;
repeat /= 2;
cout << "Array Time:\t" << runtime << "\t" << "Repeat:\t" << repeat
<< "\tMLUPS:\t" << ((N-2)*repeat/runtime)/1000000.0 << endl;
return runtime;
}
I didn't change anything in the code except than array type. For better cache access and blocking you should look into data alignment (_mm_malloc).