Optimize c++ Monte Carlo simulation with long dynamic arrays - c++

This is my first post here and I am not that experienced, so please excuse my ignorance.
I am building a Monte Carlo simulation in C++ for my PhD and I need help in optimizing its computational time and performance. I have a 3d cube repeated in each coordinate as a simulation volume and inside every cube magnetic particles are generated in clusters. Then, in the central cube a loop of protons are created and move and at each step calculate the total magnetic field from all the particles (among other things) that they feel.
At this moment I define everything inside the main function and because I need the position of the particles for my calculations (I calculate the distance between the particles during their placement and also during the proton movement), I store them in dynamic arrays. I haven't used any class or function,yet. This makes my simulations really slow because I have to use eventually millions of particles and thousands of protons. Even with hundreds it needs days. Also I use a lot of for and while loops and reading/writing to .dat files.
I really need your help. I have spent weeks trying to optimize my code and my project is behind schedule. Do you have any suggestion? I need the arrays to store the position of the particles .Do you think classes or functions would be more efficient? Any advice in general is helpful. Sorry if that was too long but I am desperate...
Ok, I edited my original post and I share my full script. I hope this will give you some insight regarding my simulation. Thank you.
Additionally I add the two input files
parametersDiffusion_spher_shel.txt
parametersIONP_spher_shel.txt
#include <stdio.h>
#include <iostream>
#include <fstream>
#include <stdlib.h>
#include <math.h>
#include <iomanip> //precision to output
//#include <time.h>
#include <ctime>
#include <cstdlib>
#include <algorithm>
#include <string>
//#include <complex>
#include <chrono> //random generator
#include <random>
using namespace std;
#define PI 3.14159265
#define tN 500000 //# of timepoints (steps) to define the arrays ONLY
#define D_const 3.0E-9 //diffusion constant (m^2/s)
#define Beq 0.16 // Tesla
#define gI 2.6752218744E8 //(sT)^-1
int main(){
//Mersenne Twister random engine
mt19937 rng(chrono::steady_clock::now().time_since_epoch().count());
//uniform_int_distribution<int> intDist(0,1);
uniform_real_distribution<double> realDist(0.,1.);
//for(int i=1; i<100; i++){
//cout<<"R max: "<<Ragg-Rspm<<" r_spm: "<<(Ragg-Rspm)*sqrt(realDist(rng))<<endl;
//}
/////////////////////////////////////////////////////////////////////////////////////////////////////////
//input files
double Rionp=1.0E-8, Ragg=2.0E-7, t_tot=2.0E-2, l_tot = 3.0E-4;
int ionpN=10, aggN=10,cubAxN=10, parN=1E5;
int temp_ionpN, temp_aggN, temp_cubAxN, temp_parN;
ifstream inIONP;
inIONP.open("parametersIONP_spher_shel.txt");
if(!inIONP){
cout << "Unable to open IONP parameters file";
exit(1); // terminate with error
}
while (inIONP>>Rionp>>Ragg>>temp_ionpN>>temp_aggN>>l_tot>>temp_cubAxN) {
ionpN = (int)temp_ionpN;
aggN = (int)temp_aggN;
cubAxN = (int)temp_cubAxN;
}
inIONP.close();
cout<<"Rionp: "<<Rionp<<" ionpN: "<<ionpN <<" aggN: "<<aggN<<endl;
cout<<"l_tot: "<<l_tot<<" cubAxN: "<<cubAxN<<endl;
ifstream indiff;
indiff.open("parametersDiffusion_spher_shel.txt");
if(!indiff){
cout << "Unable to open diffusion parameters file";
exit(1); // terminate with error
}
while (indiff>>temp_parN>>t_tot) {
parN = (int)temp_parN;
}
indiff.close();
cout<<"parN: "<<parN<<" t_tot: "<<t_tot<<endl;
/////////////////////////////////////////////////////////////////////////////////////////////////
int cubN = pow(cubAxN,3.); // total cubes
int Nionp_tot = ionpN*aggN*cubN; //total IONP
double f_tot = (double)Nionp_tot*(4.*PI*pow(Rionp,3.)/3.)/pow(l_tot,3.);//volume density
//central cube
double l_c = l_tot/(double)cubAxN;
int Nionp_c = ionpN*aggN; //SPM in central cube
double f_c = (double)Nionp_c*(4.*PI*pow(Rionp,3.)/3.)/pow(l_c,3.);
cout<<"f_tot: "<<f_tot<<" Nionp_tot: "<<Nionp_tot<<" l_tot "<<l_tot<<endl;
cout<<"f_c: "<<f_c<<" Nionp_c: "<<Nionp_c<<" l_c "<<l_c<<endl;
cout<<"Now IONP are generated..."<<endl;
//position of aggregate (spherical distribution IONP)
double *x1_ionp, *x2_ionp, *x3_ionp, *theta_ionp, *phi_ionp, *r_ionp, *x1_agg, *x2_agg, *x3_agg;
x1_ionp = new double [Nionp_tot];
x2_ionp = new double [Nionp_tot];
x3_ionp = new double [Nionp_tot];
theta_ionp = new double [Nionp_tot];
phi_ionp = new double [Nionp_tot];
r_ionp = new double [Nionp_tot];
x1_agg = new double [Nionp_tot];
x2_agg = new double [Nionp_tot];
x3_agg = new double [Nionp_tot];
int ionpCounter = 0;
int aggCounter = 0;
double x1_aggTemp=0., x2_aggTemp=0., x3_aggTemp=0.;
double ionpDist = 0.; //distance SPM-SPM
for(int a=0; a<cubAxN; a++){ //x1-filling cubes
for(int b=0; b<cubAxN; b++){ //x2-
for(int c=0; c<cubAxN; c++){ //x3-
bool far_ionp = true;
cout<<"cube: (a, b, c): ("<<a<<", "<<b<<", "<<c<<")"<<endl;
for(int i=0; i<aggN; i++){ //aggregate iterations
x1_aggTemp=realDist(rng)*l_c + l_c*a - l_tot/2.; //from neg to pos filling
x2_aggTemp=realDist(rng)*l_c + l_c*b - l_tot/2.;
x3_aggTemp=realDist(rng)*l_c + l_c*c - l_tot/2.;
for(int j=0; j<ionpN; j++){ //SPM iterations
// cout<<"SPM: "<<j<<" aggregate: "<<i<<" cube: (a, b, c): ("<<a<<", "<<b<<", "<<c<<")"<<endl;
x1_agg[ionpCounter]=x1_aggTemp;
x2_agg[ionpCounter]=x2_aggTemp;
x3_agg[ionpCounter]=x3_aggTemp;
//uniform 4pi distribution in sphere
while(true){
far_ionp = true; //must be updated!
theta_ionp[ionpCounter] = 2.*PI*realDist(rng);
phi_ionp[ionpCounter] = acos(1. - 2.*realDist(rng));
r_ionp[ionpCounter] = (Ragg-Rionp)*sqrt(realDist(rng)); // to have uniform distribution sqrt
x1_ionp[ionpCounter] = sin(phi_ionp[ionpCounter])*cos(theta_ionp[ionpCounter])*r_ionp[ionpCounter] + x1_agg[ionpCounter];
x2_ionp[ionpCounter] = sin(phi_ionp[ionpCounter])*sin(theta_ionp[ionpCounter])*r_ionp[ionpCounter] + x2_agg[ionpCounter];
x3_ionp[ionpCounter] = cos(phi_ionp[ionpCounter])*r_ionp[ionpCounter] + x3_agg[ionpCounter];
for(int m=0; m<ionpCounter; m++){ //impenetrable IONP to each other
ionpDist = sqrt(pow(x1_ionp[m]-x1_ionp[ionpCounter],2.)+pow(x2_ionp[m]-x2_ionp[ionpCounter],2.)+pow(x3_ionp[m]-x3_ionp[ionpCounter],2.));
//cout<<"spmDist: "<<spmDist<<endl;
if((j>0) && (ionpDist <= 2*Rionp)){
far_ionp = false;
cout<<"CLOSE ionp-ionp! Distanse ionp-ionp: "<<ionpDist<<endl;
}
}
if(far_ionp){
cout<<"IONP can break now! ionpCounter: "<<ionpCounter<<endl;
break;
}
}
cout<<"r_ionp: "<<r_ionp[ionpCounter]<<" x1_ionp: "<<x1_ionp[ionpCounter]<<" x2_ionp: "<<x2_ionp[ionpCounter]<<" x3_ionp: "<<x3_ionp[ionpCounter]<<endl;
cout<<"x1_agg: "<<x1_agg[ionpCounter]<<" x2_agg: "<<x2_agg[ionpCounter]<<" x3_agg: "<<x3_agg[ionpCounter]<<endl;
ionpCounter++;
}
aggCounter++;
}
}
}
}
cout<<"ionpCounter: "<<ionpCounter<<" aggCounter: "<<aggCounter<<endl;
//=====proton diffusion=============//
//outfile
//proton diffusion time-positionSPM_uniform
FILE *outP_tPos;
outP_tPos = fopen("V3_MAT_positionProtons_spherical.dat","wb+");
if(!outP_tPos){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
//proton diffusion time-positionSPM_uniform
FILE *outP_tB;
outP_tB = fopen("V3_MAT_positionB_spherical.dat","wb+");
if(!outP_tB){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
double *cosPhase_S, *sinPhase_S, *m_tot;
cosPhase_S = new double [tN];
sinPhase_S = new double [tN];
m_tot = new double [tN];
double tstep = 0.; // time of each step
int stepCounter = 0; // counter for the steps for each proton
int cnt_stpMin=0; //, cnt_stpMax=0; //counters for the step length conditions
for (int i=0; i<parN; i++){// repetition for all the protons in the sample
stepCounter = 0; //reset
cout<<"Now diffusion calculated for proton: "<<i<<endl;
double x0[3]={0.}, xt[3]={0.}, vt[3]={0.};
double tt=0.;
double stepL_min = Rionp/8.; //min step length
double stepL_max = Rionp; //max step length
double stepL = 0.;
double extraL = 0.; //extra length beyond central cube
bool hit_ionp = false;
double pIONPDist = 0.; // proton-IONP Distance (vector ||)
double pIONPCosTheta = 0.; //proton_IONP vector cosTheta with Z axis
double Bloc = 0.; //B 1 IONP
double Btot = 0.; //SUM B all IONP
double Dphase = 0.; //Delta phase for step 1p
double phase = 0.; //phase 1p
double theta_p=0., phi_p=0.;
//randomized initial position of the particle;
x0[0] = realDist(rng)*l_c - l_c/2.;
x0[1] = realDist(rng)*l_c - l_c/2.;
x0[2] = realDist(rng)*l_c - l_c/2.;
//for (int j=0; j<tN; j++){ //steps
bool diffTime = true; // flag protons are allowed to diffuse (tt<10ms)
while(diffTime){ // steps loop
//unit vector for 4p direction
theta_p = 2.*PI*realDist(rng);
phi_p = acos(1. - 2.*realDist(rng));
vt[0] = sin(phi_p)*cos(theta_p);
vt[1] = sin(phi_p)*sin(theta_p);
vt[2] = cos(phi_p);;
//determine length of step
for(int k=0; k<ionpCounter; k++){
if(abs(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.))-Rionp) <= 8*Rionp){
//spm closer than 8R
stepL = Rionp/8;
cnt_stpMin ++;
break;
}
else if(abs(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.))-Rionp) > 8*Rionp){
stepL = Rionp;
}
else{
cout<<"sth wrong with the proton-IONP distance!"<<endl;
}
}
//determine Dt step duration
tstep = pow(stepL,2.)/(6.*D_const);
tt += tstep;
if(tt>t_tot){
diffTime = false; //proton is not allowed to diffuse any longer
cout<<"Proton id: "<<i<<" has reached diffusion time! -> Move to next one!"<<endl;
cout<<"stepCounter: "<<stepCounter<<" cnt_stpMin: "<<cnt_stpMin<<endl;
}
while(true){
xt[0]=x0[0]+vt[0]*stepL;
xt[1]=x0[1]+vt[1]*stepL;
xt[2]=x0[2]+vt[2]*stepL;
for(int m=0; m<3; m++){
if(abs(xt[m]) > l_c/2.){ //particle outside central cube,// reflected, elastic collision(no!)
//particle enters fron the other way, periodicity
// hit_cx[m] = true; //I don't need it yet
extraL = abs(xt[m]) - l_c/2.;
// xt[m]=-x0[m];
cout<<"proton outside! xt[m]: "<<xt[m]<<" extra lenght: "<<extraL<<endl;
xt[m] = xt[m]-l_c;
cout<<"Relocating => new x[t]: "<<xt[m]<<endl;
}
}
for(int k=0; k<ionpCounter; k++){//check if proton inside SPM
pIONPDist = sqrt(pow((x1_ionp[k]-xt[0]),2.)+pow((x2_ionp[k]-xt[1]),2.)+pow((x3_ionp[k]-xt[2]),2.)) - Rionp;
if(pIONPDist <= 0.){
cout<<"proton inside IONP => reposition! Distance: "<<pIONPDist<<" Rionp: "<<Rionp<<endl;
hit_ionp = true;
}
else if(pIONPDist > 0.){
hit_ionp=false; //with this I don't have to reset flag in the end
//calculations of Bloc for this position
pIONPCosTheta = (x3_ionp[k]-xt[2])/pIONPDist;
Bloc = pow(Rionp,3.)*Beq*(3.*pIONPCosTheta - 1.)/pow(pIONPDist,3.);
Btot += Bloc;
//cout<<"pSPMDist: "<<pSPMDist<<" pSPMCosTheta: "<<pSPMCosTheta<<" Bloc: "<<Bloc<<" Btot: "<<Btot<<endl;
}
else{
cout<<"Something wrong with the calculation of pIONPDist! "<<pIONPDist<<endl;
hit_ionp = true;
}
}
if(!hit_ionp){
// hit_spm=false; //reset flag (unnessesary alreaty false)
break;
}
}// end of while for new position -> the new position is determined, Btot calculated
// Dphase, phase
Dphase = gI*Btot*tstep;
phase += Dphase;
//store phase for this step
//filled for each proton at this timepoint (step)
cosPhase_S[stepCounter] += cos(phase);
sinPhase_S[stepCounter] += sin(phase);
//reset Btot
Btot = 0.;
stepCounter++;
} //end of for loop step
} //end of for loop particles
//-----calculate the <m> the total magnetization
for(int t=0; t<tN; t++){
m_tot[t] = sqrt(pow(cosPhase_S[t],2.) + pow(sinPhase_S[t],2.))/(double)parN;
//cout<<"m_tot[t]: "<<m_tot[t]<<endl;
}
fclose(outP_tPos); //proton time-position
fclose(outP_tB); //proton time-B
//====== outfile data=============//
//----- output data of SPM position---------//
FILE *outP_S;
outP_S = fopen("V3_MAT_positionSPM_spherical.dat","wb+");
if(!outP_S){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
for (int i=0; i<ionpCounter; ++i){
fprintf(outP_S,"%.10f \t %.10f \t %.10f\n",x1_ionp[i],x2_ionp[i],x3_ionp[i]);
}
fclose(outP_S);
FILE *outP_agg;
outP_agg = fopen("V3_MAT_positionAggreg_spherical.dat","wb+");
if(!outP_agg){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
for (int j=0; j<ionpCounter; ++j){
fprintf(outP_agg,"%.10f \t %.10f \t %.10f\n",x1_agg[j],x2_agg[j],x3_agg[j]);
}
fclose(outP_agg);
FILE *outSngl;
outSngl = fopen("V3_MAT_positionSingle_spherical.dat","wb+");
if(!outSngl){// file couldn't be opened
cerr << "Error: file could not be opened" << endl;
exit(1);
}
int findAgg = (int)(realDist(rng)*aggN);
int idxMin = findAgg*ionpN;
int idxMax = idxMin + ionpN;
for (int k=idxMin; k<idxMax; ++k){
fprintf(outSngl,"%.10f\t%.10f\t%.10f\t%.10f\t%.10f\t%.10f\n",x1_agg[k],x2_agg[k],x3_agg[k],x1_ionp[k],x2_ionp[k],x3_ionp[k]);
}
fclose(outSngl);
//delete new arrays
delete[] x1_ionp;
delete[] x2_ionp;
delete[] x3_ionp;
delete[] theta_ionp;
delete[] phi_ionp;
delete[] r_ionp;
delete[] x1_agg;
delete[] x2_agg;
delete[] x3_agg;
delete[] cosPhase_S;
delete[] sinPhase_S;
delete[] m_tot;
}

I talked the problem in more steps, first thing I made the run reproducible:
mt19937 rng(127386261); //I want a deterministic seed
Then I create a script to compare the three output files generated by the program:
#!/bin/bash
diff V3_MAT_positionAggreg_spherical.dat V3_MAT_positionAggreg_spherical2.dat
diff V3_MAT_positionSingle_spherical.dat V3_MAT_positionSingle_spherical2.dat
diff V3_MAT_positionSPM_spherical.dat V3_MAT_positionSPM_spherical2.dat
Where the files ending in two is created by the optimized code and the other by your version.
I run your version compiling with O3 flag and marked the time (for 20 magnetic particles and 10 protons it is taking 79 seconds on my box, my architecture is not that important because we are just going to compare the differences).
Then I start refactoring steps by steps, running every small changes comparing the output files and the time, here are all the iterations:
Remove redundant else if gain 5 seconds (total run 74.0 s)
if(sqrt(pow(x1_ionp[k]-x0[0],2.)+pow(x2_ionp[k]-x0[1],2.)+pow(x3_ionp[k]-x0[2],2.)) <= 7*Rionp){
//spm closer than 8R
stepL = Rionp/8;
cnt_stpMin ++;
break;
}
else { //this was an else if and an else for error that will never happen
stepL = Rionp;
}
At this point, I run it under the profiler and pow function stood out.
Replacing pow with square and cube gain 61 seconds (total run 13.2 s)
Simply replacing pow(x,2.) with square(x) and pow(x,3.) with cube(x) will reduce the run time by about 600%
double square(double d)
{
return d*d;
}
double cube(double d)
{
return d*d*d;
}
Now the gain is reduced quite a lot for each improvement, but still.
Remove redundant sqrt gain (total run 12.9 s)
double ionpDist = square(x1_ionp[m]-x1_ionp[ionpCounter])+square(x2_ionp[m]-x2_ionp[ionpCounter])+square(x3_ionp[m]-x3_ionp[ionpCounter]);
//cout<<"spmDist: "<<spmDist<<endl;
if((j>0) && (ionpDist <= 4*square_Rionp)){
Introducing const variable square_Rionp and cube_Rionp (total run 12.7 s)
const double square_Rionp = square(Rionp);
const double cube_Rionp = cube(Rionp);
//replaced in the code like this
if((j>0) && (ionpDist <= 4*square_Rionp)){
Introducing variable for pi (total run 12.6 s)
const double Two_PI = PI*2.0;
const double FourThird_PI = PI*4.0/3.0;
Remove a (another) redundant else if (total run 11.9s)
if(pIONPDist <= 0.){
cout<<"proton inside IONP => reposition! Distance: "<<pIONPDist<<" Rionp: "<<Rionp<<endl;
hit_ionp = true;
}
else { //this was an else if without any reason
hit_ionp=false; //with this I don't have to reset flag in the end
//calculations of Bloc for this position
pIONPCosTheta = (x3_ionp[k]-xt[2])/pIONPDist;
...
}
Remove another redundant sqrare root (total run 11.2 s)
const double Seven_Rionp_squared =square(7*Rionp);
...
for(int k=0; k<ionpCounter; k++){
if(square(x1_ionp[k]-x0[0])+square(x2_ionp[k]-x0[1])+square(x3_ionp[k]-x0[2]) <= Seven_Rionp_squared){
//spm closer than 8R
stepL = stepL_min;
cnt_stpMin ++;
break;
}
I don't see many more things obvious to squeeze more performance out of it. Further optimization may require some thinking.
I did another comparison run with 50 magnetic particles and 10 protons and I have found that my version is 7 times faster then the yours and it is producing the exact same files.
I would do this exercise with the help of source control.
Your code is trivially parallelizable, but I will go that route just when you have optimized the single thread version.
EDIT
Change += with = operator (total run 6.23 s)
I have noticed that += operator is used for no reason, the substitution to operator = is a substanzial gain:
cosPhase_S[stepCounter] = cos(phase);
sinPhase_S[stepCounter] = sin(phase);

Related

How can I make my c++ code run faster with eigen library?

I have written a parallelized c++ code, which functions as follows :
There are 75 'w' points, and each of them is sent to one processor.
For each 'w' point, I am defining a matrix. Then I am diagonalizing it. I am using the eigenvectors to compute a particular quantity by summing over the fourth power of each of the matrix elements. And then I average this quantity over 300 iterations of the matrix.
So I am using Eigen package for this calculation, and I compile the code with mpiCC -I eigen -Ofast filename.cpp. For a 512 x 512 matrix, the whole procedure takes 2.5 hours. Currently I need to do the same for a 2748 x 2748 matrix, and it's still going on after approx. 12:30 hrs. Is there anyway I can make the code run faster?
The code is given here for reference :
#include <iostream>
#include <complex>
#include <cmath>
#include<math.h>
#include<stdlib.h>
#include<time.h>
#include<Eigen/Dense>
#include<fstream>
#include<random>
#include "mpi.h"
#define pi 3.14159
using namespace std;
using namespace Eigen;
#define no_of_processor 75 // no of processors used for computing
#define no_of_disorder_av 300 //300 iterations for each w
#define A_ratio 1 //aspect ratio Ly/Lx
#define Lx 8
#define w_init 0.1 // initial value of potential strength
#define del_w 0.036 // increment of w in each loop
#define w_loop 75 // no of different w
#define alpha (sqrt(5.0)-1.0)/(double)2.0
double onsite_pot(int x,int y, int z, double phi, double alpha_0){
double B11=alpha;
double B12=alpha;
double B13=alpha;
double b1= (double)B11*x+(double)B12*y+(double)B13*z;
double c11= 1.0-cos(2*M_PI*b1+phi); //printf("%f\n",c1);
double c12= 1.0+(alpha_0*cos(2*M_PI*b1+phi));
double c1=c11/c12;
return c1;
}
int main(int argc, char *argv[])
{
clock_t begin = clock();
/*golden ratio----------------------------*/
char filename[200];
double t=1.0;
int i,j,k,l,m;//for loops
double alpha_0=0;
int Ly=A_ratio*Lx;
int Lz= A_ratio*Lx;
int A=Lx*Ly;
int V=A*Lz; //size of the matrix
int numtasks,rank,RC;
RC=MPI_Init(&argc,&argv);
if (RC != 0) {
printf ("Error starting MPI program. Terminating.\n");
MPI_Abort(MPI_COMM_WORLD, RC);
}
MPI_Comm_size(MPI_COMM_WORLD,&numtasks);
MPI_Comm_rank(MPI_COMM_WORLD,&rank);
sprintf(filename,"IPR3D%dalpha%g.dat",rank+1,alpha_0);
ofstream myfile;
myfile.open(filename, ios::app); //preparing file to write in
int n = w_loop/no_of_processor;
double w=w_init+(double)(n*rank*del_w);
int var_w_loop = 0;
MatrixXcd H(V,V); // matrix getting defined here
MatrixXcd evec(V,V); // matrix for eigenvector
VectorXcd temp(V); // vector for a temporary space used later in calculation
double IPR[V], E_levels[V]; // for average value of the quantity and eigen values.
do{
for(i=0;i<V;i++)
{
IPR[i]=E_levels[i]=0.0;
}
/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
/*!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!*/
/*----loop for disorder average---------------------------*/
for (l=0;l<no_of_disorder_av;l++){
for (i=V; i<V; i++)
{
for (j=V; j<V; j++)
H(i,j)=0;
}
double phi=(2*M_PI*(double)l)/(double)no_of_disorder_av;
//matrix assignment starts
int z=0;
for (int plane=0; plane<Lz; plane++)
{
z += 1 ;
int y=0;
int indx1= plane*A ; //initial index of each plane
int indx2= indx1+A-1; // last index of each plane
for (int linchain=0; linchain<Ly; linchain++){
y += 1;
int x=0;
int indx3= indx1 + linchain*Lx ; //initial index of each chain
int indx4= indx3 + Lx-1 ; //last index of each chain
for (int latpoint=0; latpoint<Lx; latpoint++){
x += 1;
int indx5= indx3 +latpoint; //index of each lattice point
H(indx5,indx5)= 2*w*onsite_pot(x,y,z,phi,alpha_0); //onsite potential
if (indx5<indx4){ //hopping inside a chain
H(indx5,(indx5+1))= t; //printf("%d %d\n",indx5,indx5+1);
H((indx5+1),indx5)= t;
}
if (indx5<=(indx2-Lx)){ //hopping between different chain
H(indx5,(indx5+Lx))= t; //printf("%d %d\n",indx5,indx5+Lx);
H((indx5+Lx),indx5)= t;
printf("%d\n",indx5);
}
if (indx5<(V-A)){
H(indx5,(indx5+A))= t; //printf("%d %d\n",indx5,indx5+A);// hopping between different plane
H((indx5+A),indx5)= t;
}
} //latpoint loop
}//linchain loop
}//plane loop
//PB..............................................
for (int plane=0; plane<Lz; plane++){
int indx1= plane*A; //initial index of each plane
int indx2 = indx1+A-1 ;//last indx of each plane
//periodic boundary condition x
for (int linchain=0; linchain<Ly; linchain++){
int indx3 = indx1 + linchain*Lx; // initital index of each chain
int indx4=indx3+ Lx-1; //last index of each chain
H(indx3,indx4)= t; //printf("%d %d\n",indx3,indx4);
H(indx4,indx3)= t;
}//linchain loop
//periodic boundary condition y
for (int i=0; i<Lx; i++){
int indx5 = indx1+i;
int indx6 = indx5+(Ly-1)*Lx; //printf("%d %d\n",indx5,indx6);
H(indx5,indx6)=t;
H(indx6,indx5)=t;
}
}//plane loop
//periodic boundary condition in z
for (int i=0; i<A; i++){
int indx1=i ;
int indx2=(Lz-1)*A+i ;
H(indx1,indx2)= t; //printf("%d %d\n",indx1,indx2);
H(indx2,indx1)= t ;
}
//matrix assignment ends
/**-------------------------------------------------------*/
double Tr = abs(H.trace());
for(i=0;i<V;i++)
{
for(j=0;j<V;j++)
{
if(i==j)
{
H(i,j) = H(i,j)-(Tr/(double)V);
}
}
}
SelfAdjointEigenSolver<MatrixXcd> es(H); //defining the diagonalizing function
double *E = NULL;
E = new double[V]; // for the eigenvalues
for(i=0;i<V;i++)
{
E[i]=es.eigenvalues()[i];
//cout<<"original eigenvalues "<<E[i]<<"\n";
}
evec=es.eigenvectors();
double bandwidth = E[V-1] - E[0];
for(i=0;i<V;i++)
E[i]=E[i]/bandwidth;
for(i=0;i<V;i++)
{
E_levels[i] = E_levels[i]+E[i]; //summing over energies for each iteration
}
delete[] E;
E=NULL;
//main calculation process
for(i=0;i<V;i++)
{
temp = evec.col(i);
double num=0.0,denom=0.0;
for(j=0;j<V;j++)
{
num=num+pow(abs(temp(j)),4);
denom=denom+pow(abs(temp(j)),2);
}
IPR[i] = IPR[i]+(num/(denom*denom));
} //calculation ends
}//no_of_disorder_av loop (l)
for(i=0; i<V; i++)
{
myfile<<w<<"\t"<<(E_levels[i]/(double)no_of_disorder_av)<<"\t"
<<(IPR[i]/(double)no_of_disorder_av)<<"\n"; //taking output in file
}
var_w_loop++; // counts number of w loop
w+= del_w; // proceeds to next w
}while(var_w_loop<n) ; // w varying do while loop
MPI_Finalize();
clock_t end = clock();
double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("time spent %f s\n\n",time_spent);
return 0;
}

Why is my neural network stagnating around a certain cost?

I am making a neural network that's supposed to be capable of identifying handwritten numbers using the Mnist database downloadable here. The network works perfectly with one to 5 examples but after 10 it starts to get a bit iffy. Using a standard of 5000 examples the program will stagnate at around 0.42 cost (it starts at around 1.2 costs). all the output of the 10 neurons in the last layer will also trend towards 0.1 and the network is noticeably never very certain of it's guess because of this (usually the output of the guess will be around 0.1 to 0.2 with some exceptions)
Example of guess and outputs of the last layer after training for 5000 iterations:
Example: 5000
index: 2435
Cost: 0.459006
Expected: 0
Out_value 0: 0.0900279
Out_value 1: 0.104657
Out_value 2: 0.0980369
Out_value 3: 0.0990471
Out_value 4: 0.101716
Out_value 5: 0.0937537
Out_value 6: 0.0933432
Out_value 7: 0.114351
Out_value 8: 0.10058
Out_value 9: 0.0924466
Guess: 7
Guess certainty: 0.114351
false
I've tried adjusting the number and size of the h-layers and the learning rate, but the result is always the same (constantly jumping around a cost of around 0.42). I, of course, theorized that my backprop or math just didn't check out, but testing this with a test network based on a guide on backprop, link here my weights adjusted themselves perfect to the decimal according to the article. So I'm not sure what to do to prevent my network from stagnating and to make it learn at this point. Does anyone have any idea why it might be stagnating like this?
Relevant code in the cpp-file for the neural network:
#define _USE_MATH_DEFINES
#include <cmath>
#include <utility>
#include "neural_network.h"
#include <ctime>
#include <string>
#include <iostream>
#include <cstdlib>
namespace nn{
double fRand(const double& f_min, const double& f_max){ //generate random double from f_min to f_max
const auto f = static_cast<double>(rand()) / RAND_MAX;
return f_min + f * (f_max - f_min);
}
double sigmoid(const double& net) { //sigmoid function for out value
const double result = 1.0 / static_cast<double>(1.0 + pow(M_E, -net));
return result;
}
double xavier(int layer_from_size) { //function used to initialize initial weights.
const double val = sqrt(1.0 / static_cast<double>(layer_from_size));
return val;
}
double out_net_derivative(const double& out){ //derviative of out-value with respect to the net-value
const double val = out * (1 - out);
return val;
}
double cost_out_derivative(const double& out, const double& target)
//derivative of the cost with respect to the out-value for the neurons in the last layer
{
const double val = out - target;
return val;
}
double calc_cost(const Layer& layer, std::vector<double> target){ //calculating the total cost mainly for logging
double cost = 0;
for(int i = 0; i < target.size(); i++){
cost += pow(target[i] - layer.get_neurons()[i].get_out(), 2) / 2;
}
return cost;
}
double delta(const double& cost_out_derivative, const double& out)
//derivative of the cost with respect to the current neurons out multiplied by out_net_derivative
{
const double val = cost_out_derivative * out_net_derivative(out);
return val;
}
Weight::Weight(double weight, int neuron_from_index)
:weight_{ weight }, neuron_from_index_{ neuron_from_index }
{}
Neuron::Neuron(int pos, int layer) //creating a empty neuron
: net_{ 0.0 }, out_{ 0.0 }, error_gradient_{ 0.0 }, pos_{ pos }, layer_{ layer }
{
}
Neuron::Neuron(int pos, double out) //creating a neuron in the first layer with a pre-assigned out-value
: net_{ 0.0 }, out_{ out }, error_gradient_{ 0.0 }, pos_{ pos }, layer_{ 0 }
{
}
void Neuron::update_weights(const Layer& layer_from, const double& learning_rate){
for (Weight& weight : weights_to_) {
//derivative of net with respect to weight
double neuron_from_out = layer_from.get_neurons()[weight.get_neuron_from_index()].get_out();
//derivative of cost with respect to weight
double val = delta(error_gradient_, out_) * neuron_from_out;
weight.update_weight(val, learning_rate);
}
}
void Layer::update_error_gradient(Layer& layer_from)
//update all the error gradients (derivative of the cost with respect to the neurons out-value) in the previous layer (layer_from)
{
for (Neuron& neuron : layer_from.neurons_) neuron.set_error_gradient(0); //resetting all previous error gradients
for (int i = 0; i < neurons_.size(); i++) {
for (int j = 0; j < layer_from.get_neurons().size(); j++) {
double delta_val = delta(neurons_[i].get_error_gradient(), neurons_[i].get_out());
//partial derivative of cost with respect to the last layers neuron in position j
double val = neurons_[i].get_weights_to()[j].get_weight() * delta_val;
layer_from.neurons_[j].update_error_gradient(val);
}
}
}
void Layer::update_bias(const double& learning_rate){
for(const Neuron& neuron: neurons_){
//derivative of the cost with respect to the layer-bias
double val = out_net_derivative(neuron.get_out()) * neuron.get_error_gradient();
bias_ -= learning_rate * val;
}
}
void Neuron::set_weights(const int& layer_from_size){ //set initial weights for neuron
for(int i = 0; i < layer_from_size; i++){
//get random weight using xavier weight initialization
double v_val = fRand(-xavier(layer_from_size), xavier(layer_from_size));
Weight weight{ v_val, i };
weights_to_.push_back(weight);
}
}
void Layer::set_weights(const int& layer_from_size){ //set initial weights for layer
for (Neuron& neuron : neurons_) neuron.set_weights(layer_from_size);
}
void Network::set_weights(){ //set initial weights for network
//srand(time(NULL));
for(int i = 1; i < layers_.size(); i++){
layers_[i].set_weights(layers_[i - 1].get_neurons().size());
}
}
Layer::Layer(int pos, int size) //make layer of any size
: pos_{ pos }, bias_{ 0.0 }
{
for (int i = 0; i < size; i++) //fill with neurons according to desired size
{
Neuron neuron{ i, pos };
neurons_.push_back(neuron);
}
}
Layer::Layer(std::vector<Neuron> first_layer) //set the first layer of the network according pre-acquired neurons
:pos_{ 0 }, bias_{ 0.0 }, neurons_{std::move(first_layer)}
{}
void Layer::forward_pass(const Layer& layer_from){ //calculate net, and out-value of each neuron in layer
for(Neuron& neuron : neurons_){
double val = calc_net(layer_from, neuron, bias_);
neuron.set_net(val);
neuron.set_out(sigmoid(val));
}
}
void Network::forward_pass(){ //calculate net, and out-value of each neuron in network
for (int i = 1; i < layers_.size(); i++)
layers_[i].forward_pass(layers_[i - 1]);
}
void Layer::backprop(const Layer& layer_from, const double& learning_rate){ //backprop and thus update weights in layer
for (Neuron& neuron : neurons_)
neuron.update_weights(layer_from, learning_rate);
}
void Network::backprop(const std::vector<double>& target){ //backprop entire network and thus update weights and biases
forward_pass();
set_last_layer_error_grads(target);
for(int i = layers_.size() - 1; i > 0; i--){
//update error gradients for the previous layer in the network
layers_[i].update_error_gradient(layers_[i - 1]);
layers_[i].backprop(layers_[i - 1], learning_rate_);
layers_[i].update_bias(learning_rate_);
}
}
Network::Network(std::vector<int> structure, double learning_rate) //create a network skeleton
:learning_rate_{learning_rate}
{
for(int i = 0; i < structure.size(); i++){ //fill network with layers of various sizes according to structure
Layer layer{ i, structure[i] };
layers_.push_back(layer);
}
}
void Network::set_last_layer_error_grads(std::vector<double> target){
for (int i = 0; i < layers_[layers_.size() - 1].get_neurons().size(); i++) {
double val = cost_out_derivative(layers_[layers_.size() - 1].get_neurons()[i].get_out(), target[i]);
layers_[layers_.size() - 1].set_neuron_error_grad(i, val);
}
}
int Network::get_guess() const{ //get the networks guess for each example (image)
int guess = 0;
for (int i = 0; i < layers_[layers_.size() - 1].get_neurons().size(); i++) {
if (layers_[layers_.size() - 1].get_neurons()[guess].get_out() < layers_[layers_.size() - 1].get_neurons()[i].get_out())
guess = i;
//std::cout << "Guess certainty " << i << ":\t" << layers[layers.size() - 1].get_neurons()[i].get_out_value() << '\n';
std::cout << "Out_value " << i << ":\t" << layers_[layers_.size() - 1].get_neurons()[i].get_out() << '\n';
}
std::cout << "Guess:\t" << guess << '\n'
<< "Guess certainty:\t" << layers_[layers_.size() - 1].get_neurons()[guess].get_out() << "\n\n";
return guess;
}
int Network::get_weight_amount() const //get number of weights
{
int amount = 0;
for (int i = 1; i < layers_.size(); i++) {
amount += layers_[i - 1].get_neurons().size() * layers_[i].get_neurons().size();
}
return amount;
}
double calc_net(const Layer& layer_from, const Neuron& neuron, const double& bias){ // calculate net-value for specific neuron
const std::vector<Neuron>& neurons_from = layer_from.get_neurons();
const std::vector<Weight>& weights = neuron.get_weights_to();
if (neurons_from.size() != weights.size())
throw std::exception("there is not strictly one weight for each neuron in layer from.");
double net = 0;
//calculate net value with respect to the previous layers neurons and weights connecting them
for (int i = 0; i < neurons_from.size(); i++)
net += neurons_from[i].get_out() * weights[i].get_weight();
net += bias;
return net;
}
void Network::train(std::ifstream& practice_file, const int& sample_size, const int& practise_loops)
//train network with a specific sample size a specific number of times according to practice loops,
//getting necessary data for the first layer from a practice file
{
//srand(time(NULL));
std::vector<Layer> images;
std::vector<std::vector<double>> targets;
for(int i = 0; i < sample_size; i++){ //get and store all images and targets for the images in different vectors
std::vector<double> image = get_image(practice_file);
images.push_back(get_flayer(image));
targets.push_back(get_target(image, layers_[layers_.size() - 1].get_neurons().size()));
}
//backprop through random examples taken from the sample
for(int i = 0; i < practise_loops; i++){
int index = rand() % images.size();
layers_[0] = images[index];
backprop(targets[index]);
std::cout << "Example:\t" << i << '\n' << //logging
"index:\t" << index << '\n'
<< "Cost:\t" << calc_cost(layers_[layers_.size() - 1], targets[index]) << '\n';
if (correct_guess(targets[index]))
std::cout << "true\n";
else std::cout << "false\n";
}
}
double Network::test(std::ifstream& test_file, const int& sample_size){ //test network accuracy
int correct = 0;
std::vector<Layer> images;
std::vector<std::vector<double>> targets;
for (int i = 0; i < sample_size; i++) {
std::vector<double> image = get_image(test_file);
images.push_back(get_flayer(image));
targets.push_back(get_target(image, layers_[layers_.size() - 1].get_neurons().size()));
}
for(int i = 0; i < sample_size; i++)
{
layers_[0] = images[i];
forward_pass();
if (correct_guess(targets[i])) correct++; //keep track of correct guesses
}
double accuracy = 100 * correct / sample_size; //calculate accuracy
return accuracy;
}
std::vector<double> get_image(std::ifstream& ifs) { //get an image data from a file (specifically from the mnist files
std::vector<double> values; //all data converted to relevant format
std::string value; //data in string format
std::string line; //all data in string format
std::getline(ifs, line); //get image
//convert image string to relevant grey scale and target doubles and store them to be returned
for (const char& ch : line) {
switch (ch) {
case '0': case '1':
case '2': case '3':
case '4': case '5':
case '6': case '7':
case '8': case '9':
case '.':
value += ch;
break;
default:
values.push_back(std::stod(value));
value.clear();
break;
}
}
values.push_back(std::stod(value)); //store last piece of data
return values;
}
std::vector<double> get_target(const std::vector<double>& image, int last_layer_size){ //get target for an image
std::vector<double> target(last_layer_size);
//make sure that every neuron that is not the correct answer isn't lit up and do the opposite for the correct answer neuron
for(int i = 0; i < last_layer_size; i++){
//according to the file setup the first piece of data in the image is the target, hence image[0]
if (i == static_cast<int>(image[0])) target[i] = 1.0; //0.99
}
return target;
}
Layer get_flayer(std::vector<double> image) { //get the first layer through image
std::vector<Neuron> neurons;
image.erase(image.begin()); //throw away the target
for (int i = 0; i < image.size(); i++) {
Neuron neuron{ i, image[i] };
neurons.push_back(neuron);
}
Layer layer{ neurons };
return layer;
}
bool Network::correct_guess( const std::vector<double>& target) const{ //confirm if a guess by the network is correct
int excpected = 0;
for (int i = 0; i < target.size(); i++)
if (target[i] == 1.0) excpected = i; //the correct guess is the neuron position of the neuron fully lit of the bunch
std::cout << "Excpected:\t" << excpected << "\n\n";
return excpected == get_guess();
}
}
Link to full code including some exta functions in cpp-file, h-file, and main-file on GitHub for more context: Full code
So it turned out that the mistake I had made was related to how I read the greyscale values (the input) from the CSV-file containing the Mnist images converted to greyscale. I had neglected to divide each greyscale value with 255 (the highest possible greyscale value) to convert the greyscale to values between 0 and 1. Because of this, the second layer received huge input values thus causing the outputs of the second layer to explode massively. I have now fixed this issue and the program has around 87% accuracy when training with a file batch of 5000 images after 10000 iterations with one h-layer of size 16 (layer sizes: 784, 16, 10). While still not being optimal it is, of course, a huge improvement.

C++ Advice on manipulating output Matrix data

I have the following code.
Essentially it is creating N random normal variables, and running through an equation M times for a simulation.
The output should be an NxM matrix of data, however the only way I could do the calculation has the output as MxN. ie each M run should be a column, not a row.
I have attempted in vain to follow some of the other suggestions that have been posted on previous similar topics.
Code:
#include <iostream>
#include <time.h>
#include <random>
int main()
{
double T = 1; // End time period for simulation
int N = 4; // Number of time steps
int M = 2; // Number of simulations
double x0 = 1.00; // Starting x value
double mu = 0.00; // mu(x,t) value
double sig = 1.00; // sigma(x,t) value
double dt = T/N;
double sqrt_dt = sqrt(dt);
double** SDE_X = new double*[M]; // SDE Matrix setup
// Random Number generation setup
double RAND_N;
srand ((unsigned int) time(NULL)); // Generator loop reset
std::default_random_engine generator (rand());
std::normal_distribution<double> distribution (0.0,1.0); // Mean = 0.0, Variance = 1.0 ie Normal
for (int i = 0; i < M; i++)
{
SDE_X[i] = new double[N];
for (int j=0; j < N; j++)
{
RAND_N = distribution(generator);
SDE_X[i][0] = x0;
SDE_X[i][j+1] = SDE_X[i][j] + mu * dt + sig * RAND_N * sqrt_dt; // The SDE we wish to plot the path for
std::cout << SDE_X[i][j] << " ";
}
std::cout << std::endl;
}
std::cout << std::endl;
std::cout << " The simulation is complete!!" << std::endl;
std::cout << std::endl;
system("pause");
return 0;
}
Well why can't you just create the transpose of your SDE_X matrix then? Isn't that what you want to get?
Keep in mind, that presentation has nothing to do with implementation. Whether to access columns or rows is your decision. So you want an implementation of it transposed. Then quick and dirty create your matrix first, and then create your number series. Change i and j, and N and M.
I said quick and dirty, because the program at all is bad:
why don't you just keep it simple and use a better data structure for your matrix? If you know the size: compile-time array or dynamic vectors at runtime? Maybe there are some nicer implementation for 2d array.
There is a bug I think: you create N doubles and access index 0 to N inclusive.
In every iteration you set index 0 to x0 what is also needless.
I would change your code a bit make more clear:
create your matrix at first
initialize the first value of the matrix
provide an algorithm function calculating a target cell taking the matrix and the parameters.
Go through each cell and invoke your function for that cell
Thank you all for your input. I was able to implement my code and have it displayed as needed.
I added a second for loop to rearrange the matrix rows and columns.
Please feel free to let me know if you think there is anyway I can improve it.
#include <iostream>
#include <time.h>
#include <random>
#include <vector>
int main()
{
double T = 1; // End time period for simulation
int N = 3; // Number of time steps
int M = 2; // Number of simulations
int X = 100; // Max number of matrix columns
int Y = 100; // Max number of matrix rows
double x0 = 1.00; // Starting x value
double mu = 0.00; // mu(x,t) value
double sig = 1.00; // sigma(x,t) value
double dt = T/N;
double sqrt_dt = sqrt(dt);
std::vector<std::vector<double>> SDE_X((M*N), std::vector<double>((M*N))); // SDE Matrix setup
// Random Number generation setup
double RAND_N;
srand ((unsigned int) time(NULL)); // Generator loop reset
std::default_random_engine generator (rand());
std::normal_distribution<double> distribution (0.0,1.0); // Mean = 0.0, Variance = 1.0 ie Normal
for (int i = 0; i <= M; i++)
{
SDE_X[i][0] = x0;
for (int j=0; j <= N; j++)
{
RAND_N = distribution(generator);
SDE_X[i][j+1] = SDE_X[i][j] + mu * dt + sig * RAND_N * sqrt_dt; // The SDE we wish to plot the path for
}
}
for (int j = 0; j <= N; j++)
{
for (int i = 0; i <=M; i++)
{
std::cout << SDE_X[i][j] << ", ";
}
std::cout << std::endl;
}
std::cout << std::endl;
std::cout << " The simulation is complete!!" << std::endl;
std::cout << std::endl;
system("pause");
return 0;
}

OpenMP generate segfault in Rcpp code for the SEIR model

I wrote a (probably-inefficient, but anyway..) Rcpp code using inline to simulate a stochastic SEIR model.
The serial version compiles and works perfectly, but since I need to simulate from it a large number of times and since it seems to me like an embarrassingly parallel problem (just need to simulate again for other parameter values and return a matrix with the results) I tried to add #pragma omp parallel for and to compile with -fopenmp -lgomp but ... boom!
I get a segfault even for very small examples!
I tried to add setenv("OMP_STACKSIZE","24M",1); and values well over 24M but still the segfault happens.
I'll explain briefly the code since it's a bit long (I tried to shorten it but the result change and I can't reproduce it..):
I have two nested loops, the inner one execute the model for a given parameter set and the outer one changes the parameters.
The only reason a race condition might happen is if the code were trying to execute set of instructions inside inner the loop in parallel (which cannot be done because of the model structure, on iteration t it depends on iteration t-1) and not to parallelize the outer, but if I'm not mistaken that is what the parallel for constructor does for default if put just outside the outer...
This is basically the form of the code I'm trying to run:
mat result(n_param,T_MAX);
#pragma omp parallel for
for(int i=0,i<n_param_set;i++){
t=0;
rowvec jnk(T_MAX);
while(t < T_MAX){
...
jnk(t) = something(jnk(t-1));
...
t++;
}
result.row(i)=jnk;
}
return wrap(result);
And my question is: How I tell the compiler that I just want to compute in parallel the outer loop (even distributing them statically like n_loops/n_threads for each thread) and not the inner one (which is actually non-parallelizable)?
The real code is a bit more involved and I'll present it here for the sake of reproducibility if you're really willing, but I'm only asking about the behavior of OpenMP. Please notice that the only OpenMP instruction appears at line 122.
library(Rcpp);library(RcppArmadillo);library(inline)
misc='
#include <math.h>
#define _USE_MATH_DEFINES
#include <omp.h>
using namespace arma;
template <typename T> int sgn(T val) {
return (T(0) < val) - (val < T(0));
}
uvec rmultinomial(int n,vec prob)
{
int K = prob.n_elem;
uvec rN = zeros<uvec>(K);
double p_tot = sum(prob);
double pp;
for(int k = 0; k < K-1; k++) {
if(prob(k)>0) {
pp = prob[k] / p_tot;
rN(k) = ((pp < 1.) ? (rbinom(1,(double) n, pp))(0) : n);
n -= rN[k];
} else
rN[k] = 0;
if(n <= 0) /* we have all*/
return rN;
p_tot -= prob[k]; /* i.e. = sum(prob[(k+1):K]) */
}
rN[K-1] = n;
return rN;
}
'
model_and_summary='
mat SEIR_sim_plus_summaries()
{
vec alpha;
alpha << 0.002 << 0.0045;
vec beta;
beta << 0.01 << 0.01;
vec gamma;
gamma << 1.0/14.0 << 1.0/14.0;
vec sigma;
sigma << 1.0/(3.5) << 1.0/(3.5);
vec phi;
phi << 0.8 << 0.8;
int S_0 = 800;
int E_0 = 100;
int I_0 = 100;
int R_0 = 0;
int pop = 1000;
double tau = 0.01;
double t_0 = 0;
vec obs_time;
obs_time << 1 << 2 << 3 << 4 << 5 << 6 << 7 << 8 << 9 << 10 << 11 << 12 << 13 << 14 << 15 << 16 << 17 << 18 << 19 << 20 << 21 << 22 << 23 << 24;
const int n_obs = obs_time.n_elem;
const int n_part = alpha.n_elem;
mat stat(n_part,6);
//#pragma omp parallel for
for(int k=0;k<n_part;k++) {
ivec INC_i(n_obs);
ivec INC_o(n_obs);
// Event variables
double alpha_t;
int nX; //current number of people moving
vec rates(8);
uvec trans(4); // current transitions, e.g. from S to E,I,R,Universe
vec r(4); // rates e.g. from S to E, I, R, Univ.
/*********************** Initialize **********************/
int S_curr = S_0;
int S_prev = S_0;
int E_curr = E_0;
int E_prev = E_0;
int I_curr = I_0;
int I_prev = I_0;
int R_curr = R_0;
int R_prev = R_0;
int IncI_curr = 0;
int IncI_prev = 0;
int IncO_curr = 0;
int IncO_prev = 0;
double t_curr = t_0;
int t_idx =0;
while( t_idx < n_obs ) {
// next time preparation
t_curr += tau;
S_prev = S_curr;
E_prev = E_curr;
I_prev = I_curr;
R_prev = R_curr;
IncI_prev = IncI_curr;
IncO_prev = IncO_curr;
/*********************** description (rates) of the events **********************/
alpha_t = alpha(k)*(1+phi(k)*sin(2*M_PI*(t_curr+0)/52)); //real contact rate, time expressed in weeks
rates(0) = (alpha_t * ((double)I_curr / (double)pop ) * ((double)S_curr)); //e+1, s-1, r,i one s get infected (goes in E, not yey infectous)
rates(1) = (sigma(k) * E_curr); //e-1, i+1, r,s one exposed become infectous (goes in I) INCIDENCE!!
rates(2) = (gamma(k) * I_curr); //i-1, s,e, r+1 one i recover
rates(3) = (beta(k) * I_curr); //i-1, s, r,e one i dies
rates(4) = (beta(k) * R_curr); //i,e, s, r-1 one r dies
rates(5) = (beta(k) * E_curr); //e-1, s, r,i one e dies
rates(6) = (beta(k) * S_curr); //s-1 e, i ,r one s dies
rates(7) = (beta(k) * pop); //s+1 one susc is born
// Let the events occour
/*********************** S compartement **********************/
if((rates(0)+rates(6))>0){
nX = rbinom(1,S_prev,1-exp(-(rates(0)+rates(6))*tau))(0);
r(0) = rates(0)/(rates(0)+rates(6)); r(1) = 0.0; r(2) = 0; r(3) = rates(6)/(rates(0)+rates(6));
trans = rmultinomial(nX, r);
S_curr -= nX;
E_curr += trans(0);
I_curr += trans(1);
R_curr += trans(2);
//trans(3) contains dead individual, who disappear...we could avoid this using sequential conditional binomial
}
/*********************** E compartement **********************/
if((rates(1)+rates(5))>0){
nX = rbinom(1,E_prev,1-exp(-(rates(1)+rates(5))*tau))(0);
r(0) = 0.0; r(1) = rates(1)/(rates(1)+rates(5)); r(2) = 0.0; r(3) = rates(5)/(rates(1)+rates(5));
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr -= nX;
I_curr += trans(1);
R_curr += trans(2);
IncI_curr += trans(1);
}
/*********************** I compartement **********************/
if((rates(2)+rates(3))>0){
nX = rbinom(1,I_prev,1-exp(-(rates(2)+rates(3))*tau))(0);
r(0) = 0.0; r(1) = 0.0; r(2) = rates(2)/(rates(2)+rates(3)); r(3) = rates(3)/(rates(2)+rates(3));
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr += trans(1);
I_curr -= nX;
R_curr += trans(2);
IncO_curr += trans(2);
}
/*********************** R compartement **********************/
if(rates(4)>0){
nX = rbinom(1,R_prev,1-exp(-rates(4)*tau))(0);
r(0) = 0.0; r(1) = 0.0; r(2) = 0.0; r(3) = rates(4)/rates(4);
trans = rmultinomial(nX, r);
S_curr += trans(0);
E_curr += trans(1);
I_curr += trans(2);
R_curr -= nX;
}
/*********************** Universe **********************/
S_curr += pop - (S_curr+E_curr+I_curr+R_curr); //it should be poisson, but since the pop is fixed...
/*********************** Save & Continue **********************/
// Check if the time is interesting for us
if(t_curr > obs_time[t_idx]){
INC_i(t_idx) = IncI_curr;
INC_o(t_idx) = IncO_curr;
IncI_curr = IncI_prev = 0;
IncO_curr = IncO_prev = 0;
t_idx++;
}
//else just go on...
}
/*********************** Finished - Starting w/ stats **********************/
// INC_i is the useful variable, how can I change its reference withour copying it?
ivec incidence = INC_i; //just so if I want to use INC_o i have to change just this...
//Scan the epidemics to recover the summary stats (naively divide the data each 52 weeks)
double n_years = ceil((double)obs_time(n_obs-1)/52.0);
vec mu_attack(n_years);
vec ratio_attack(n_years-1);
vec peak(n_years);
vec atk(52);
peak(0)=0.0;
vec tmpExplo(52); //explosiveness
vec explo(n_years);
int year=0;
int week;
for(week=0 ; week<n_obs ; week++){
if(week - 52*year > 51){
mu_attack(year) = sum( atk )/(double)pop;
if(year>0)
ratio_attack(year-1) = mu_attack(year)/mu_attack(year-1);
for(int i=0;i<52;i++){
if(atk(i)>(peak(year)/2.0)){
tmpExplo(i) = 1.0;
} else {
tmpExplo(i) = 0.0;
}
}
explo(year) = sum(tmpExplo);
year++;
peak(year)=0.0;
}
atk(week-52*year) = incidence(week);
if( peak(year) < incidence(week) )
peak(year)=incidence(week);
}
if(week - 52*year > 51){
mu_attack(year) = sum( atk )/(double)pop;
} else {
ivec idx(52);
for(int i=0;i<52;i++)
{ idx(i) = i; } //take just the updated ones...
vec tmp = atk.elem(find(idx<(week - 52*year)));
mu_attack(year) = sum( tmp )/((double)pop * (tmp.n_elem/52.0));
ratio_attack(year-1) = mu_attack(year)/mu_attack(year-1);
for(int i=0;i<tmp.n_elem;i++){
if(tmp(i)>(peak(year)/2.0)){
tmpExplo(i) = 1.0;
} else {
tmpExplo(i) = 0.0;
}
}
for(int i=tmp.n_elem;i<52;i++)
tmpExplo(i) = 0.0; //to reset the others
explo(year) = sum(tmpExplo);
}
double correlation2;
double correlation4;
vec autocorr = acf(peak);
/***** ACF *****/
if(n_years<3){
correlation2=0.0;
correlation4=0.0;
} else {
if(n_years<5){
correlation2 = autocorr(1);
correlation4 = 0.0;
} else {
correlation2 = autocorr(1);
correlation4 = autocorr(3);
}
}
rowvec jnk(6);
jnk << sum(mu_attack)/(year+1.0)
<< (sum( log(ratio_attack)%log(ratio_attack) )/(n_years-1)) - (pow(sum( log(ratio_attack) )/(n_years-1),2))
<< correlation2 << correlation4 << max(peak) << sum(explo)/n_years;
stat.row(k) = jnk;
}
return stat;
}
'
main='
std::cout << "max_num_threads " << omp_get_max_threads() << std::endl;
RNGScope scope;
mat summaries = SEIR_sim_plus_summaries();
return wrap(summaries);
'
plug = getPlugin("RcppArmadillo")
## modify the plugin for Rcpp to support OpenMP
plug$env$PKG_CXXFLAGS <- paste('-fopenmp', plug$env$PKG_CXXFLAGS)
plug$env$PKG_LIBS <- paste('-fopenmp -lgomp', plug$env$PKG_LIBS)
SEIR_sim_summary = cxxfunction(sig=signature(),main,settings=plug,inc = paste(misc,model_and_summary),verbose=TRUE)
SEIR_sim_summary()
Thanks for the help!
NB: before you ask, I slightly modified the Rcpp multinomial sampling function just because I liked that way more than the one using pointer...not any other particular reason! :)
The core pseudo-random number generators (PRNGs) in R are not designed to be used in multithreaded environments. That is, their state is stored in a static array (dummy from src/main/PRNG.c) and therefore is shared among all threads. Moreover several other static structures are used to store states for the higher-level interfaces to the core PRNGs.
A possible solution could be that you put each call to rnorm() or other sampling functions inside named critical sections with all having the same name, e.g.:
...
#pragma omp critical(random)
rN(k) = ((pp < 1.) ? (rbinom(1,(double) n, pp))(0) : n);
...
if((rates(0)+rates(6))>0){
#pragma omp critical(random)
nX = rbinom(1,S_prev,1-exp(-(rates(0)+rates(6))*tau))(0);
...
Note that the critical construct operates on the structured block following it and therefore locks the entire statement. If a random number is being drawn inline inside a call to a time-consuming function, e.g.
#pragma omp critical(random)
x = slow_computation(rbinom(...));
this is better transformed to:
#pragma omp critical(random)
rb = rbinom(...);
x = slow_computation(rb);
That way only the rb = rbinom(...); statement will be protected.

array "not used in this scope". Why?

I'm getting an error saying that "adjacencymatrix' was not used in this scope" right at the end of main (before the function makebond at the end) (the commented line 112 "BROKEN LINE"). Why? Sorry about this being simple. I'm compiling with g++ ($ g++ a.c -o f).
Heres the code:
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
using namespace std;
#define PI 3.1415926535897932384626433832795
#define sqr(x) ((x)*(x))
#define count 500
double density;
double volume;
int N;
double beta = 0.1;
double R = 5;
double rob = 1;
int dimension = 2;
double eps=0.1; // Increase in density
double mindensity = 0; // Minimum density
double maxdensity = 8; // max.dens (scaled for the sake of ensuring int()
int makebond(double x);
int main(){
srand(time(0));
for (int rho=mindensity;rho<=(maxdensity/eps);density++){
N = floor(density*volume);
double nodepositions[N][dimension];
// Place nodes in volume (square side L, circle volume *R and obstacle *rob)
for (int i=0;i<N;i++){
int L = 5;
double distancefromorigin;
double x = (L*(rand()/RAND_MAX))-(L/2);
double y = (L*(rand()/RAND_MAX))-(L/2);
distancefromorigin = sqrt((x*x)+(y*y));
if(distancefromorigin<R){
if(distancefromorigin>rob){
nodepositions[i][0] = x;
nodepositions[i][1] = y;
}
}
}
double adjacencymatrix [N][N];
double itzhak; //distance of node 1 from the centre
double isaac; //distance of node 2 from the centre
double vivaldi; //distance between node 1 and node 2
double phi; // a function of the above 3 doubles (see later usage)
double rubicon; // maximum distance nodes within the icecream can be apart before becoming visually indepdendent
double maxtheta; // "in the icecream" means theta < maxtheta
double theta; // angular displacement of inner point from the line bisecting the icecream
// Create adjacency matrix (note alternative implementation using incidence lists)
for (int i=0;i<N;i++){
for (int j=0;j<N;j++){
double x0 = nodepositions[i][0];
double y0 = nodepositions[i][1];
double x1 = nodepositions[j][0];
double y1 = nodepositions[j][1];
itzhak = sqrt(sqr(x0) + sqr(y0));
isaac = sqrt(sqr(x1) + sqr(y1));
vivaldi = sqrt(sqr(x0-x1) + sqr(y0-y1));
phi = ((sqr(vivaldi)+sqr(itzhak)-sqr(isaac))/(2*vivaldi*itzhak));
rubicon = ((itzhak*phi) - sqrt((sqr(rob)) - ((sqr(itzhak))*(1-sqr(phi)))));
maxtheta = asin(rob/itzhak);
theta = acos(phi);
if (x0==x1 && y0==y1){
adjacencymatrix[i][j] = 0;
}
else{
if (isaac<itzhak && theta<maxtheta) {
if (vivaldi>rubicon){
adjacencymatrix[i][j] = 0;}
else {
adjacencymatrix[i][j] = makebond(vivaldi);}
}
else{adjacencymatrix[i][j] = makebond(vivaldi);}
}
}
}
}
FILE *datafc1;
datafc1 = fopen("matrix.dat", "w");
for (int ii = 0; ii<N; ii++){
for (int jj = 0; jj<N; jj++){
int aaa;
aaa = adjacencymatrix[ii][jj];///////////////*******BROKEN LINE******
fprintf(datafc1,"%i", aaa);
}
}
fclose(datafc1);
return 0;
}
/////////////////////////////
////////////////
/////// --End Main--
////////////////
////////////////////////////
int makebond(double x){
// This function takes in the euc. dist. between two nodes and draws a link with prob. H(r)
double randomnumber = (rand()/RAND_MAX); // Random number between 0 and 1
double hr = exp(-beta*sqr(x));// ***Connection function***
int a = 1; // Number to be put into adjacency matrix
if (randomnumber > hr){
a = 0;
}
return a; //Returns 0 or 1 depending on prob. dist.
}
adjacencymatrix is declared in your first for loop, so it's out of scope before the last spot you're using it, in the print-out loop at the bottom.
In addition, you have a useless using namespace std; line. Your code doesn't include any headers that contain std namespace symbols.
Your code in line 57:
double adjacencymatrix [N][N];
is inside a for loop, outside that loop, adjacencymatrix is undefined.
You matrix is defined in the for loop on line 11. Therefore it is out of scope on line 112.
FILE *datafc1;
datafc1 = fopen("matrix.dat", "w");
for (int ii = 0; ii<N; ii++){
for (int jj = 0; jj<N; jj++){
int aaa;
//error adjacencymatrix is declared in your first for loop
aaa = adjacencymatrix[ii][jj];///////////////*******BROKEN LINE******
fprintf(datafc1,"%i", aaa);
}
}