How can I parallelize this code using OpenMP?
xp, yp, zp, gpx, gpy, and gpz are known 1D vectors.
for (ies = 0; ies < 1000000; ies++){
    for (jes = ies+1; jes < 1000000; jes++){
        double dxp = xp[ies] - xp[jes];
        double dyp = yp[ies] - yp[jes];
        double dzp = zp[ies] - zp[jes];
        double distance = sqrt( dxp * dxp + dyp * dyp + dzp * dzp );
        double gpspec = gpx[ies] * gpx[jes] + gpy[ies] * gpy[jes] + gpz[ies] * gpz[jes];
        #pragma omp parallel for
        for (kes = 1; kes <= 100; kes++){
            double distan = kes * distance;
            E1[kes] = E1[kes] + gpspec * sin(distan) / distan;
        }
    }
}
Here is a possibility (not tested):
#pragma omp parallel for reduction(+: E1) private(jes, kes) schedule(dynamic)
for (ies = 0; ies < 1000000; ies++){
    for (jes = ies+1; jes < 1000000; jes++){
        double dxp = xp[ies] - xp[jes];
        double dyp = yp[ies] - yp[jes];
        double dzp = zp[ies] - zp[jes];
        double distance = sqrt( dxp * dxp + dyp * dyp + dzp * dzp );
        double gpspec = gpx[ies] * gpx[jes] + gpy[ies] * gpy[jes] + gpz[ies] * gpz[jes];
        for (kes = 1; kes <= 100; kes++){
            double distan = kes * distance;
            E1[kes] = E1[kes] + gpspec * sin(distan) / distan;
        }
    }
}
I've put a schedule(dynamic) to try to compensate for the workload imbalance between threads introduced by the triangular shape of the (ies, jes) index domain that the loops cover.
Also, depending on the way E1 is defined, this may or may not be accepted by your compiler (OpenMP 4.5 allows reducing over an array section, e.g. reduction(+: E1[:101])). But in any case, if the reduction(+: E1) isn't accepted, there's always the possibility to do the reduction by hand with a critical construct, as sketched below.
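A minimal, untested sketch of the by-hand variant (it assumes E1 has 101 elements, as the kes loop above implies):

#pragma omp parallel private(jes, kes)
{
    double E1_local[101] = {0.0};   // per-thread partial sums
    #pragma omp for schedule(dynamic)
    for (ies = 0; ies < 1000000; ies++){
        for (jes = ies+1; jes < 1000000; jes++){
            double dxp = xp[ies] - xp[jes];
            double dyp = yp[ies] - yp[jes];
            double dzp = zp[ies] - zp[jes];
            double distance = sqrt( dxp * dxp + dyp * dyp + dzp * dzp );
            double gpspec = gpx[ies] * gpx[jes] + gpy[ies] * gpy[jes] + gpz[ies] * gpz[jes];
            for (kes = 1; kes <= 100; kes++){
                double distan = kes * distance;
                E1_local[kes] += gpspec * sin(distan) / distan;
            }
        }
    }
    // merge the thread-local sums into the shared array, one thread at a time
    #pragma omp critical
    for (kes = 1; kes <= 100; kes++)
        E1[kes] += E1_local[kes];
}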
You already have an omp parallel for pragma on the innermost loop. For it to take effect, you probably need to enable OpenMP support in your compiler by setting a compiler flag (for example, with the GCC compiler suite, that would be the -fopenmp flag). You may also need to #include the omp.h header.
But with that said, I doubt you're going to gain much from this parallelization, because one run of the loop you are parallelizing just doesn't do much work. There is runtime overhead associated with parallelization that offsets the gains from running multiple loop iterations at the same time, so I don't think you're going to net very much.
I don't know much about multi-threading, and I have no idea why this is happening, so I'll just get to the point.
I'm processing an image and divide it into 4 parts, passing one part to each thread (essentially I pass the indices of the first and last pixel rows of each part). For example, if the image has 1000 rows, each thread will process 250 of them. I can go into detail about my implementation and what I'm trying to achieve in case it helps. For now I provide the code executed by the threads in case you can detect why this is happening. I don't know if it's relevant, but in both cases (1 thread or 4 threads) the process takes around 15 ms; pfUMap and pbUMap are unordered maps.
void jacobiansThread(int start, int end, vector<float> &sJT, vector<float> &sJTJ) {
    uchar* rgbPointer;
    float* depthPointer;
    float* sdfPointer;
    float* dfdxPointer; float* dfdyPointer;
    float fov = radians(45.0);
    float aspect = 4.0 / 3.0;
    float focal = 1 / (glm::tan(fov / 2));
    float fu = focal * cols / 2 / aspect;
    float fv = focal * rows / 2;
    float strictFu = focal / aspect;
    float strictFv = focal;
    vector<float> pixelJacobi(6, 0);
    for (int y = start; y < end; y++) {
        rgbPointer = sceneImage.ptr<uchar>(y);
        depthPointer = depthBuffer.ptr<float>(y);
        dfdxPointer = dfdx.ptr<float>(y);
        dfdyPointer = dfdy.ptr<float>(y);
        sdfPointer = sdf.ptr<float>(y);
        for (int x = roiX.x; x < roiX.y; x++) {
            float deltaTerm; // = deltaPointer[x];
            float raw = sdfPointer[x];
            if (raw > 8.0) continue;
            float dirac = (1.0f / float(CV_PI)) * (1.2f / (raw * 1.44f * raw + 1.0f));
            deltaTerm = dirac;
            vec3 rgb(rgbPointer[x * 3], rgbPointer[x * 3 + 1], rgbPointer[x * 3 + 2]);
            vec3 bin = rgbToBin(rgb, numberOfBins);
            int indexOfColor = bin.x * numberOfBins * numberOfBins + bin.y * numberOfBins + bin.z;
            float s3 = glfwGetTime();
            float pF = pfUMap[indexOfColor];
            float pB = pbUMap[indexOfColor];
            float heavisideTerm;
            heavisideTerm = HEAVISIDE(raw);
            float denominator = (heavisideTerm * pF + (1 - heavisideTerm) * pB) + 0.000001;
            float commonFirstTerm = -(pF - pB) / denominator * deltaTerm;
            if (pF == pB) continue;
            vec3 pixel(x, y, depthPointer[x]);
            float dfdxTerm = dfdxPointer[x];
            float dfdyTerm = -dfdyPointer[x];
            if (pixel.z == 1) {
                cv::Point c = findClosestContourPoint(cv::Point(x, y), dfdxTerm, -dfdyTerm, abs(raw));
                if (c.x == -1) continue;
                pixel = vec3(c.x, c.y, depthBuffer.at<float>(cv::Point(c.x, c.y)));
            }
            vec3 point3D = pixel;
            pixelToViewFast(point3D, cols, rows, strictFu, strictFv);
            float Xc = point3D.x; float Xc2 = Xc * Xc; float Yc = point3D.y; float Yc2 = Yc * Yc; float Zc = point3D.z; float Zc2 = Zc * Zc;
            pixelJacobi[0] = dfdyTerm * ((fv * Yc2) / Zc2 + fv) + (dfdxTerm * fu * Xc * Yc) / Zc2;
            pixelJacobi[1] = -dfdxTerm * ((fu * Xc2) / Zc2 + fu) - (dfdyTerm * fv * Xc * Yc) / Zc2;
            pixelJacobi[2] = -(dfdyTerm * fv * Xc) / Zc + (dfdxTerm * fu * Yc) / Zc;
            pixelJacobi[3] = -(dfdxTerm * fu) / Zc;
            pixelJacobi[4] = -(dfdyTerm * fv) / Zc;
            pixelJacobi[5] = (dfdyTerm * fv * Yc) / Zc2 + (dfdxTerm * fu * Xc) / Zc2;
            float weightingTerm = -1.0 / log(denominator);
            for (int i = 0; i < 6; i++) {
                pixelJacobi[i] *= commonFirstTerm;
                sJT[i] += pixelJacobi[i];
            }
            for (int i = 0; i < 6; i++) {
                for (int j = i; j < 6; j++) {
                    sJTJ[i * 6 + j] += weightingTerm * pixelJacobi[i] * pixelJacobi[j];
                }
            }
        }
    }
}
This is the part where I call each thread:
vector<std::thread> myThreads;
float step = (roiY.y - roiY.x) / numberOfThreads;
vector<vector<float>> tsJT(numberOfThreads, vector<float>(6, 0));
vector<vector<float>> tsJTJ(numberOfThreads, vector<float>(36, 0));
for (int i = 0; i < numberOfThreads; i++) {
    int start = roiY.x + i * step;
    int end = start + step;
    if (end > roiY.y) end = roiY.y;
    myThreads.push_back(std::thread(&pwp3dV2::jacobiansThread, this, start, end, std::ref(tsJT[i]), std::ref(tsJTJ[i])));
}
vector<float> sJT(6, 0);
vector<float> sJTJ(36, 0);
for (int i = 0; i < numberOfThreads; i++) myThreads[i].join();
Other Notes
To measure time I used glfwGetTime() before and right after the second code snippet. The measurements vary but the average is about 15ms as I mentioned, for both implementations.
Starting a thread has significant overhead, which might not be worth the time if you have only 15 milliseconds worth of work.
The common solution is to keep threads running in the background and send them data when you need them, instead of calling the std::thread constructor to create a new thread every time you have some work to do.
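As a rough illustration, here is a minimal, untested sketch of such a pool (the WorkerPool name and the submit interface are mine, not from the original code):

#include <condition_variable>
#include <functional>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

class WorkerPool {
public:
    explicit WorkerPool(unsigned n) {
        for (unsigned i = 0; i < n; ++i)
            workers.emplace_back([this] { run(); });
    }
    ~WorkerPool() {
        { std::lock_guard<std::mutex> lk(m); done = true; }
        cv.notify_all();
        for (auto &t : workers) t.join();
    }
    // Queue a job for an already-running worker thread.
    void submit(std::function<void()> job) {
        { std::lock_guard<std::mutex> lk(m); jobs.push(std::move(job)); }
        cv.notify_one();
    }
private:
    void run() {
        for (;;) {
            std::function<void()> job;
            {
                std::unique_lock<std::mutex> lk(m);
                cv.wait(lk, [this] { return done || !jobs.empty(); });
                if (done && jobs.empty()) return;
                job = std::move(jobs.front());
                jobs.pop();
            }
            job();  // execute outside the lock
        }
    }
    std::vector<std::thread> workers;
    std::queue<std::function<void()>> jobs;
    std::mutex m;
    std::condition_variable cv;
    bool done = false;
};

The threads are started once; each call to submit hands a chunk of work to one of them, avoiding the per-call thread creation cost.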
Pure speculation, but two things might be preventing the full power of parallelization:
Processing speed is limited by the memory bus. Cores will wait until data is loaded before continuing.
Data sharing between cores. Some caches are core specific. If memory is shared between cores, data must traverse down to shared cache before loading.
On Linux you can use Perf to check for cache misses.
If you want better timings, you should decouple the per-iteration work from the loop counter, which needs some preprocessing: something fast, like building an array of structures with a header for each segment, or similar. If you can't think of anything better, you can simply build a vector<int> holding the counter values and then run for_each(std::execution::par, ...) over it. Much faster.
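A minimal sketch of that idea (untested; assumes C++17 parallel algorithms, and processPixelRow is a hypothetical stand-in for the real per-row work):

#include <algorithm>
#include <execution>
#include <numeric>
#include <vector>

// Stand-in for the real per-row work (hypothetical helper).
void processPixelRow(int y) { /* ... process row y ... */ }

void processAllRows(int firstRow, int lastRow) {
    // One counter value per image row.
    std::vector<int> rows(lastRow - firstRow);
    std::iota(rows.begin(), rows.end(), firstRow);
    // The standard library distributes the rows across threads.
    std::for_each(std::execution::par, rows.begin(), rows.end(),
                  [](int y) { processPixelRow(y); });
}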
For timings, there's:
auto t2 = std::chrono::system_clock::now();
std::chrono::milliseconds f = std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);
I have this function:
bool interpolate(const Mat &im, float ofsx, float ofsy, float a11, float a12, float a21, float a22, Mat &res)
{
    bool ret = false;
    // input size (-1 for the safe bilinear interpolation)
    const int width = im.cols - 1;
    const int height = im.rows - 1;
    // output size
    const int halfWidth = res.cols >> 1;
    const int halfHeight = res.rows >> 1;
    float *out = res.ptr<float>(0);
    const float *imptr = im.ptr<float>(0);
    for (int j = -halfHeight; j <= halfHeight; ++j)
    {
        const float rx = ofsx + j * a12;
        const float ry = ofsy + j * a22;
        #pragma omp simd
        for (int i = -halfWidth; i <= halfWidth; ++i, out++)
        {
            float wx = rx + i * a11;
            float wy = ry + i * a21;
            const int x = (int) floor(wx);
            const int y = (int) floor(wy);
            if (x >= 0 && y >= 0 && x < width && y < height)
            {
                // compute weights
                wx -= x; wy -= y;
                int rowOffset = y * im.cols;
                int rowOffset1 = (y + 1) * im.cols;
                // bilinear interpolation
                *out =
                    (1.0f - wy) * ((1.0f - wx) * imptr[rowOffset + x] + wx * imptr[rowOffset + x + 1]) +
                    (       wy) * ((1.0f - wx) * imptr[rowOffset1 + x] + wx * imptr[rowOffset1 + x + 1]);
            } else {
                *out = 0;
                ret = true; // touching boundary of the input
            }
        }
    }
    return ret;
}
halfWidth is very random: it can be 9, 84, 20, 95, 111... I'm only trying to optimize this code; I don't understand it in detail.
As you can see, the inner for loop has already been vectorized, but Intel Advisor makes a suggestion (quoted below; the screenshot is not reproduced here).
And this is the Trip Count analysis result (again from a screenshot that is not reproduced here):
To my understanding this means that:
The vector length is 8, which means that 8 floats can be processed at the same time in each iteration. This would mean (if I'm not wrong) that the data is 32-byte aligned (even though, as I explain here, the compiler seems to think that the data is not aligned).
On average, 2 iterations run in the fully vectorized body, while 3 iterations fall into the remainder loop. The same goes for Min and Max. Otherwise I don't understand what the ';' means.
Now my question is: how can I follow Intel Advisor's first suggestion? It says to "increase the size of objects and add iterations so the trip count is a multiple of vector length"... Ok, so it's simply saying "hey man, make it so that halfWidth*2+1 (since the loop goes from -halfWidth to +halfWidth) is a multiple of 8". But how can I do this? If I add random iterations, this would obviously break the algorithm!
The only solution that came to my mind is to add "fake" iterations like this:
const int vectorLength = 8;
const int iterations = halfWidth * 2 + 1;
const int remainder = iterations % vectorLength;
const int padding = (remainder == 0) ? 0 : vectorLength - remainder;
for (int i = 0; i < iterations + padding; i++) {
    // this iteration was not supposed to exist, skip it!
    if (i >= iterations)
        continue;
}
Of course this code would not work as written, since the real loop goes from -halfWidth to +halfWidth, but it's to make you understand my strategy of "fake" iterations.
About the second option ("Increase the size of static and automatic objects, and use a compiler option to add data padding") I have no idea how to implement this.
First, you have to check the Vectorization Efficiency metric in Advisor, as well as the relative time spent in the Loop Remainder compared to the Loop Body (see the hotspots list in Advisor). If efficiency is close to 100% (or the time spent in the remainder is very small), then it is not worth the effort (and money, as MSalters mentioned in the comments).
If it is << 100% (and there are no other penalties reported by the tool), then you can either refactor the code to "add fake iterations" (rare users can afford this) or try #pragma loop_count for the most typical iteration counts (depending on the typical halfWidth value); see the sketch after the next paragraph.
If halfWidth is totally random (with no common or average values), then there is nothing you can really do about this issue.
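For reference, a loop_count hint might look like this (Intel-compiler-specific; the min/avg/max values here are made up for illustration and should come from your own Trip Count data):

// Hint the Intel compiler about typical trip counts so it can tune
// the vectorized body vs. remainder split (values are illustrative).
#pragma loop_count min(19), avg(41), max(223)
for (int i = -halfWidth; i <= halfWidth; ++i, out++) {
    // ... same loop body as above ...
}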
As an exercise, I'm translating my master's thesis finite-difference time-domain (FDTD) code for simulation of wave propagation from Matlab to C++, and I've come across the following problem.
I would like to create a class that corresponds to a non-physical absorbing layer called a CPML. The size of the layer depends on the desired parameters of the simulation, so the arrays that define the absorbing layer have to be dynamic.
#ifndef fdtd_h
#define fdtd_h

#include <cmath>
#include <iostream>
#include <sstream>

using namespace std;

class cpml {
public:
    int thickness;
    int n_1, n_2, n_3;
    double cut_off_freq;
    double kappa_x_max, sigma_x_1_max, sigma_x_2_max, alpha_x_max;
    double *kappa_x_tau_xy, *sigma_x_tau_xy, *alpha_x_tau_xy;
    void set_cpml_parameters_tau_xy();
};

void cpml::set_cpml_parameters_tau_xy(){
    double temp1[thickness], temp2[thickness], temp3[thickness];
    for(int j = 1; j < thickness; j++){
        temp1[j] = 1 + kappa_x_max * pow((double)(thickness - j - 0.5) / (double)(thickness - 1), n_1);
        temp2[j] = sigma_x_1_max * pow((double)(thickness - j - 0.5) / (double)(thickness - 1), n_1 + n_2);
        temp3[j] = alpha_x_max * pow((double)(j - 0.5) / (double)(thickness - 1), n_3);
    }
    kappa_x_tau_xy = temp1;
    sigma_x_tau_xy = temp2;
    for(int i = 1; i < thickness; i++){
        cout << sigma_x_tau_xy[i] << endl;
    }
    alpha_x_tau_xy = temp3;
}

#endif /* fdtd_h */
When I call the function cpml::set_cpml_parameters_tau_xy() in my main function, the first value of the array sigma_x_tau_xy is correct; however, the further values aren't.
#include "fdtd.h"
using namespace std;
int main() {
cpml cpml;
int cpml_thickness = 10;
cpml.thickness = cpml_thickness;
int n_1 = 3, n_2 = 0, n_3 = 3;
cpml.n_1 = n_1; cpml.n_2 = n_2; cpml.n_3 = n_3;
double cut_off_freq = 1;
cpml.cut_off_freq = cut_off_freq;
double kappa_x_max = 0;
double sigma_x_1_max = 0.8 * (n_1 + 1) / (sqrt(simulation_medium.mu/simulation_medium.rho) * simulation_grid.big_delta_x), sigma_x_2_max = 0.8 * (n_1 + 1) / (sqrt(simulation_medium.mu/simulation_medium.rho) * simulation_grid.big_delta_x);
double alpha_x_max = 2 * PI * cpml.cut_off_freq;
double kappa_y_max = 0;
double sigma_y_1_max = 0.8 * (n_1 + 1) / (sqrt(simulation_medium.mu/simulation_medium.rho) * simulation_grid.big_delta_y), sigma_y_2_max = 0.8 * (n_1 + 1) / (sqrt(simulation_medium.mu/simulation_medium.rho) * simulation_grid.big_delta_y);
double alpha_y_max = 2 * PI * cpml.cut_off_freq;
cpml.kappa_x_max = kappa_x_max; cpml.sigma_x_1_max = sigma_x_1_max; cpml.sigma_x_2_max = sigma_x_2_max; cpml.alpha_x_max = alpha_x_max;
cpml.kappa_y_max = kappa_y_max; cpml.sigma_y_1_max = sigma_y_1_max; cpml.sigma_y_2_max = sigma_y_2_max; cpml.alpha_y_max = alpha_y_max;
cpml.set_cpml_parameters_tau_xy();
for(int j = 1; j < cpml.thickness; j++){
cout << *(cpml.sigma_x_tau_xy + j) << endl;
}
}
What am I doing wrong, and how do I make the dynamic array members of the class cpml contain the correct values when accessed in the main function?
Two problems: The lesser of them is that your program is technically not a valid C++ program, since C++ doesn't have variable-length arrays (which your arrays temp1, temp2 and temp3 are).
The more serious problem is that you save pointers to local variables. When a function returns, local variables go out of scope and no longer exist. Pointers to them will become invalid, and using those pointers will lead to undefined behavior.
Both problems are easily solved by using std::vector instead of arrays and pointers.
You cannot declare an array in C++ without a "constant" expression for its size (the bounds must be known at compile time). That means this code is invalid:
double temp1[thickness], temp2[thickness], temp3[thickness];
What you should instead do is the following:
class cpml
{
    //...
    std::vector<double> kappa_x_tau_xy, sigma_x_tau_xy, alpha_x_tau_xy;
    // ...
};
void cpml::set_cpml_parameters_tau_xy(){
    alpha_x_tau_xy.resize(thickness);
    kappa_x_tau_xy.resize(thickness);
    sigma_x_tau_xy.resize(thickness);
    //...
}
std::vector will handle all the dynamic allocation under the hood for you. If your code compiled, it was because you were using a nonstandard GCC extension for variable length arrays. Turn your warnings up -Wall -pedantic -Werror when you compile and it should complain more.
Note that you also have issues in array bounds. Whereas Matlab is 1-indexed, C++ is 0-indexed, so you'll need to do this, too:
for(int j = 0; j < thickness; j++){
    kappa_x_tau_xy[j] = 1 + kappa_x_max * pow((double)(thickness - j - 0.5) / (double)(thickness - 1), n_1);
    sigma_x_tau_xy[j] = sigma_x_1_max * pow((double)(thickness - j - 0.5) / (double)(thickness - 1), n_1 + n_2);
    alpha_x_tau_xy[j] = alpha_x_max * pow((double)(j - 0.5) / (double)(thickness - 1), n_3);
}
You have a similar issue in main:
for(int j = 1; j < cpml.thickness; j++){
    cout << *(cpml.sigma_x_tau_xy + j) << endl;
}
Should become:
for(int j = 0; j < cpml.thickness; j++){
    cout << cpml.sigma_x_tau_xy[j] << endl;
}
Additional Notes:
Your code is very unstructured. Consider putting all of the cpml-related getting and setting into the cpml class itself (see Encapsulation: https://en.wikipedia.org/wiki/Encapsulation_(computer_programming)). This will make it easier for the client (you, in this case) to interact with the object.
This will include hiding your class data as protected or private and exposing functions to get and set those variables (don't forget const where appropriate).
Add a constructor to initialize all of the fields at once (see the sketch after these notes). As it stands now, your class consists of mostly uninitialized garbage for much of its lifetime. If someone were to prematurely access a field, you're in undefined behavior territory.
std::endl is good for printing newline characters, but restrict it to debug-only code. The reason is that it flushes the buffer every time it's called, which can make your code slower overall if it prints a lot. Use a newline character "\n" instead for release builds.
An additional benefit of std::vector is that it makes copying and assigning a cpml well behaved. Otherwise, the compiler will generate a copy constructor and copy assignment that perform a shallow copy instead of the deep copy that you'd want.
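A minimal constructor sketch, to be added inside the class definition (untested; the parameter list is assumed from the x-direction fields shown in the class above):

// Initializes every field at construction time; the vector members are
// sized immediately, so no raw pointers are needed.
cpml(int thickness, int n_1, int n_2, int n_3, double cut_off_freq,
     double kappa_x_max, double sigma_x_1_max, double sigma_x_2_max,
     double alpha_x_max)
    : thickness(thickness), n_1(n_1), n_2(n_2), n_3(n_3),
      cut_off_freq(cut_off_freq), kappa_x_max(kappa_x_max),
      sigma_x_1_max(sigma_x_1_max), sigma_x_2_max(sigma_x_2_max),
      alpha_x_max(alpha_x_max),
      kappa_x_tau_xy(thickness), sigma_x_tau_xy(thickness),
      alpha_x_tau_xy(thickness) {}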
After restructuring your class, your main might look something like this:
int main() {
    int cpml_thickness = 10;
    int n_1 = 3, n_2 = 0, n_3 = 3;
    double cut_off_freq = 1;
    double kappa_x_max = 0;
    double sigma_x_1_max = 0.8 * (n_1 + 1) / (sqrt(simulation_medium.mu/simulation_medium.rho) * simulation_grid.big_delta_x), sigma_x_2_max = 0.8 * (n_1 + 1) / (sqrt(simulation_medium.mu/simulation_medium.rho) * simulation_grid.big_delta_x);
    double alpha_x_max = 2 * PI * cut_off_freq;
    double kappa_y_max = 0;
    double sigma_y_1_max = 0.8 * (n_1 + 1) / (sqrt(simulation_medium.mu/simulation_medium.rho) * simulation_grid.big_delta_y), sigma_y_2_max = 0.8 * (n_1 + 1) / (sqrt(simulation_medium.mu/simulation_medium.rho) * simulation_grid.big_delta_y);
    double alpha_y_max = 2 * PI * cut_off_freq;
    cpml cpml(cpml_thickness, n_1, n_2, n_3, cut_off_freq, kappa_x_max, kappa_y_max, sigma_x_1_max, sigma_x_2_max, sigma_y_1_max, sigma_y_2_max, alpha_x_max, alpha_y_max);
    cpml.set_cpml_parameters_tau_xy();
    cpml.PrintSigmaTauXY(std::cout);
}
Which is arguably better. (You might use a getter to get sigma_tau_xy from the class and then print it yourself, though.) Then you can think about how to simplify things even further by creating objects that represent the logical groupings of alpha_x_max, alpha_y_max, etc. This could be a std::pair or a full-on struct with its own getters and setters. Now their logic is grouped together and is easy to pass around, reference, and think about. Your constructor for cpml also becomes simpler, accepting a single parameter that represents both x and y instead of separate ones for each.
Matlab doesn't really encourage an object-oriented approach in my (admittedly brief) experience, but in C++ it's easy.
I try to optimise the following loop with OpenMP:
#pragma omp parallel for private(diff)
for (int j = 0; j < x.d; ++j) {
    diff = x(example,j) - x(chosen_pts[ndx - 1],j);
    #pragma omp atomic
    d2 += diff * diff;
}
But it runs actually 4x slower than without #pragma.
EDIT
As Piotr S., coincoin and erenon pointed out, in my case x.d is so small that the parallelization overhead outweighs any gain, which is why my code runs slower. I post the outer loop too; maybe there is some possibility for multithreading there (x.n is over 100 million):
float sum_distribution = 0.0;
// look for the point that is furthest from any center
float max_dist = 0.0;
for (int i = 0; i < x.n; ++i) {
    int example = dist2[i].second;
    float d2 = 0.0, diff;
    //#pragma omp parallel for private(diff) reduction(+:d2)
    for (int j = 0; j < x.d; ++j) {
        diff = x(example,j) - x(chosen_pts[ndx - 1],j);
        d2 += diff * diff;
    }
    if (d2 < dist2[i].first) {
        dist2[i].first = d2;
    }
    if (dist2[i].first > max_dist) {
        max_dist = dist2[i].first;
    }
    sum_distribution += dist2[i].first;
}
If someone is interested, here is the whole function: https://github.com/ghamerly/baylorml/blob/master/fast_kmeans/general_functions.cpp#L169 — as I measured it, 85% of the elapsed time comes from this loop.
Yes, the outer loop, as posted, can be parallelized with OpenMP.
All variables modified in the loop are either local to an iteration or are used for aggregation over the loop. And I assume that calls to x() in the calculation of diff have no side effects.
To do aggregation in parallel correctly and efficiently, you need to use an OpenMP loop with reduction clause. For sum_distribution the reduction operation is +, and for max_dist it's max. So, adding the following pragma in front of the outer loop should do the job:
#pragma omp parallel for reduction(+:sum_distribution) reduction(max:max_dist)
Note that max as a reduction operation can only be used since OpenMP 3.1. It's not that new, so most OpenMP-enabled compilers already support it, but not all, or you might be using an older version. So it makes sense to consult the documentation for your compiler.
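Put together, the parallelized outer loop would look like this (an untested sketch of the loop from the question with the pragma applied):

#pragma omp parallel for reduction(+:sum_distribution) reduction(max:max_dist)
for (int i = 0; i < x.n; ++i) {
    // example, d2 and diff are declared inside the loop, so they are
    // automatically private to each iteration.
    int example = dist2[i].second;
    float d2 = 0.0, diff;
    for (int j = 0; j < x.d; ++j) {
        diff = x(example,j) - x(chosen_pts[ndx - 1],j);
        d2 += diff * diff;
    }
    if (d2 < dist2[i].first) {
        dist2[i].first = d2;   // each iteration writes a distinct element
    }
    if (dist2[i].first > max_dist) {
        max_dist = dist2[i].first;
    }
    sum_distribution += dist2[i].first;
}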
I'm simulating a stochastic differential equation with a Monte Carlo method, which in principle is perfectly suited for OpenMP, as different realizations do not depend on each other. Unfortunately, I'm facing some problems with my code, which produces wrong results as soon as I turn on OpenMP. Without it, everything works perfectly fine. My 'critical' loop looks like this:
double price = 0.0;
#pragma omp parallel for private(VOld, VNew)
for (long i = 0; i < NSim; ++i){
    VOld = S_0;
    for (long index = 0; index < Nt; ++index){
        VNew = VOld + (dt * r * VOld) + (sqrdt * sig * VOld * dW());
        VOld = VNew;
    }
    double tmp = myOption.PayOff(VNew);
    price += (tmp)/double(NSim);
}
I would truly appreciate any help. Thank you in advance :-)
A common mistake is forgetting that each thread must have its own random number generator. If that's not the case, each call to dW will be messing with the internal state of the (shared, instead of private) random number generator.
I hope this helps.
Well, one problem I see is that you have a race condition on the variable price. You should be doing a reduction:
#pragma omp parallel for private(VOld, VNew) reduction(+:price)
The same goes for your variable OptionPrice.
Also, it looks to me like rng is still shared, not private. You should define it in the parallel block if you want it private, or declare it private (for private variables I prefer to declare them in the parallel block, which automatically makes them private, rather than list them in a private clause).
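For example, a minimal sketch of the per-thread generator idea (untested; the seeding scheme is illustrative only, and omp_get_thread_num requires omp.h):

#pragma omp parallel
{
    // Each thread constructs its own generator, seeded with its thread id
    // so the streams differ (a simple scheme, not statistically rigorous).
    boost::mt19937 rng(static_cast<unsigned>(time(NULL)) + omp_get_thread_num());
    boost::normal_distribution<> nd(0.0, 1.0);
    boost::variate_generator< boost::mt19937&, boost::normal_distribution<> > dW(rng, nd);

    #pragma omp for reduction(+:price) private(VOld, VNew)
    for (long i = 0; i < NSim; ++i){
        // ... same body as above, using this thread's dW() ...
    }
}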
Ok, so based on jmbr's and raxman's answers I moved the inner loop to a separate function and made sure that rng is now really private. Note also the seeding trick, which turns out to be vital. On top of that I introduced a reduction on OptionPrice. The code below works fine.
double SimulateStockPrice(const double InitialPrize, const double dt, const long Nt, const double r, const double sig){
    // Each call creates its own private generator; the static counter makes
    // successive seeds differ even within the same second (note: ++seed is
    // itself unsynchronized across threads).
    static unsigned long seed = 0;
    boost::mt19937 *rng = new boost::mt19937();
    rng -> seed((++seed) + time(NULL));
    boost::normal_distribution<> nd(0.0, 1.0);
    boost::variate_generator< boost::mt19937, boost::normal_distribution<> > dW(*rng, nd);
    double sqrdt = sqrt(dt);
    double PriceNew(0.0), PriceOld(InitialPrize);
    for (long index = 0; index < Nt; ++index){
        PriceNew = PriceOld + (dt * r * PriceOld) + (sqrdt * sig * PriceOld * dW());
        PriceOld = PriceNew;
    }
    delete rng;
    return PriceNew;
}
Then in the big loop I go with:
#pragma omp parallel for default(none) shared(dt, NSim, Nt, S_0, myOption) reduction(+:OptionPrice)
for (long i = 0; i < NSim; ++i){
    double StockPrice = SimulateStockPrice(S_0, dt, Nt, myOption.r, myOption.sig);
    double PayOff = myOption.myPayOffFunction(StockPrice);
    OptionPrice += PayOff;
}
And off you go :-)