10-dimensional Monte Carlo integration with OpenMP - C++

I am trying to learn parallelization with OpenMP. I have written a C++ program which calculates a 10-dimensional integral through Monte Carlo for the function:
F = x1 + x2 + x3 + ... + x10
Now I am trying to convert it to work with OpenMP using 4 threads. My serial code gives sensible output, so I am fairly convinced that it works fine. I want to print the result for every 4^k sample points, up to N total sample points.
Here is my serial code:
/* compile with
$ g++ -o monte ND_MonteCarlo.cpp
$ ./monte N
unsigned long long int for i, N
Maximum value for UNSIGNED LONG LONG INT 18446744073709551615
*/
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cmath>
#include <cstdlib>
#include <ctime>
using namespace std;
//define multivariate function F(x1, x2, ...xk)
double f(double x[], int n)
{
double y;
int j;
y = 0.0;
for (j = 0; j < n; j = j+1)
{
y = y + x[j];
}
return y;
}
//define function for Monte Carlo Multidimensional integration
double int_mcnd(double(*fn)(double[],int),double a[], double b[], int n, int m)
{
double r, x[n], v;
int i, j;
r = 0.0;
v = 1.0;
// step 1: calculate the common factor V
for (j = 0; j < n; j = j+1)
{
v = v*(b[j]-a[j]);
}
// step 2: integration
for (i = 1; i <= m; i=i+1)
{
// calculate random x[] points
for (j = 0; j < n; j = j+1)
{
x[j] = a[j] + (rand()) /( (RAND_MAX/(b[j]-a[j])));
}
r = r + fn(x,n);
}
r = r*v/m;
return r;
}
double f(double[], int);
double int_mcnd(double(*)(double[],int), double[], double[], int, int);
int main(int argc, char **argv)
{
/* define how many integrals */
const int n = 10;
double b[n] = {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,5.0};
double a[n] = {-5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0,-5.0};
double result, mean;
int m;
unsigned long long int i, N;
// initial seed value (use system time)
srand(time(NULL));
cout.precision(6);
cout.setf(ios::fixed | ios::showpoint);
// current time in seconds (begin calculations)
time_t seconds_i;
seconds_i = time (NULL);
m = 4; // initial number of intervals
// convert command-line input to N = number of points
N = atoi( argv[1] );
for (i=0; i <=N/pow(4,i); i++)
{
result = int_mcnd(f, a, b, n, m);
mean = result/(pow(10,10));
cout << setw(30) << m << setw(30) << result << setw(30) << mean <<endl;
m = m*4;
}
// current time in seconds (end of calculations)
time_t seconds_f;
seconds_f = time (NULL);
cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;
return 0;
}
and output:
N integral mean_integral
4 62061079725.185936 6.206108
16 33459275100.477665 3.345928
64 -2204654740.788784 -0.220465
256 4347440045.990804 0.434744
1024 -1265056243.116922 -0.126506
4096 681660387.953380 0.068166
16384 -799507050.896809 -0.079951
65536 -462592561.594820 -0.046259
262144 50902035.836772 0.005090
1048576 -91104861.129695 -0.009110
4194304 3746742.588701 0.000375
16777216 -32967862.853915 -0.003297
67108864 17730924.602974 0.001773
268435456 -416824.977687 -0.00004
1073741824 2843188.477219 0.000284
But I think my parallel code is not working at all; I know I'm doing something silly, of course. Since my number of threads is 4, I wanted to divide the results by 4, but the output is ridiculous.
Here is a parallel version of the same code:
/* compile with
$ g++ -fopenmp -Wunknown-pragmas -std=c++11 -o mcOMP parallel_ND_MonteCarlo.cpp -lm
$ ./mcOMP N
unsigned long long int for i, N
Maximum value for UNSIGNED LONG LONG INT 18446744073709551615
*/
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <omp.h>
using namespace std;
//define multivariate function F(x1, x2, ...xk)
double f(double x[], int n)
{
double y;
int j;
y = 0.0;
for (j = 0; j < n; j = j+1)
{
y = y + x[j];
}
return y;
}
//define function for Monte Carlo Multidimensional integration
double int_mcnd(double(*fn)(double[],int),double a[], double b[], int n, int m)
{
double r, x[n], v;
int i, j;
r = 0.0;
v = 1.0;
// step 1: calculate the common factor V
#pragma omp for
for (j = 0; j < n; j = j+1)
{
v = v*(b[j]-a[j]);
}
// step 2: integration
#pragma omp for
for (i = 1; i <= m; i=i+1)
{
// calculate random x[] points
for (j = 0; j < n; j = j+1)
{
x[j] = a[j] + (rand()) /( (RAND_MAX/(b[j]-a[j])));
}
r = r + fn(x,n);
}
r = r*v/m;
return r;
}
double f(double[], int);
double int_mcnd(double(*)(double[],int), double[], double[], int, int);
int main(int argc, char **argv)
{
/* define how many integrals */
const int n = 10;
double b[n] = {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0};
double a[n] = {-5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0,-5.0};
double result, mean;
int m;
unsigned long long int i, N;
int NumThreads = 4;
// initial seed value (use system time)
srand(time(NULL));
cout.precision(6);
cout.setf(ios::fixed | ios::showpoint);
// current time in seconds (begin calculations)
time_t seconds_i;
seconds_i = time (NULL);
m = 4; // initial number of intervals
// convert command-line input to N = number of points
N = atoi( argv[1] );
#pragma omp parallel private(result, mean) shared(N, m) num_threads(NumThreads)
for (i=0; i <=N/pow(4,i); i++)
{
result = int_mcnd(f, a, b, n, m);
mean = result/(pow(10,10));
#pragma omp master
cout << setw(30) << m/4 << setw(30) << result/4 << setw(30) << mean/4 <<endl;
m = m*4;
}
// current time in seconds (end of calculations)
time_t seconds_f;
seconds_f = time (NULL);
cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;
return 0;
}
I want only the master thread to output the values.
I compiled with:
g++ -fopenmp -Wunknown-pragmas -std=c++11 -o mcOMP parallel_ND_MonteCarlo.cpp -lm
Your help and suggestions to fix the code are most appreciated. Thanks a lot.

Let's see what your program does. At the omp parallel directive, your threads are spawned, and they will execute the remaining code in parallel. Operations like:
m = m * 4;
are undefined (and generally make no sense, as they are executed four times per iteration).
Further, when those threads encounter an omp for, they will share the work of the loop, i.e. each iteration will be executed only once by some thread. Since int_mcnd is executed within a parallel region, all its local variables are private. You have no construct in your code to actually collect those private results (result and mean are private as well).
The correct approach is to use a parallel for loop with a reduction clause, indicating that there is a variable (r/v) that is being aggregated throughout the execution of the loop.
To allow this, the reduction variables need to be declared as shared, outside of the scope of the parallel region. The easiest solution is to move the parallel region inside of int_mcnd. This also avoids the race condition on m.
There is one more hurdle: rand uses global state, and at least in my implementation it takes a lock. Since most of the time is spent in rand, your code would scale horribly. The solution is to use explicit thread-private state via rand_r (see also this question).
Piecing it together, the modified code looks like this:
double int_mcnd(double (*fn)(double[], int), double a[], double b[], int n, int m)
{
// Reduction variables need to be shared
double r = 0.0;
double v = 1.0;
#pragma omp parallel
// All variables declared inside are private
{
// step 1: calculate the common factor V
#pragma omp for reduction(* : v)
for (int j = 0; j < n; j = j + 1)
{
v = v * (b[j] - a[j]);
}
// step 2: integration
unsigned int private_seed = omp_get_thread_num();
#pragma omp for reduction(+ : r)
for (int i = 1; i <= m; i = i + 1)
{
// Note: X MUST be private, otherwise, you have race-conditions again
double x[n];
// calculate random x[] points
for (int j = 0; j < n; j = j + 1)
{
x[j] = a[j] + (rand_r(&private_seed)) / ((RAND_MAX / (b[j] - a[j])));
}
r = r + fn(x, n);
}
}
r = r * v / m;
return r;
}
double f(double[], int);
double int_mcnd(double (*)(double[], int), double[], double[], int, int);
int main(int argc, char** argv)
{
/* define how many integrals */
const int n = 10;
double b[n] = { 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0 };
double a[n] = { -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0 };
int m;
unsigned long long int i, N;
int NumThreads = 4;
// initial seed value (use system time)
srand(time(NULL));
cout.precision(6);
cout.setf(ios::fixed | ios::showpoint);
// current time in seconds (begin calculations)
time_t seconds_i;
seconds_i = time(NULL);
m = 4; // initial number of intervals
// convert command-line input to N = number of points
N = atoi(argv[1]);
for (i = 0; i <= N / pow(4, i); i++)
{
double result = int_mcnd(f, a, b, n, m);
double mean = result / (pow(10, 10));
cout << setw(30) << m << setw(30) << result << setw(30) << mean << endl;
m = m * 4;
}
// current time in seconds (end of calculations)
time_t seconds_f;
seconds_f = time(NULL);
cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;
return 0;
}
Note that I removed the division by four, and also that the output is done outside of the parallel region. The results should be similar (except for randomness, of course) to the serial version.
I observe a perfect 16x speedup on a 16-core system with -O3.
A few more remarks:
Declare variables as locally as possible.
If thread creation overhead were a problem, you could move the parallel region outside of the loop, but then you would need to think more carefully about the parallel execution and find a solution for the shared reduction variables. Given the embarrassingly parallel nature of Monte Carlo codes, you could also stick more closely to your initial solution by removing the omp for directives - which means each thread executes all loop iterations. Then you could manually sum up the result variables and print that, as in the sketch below. But I don't really see the point.
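For illustration only, here is a minimal sketch of that manual-summation variant (int_mcnd_manual is a hypothetical name; it assumes the same f, a, b, n, and m as above, and it is not the approach I would recommend over the reduction version):
double int_mcnd_manual(double (*fn)(double[], int), double a[], double b[], int n, int m)
{
    double v = 1.0;
    for (int j = 0; j < n; j++)
        v = v * (b[j] - a[j]);

    double total = 0.0; // shared accumulator for the per-thread sums
    int nthreads = 1;
    #pragma omp parallel
    {
        #pragma omp single
        nthreads = omp_get_num_threads(); // actual team size

        // no omp for: EVERY thread runs all m samples with its own seed
        unsigned int seed = omp_get_thread_num();
        double local_r = 0.0; // private partial sum
        for (int i = 0; i < m; i++)
        {
            double x[n];
            for (int j = 0; j < n; j++)
                x[j] = a[j] + rand_r(&seed) / (RAND_MAX / (b[j] - a[j]));
            local_r = local_r + fn(x, n);
        }
        #pragma omp atomic
        total += local_r; // manual reduction instead of reduction(+ : r)
    }
    // every thread sampled m points, so average over m * nthreads
    return total * v / ((double)m * nthreads);
}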

I will not go into details, but will give some pointers on where to look.
Take for example this part of the code:
// step 1: calculate the common factor V
#pragma omp for
for (j = 0; j < n; j = j+1)
{
v = v*(b[j]-a[j]);
}
If you look at the variable v, there is a clear case of a race condition: all threads read and update it concurrently. That is, you have to declare v private to the thread (maybe call it local_v) and then gather all the values into a global_v value, for example through a reduction operation.
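A minimal sketch of that idea, as a drop-in replacement for the loop above (local_v and global_v are hypothetical names; the reduction clause in the other answer expresses the same thing more concisely):
double global_v = 1.0; // shared result
#pragma omp parallel
{
    double local_v = 1.0; // private to each thread: no race

    #pragma omp for
    for (int j = 0; j < n; j++)
        local_v = local_v * (b[j] - a[j]);

    #pragma omp critical // gather the private partial products
    global_v = global_v * local_v;
}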
In general, I would advise you to read up on race conditions, critical regions, and the concepts of shared and private memory in OpenMP.

Related

Why is multi-threading of matrix calculation not faster than single-core?

This is my first time using multi-threading to speed up a heavy calculation.
Background: The idea is to calculate a Kernel Covariance matrix by reading a list of 3D points x_test and calculating the corresponding matrix, which has dimensions x_test.size() x x_test.size().
I already sped up the calculations by only calculating the lower triangular matrix. Since all the calculations are independent of each other, I tried to speed up the process (x_test.size() = 27000 in my case) by splitting the calculation of the matrix entries row-wise, assigning a range of rows to each thread.
On a single core the calculations took about 280 seconds each time; on 4 cores it took 270-290 seconds.
main.cpp
int main(int argc, char *argv[]) {
double sigma0sq = 1;
double lengthScale [] = {0.7633, 0.6937, 3.3307e+07};
const std::vector<std::vector<double>> x_test = parse2DCsvFile(inputPath);
/* Finding data slices of similar size */
//This piece of code works, each thread is assigned roughly the same number of matrix entries
int numElements = x_test.size()*x_test.size()/2;
const int numThreads = 4;
int elemsPerThread = numElements / numThreads;
std::vector<int> indices;
int j = 0;
for(std::size_t i=1; i<x_test.size()+1; ++i){
int prod = i*(i+1)/2 - j*(j+1)/2;
if (prod > elemsPerThread) {
i--;
j = i;
indices.push_back(i);
if(indices.size() == numThreads-1)
break;
}
}
indices.insert(indices.begin(), 0);
indices.push_back(x_test.size());
/* Spreading calculations to multiple threads */
std::vector<std::thread> threads;
for(std::size_t i = 1; i < indices.size(); ++i){
threads.push_back(std::thread(calculateKMatrixCpp, x_test, lengthScale, sigma0sq, i, indices.at(i-1), indices.at(i)));
}
for(auto & th: threads){
th.join();
}
return 0;
}
As you can see, each thread performs the following calculations on the data assigned to it:
void calculateKMatrixCpp(const std::vector<std::vector<double>> xtest, double lengthScale[], double sigma0sq, int threadCounter, int start, int stop){
char buffer[8192];
std::ofstream out("lower_half_matrix_" + std::to_string(threadCounter) +".csv");
out.rdbuf()->pubsetbuf(buffer, 8192);
for(int i = start; i < stop; ++i){
for(int j = 0; j < i+1; ++j){
double kij = seKernel(xtest.at(i), xtest.at(j), lengthScale, sigma0sq);
if (j!=0)
out << ',';
out << kij;
}
if(i!=xtest.size()-1 )
out << '\n';
}
out.close();
}
and
double seKernel(const std::vector<double> x1,const std::vector<double> x2, double lengthScale[], double sigma0sq) {
double sum(0);
for(std::size_t i=0; i<x1.size();i++){
sum += pow((x1.at(i)-x2.at(i))/lengthScale[i],2);
}
return sigma0sq*exp(-0.5*sum);
}
Aspects I considered
Locking by simultaneous access to the data vector -> I don't pass a reference to the threads, but a copy of the data. I know this is not optimal in terms of RAM usage, but as far as I know it should prevent simultaneous data access, since every thread has its own copy.
Output -> every thread writes its part of the lower triangular matrix to its own file. My task manager doesn't indicate full SSD utilization in the slightest.
Compiler and machine
Windows 11
GNU GCC Compiler
Code::Blocks (although I don't think that should be of importance)
There are many details that can be improved in your code, but I think the two biggest issues are:
using vectors of vectors, which leads to fragmented data;
writing each piece of data to file as soon as its value is computed.
The first point is easy to fix: use something like std::vector<std::array<double, 3>>. In the code below I use an alias to make it more readable:
using Point3D = std::array<double, 3>;
std::vector<Point3D> x_test;
The second point is slightly harder to address. I assume you wanted to write to the disk inside each thread because you couldn't manage to write to a shared buffer that you could then write to a file.
Here is a way to do exactly that:
void calculateKMatrixCpp(
std::vector<Point3D> const& xtest, Point3D const& lengthScale, double sigma0sq,
int threadCounter, int start, int stop, std::vector<double>& kMatrix
) {
// ...
double& kij = kMatrix[i * xtest.size() + j];
kij = seKernel(xtest[i], xtest[j], lengthScale, sigma0sq);
// ...
}
// ...
threads.push_back(std::thread(
calculateKMatrixCpp, x_test, lengthScale, sigma0sq,
i, indices[i-1], indices[i], std::ref(kMatrix)
));
Here, kMatrix is the shared buffer and represents the whole matrix you are trying to compute. You need to pass it to the thread via std::ref. Each thread will write to a different location in that buffer, so there is no need for any mutex or other synchronization.
Once you make these changes and try to write kMatrix to the disk, you will realize that this is the part that takes the most time, by far.
Below is the full code I tried on my machine, and the computation time was about 2 seconds whereas the writing-to-file part took 300 seconds! No amount of multithreading can speed that up.
If you truly want to write all that data to the disk, you may have some luck with file mapping. Computing the exact size needed should be easy enough if all values have the same number of digits, and it looks like you could write the values with multithreading. I have never done anything like that, so I can't really say much more about it, but it looks to me like the fastest way to write multiple gigabytes of memory to the disk.
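For what it's worth, here is a rough sketch of what that might look like on POSIX (entirely hypothetical: it assumes fixed-width formatting so every value's offset is known up front, and I have not benchmarked it):
#include <cstddef>     // size_t
#include <cstdio>      // snprintf
#include <cstring>     // memcpy
#include <fcntl.h>     // open (POSIX)
#include <sys/mman.h>  // mmap, munmap (POSIX)
#include <unistd.h>    // ftruncate, close (POSIX)

// Sketch: write `count` doubles as fixed-width text through a memory map.
// Fixed width is the key assumption: it makes the total file size computable
// in advance for ftruncate, and it would let different threads fill disjoint
// slots of the mapping without any synchronization.
bool writeMapped(const char* path, const double* values, size_t count) {
    const size_t width = 15; // 14 chars of "number," plus one padding byte
    const size_t size = count * width;
    int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (fd < 0) return false;
    if (ftruncate(fd, (off_t)size) != 0) { close(fd); return false; }
    char* map = static_cast<char*>(
        mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
    if (map == MAP_FAILED) { close(fd); return false; }
    for (size_t i = 0; i < count; ++i) {
        char buf[16];
        int len = snprintf(buf, sizeof buf, "%13.6g,", values[i]); // 14 chars
        memcpy(map + i * width, buf, (size_t)len);
        map[i * width + width - 1] = (i + 1) % 10 ? ' ' : '\n'; // padding byte
    }
    munmap(map, size);
    close(fd);
    return true;
}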
#include <vector>
#include <thread>
#include <iostream>
#include <string>
#include <cmath>
#include <array>
#include <random>
#include <fstream>
#include <chrono>
using Point3D = std::array<double, 3>;
auto generateSampleData() -> std::vector<Point3D> {
static std::minstd_rand g(std::random_device{}());
std::uniform_real_distribution<> d(-1.0, 1.0);
std::vector<Point3D> data;
data.reserve(27000);
for (auto i = 0; i < 27000; ++i) {
data.push_back({ d(g), d(g), d(g) });
}
return data;
}
double seKernel(Point3D const& x1, Point3D const& x2, Point3D const& lengthScale, double sigma0sq) {
double sum = 0.0;
for (auto i = 0u; i < 3u; ++i) {
double distance = (x1[i] - x2[i]) / lengthScale[i];
sum += distance*distance;
}
return sigma0sq * std::exp(-0.5*sum);
}
void calculateKMatrixCpp(std::vector<Point3D> const& xtest, Point3D const& lengthScale, double sigma0sq, int threadCounter, int start, int stop, std::vector<double>& kMatrix) {
std::cout << "start of thread " << threadCounter << "\n" << std::flush;
for(int i = start; i < stop; ++i) {
for(int j = 0; j < i+1; ++j) {
double& kij = kMatrix[i * xtest.size() + j];
kij = seKernel(xtest[i], xtest[j], lengthScale, sigma0sq);
}
}
std::cout << "end of thread " << threadCounter << "\n" << std::flush;
}
int main() {
double sigma0sq = 1;
Point3D lengthScale = {0.7633, 0.6937, 3.3307e+07};
const std::vector<Point3D> x_test = generateSampleData();
/* Finding data slices of similar size */
//This piece of code works, each thread is assigned roughly the same number of matrix entries
int numElements = x_test.size()*x_test.size()/2;
const int numThreads = 4;
int elemsPerThread = numElements / numThreads;
std::vector<int> indices;
int j = 0;
for(std::size_t i = 1; i < x_test.size()+1; ++i){
int prod = i*(i+1)/2 - j*(j+1)/2;
if (prod > elemsPerThread) {
i--;
j = i;
indices.push_back(i);
if(indices.size() == numThreads-1)
break;
}
}
indices.insert(indices.begin(), 0);
indices.push_back(x_test.size());
auto start = std::chrono::system_clock::now();
std::vector<double> kMatrix(x_test.size() * x_test.size(), 0.0);
std::vector<std::thread> threads;
for (std::size_t i = 1; i < indices.size(); ++i) {
threads.push_back(std::thread(calculateKMatrixCpp, x_test, lengthScale, sigma0sq, i, indices[i - 1], indices[i], std::ref(kMatrix)));
}
for (auto& t : threads) {
t.join();
}
auto end = std::chrono::system_clock::now();
auto elapsed_seconds = std::chrono::duration<double>(end - start).count();
std::cout << "computation time: " << elapsed_seconds << "s" << std::endl;
start = std::chrono::system_clock::now();
constexpr int buffer_size = 131072;
char buffer[buffer_size];
std::ofstream out("matrix.csv");
out.rdbuf()->pubsetbuf(buffer, buffer_size);
for (int i = 0; i < x_test.size(); ++i) {
for (int j = 0; j < i + 1; ++j) {
if (j != 0) {
out << ',';
}
out << kMatrix[i * x_test.size() + j];
}
if (i != x_test.size() - 1) {
out << '\n';
}
}
end = std::chrono::system_clock::now();
elapsed_seconds = std::chrono::duration<double>(end - start).count();
std::cout << "writing time: " << elapsed_seconds << "s" << std::endl;
}
Okay, I've written an implementation with optimized formatting.
Using @Nelfeal's code, a run was taking around 250 seconds on my system, with the write time dominating by far; or rather, with the std::ofstream formatting taking most of the time.
I've written a C++20 version via std::format_to/std::format. It is a multi-threaded version that takes around 25-40 seconds to complete all the computations, formatting, and writing. Run in a single thread, it takes around 70 seconds on my system. The same performance should be achievable via the fmt library on C++11/14/17.
Here is the code:
import <vector>;
import <thread>;
import <iostream>;
import <string>;
import <cmath>;
import <array>;
import <random>;
import <fstream>;
import <chrono>;
import <format>;
import <filesystem>;
using Point3D = std::array<double, 3>;
auto generateSampleData(Point3D scale) -> std::vector<Point3D>
{
static std::minstd_rand g(std::random_device{}());
std::uniform_real_distribution<> d(-1.0, 1.0);
std::vector<Point3D> data;
data.reserve(27000);
for (auto i = 0; i < 27000; ++i)
{
data.push_back({ d(g)* scale[0], d(g)* scale[1], d(g)* scale[2] });
}
return data;
}
double seKernel(Point3D const& x1, Point3D const& x2, Point3D const& lengthScale, double sigma0sq) {
double sum = 0.0;
for (auto i = 0u; i < 3u; ++i) {
double distance = (x1[i] - x2[i]) / lengthScale[i];
sum += distance * distance;
}
return sigma0sq * std::exp(-0.5 * sum);
}
void calculateKMatrixCpp(std::vector<Point3D> const& xtest, Point3D lengthScale, double sigma0sq, int threadCounter, int start, int stop, std::filesystem::path localPath)
{
using namespace std::string_view_literals;
std::vector<char> buffer;
buffer.reserve(15'000);
std::ofstream out(localPath);
std::cout << std::format("starting thread {}: from {} to {}\n"sv, threadCounter, start, stop);
for (int i = start; i < stop; ++i)
{
for (int j = 0; j < i; ++j)
{
double kij = seKernel(xtest[i], xtest[j], lengthScale, sigma0sq);
std::format_to(std::back_inserter(buffer), "{:.6g}, "sv, kij);
}
double kii = seKernel(xtest[i], xtest[i], lengthScale, sigma0sq);
std::format_to(std::back_inserter(buffer), "{:.6g}\n"sv, kii);
out.write(buffer.data(), buffer.size());
buffer.clear();
}
}
int main() {
double sigma0sq = 1;
Point3D lengthScale = { 0.7633, 0.6937, 3.3307e+07 };
const std::vector<Point3D> x_test = generateSampleData(lengthScale);
/* Finding data slices of similar size */
//This piece of code works, each thread is assigned roughly the same number of matrix entries
int numElements = x_test.size() * (x_test.size()+1) / 2;
const int numThreads = 3;
int elemsPerThread = numElements / numThreads;
std::vector<int> indices;
int j = 0;
for (std::size_t i = 1; i < x_test.size() + 1; ++i) {
int prod = i * (i + 1) / 2 - j * (j + 1) / 2;
if (prod > elemsPerThread) {
i--;
j = i;
indices.push_back(i);
if (indices.size() == numThreads - 1)
break;
}
}
indices.insert(indices.begin(), 0);
indices.push_back(x_test.size());
auto start = std::chrono::system_clock::now();
std::vector<std::thread> threads;
using namespace std::string_view_literals;
for (std::size_t i = 1; i < indices.size(); ++i)
{
threads.push_back(std::thread(calculateKMatrixCpp, std::ref(x_test), lengthScale, sigma0sq, i, indices[i - 1], indices[i], std::format("./matrix_{}.csv"sv, i-1)));
}
for (auto& t : threads)
{
t.join();
}
auto end = std::chrono::system_clock::now();
auto elapsed_seconds = std::chrono::duration<double>(end - start);
std::cout << std::format("total elapsed time: {}"sv, elapsed_seconds);
return 0;
}
Note: I used 6 digits of precision here, as that is the default for std::ofstream. More digits mean more writing time to disk and lower performance.
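For comparison, a tiny self-contained example of what changing that precision spec means (the value is hypothetical; {:.17g} round-trips a double exactly, but nearly doubles the bytes written per value):
#include <format>
#include <iostream>
#include <iterator>
#include <string>

int main() {
    std::string buffer;
    double kij = 0.12345678901234567; // hypothetical matrix entry
    // 6 significant digits (as used above) vs. 17 (exact double round-trip):
    std::format_to(std::back_inserter(buffer), "{:.6g} vs {:.17g}\n", kij, kij);
    std::cout << buffer; // prints: 0.123457 vs 0.12345678901234567
}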

Matrix inversion slower using threads

I made a function that computes the inverse of a matrix, and then a multithreaded version, since I have to invert arrays larger than 2000 x 2000.
A 1000x1000 array unthreaded takes 2.5 seconds (on an i5-4460, 4 cores, 2.9 GHz),
and multithreaded it takes 7.25 seconds.
I placed the threads in the part where most of the time is consumed. What is wrong?
Is it due to vectors being used instead of 2-dimensional arrays?
This is the minimum code to test both versions:
#include<iostream>
#include <vector>
#include <stdlib.h>
#include <time.h>
#include <chrono>
#include <thread>
const int NUCLEOS = 8;
#ifdef __linux__
#include <unistd.h> //usleep()
typedef std::chrono::system_clock t_clock; //try to use high_resolution_clock on new linux x64 computer!
#else
typedef std::chrono::high_resolution_clock t_clock;
#pragma warning(disable:4996)
#endif
using namespace std;
std::chrono::time_point<t_clock> start_time, stop_time = start_time;
char null_char = '\0';
void timer(char *title = 0, int data_size = 1)
{
stop_time = t_clock::now();
double us = (double)chrono::duration_cast<chrono::microseconds>(stop_time - start_time).count();
if (title) printf("%s time = %7lgms = %7lg MOPs\n", title, (double)us*1e-3, (double)data_size / us);
start_time = t_clock::now();
}
//makes columns 0
void colum_zero(vector< vector<double> > &x, vector< vector<double> > &y, int pos0, int pos1,int dim, int ord);
//returns inverse of x, x is not modified, not threaded
vector< vector<double> > inverse(vector< vector<double> > x)
{
if (x.size() != x[0].size())
{
cout << "ERROR on inverse() not square array" << endl; getchar(); return{};//returns a null
}
size_t dim = x.size();
int i, j, ord;
vector< vector<double> > y(dim,vector<double>(dim,0));//initializes output = 0
//init_2Dvector(y, dim, dim);
//1. Unity array y:
for (i = 0; i < dim; i++)
{
y[i][i] = 1.0;
}
double diagon, coef;
double *ptrx, *ptry, *ptrx2, *ptry2;
for (ord = 0; ord<dim; ord++)
{
//2. Make the diagonal of x = 1
int i2;
if (fabs(x[ord][ord])<1e-15) //If that element is 0, a line that contains a non zero is added
{
for (i2 = ord + 1; i2<dim; i2++)
{
if (fabs(x[i2][ord])>1e-15) break;
}
if (i2 >= dim)
return{};//error, returns null
for (i = 0; i<dim; i++)//a row with a non-zero entry in that column is added
{
x[ord][i] += x[i2][i];
y[ord][i] += y[i2][i];
}
}
diagon = 1.0/x[ord][ord];
ptry = &y[ord][0];
ptrx = &x[ord][0];
for (i = 0; i < dim; i++)
{
*ptry++ *= diagon;
*ptrx++ *= diagon;
}
//uses the same function but not threaded:
colum_zero(x,y,0,dim,dim,ord);
}//end ord
return y;
}
//threaded version
vector< vector<double> > inverse_th(vector< vector<double> > x)
{
if (x.size() != x[0].size())
{
cout << "ERROR on inverse() not square array" << endl; getchar(); return{};//returns a null
}
int dim = (int) x.size();
int i, ord;
vector< vector<double> > y(dim, vector<double>(dim, 0));//initializes output = 0
//init_2Dvector(y, dim, dim);
//1. Unity array y:
for (i = 0; i < dim; i++)
{
y[i][i] = 1.0;
}
std::thread tarea[NUCLEOS];
double diagon;
double *ptrx, *ptry;// , *ptrx2, *ptry2;
for (ord = 0; ord<dim; ord++)
{
//2. Make the diagonal of x = 1
int i2;
if (fabs(x[ord][ord])<1e-15) //If a diagonal element is 0, a row with a non-zero entry in that column is added to it
{
for (i2 = ord + 1; i2<dim; i2++)
{
if (fabs(x[i2][ord])>1e-15) break;
}
if (i2 >= dim)
return{};//error, returns null
for (i = 0; i<dim; i++)//a row with a non-zero entry is added so the pivot becomes non-zero, avoiding a later division by 0
{
x[ord][i] += x[i2][i];
y[ord][i] += y[i2][i];
}
}
diagon = 1.0 / x[ord][ord];
ptry = &y[ord][0];
ptrx = &x[ord][0];
for (i = 0; i < dim; i++)
{
*ptry++ *= diagon;
*ptrx++ *= diagon;
}
int pos0 = 0, N1 = dim;//initial array position
if ((N1<1) || (N1>5000))
{
cout << "It is detected out than 1-5000 simulations points=" << N1 << " ABORT or press enter to continue" << endl; getchar();
}
//cout << "Initiation of " << NUCLEOS << " threads" << endl;
for (int thread = 0; thread<NUCLEOS; thread++)
{
int pos1 = (int)((thread + 1)*N1 / NUCLEOS);//next position
tarea[thread] = std::thread(colum_zero, std::ref(x), std::ref(y), pos0, pos1, dim, ord);//watch out, coil current=1!
pos0 = pos1;//next thread will work at next point
}
for (int thread = 0; thread<NUCLEOS; thread++)
{
tarea[thread].join();
//cout << "Thread num: " << thread << " end\n";
}
}//end ord
return y;
}
//makes columns 0
void colum_zero(vector< vector<double> > &x, vector< vector<double> > &y, int pos0, int pos1,int dim, int ord)
{
double coef;
double *ptrx, *ptry, *ptrx2, *ptry2;
//Make column ord zero, except for the diagonal element:
for (int i = pos0; i<pos1; i++)//Begin to end for every thread
{
if (i == ord) continue;
coef = x[i][ord];//element to make 0
if (fabs(coef)<1e-15) continue; //If already zero, it is avoided
ptry = &y[i][0];
ptry2 = &y[ord][0];
ptrx = &x[i][0];
ptrx2 = &x[ord][0];
for (int j = 0; j < dim; j++)
{
*ptry++ = *ptry - coef * (*ptry2++);//1st matrix
*ptrx++ = *ptrx - coef * (*ptrx2++);//2nd matrix
}
}
}
void test_6_inverse(int dim)
{
vector< vector<double> > vec1(dim, vector<double>(dim));
for (int i=0;i<dim;i++)
for (int j = 0; j < dim; j++)
{
vec1[i][j] = (-1.0 + 2.0*rand() / RAND_MAX) * 10000;
}
vector< vector<double> > vec2,vec3;
double ini, end;
ini = (double)clock();
vec2 = inverse(vec1);
end = (double)clock();
cout << "=== Time inverse unthreaded=" << (end - ini) / CLOCKS_PER_SEC << endl;
ini=end;
vec3 = inverse_th(vec1);
end = (double)clock();
cout << "=== Time inverse threaded=" << (end - ini) / CLOCKS_PER_SEC << endl;
cout<<vec2[2][2]<<" "<<vec3[2][2]<<endl;//to force the software to actually compute the inverses
cout << endl;
}
int main()
{
test_6_inverse(1000);
cout << endl << "=== END ===" << endl; getchar();
return 1;
}
After looking deeper into the code of the colum_zero() function, I have seen that one thread rewrites data that is still to be used by other threads, so the threads are not INDEPENDENT of each other. Fortunately the compiler detects this and avoids it.
Conclusions:
It is not recommended to multithread the Gauss-Jordan method by itself in this way
If somebody finds that a multithreaded version is slower even though the work is spread correctly across the threads, perhaps it is because the results of one thread are used by another
The main function inverse() works and can be used by other programmers, so this question should not be deleted
Unanswered question:
What is a matrix inversion method that could be spread across many independent threads, suitable for a GPU?

Using Eigen class to sum certain numbers in a vector

I am new to C++ and I am using the Eigen library. I was wondering if there was a way to sum certain elements in a vector. For example, say I have a vector that is a 100 by 1 and I just want to sum the first 10 elements. Is there a way of doing that using the Eigen library?
What I am trying to do is this: say I have a vector that is 1000 by 1 and I want to take the mean of the first 10 elements, then the next 10 elements, and so on and store that in some vector. Hence I will have a vector of size 100 of the averages. Any thoughts or suggestions are greatly appreciated.
Here are the beginning steps I have in my code. I have an S_temp4 vector that is 1000 by 1. Now I initialize a new vector S_A that I want to hold the means. Here is my messy, sloppy code so far (note that my question resides in the crudeMonteCarlo function):
#include <iostream>
#include <cmath>
#include <math.h>
#include <Eigen/Dense>
#include <Eigen/Geometry>
#include <random>
#include <time.h>
using namespace Eigen;
using namespace std;
void crudeMonteCarlo(int N,double K, double r, double S0, double sigma, double T, int n);
VectorXd time_vector(double min, double max, int n);
VectorXd call_payoff(VectorXd S, double K);
int main(){
int N = 100;
double K = 100;
double r = 0.2;
double S0 = 100;
double sigma = 0.4;
double T = 0.1;
int n = 10;
crudeMonteCarlo(N,K,r,S0,sigma,T,n);
return 0;
}
VectorXd time_vector(double min, double max, int n){
VectorXd m(n + 1);
double delta = (max-min)/n;
for(int i = 0; i <= n; i++){
m(i) = min + i*delta;
}
return m;
}
MatrixXd generateGaussianNoise(int M, int N){
MatrixXd Z(M,N);
static random_device rd;
static mt19937 e2(time(0));
normal_distribution<double> dist(0.0, 1.0);
for(int i = 0; i < M; i++){
for(int j = 0; j < N; j++){
Z(i,j) = dist(e2);
}
}
return Z;
}
VectorXd call_payoff(VectorXd S, double K){
VectorXd C(S.size());
for(int i = 0; i < S.size(); i++){
if(S(i) - K > 0){
C(i) = S(i) - K;
}else{
C(i) = 0.0;
}
}
return C;
}
void crudeMonteCarlo(int N,double K, double r, double S0, double sigma, double T, int n){
// Create time vector
VectorXd tt = time_vector(0.0,T,n);
VectorXd t(n);
double dt = T/n;
for(int i = 0; i < n; i++){
t(i) = tt(i+1);
}
// Generate standard normal Z matrix
//MatrixXd Z = generateGaussianNoise(N,n);
// Generate the log normal stock process N times to get S_A for crude Monte Carlo
MatrixXd SS(N,n+1);
MatrixXd Z = generateGaussianNoise(N,n);
for(int i = 0; i < N; i++){
SS(i,0) = S0;
for(int j = 1; j <= n; j++){
SS(i,j) = SS(i,j-1)*exp((double) (r - pow(sigma,2.0))*dt + sigma*sqrt(dt)*(double)Z(i,j-1));
}
}
// This long bit of code gives me my S_A.....
Map<RowVectorXd> S_temp1(SS.data(), SS.size());
VectorXd S_temp2(S_temp1.size());
for(int i = 0; i < S_temp2.size(); i++){
S_temp2(i) = S_temp1(i);
}
VectorXd S_temp3(S_temp2.size() - N);
int count = 0;
for(int i = N; i < S_temp2.size(); i++){
S_temp3(count) = S_temp2(i);
count++;
}
VectorXd S_temp4(S_temp3.size());
for(int i = 0; i < S_temp4.size(); i++){
S_temp4(i) = S_temp3(i);
}
VectorXd S_A(N);
S_A(0) = (S_temp4(0) + S_temp4(1) + S_temp4(2) + S_temp4(3) + S_temp4(4) + S_temp4(5) + S_temp4(6) + S_temp4(7) + S_temp4(8) + S_temp4(9))/(n);
S_A(1) = (S_temp4(10) + S_temp4(11) + S_temp4(12) + S_temp4(13) + S_temp4(14) + S_temp4(15) + S_temp4(16) + S_temp4(17) + S_temp4(18) + S_temp4(19))/(n);
int count1 = 0;
for(int i = 0; i < S_temp4.size(); i++){
S_A(count1) = // ??? this is where I am stuck: how do I average each group of 10?
}
// Calculate payoff of Asian option
//VectorXd call_fun = call_payoff(S_A,K);
}
This question includes a lot of code, which makes it hard to understand the question you're trying to ask. Consider including only the code specific to your question.
In any case, you can use Eigen directly to do all of these things quite simply. In Eigen, Vectors are just matrices with 1 column, so all of the reasoning here is directly applicable to what you've written.
const Eigen::Matrix<double, 100, 1> v = Eigen::Matrix<double, 100, 1>::Random();
const int num_rows = 10;
const int num_cols = 1;
const int starting_row = 0;
const int starting_col = 0;
const double sum_of_first_ten = v.block(starting_row, starting_col, num_rows, num_cols).sum();
const double mean_of_first_ten = sum_of_first_ten / num_rows;
In summary: You can use .block to get a block object, .sum() to sum that block, and then conventional division to get the mean.
You can reshape the input using Map and then do all sub-summations at once without any loop:
VectorXd A(1000); // input
Map<MatrixXd> B(A.data(), 10, A.size()/10); // reshaped version, no copy
VectorXd res = B.colwise().mean(); // partial reduction, you can also use .sum(), .minCoeff(), etc.
The Eigen documentation at https://eigen.tuxfamily.org/dox/group__TutorialBlockOperations.html says an Eigen block is a rectangular part of a matrix or array accessed by matrix.block(i,j,p,q) where i and j are the starting values (eg 0 and 0) and p and q are the block size (eg 10 and 1). Presumably you would then iterate i in steps of 10, and use std::accumulate or perhaps an explicit summation to find the mean of matrix.block(i,0,10,1).
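For instance, a minimal sketch of that looping approach (groupMeans is a hypothetical helper; std::accumulate over each block's entries would work just as well as Eigen's mean()):
#include <Eigen/Dense>
using Eigen::VectorXd;

// Mean of each consecutive group of 10 entries, via block():
VectorXd groupMeans(const VectorXd& v) {
    const int group = 10;
    VectorXd means(v.size() / group); // e.g. 1000 entries -> 100 means
    for (int i = 0; i + group <= v.size(); i += group) {
        // block(i, 0, group, 1) selects rows i..i+9 of the single column
        means(i / group) = v.block(i, 0, group, 1).mean();
    }
    return means;
}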

Unexpected deviation from the exact value in Monte Carlo N-dimensional integration in C++

I have written code in C++ for Monte Carlo integration, which worked fine with my other functions when I used 2-dimensional integration. I generalized the code for N-dimensional integration; in this particular case, I am taking n = 10.
I am trying to integrate a simple function f = x1+x2+x3+x4+x5+....+x10, where x1....x10 fall within the limits [-5.0, 5.0]. I see a large deviation in the result, when I know the exact result should be 0. I will greatly appreciate it if anyone kindly takes a look at my code and figures out where it breaks down. I am attaching the code as follows:
#include <iostream>
#include <fstream>
#include <iomanip>
#include <cmath>
#include <cstdlib>
#include <ctime>
using namespace std;
//define multivariate function F(x1, x2, ...xk)
double f(double x[], int n)
{
double y;
int j;
y = 0.0;
for (j = 0; j < n; j = j+1)
{
y = y + x[j];
}
return y;
}
/*
* Function f(x1, x2, ... xk)
*/
//define function for Monte Carlo Multidimensional integration
double int_mcnd(double(*fn)(double[],int),double a[], double b[], int n, int m)
{
double r, x[n], p;
int i, j;
// initial seed value (use system time)
srand(time(NULL));
r = 0.0;
p = 1.0;
// step 1: calculate the common factor p
for (j = 0; j < n; j = j+1)
{
p = p*(b[j]-a[j]);
}
// step 2: integration
for (i = 1; i <= m; i=i+1)
{
// calculate random x[] points
for (j = 0; j < n; j = j+1)
{
x[j] = a[j] + static_cast <double> (rand()) /( static_cast <double> (RAND_MAX/(b[j]-a[j])));
}
r = r + fn(x,n);
}
r = r*p/m;
return r;
}
double f(double[], int);
double int_mcnd(double(*)(double[],int), double[], double[], int, int);
int main(int argc, char **argv)
{
/* define how many integrals */
const int n = 10;
double b[n] = {5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0,5.0};
double a[n] = {-5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0,-5.0};
double result;
int i, m;
int N = 20;
cout.precision(6);
cout.setf(ios::fixed | ios::showpoint);
// current time in seconds (begin calculations)
time_t seconds_i;
seconds_i = time (NULL);
m = 2; // initial number of intervals
// convert command-line input to N = number of points
//N = atoi( argv[1] );
for (i=0; i <=N; i=i+1)
{
result = int_mcnd(f, a, b, n, m);
cout << setw(30) << m << setw(30) << result <<endl;
m = m*2;
}
// current time in seconds (end of calculations)
time_t seconds_f;
seconds_f = time (NULL);
cout << endl << "total elapsed time = " << seconds_f - seconds_i << " seconds" << endl << endl;
return 0;
}
The output I am getting is like these:
4 -41046426010.339691
8 -14557222958.913620
16 25601187040.145161
32 29498213233.367203
64 -2422980618.248888
128 -13400105151.286720
256 -11237568021.855265
512 -5950177645.396674
1024 -4726707072.013641
2048 -1240029475.829825
4096 1890210492.995555
8192 573067706.448856
16384 356227781.143659
32768 -343198855.224271
65536 171823353.999405
131072 -143383711.461758
262144 -197599063.607231
524288 -59641584.846697
1048576 10130826.266767
2097152 100880200.681037
total elapsed time = 1 seconds
which is nothing close to my expected output of zero. Please help me fix the code; thanks in advance.

Compute pi value using the Monte Carlo method with multithreading

I am trying to find the value of PI using the Monte Carlo method, using parallel C code. I have written the serial code and it works fine, but the parallel code gives me wrong values of pi, sometimes 0 or negative values.
My code:
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define NUM_THREADS 4 //number of threads
#define TOT_COUNT 10000055 //total number of iterations
void *doCalcs(void *threadid)
{
long longTid;
longTid = (long)threadid;
int tid = (int)longTid; //obtain the integer value of thread id
//using malloc for the return variable in order make
//sure that it is not destroyed once the thread call is finished
float *in_count = (float *)malloc(sizeof(float));
*in_count=0;
unsigned int rand_state = rand();
//get the total number of iterations for a thread
float tot_iterations= TOT_COUNT/NUM_THREADS;
int counter=0;
//calculation
for(counter=0;counter<tot_iterations;counter++){
//float x = (double)random()/RAND_MAX;
//float y = (double)random()/RAND_MAX;
//float result = sqrt((x*x) + (y*y));
double x = rand_r(&rand_state) / ((double)RAND_MAX + 1) * 2.0 - 1.0;
double y = rand_r(&rand_state) / ((double)RAND_MAX + 1) * 2.0 - 1.0;
float result = sqrt((x*x) + (y*y));
if(result<1){
*in_count+=1; //check if the generated value is inside a unit circle
}
}
//get the remaining iterations calculated by thread 0
if(tid==0){
float remainder = TOT_COUNT%NUM_THREADS;
for(counter=0;counter<remainder;counter++){
float x = (double)random()/RAND_MAX;
float y = (double)random()/RAND_MAX;
float result = sqrt((x*x) + (y*y));
if(result<1){
*in_count+=1; //check if the generated value is inside a unit circle
}
}
}
}
int main(int argc, char *argv[])
{
pthread_t threads[NUM_THREADS];
int rc;
long t;
void *status;
float tot_in=0;
for(t=0;t<NUM_THREADS;t++){
rc = pthread_create(&threads[t], NULL, doCalcs, (void *)t);
if (rc){
printf("ERROR; return code from pthread_create() is %d\n", rc);
exit(-1);
}
}
//join the threads
for(t=0;t<NUM_THREADS;t++){
pthread_join(threads[t], &status);
//printf("Return from thread %ld is : %f\n",t, *(float*)status);
tot_in+=*(float*)status; //keep track of the total in count
}
printf("Value for PI is %f \n",1, 4*(tot_in/TOT_COUNT));
/* Last thing that main() should do */
pthread_exit(NULL);
}
This is a solution using async and future, as suggested by @vladon.
#include <iostream>
#include <vector>
#include <random>
#include <future>
using namespace std;
long random_circle_sampling(long n_samples){
std::random_device rd; //Will be used to obtain a seed for the random number engine
std::mt19937 gen(rd()); //Standard mersenne_twister_engine seeded with rd()
std::uniform_real_distribution<> dis(0.0, 1.0);
long points_inside = 0;
for(long i = 0; i < n_samples; ++i){
double x = dis(gen);
double y = dis(gen);
if(x*x + y*y <= 1.0){
++points_inside;
}
}
return points_inside;
}
double approximate_pi(long tot_samples, int n_threads){
long samples_per_thread = tot_samples / n_threads;
// Used to store the future results
vector<future<long>> futures;
for(int t = 0; t < n_threads; ++t){
// Start a new asynchronous task
futures.emplace_back(async(launch::async, random_circle_sampling, samples_per_thread));
}
long tot_points_inside = 0;
for(future<long>& f : futures){
// Wait for the result to be ready
tot_points_inside += f.get();
}
double pi = 4.0 * (double) tot_points_inside / (double) tot_samples;
return pi;
}
int main() {
cout.precision(32);
long tot_samples = 1e6;
int n_threads = 8;
double pi = 3.14159265358979323846;
double approx_pi = approximate_pi(tot_samples, n_threads);
double abs_diff = abs(pi - approx_pi);
cout << "pi\t\t" <<pi << endl;
cout << "approx_pi\t" <<approx_pi << endl;
cout << "abs_diff\t" <<abs_diff << endl;
return 0;
}
You can simply run it with:
$ g++ -std=c++11 -O3 pi.cpp -o pi && time ./pi
pi 3.1415926535897931159979634685442
approx_pi 3.1427999999999998159694314381341
abs_diff 0.0012073464102066999714679695898667
./pi 0.04s user 0.00s system 27% cpu 0.163 total
Your code is not C++; it's bad, very bad plain old C.
This is C++:
#include <cmath>
#include <iostream>
#include <numeric>
#include <random>
#include <thread>
#include <vector>
constexpr auto num_threads = 4; //number of threads
constexpr auto total_count = 10000055; //total number of iterations
void doCalcs(int total_iterations, int & in_count_result)
{
auto seed = std::random_device{}();
auto gen = std::mt19937{ seed };
auto dist = std::uniform_real_distribution<>{0, 1};
auto in_count{ 0 };
//calculation
for (auto counter = 0; counter < total_iterations; ++counter) {
auto x = dist(gen);
auto y = dist(gen);
auto result = std::sqrt(std::pow(x, 2) + std::pow(y, 2));
if (result < 1) {
++in_count; //check if the generated value is inside a unit circle
}
}
in_count_result = in_count;
}
int main()
{
std::vector<std::thread> threads;
threads.reserve(num_threads); // reserve only: emplace_back below creates the actual threads
std::vector<int> in_count(num_threads);
for (size_t i = 0; i < num_threads; ++i) {
int total_iterations = total_count / num_threads;
if (i == 0) {
total_iterations += total_count % num_threads; // get the remaining iterations calculated by thread 0
}
threads.emplace_back(doCalcs, total_iterations, std::ref(in_count[i]));
}
for (auto & thread : threads) {
if (thread.joinable()) {
thread.join();
}
}
double pi_value = 4.0 * static_cast<double>(std::accumulate(in_count.begin(), in_count.end(), 0)) / static_cast<double>(total_count);
std::cout << "Value of PI is: " << pi_value << std::endl;
}
P.S. And even this is not that good; read about futures, promises, and std::async.