This is my code that should compute Pi with the Monte Carlo method. It takes two inputs: thread_count - the number of processor threads, and n - the number of randomly generated points.
Here is my code:
#include <iostream>
#include <cstdlib> // atoi(), exit(), rand_r()
#include <omp.h>
using namespace std;
int main (int argc, char* argv[]) {
/* input validation: thread_count - number of threads, n - number of points */
if (argc != 3) {
cout << "Co Ty piszesz!. Ma być: ./pi <thread_count> <n>" << endl;
exit(-1);
}
/* convert the arguments to integers */
int thread_count = atoi(argv[1]);
long n = atol(argv[2]); // atol, so large point counts are not truncated to int
/* validate the values */
if (thread_count <= 0 || n <= 0) {
cout << "Wrong values! Both must be greater than 0" << endl;
exit(-1);
}
unsigned ziarno;
double x, y;
long int Ustrzelone = 0;
double Start = omp_get_wtime();
#pragma omp parallel default(none) private(x,y, ziarno) firstprivate(n) reduction(+:Ustrzelone) num_threads(thread_count)
{
ziarno = 25231 + 16*omp_get_thread_num();
#pragma omp for schedule(dynamic)
for(long i = 0; i < n; i++) { // i < n, so exactly n points are generated
x = (double) rand_r(&ziarno)/RAND_MAX * 2 - 1;
y = (double) rand_r(&ziarno)/RAND_MAX * 2 - 1;
if ((x*x) + (y*y) <= 1.0) {
Ustrzelone += 1;
}
}
}
double Stop = omp_get_wtime();
double czas_obliczen = 1000 * (Stop - Start);
/* set the number of decimal digits and print the result */
cout.precision(15);
//double pi = (double) 4*Ustrzelone/n;
//cout << "Pi wynosi " << pi << endl;
cout << czas_obliczen << endl;
return 0;
}
When compiling I get an error like this:
[Error] 'rand_r' was not declared in this scope
I do not have much time left for this, so I hope you can point me toward a solution.
Thanks in advance.
Well, since you have using namespace std; (which is not recommended), perhaps try ::rand_r to help the compiler find the right implementation...
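If the real problem is that your toolchain does not ship rand_r at all (it is a POSIX function; MinGW on Windows, for example, does not provide it, which would match the error above), a portable alternative is to give each thread its own C++11 generator. A minimal sketch of the parallel region rewritten that way, keeping the original variable names and seeding scheme (this assumes C++11 is available):

#include <random> // at the top of the file

#pragma omp parallel reduction(+:Ustrzelone) num_threads(thread_count)
{
    // each thread owns its engine, so no shared state is touched
    std::mt19937 gen(25231 + 16 * omp_get_thread_num());
    std::uniform_real_distribution<double> dist(-1.0, 1.0);
    #pragma omp for schedule(dynamic)
    for (long i = 0; i < n; i++) {
        double x = dist(gen);
        double y = dist(gen);
        if (x * x + y * y <= 1.0)
            Ustrzelone += 1;
    }
}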
Related
When trying to compile OpenACC code with GCC-9.3.0 (g++) configured with --enable-languages=c,c++,lto --disable-multilib, the following code does not use multiple cores, whereas the same code compiled with the pgc++ compiler does use multiple cores.
g++ compilation: g++ -lgomp -Ofast -o jsolve -fopenacc jsolvec.cpp
pgc++ compilation: pgc++ -o jsolvec.exe jsolvec.cpp -fast -Minfo=opt -ta=multicore
Code from OpenACC Tutorial1/solver https://github.com/OpenACCuserGroup/openacc-users-group.git:
// Jacobi iterative method for solving a system of linear equations
// This is guaranteed to converge if the matrix is diagonally dominant,
// so we artificially force the matrix to be diagonally dominant.
// See https://en.wikipedia.org/wiki/Jacobi_method
//
// We solve for vector x in Ax = b
// Rewrite the matrix A as a
// lower triangular (L),
// upper triangular (U),
// and diagonal matrix (D).
//
// Ax = (L + D + U)x = b
//
// rearrange to get: Dx = b - (L+U)x --> x = (b-(L+U)x)/D
//
// we can do this iteratively: x_new = (b-(L+U)x_old)/D
// build with TYPE=double (default) or TYPE=float
// build with TOLERANCE=0.001 (default) or TOLERANCE= any other value
// three arguments:
// vector size
// maximum iteration count
// frequency of printing the residual (every n-th iteration)
#include <cmath>
#include <omp.h>
#include <cstdlib>
#include <iostream>
#include <iomanip>
using std::cout;
#ifndef TYPE
#define TYPE double
#endif
#define TOLERANCE 0.001
void
init_simple_diag_dom(int nsize, TYPE* A)
{
int i, j;
// In a diagonally-dominant matrix, the diagonal element
// is greater than the sum of the other elements in the row.
// Scale the matrix so the sum of the row elements is close to one.
for (i = 0; i < nsize; ++i) {
TYPE sum;
sum = (TYPE)0;
for (j = 0; j < nsize; ++j) {
TYPE x;
x = (rand() % 23) / (TYPE)1000;
A[i*nsize + j] = x;
sum += x;
}
// Fill diagonal element with the sum
A[i*nsize + i] += sum;
// scale the row so the final matrix is almost an identity matrix
for (j = 0; j < nsize; j++)
A[i*nsize + j] /= sum;
}
} // init_simple_diag_dom
int
main(int argc, char **argv)
{
int nsize; // A[nsize][nsize]
int i, j, iters, max_iters, riter;
double start_time, elapsed_time;
TYPE residual, err, chksum;
TYPE *A, *b, *x1, *x2, *xnew, *xold, *xtmp;
// set matrix dimensions and allocate memory for matrices
nsize = 0;
if (argc > 1)
nsize = atoi(argv[1]);
if (nsize <= 0)
nsize = 1000;
max_iters = 0;
if (argc > 2)
max_iters = atoi(argv[2]);
if (max_iters <= 0)
max_iters = 5000;
riter = 0;
if (argc > 3)
riter = atoi(argv[3]);
if (riter <= 0)
riter = 200;
cout << "nsize = " << nsize << ", max_iters = " << max_iters << "\n";
A = new TYPE[nsize*nsize];
b = new TYPE[nsize];
x1 = new TYPE[nsize];
x2 = new TYPE[nsize];
// generate a diagonally dominant matrix
init_simple_diag_dom(nsize, A);
// zero the x vectors, random values to the b vector
for (i = 0; i < nsize; i++) {
x1[i] = (TYPE)0.0;
x2[i] = (TYPE)0.0;
b[i] = (TYPE)(rand() % 51) / 100.0;
}
start_time = omp_get_wtime();
//
// jacobi iterative solver
//
residual = TOLERANCE + 1.0;
iters = 0;
xnew = x1; // swap these pointers in each iteration
xold = x2;
while ((residual > TOLERANCE) && (iters < max_iters)) {
++iters;
// swap input and output vectors
xtmp = xnew;
xnew = xold;
xold = xtmp;
#pragma acc parallel loop
for (i = 0; i < nsize; ++i) {
TYPE rsum = (TYPE)0;
#pragma acc loop reduction(+:rsum)
for (j = 0; j < nsize; ++j) {
if (i != j) rsum += A[i*nsize + j] * xold[j];
}
xnew[i] = (b[i] - rsum) / A[i*nsize + i];
}
//
// test convergence, sqrt(sum((xnew-xold)**2))
//
residual = 0.0;
#pragma acc parallel loop reduction(+:residual)
for (i = 0; i < nsize; i++) {
TYPE dif;
dif = xnew[i] - xold[i];
residual += dif * dif;
}
residual = sqrt((double)residual);
if (iters % riter == 0 ) cout << "Iteration " << iters << ", residual is " << residual << "\n";
}
elapsed_time = omp_get_wtime() - start_time;
cout << "\nConverged after " << iters << " iterations and " << elapsed_time << " seconds, residual is " << residual << "\n";
//
// test answer by multiplying my computed value of x by
// the input A matrix and comparing the result with the
// input b vector.
//
err = (TYPE)0.0;
chksum = (TYPE)0.0;
for (i = 0; i < nsize; i++) {
TYPE tmp;
xold[i] = (TYPE)0.0;
for (j = 0; j < nsize; j++)
xold[i] += A[i*nsize + j] * xnew[j];
tmp = xold[i] - b[i];
chksum += xnew[i];
err += tmp * tmp;
}
err = sqrt((double)err);
cout << "Solution error is " << err << "\n";
if (err > TOLERANCE)
cout << "****** Final Solution Out of Tolerance ******\n" << err << " > " << TOLERANCE << "\n";
delete[] A; // allocated with new[], so delete[] is required
delete[] b;
delete[] x1;
delete[] x2;
return 0;
}
It's not yet supported in GCC to use OpenACC to schedule parallel loops onto multicore CPUs. Using OpenMP works for that, of course, and you can have code with mixed OpenACC (for GPU offloading, as already present in your code) and OpenMP directives (for CPU parallelization, not yet present in your code), so that the respective mechanism will be used depending on whether compiling with -fopenacc vs. -fopenmp.
As PGI does, this certainly can be supported in GCC, and we will be able to implement it, but that work has not yet been scheduled or funded for GCC.
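A minimal sketch of that mixing for the first solver loop above, guarding on the _OPENACC predefined macro so each build picks its own directive (build with -fopenacc for offloading, or with -fopenmp for CPU parallelization):

#ifdef _OPENACC
#pragma acc parallel loop
#else
#pragma omp parallel for private(j)
#endif
    for (i = 0; i < nsize; ++i) {
      TYPE rsum = (TYPE)0;
#ifdef _OPENACC
#pragma acc loop reduction(+:rsum)
#endif
      for (j = 0; j < nsize; ++j) {
        if (i != j) rsum += A[i*nsize + j] * xold[j];
      }
      xnew[i] = (b[i] - rsum) / A[i*nsize + i];
    }

The private(j) clause matters in the OpenMP build because j is declared at function scope in the original code and would otherwise be shared across threads.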
Something does not work as expected; please give me a piece of advice.
I am trying to compute the number Pi with many decimals. I used the atan series with success,
but I am trying a faster method, and Chudnovsky seems to be one solution.
After some tests, however, it appears that something went wrong: only
a few decimals are exact.
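For reference, the series being implemented here is the standard Chudnovsky formula (its constants 13591409, 545140134, 426880 and sqrt(10005) all appear in the code below):

\frac{1}{\pi} = 12 \sum_{q=0}^{\infty} \frac{(-1)^q\,(6q)!\,(545140134\,q + 13591409)}{(3q)!\,(q!)^3\,640320^{3q+3/2}}

so the final result should be PI = 426880 * sqrt(10005) / S, where S is the accumulated sum.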
#include "bigInt.h"
#define DECIMALS 200
Bint ONE_() {
string one("1");
for (int i = 1; i <= DECIMALS; i++)
one = one + "0";
return Bint(one.c_str());
}
Bint FOUR_() {
string four("4");
for (int i = 1; i <= DECIMALS; i++)
four = four + "0";
return Bint(four.c_str());
}
Bint EIGHT_() {
string eight("8");
for (int i = 1; i <= DECIMALS; i++)
eight = eight + "0";
return Bint(eight.c_str());
}
static Bint ONE, FOUR, EIGHT;
class Init {
public:
Init() {
ONE = ONE_();
FOUR = FOUR_();
EIGHT = EIGHT_();
}
};
..............
Bint Chudnovsky() {
Bint SQR("10002499687578100594479218787636");
Bint C("426880");
Bint L("13591409");
Bint LS("545140134");
Bint X("1");
Bint M("1");
Bint B("-262537412640768000");
Bint PI(L);
Bint K("6");
int i = 1;
while(i < 100) {
M = M * (K * K * K - K * 16) / (i + 1) / (i + 1) / (i + 1);
i++; // moved out of the expression above: mixing i++ with other uses of i there has unspecified evaluation order with overloaded operators
L = L + LS;
X = X * B;
PI = PI + M * L / X;
K = K + 12;
}
PI = ONE / PI;
PI = C * SQR * PI;
return PI;
}
.....................
int main()
{
cout << "START ------------------------------" << endl;
auto start = std::chrono::high_resolution_clock::now();
Bint PI = Chudnovsky();
auto finish = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> elapsed = finish - start;
cout << "Elapsed time: " << elapsed.count() << " s" << endl;
cout << " Chudnovsky formula" << endl;
cout << endl << PI << endl << endl;
.................
}
START ------------------------------
Elapsed time: 21.3608 s
Chudnovsky formula
31415926535897342076684535915783681294558937929099183167837859930489914621802640182485862944746935361889264019646528185561923712250878477720742566131296615384026777503347886889431404794013630226633347235010074370488851025463587840
I made a function that computes the inverse of a matrix, and then a multithreaded version, since I have to invert arrays larger than 2000 x 2000.
A 1000x1000 array unthreaded takes 2.5 seconds (on an i5-4460, 4 cores, 2.9 GHz),
and multithreaded it takes 7.25 seconds.
I placed the threads in the part that consumes the most time. What is wrong?
Is it because vectors are used instead of two-dimensional arrays?
This is the minimum code to test both versions:
#include <iostream>
#include <vector>
#include <stdlib.h>
#include <stdio.h> // printf(), getchar()
#include <math.h>  // fabs()
#include <time.h>
#include <chrono>
#include <thread>
const int NUCLEOS = 8;
#ifdef __linux__
#include <unistd.h> //usleep()
typedef std::chrono::system_clock t_clock; //try to use high_resolution_clock on new linux x64 computer!
#else
typedef std::chrono::high_resolution_clock t_clock;
#pragma warning(disable:4996)
#endif
using namespace std;
std::chrono::time_point<t_clock> start_time, stop_time = start_time;
char null_char = '\0';
void timer(char *title = 0, int data_size = 1)
{
    stop_time = t_clock::now();
    double us = (double)chrono::duration_cast<chrono::microseconds>(stop_time - start_time).count();
    if (title)
        printf("%s time = %7lgms = %7lg MOPs\n", title, (double)us*1e-3, (double)data_size / us);
    start_time = t_clock::now();
}
//makes columns 0
void colum_zero(vector< vector<double> > &x, vector< vector<double> > &y, int pos0, int pos1,int dim, int ord);
//returns inverse of x, x is not modified, not threaded
vector< vector<double> > inverse(vector< vector<double> > x)
{
if (x.size() != x[0].size())
{
cout << "ERROR on inverse() not square array" << endl; getchar(); return{};//returns a null
}
size_t dim = x.size();
int i, j, ord;
vector< vector<double> > y(dim,vector<double>(dim,0));//initializes output = 0
//init_2Dvector(y, dim, dim);
//1. Unity array y:
for (i = 0; i < dim; i++)
{
y[i][i] = 1.0;
}
double diagon, coef;
double *ptrx, *ptry, *ptrx2, *ptry2;
for (ord = 0; ord<dim; ord++)
{
//2. Make the diagonal element of x equal to 1
int i2;
if (fabs(x[ord][ord])<1e-15) //If that element is 0, a row that contains a non-zero entry is added
{
for (i2 = ord + 1; i2<dim; i2++)
{
if (fabs(x[i2][ord])>1e-15) break;
}
if (i2 >= dim)
return{};//error, returns null
for (i = 0; i<dim; i++)//added a line without 0
{
x[ord][i] += x[i2][i];
y[ord][i] += y[i2][i];
}
}
diagon = 1.0/x[ord][ord];
ptry = &y[ord][0];
ptrx = &x[ord][0];
for (i = 0; i < dim; i++)
{
*ptry++ *= diagon;
*ptrx++ *= diagon;
}
//uses the same function but not threaded:
colum_zero(x,y,0,dim,dim,ord);
}//end ord
return y;
}
//threaded version
vector< vector<double> > inverse_th(vector< vector<double> > x)
{
if (x.size() != x[0].size())
{
cout << "ERROR on inverse() not square array" << endl; getchar(); return{};//returns a null
}
int dim = (int) x.size();
int i, ord;
vector< vector<double> > y(dim, vector<double>(dim, 0));//initializes output = 0
//init_2Dvector(y, dim, dim);
//1. Unity array y:
for (i = 0; i < dim; i++)
{
y[i][i] = 1.0;
}
std::thread tarea[NUCLEOS];
double diagon;
double *ptrx, *ptry;// , *ptrx2, *ptry2;
for (ord = 0; ord<dim; ord++)
{
//2. Make the diagonal element of x equal to 1
int i2;
if (fabs(x[ord][ord])<1e-15) //If the diagonal element is 0, a row with a non-zero entry in this column is added to it
{
for (i2 = ord + 1; i2<dim; i2++)
{
if (fabs(x[i2][ord])>1e-15) break;
}
if (i2 >= dim)
return{};//error, returns null
for (i = 0; i<dim; i++)//add a row with a non-zero entry in this column, to avoid a later division by 0
{
x[ord][i] += x[i2][i];
y[ord][i] += y[i2][i];
}
}
diagon = 1.0 / x[ord][ord];
ptry = &y[ord][0];
ptrx = &x[ord][0];
for (i = 0; i < dim; i++)
{
*ptry++ *= diagon;
*ptrx++ *= diagon;
}
int pos0 = 0, N1 = dim;//initial array position
if ((N1<1) || (N1>5000))
{
cout << "It is detected out than 1-5000 simulations points=" << N1 << " ABORT or press enter to continue" << endl; getchar();
}
//cout << "Initiation of " << NUCLEOS << " threads" << endl;
for (int thread = 0; thread<NUCLEOS; thread++)
{
int pos1 = (int)((thread + 1)*N1 / NUCLEOS);//next position
tarea[thread] = std::thread(colum_zero, std::ref(x), std::ref(y), pos0, pos1, dim, ord);//careful, coil current=1!
pos0 = pos1;//next thread will work at next point
}
for (int thread = 0; thread<NUCLEOS; thread++)
{
tarea[thread].join();
//cout << "Thread num: " << thread << " end\n";
}
}//end ord
return y;
}
//makes columns 0
void colum_zero(vector< vector<double> > &x, vector< vector<double> > &y, int pos0, int pos1,int dim, int ord)
{
double coef;
double *ptrx, *ptry, *ptrx2, *ptry2;
//Zero out column ord except the diagonal element:
for (int i = pos0; i<pos1; i++)//Begin to end for every thread
{
if (i == ord) continue;
coef = x[i][ord];//element to make 0
if (fabs(coef)<1e-15) continue; //If already zero, it is avoided
ptry = &y[i][0];
ptry2 = &y[ord][0];
ptrx = &x[i][0];
ptrx2 = &x[ord][0];
for (int j = 0; j < dim; j++)
{
ptry[j] -= coef * ptry2[j];//first matrix (indexing avoids the unsequenced *p++ = *p ... pattern, which is undefined before C++17)
ptrx[j] -= coef * ptrx2[j];//second matrix
}
}
}
void test_6_inverse(int dim)
{
vector< vector<double> > vec1(dim, vector<double>(dim));
for (int i=0;i<dim;i++)
for (int j = 0; j < dim; j++)
{
vec1[i][j] = (-1.0 + 2.0*rand() / RAND_MAX) * 10000;
}
vector< vector<double> > vec2,vec3;
double ini, end;
ini = (double)clock();
vec2 = inverse(vec1);
end = (double)clock();
cout << "=== Time inverse unthreaded=" << (end - ini) / CLOCKS_PER_SEC << endl;
ini=end;
vec3 = inverse_th(vec1);
end = (double)clock();
cout << "=== Time inverse threaded=" << (end - ini) / CLOCKS_PER_SEC << endl;
cout<<vec2[2][2]<<" "<<vec3[2][2]<<endl;//use one element of each result so the compiler cannot optimize the inversions away
cout << endl;
}
int main()
{
test_6_inverse(1000);
cout << endl << "=== END ===" << endl; getchar();
return 1;
}
After looking deeper into the code of the colum_zero() function I have seen that one thread rewrites data to be used by other threads, so the threads are not INDEPENDENT from each other. Fortunately the compiler detects it and avoids it.
Conclusions:
It is not recommended to use the Gauss-Jordan method alone for multithreading.
If somebody finds that the multithreaded version is slower even though the work is spread correctly over the threads, perhaps it is because one thread's results are used by another.
The main function inverse() works and can be used by other programmers, so this question should not be deleted.
Unanswered question:
What matrix inversion method could be spread over many independent threads, so that it can be used on a GPU?
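One observation on the code above, offered as a sketch rather than a full answer: inverse_th() creates and joins NUCLEOS threads for every pivot, i.e. dim times per inversion, and that creation/join overhead alone can exceed the per-pivot work. An OpenMP variant of the elimination step reuses one worker pool across all pivots; the loop below is an untested assumption about how the call to colum_zero() could be restructured, not a drop-in replacement:

//elimination step for pivot ord; OpenMP reuses its thread pool across pivots
#pragma omp parallel for
for (int i = 0; i < dim; i++)
{
    if (i == ord) continue; //the pivot row is only read here, never written
    double coef = x[i][ord]; //element to make 0
    if (fabs(coef) < 1e-15) continue;
    for (int j = 0; j < dim; j++)
    {
        y[i][j] -= coef * y[ord][j];
        x[i][j] -= coef * x[ord][j];
    }
}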
I am writing a Pi estimator for class. The point is to estimate Pi using OpenMP and to analyze the speedup provided by three separate schedules: static, dynamic, and guided. However, my total time skyrockets when I add threads: around 21 seconds for 1 thread, and roughly 145 seconds for 2 threads. I can't figure out why. Here is my code:
#include <stdio.h>
#include <time.h>
#include <ctime>
#include <cmath> // sqrt()
#include <omp.h>
#include <assert.h>
#include <cstdlib>
#include <string>
#include <iostream>
#include <stdlib.h>
using namespace std;
void computePi(int, int);
int main(int argc, char *argv[])
{
cout.precision(20);
omp_set_dynamic(0);
int i, p;
int n;
// loop {number of iterations} [number of threads]
if (argc > 2) // both argv[1] and argv[2] are read below
{
n = atoll(argv[1]);
p = atoi(argv[2]);
}
else
{
n = 10000000;
p = 8;
}
printf("Debug: dart throws = %d \n", n);
printf("Debug: number of requested threads = %d\n", p);
omp_set_num_threads(p);
double time = omp_get_wtime();
//dispArray(a,n);
computePi(n, p);
time = omp_get_wtime() - time;
printf("Total time = %f seconds \n ", time);
return 0;
}
void computePi(int n, int p) {
omp_set_num_threads(p);
int i;
int hits = 0;
srand(time(NULL));
double timeStatic = omp_get_wtime();
#pragma omp parallel for schedule(static) reduction(+:hits) // the loop variable i is automatically private; listing it in shared() is non-conforming
for (i = 0; i<n; i++)
{
float r, x, xdiff, y, ydiff;
x = (float)rand() / (float)RAND_MAX;
y = (float)rand() / (float)RAND_MAX;
if (y > .50)
{
ydiff = y - .50;
}
else
{
ydiff = .50 - y;
}
if (x > .50)
{
xdiff = x - .50;
}
else
{
xdiff = .50 - x;
}
xdiff *= xdiff;
ydiff *= ydiff;
r = sqrt(ydiff + xdiff);
if (r <= .50)
{
hits += 1;
}
}
timeStatic = omp_get_wtime() - timeStatic;
float percentage;
percentage = (float)hits / (float)n;
cout << "Static Loop" << endl;
cout << "Hit Percentage: " << percentage * 100 << "%" << endl;
cout << "Pi Estimation: " << percentage * 4 << endl;
cout << "Time Taken: " << timeStatic << endl << endl;
hits = 0;
double timeDynamic = omp_get_wtime();
#pragma omp parallel for schedule(dynamic) reduction(+:hits)
for (i = 0; i<n; i++)
{
float r, x, xdiff, y, ydiff;
x = (float)rand() / (float)RAND_MAX;
y = (float)rand() / (float)RAND_MAX;
if (y > .50)
{
ydiff = y - .50;
}
else
{
ydiff = .50 - y;
}
if (x > .50)
{
xdiff = x - .50;
}
else
{
xdiff = .50 - x;
}
xdiff *= xdiff;
ydiff *= ydiff;
r = sqrt(ydiff + xdiff);
if (r <= .50)
{
hits += 1;
}
}
timeDynamic = omp_get_wtime() - timeDynamic;
percentage = (float)hits / (float)n;
cout << "Dynamic Loop" << endl;
cout << "Hit Percentage: " << percentage * 100 << "%" << endl;
cout << "Pi Estimation: " << percentage * 4 << endl;
cout << "Time Taken: " << timeDynamic << endl << endl;
hits = 0;
double timeGuided = omp_get_wtime();
#pragma omp parallel for schedule(guided) reduction(+:hits)
for (i = 0; i<n; i++)
{
float r, x, xdiff, y, ydiff;
x = (float)rand() / (float)RAND_MAX;
y = (float)rand() / (float)RAND_MAX;
if (y > .50)
{
ydiff = y - .50;
}
else
{
ydiff = .50 - y;
}
if (x > .50)
{
xdiff = x - .50;
}
else
{
xdiff = .50 - x;
}
xdiff *= xdiff;
ydiff *= ydiff;
r = sqrt(ydiff + xdiff);
if (r <= .50)
{
hits += 1;
}
}
timeGuided = omp_get_wtime() - timeGuided;
percentage = (float)hits / (float)n;
cout << "Guided Loop" << endl;
cout << "Hit Percentage: " << percentage * 100 << "%" << endl;
cout << "Pi Estimation: " << percentage * 4 << endl;
cout << "Time Taken: " << timeGuided << endl << endl;
return;
}
Any help would be appreciated.
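For what it's worth, one well-known cause of exactly this kind of slowdown, offered as an assumption about your setup rather than a diagnosis: std::rand() keeps hidden global state, so concurrent calls from several threads either serialize on an internal lock or fight over the cache line holding that state, and adding threads then makes things slower. A per-thread generator avoids this; a minimal sketch of the static loop using POSIX rand_r (the seed-mixing constant is an arbitrary choice):

#pragma omp parallel reduction(+:hits)
{
    //each thread draws from its own state: no lock, no shared cache line
    unsigned seed = (unsigned)time(NULL) ^ (2654435761u * omp_get_thread_num());
    #pragma omp for schedule(static)
    for (int i = 0; i < n; i++)
    {
        float x = (float)rand_r(&seed) / (float)RAND_MAX;
        float y = (float)rand_r(&seed) / (float)RAND_MAX;
        float xdiff = x - .50f;
        float ydiff = y - .50f;
        if (xdiff * xdiff + ydiff * ydiff <= .25f) //same test as r <= .50, without the sqrt
            hits += 1;
    }
}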
I'm doing a C++ approximation of Pi using a random number generator. The output works exactly as expected on my AMD64 machine running Ubuntu; however, on my school machine the second algorithm I've implemented is broken, and I would love some insight as to why. The code is as follows:
#ifndef RANDOMNUMBER_H_
#define RANDOMNUMBER_H_

#include <ctime> //time()
#include <cmath> //pow()

class RandomNumber {
public:
RandomNumber() {
x = time(NULL);
m = pow(2, 19); //some constant value
M = 65915 * 7915; //product of two primes p and q
method = 1;
}
RandomNumber(int seed) {
x = ((seed > 0) ? seed : time(NULL));
m = pow(2, 19); //some constant value
method = 1; //method number
M = 6543 * 7915; //product of two primes p and q
}
void setSeed(long int seed) {
x = seed; //set start value
}
void chooseMethod(int method) {
this->method = ((method > 0 && method <= 2) ? method : 1); //choose one of the two methods
}
long int linearCongruential() { //first generator, using the linear congruential method
long int c = 0; // some constant
long int a = 69069; //some constant
x = (a * x + c) % m; //compute the next value
return x;
}
long int BBS() { //algorithm Blum - Blum - Shub
x = (long int) (pow(x, 2)) % M;
return x;
}
double nextPoint() { //return a random number in the range [0;1)
double point;
if (method == 1) //use first method
point = linearCongruential() / double(m);
else
point = BBS() / double(M);
return point;
}
private:
long int x; //current value
long int m; // some range for first method
long int M; //some range for second method
int method; //method number
};
#endif /* RANDOMNUMBER_H_ */
and the test program:
#include <iostream>
#include <stdlib.h>
#include <math.h>
#include <iomanip>
#include "RandomNumber.h"
using namespace std;
int main(int argc, char* argv[]) {
cout.setf(ios::fixed);
cout.precision(6);
RandomNumber random;
random.setSeed(argc);
srand((unsigned) time(NULL));
cout << "---------------------------------" << endl;
cout << " Monte Carlo Pi Approximation" << endl;
cout << "---------------------------------" << endl;
cout << " Enter number of points: ";
long int k1;
cin >> k1;
cout << "Select generator number: ";
int method;
cin >> method;
random.chooseMethod(method);
cout << "---------------------------------" << endl;
long int k2 = 0;
double sumX = 0;
double sumY = 0;
for (long int i = 0; i < k1; i++) {
double x = pow(-1, int(random.nextPoint() * 10) % 2)
* random.nextPoint();
double y = pow(-1, int(random.nextPoint() * 10) % 2)
* random.nextPoint();
sumX += x;
sumY += y;
if ((pow(x, 2) + pow(y, 2)) <= 1)
k2++;
}
double pi = 4 * (double(k2) / k1);
cout << "M(X) = " << setw(10) << sumX / k1 << endl; //mathematical expectation of x
cout << "M(Y) = " << setw(10) << sumY / k1 << endl; //mathematical expectation of y
cout << endl << "Pi = " << pi << endl << endl; //approximate Pi
return 0;
}
The second method returns 4.000 consistently on my lab machine, yet returns a rather close approximation on my personal machine.
For one thing, the BBS generator as you're using it will always return 1.
Since your program takes no arguments, presumably its argc will be 1. You pass argc as the seed (why?), so the initial value of x is 1.
BBS() has the following logic:
x = (long int) (pow(x, 2)) % M;
Clearly, 1 squared modulo M gives 1, so x never changes.
When you run the simulation with such a generator, your program will always output 4.
P.S. Wikipedia has the following to say about the initial value x0 for Blum Blum Shub:
The seed x0 should be an integer that's co-prime to M (i.e. p and q are not factors of x0) and not 1 or 0.
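In code, safer seeding could look like this minimal sketch (pickBBSSeed is a hypothetical helper; std::gcd requires C++17, and probing upward past invalid values is an arbitrary choice):

#include <ctime>
#include <numeric> //std::gcd (C++17)

long int pickBBSSeed(long int M) {
    long int seed = time(NULL) % M;
    //reject 0 and 1, and anything sharing a factor with M = p*q
    while (seed <= 1 || std::gcd(seed, M) != 1)
        ++seed;
    return seed;
}

//usage, with the M from the default constructor above:
//random.setSeed(pickBBSSeed(65915L * 7915L));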