I've been playing around with RcppParallel and coded up a fairly simple example to figure out how things work. The code is displayed below.
The function float pdf(double x, double sigma) calculates a scaled version of a Gaussian density with mean 0 and standard deviation sigma; concretely, it returns exp(-x^2/(2*sigma^2))/sigma, i.e. the N(0, sigma^2) density without the 1/sqrt(2*pi) factor.
Struct_1 is a struct that defines a worker to perform the calculations. I populate a matrix with intermediate values to figure out why certain things are not working correctly.
void Struct_check() performs the calculations.
The function seems to work, but every now and again it does not produce the expected result. I think it has to do with the types used to perform the calculations in the function pdf!
An example run is displayed below the code.
I would appreciate any help!
#include <RcppParallel.h>
#include <RcppArmadillo.h>
#include <RcppArmadilloExtensions/sample.h>
#include <math.h>
#define pi 3.14159265358979323846 /* pi */
using namespace arma;
using namespace Rcpp;
using namespace R;
using namespace sugar;
using namespace std;
using namespace RcppParallel;
// Enable C++11 via this plugin (Rcpp 0.10.3 or later)
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::depends(RcppParallel)]]
// Returns a scaled Gaussian density at x, for mean 0 and standard deviation sigma.
float pdf(double x, double sigma)
{
return exp( -1 * x * x / (2 * sigma * sigma)) / sigma;
}
struct Struct_1 : public Worker
{
arma::vec wr;
arma::vec sr;
NumericVector w2;
// source matrix
const RVector<double> input;
// destination matrix
RMatrix<double> output;
// initialize with source and destination
Struct_1(const NumericMatrix input, NumericMatrix output)
: input(input), output(output) {}
//what is done.
void operator()(std::size_t begin, std::size_t end) {
for (std::size_t i=begin; i<end; i++){ //the processor loop!
NumericVector w2(3);
for (int comp_j=0; comp_j<3; ++comp_j){
w2(comp_j) = wr(comp_j) * pdf( input[i], sr(comp_j) ) ;
}
double sw1 = sum(w2);
output(i,0) = w2(0);
output(i,1) = w2(1);
output(i,2) = w2(2);
output(i,3) = sw1;
w2 = w2/sw1;
output(i,4) = w2(0);
output(i,5) = w2(1);
output(i,6) = w2(2);
double sw2 = sum(w2);
output(i,7) = sw2;
}//end of i loop
}//end of operator
};
// [[Rcpp::depends("RcppArmadillo")]]
// [[Rcpp::export]]
void Struct_check(){
//Some vecs defined
arma::vec wr = {0.2522, 0.58523, 0.16257};
arma::vec s2r = {1.2131, 2.9955, 7.5458};
arma::vec sr = sqrt(s2r);
//an arma mat that will be used in the struct
arma::mat arb_mat;
arb_mat.randn(20);
Rcout<<"Arb_mat=\n"<<arb_mat<<endl;
NumericMatrix r_i_x_NM = as<NumericMatrix>(wrap( arb_mat )); //convert to NumericMatrix
NumericMatrix output( r_i_x_NM.nrow() , 8 ); //define the output matrix
Struct_1 struct_1( r_i_x_NM , output);
struct_1.wr = wr;
struct_1.sr = sr;
Rcout<<"nrow output = "<<output.nrow()<<endl;
Rcout<<"ncol output = "<<output.ncol()<<endl;
parallelFor(0, r_i_x_NM.length(), struct_1);
Rcout<<"completed Parallell calculations"<<endl;
Rcout<<"output = \n"<<output<<endl;
}
Run from within RStudio. I am running OS X El Capitan, if that matters.
Struct_check()
Arb_mat=
-0.4539
0.7915
0.2581
1.5917
0.3718
0.4452
0.1230
-1.4719
0.0024
2.6166
-0.4839
-1.2865
2.0492
-1.5980
-0.7531
-0.7312
-1.4482
0.0202
0.4434
-0.0224
nrow output = 20
ncol output = 8
completed parallel calculations
output =
0.210336 0.326704 0.0583792 0.595419 0.353256 0.548696 0.0980473 1.00000
0.176872 0.304564 0.0567753 0.538211 0.328629 0.565882 0.105489 1.00000
0.222778 0.334398 0.0589211 0.616097 0.361596 0.542768 0.0956361 1.00000
0.0805904 0.221529 0.0500356 0.352155 0.228849 0.629067 0.142084 1.00000
0.216296 0.330423 0.0586421 0.605361 0.357301 0.545827 0.0968712 1.00000
0.211018 0.327133 0.0584096 0.596561 0.353724 0.548365 0.0979106 1.00000
0.227556 0.337284 0.0591224 0.623962 0.364695 0.540551 0.0947533 1.00000
0.0937487 0.235521 0.0512670 0.380537 0.246359 0.618918 0.134723 1.00000
0.228979 0.338136 0.0591817 0.626297 0.365608 0.539897 0.0944947 1.00000
0.0136216 0.107837 0.0375975 0.159056 0.0856401 0.677981 0.236379 1.00000
0.207911 0.325174 0.0582705 0.591355 0.351584 0.549879 0.0985372 1.00000
0.115751 0.256513 0.0530344 0.425298 0.272164 0.603137 0.124699 1.00000
0.0405607 0.167755 0.0448066 0.253123 0.160241 0.662743 0.177015 1.00000
0.0799309 0.220793 0.0499695 0.350694 0.227922 0.629590 0.142488 1.00000
0.181248 0.307594 0.0569989 0.545841 0.332053 0.563523 0.104424 1.00000
0.183689 0.309265 0.0571216 0.550075 0.333934 0.562222 0.103843 1.00000
**0.228941 0.338113 0.0591801 0.618557 0.591026 0.872861 0.152777 1.61666**
0.228941 0.338113 0.0591801 0.626234 0.365583 0.539915 0.0945016 1.61666
0.211153 0.327218 0.0584156 0.596786 0.353816 0.548300 0.0978837 1.00000
0.228932 0.338108 0.0591798 0.626220 0.365578 0.539919 0.0945032 1.00000
The error occurs when -1.4482 is evaluated, producing the line: 0.228941 0.338113 0.0591801 0.618557 0.591026 0.872861 0.152777 1.61666
Checking in R, I get:
wr <- c(0.2522, 0.58523, 0.16257)
s2r <- c(1.2131, 2.9955, 7.5458)
sr <- sqrt(s2r)
w<-NULL
for (i in 1:3){
w[i] = wr[i]*exp( -0.5*((-1.4482/sr[i])^ 2))/(sr[i])
}
w
[1] 0.09646706 0.23826346 0.05150315
sum(w)
[1] 0.3862337
w = w/sum(w)
w
[1] 0.2497635 0.6168894 0.1333471
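For reference, the RcppParallel documentation warns that worker code must not call the R or Rcpp API, and allocating a NumericVector and calling the sugar function sum() inside operator() does exactly that. A thread-safe sketch of the inner loop using only plain C++ doubles (my rework, not part of the original code):
void operator()(std::size_t begin, std::size_t end) {
    for (std::size_t i = begin; i < end; i++) {
        double w2[3];                       // plain C++ storage instead of NumericVector
        double sw1 = 0.0;
        for (int comp_j = 0; comp_j < 3; ++comp_j) {
            w2[comp_j] = wr(comp_j) * pdf(input[i], sr(comp_j));
            sw1 += w2[comp_j];              // accumulate instead of sugar sum()
        }
        output(i, 0) = w2[0];
        output(i, 1) = w2[1];
        output(i, 2) = w2[2];
        output(i, 3) = sw1;
        double sw2 = 0.0;
        for (int comp_j = 0; comp_j < 3; ++comp_j) {
            output(i, 4 + comp_j) = w2[comp_j] / sw1;
            sw2 += w2[comp_j] / sw1;
        }
        output(i, 7) = sw2;
    }
}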
I'm trying to reproduce some numpy code on Gaussian Processes (from here) using Eigen. Basically, I need to sample from a multivariate normal distribution:
samples = np.random.multivariate_normal(mu.ravel(), cov, 1)
The mean vector is currently zero, while the covariance matrix is a square matrix generated via the isotropic squared exponential kernel:
sqdist = np.sum(X1**2, 1).reshape(-1, 1) + np.sum(X2**2, 1) - 2 * np.dot(X1, X2.T)
return sigma_f**2 * np.exp(-0.5 / l**2 * sqdist)
I can generate the covariance matrix just fine for now (it can probably be cleaned but for now it's a POC):
Matrix2D kernel(const Matrix2D & x1, const Matrix2D & x2, double l = 1.0, double sigma = 1.0) {
auto dists = ((- 2.0 * (x1 * x2.transpose())).colwise()
+ x1.rowwise().squaredNorm()).rowwise()
+ x2.rowwise().squaredNorm().transpose();
return std::pow(sigma, 2) * ((-0.5 / std::pow(l, 2)) * dists).array().exp();
}
However, my problems start when I need to sample the multivariate normal.
I've tried using the solution proposed in this accepted answer; however, the decomposition only works with covariance matrices of size up to 30x30; more than that and LLT fails to decompose the matrix. The alternative version provided in the answer also does not work, and creates NaNs. I tried LDLT as well but it also breaks (D contains negative values, so sqrt gives NaN).
Then, I got curious, and I looked into how numpy does this. Turns out the numpy implementation uses SVD decomposition (with LAPACK), rather than Cholesky. So I tried copying their implementation:
// SVD on the covariance matrix generated via kernel function
Eigen::BDCSVD<Matrix2D> solver(covs, Eigen::ComputeFullV);
normTransform = (-solver.matrixV().transpose()).array().colwise() * solver.singularValues().array().sqrt();
// Generate gaussian samples, "randN" is from the multivariate StackOverflow answer
Matrix2D gaussianSamples = Eigen::MatrixXd::NullaryExpr(1, means.size(), randN);
Eigen::MatrixXd samples = (gaussianSamples * normTransform).rowwise() + means.transpose();
The various minuses are me trying to exactly reproduce numpy's results.
In any case, this works perfectly fine, even with large dimensions. I was wondering why Eigen is not able to do LLT, but SVD works. The covariance matrix I use is the same. Is there something I can do to simply use LLT?
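As an aside (not from the original post): a squared-exponential covariance matrix is notoriously ill-conditioned; its eigenvalues decay rapidly toward zero, and beyond a certain size rounding pushes some of them slightly negative, so the matrix is only numerically positive semi-definite. LLT requires strict positive definiteness, while sampling only needs some transform A with A*A^T = C, which both the Cholesky factor and V*diag(sqrt(s_i)) from the SVD satisfy; that is why the SVD route still works. The usual workaround for LLT is to add a tiny "nugget" to the diagonal. A minimal sketch against the posted kernel (the jitter scale is a judgment call):
Matrix2D covs = kernel(X, X);
// jitter: bump the diagonal by a tiny multiple of its average magnitude
double nugget = 1e-10 * covs.diagonal().mean();
covs += nugget * Matrix2D::Identity(covs.rows(), covs.cols());
Eigen::LLT<Matrix2D> cholSolver(covs);
if (cholSolver.info() == Eigen::Success) {
    Eigen::MatrixXd normTransform = cholSolver.matrixL();
    // proceed as before
}
With the jitter in place, cholSolver.info() should return Eigen::Success well beyond 30x30.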
EDIT: Here is my full example:
#include <iostream>
#include <random>
#include <Eigen/Cholesky>
#include <Eigen/SVD>
#include <Eigen/Eigenvalues>
using Matrix2D = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor | Eigen::AutoAlign>;
using Vector = Eigen::Matrix<double, Eigen::Dynamic, 1>;
/*
We need a functor that can pretend it's const,
but to be a good random number generator
it needs mutable state.
*/
namespace Eigen {
namespace internal {
template<typename Scalar>
struct scalar_normal_dist_op
{
static std::mt19937 rng; // The uniform pseudo-random algorithm
mutable std::normal_distribution<Scalar> norm; // The gaussian combinator
EIGEN_EMPTY_STRUCT_CTOR(scalar_normal_dist_op)
template<typename Index>
inline const Scalar operator() (Index, Index = 0) const { return norm(rng); }
};
template<typename Scalar> std::mt19937 scalar_normal_dist_op<Scalar>::rng;
template<typename Scalar>
struct functor_traits<scalar_normal_dist_op<Scalar> >
{ enum { Cost = 50 * NumTraits<Scalar>::MulCost, PacketAccess = false, IsRepeatable = false }; };
} // end namespace internal
} // end namespace Eigen
Matrix2D kernel(const Matrix2D & x1, const Matrix2D & x2, double l = 1.0, double sigma = 1.0) {
auto dists = ((- 2.0 * (x1 * x2.transpose())).colwise() + x1.rowwise().squaredNorm()).rowwise() + x2.rowwise().squaredNorm().transpose();
return std::pow(sigma, 2) * ((-0.5 / std::pow(l, 2)) * dists).array().exp();
}
int main() {
unsigned size = 50;
unsigned seed = 1;
Matrix2D X = Vector::LinSpaced(size, -5.0, 4.8);
Eigen::internal::scalar_normal_dist_op<double> randN; // Gaussian functor
Eigen::internal::scalar_normal_dist_op<double>::rng.seed(seed); // Seed the rng
Vector means = Vector::Zero(X.rows());
auto covs = kernel(X, X);
Eigen::LLT<Matrix2D> cholSolver(covs);
// We can only use the cholesky decomposition if
// the covariance matrix is symmetric, pos-definite.
// But a covariance matrix might be pos-semi-definite.
// In that case, we'll go to an EigenSolver
Eigen::MatrixXd normTransform;
if (cholSolver.info()==Eigen::Success) {
std::cout << "Used LLT\n";
// Use cholesky solver
normTransform = cholSolver.matrixL();
} else {
std::cout << "Broken\n";
Eigen::BDCSVD<Matrix2D> solver(covs, Eigen::ComputeFullV);
normTransform = (-solver.matrixV().transpose()).array().colwise() * solver.singularValues().array().sqrt();
}
Matrix2D gaussianSamples = Eigen::MatrixXd::NullaryExpr(1, means.size(), randN);
Eigen::MatrixXd samples = (gaussianSamples * normTransform).rowwise() + means.transpose();
return 0;
}
I presume, or rather hope, that I have a single fixable problem, rather than many smaller ones that mean I should give up. Either way, I am relatively new to Rcpp, extremely uninformed on parallel computation, and can't find a solution online.
The problem is typically a 'fatal error' in R, or R getting stuck in a loop: something like 5 minutes for 10 iterations, when the non-parallel version will do roughly 5K iterations in the same time.
As this algorithm fits into a much larger project, I call several other functions; these are all in Rcpp, and I rewrote them using only 'arma' objects, as that seemed to help other people here. I also ran the optimization part with a 'heat map' optimizer I wrote in Rcpp, again exclusively in 'arma', without improvement. I should also point out that this optimizer returns an 'arma::vec'.
// [[Rcpp::depends("RcppArmadillo")]]
// [[Rcpp::depends("RcppParallel")]]
#include <RcppArmadillo.h>
#include <RcppParallel.h>
using namespace Rcpp;
using namespace std;
using namespace arma;
using namespace RcppParallel;
struct Boot_Worker : public Worker {
//Generate Inputs
// Source vector to keep track of the number of bootstraps
const arma::vec Boot_reps;
// Initial non-linear theta parameter values
const arma::vec init_val;
// Decimal date vector
const arma::colvec T_series;
// Generate the price series observational vector
const arma::colvec Y_est;
const arma::colvec Y_res;
// Generate the optimization constants
const arma::mat U;
const arma::colvec C;
const int N;
// Generate Output Matrix
arma::mat Boots_out;
// Initialize with the proper input and output
Boot_Worker( const arma::vec Boot_reps, const arma::vec init_val, const arma::colvec T_series, const arma::colvec Y_est, const arma::colvec Y_res, const arma::mat U, const arma::colvec C, const int N, arma::mat Boots_out)
: Boot_reps(Boot_reps), init_val(init_val), T_series(T_series), Y_est(Y_est), Y_res(Y_res), U(U), C(C), N(N), Boots_out(Boots_out) {}
void operator()(std::size_t begin, std::size_t end){
//load necessary stuffs from around
Rcpp::Environment stats("package:stats");
Rcpp::Function constrOptim = stats["constrOptim"];
Rcpp::Function SDK_pred_mad( "SDK_pred_mad");
arma::mat fake_data(N,2);
arma::colvec index(N);
for(unsigned int i = begin; i < end; i ++){
// Need a nested loop to create and fill the fake data matrix
arma::vec pool = arma::regspace(0, N-1) ;
std::random_shuffle(pool.begin(), pool.end());
for(int k = 0; k <= N-1; k++){
fake_data(k, 0) = Y_est[k] + Y_res[ pool[k] ];
fake_data(k, 1) = T_series[k];
}
// Call the optimization
Rcpp::List opt_results = constrOptim(Rcpp::_["theta"] = init_val,
Rcpp::_["f"] = SDK_pred_mad,
Rcpp::_["data_in"] = fake_data,
Rcpp::_["grad"] = "NULL",
Rcpp::_["method"] = "Nelder-Mead",
Rcpp::_["ui"] = U,
Rcpp::_["ci"] = C );
/// fill the output matrix ///
// need to create a placeholder arma vector for the parameter output
arma::vec opt_param = Rcpp::as<arma::vec>(opt_results[0]);
Boots_out(i, 0) = opt_param[0];
Boots_out(i, 1) = opt_param[1];
Boots_out(i, 2) = opt_param[2];
// for the cost function value at optimization
arma::vec opt_value = Rcpp::as<arma::vec>(opt_results[1]);
Boots_out(i, 3) = opt_value[0];
// for the number of function calls (?)
arma::vec counts = Rcpp::as<arma::vec>(opt_results[2]);
Boots_out(i, 4) = counts[0];
// for the convergence code
arma::vec convergence = Rcpp::as<arma::vec>(opt_results[3]);
Boots_out(i, 5) = convergence[0];
}
}
};
// [[Rcpp::export]]
arma::mat SDK_boots_test(arma::vec init_val, arma::mat data_in, int boots_n){
//First establish theta_sp, estimate and residuals
const int N = arma::size(data_in)[0];
// Create the constraints for the constrained optimization
// Make a boundary condition matrix of the form Ui*theta - ci >= 0
arma::mat U(6, 3);
U(0, 0) = 1;
U(1, 0) = -1;
U(2, 0) = 0;
U(3, 0) = 0;
U(4, 0) = 0;
U(5, 0) = 0;
U(0, 1) = 0;
U(1, 1) = 0;
U(2, 1) = 1;
U(3, 1) = -1;
U(4, 1) = 0;
U(5, 1) = 0;
U(0, 2) = 0;
U(1, 2) = 0;
U(2, 2) = 0;
U(3, 2) = 0;
U(4, 2) = 1;
U(5, 2) = -1;
arma::colvec C(6);
C[0] = 0;
C[1] = -data_in(N-1, 9)-0.5;
C[2] = 0;
C[3] = -3;
C[4] = 0;
C[5] = -50;
Rcpp::Function SDK_est( "SDK_est");
Rcpp::Function SDK_res( "SDK_res");
arma::vec Y_est = as<arma::vec>(SDK_est(init_val, data_in));
arma::vec Y_res = as<arma::vec>(SDK_res(init_val, data_in));
// Generate feed items for the Bootstrap Worker
arma::vec T_series = data_in( span(0, N-1), 9);
arma::vec Boots_reps(boots_n+1);
// Allocate the output matrix
arma::mat Boots_out(boots_n, 6);
// Pass input and output the Bootstrap Worker
Boot_Worker Boot_Worker(Boots_reps, init_val, T_series, Y_est, Y_res, U, C, N, Boots_out);
// Now finally call the parallel for loop
parallelFor(0, Boots_reps.size(), Boot_Worker);
return Boots_out;
}
So I swapped my 'heat algorithm' back in to solve the optimization. This is entirely in Rcpp/Armadillo, which simplifies the code massively, as the constraints are written into the optimizer. Additionally, I removed the randomization, so it just has to solve the same optimization each time, to see whether that was the only problem. Without fail, I am still getting the same 'fatal error'.
As it stands, here is the code:
// [[Rcpp::depends("RcppArmadillo")]]
// [[Rcpp::depends("RcppParallel")]]
#include <RcppArmadillo.h>
#include <RcppParallel.h>
#include <random>
using namespace Rcpp;
using namespace std;
using namespace arma;
using namespace RcppParallel;
struct Boot_Worker : public Worker {
//Generate Inputs
// Source vector to keep track of the number of bootstraps
const arma::vec Boot_reps;
// Initial non-linear theta parameter values
const arma::vec init_val;
// Decimal date vector
const arma::colvec T_series;
// Generate the price series observational vector
const arma::colvec Y_est;
const arma::colvec Y_res;
const int N;
// Generate Output Matrix
arma::mat Boots_out;
// Initialize with the proper input and output
Boot_Worker( const arma::vec Boot_reps, const arma::vec init_val, const arma::colvec T_series, const arma::colvec Y_est, const arma::colvec Y_res, const int N, arma::mat Boots_out)
: Boot_reps(Boot_reps), init_val(init_val), T_series(T_series), Y_est(Y_est), Y_res(Y_res), N(N), Boots_out(Boots_out) {}
void operator()(std::size_t begin, std::size_t end){
//load necessary stuffs from around
Rcpp::Function SDK_heat( "SDK_heat");
arma::mat fake_data(N,2);
arma::colvec index(N);
for(unsigned int i = begin; i < end; i ++){
// Need a nested loop to create and fill the fake data matrix
//arma::vec pool = arma::shuffle( arma::regspace(0, N-1) );
for(int k = 0; k <= N-1; k++){
fake_data(k, 0) = Y_est[k] + Y_res[ k ];
//fake_data(k, 0) = Y_est[k] + Y_res[ pool[k] ];
fake_data(k, 1) = T_series[k];
}
// Call the optimization
arma::vec opt_results = Rcpp::as<arma::vec>( SDK_heat(Rcpp::_["data_in"] = fake_data, Rcpp::_["tol"] = 0.1) );
/// fill the output matrix ///
// need to create a placeholder arma vector for the parameter output
Boots_out(i, 0) = opt_results[0];
Boots_out(i, 1) = opt_results[1];
Boots_out(i, 2) = opt_results[2];
// for the cost function value at optimization
Boots_out(i, 3) = opt_results[3];
}
}
};
// [[Rcpp::export]]
arma::mat SDK_boots_test(arma::vec init_val, arma::mat data_in, int boots_n){
//First establish theta_sp, estimate and residuals
const int N = arma::size(data_in)[0];
Rcpp::Function SDK_est( "SDK_est");
Rcpp::Function SDK_res( "SDK_res");
const arma::vec Y_est = as<arma::vec>(SDK_est(init_val, data_in));
const arma::vec Y_res = as<arma::vec>(SDK_res(init_val, data_in));
// Generate feed items for the Bootstrap Worker
const arma::vec T_series = data_in( span(0, N-1), 9);
arma::vec Boots_reps(boots_n+1);
// Allocate the output matrix
arma::mat Boots_out(boots_n, 4);
// Pass input and output the Bootstrap Worker
Boot_Worker Boot_Worker(Boots_reps, init_val, T_series, Y_est, Y_res, N, Boots_out);
// Now finally call the parallel for loop
parallelFor(0, Boots_reps.size(), Boot_Worker);
return Boots_out;
}
Looking at your code I see the following:
struct Boot_Worker : public Worker {
[...]
void operator()(std::size_t begin, std::size_t end){
//load necessary stuffs from around
Rcpp::Environment stats("package:stats");
Rcpp::Function constrOptim = stats["constrOptim"];
Rcpp::Function SDK_pred_mad( "SDK_pred_mad");
[...]
// Call the optimization
Rcpp::List opt_results = constrOptim(Rcpp::_["theta"] = init_val,
Rcpp::_["f"] = SDK_pred_mad,
Rcpp::_["data_in"] = fake_data,
Rcpp::_["grad"] = "NULL",
Rcpp::_["method"] = "Nelder-Mead",
Rcpp::_["ui"] = U,
Rcpp::_["ci"] = C );
You are calling an R function from a multi-threaded C++ context. That's something you should not do. R is single-threaded, so this will lead to undefined behavior or crashes:
API Restrictions
The code that you write within parallel workers should not call the R or Rcpp API in any fashion. This is because R is single-threaded and concurrent interaction with its data structures can cause crashes and other undefined behavior. Here is the official guidance from Writing R Extensions:
Calling any of the R API from threaded code is ‘for experts only’: they will need to read the source code to determine if it is thread-safe. In particular, code which makes use of the stack-checking mechanism must not be called from threaded code.
Besides, calling back to R from C++ even in a single-threaded context is not the best thing you can do for performance. It should be more efficient to use an optimization library that offers a direct C(++) interface. One possibility might be the development version of nlopt; cf. this issue for a discussion and references to examples. In addition, std::random_shuffle is not only deprecated in C++14 and removed in C++17, but it is also not thread-safe.
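As an aside, a thread-safe replacement for the shuffle (a sketch, not taken from the question's code) gives each thread its own generator and uses std::shuffle:
#include <algorithm> // std::shuffle
#include <random>    // std::mt19937, std::random_device

// inside operator(): one generator per thread, seeded once per thread
thread_local std::mt19937 rng(std::random_device{}());
arma::vec pool = arma::regspace(0, N - 1);
std::shuffle(pool.begin(), pool.end(), rng);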
In your second example, you say that the function SDK_heat is actually implemented in C++. In that case you can call it directly:
Remove the import of the corresponding R function, i.e. the Rcpp::Function SDK_heat( "SDK_heat");
Make sure that the compiler knows the declaration of the C++ function and that the linker has the actual function:
Quick and dirty: copy the function definition into your cpp file before the definition of Boot_Worker.
For a cleaner approach, see section "1.10 Sharing code" in the Rcpp attributes vignette
Call the function like any other C++ function, i.e. using positional arguments with types compatible with the function declaration (a sketch follows below).
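Putting these steps together, and assuming, hypothetically, that the C++ version of SDK_heat has a signature along the lines of arma::vec SDK_heat(const arma::mat& data_in, double tol), the call inside the worker loop would reduce to something like:
// declaration (or the full definition) visible before Boot_Worker
arma::vec SDK_heat(const arma::mat& data_in, double tol);

// inside operator(): a plain C++ call, no R API touched from the thread
arma::vec opt_results = SDK_heat(fake_data, 0.1);
Boots_out(i, 0) = opt_results[0];
Boots_out(i, 1) = opt_results[1];
Boots_out(i, 2) = opt_results[2];
Boots_out(i, 3) = opt_results[3];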
All this assumes you are using sourceCpp, as indicated by your usage of [[Rcpp::depends(...)]]. You are reaching a complexity that warrants building a package from this.
I am trying to check if my 1D fftw3 implementation is right by testing for the first derivative of a sinusoidal input sample. The original sample size is nX and I've padded it with zeros on both ends of the sample such that the new sample size is 3 * nX (nX3).
The accuracy without the padding wasn't good for the derivatives and it got worse with higher derivatives, making zero padding seem necessary.
However, the padded input gives much lower accuracy than the non-padded one. The accuracy was checked by comparing the first-order derivative of the sample input (sin(x)), real(out[]), with its analytical value, cos(x). The code is shown below.
#include<iostream>
#include<cmath>
#include<complex>
#include<fftw3.h>
using namespace std;
int main()
{
int i,ir,nX;
nX = 16;
int nX3, nX2; //padded array dimension = nX3
nX3 = 3*nX;
nX2=2*nX;
double Nd = (double)nX3;
int id;
double pi = M_PI;
std::complex<double> *in, *out;
in = (complex<double>*)malloc(nX3*sizeof(complex<double>));
if(in==NULL) { cout<<"inalloc error\n"<<endl;}
out = (complex<double>*)malloc(nX3*sizeof(complex<double>));
if(out==NULL) { cout<<"outalloc error\n"<<endl;}
fftw_complex *bt;
bt = (fftw_complex*)fftw_malloc(sizeof(fftw_complex)*nX3);
fftw_plan p, q;
p = fftw_plan_dft_1d(nX3, reinterpret_cast<fftw_complex*>(in), bt, FFTW_FORWARD,FFTW_MEASURE);
q = fftw_plan_dft_1d(nX3, bt, reinterpret_cast<fftw_complex*>(out), FFTW_BACKWARD,FFTW_MEASURE);
for(i=0;i<nX3;i++)
{
in[i] = {0.0,0.0}; //initialising padded input array
}
for(i=nX;i<nX2;i++)
{
double id = (double)(i-nX)*pi/nX;
//sinusoidal input in the central square of the padded array (3 squares)
//varies in 'i' direction
in[i] = complex<double>(sin(id),0.0);
//cout<<"i\t"<<i-nX<<"\t"<<in[i]<<endl;
}
fftw_execute(p); //fourier transform
double kx;
int x1;
kx = 2.0*pi/(double)nX3;
double kx1;
for(i=0;i<nX;i++)
{
double btr(0.0), btc(0.0); //temporary variables
id=i+nX;
if(id<nX3/2) //1/2 filter
{
x1 = id;
}
else if(id==nX3/2)
{
x1 = 0;
}
else
{
x1 = id-nX3;
}
kx1=kx*(double)x1;
//complex first derivative array 'bt[id]'
btr = -1.0*kx1*bt[id][1]; //real part
btc = kx1*bt[id][0]; //imaginary part
bt[id][0] = btr;
bt[id][1] = btc;
}
fftw_execute(q); //inverse fourier transform
for(i=0;i<nX;i++)
{
//the input sample occupies the central third of the padded array, hence the derivative is read from out[i+nX]
double id = (double)i*pi/nX;
double c = cos(id); //analytical value of first derivative for comparison with 'out[i+NX]'
cout<<"i\t"<<i<<"\t"<<c<<"\t"<<real(out[i+nX])/Nd<<endl;
//analytical value 'c' compared with fftw3 result, 'out'
//normalising 'out[i+nX]' by dividing it by the padded array length nX3
}
fftw_free(bt); //memory from fftw_malloc must be released with fftw_free
free(in);
free(out);
fftw_destroy_plan(p);
fftw_destroy_plan(q);
}
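For reference, the textbook spectral first derivative multiplies every DFT bin by i*k, with k = 2*pi*m/L and the bin index m wrapped to [-N/2, N/2) (the Nyquist bin is zeroed for odd derivatives). Two things worth noting against the code above: the loop there applies the i*k factor only to bins nX..2nX-1 of bt, leaving the rest of the spectrum unfiltered, and zero-padding sin(x) introduces a slope discontinuity at the pad boundaries, which by itself degrades spectral accuracy. A minimal, non-padded sketch of the standard recipe (my own reference code, not part of the question):
#include <iostream>
#include <cmath>
#include <complex>
#include <fftw3.h>
using namespace std;
int main()
{
    const int N = 64;
    const double L = 2.0 * M_PI; // domain length: one full period of sin(x)
    complex<double>* f = (complex<double>*)fftw_malloc(N * sizeof(complex<double>));
    complex<double>* F = (complex<double>*)fftw_malloc(N * sizeof(complex<double>));
    fftw_plan fwd = fftw_plan_dft_1d(N, reinterpret_cast<fftw_complex*>(f),
                                     reinterpret_cast<fftw_complex*>(F),
                                     FFTW_FORWARD, FFTW_ESTIMATE);
    fftw_plan bwd = fftw_plan_dft_1d(N, reinterpret_cast<fftw_complex*>(F),
                                     reinterpret_cast<fftw_complex*>(f),
                                     FFTW_BACKWARD, FFTW_ESTIMATE);
    for (int i = 0; i < N; i++)
        f[i] = complex<double>(sin(L * i / N), 0.0); // periodic sample, no padding
    fftw_execute(fwd);
    for (int m = 0; m < N; m++) {        // filter EVERY bin of the spectrum
        int k = (m < N / 2) ? m : m - N; // wrap bin index to [-N/2, N/2)
        if (m == N / 2) k = 0;           // zero the Nyquist bin
        F[m] *= complex<double>(0.0, k * 2.0 * M_PI / L); // multiply by i*k
    }
    fftw_execute(bwd);
    for (int i = 0; i < N; i++) // compare with the analytical derivative cos(x)
        cout << i << "\t" << cos(L * i / N) << "\t" << real(f[i]) / N << endl;
    fftw_destroy_plan(fwd);
    fftw_destroy_plan(bwd);
    fftw_free(f);
    fftw_free(F);
    return 0;
}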
I'm trying to speed up my R code, which performs some computationally expensive task, with C++ and Rcpp. My problem involves a system of approximately 100 equations, so any hint to speed up the computation is welcome.
What I need is to import a matrix MX created in R into a C++ script. The C++ code has to use the rows of MX as x0 (the initial values) in a system of ODEs.
To simplify the explanation of my problem, the code below is based on the Lorenz system.
As is clear from the quality of my code, I'm new to C++ (and Rcpp).
For brevity, I don't post all my test code, which is terrible, but I really need your help to solve this problem.
Any help will be really, really appreciated!
Thanks in advance.
#include <boost/array.hpp>
#include <boost/numeric/odeint.hpp>
#include <Rcpp.h>
// [[Rcpp::depends(BH)]]
// [[Rcpp::plugins(cpp11)]]
using namespace std;
using namespace boost::numeric::odeint;
double theta [] = {10.000,28,2.5};
typedef boost::array< double , 3 > state_type;
void lorenz( const state_type &x , state_type &dxdt , double t ) {
dxdt[0] = theta[0] * ( x[1] - x[0] );
dxdt[1] = theta[1] * x[0] - x[1] - x[0] * x[2];
dxdt[2] = -theta[2] * x[2] + x[0] * x[1];
}
struct foo { std::vector<double> a, b, c; };
struct foo f;
//the observer should be a function that appends a single output row for each input row of mx, corresponding to the last integration step.
void append_lorenz(const state_type &x , const double t ) {
f.a.push_back(x[0]);
f.b.push_back(x[1]);
f.c.push_back(x[2]);
}
using namespace Rcpp;
// [[Rcpp::export]]
DataFrame callMain(NumericMatrix mx){
int n = mx.nrow();
NumericMatrix total(mx);
for(int i = 0; i < n; ++i) {
// state_type x should be mx rows
state_type x = total.row(i); // initial conditions
const double dt =0.1;
integrate(lorenz , x , 0.0 , 1.0 , dt , append_lorenz );
}
return DataFrame::create(Named("a") = f.a, Named("b") = f.b, Named("c") = f.c);
}
/*** R
mx=matrix(1:9,3,3)
res <- callMain(mx)
print((res))
*/
the error I get is:
error: conversion from ‘Rcpp::Matrix<14>::Row {aka Rcpp::MatrixRow<14>}’ to non-scalar type ‘state_type {aka boost::array}’ requested
state_type x = total.row(i); // initial conditions
I think the error message is clear enough.
state_type x = total.row(i);
There is no conversion between an Rcpp object and boost::array; you need to write your own.
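A minimal sketch of such a conversion (boost::array has no constructor that accepts an Rcpp matrix row, so it is an element-wise copy):
// inside the loop over rows in callMain():
state_type x; // boost::array<double, 3>
for (std::size_t j = 0; j < x.size(); ++j)
    x[j] = total(i, j); // copy the i-th row element by element
const double dt = 0.1;
integrate(lorenz, x, 0.0, 1.0, dt, append_lorenz);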
I would like to use the runge_kutta4 method from the odeint C++ library. I've already solved the problem in Matlab. My Matlab code to solve x'' = -x - g*x', with initial values x1 = 1, x2 = 0, is as follows:
main.m
clear all
clc
t = 0:0.1:10;
x0 = [1; 0];
[t, x] = ode45('ODESolver', t, x0);
plot(t, x(:,1));
title('Position');
xlabel('time (sec)');
ylabel('x(t)');
ODESolver.m
function dx = ODESolver(t, x)
dx = zeros(2,1);
g = 0.15;
dx(1) = x(2);
dx(2) = -x(1) - g*x(2);
end
I've installed the odeint library. My code is as follows (after the update described below, it uses runge_kutta_dopri5):
#include <iostream>
#include <boost/numeric/odeint.hpp>
using namespace std;
using namespace boost::numeric::odeint;
/* The type of container used to hold the state vector */
typedef std::vector< double > state_type;
const double gam = 0.15;
/* The rhs of x' = f(x) */
void lorenz( const state_type &x , state_type &dx , double t )
{
dx[0] = x[1];
dx[1] = -x[0] - gam*x[1];
}
int main(int argc, char **argv)
{
const double dt = 0.1;
runge_kutta_dopri5<state_type> stepper;
state_type x(2);
x[0] = 1.0;
x[1] = 0.0;
double t = 0.0;
cout << x[0] << endl;
for ( size_t i(0); i <= 100; ++i){
stepper.do_step(lorenz, x , t, dt );
t += dt;
cout << x[0] << endl;
}
return 0;
}
The result is in the following picture
My question is why the result varies? Is there something wrong with my C++ code?
These are the first values of both methods
Matlab C++
-----------------
1.0000 0.9950
0.9950 0.9803
0.9803 0.9560
0.9560 0.9226
0.9226 0.8806
0.8806 0.8304
0.8304 0.7728
0.7728 0.7084
0.7083 0.6379
Update:
The problem is that I forgot to include the initial value in my C++ code. Thanks to @horchler for noticing it. After including the proper values and using runge_kutta_dopri5 as @horchler suggested, the result is:
Matlab C++
-----------------
1.0000 1.0000
0.9950 0.9950
0.9803 0.9803
0.9560 0.9560
0.9226 0.9226
0.8806 0.8806
0.8304 0.8304
0.7728 0.7728
0.7083 0.7084
I've updated the code to reflect these modifications.
The runge_kutta4 stepper in odeint is nothing like Matlab's ode45, which is an adaptive scheme based on the Dormand-Prince method. To replicate Matlab's results, you should probably try the runge_kutta_dopri5 stepper. Also, make sure that your C++ code uses the same absolute and relative tolerances as ode45 (defaults are 1e-6 and 1e-3, respectively). Lastly, it looks like you may not be saving/printing your initial condition in your C++ results.
If you're confused at why ode45 is not taking fixed steps even though you specified t = 0:0.1:10;, see my detailed answer here.
If you must use the fixed-step runge_kutta4 stepper, then you'll need to reduce the integration step size in your C++ code to match Matlab's output.
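For completeness, a fixed-step sketch of that suggestion, with an illustrative (not tuned) step size:
runge_kutta4<state_type> stepper;
state_type x(2);
x[0] = 1.0;
x[1] = 0.0;
const double dt = 0.001; // much smaller than the 0.1 used above
cout << x[0] << endl;    // don't forget to print the initial condition
for (double t = 0.0; t < 10.0; t += dt)
    stepper.do_step(lorenz, x, t, dt);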
The Matlab ode45 function already includes error control and, I think, also interpolation (dense output). To compare with Boost.odeint, you should use the same functionality there. Boost.odeint provides integrate functions that perform step-size control, and dense output if the underlying stepper provides this functionality. The following piece of code shows how this is used with the default error-control parameters from Matlab given by @horchler:
#include <boost/numeric/odeint.hpp>
using namespace std;
using namespace boost::numeric::odeint;
/* The type of container used to hold the state vector */
typedef std::vector< double > state_type;
const double gam = 0.15;
/* The rhs of x' = f(x) */
void damped_osc( const state_type &x , state_type &dx , const double t )
{
dx[0] = x[1];
dx[1] = -x[0] - gam*x[1];
}
void print( const state_type &x, const double t )
{
cout << x[0] << endl;
}
int main(int argc, char **argv)
{
cout.precision(16); // full precision output
const double dt = 0.1;
typedef runge_kutta_dopri5<state_type> stepper_type;
state_type x(2);
x[0] = 1.0;
x[1] = 0.0;
integrate_const(make_dense_output<stepper_type>( 1E-6, 1E-3 ),
damped_osc, x, 0.0, 10.0, dt , print);
return 0;
}
Please note that the results might still not be exactly the same (as in all 16 digits), because the error control in Boost.odeint might not be implemented exactly as in Matlab's ode45.