I presume, or rather hope, that I have a singular fixable problem or perhaps many smaller ones and should give up. Either way I am relatively new to Rcpp and extremely uninformed on parallel computation and can't find a solution online.
The problem is typically, a 'fatal error' in R or R gets stuck in a loop, something like 5 minuets for 10 iterations, when the non-parallel version will do 5K iterations in the same time, roughly speaking.
As this algorithm fits into a much larger project I call on several other functions, these are all in Rcpp and I rewrote them with only 'arma' objects as that seemed to help other people, here. I also ran the optimization part with a 'heat map' optimizer I wrote in Rcpp, again exclusively in 'arma' without improvement - I should also point out this returned as an 'arma::vec'.
// [[Rcpp::depends("RcppArmadillo")]]
// [[Rcpp::depends("RcppParallel")]]
#include <RcppArmadillo.h>
#include <RcppParallel.h>
using namespace Rcpp;
using namespace std;
using namespace arma;
using namespace RcppParallel;
struct Boot_Worker : public Worker {
//Generate Inputs
// Source vector to keep track of the number of bootstraps
const arma::vec Boot_reps;
// Initial non-linear theta parameter values
const arma::vec init_val;
// Decimal date vector
const arma::colvec T_series;
// Generate the price series observational vector
const arma::colvec Y_est;
const arma::colvec Y_res;
// Generate the optimization constants
const arma::mat U;
const arma::colvec C;
const int N;
// Generate Output Matrix
arma::mat Boots_out;
// Initialize with the proper input and output
Boot_Worker( const arma::vec Boot_reps, const arma::vec init_val, const arma::colvec T_series, const arma::colvec Y_est, const arma::colvec Y_res, const arma::mat U, const arma::colvec C, const int N, arma::mat Boots_out)
: Boot_reps(Boot_reps), init_val(init_val), T_series(T_series), Y_est(Y_est), Y_res(Y_res), U(U), C(C), N(N), Boots_out(Boots_out) {}
void operator()(std::size_t begin, std::size_t end){
//load necessary stuffs from around
Rcpp::Environment stats("package:stats");
Rcpp::Function constrOptim = stats["constrOptim"];
Rcpp::Function SDK_pred_mad( "SDK_pred_mad");
arma::mat fake_data(N,2);
arma::colvec index(N);
for(unsigned int i = begin; i < end; i ++){
// Need a nested loop to create and fill the fake data matrix
arma::vec pool = arma::regspace(0, N-1) ;
std::random_shuffle(pool.begin(), pool.end());
for(int k = 0; k <= N-1; k++){
fake_data(k, 0) = Y_est[k] + Y_res[ pool[k] ];
fake_data(k, 1) = T_series[k];
// Call the optimization
Rcpp::List opt_results = constrOptim(Rcpp::_["theta"] = init_val,
Rcpp::_["f"] = SDK_pred_mad,
Rcpp::_["data_in"] = fake_data,
Rcpp::_["grad"] = "NULL",
Rcpp::_["method"] = "Nelder-Mead",
Rcpp::_["ui"] = U,
Rcpp::_["ci"] = C );
/// fill the output matrix ///
// need to create an place holder arma vector for the parameter output
arma::vec opt_param = Rcpp::as<arma::vec>(opt_results[0]);
Boots_out(i, 0) = opt_param[0];
Boots_out(i, 1) = opt_param[1];
Boots_out(i, 2) = opt_param[2];
// for the cost function value at optimization
arma::vec opt_value = Rcpp::as<arma::vec>(opt_results[1]);
Boots_out(i, 3) = opt_value[0];
// for the number of function calls (?)
arma::vec counts = Rcpp::as<arma::vec>(opt_results[2]);
Boots_out(i, 4) = counts[0];
// for thhe convergence code
arma::vec convergence = Rcpp::as<arma::vec>(opt_results[3]);
Boots_out(i, 5) = convergence[0];
// [[Rcpp::export]]
arma::mat SDK_boots_test(arma::vec init_val, arma::mat data_in, int boots_n){
//First establish theta_sp, estimate and residuals
const int N = arma::size(data_in)[0];
// Create the constraints for the constrained optimization
// Make a boundry boundry condition matrix of the form Ui*theta - ci >= 0
arma::mat U(6, 3);
U(0, 0) = 1;
U(1, 0) = -1;
U(2, 0) = 0;
U(3, 0) = 0;
U(4, 0) = 0;
U(5, 0) = 0;
U(0, 1) = 0;
U(1, 1) = 0;
U(2, 1) = 1;
U(3, 1) = -1;
U(4, 1) = 0;
U(5, 1) = 0;
U(0, 2) = 0;
U(1, 2) = 0;
U(2, 2) = 0;
U(3, 2) = 0;
U(4, 2) = 1;
U(5, 2) = -1;
arma::colvec C(6);
C[0] = 0;
C[1] = -data_in(N-1, 9)-0.5;
C[2] = 0;
C[3] = -3;
C[4] = 0;
C[5] = -50;
Rcpp::Function SDK_est( "SDK_est");
Rcpp::Function SDK_res( "SDK_res");
arma::vec Y_est = as<arma::vec>(SDK_est(init_val, data_in));
arma::vec Y_res = as<arma::vec>(SDK_res(init_val, data_in));
// Generate feed items for the Bootstrap Worker
arma::vec T_series = data_in( span(0, N-1), 9);
arma::vec Boots_reps(boots_n+1);
// Allocate the output matrix
arma::mat Boots_out(boots_n, 6);
// Pass input and output the Bootstrap Worker
Boot_Worker Boot_Worker(Boots_reps, init_val, T_series, Y_est, Y_res, U, C, N, Boots_out);
// Now finnaly call the parallel for loop
parallelFor(0, Boots_reps.size(), Boot_Worker);
return Boots_out;
So I wrote back in my 'heat algorithm' to solve the optimization, this is entirely in Rcpp-armadillo, this simplifies the code massively as the constraints are written into the optimizer. Additionally, I removed the randomization, so it just has to solve the same optimization; just to see if that was the only problem. Without fail I am still having the same 'fatal error'.
as it stands here is code:
// [[Rcpp::depends("RcppArmadillo")]]
// [[Rcpp::depends("RcppParallel")]]
#include <RcppArmadillo.h>
#include <RcppParallel.h>
#include <random>
using namespace Rcpp;
using namespace std;
using namespace arma;
using namespace RcppParallel;
struct Boot_Worker : public Worker {
//Generate Inputs
// Source vector to keep track of the number of bootstraps
const arma::vec Boot_reps;
// Initial non-linear theta parameter values
const arma::vec init_val;
// Decimal date vector
const arma::colvec T_series;
// Generate the price series observational vector
const arma::colvec Y_est;
const arma::colvec Y_res;
const int N;
// Generate Output Matrix
arma::mat Boots_out;
// Initialize with the proper input and output
Boot_Worker( const arma::vec Boot_reps, const arma::vec init_val, const arma::colvec T_series, const arma::colvec Y_est, const arma::colvec Y_res, const int N, arma::mat Boots_out)
: Boot_reps(Boot_reps), init_val(init_val), T_series(T_series), Y_est(Y_est), Y_res(Y_res), N(N), Boots_out(Boots_out) {}
void operator()(std::size_t begin, std::size_t end){
//load necessary stuffs from around
Rcpp::Function SDK_heat( "SDK_heat");
arma::mat fake_data(N,2);
arma::colvec index(N);
for(unsigned int i = begin; i < end; i ++){
// Need a nested loop to create and fill the fake data matrix
//arma::vec pool = arma::shuffle( arma::regspace(0, N-1) );
for(int k = 0; k <= N-1; k++){
fake_data(k, 0) = Y_est[k] + Y_res[ k ];
//fake_data(k, 0) = Y_est[k] + Y_res[ pool[k] ];
fake_data(k, 1) = T_series[k];
// Call the optimization
arma::vec opt_results = Rcpp::as<arma::vec>( SDK_heat(Rcpp::_["data_in"] = fake_data, Rcpp::_["tol"] = 0.1) );
/// fill the output matrix ///
// need to create an place holder arma vector for the parameter output
Boots_out(i, 0) = opt_results[0];
Boots_out(i, 1) = opt_results[1];
Boots_out(i, 2) = opt_results[2];
// for the cost function value at optimization
Boots_out(i, 3) = opt_results[3];
// [[Rcpp::export]]
arma::mat SDK_boots_test(arma::vec init_val, arma::mat data_in, int boots_n){
//First establish theta_sp, estimate and residuals
const int N = arma::size(data_in)[0];
Rcpp::Function SDK_est( "SDK_est");
Rcpp::Function SDK_res( "SDK_res");
const arma::vec Y_est = as<arma::vec>(SDK_est(init_val, data_in));
const arma::vec Y_res = as<arma::vec>(SDK_res(init_val, data_in));
// Generate feed items for the Bootstrap Worker
const arma::vec T_series = data_in( span(0, N-1), 9);
arma::vec Boots_reps(boots_n+1);
// Allocate the output matrix
arma::mat Boots_out(boots_n, 4);
// Pass input and output the Bootstrap Worker
Boot_Worker Boot_Worker(Boots_reps, init_val, T_series, Y_est, Y_res, N, Boots_out);
// Now finnaly call the parallel for loop
parallelFor(0, Boots_reps.size(), Boot_Worker);
return Boots_out;

Looking at your code I see the following:
struct Boot_Worker : public Worker {
void operator()(std::size_t begin, std::size_t end){
//load necessary stuffs from around
Rcpp::Environment stats("package:stats");
Rcpp::Function constrOptim = stats["constrOptim"];
Rcpp::Function SDK_pred_mad( "SDK_pred_mad");
// Call the optimization
Rcpp::List opt_results = constrOptim(Rcpp::_["theta"] = init_val,
Rcpp::_["f"] = SDK_pred_mad,
Rcpp::_["data_in"] = fake_data,
Rcpp::_["grad"] = "NULL",
Rcpp::_["method"] = "Nelder-Mead",
Rcpp::_["ui"] = U,
Rcpp::_["ci"] = C );
You are calling an R function from a multi-threaded C++ context. That's something you should not do. R is single-threaded so this will lead to undefined behavior or crashes:
API Restrictions
The code that you write within parallel workers should not call the R or Rcpp API in any fashion. This is because R is single-threaded and concurrent interaction with it’s data structures can cause crashes and other undefined behavior. Here is the official guidance from Writing R Extensions:
Calling any of the R API from threaded code is ‘for experts only’: they will need to read the source code to determine if it is thread-safe. In particular, code which makes use of the stack-checking mechanism must not be called from threaded code.
Besides, calling back to R from C++ even in a single threaded context is not the best thing you can do for performance. It should be more efficient to use a optimization library that offers a direct C(++) interface. One possibility might be the development version of nlopt, c.f. this issue for a discussion and references to examples. In addition, std::random_shuffle is not only deprecated in C++14 and removed from C++17, but it is also not thread-safe.
In your second example, you say that the function SDK_heat is actually implemented in C++. In that case you can call it directly:
Remove importing the corresponding R function, i.e. the Rcpp::Function SDK_heat( "SDK_heat");
Make sure that the compiler knows the declaration of the C++ function and that the linker has the actual function:
Quick and dirty: Copy the function definition into your cpp file before the definition of BootWorker.
For a cleaner approach, see section "1.10 Sharing code" in the Rcpp attributes vignette
Call the function like any other C++ function, i.e. using positional arguments with types compatible to the function declaration.
All this assumes you are using sourceCpp as indicated by your usage of [[Rcpp::depends(...)]]. You are reaching a complexity that warrants to build a package from this.


Adding 3D vectors using SIMD intrinsics

I've got two streams of 3D vectors which I'd like to add using x86 AVX2 intrinsics. I'm using the GNU compiler 11.1.0. Hopefully, the code illustrates what I want to do:
// Example program
#include <utility> // std::size_t
#include <immintrin.h>
struct v3
float data[3] = {};
void add(const v3* a, const v3* b, v3* c, const std::size_t& n)
// c <- a + b
for (auto i = std::size_t{}; i < n; i += 2) // 2 vector3s at a time ~6 data
// masking
// [95:0] of a[i] move into [255:128], [95:0] of a[i+1] move into [255:128] of *another* 256-bit register
// ^same with b[i]
static const auto p1_mask = _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0);
static const auto p2_mask = _mm256_setr_epi32(0, 0, 0, -1, -1, -1, 0, 0);
const auto p1_leftop_packed = _mm256_maskload_ps(a[i].data, p1_mask);
const auto p2_lefttop_packed = _mm256_maskload_ps(a[i].data, p2_mask);
const auto p1_rightop_packed = _mm256_maskload_ps(b[i].data, p1_mask);
const auto p2_rightop_packed = _mm256_maskload_ps(b[i].data, p2_mask);
// addition is being done inefficiently with 2 AVX2 instructions!
const auto result1_packed = _mm256_add_ps(p1_leftop_packed, p1_rightop_packed);
const auto result2_packed = _mm256_add_ps(p2_leftop_packed, p2_rightop_packed);
// store them back
_mm256_maskstore_ps(c[i].data, p1_mask, result1_packed);
_mm256_maskstore_ps(c[i].data, p2_mask, result2_packed);
int main()
// data
const auto n = std::size_t{1000};
v3 a[n] = {};
v3 b[n] = {};
v3 c[n] = {};
// run
add(a, b, c, n);
return 0;
The above code works but the performance is quite terrible. To correct it, I think I need a version which looks approximately like the following:
// c <- a + b
for (auto i = std::size_t{}; i < n; i += 2) // 2 vector3s at a time ~6 data
// masking
// [95:0] of a[i] move into [255:128], [95:0] of a[i+1] in [127:0]
const auto leftop_packed = /*code required here*/;
const auto rightop_packed = /*code required here*/;
// addition is being done with only 1 AVX2 instruction
const auto result_packed = _mm256_add_ps(leftop_packed, rightop_packed);
// store them back
// [95:0] of result_packed move into c[i], [223:128] of result_packed into c[i+1]
/*code required here*/
How do I achieve this? I will gladly provide any additional information when needed. Any help would be much appreciated.
The two following comments say the same. They are good. Do as they say.
I think you can just load 8 floats at a time and then if you have anything left over at the end you can do a masked store (not sure about this part). – LHLaurini
Use char*, float*, or __m256* to work in 32-byte or 8-float chunks, ignoring vector boundaries since you're just doing pure vertical addition. float* should be good for cleanup of the last up-to-7 floats – Peter Cordes
The Eigen library supports vectorization. It also has a lot of the vector/matrix math algorithms already implemented, and quite efficiently too. If you can, I'd recommend looking into using it instead of rolling your own logic.

Drake: Integrate Mass Matrix and Bias Term in Optimization Problem

I am trying to implement Non Linear MPC for a 7-DOF manipulator in drake. To do this, in my constraints, I need to have dynamic parameters like the Mass matrix M(q) and the bias term C(q,q_dot)*q_dot, but those depend on the decision variables q, q_dot.
I tried the following
// finalize plant
// create builder, diagram, context, plant context
// formulate optimazation problem
drake::solvers::MathematicalProgram prog;
// create decision variables
std::vector<drake::solvers::VectorXDecisionVariable> q_v;
std::vector<drake::solvers::VectorXDecisionVariable> q_ddot;
for (int i = 0; i < H; i++) {
// add cost
// add constraints
for (int i = 0; i < H; i++) {
plant.SetPositionsAndVelocities(*plant_context, q_v[i]);
plant.CalcMassMatrix(*plant_context, M);
plant.CalcBiasTerm(*plant_context, C_q_dot);
for (int i = 0; i < H; i++) {
prog.AddConstraint( M * q_ddot[i] + C_q_dot + G >= lb );
prog.AddConstraint( M * q_ddot[i] + C_q_dot + G <= ub );
// solve prog
The above code will not work, because plant.SetPositionsAndVelocities(.) doesn't accept symbolic variables.
Is there any way to integrate M,C in my ocp constraints ?
I think you want to impose the following nonlinear nonconvex constraint
lb <= M * qddot + C(q, v) + g(q) <= ub
This constraint is non-convex. We will need to solve it through nonlinear optimization, and evaluate the constraint in every iteration of the nonlinear optimization. We can't do this evaluation using symbolic computation (it would be horribly slow with symbolic computation).
So you will need a constraint evaluator, something like this
// This constraint takes [q;v;vdot] and evaluate
// M * vdot + C(q, v) + g(q)
class MyConstraint : public solvers::Constraint {
MyConstraint(const MultibodyPlant<AutoDiffXd>& plant, systems::Context<AutoDiffXd>* context, const Eigen::Ref<const Eigen::VectorXd>& lb, const Eigen::Ref<const Eigen::VectorXd>& ub) : solvers::Constraint(plant.num_velocitiex(), plant.num_positions() + 2 * plant.num_velocities(), lb, ub), plant_{plant}, context_{context} {
void DoEval(const Eigen::Ref<const AutoDiffVecXd>& x, AutoDiffVecXd* y) const {
MultibodyPlant<AutoDiffXd> plant_;
systems::Context<AutoDiffXd>* context_;
int main() {
// Construct the constraint and add it to every time instances
std::vector<std::unique_ptr<systems::Context<AutoDiffXd>>> plant_contexts;
for (int i = 0; i < H; ++i) {
prog.AddConstraint(std::make_shared<MyConstraint>(plant, plant_context[i], lb, ub), {q_v[i], qddot[i]});
You could refer to the class CentroidalMomentumConstraint on how to construct your own MyConstraint class.

Alternating Error: "Invalid dimension for argument 0"

In converting the example below to a gfor loop. I encountered an error of the type "Invalid dimension for argument 0", the full error message below. However, the error occurs, then the function runs, then the same error. This pattern repeats. I am confused and am wondering if this error is in someway system dependent.
Full error message:
Error in random_shuffle(theta, 5, 1) :
ArrayFire Exception (Invalid input size:203):
In function af_err af_assign_seq(af_array *, const af_array, const unsigned int, const af_seq *, const af_array)
In file src/api/c/assign.cpp:168
Invalid dimension for argument 0
Expected: (outDims.ndims() >= inDims.ndims())
A second problem, the seed fails to change with the input parameter, when using the gfor loop.
#include "RcppArrayFire.h"
using namespace Rcpp;
using namespace RcppArrayFire;
// [[Rcpp::export]]
af::array random_shuffle(const RcppArrayFire::typed_array<f64> theta, int counts, int seed){
const int theta_size = theta.dims()[0];
af::array out(counts, theta_size, f64);
af::array seed_seq = af::seq(seed, seed+counts);
// for(int f = 0; f < counts; f++){
gfor ( af::seq f, counts-1 ){
af::randomEngine engine;
af::array index_shuffle(1, u16);
af::array temp_rand(1, f64);
af::array temp_end(1, f64);
af::array shuffled = theta;
// implementation of the Knuth-Fisher-Yates shuffle algo
for(int i = theta_size-1; i > 1; i --){
index_shuffle = af::round(af::randu(1, u16, engine)/(65536/(i+1)));
temp_rand = shuffled(index_shuffle);
temp_end = shuffled(i);
shuffled(index_shuffle) = temp_end;
shuffled(i) = temp_rand;
out(f, af::span) = shuffled;
return out;
/*** R
theta <- 10:20
random_shuffle(theta, 5, 1)
random_shuffle(theta, 5, 2)
Updated with Ralf Stunber's solution, but 'shuffled' samples in Column space.
// [[Rcpp::export]]
af::array random_shuffle2(const RcppArrayFire::typed_array<f64> theta, int counts, int seed) {
int len = theta.dims(0);
af::array tmp = af::randu(len, counts, 1);
af::array val, idx;
af::sort(val, idx, tmp, 1);
af::array shuffled = theta(idx);
return af::moddims(shuffled, len, counts);
/*** R
random_shuffle2(theta, 5, 1)
Here is a picture of output, sampling with replacement:
In the second part, of 50 repetitions, the samples moves towards anergodic outcome.
Why do you want to use multiple RNG engines in parallel? There is really no need for this. In general, it should be sufficient to use only the global RNG engine. It should also be sufficient to set the seed of this engine only once. You can do this from R with RcppArrayFire::arrayfire_set_seed. Besides, random number generation within a gfor loop does not work as one might expect, c.f. http://arrayfire.org/docs/page_gfor.htm.
Anyway, I am not an expert in writing efficient GPU algorithms, which is why I like using the methods implemented in libraries like ArrayFire. Unfortunately ArrayFire does not have a shuffle algorithm, but the corresponding issue has a nice implementation, which can be generalized to your case with multiple shuffles:
// [[Rcpp::depends(RcppArrayFire)]]
#include "RcppArrayFire.h"
// [[Rcpp::export]]
af::array random_shuffle(const RcppArrayFire::typed_array<f64> theta, int counts, int seed) {
int len = theta.dims(0);
af::array tmp = af::randu(counts, len, 1);
af::array val, idx;
af::sort(val, idx, tmp, 1);
af::array shuffled = theta(idx);
return af::moddims(shuffled, counts, len);
BTW, depending on the later usage it might make more sense to arrange the different samples in columns instead of rows, since both R and AF use column major layout.

Rcpp - Using the optim function

How to use the varargs functions of the R language, as is the case of the optim function?
Consider the code below where I want to maximize the log-likelihood function verossimilhanca:
#include <Rcpp.h>
#include <RInside.h>
using namespace Rcpp;
// [[Rcpp::export]]
double verossimilhanca(Function pdf, NumericVector par, NumericVector x){
NumericVector log_result = log(pdf(par,x));
double soma =0;
for(int i = 0; i < log_result.size(); i++){
soma += log_result[i];
return -1*soma;
// [[Rcpp::export]]
List bootC(NumericVector x, NumericVector init_val){
Rcpp::Environment stats("package:stats");
Rcpp::Function optim = stats["optim"];
R["my_objective_fn"] = Rcpp::InternalFunction(&verossimilhanca);
Rcpp::List opt_results = optim(Rcpp::_["par"] = init_val,
Rcpp::_["fn"] = Rcpp::InternalFunction(&verossimilhanca),
Rcpp::_["method"] = "BFGS", x);
return opt_results;
// x is a data vetor.
In summary, I have a log-likelihood function and I want to maximize this function and x is my data set. I know that RInside allows me to create instances of R in C++ but I want to solve this problem only by using the Rcpp.h library without resorting to RInside.h.
Replace x with Rcpp::_["x"] = x in the arguments of optim function.
It bothers me too until I find the answer of #coatless.

Using CPLEX python API for linear programming with continuous inputs

I am new to CPLEX Python API. I wish to solve a Linear Programming problem in python which I have already done in the CPLEX OPL IDE by taking a .mod and .dat files as inputs. I want to use it in python since I wish to vary my inputs continuously. My mod file for the problem is given below. Can someone help me on how to use this for the python API.
int n = ...;
int m = ...;
int c = ...;
int s = ...;
range v = 1..n;
range p = 1..m;
int c_req[v] = ...;
int s_req[v] = ...;
int trust[v][v] = ...;
// decision variables
dvar boolean assign[p][v];
// expressions
dexpr int used[pi in p] = max(vi in v) assign[pi][v]; // used[i] = 1 iff pi is used
dexpr int totalUsed = sum(pi in p) used[pi];
execute {
cplex.tilim = 60; // Time limit 60 seconds
// model
minimize totalUsed;
subject to {
forall(pi in p)
sum(vi in v) c_req[vi] * assign[pi][vi] <= c;
forall(pi in p)
sum(vi in v) s_req[vi] * assign[pi][vi] <= s;
forall(vi in v)
sum(pi in p) assign[pi][vi] == 1;
forall(pi in p, v1 in v, v2 in v) if (v1 < v2) if (trust[v1][v2] == 0)
assign[p][v1] + assign[p][v2] <= 1;
you could write
subprocess.check_call(["C:/CPLEXStudio127/opl/bin/x64_win64/oplrun", "diet.mod", "diet.dat"])
in order to call OPL from python. And you would generate diet.dat beforehand.
Full example at https://www.ibm.com/developerworks/community/forums/html/threadTopic?id=0b6cacbe-4dda-4da9-9282-f527c3464f47
Then you do not have to migrate your model from OPL to Python.
You may also translate your model to Python and then I recommend DOCPLEX : https://developer.ibm.com/docloud/documentation/optimization-modeling/modeling-for-python/