IPOPT does not obey constraints but does not record the violation when using CppAD (C++)

I am trying to evaluate the coefficients and time of two fifth-order polynomials (one each for x and y position) that minimize effort and time (the objective function) when connecting an initial position, velocity, and orientation to a desired final position and orientation with zero velocity (equality constraints).
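For reference, with x(t) = a_0 + a_1 t + ... + a_5 t^5 and y(t) defined analogously with coefficients b_i, the jerk is x'''(t) = 6 a_3 + 24 a_4 t + 60 a_5 t^2, so the effort term (accum_jerk in the code) is the closed-form integral (derivation mine; it matches the expression in the code):

\int_0^T (\dddot{x}^2 + \dddot{y}^2)\,dt = 36T(a_3^2 + b_3^2) + 144T^2(a_3 a_4 + b_3 b_4) + T^3\big(240(a_3 a_5 + b_3 b_5) + 192(a_4^2 + b_4^2)\big) + 720T^4(a_4 a_5 + b_4 b_5) + 720T^5(a_5^2 + b_5^2)

Here is the code: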
#include <cmath>    // cos, sin, fabs
#include <iostream> // std::cout, std::cerr
#include <vector>
#include <cppad/cppad.hpp>
#include <cppad/ipopt/solve.hpp>
using CppAD::AD;
typedef struct {
    double x, y, theta, linear_velocity;
} Waypoint;
typedef std::vector<Waypoint> WaypointList;
struct TrajectoryConfig {
    //! gain on accumulated jerk term in cost function
    double Kj;
    //! gain on time term in cost function
    double Kt;
    //! gain on terminal velocity term in cost function
    double Kv;
};
class Trajectory {
public:
    explicit Trajectory(TrajectoryConfig config);
    ~Trajectory();
    void updateConfigs(TrajectoryConfig config);
    void solve(WaypointList waypoints);
private:
    //! solution vector
    std::vector<double> solution_;
    //! gain on accumulated jerk term in cost function
    double Kj_;
    //! gain on time term in cost function
    double Kt_;
    //! gain on terminal velocity term in cost function
    double Kv_;
};
/*
  Trajectory(TrajectoryConfig)
  class constructor. Initializes class given configuration struct
*/
Trajectory::Trajectory(TrajectoryConfig config) {
    Kj_ = config.Kj;
    Kt_ = config.Kt;
    Kv_ = config.Kv;
}
Trajectory::~Trajectory() {
    std::cerr << "Trajectory Destructor!" << std::endl;
}
enum Indices { A0 = 0, A1, A2, A3, A4, A5, B0, B1, B2, B3, B4, B5, T };
class FGradEval {
public:
    size_t M_;
    // gains on cost
    double Kj_, Kt_;
    // constructor
    FGradEval(double Kj, double Kt) {
        M_ = 13; // no. of parameters per trajectory segment: 2 x 6 coefficients + 1 time
        Kj_ = Kj;
        Kt_ = Kt;
    }
    typedef CPPAD_TESTVECTOR(AD<double>) ADvector;
    void operator()(ADvector& fgrad, const ADvector& vars) {
        fgrad[0] = 0;
        AD<double> accum_jerk;
        AD<double> a0, a1, a2, a3, a4, a5;
        AD<double> b0, b1, b2, b3, b4, b5;
        AD<double> T, T2, T3, T4, T5;
        AD<double> x, y, vx, vy;
        size_t offset = 1;
        a0 = vars[Indices::A0];
        a1 = vars[Indices::A1];
        a2 = vars[Indices::A2];
        a3 = vars[Indices::A3];
        a4 = vars[Indices::A4];
        a5 = vars[Indices::A5];
        b0 = vars[Indices::B0];
        b1 = vars[Indices::B1];
        b2 = vars[Indices::B2];
        b3 = vars[Indices::B3];
        b4 = vars[Indices::B4];
        b5 = vars[Indices::B5];
        T = vars[Indices::T];
        T2 = T*T;
        T3 = T*T2;
        T4 = T*T3;
        T5 = T*T4;
        x = a0 + a1*T + a2*T2 + a3*T3 + a4*T4 + a5*T5;
        y = b0 + b1*T + b2*T2 + b3*T3 + b4*T4 + b5*T5;
        vx = a1 + 2*a2*T + 3*a3*T2 + 4*b4*T3 + 5*a5*T4;
        vy = b1 + 2*b2*T + 3*b3*T2 + 4*b4*T3 + 5*b5*T4;
        //! cost-terms
        //! accum_jerk is the analytic integral of int_0^T (jerk_x^2 + jerk_y^2) dt
        accum_jerk = 36 * T * (a3*a3 + b3*b3) + 144 * T2 * (a3*a4 + b3*b4)
            + T3 * (240*(a3*a5 + b3*b5) + 192*(a4*a4 + b4*b4))
            + 720 * T4 * (a4*a5 + b4*b5) + 720 * T5 * (a5*a5 + b5*b5);
        fgrad[0] += Kj_ * accum_jerk;
        fgrad[0] += Kt_ * T;
        //! initial equality constraints
        fgrad[offset] = vars[Indices::A0];
        fgrad[1 + offset] = vars[Indices::B0];
        fgrad[2 + offset] = vars[Indices::A1];
        fgrad[3 + offset] = vars[Indices::B1];
        offset += 4;
        //! terminal equality constraints
        fgrad[offset] = x;
        fgrad[offset + 1] = y;
        fgrad[offset + 2] = vx;
        fgrad[offset + 3] = vy;
    }
};
void Trajectory::solve(WaypointList waypoints) {
    if (waypoints.size() != 2) {
        std::cerr << "Trajectory::solve - Function requires 2 waypoints." << std::endl;
        return;
    }
    //! status flag for solution
    bool ok;
    //! typedef for ipopt/cppad
    typedef CPPAD_TESTVECTOR(double) Dvector;
    //! no. of variables for optimization problem
    size_t n_vars = 13;
    //! no. of constraints
    size_t n_cons = 4 * 2; // the start and final waypoint each contribute 4 constraints (x, y, theta, v) -> (x, y, vx, vy)
    //! create vector container for optimizer solution
    //! and initialize it to zero
    Dvector vars(n_vars);
    for (size_t i = 0; i < n_vars; i++) {
        vars[i] = 0;
    }
    //! set initial state (this will only determine the first two coefficients of the initial polynomials)
    double v = (fabs(waypoints[0].linear_velocity) < 1e-3)
        ? 1e-3 : waypoints[0].linear_velocity;
    vars[Indices::A0] = waypoints[0].x;
    vars[Indices::B0] = waypoints[0].y;
    vars[Indices::A1] = v * cos(waypoints[0].theta);
    vars[Indices::B1] = v * sin(waypoints[0].theta);
    vars[Indices::T] = 0;
    //! there are no explicit bounds on vars, so set to something large for the optimizer
    //! we could perhaps put bounds on the coeffs corresponding to acc, jerk, snap, ..
    Dvector vars_lb(n_vars);
    Dvector vars_ub(n_vars);
    for (size_t i = 0; i < n_vars; i++) {
        vars_lb[i] = -1e10;
        vars_ub[i] = 1e10;
    }
    //! time must be non-negative!
    vars_lb[Indices::T] = 0;
    //! set the bounds on the constraints
    Dvector cons_lb(n_cons);
    Dvector cons_ub(n_cons);
    //! offset term on index
    size_t offset = 0;
    //! initial equality constraint - we must start from where we are!
    cons_lb[0] = waypoints[0].x;
    cons_ub[0] = waypoints[0].x;
    cons_lb[1] = waypoints[0].y;
    cons_ub[1] = waypoints[0].y;
    cons_lb[2] = v * cos(waypoints[0].theta);
    cons_ub[2] = v * cos(waypoints[0].theta);
    cons_lb[3] = v * sin(waypoints[0].theta);
    cons_ub[3] = v * sin(waypoints[0].theta);
    offset += 4;
    //! terminal point
    cons_lb[offset] = waypoints[1].x;
    cons_ub[offset] = waypoints[1].x;
    cons_lb[offset + 1] = waypoints[1].y;
    cons_ub[offset + 1] = waypoints[1].y;
    cons_lb[offset + 2] = 1e-3 * cos(waypoints[1].theta);
    cons_ub[offset + 2] = 1e-3 * cos(waypoints[1].theta);
    cons_lb[offset + 3] = 1e-3 * sin(waypoints[1].theta);
    cons_ub[offset + 3] = 1e-3 * sin(waypoints[1].theta);
    //! create instance of objective function class
    FGradEval fg_eval(Kj_, Kt_);
    //! IPOPT INITIALIZATION
    std::string options;
    options += "Integer print_level 5\n";
    options += "Sparse true forward\n";
    options += "Sparse true reverse\n";
    options += "Integer max_iter 100\n";
    // options += "Numeric tol 1e-4\n";
    //! compute the solution
    CppAD::ipopt::solve_result<Dvector> solution;
    //! solve
    CppAD::ipopt::solve<Dvector, FGradEval>(
        options, vars, vars_lb, vars_ub, cons_lb, cons_ub, fg_eval, solution);
    //! check if the solver was successful
    ok = solution.status == CppAD::ipopt::solve_result<Dvector>::success;
    //! if the solver was unsuccessful, exit
    //! this case will be handled by calling method
    if (!ok) {
        std::cerr << "Trajectory::solve - Failed to find a solution!" << std::endl;
        return;
    }
    //! (DEBUG) output the final cost
    std::cout << "Final Cost: " << solution.obj_value << std::endl;
    //! populate output with argmin vector
    for (size_t i = 0; i < n_vars; i++) {
        solution_.push_back(solution.x[i]);
    }
    return;
}
Where I am having problems is the following: the initial equality constraints (starting position, velocity, and orientation) are being upheld, while the terminal velocity constraint is not. The algorithm terminates at the correct final (x, y, angle), but the velocity is not zero. I have looked through the code and I cannot understand why the position and orientation at the endpoint would be obeyed while the velocity would not. My suspicion is that my definition of the equality constraints is not what I think it is.
The problem also does not converge reliably, even though it seems a fairly simple problem as defined (see the output below):
******************************************************************************
This program contains Ipopt, a library for large-scale nonlinear optimization.
Ipopt is released as open source code under the Eclipse Public License (EPL).
For more information visit http://projects.coin-or.org/Ipopt
******************************************************************************
This is Ipopt version 3.11.9, running with linear solver mumps.
NOTE: Other linear solvers might be more efficient (see Ipopt documentation).
Number of nonzeros in equality constraint Jacobian...: 30
Number of nonzeros in inequality constraint Jacobian.: 0
Number of nonzeros in Lagrangian Hessian.............: 23
Total number of variables............................: 13
variables with only lower bounds: 0
variables with lower and upper bounds: 13
variables with only upper bounds: 0
Total number of equality constraints.................: 8
Total number of inequality constraints...............: 0
inequality constraints with only lower bounds: 0
inequality constraints with lower and upper bounds: 0
inequality constraints with only upper bounds: 0
iter objective inf_pr inf_du lg(mu) ||d|| lg(rg) alpha_du alpha_pr ls
0 9.9999900e-03 1.00e+00 5.00e-04 -1.0 0.00e+00 - 0.00e+00 0.00e+00 0
1 5.9117705e-02 1.00e+00 1.20e+02 -1.0 5.36e+07 - 1.04e-05 7.63e-06f 18
2 1.1927070e+00 1.00e+00 2.62e+06 -1.0 9.21e+05 -4.0 6.16e-15 2.29e-23H 1
3 2.9689692e-01 1.00e+00 1.80e+05 -1.0 2.24e+13 - 1.83e-07 8.42e-10f 20
4r 2.9689692e-01 1.00e+00 1.00e+03 -0.0 0.00e+00 - 0.00e+00 4.58e-07R 11
5r 2.1005820e+01 9.99e-01 5.04e+02 -0.0 6.60e-02 - 9.90e-01 4.95e-01f 2
6r 7.7118141e+04 9.08e-01 5.18e+03 -0.0 2.09e+00 - 4.21e-01 1.00e+00f 1
7r 1.7923891e+04 7.82e-01 1.54e+03 -0.0 3.63e+00 - 9.90e-01 1.00e+00f 1
8r 5.9690221e+03 5.41e-01 5.12e+02 -0.0 2.92e+00 - 9.90e-01 1.00e+00f 1
9r 4.6855625e+03 5.54e-01 1.95e+02 -0.0 5.14e-01 - 9.92e-01 1.00e+00f 1
iter objective inf_pr inf_du lg(mu) ||d|| lg(rg) alpha_du alpha_pr ls
10r 8.4901226e+03 5.55e-01 5.18e+01 -0.0 2.24e-01 - 1.00e+00 1.00e+00f 1
Number of Iterations....: 10
(scaled) (unscaled)
Objective...............: 8.4901225582208808e+03 8.4901225582208808e+03
Dual infeasibility......: 6.3613117039244315e+06 6.3613117039244315e+06
Constraint violation....: 5.5503677023620179e-01 5.5503677023620179e-01
Complementarity.........: 9.9999982900301554e-01 9.9999982900301554e-01
Overall NLP error.......: 6.3613117039244315e+06 6.3613117039244315e+06
Number of objective function evaluations = 43
Number of objective gradient evaluations = 6
Number of equality constraint evaluations = 71
Number of inequality constraint evaluations = 0
Number of equality constraint Jacobian evaluations = 12
Number of inequality constraint Jacobian evaluations = 0
Number of Lagrangian Hessian evaluations = 10
Total CPU secs in IPOPT (w/o function evaluations) = 0.006
Total CPU secs in NLP function evaluations = 0.001
EXIT: Maximum Number of Iterations Exceeded.
I am not looking for an answer to my problem specifically. What I am hoping for are some suggestions as to why my problem may not be working as expected. Specifically, do my constraints make sense, as defined? Is the variable initialization done properly?

The problem was in the following lines:
x = a0 + a1*T + a2*T2 + a3*T3 + a4*T4 + a5*T5;
y = b0 + b1*T + b2*T2 + b3*T3 + b4*T4 + b5*T5;
vx = a1 + 2*a2*T + 3*a3*T2 + 4*b4*T3 + 5*a5*T4;
vy = b1 + 2*b2*T + 3*b3*T2 + 4*b4*T3 + 5*b5*T4;
Specifically,
vx = a1 + 2*a2*T + 3*a3*T2 + 4*b4*T3 + 5*a5*T4;
should be
vx = a1 + 2*a2*T + 3*a3*T2 + 4*a4*T3 + 5*a5*T4;
based upon the mapping of a's to the x-coordinate and b's to the y-coordinate.
This fixed the problem of constraint violation.
Regarding the problem of convergence/feasibility, I found that ensuring the initial guess is in the feasible set (i.e., it obeys the equality constraints) fixed it; the measures of optimizer performance (inf_pr, inf_du, etc.) were much smaller after fixing the initial condition.
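For what it's worth, a minimal sketch of one way to build such a feasible guess (illustrative only, not the exact code I used): fix a nominal T > 0, leave the quartic and quintic coefficients at zero, and solve the remaining 2x2 linear system per axis so the terminal position/velocity constraints hold exactly. For the x axis, using the Indices enum from the code above:

double T = 1.0; // any positive nominal duration
// terminal targets for x (the y axis is analogous, with the B coefficients)
double xT  = waypoints[1].x;
double vxT = 1e-3 * cos(waypoints[1].theta);
// residuals after accounting for the already-fixed A0 and A1
double rx  = xT  - vars[Indices::A0] - vars[Indices::A1] * T;
double rvx = vxT - vars[Indices::A1];
// solve a2*T^2 + a3*T^3 = rx and 2*a2*T + 3*a3*T^2 = rvx, with a4 = a5 = 0
vars[Indices::A3] = (rvx * T - 2.0 * rx) / (T * T * T);
vars[Indices::A2] = (3.0 * rx - rvx * T) / (T * T);
vars[Indices::T]  = T;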

Related

Efficient floating point scaling in C++

I'm working on my fast (and accurate) sin implementation in C++, and I have a problem regarding the efficient angle scaling into the +- pi/2 range.
My sin function for +-pi/2, using a Taylor series, is the following
(Note: FLOAT is a macro expanded to float or double just for the benchmark)
/**
* Sin for 'small' angles, accurate on [-pi/2, pi/2], fairly accurate on [-pi, pi]
*/
// To switch between float and double
#define FLOAT float
FLOAT
my_sin_small(FLOAT x)
{
constexpr FLOAT C1 = 1. / (7. * 6. * 5. * 4. * 3. * 2.);
constexpr FLOAT C2 = -1. / (5. * 4. * 3. * 2.);
constexpr FLOAT C3 = 1. / (3. * 2.);
constexpr FLOAT C4 = -1.;
// Correction for sin(pi/2) = 1, due to the ignored taylor terms
constexpr FLOAT corr = -1. / 0.9998431013994987;
const FLOAT x2 = x * x;
return corr * x * (x2 * (x2 * (x2 * C1 + C2) + C3) + C4);
}
So far so good... The problem comes when I try to scale an arbitrary angle into the +-pi/2 range. My current solution is:
FLOAT
my_sin(FLOAT x)
{
constexpr FLOAT pi = 3.141592653589793238462;
constexpr FLOAT rpi = 1 / pi;
// convert to +-pi/2 range
int n = std::nearbyint(x * rpi);
FLOAT xbar = (n * pi - x) * (2 * (n & 1) - 1);
// (2 * (n % 2) - 1) is a sign correction (see below)
return my_sin_small(xbar);
};
I made a benchmark, and I'm losing a lot for the +-pi/2 scaling.
Tricking with int(angle/pi + 0.5) is a nope, since it is limited to int precision, requires +- branching, and I try to avoid branches...
What should I try to improve the performance for this scaling? I'm out of ideas.
[Image: benchmark results for float. In the benchmark the angle could be out of the validity range for my_sin_small, but for the bench I don't care about that.]
[Image: benchmark results for double.]
[Image: sign correction for xbar in my_sin().]
[Image: algorithm accuracy compared to Python's sin() function.]
Candidate improvements:
1. Convert the radians x to rotations by dividing by 2*pi.
2. Retain only the fraction, so we have an angle in (-1.0 ... 1.0). This simplifies the OP's modulo step to a simple "drop the whole number" step. Going forward, using different angle units simply involves a coefficient-set change; there is no need to scale back to radians.
3. For positive values, subtract 0.5 so we have (-0.5 ... 0.5) and then flip the sign. This centers the possible values about 0.0 and makes for better convergence of the approximating polynomial as compared to the mathematical sine function. For negative values, see below.
4. Call my_sin_small1(), which uses this (-0.5 ... 0.5) rotations range rather than [-pi ... +pi] radians.
5. In my_sin_small1(), fold the constants together to drop the corr * step.
6. Rather than using the truncated Taylor series, use a more optimal coefficient set. IMO, this will provide better answers, especially near +/-pi.
Notes: no int-to/from-float code. With more analysis, it is possible to get a better set of coefficients that brings my_sin(+/-pi) closer to 0.0. This is just a quick set of code to demo fewer FP steps and good potential results.
C-like code for the OP to port to C++:
FLOAT my_sin_small1(FLOAT x) {
    static const FLOAT A1 = -5.64744881E+01;
    static const FLOAT A2 = +7.81017968E+01;
    static const FLOAT A3 = -4.11145353E+01;
    static const FLOAT A4 = +6.27923581E+00;
    const FLOAT x2 = x * x;
    return x * (x2 * (x2 * (x2 * A1 + A2) + A3) + A4);
}

FLOAT my_sin1(FLOAT x) {
    static const FLOAT pi = 3.141592653589793238462;
    static const FLOAT pi2i = 1 / (pi * 2);
    x *= pi2i;
    FLOAT xfraction = 0.5f - (x - truncf(x));
    return my_sin_small1(xfraction);
}
For negative values, use -my_sin1(-x) or like code to flip the sign - or add 0.5 in the above minus 0.5 step.
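For instance (my_sin1_signed is a hypothetical name for this sketch):

FLOAT my_sin1_signed(FLOAT x) {
    // flip the sign for negative inputs, as described above
    return x < 0 ? -my_sin1(-x) : my_sin1(x);
}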
Test
#include <math.h>
#include <stdio.h>

int main(void) {
    for (int d = 0; d <= 360; d += 20) {
        FLOAT x = d / 180.0 * M_PI;
        FLOAT y = my_sin1(x);
        printf("%12.6f %11.8f %11.8f\n", x, sin(x), y);
    }
}
Output
0.000000 0.00000000 -0.00022483
0.349066 0.34202013 0.34221691
0.698132 0.64278759 0.64255589
1.047198 0.86602542 0.86590189
1.396263 0.98480775 0.98496443
1.745329 0.98480775 0.98501128
2.094395 0.86602537 0.86603642
2.443461 0.64278762 0.64260530
2.792527 0.34202022 0.34183803
3.141593 -0.00000009 0.00000000
3.490659 -0.34202016 -0.34183764
3.839724 -0.64278757 -0.64260519
4.188790 -0.86602546 -0.86603653
4.537856 -0.98480776 -0.98501128
4.886922 -0.98480776 -0.98496443
5.235988 -0.86602545 -0.86590189
5.585053 -0.64278773 -0.64255613
5.934119 -0.34202036 -0.34221727
6.283185 0.00000017 -0.00022483
Alternate code below makes for better results near 0.0, yet might cost a tad more time. OP seems more inclined to speed.
FLOAT xfraction = 0.5f - (x - truncf(x));
// vs.
FLOAT xfraction = x - truncf(x);
if (xfraction >= 0.5f) xfraction -= 1.0f;
[Edit]
Below is a better set with about 10% reduced error.
-56.0833765f
77.92947047f
-41.0936875f
6.278635918f
Yet another approach: spend more time (code) to reduce the range to ±pi/4 (±45 degrees); then it is possible to use only 3 or 2 terms of a polynomial that is like the usual Taylor series.
float sin_quick_small(float x) {
    const float x2 = x * x;
#if 0
    // max error about 7e-7
    static const FLOAT A2 = +0.00811656036940792f;
    static const FLOAT A3 = -0.166597759850666f;
    static const FLOAT A4 = +0.999994132743861f;
    return x * (x2 * (x2 * A2 + A3) + A4);
#else
    // max error about 0.00016
    static const FLOAT A3 = -0.160343346851626f;
    static const FLOAT A4 = +0.999031566686144f;
    return x * (x2 * A3 + A4);
#endif
}

float cos_quick_small(float x) {
    return cosf(x); // TBD code.
}

float sin_quick(float x) {
    if (x < 0.0) {
        return -sin_quick(-x);
    }
    int quo;
    float x90 = remquof(fabsf(x), 3.141592653589793238462f / 2, &quo);
    switch (quo % 4) {
        case 0:
            return sin_quick_small(x90);
        case 1:
            return cos_quick_small(x90);
        case 2:
            return sin_quick_small(-x90);
        case 3:
            return -cos_quick_small(x90);
    }
    return 0.0;
}
int main() {
    float max_x = 0.0;
    float max_error = 0.0;
    for (int d = -45; d <= 45; d += 1) {
        FLOAT x = d / 180.0 * M_PI;
        FLOAT y = sin_quick(x);
        double err = fabs(y - sin(x));
        if (err > max_error) {
            max_x = x;
            max_error = err;
        }
        printf("%12.6f %11.8f %11.8f err:%11.8f\n", x, sin(x), y, err);
    }
    printf("x:%.6f err:%.6f\n", max_x, max_error);
    return 0;
}

Fsolve equivalent in C++

I am trying to replicate MATLAB's fsolve, as my project is in C++ and solves an implicit RK4 scheme. I am using the NLopt library with the NLOPT_LD_MMA algorithm. I have run the required section in MATLAB, and it is considerably faster. I was wondering whether anyone has ideas for a better fsolve equivalent in C++. Another issue is that I would like f1 and f2 to both tend to zero, and it seems suboptimal to calculate the L2 norm to combine them, since NLopt seems to only allow a scalar return value from the objective function. Does anyone have ideas for an alternative library, or perhaps a different algorithm or constraint setup, to more closely replicate the default fsolve?
Would it be better (faster), perhaps, to call Python's scipy.optimize.fsolve from C++?
double solveKs(unsigned n, const double *x, double *grad, void *my_func_data); // forward declaration

double implicitRK4(double time, double V, double dt, double I, double O, double C, double R) {
    const int number_of_parameters = 2;
    double lb[number_of_parameters];
    double ub[number_of_parameters];
    lb[0] = -999; // k1 lb
    lb[1] = -999; // k2 lb
    ub[0] = 999;  // k1 ub
    ub[1] = 999;  // k2 ub
    double k[number_of_parameters];
    k[0] = 0.01;
    k[1] = 0.01;
    kOptData addData(time, V, dt, I, O, C, R);
    nlopt_opt opt; // NLOPT_LN_MMA NLOPT_LN_COBYLA
    opt = nlopt_create(NLOPT_LD_MMA, number_of_parameters);
    nlopt_set_lower_bounds(opt, lb);
    nlopt_set_upper_bounds(opt, ub);
    nlopt_remove_inequality_constraints(opt);
    // nlopt_remove_equality_constraints(opt);
    nlopt_set_min_objective(opt, solveKs, &addData);
    double minf;
    if (nlopt_optimize(opt, k, &minf) < 0) {
        printf("nlopt failed!\n");
    }
    else {
        printf("found minimum at f(%g,%g) = %0.10g\n", k[0], k[1], minf);
    }
    nlopt_destroy(opt);
    // note: fractions must be floating-point literals - (1/2) is integer division and evaluates to 0
    return V + 0.5*dt*k[0] + 0.5*dt*k[1];
}

double solveKs(unsigned n, const double *x, double *grad, void *my_func_data) {
    kOptData *unpackdata = (kOptData*) my_func_data;
    double t1, y1, t2, y2;
    double f1, f2;
    t1 = unpackdata->time + (0.5 - (1.0/6.0)*sqrt(3));
    y1 = unpackdata->V + 0.25*unpackdata->dt*x[0] + (0.25 - (1.0/6.0)*sqrt(3))*unpackdata->dt*x[1];
    t2 = unpackdata->time + (0.5 + (1.0/6.0)*sqrt(3));
    y2 = unpackdata->V + (0.25 + (1.0/6.0)*sqrt(3))*unpackdata->dt*x[0] + 0.25*unpackdata->dt*x[1];
    f1 = x[0] - stateDeriv_implicit(t1, y1, unpackdata->dt, unpackdata->I, unpackdata->O, unpackdata->C, unpackdata->R);
    f2 = x[1] - stateDeriv_implicit(t2, y2, unpackdata->dt, unpackdata->I, unpackdata->O, unpackdata->C, unpackdata->R);
    return sqrt(pow(f1,2) + pow(f2,2));
}
My MATLAB version below seems a lot simpler, but I would prefer the whole code in C++!
k1 = 0.01;
k2 = 0.01;
x0 = [k1,k2];
fun = @(x)solveKs(x,t,z,h,I,OCV1,Cap,Rct,static);
options = optimoptions('fsolve','Display','none');
k = fsolve(fun,x0,options);
% Calculate the next state vector from the previous one using the RungeKutta
% update equation
znext = z + (1/2)*h*k(1) + (1/2)*h*k(2);

function [F] = solveKs(x,t,z,h,I,O,C,R,static)
    t1 = t + ((1/2)-(1/6)*sqrt(3));
    y1 = z + (1/4)*h*x(1) + ((1/4)-(1/6)*sqrt(3))*h*x(2);
    t2 = t + ((1/2)+(1/6)*sqrt(3));
    y2 = z + ((1/4)+(1/6)*sqrt(3))*h*x(1) + (1/4)*h*x(2);
    F(1) = x(1) - stateDeriv_implicit(t1,y1,h,I,O,C,R,static);
    F(2) = x(2) - stateDeriv_implicit(t2,y2,h,I,O,C,R,static);
end

C++ slow loop computation

I have the following loop for a Monte Carlo computation I am performing. The variables below are pre-computed/populated and are defined as:
w_ = std::vector<std::vector<double>>(150000, std::vector<double>(800));
C_ = Eigen::MatrixXd(800,800);
Eigen::VectorXd a(800);
Eigen::VectorXd b(800);
The while loop is taking about 570 seconds to compute. Just going by the loops, I understand that I have nPaths*m = 150,000 * 800 = 120,000,000 sets of computations happening (not counting the cdf computations handled by the Boost libraries).
I am a below-average programmer and was wondering whether there are any obvious mistakes I am making that may be slowing the computation down, or whether there is another way to handle the computation that can speed things up.
int N(0);
int nPaths(150000);
int m(800);
double Intsum(0.); // running mean of f(m-1)
double Varsum(0.);
double err;
double delta;
double v1, v2, v3, v4;
Eigen::VectorXd d = Eigen::VectorXd::Zero(m);
Eigen::VectorXd e = Eigen::VectorXd::Zero(m);
Eigen::VectorXd f = Eigen::VectorXd::Zero(m);
Eigen::VectorXd y;
Eigen::VectorXd y0 = Eigen::VectorXd::Zero(m);
boost::math::normal G(0, 1.);
d(0) = boost::math::cdf(G, a(0) / C_(0, 0));
e(0) = boost::math::cdf(G, b(0) / C_(0, 0));
f(0) = e(0) - d(0);
while (N < (nPaths-1))
{
    y = y0;
    for (int i = 1; i < m; i++)
    {
        v1 = d(i - 1) + w_[N][(i - 1)]*(e(i - 1) - d(i - 1));
        y(i - 1) = boost::math::quantile(G, v1);
        v2 = (a(i) - C_.row(i).dot(y)) / C_(i, i);
        v3 = (b(i) - C_.row(i).dot(y)) / C_(i, i);
        d(i) = boost::math::cdf(G, v2);
        e(i) = boost::math::cdf(G, v3);
        f(i) = (e(i) - d(i))*f(i - 1);
    }
    N++;
    delta = (f(m-1) - Intsum) / N;
    Intsum += delta;
    Varsum = (N - 2)*Varsum / N + delta*delta;
    err = alpha_*std::sqrt(Varsum); // alpha_ is defined elsewhere
}
If I understand your code right, the running time is actually O(nPaths*m*m) = 10^11, due to the dot product C_.row(i).dot(y), which needs O(m) operations.
You could speed up the program by a factor of two by not calculating it twice:
double prod = C_.row(i).dot(y);
v2 = (a(i) - prod) / C_(i, i);
v3 = (b(i) - prod) / C_(i, i);
but maybe the compiler already does that for you.
The other thing is that y consists of zeros (at least at the beginning), so you don't have to compute the full dot product, only the part up to the current value of i. That should give another factor-of-two speed-up.
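In Eigen terms, the two changes together would look something like this (a sketch, assuming y(j) == 0 for all j >= i, which holds here because y is reset from the zero vector y0 at the start of each path):

double prod = C_.row(i).head(i).dot(y.head(i)); // only the first i entries can be non-zero
v2 = (a(i) - prod) / C_(i, i);
v3 = (b(i) - prod) / C_(i, i);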
So, taking into account the sheer number of operations, your timings are not so bad. There is some room for improvement in the code, but if you are interested in a speed-up of some orders of magnitude, you should probably think about changing your formulation.

Calculate intersection of two lines using integers only

I can quite easily calculate the point of intersection given two lines. If I start with two vertices:
(x1,y1)
(x2,y2)
I can calculate the slope by doing (y1-y2)/(x1-x2), and then calculating the intercept
y1 - slope * x1
Then do that again, so I have two sets of slope and intercept, then just do:
x = (intercept2 - intercept1) / (slope1 - slope2)
y = slope1 * x + intercept1
(disclaimer: this might not even work, but I've gotten something very close to it to work, and it illustrates my general technique)
BUT that only works with data types with decimals, or non integral. Say the vertices are:
(0,1)
(10,2)
Calculating the slope would result in (1-2)/(0-10) = -1/-10, which in integer arithmetic is not 1/10 but 0.
How can I get code that yields a valid result using only integers?
Edit: I can't use floats AT ALL! No casting, no nothing. Also, values are capped at 65535, and everything is unsigned.
In high school when subtracting fractions, our teachers taught us to find a common denominator
So 1/4 - 1/6 = 3/12 - 2/12 = 1/12
So do the same with your slopes.
// Keep each slope as a fraction: slope1 = n1/d1, slope2 = n2/d2 (numerator/denominator).
// All divisions below have 0 for remainder.
int g = gcd( d1, d2 );              // gcd( 4, 6 ) = 2
int d = d1 * d2 / g;                // common denominator (12 above)
int n = (d/d1) * n1 - (d/d2) * n2;  // resulting numerator (1 in 1/12 above)
// n1/d1 - n2/d2 == n/d
I hope I got that right.
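To make that concrete, a minimal C++ sketch (std::gcd is from <numeric>, C++17; the Fraction struct and function name are mine, not part of the answer above):

#include <numeric> // std::gcd

struct Fraction { int num, den; };

// n1/d1 - n2/d2 as an exact fraction; no floats involved
Fraction subtract(Fraction a, Fraction b) {
    int g = std::gcd(a.den, b.den);    // gcd(4, 6) = 2
    int d = a.den / g * b.den;         // common denominator (12 above)
    int n = (d / a.den) * a.num - (d / b.den) * b.num; // 3 - 2 = 1
    return Fraction{n, d};             // 1/4 - 1/6 == 1/12
}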
Hm...
(0,1), (10,2) and (y1-y2)/(x1-x2): well, this is the description of one line, not of the intersection of two lines.
As far as I remember, lines are described in the form x * v, with x a scalar and v a vector. Then it's
x * (0,1) = v2 and
x * (10, 2) = v2,
and therefore the lines intersect only if exactly one solution to both equations exists; they overlap when there are infinitely many solutions, and they don't intersect when they are parallel.
http://www.gamedev.net/topic/647810-intersection-point-of-two-vectors/
explains the calculation based on the dot product.
Input: line L passing thru (x1, y1) and (x2, y2), and line M passing thru (X1, Y1) and (X2, Y2)
Output: (x, y) of the intersecting point of two lines L and M
Tell Wolfram Alpha to solve y = (y1-y2)/(x1-x2)*(x-x1)+y1 and y = (Y1-Y2)/(X1-X2)*(x-X1)+Y1 for x, y to get this solution:
http://www.wolframalpha.com/share/clip?f=d41d8cd98f00b204e9800998ecf8427e3at5u9evl8
But I have no idea on how to write a program to implement the above solution for your calculator with only uint16_t ALU.
Thanks to Graham Toal's answer, below is a primitive Rust implementation of the linked C code in their answer, modified to return the point of intersection for the complete line, as opposed to the line segment. It doesn't use much Rust-specific magic so should be reasonably easy to port to other languages.
The function returns a Point where the Lines intersect, if at all, and a flag denoting whether the intersection point lies on both intersected lines (true) or not (false).
/// 2D integer point
#[derive(Clone, Copy)]
struct Point {
    /// The x coordinate.
    pub x: i32,
    /// The y coordinate.
    pub y: i32,
}

/// Line primitive
#[derive(Clone, Copy)]
struct Line {
    /// Start point
    pub start: Point,
    /// End point
    pub end: Point,
}

/// Check signs of two signed numbers
///
/// Fastest ASM output compared to other methods. See: https://godbolt.org/z/zVx9cD
fn same_signs(a: i32, b: i32) -> bool {
    a ^ b >= 0
}

/// Integer-only line segment intersection
///
/// If the point lies on both line segments, the second tuple argument will return `true`.
///
/// Inspired from https://stackoverflow.com/a/61485959/383609, which links to
/// https://webdocs.cs.ualberta.ca/~graphics/books/GraphicsGems/gemsii/xlines.c
fn intersection(l1: &Line, l2: &Line) -> Option<(Point, bool)> {
    let Point { x: x1, y: y1 } = l1.start;
    let Point { x: x2, y: y2 } = l1.end;
    let Point { x: x3, y: y3 } = l2.start;
    let Point { x: x4, y: y4 } = l2.end;

    // First line coefficients where "a1 x + b1 y + c1 = 0"
    let a1 = y2 - y1;
    let b1 = x1 - x2;
    let c1 = x2 * y1 - x1 * y2;

    // Second line coefficients
    let a2 = y4 - y3;
    let b2 = x3 - x4;
    let c2 = x4 * y3 - x3 * y4;

    let denom = a1 * b2 - a2 * b1;

    // Lines are parallel (or colinear)
    if denom == 0 {
        return None;
    }

    // Compute sign values
    let r3 = a1 * x3 + b1 * y3 + c1;
    let r4 = a1 * x4 + b1 * y4 + c1;

    // Sign values for second line
    let r1 = a2 * x1 + b2 * y1 + c2;
    let r2 = a2 * x2 + b2 * y2 + c2;

    // Flag denoting whether intersection point is on passed line segments. If this is false,
    // the intersection occurs somewhere along the two mathematical, infinite lines instead.
    //
    // Check signs of r3 and r4. If both point 3 and point 4 lie on the same side of line 1,
    // the line segments do not intersect.
    //
    // Check signs of r1 and r2. If both point 1 and point 2 lie on the same side of the second
    // line segment, the line segments do not intersect.
    let is_on_segments = !(r3 != 0 && r4 != 0 && same_signs(r3, r4))
        && !(r1 != 0 && r2 != 0 && same_signs(r1, r2));

    // Compute the intersection point using a method similar to that described here:
    // http://paulbourke.net/geometry/pointlineplane/#i2l

    // The denom/2 is to get rounding instead of truncating. It is added or subtracted to the
    // numerator, depending upon the sign of the numerator.
    let offset = if denom < 0 { -denom / 2 } else { denom / 2 };

    let num = b1 * c2 - b2 * c1;
    let x = if num < 0 { num - offset } else { num + offset } / denom;

    let num = a2 * c1 - a1 * c2;
    let y = if num < 0 { num - offset } else { num + offset } / denom;

    Some((Point { x, y }, is_on_segments))
}

Generating a C++ function from a list of arguments

I am writing a function f to be used in a Runge Kutta integrator.
output RungeKutta(function f, initial conditions IC, etc.)
Since the function will be called many times, I am looking for a way to generate the function f at compile time.
In this case, f depends on a fixed parameter vector p, where p is sparse and is fixed before the code is compiled. To be concrete,
double function f(vector<double> x) {
return x dot p;
}
Since p is sparse, taking the dot product in f is not the most efficient. Hard-coding x dot p seems to be the way to go, but p can be very long (1000).
What are my options?
Is writing another program (taking p as input) to generate a .cpp file my only option?
Thanks for the comments. Here is a more concrete example for the differential equation.
dy/dx = f_p(x)
One example for f_p(x):
p = [0, 1, 0]; x = [x1, x2, x3]
double f_p(vector<double> x) {
    return x[1]; // x2, i.e. the hard-coded dot product for p = [0, 1, 0]
}
instead of:
double f(vector<double> p, vector<double> x) {
    double r = 0;
    for (size_t i = 0; i < p.size(); i++) {
        r += p[i]*x[i];
    }
    return r;
}
The key problem you are trying to solve is that a "leaf" function in your calculation that will be called many times will also, given the problem domain, most often do no work. The hope is that the redundant work - namely multiplying a value with an element of an array known at compile time to be zero - can be collapsed as part of a compile-time step.
C++ has language facilities to deal with this, namely template metaprogramming. C++ templates are very powerful (i.e., Turing-complete) and allow for things like recursive calculations based on compile-time constants.
Below is an example of how to implement your example using templates and template specialization (you can also find a runnable example I've created here: http://ideone.com/BDtBt7). The basic idea behind the code is to generate a type with a static function that returns the dot product of an input vector of values and a compile-time constant array. The static function recursively calls instances of itself, passing a lower index value as it moves through the input/constant arrays of elements. It is also templated on whether the value currently being evaluated in the compile-time constant array p is zero. If it is, we can skip calculating that term and move on to the next value in the recursion. Lastly, there is a base case that stops the recursion once we have reached the first element in the array.
#include <array>
#include <iostream>
#include <vector>

constexpr std::array<double, 5> p = { 1.0, 0.0, 3.0, 5.0, 0.0 };

template<size_t index, bool isZero>
struct DotProductCalculator
{
    static double Calculate(const std::vector<double>& xArg)
    {
        return (xArg[index] * p[index])
            + DotProductCalculator<index - 1, p[index - 1] == 0.0>::Calculate(xArg);
    }
};

template<>
struct DotProductCalculator<0, true>
{
    static double Calculate(const std::vector<double>& xArg)
    {
        return 0.0;
    }
};

template<>
struct DotProductCalculator<0, false>
{
    static double Calculate(const std::vector<double>& xArg)
    {
        return xArg[0] * p[0];
    }
};

template<size_t index>
struct DotProductCalculator<index, true>
{
    static double Calculate(const std::vector<double>& xArg)
    {
        return 0.0 + DotProductCalculator<index - 1, p[index - 1] == 0.0>::Calculate(xArg);
    }
};

template<typename ArrayType>
double f_p_driver(const std::vector<double>& xArg, const ArrayType& pAsArgument)
{
    return DotProductCalculator<std::tuple_size<ArrayType>::value - 1,
                                p[std::tuple_size<ArrayType>::value - 1] == 0.0>::Calculate(xArg);
}

int main()
{
    std::vector<double> x = { 1.0, 2.0, 3.0, 4.0, 5.0 };
    double result = f_p_driver(x, p);
    std::cout << "Result: " << result;
    return 0;
}
You say in the comments that P really is a row or column of a matrix, and that the matrix is sparse. I'm not familiar with the specific physical problem you are solving, but often, sparse matrices have a fixed diagonal "banding" structure of some kind, e.g.:
| a1 b1 0 0 0 0 0 d1 |
| c1 a2 b2 0 0 0 0 0 |
| 0 c2 a3 b3 0 0 0 0 |
| 0 0 c3 a4 b4 0 0 0 |
| 0 0 0 c4 a5 b5 0 0 |
| 0 0 0 0 c5 a6 b6 0 |
| 0 0 0 0 0 c6 a7 b7 |
| e1 0 0 0 0 0 c7 a8 |
The most efficient way to store such matrices tends to be to store the diagonals as arrays/vectors, so:
A = [a1, a2, a3, a4, a5, a6, a7, a8]
B = [b1, b2, b3, b4, b5, b6, b7]
C = [c1, c2, c3, c4, c5, c6, c7]
D = [d1]
E = [e1]
Multiplying a row-vector X = [x1, x2, x3, x4, x5, x6, x7, x8] by the above matrix thus becomes:
Y = X . M
Y[0] = X[0] * A[0] + X[1] * C[0] + X[7] * E[0]
Y[1] = X[0] * B[0] + X[1] * A[1] + X[2] * C[1]
etc.
or more generally:
Y[i] = X[i-7] * D[i] + X[i-1] * B[i] + X[i] * A[i] + X[i+1] * C[i] + X[i+7] * E[i]
Out-of-range array accesses (< 0 or >= 8) should be treated as evaluating to 0. To avoid having to test for out-of-bounds everywhere, you can actually store each diagonal and the vector itself in oversize arrays whose leading and trailing elements are filled with zeroes.
Note that this will also be highly cache efficient, as all array accesses are linear.
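As a sketch of what the multiply could look like in C++ (my code, not part of the answer; it uses explicit bounds checks rather than the zero-padding trick, and handles only the single corner elements d1 and e1 shown above):

#include <vector>

// Y = X . M for the banded matrix stored as diagonals A (main), B (super), C (sub),
// plus the corner elements d1 (top-right) and e1 (bottom-left).
std::vector<double> bandedMul(const std::vector<double>& X,
                              const std::vector<double>& A,  // size n
                              const std::vector<double>& B,  // size n-1
                              const std::vector<double>& C,  // size n-1
                              double d1, double e1)
{
    const std::size_t n = A.size();
    std::vector<double> Y(n, 0.0);
    for (std::size_t j = 0; j < n; ++j) {
        double y = X[j] * A[j];                   // main diagonal
        if (j > 0)     y += X[j - 1] * B[j - 1];  // superdiagonal term
        if (j + 1 < n) y += X[j + 1] * C[j];      // subdiagonal term
        Y[j] = y;
    }
    Y[n - 1] += X[0] * d1;     // top-right corner
    Y[0]     += X[n - 1] * e1; // bottom-left corner
    return Y;
}

This reproduces, e.g., Y[0] = X[0] * A[0] + X[1] * C[0] + X[7] * E[0] from the example above.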
With the given constraints I would create a custom function object which stores the matrix p and computes the operation in its function call operator. I would implement two versions of the function: one which preprocesses the matrix upon construction to "know" where the non-zero elements are and one which just does the operations as stated, accepting that many of the computations just result in 0. The quoted amount of 10% non-zero elements sounds likely to be too dense for the complication from taking advantage of the sparsity to pay off.
Ignoring that p is a matrix and using it as a vector, the version without preprocessing would be something like this:
#include <numeric> // std::inner_product

class dotProduct {
    std::vector<double> p;
public:
    dotProduct(std::vector<double> const& p): p(p) {}
    double operator()(std::vector<double> const& x) const {
        return std::inner_product(p.begin(), p.end(), x.begin(), 0.0);
    }
};
// ...
... RungeKutta(dotProduct(p), initial conditions IC, etc.);
When using C++11, a lambda function could be used instead:
... RungeKutta([=](std::vector<double> const& x) {
        return std::inner_product(p.begin(), p.end(), x.begin(), 0.0);
    }, initial conditions IC, etc.);
For the preprocessing version you'd store a std::vector<std::pair<double, std::size_t>> indicating which indices actually need to be multiplied:

class sparseDotProduct {
    typedef std::vector<std::pair<double, std::size_t>> Vector;
    Vector p;
public:
    sparseDotProduct(std::vector<double> const& op) {
        for (std::size_t i(0), s(op.size()); i != s; ++i) {
            if (op[i]) {
                p.push_back(std::make_pair(op[i], i));
            }
        }
    }
    double operator()(std::vector<double> const& x) const {
        double result(0);
        for (Vector::const_iterator it(p.begin()), end(p.end()); it != end; ++it) {
            result += it->first * x[it->second];
        }
        return result;
    }
};
The use of this function object is just the same although it may be reasonable to keep this object around if p doesn't change.
I would personally expect the non-sparse version to actually outperform the sparse version if there are 10% non-zero values. However, with these two versions around, it should be relatively simple to measure the performance of the different approaches. I wouldn't expect custom-generated code to be substantially better, although it could improve on the computation. If so, it may work to use metaprogramming techniques to create the code, but I doubt that this would be too practical.
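For instance, a minimal measurement harness along these lines (my sketch; the iteration count is a placeholder):

#include <chrono>
#include <iostream>
#include <vector>

template <typename Functor>
void measure(char const* label, Functor f, std::vector<double> const& x) {
    double sink = 0.0; // accumulate results so the calls aren't optimized away
    auto start = std::chrono::steady_clock::now();
    for (int i = 0; i != 10000; ++i) {
        sink += f(x);
    }
    auto stop = std::chrono::steady_clock::now();
    std::cout << label << ": "
              << std::chrono::duration<double>(stop - start).count()
              << " s (checksum " << sink << ")\n";
}

// e.g.: measure("dense", dotProduct(p), x); measure("sparse", sparseDotProduct(p), x);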