Row reduction of augmented matrix - 3D spline calculations - C++

I'm trying to implement Interpolation by relaxed cubic splines, which can be found in the 5th chapter of this article (page 9):
https://www.math.ucla.edu/~baker/149.1.02w/handouts/dd_splines.pdf
So far, I have the following:
auto GetControlPoints = [](const std::vector<Vector3d>& S) {
    int n = S.size();
    float var = n - 1.0f;
    MatrixXd M(n - 1, n - 1);
    VectorXd C[3] = {
        VectorXd(n - 1),
        VectorXd(n - 1),
        VectorXd(n - 1)
    };
    for (int i = 0; i < n - 1; ++i) {
        auto r = RowVectorXd(n - 1);
        for (int j = 0; j < n - 1; ++j) {
            if (j == i)
                r[j] = var;
            else if (j == i - 1 || j == i + 1)
                r[j] = 1.f;
            else
                r[j] = 0.f;
        }
        M.row(i) = r;
        if (i == 0) {
            for (int j = 0; j < 3; ++j) {
                C[j] << (n + 1) * S[1][j] - S[0][j];
            }
        }
        else if (i == n - 1) {
            for (int j = 0; j < 3; ++j) {
                C[j] << (n + 1) * S[n - 1][j] - S[n][j];
            }
        }
        else {
            for (int j = 0; j < 3; ++j) {
                C[j] << (n + 1) * S[i][j];
            }
        }
    }
    MatrixXd augMC[3] = {
        MatrixXd(n - 1, n),
        MatrixXd(n - 1, n),
        MatrixXd(n - 1, n)
    };
    for (int i = 0; i < 3; ++i) {
        augMC[i].block(0, 0, n - 1, n - 1) = M;
        augMC[i].block(n - 1, n - 1, n - 1, 1) = C[i].transpose();
    }
};
I got to the point where I made an augmented matrix from M and C, but I have no idea how to row-reduce it. Any thoughts?

You could use the in-place variant of PartialPivLU -- but it looks like you actually want to solve the system M*B = C, for which you should just decompose M (as it is symmetric you can use an LLT or LDLT decomposition) and then use the solve method of that decomposition.
To set up M you should also use the diagonal() method (untested):
MatrixXd M(n - 1, n - 1);
M.setZero();
M.diagonal().setConstant(n - 1.0);  // main diagonal
M.diagonal<1>().setOnes();          // super-diagonal
M.diagonal<-1>().setOnes();         // sub-diagonal
LLT<MatrixXd> lltOfM(M);            // Cholesky decomposition of M
for (int i = 0; i < 3; ++i) { B[i] = lltOfM.solve(C[i]); }
For large n this is sub-optimal, since it does not exploit the tridiagonal structure of M. You could try out the sparse module for this, but there should actually be a direct algorithm (Eigen does not explicitly have a tridiagonal matrix type, though).
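In case it is useful, the direct algorithm alluded to above is usually called the Thomas algorithm: O(n) forward elimination plus back substitution. A minimal sketch (untested, plain std::vector rather than Eigen) for the symmetric tridiagonal system built above, with constant main diagonal `diag` and off-diagonals 1:
#include <vector>

// Thomas algorithm sketch: solves M*x = c where M is tridiagonal with
// constant main diagonal `diag` and 1 on both off-diagonals, as above.
// Stable when diag > 2 (diagonal dominance). Runs in O(n), not O(n^3).
std::vector<double> SolveTridiagonal(double diag, std::vector<double> c)
{
    const int m = static_cast<int>(c.size());
    std::vector<double> w(m);            // modified super-diagonal coefficients
    // Forward sweep.
    w[0] = 1.0 / diag;
    c[0] /= diag;
    for (int i = 1; i < m; ++i) {
        const double denom = diag - w[i - 1];   // pivot after elimination
        w[i] = 1.0 / denom;
        c[i] = (c[i] - c[i - 1]) / denom;
    }
    // Back substitution.
    for (int i = m - 2; i >= 0; --i)
        c[i] -= w[i] * c[i + 1];
    return c;                            // solution vector
}
Calling SolveTridiagonal(n - 1.0, C[i]) for each of the three coordinates replaces the LLT solve for this particular matrix.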
For C you probably could also use a MatrixX3d (I don't really understand how you fill your C vectors -- I think your current code should assert at run-time, unless n==2, or you disabled assertions).
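A rough sketch of that MatrixX3d idea (the fill shown is illustrative only, since the intended right-hand sides are unclear in the original; lltOfM is reused from above):
// Sketch: pack all three right-hand sides into one (n-1) x 3 matrix and
// solve them in a single call. The fill below covers the interior
// equations only; the first and last rows also need the endpoint terms.
MatrixX3d C(n - 1, 3);
for (int i = 0; i < n - 1; ++i)
    C.row(i) = double(n + 1) * S[i + 1].transpose();
MatrixX3d B = lltOfM.solve(C);   // each column is solved against M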


FLUTTER - Problem with List constructor and null safety mode

I would like to use this "Levenshtein" function to assess similarities between two strings (to check whether the user has made a spelling mistake).
Since I work in null-safe mode, it points out an error with the List constructor:
List<List<int>> d = List.generate(sa + 1, (int i) => List(sb + 1));
What can I write to replace List(sb + 1)?
int levenshtein(String a, String b) {
  a = a.toUpperCase();
  b = b.toUpperCase();
  int sa = a.length;
  int sb = b.length;
  int i, j, cost, min1, min2, min3;
  int levenshtein;
  // ignore: deprecated_member_use
  List<List<int>> d = List.generate(sa + 1, (int i) => List(sb + 1));
  if (a.length == 0) {
    levenshtein = b.length;
    return (levenshtein);
  }
  if (b.length == 0) {
    levenshtein = a.length;
    return (levenshtein);
  }
  for (i = 0; i <= sa; i++) d[i][0] = i;
  for (j = 0; j <= sb; j++) d[0][j] = j;
  for (i = 1; i <= a.length; i++)
    for (j = 1; j <= b.length; j++) {
      if (a[i - 1] == b[j - 1])
        cost = 0;
      else
        cost = 1;
      min1 = (d[i - 1][j] + 1);
      min2 = (d[i][j - 1] + 1);
      min3 = (d[i - 1][j - 1] + cost);
      d[i][j] = min(min1, min(min2, min3));
    }
  levenshtein = d[a.length][b.length];
  return (levenshtein);
}
You can use List.generate for the inner list as well.
List<List<int>> d = List.generate(sa + 1, (int i) => List.generate(sb + 1, (int j) => 0));
Also, since they're all going to be initialized to 0, you can use List.filled for the inner lists. Note that you must keep List.generate for the outer list: List.filled would fill every outer slot with the same inner list object, so writing to one row would change all rows.
List<List<int>> d = List.generate(sa + 1, (_) => List.filled(sb + 1, 0));

Reorganizing nested loops for multithreading

I'm trying to rewrite the main loop in a physics simulation and split the workload between more threads.
It calls dostuff on every unique pair of indices and looks like this:
for (int i = 0; i < n - 1; ++i)
{
    for (int j = i + 1; j < n; ++j)
    {
        dostuff(i, j);
    }
}
I came up with two options:
//#1
//sqrt is implemented as binary search on ints, floors the result
for (int x = 0; x < n * (n - 1) / 2; ++x)
{
    int i = (1 + sqrt(1 + 8 * x)) / 2;
    int j = x - i * (i - 1) / 2;
    dostuff(i, j);
}

//#2
for (int x = 0; x < n * n; ++x)
{
    int i = x % n;
    int j = x / n;
    if (i < j)
        dostuff(i, j);
}
And for each option, there is a corresponding thread loop using a shared atomic counter:
//#1
int x;
while ((x = counter.fetch_add(1)) < n * (n - 1) / 2)
{
    int i = (1 + sqrt(1 + 8 * x)) / 2;
    int j = x - i * (i - 1) / 2;
    dostuff(i, j);
}

//#2
int x;
while ((x = counter.fetch_add(1)) < n * n)
{
    int i = x % n;
    int j = x / n;
    if (i < j)
        dostuff(i, j);
}
My question is, what is the best way to share the workload of the main loop between threads for n < 10^6?
EDIT:
//dostuff
Element& a = elements[i];
Element& b = elements[j];
glm::dvec3 r = b.getPosition() - a.getPosition();
double rv = glm::length(r);
double base = G / (rv * rv);
glm::dvec3 dir = glm::normalize(r);
glm::dvec3 bd = dir * base;
accelerations[i] += bd * b.getMass();
accelerations[j] -= bd * a.getMass();
Your work is a triangle. You want to divide the triangle into k distinct pieces.
If k is a power of 2 you can do this:
a
a a
b c d
b c d d
Each of those regions is roughly equal in size.
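For a concrete starting point, here is a minimal sketch (my own, not the asker's code) of the simpler static split: give each thread a contiguous range of the flattened index x from option #1, so no atomic counter is needed. Note that dostuff must still be safe to call concurrently -- with the dostuff from the EDIT, two threads can touch the same accelerations entry, so those updates need atomics, locks, or per-thread accumulators.
#include <cmath>
#include <thread>
#include <vector>

// Sketch: split the n*(n-1)/2 flattened pair indices into one contiguous
// chunk per thread and recover (i, j) with the closed-form mapping from #1.
// Each unordered pair is visited exactly once (here with i > j).
template <class F>
void ForEachPair(long long n, int nthreads, F dostuff)
{
    const long long total = n * (n - 1) / 2;
    std::vector<std::thread> pool;
    for (int t = 0; t < nthreads; ++t) {
        const long long begin = total * t / nthreads;
        const long long end   = total * (t + 1) / nthreads;
        pool.emplace_back([=] {
            for (long long x = begin; x < end; ++x) {
                // Floating-point sqrt here; the asker's integer binary-search
                // sqrt avoids rounding issues for very large x.
                const long long i = (long long)((1 + std::sqrt(1.0 + 8.0 * x)) / 2);
                const long long j = x - i * (i - 1) / 2;
                dostuff((int)i, (int)j);
            }
        });
    }
    for (auto& th : pool) th.join();
}
Called as, e.g., ForEachPair(n, std::thread::hardware_concurrency(), dostuff). Contiguous chunks avoid the contention of a shared counter, at the cost of static (rather than dynamic) load balancing.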

Optimize outer loop with OpenMP and a reduction

I struggle a bit with a function. The calculation is wrong if I try to parallelize the outer loop with a
#pragma omp parallel reduction(+:det).
Can someone show me how to solve it and why it is failing?
// template<class T> using vector2D = std::vector<std::vector<T>>;
float Det(vector2DF &a, int n)
{
    vector2DF m(n - 1, vector1DF(n - 1, 0));
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    for (int i = 0; i < n; i++)
    {
        int l = 0;
        #pragma omp parallel for private(l)
        for (int j = 1; j < n; j++)
        {
            l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}
If you parallelize the outer loop, there is a race condition on this line:
m[j - 1][l] = a[j][k];
Also you likely want a parallel for reduction instead of just a parallel reduction.
The issue is that m is shared, even though that wouldn't be necessary given that it is completely overwritten in the inner loop. Always declare variables as locally as possible; this avoids issues with wrongly shared variables, e.g.:
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    #pragma omp parallel for reduction(+:det)
    for (int i = 0; i < n; i++)
    {
        vector2DF m(n - 1, vector1DF(n - 1, 0));
        for (int j = 1; j < n; j++)
        {
            int l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}
Now that is correct, but since m can be expensive to allocate, performance could benefit from not doing it in each and every iteration. This can be done by splitting the parallel and for directives like this:
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    #pragma omp parallel reduction(+:det)
    {
        vector2DF m(n - 1, vector1DF(n - 1, 0));
        #pragma omp for
        for (int i = 0; i < n; i++)
        {
            for (int j = 1; j < n; j++)
            {
                int l = 0;
                for (int k = 0; k < n; k++)
                {
                    if (k == i) continue;
                    m[j - 1][l] = a[j][k];
                    l++;
                }
            }
            det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
        }
    }
    return det;
}
Now you could also just declare m as firstprivate, but that would assume that the copy constructor makes a completely independent deep-copy and thus make the code more difficult to reason about.
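For reference, a minimal sketch of that firstprivate variant (untested; assumes <cmath>, <vector>, and the same vector2DF/vector1DF aliases as above):
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    vector2DF m(n - 1, vector1DF(n - 1, 0));
    // firstprivate gives each thread its own deep copy of m, because
    // std::vector's copy constructor copies all elements.
    #pragma omp parallel for reduction(+:det) firstprivate(m)
    for (int i = 0; i < n; i++)
    {
        for (int j = 1; j < n; j++)
        {
            int l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}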
Please be aware that you should always include expected output, actual output and a minimal complete and verifiable example.

Optimization of C++ code - std::vector operations

I have this function (RotateSlownessTop), and it's called about 800 times to compute the corresponding values. But the calculation is slow; is there a way I can make the computations faster?
The number of elements in X/Y is 7202 (a fairly large set).
I did the performance analysis and the screenshot has been attached.
void RotateSlownessTop(vector<double> &XR1, vector<double> &YR1, float theta = 0.0)
{
    Matrix2d a;
    a(0, 0) = cos(theta);
    a(0, 1) = -sin(theta);
    a(1, 0) = sin(theta);
    a(1, 1) = cos(theta);
    vector<double> XR2(7202), YR2(7202);
    for (size_t i = 0; i < X.size(); ++i)
    {
        XR2[i] = (a(0, 0) * X[i] + a(0, 1) * Y[i]);
        YR2[i] = (a(1, 0) * X[i] + a(1, 1) * Y[i]);
    }
    size_t i = 0;
    size_t j = 0;
    while (i < YR2.size())
    {
        if (i > 0)
            if ((XR2[i] > 0) && (XR2[i - 1] < 0))
                j = i;
        if (YR2[i] > (-1e-10) && YR2[i] < 0.0)
            YR2[i] = 0.0;
        if (YR2[i] < (1e-10) && YR2[i] > 0.0)
            YR2[i] = -YR2[i];
        if (YR2[i] < 0.0)
        {
            YR2.erase(YR2.begin() + i);
            XR2.erase(XR2.begin() + i);
            --i;
        }
        ++i;
    }
    size_t k = 0;
    while (j < YR2.size())
    {
        YR1[k] = (YR2[j]);
        XR1[k] = (XR2[j]);
        YR2.erase(YR2.begin() + j);
        XR2.erase(XR2.begin() + j);
        ++k;
    }
    size_t l = 0;
    for (; k < XR1.size(); ++k)
    {
        XR1[k] = XR2[l];
        YR1[k] = YR2[l];
        l++;
    }
}
Edit 1: I have updated the code by replacing all push_back() calls with operator[], since I read somewhere that this is much faster.
However, the whole program is still slow. Any suggestions are appreciated.
If the size is large, you can improve the push_back version by pre-allocating the space needed. Add this before the loop:
XR2.reserve(X.size());
YR2.reserve(X.size());
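A minimal sketch of what that looks like in context, using the original push_back version of the first loop (the rotation matrix a and the vectors X and Y are assumed to be in scope, as in the question):
// Sketch: reserve() allocates capacity once up front, so each push_back
// below appends without ever triggering a reallocation.
std::vector<double> XR2, YR2;
XR2.reserve(X.size());
YR2.reserve(X.size());
for (size_t i = 0; i < X.size(); ++i)
{
    XR2.push_back(a(0, 0) * X[i] + a(0, 1) * Y[i]);
    YR2.push_back(a(1, 0) * X[i] + a(1, 1) * Y[i]);
}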

Python weave is abnormally slow when using calloc

I use the following code to perform in-place forward-backward FIR filtering:
lena = len(a)
lenb = len(b)
convol = zeros(a.shape)
code = """
// Forward convolution
int pad = (lenb-1)/2;
int i, j;
for (i=pad; i<pad+lena; i++)
{
    int kmin, kmax, k;
    // Reverse indexing for the next pass
    j = lena-1-i+pad;
    convol(j) = 0;
    kmin = (i >= lenb - 1) ? i - (lenb - 1) : 0;
    kmax = (i < lena - 1) ? i : lena - 1;
    for (k = kmin; k <= kmax; k++)
    {
        convol(j) += a(k)*b(i - k);
    }
}
// Backward convolution (the signal in convol has been
// reversed using reversed indexes)
for (i=pad; i<pad+lena; i++)
{
    int kmin, kmax, k;
    // Reverse indexing for reordering the output vector
    j = lena-1-i+pad;
    a(j) = 0;
    kmin = (i >= lenb - 1) ? i - (lenb - 1) : 0;
    kmax = (i < lena - 1) ? i : lena - 1;
    for (k = kmin; k <= kmax; k++)
    {
        a(j) += convol(k)*b(i - k);
    }
}
return_val = 1;
"""
weave.inline(code, ['a', 'b', 'lena', 'lenb', 'convol'],
             type_converters=converters.blitz, compiler='g++')
Of course, the "convol" variable does not need to be visible outside the C/C++ scope, and I want both space and processing efficiency. So it makes sense (at least to me) to replace this code with the following:
lena = len(a)
lenb = len(b)
code = """
// Forward convolution
int pad = (lenb-1)/2;
int i, j;
float* convol = (float*) calloc(lena, sizeof(float));
for (i=pad; i<pad+lena; i++)
{
    int kmin, kmax, k;
    // Reverse indexing for the next pass
    j = lena-1-i+pad;
    convol[j] = 0;
    kmin = (i >= lenb - 1) ? i - (lenb - 1) : 0;
    kmax = (i < lena - 1) ? i : lena - 1;
    for (k = kmin; k <= kmax; k++)
    {
        convol[j] += a(k)*b(i - k);
    }
}
// Backward convolution (the signal in convol has been
// reversed using reversed indexes)
for (i=pad; i<pad+lena; i++)
{
    int kmin, kmax, k;
    // Reverse indexing for reordering the output vector
    j = lena-1-i+pad;
    a(j) = 0;
    kmin = (i >= lenb - 1) ? i - (lenb - 1) : 0;
    kmax = (i < lena - 1) ? i : lena - 1;
    for (k = kmin; k <= kmax; k++)
    {
        a(j) += convol[k]*b(i - k);
    }
}
free(convol);
return_val = 1;
"""
weave.inline(code, ['a', 'b', 'lena', 'lenb'],
             type_converters=converters.blitz, compiler='g++')
The only difference is that instead of using a NumPy array, I used a raw C float array directly. The problem is that the second version takes about twice as long to run as the first one… why is that? Is there something wrong with the second version? It should be faster…!