Optimize outer loop with OpenMP and a reduction - c++

I'm struggling a bit with a function. The calculation is wrong when I try to parallelize the outer loop with a
#pragma omp parallel reduction(+:det).
Can someone show me how to solve it and explain why it fails?
// template<class T> using vector2D = std::vector<std::vector<T>>;
float Det(vector2DF &a, int n)
{
    vector2DF m(n - 1, vector1DF(n - 1, 0));
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    for (int i = 0; i < n; i++)
    {
        int l = 0;
        #pragma omp parallel for private(l)
        for (int j = 1; j < n; j++)
        {
            l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}

If you parallelize the outer loop, there is a race condition on this line:
m[j - 1][l] = a[j][k];
Also you likely want a parallel for reduction instead of just a parallel reduction.
The issue is that m is shared, even though that isn't necessary, given that it is completely overwritten in the inner loop. Always declare variables as locally as possible; this avoids problems with wrongly shared variables, e.g.:
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    #pragma omp parallel for reduction(+:det)
    for (int i = 0; i < n; i++)
    {
        vector2DF m(n - 1, vector1DF(n - 1, 0));
        for (int j = 1; j < n; j++)
        {
            int l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}
Now that is correct, but since m can be expensive to allocate, performance can benefit from not allocating it in each and every iteration. This can be done by splitting the parallel and for directives:
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    #pragma omp parallel reduction(+:det)
    {
        vector2DF m(n - 1, vector1DF(n - 1, 0));
        #pragma omp for
        for (int i = 0; i < n; i++)
        {
            for (int j = 1; j < n; j++)
            {
                int l = 0;
                for (int k = 0; k < n; k++)
                {
                    if (k == i) continue;
                    m[j - 1][l] = a[j][k];
                    l++;
                }
            }
            det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
        }
    }
    return det;
}
Now you could also just declare m as firstprivate, but that would rely on the copy constructor making a completely independent deep copy and thus make the code more difficult to reason about.
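For reference, a minimal sketch of that firstprivate variant (it relies on std::vector's copy constructor giving each thread an independent deep copy, which it does for nested vectors; the loop body is unchanged):
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    // allocated once here, then deep-copied into every thread by firstprivate
    vector2DF m(n - 1, vector1DF(n - 1, 0));
    #pragma omp parallel for firstprivate(m) reduction(+:det)
    for (int i = 0; i < n; i++)
    {
        for (int j = 1; j < n; j++)
        {
            int l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}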
Please be aware that you should always include expected output, actual output and a minimal complete and verifiable example.
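For what it's worth, a tiny driver of that kind, compiled together with one of the Det versions above and assuming vector1DF/vector2DF are the std::vector aliases hinted at by the commented-out template, could look like this:
#include <cstdio>
#include <vector>

using vector1DF = std::vector<float>;       // assumed alias
using vector2DF = std::vector<vector1DF>;   // assumed alias

float Det(vector2DF &a, int n);             // any of the versions above

int main()
{
    vector2DF a = { {1, 2, 3}, {4, 5, 6}, {7, 8, 10} };
    std::printf("Det = %f\n", Det(a, 3));   // expected output: Det = -3.000000
    return 0;
}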

Related

Crout Decomposition Parallel loop - Incorrect results

I have a working sequential Crout Decomposition algorithm that I need to speed up if possible. I have looked online at various OpenMP methods of parallelising the algorithm, and I can only get it to work correctly on the lower triangular part of the matrix; the upper triangle yields wrong results.
I feel like I have been looking at the code for too long and may be blind to a data dependency that I am overlooking.
The sequential code, which works correctly, is as follows:
for (i = 0; i < size; i++)
{
    // Upper Triangle
    for (j = 0; j < i; j++)
    {
        q = matx[j * size + i];
        for (k = 0; k < j; k++)
        {
            q -= matx[j * size + k] * matx[k * size + i];
        }
        matx[j * size + i] = q;
    }
    // Lower Triangle
    for (j = i; j < size; j++)
    {
        q = matx[j * size + i];
        for (k = 0; k < i; k++)
        {
            q -= matx[j * size + k] * matx[k * size + i];
        }
        matx[j * size + i] = q;
    }
}
Now here is the code with the OpenMP directives added:
for (i = 0; i < size; i++)
{
    // Upper Triangle
    #pragma omp parallel for private(j,k,q)
    for (j = 0; j < i; j++)
    {
        q = matx[j * size + i];
        for (k = 0; k < j; k++)
        {
            q -= matx[j * size + k] * matx[k * size + i];
        }
        matx[j * size + i] = q;
    }
    // Lower Triangle
    #pragma omp parallel for private(j,k,q)
    for (j = i; j < size; j++)
    {
        q = matx[j * size + i];
        for (k = 0; k < i; k++)
        {
            q -= matx[j * size + k] * matx[k * size + i];
        }
        matx[j * size + i] = q;
    }
}
If only the lower triangle is parallelized I get the correct decomposition; however, the upper triangle produces discrepancies.
Many thanks for any help with this.

Reorganizing nested loops for multithreading

I'm trying to rewrite the main loop in a physics simulation and split the workload between more threads.
It calls dostuff on every unique pair of indices and looks like this:
for (int i = 0; i < n - 1; ++i)
{
    for (int j = i + 1; j < n; ++j)
    {
        dostuff(i, j);
    }
}
I came up with two options:
//#1
//sqrt is implemented as binary search on ints, floors the result
for (int x = 0; x < n * (n - 1) / 2; ++x)
{
    int i = (1 + sqrt(1 + 8 * x)) / 2;
    int j = x - i * (i - 1) / 2;
    dostuff(i, j);
}
//#2
for (int x = 0; x < n * n; ++x)
{
    int i = x % n;
    int j = x / n;
    if (i < j)
        dostuff(i, j);
}
And for each option there is a corresponding per-thread loop that uses a shared atomic counter:
//#1
while(int x = counter.fetch_add(1) < n * (n - 1) / 2)
{
    int i = (1 + sqrt(1 + 8 * x)) / 2;
    int j = x - i * (i - 1) / 2;
    dostuff(i, j);
}
//#2
while(int x = counter.fetch_add(1) < n * n)
{
    int i = x % n;
    int j = x / n;
    if (i < j)
        dostuff(i, j);
}
My question is, what is the best way to share the workload of the main loop between threads for n < 10^6?
EDIT:
//dostuff
Element& a = elements[i];
Element& b = elements[j];
glm::dvec3 r = b.getPosition() - a.getPosition();
double rv = glm::length(r);
double base = G / (rv * rv);
glm::dvec3 dir = glm::normalize(r);
glm::dvec3 bd = dir * base;
accelerations[i] += bd * b.getMass();
accelerations[j] -= bd * a.getMass();
Your work is a triangle. You want to divide the triangle into k distinct pieces.
If k is a power of 2 you can do this:
a
a a
b c d
b c d d
Each of those regions is roughly equal in size.
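To make that concrete, here is one sketch (my illustration, not part of the original answer) that splits the triangular iteration space {(i, j) : i < j < n} into k contiguous row ranges containing roughly the same number of pairs, one range per thread; run_pairs and PairKernel are hypothetical names, and the kernel must be safe to call concurrently. If dostuff mutates shared state, as the accelerations updates in the edit do, those writes still need atomics, locks, or per-thread accumulators.
#include <thread>
#include <vector>

// Split rows so each of the k chunks covers about n*(n-1)/(2*k) pairs,
// then hand one chunk to each thread.
template <class PairKernel>
void run_pairs(int n, int k, PairKernel dostuff)
{
    const long long total = 1LL * n * (n - 1) / 2;
    std::vector<std::thread> pool;
    int row = 0;                                  // first row of the next chunk
    for (int t = 0; t < k; ++t)
    {
        const int lo = row;
        // Advance hi until rows [0, hi) cover (t + 1)/k of all pairs;
        // rows [0, r) contain r*(n-1) - r*(r-1)/2 pairs.
        const long long target = total * (t + 1) / k;
        int hi = lo;
        while (hi < n - 1 && 1LL * hi * (n - 1) - 1LL * hi * (hi - 1) / 2 < target)
            ++hi;
        row = hi;
        pool.emplace_back([lo, hi, n, dostuff]() mutable {
            for (int i = lo; i < hi; ++i)
                for (int j = i + 1; j < n; ++j)
                    dostuff(i, j);
        });
    }
    for (auto &th : pool)
        th.join();
}
For example, it could be invoked as run_pairs(n, (int)std::thread::hardware_concurrency(), [&](int i, int j) { dostuff(i, j); });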

Why does OMP nested parallelism execution output differently than linear execution?

I'm attempting to compare execution times when detecting the edges of an image linearly and in parallel. Everything works fine in the linear version, but in the parallel version the written image has too many white pixels in part of the image. To better show what I'm saying, see the image below:
The left image is the output of the code executed linearly, and the right one uses parallelism. You can see the edges of the buildings in both images, and the bottom part of the right image, close to its border, doesn't have the same issue as the rest of it.
I cropped the "critical" part of the code that performs this task, in the hope that someone may know what is causing this.
omp_set_nested(1);
#pragma omp parallel
while(col<cols-1) {
    line = 1;
    #pragma omp parallel
    while(line<lines-1) {
        gradient_x = 0;
        gradient_y = 0;
        for(int m = 0; m < mask_size; m++) {
            for(int n = 0; n < mask_size; n++) {
                int np_x = line + (m - 1);
                int np_y = col + (n - 1);
                float v = img(np_y,np_x);
                int mask_index = (m*3) + n;
                gradient_x = gradient_x + (x_mask[mask_index] * v);
                gradient_y = gradient_y + (y_mask[mask_index] * v);
            }
        }
        float gradient_sum = sqrt((gradient_x * gradient_x) + (gradient_y * gradient_y));
        if(gradient_sum >= 255) {
            gradient_sum = 255;
        } else if(gradient_sum <= 0) {
            gradient_sum = 0;
        }
        output(line, col) = gradient_sum;
        #pragma omp critical
        line++;
    }
    #pragma omp critical
    col++;
}
I defined the line and col variables as critical because they are the ones used for both reading and writing data, and I believe everything else is working properly.
Without more context, it is hard to tell. Nonetheless, those two nested parallel regions do not make sense, because you are not distributing work among threads; instead, the same code is simply executed by every thread, with possible race conditions on the updates of the variables gradient_x and gradient_y, among others. Start with the following simpler parallel code:
omp_set_nested(0);
while(col<cols-1) {
    line = 1;
    while(line<lines-1) {
        gradient_x = 0;
        gradient_y = 0;
        #pragma omp parallel for reduction(+:gradient_x,gradient_y)
        for(int m = 0; m < mask_size; m++) {
            for(int n = 0; n < mask_size; n++) {
                int np_x = line + (m - 1);
                int np_y = col + (n - 1);
                float v = img(np_y,np_x);
                int mask_index = (m*3) + n;
                gradient_x = gradient_x + (x_mask[mask_index] * v);
                gradient_y = gradient_y + (y_mask[mask_index] * v);
            }
        }
        float gradient_sum = sqrt((gradient_x * gradient_x) + (gradient_y * gradient_y));
        if(gradient_sum >= 255) {
            gradient_sum = 255;
        } else if(gradient_sum <= 0) {
            gradient_sum = 0;
        }
        output(line, col) = gradient_sum;
        line++;
    }
    col++;
}
Alternatively, you can try the following, which distributes the two outer pixel loops across threads:
#pragma omp parallel for collapse(2)
for(int col = 0; col<cols-1; col++) {
    for(int line = 1; line<lines-1; line++) {
        float gradient_x = 0;
        float gradient_y = 0;
        for(int m = 0; m < mask_size; m++) {
            for(int n = 0; n < mask_size; n++) {
                int np_x = line + (m - 1);
                int np_y = col + (n - 1);
                float v = img(np_y,np_x);
                int mask_index = (m*3) + n;
                gradient_x = gradient_x + (x_mask[mask_index] * v);
                gradient_y = gradient_y + (y_mask[mask_index] * v);
            }
        }
        float gradient_sum = sqrt((gradient_x * gradient_x) + (gradient_y * gradient_y));
        if(gradient_sum >= 255) {
            gradient_sum = 255;
        } else if(gradient_sum <= 0) {
            gradient_sum = 0;
        }
        output(line, col) = gradient_sum;
    }
}
Of course, you still need to check for race conditions in the code that you have cropped.
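One practical note: the OpenMP pragmas (and omp_set_nested) only take effect when the code is compiled with OpenMP enabled, e.g. g++ -fopenmp; otherwise the pragmas are ignored and the loops run sequentially.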

Row Reduction of augmented matrix - 3D Spline calculations

I'm trying to implement Interpolation by relaxed cubic splines, which can be found in the 5th chapter of this article (page 9):
https://www.math.ucla.edu/~baker/149.1.02w/handouts/dd_splines.pdf
So far, I have the following:
auto GetControlPoints = [](const std::vector<Vector3d>& S) {
    int n = S.size();
    float var = n - 1.0f;
    MatrixXd M(n - 1, n - 1);
    VectorXd C[3] = {
        VectorXd(n - 1),
        VectorXd(n - 1),
        VectorXd(n - 1)
    };
    for (int i = 0; i < n - 1; ++i) {
        auto r = RowVectorXd(n - 1);
        for (int j = 0; j < n - 1; ++j) {
            if (j == i)
                r[j] = var;
            else if (j == i - 1 || j == i + 1)
                r[j] = 1.f;
            else
                r[j] = 0.f;
        }
        M.row(i) = r;
        if (i == 0) {
            for (int j = 0; j < 3; ++j) {
                C[j] << (n + 1) * S[1][j] - S[0][j];
            }
        }
        else if (i == n - 1) {
            for (int j = 0; j < 3; ++j) {
                C[j] << (n + 1) * S[n - 1][j] - S[n][j];
            }
        }
        else {
            for (int j = 0; j < 3; ++j) {
                C[j] << (n + 1) * S[i][j];
            }
        }
    }
    MatrixXd augMC[3] = {
        MatrixXd(n - 1, n),
        MatrixXd(n - 1, n),
        MatrixXd(n - 1, n)
    };
    for (int i = 0; i < 3; ++i) {
        augMC[i].block(0, 0, n - 1, n - 1) = M;
        augMC[i].block(n - 1, n - 1, n - 1, 1) = C[i].transpose();
    }
};
I got to the point where I made an augmented matrix using M and C, but I have no idea how to row-reduce it. Any thoughts?
You could use the in-place variant of PartialPivLU, but it looks like you actually want to solve the system M*B = C, for which you should just decompose M (as it is symmetric, you can use an LLt or LDLt decomposition) and then use the solve method of that decomposition.
To set up M you should also use the diagonal() method (untested):
MatrixXd M(n - 1, n - 1);
M.setZero();
M.diagonal().setConstant(n - 1.0);
M.diagonal<1>().setOnes();
M.diagonal<-1>().setOnes();
LLT<MatrixXd> lltOfM(M);
for (int i = 0; i < 3; ++i) { B[i] = lltOfM.solve(C[i]); }
For large n this is sub-optimal, since it does not exploit the tridiagonal structure of M. You could try out the sparse module for this, but there should actually be a direct algorithm (Eigen does not explicitly have a tridiagonal matrix type, though).
For C you probably could also use a MatrixX3d (I don't really understand how you fill your C vectors -- I think your current code should assert at run-time, unless n==2, or you disabled assertions).
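On the "direct algorithm" remark above: a minimal, untested sketch of the Thomas algorithm for a tridiagonal system, written against plain std::vector rather than Eigen types (solve_tridiagonal is my name for it; it does no pivoting, which is fine for a diagonally dominant matrix like this M):
#include <vector>

// Thomas algorithm: solve a tridiagonal system in O(n).
// a = sub-diagonal (a[0] unused), b = diagonal, c = super-diagonal
// (c[n-1] unused), d = right-hand side.  Parameters are taken by value
// because the sweeps overwrite them.
std::vector<double> solve_tridiagonal(std::vector<double> a,
                                      std::vector<double> b,
                                      std::vector<double> c,
                                      std::vector<double> d)
{
    const std::size_t n = b.size();
    // Forward elimination.
    for (std::size_t i = 1; i < n; ++i) {
        const double w = a[i] / b[i - 1];
        b[i] -= w * c[i - 1];
        d[i] -= w * d[i - 1];
    }
    // Back substitution.
    std::vector<double> x(n);
    x[n - 1] = d[n - 1] / b[n - 1];
    for (std::size_t i = n - 1; i-- > 0;)
        x[i] = (d[i] - c[i] * x[i + 1]) / b[i];
    return x;
}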

OpenMP Race Condition when finding Closest Pair

I'm doing an assignment to find the closest pair between two disjoint sets A and B. I'm using OpenMP to parallelize the recursion of the algorithm, but I am running into some data races. I am very new to OpenMP, so I think it has something to do with incorrectly privatized/shared variables. I have put the full algorithm below:
float OMPParticleSim::efficient_closest_pair(int n, vector<Particle> & p, vector<Particle> & q)
{
    // brute force
    if(n <= 3) {
        float m = numeric_limits<float>::max();
        for(int i = 0; i < n - 2; i++) {
            for(int j = i + 1; j < n - 1; j++) {
                if((set_A.find(p[i].id) != set_A.end() && set_A.find(p[j].id) != set_A.end()) || (set_B.find(p[i].id) != set_B.end() && set_B.find(p[j].id) != set_B.end())) {
                    continue;
                }
                float distsq = pow(p[i].x - p[j].x, 2) + pow(p[i].y - p[j].y, 2) + pow(p[i].z - p[j].z, 2);
                pair<pair<Particle, Particle>, float> pa = make_pair(make_pair(p[i], p[j]), sqrt(distsq));
                #pragma omp critical
                insert(pa);
                m = min(m, distsq);
            }
        }
        return sqrt(m);
    }
    // copy first ceil(n/2) points of p to pl
    vector<Particle> pl;
    int ceiling = ceil(n/2);
    for(int i = 0; i < ceiling; i++) {
        pl.push_back(p[i]);
    }
    // copy first ceil(n/2) points of q to ql
    vector<Particle> ql;
    for(int i = 0; i < ceiling; i++) {
        ql.push_back(q[i]);
    }
    // copy remaining floor(n/2) points of p to pr
    vector<Particle> pr;
    for(int i = ceiling; i < p.size(); i++) {
        pr.push_back(p[i]);
    }
    // copy remaining floor(n/2) points of q to qr
    vector<Particle> qr;
    for(int i = ceiling; i < q.size(); i++) {
        qr.push_back(p[i]);
    }
    float dl, dr, d;
    #pragma omp task firstprivate(pl, ql, p, q, n) private(dl) shared(closest_pairs)
    dl = efficient_closest_pair(ceil(n / 2), pl, ql);
    #pragma omp task firstprivate(pl, ql, p, q, n) private(dr) shared(closest_pairs)
    dr = efficient_closest_pair(ceil(n / 2), pr, qr);
    #pragma omp taskwait
    d = min(dl, dr);
    float m = p[ceil(n / 2) - 1].x;
    vector<Particle> s;
    for(int i = 0; i < q.size(); i++) {
        if(fabs(q[i].x - m) < d) {
            s.push_back(Particle(q[i]));
        }
    }
    int num = s.size();
    float dminsq = d * d;
    for (int i = 0; i < num - 2; i++) {
        int k = i + 1;
        while(k <= num - 1 && pow(s[k].y - s[i].y, 2) < dminsq) {
            if((set_A.find(s[i].id) != set_A.end() && set_A.find(s[k].id) != set_A.end()) || (set_B.find(s[i].id) != set_B.end() && set_B.find(s[k].id) != set_B.end())) {
                k++;
                continue;
            }
            float dist = pow(s[k].x - s[i].x, 2) + pow(s[k].y - s[i].y, 2) + pow(s[k].z - s[i].z, 2);
            pair<pair<Particle, Particle>, float> pa = make_pair(make_pair(s[i], s[k]), sqrt(dist));
            #pragma omp critical
            insert(pa);
            dminsq = min(dist, dminsq);
            k++;
        }
    }
    return sqrt(dminsq);
}
The insert method looks like this:
void OMPParticleSim::insert(pair<pair<Particle, Particle>, float> & pair) {
    if(closest_pairs.size() == 0) {
        closest_pairs.push_back(pair);
        return;
    }
    for(int i = 0; i < closest_pairs.size(); ++i) {
        if(closest_pairs[i].second > pair.second) {
            closest_pairs.insert(closest_pairs.begin() + i, 1, pair);
            break;
        }
    }
    if(closest_pairs.size() > k) {
        closest_pairs.pop_back();
    }
}
The start of the parallel region is here:
void OMPParticleSim::do_closest_pair(int num_threads) {
    vector<Particle> p = set;
    // presort on x
    sort(p.begin(), p.end(), sortxomp);
    vector<Particle> q = p;
    // presort on y
    sort(q.begin(), q.end(), sortyomp);
    float cp;
    #pragma omp parallel num_threads(num_threads)
    {
        #pragma omp single
        {
            cp = efficient_closest_pair(set.size(), p, q);
        }
    }
    sort(closest_pairs.begin(), closest_pairs.end(), sortpairsomp);
}
All of the results are stored in a list closest_pairs and output to a file. The reason I know there are data races is that some of the Particle ids become negative (all of them start positive), and running the program multiple times results in different values being written to the file. Any help would be great!
The error was that dl and dr should have been shared between the tasks: declared private, each task writes only to its own copy, so the values read after the taskwait are indeterminate.
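A minimal, self-contained illustration of that fix (recurse() is just a hypothetical stand-in for efficient_closest_pair): the result variables live in the enclosing scope and are listed as shared, so the assignments made inside the tasks are visible after the taskwait.
#include <algorithm>
#include <cstdio>

// dl and dr must be shared; with private (or the default firstprivate)
// each task would write into its own copy and the values read after the
// taskwait would be unspecified.
static float recurse(int depth)
{
    if (depth == 0) return 1.0f;
    float dl = 0.0f, dr = 0.0f;
    #pragma omp task shared(dl)
    dl = recurse(depth - 1);
    #pragma omp task shared(dr)
    dr = recurse(depth - 1) + 1.0f;
    #pragma omp taskwait
    return std::min(dl, dr);
}

int main()
{
    float result = 0.0f;
    #pragma omp parallel
    #pragma omp single
    result = recurse(4);
    std::printf("%f\n", result);  // expected: 1.000000
    return 0;
}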