I have a working sequential Crout Decomposition algorithm that I need to speed up if possible. I have looked online at various OpenMP methods of parallelising the algorithm and I can only get it to work correctly on the lower triangular matrix part of the code. The upper yields wrong results
I feel like I have been looking at the code too long and I may be blind to a data dependency that I am overlooking
Sequential code is as follows, which works correct
for (i = 0; i < size; i++)
{
// Upper Triangle
for (j = 0; j < i; j++)
{
q = matx[j * size + i];
for (k = 0; k < j; k++)
{
q -= matx[j * size + k] * matx[k * size + i];
}
matx[j * size + i] = q;
}
// Lower Triangle
for (j = i; j < size; j++)
{
q = matx[j * size + i];
for (k = 0; k < i; k++)
{
q -= matx[j * size + k] * matx[k * size + i];
}
matx[j * size + i] = q;
}
}
Now here is the code with the appropriate OpenMP directives
for (i = 0; i < size; i++)
{
// Upper Triangle
#pragma omp parallel for private(j,k,q)
for (j = 0; j < i; j++)
{
q = matx[j * size + i];
for (k = 0; k < j; k++)
{
q -= matx[j * size + k] * matx[k * size + i];
}
matx[j * size + i] = q;
}
// Lower Triangle
#pragma omp parallel for private(j,k,q)
for (j = i; j < size; j++)
{
q = matx[j * size + i];
for (k = 0; k < i; k++)
{
q -= matx[j * size + k] * matx[k * size + i];
}
matx[j * size + i] = q;
}
}
If only the lower triangle is in parallel I yield the correct decompostion, however the upper throws out discrepancies
Many thanks for any help with this
Related
I'm trying to create a class that can procedurally create prisms (or cylinders if the precision is high enough) but only the sides of the 3d model are showing (not the top and bottom). This is using openGL and c++. Not going for efficiency, just modifying a previous class that made a sphere.
#define numSlices 2
Prism::Prism() {
init(3);
}
Prism::Prism(int prec) {
init(prec);
}
float Prism::toRadians(float degrees) { return (degrees * 2.0f * 3.14159f) / 360.0f; }
void Prism::init(int prec) {
prec = (prec < 3) ? 3 : prec;
numVertices = (prec + 1) * (numSlices+1);
numIndices = prec * numSlices * 6;
for (int i = 0; i < numVertices; i++) { vertices.push_back(glm::vec3()); }
for (int i = 0; i < numVertices; i++) { texCoords.push_back(glm::vec2()); }
for (int i = 0; i < numVertices; i++) { normals.push_back(glm::vec3()); }
for (int i = 0; i < numVertices; i++) { tangents.push_back(glm::vec3()); }
for (int i = 0; i < numIndices; i++) { indices.push_back(0); }
// calculate triangle vertices
for (int i = 0; i <= numSlices; i++) {
for (int j = 0; j <= prec; j++) {
float y = i;
float x = -(float)cos(toRadians(j * 360.0f / (float)prec));
float z = (float)sin(toRadians(j * 360.0f / (float)prec));
vertices[i * (prec + 1) + j] = glm::vec3(x, y, z);
texCoords[i * (prec + 1) + j] = glm::vec2(((float)j / prec), ((float)i / numSlices));
}
}
// calculate triangle indices
for (int i = 0; i < numSlices; i++) {
for (int j = 0; j < prec; j++) {
indices[6 * (i * prec + j) + 0] = i * (prec + 1) + j;
indices[6 * (i * prec + j) + 1] = i * (prec + 1) + j + 1;
indices[6 * (i * prec + j) + 2] = (i + 1) * (prec + 1) + j;
indices[6 * (i * prec + j) + 3] = i * (prec + 1) + j + 1;
indices[6 * (i * prec + j) + 4] = (i + 1) * (prec + 1) + j + 1;
indices[6 * (i * prec + j) + 5] = (i + 1) * (prec + 1) + j;
}
}
}
Any tips or solutions that stick closely to the code already written would much appreciated.
To render the top and bottom of the cylinder, you can create a "triangle fan" that starts from a vertex at the center of the top/bottom of the cylinder and creates one triangle for every side.
Adapting your code: (untested, I may have made mistakes against winding order)
int bottom_center = vertices.length(); vertices.push_back(glm::vec3(0,0,0));
int top_center = vertices.length(); vertices.push_back(glm::vec3(0,numSlices,0));
// Bottom
for (int j = 0; j < prec; j++) {
int base = 0;
indices.push_back(bottom_center);
indices.push_back(base+j);
indices.push_back(base+j+1);
}
// Top
for (int j = 0; j < prec; j++) {
int base = numSlices * (prec+1);
indices.push_back(top_center);
indices.push_back(base+j);
indices.push_back(base+j+1);
}
See http://www.songho.ca/opengl/gl_cylinder.html for a more worked-out example.
I struggle a bit with a function. The calculation is wrong if I try to parallelize the outer loop with a
#pragma omp parallel reduction(+:det).
Can someone show me how to solve it and why it is failing?
// template<class T> using vector2D = std::vector<std::vector<T>>;
float Det(vector2DF &a, int n)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
for (int i = 0; i < n; i++)
{
int l = 0;
#pragma omp parallel for private(l)
for (int j = 1; j < n; j++)
{
l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
return det;
}
If you parallelize the outer loop, there is a race condition on this line:
m[j - 1][l] = a[j][k];
Also you likely want a parallel for reduction instead of just a parallel reduction.
The issue is, that m is shared, even though that wouldn't be necessary given that it is completely overwritten in the inner loop. Always declare variables as locally as possible, this avoids issues with wrongly shared variables, e.g.:
float Det(vector2DF &a, int n)
{
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
#pragma omp parallel reduction(+:det)
for (int i = 0; i < n; i++)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
for (int j = 1; j < n; j++)
{
int l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
return det;
}
Now that is correct, but since m can be expensive to allocate, performance could benefit from not doing it in each and every iteration. This can be done by splitting parallel and for directives as such:
float Det(vector2DF &a, int n)
{
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
#pragma omp parallel reduction(+:det)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
#pragma omp parallel for
for (int i = 0; i < n; i++)
{
for (int j = 1; j < n; j++)
{
int l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
}
return det;
}
Now you could also just declare m as firstprivate, but that would assume that the copy constructor makes a completely independent deep-copy and thus make the code more difficult to reason about.
Please be aware that you should always include expected output, actual output and a minimal complete and verifiable example.
block_sparrse_matrix.h
#include "ldl_decomposition.h"
SMVS_NAMESPACE_BEGIN
template<int N>
void
BlockSparseMatrix<N>::invert_blocks_inplace(void)
{
for (std::size_t i = 0; i < this->values.size(); ++i)
{
std::array<double, N * N> b = values[i];
ldl_inverse(b.begin(), N);
bool nancheck = false;
for (int i = 0; i < N * N; ++i)
if (std::isnan(b[i]))
nancheck = true;
if(nancheck)
continue;
values[i] = b;
}
}
ldl_decomposition.h
SMVS_NAMESPACE_BEGIN
template<typename T>
void
ldl_inverse(T * A, int const size)
{
T * L = new T[size * size];
T * D = new T[size];
std::fill(L, L + size * size, 0.0);
std::fill(D, D + size, 0.0);
/* Factorize A into LDL^T */
for (int j = 0; j < size; ++j)
{
D[j] = A[j * size + j];
L[j * size + j] = 1.0;
for (int k = 0 ; k < j; ++k)
D[j] -= (L[j * size + k] * L[j * size + k]) * D[k];
if (D[j] == 0.0)
return;
for (int i = j+1; i < size; ++i)
{
L[i * size + j] = A[i * size + j];
for (int k = 0 ; k < j; ++k)
L[i * size + j] -= L[i * size + k] * D[k] * L[j * size + k];
L[i * size + j] /= D[j];
}
}
/* Invert L */
for (int i = 0; i < size; ++i)
for (int j = i+1; j < size; ++j)
{
T sum(0);
for (int k = i ; k < j; ++k)
sum -= L[j * size + k] * L[k * size + i];
L[j * size + i] = sum;
}
/* Invert D */
for (int i = 0; i < size; ++i)
D[i] = 1.0 / D[i];
/* Combine Matrices */
combine_ldl(L, D, size, A);
/* Cleanup memory */
delete[] L;
delete[] D;
}
I want to make it work on Windows, but it is error.
Error C2672 'ldl_inverse': no matching overloaded function found
(compiling source file E:\mve\libs\smvsrecon\gauss_newton_step.cc)
Error C2784 'void smvs::ldl_inverse(T *,const int)': could not deduce
template argument for 'T *' from 'std::_Array_iterator<_Ty,16>'
Instead of a normal dot product, my application requires a slightly modified version. Here is the original C++ code:
for (int m = 0; m < k; m++) {
for (int n = 0; n < l; n++) {
for (int t = 0; t < dims2[2]; t++) {
for (int dm = 0; dm < dims2[1]; dm++) {
for (int dn = 0; dn < dims2[0]; dn++) {
int ai = (n + dn) + (m + dm) * dims1[0] + t * dims1[0] * dims1[1];
int bi = dn + dm * dims2[0] + t * dims2[0] * dims2[1];
total += A[ai] * B[bi];
}
}
}
}
}
Feel free to change the order of the loops. How do I convert this to neon assembly?
I overloaded operator * which multiplying 2D arrays. I have some problems with multiplying, don't understand exactly an indexes when I am multiplying.
Here's some declarations:
int *const e; //pointer to the memory storing all integer elements of A
const int row, column; //r and c are the numbers of rows and columns respectively
And some code:
A A::operator*(const A& matrix)const
{
MAT result(matrix.row, matrix.column);
if (column == matrix.row)
{
for (int i = 0; i < row; ++i)
{
for (int j = 0; j < matrix.column; j++)
{
result.e[j*row + i] = 0;
for (int k = 0; k < column; k++)
{
result.e[j*row + i] += e[j*row + k] * matrix.e[k*row + column];
}
}
}
}
return result;
}
I know that I need 3 loops, I think I have some problems in
result.e[j*row + i] += e[j*row + k] * matrix.e[k*row + column];
Do you have any clue ? You can write me some ideas how can I figure out it myself, because I want to understand it. Thanks
Your line
result.e[j*row + i] += e[j*row + k] * matrix.e[k*row + column];
is broken. The product P of two matrices A (dim M,N) and B (dim N,P) has it's coefficient in position (i,j) defined by the following :
Pi,j = sum(k = 1..N, ai,k . bk,j).
Thus the line mentioned above should be :
result.e[j*row + i] += e[j*row + k] * matrix.e[k*row + i];