Reorganizing nested loops for multithreading - c++

I'm trying to rewrite the main loop in a physics simulation and split the workload between more threads.
It calls dostuff on every unique pair of indices and looks like this:
for (int i = 0; i < n - 1; ++i)
{
for (int j = i + 1; j < n; ++j)
{
dostuff(i, j);
}
}
I came up with two options:
//#1
//sqrt is implemented as binary search on ints, floors the result
for (int x = 0; x < n * (n - 1) / 2; ++x)
{
int i = (1 + sqrt(1 + 8 * x)) / 2;
int j = x - i * (i - 1) / 2;
dostuff(i, j);
}
//#2
for (int x = 0; x < n * n; ++x)
{
int i = x % n;
int j = x / n;
if (i < j)
dostuff(i, j);
}
And for each option, there is corresponding thread loop using shared atomic counter:
//#1
while(int x = counter.fetch_add(1) < n * (n - 1) / 2)
{
int i = (1 + sqrt(1 + 8 * x)) / 2;
int j = x - i * (i - 1) / 2;
dostuff(i, j);
}
//#2
while(int x = counter.fetch_add(1) < n * n)
{
int i = x % n;
int j = x / n;
if (i < j)
dostuff(i, j);
}
My question is, what is the best way to share the workload of the main loop between threads for n < 10^6?
EDIT:
//dostuff
Element& a = elements[i];
Element& b = elements[j];
glm::dvec3 r = b.getPosition() - a.getPosition();
double rv = glm::length(r);
double base = G / (rv * rv);
glm::dvec3 dir = glm::normalize(r);
glm::dvec3 bd = dir * base;
accelerations[i] += bd * b.getMass();
accelerations[j] -= bd * a.getMass();

Your work is a triangle. You want to.divide the triangle into k distinct pieces.
If k is a power of 2 you can do this:
a
a a
b c d
b c d d
Each of those regions are equal in size.

Related

Matlab VS. C++ in matrix calculation

I am using C++ to do some matrix calculations using Armadillo library.
I tried to make it similar to the Matlab version.
But when I run the code.
While Matlab took about 2 - 3 min, C++ took about 20 min.
I searched a bit and realized that some people also asked why C++ is slower than Matlab in matrix calculations.
But I heard that C++ is way faster than Matlab. So I was wondering whether C++ is not as good as Matlab in terms of Matrix calculations in usual.
Below is just part of my entire code.
Is there any way I can speed up C++ matrix calculations?
Should I use a different library?
while (dif >= tol && it <= itmax) {
it = it + 1;
V = Vnew;
Vfuture = beta * (Ptrans(0) * Vnew.slice(0) + Ptrans(1) * Vnew.slice(1) + Ptrans(2) * Vnew.slice(2));
for (int a = 0; a < Na; a++) {
for (int b = 0; b < Nd; b++) {
for (int c = 0; c < Ny; c++) {
Mat<double> YY(Na, Nd);
YY.fill(Y(c));
Mat<double> AA(Na, Nd);
AA.fill(A(a));
Mat<double> DD(Na, Nd);
DD.fill(D(b));
Mat<double> CC = YY + AA - mg_A_v / R - (mg_D_v - (1 - delta) * DD);
Mat<double> Val = 1 / (1 - 1 / sig) * pow(pow(CC, psi) % pow(mg_D_v, 1 - psi), (1 - 1 / sig)) + Vfuture;
double max_val = Val.max();
uword maxindex_val = Val.index_max();
int index_column = maxindex_val / Na; // column
int index_row = maxindex_val - index_column * Na; // row
Vnew(a, b, c) = max_val;
maxposition_a(a, b, c) = index_row;
maxposition_d(a, b, c) = index_column;
}
}
}
// Howard improvement
for (int h = 0; h < H; h++) {
Vhoward = Vnew;
for (int i = 0; i < Na; i++) {
for (int j = 0; j < Nd; j++) {
for (int k = 0; k < Ny; k++) {
temphoward(i, j) = beta * Vhoward(maxposition_a(i, j, k), maxposition_d(i, j, k), 0) * Ptrans(0) + beta * Vhoward(maxposition_a(i, j, k), maxposition_d(i, j, k), 1) * Ptrans(1) + beta * Vhoward(maxposition_a(i, j, k), maxposition_d(i, j, k), 2) * Ptrans(2);
Vnew(i, j, k) = temphoward(i, j) + utility(Y(k) + A(i) - A(maxposition_a(i, j, k)) / R - D(maxposition_d(i, j, k)) + (1 - delta) * D(j), D(maxposition_d(i, j, k)), sig, psi);
}
}
}
}
tempdiff = abs(V - Vnew);
dif = tempdiff.max();
cout << dif << endl;
cout << it << endl;
}
And this is the part from the matlab.
while dif >= tol && it <= itmax
tic;
it = it + 1;
V = Vnew;
vFuture = beta*reshape(V,Na*Nd,Ny)*P;
for i_a = 1:Na %Loop over state variable a
for i_d = 1:Nd %Loop over state variable d
for i_y = 1:Ny %Loop over state variable y
val = reshape(Utility(Y(i_y) + A(i_a) - mg_A_v/R - (mg_D_v - (1-delta)*D(i_d)),mg_D_v),Na*Nd,1) + vFuture;
[Vnew(i_a,i_d,i_y), indpol(i_a,i_d,i_y)] = max(val);
[indpol_ap(i_a,i_d,i_y),indpol_dp(i_a,i_d,i_y)] = ind2sub([Na,Nd],indpol(i_a,i_d,i_y));
end
end
end
% Howard improvement step
for h = 1:H
Vhoward = Vnew;
for i_a = 1:Na %Loop over state variable a
for i_d = 1:Nd %Loop over state variable d
for i_y = 1:Ny %Loop over state variable y
Vnew(i_a,i_d,i_y) = Utility(Y(i_y) + A(i_a) - A(indpol_ap(i_a,i_d,i_y))/R - ...
(D(indpol_dp(i_a,i_d,i_y)) - (1-delta)*D(i_d)),D(indpol_dp(i_a,i_d,i_y))) ...
+ beta*reshape(Vhoward(indpol_ap(i_a,i_d,i_y),indpol_dp(i_a,i_d,i_y),:),1,Ny)*P;
end
end
end
end
dif = max(max(max(abs(V-Vnew))));
disp([it dif toc])
end

Gausian Filter with OpenCV and C++

My goal is to apply a gaussian filter to an input image.
I don't want to use the OpenCV function (i want to program it by myself).
I mention the Error in the code. Can somebody help me? The code is based on a mean filter example.
I know the examples with the GaussianBlur function.
public:Mat gaussianfilter(const Mat input, int n, float sigmaT, float sigmaS, const char* opt) {
Mat kernel;
int row = input.rows;
int col = input.cols;
int kernel_size = (2 * n + 1);
int tempa;
int tempb;
float denom;
float kernelvalue{};
// Initialiazing Kernel Matrix
kernel = Mat::zeros(kernel_size, kernel_size, CV_32F);
denom = 0.0;
for (int a = -n; a <= n; a++) { // Denominator in m(s,t)
for (int b = -n; b <= n; b++) {
float value1 = exp(-(pow(a, 2) / (2 * pow(sigmaS, 2))) - (pow(b, 2) / (2 * pow(sigmaT, 2))));
kernel.at<float>(a + n, b + n) = value1;
denom += value1;
}
}
for (int a = -n; a <= n; a++) { // Denominator in m(s,t)
for (int b = -n; b <= n; b++) {
kernel.at<float>(a + n, b + n) /= denom;
}
}
Mat output = Mat::zeros(row, col, input.type());
for (int i = 0; i < row; i++) {
for (int j = 0; j < col; j++) {
float sum1 = 0.0;
for (int a = -n; a <= n; a++) {
for (int b = -n; b <= n; b++) {
/* Gaussian filter with Zero-paddle boundary process:
Fill the code:
*/
if ((i + a <= row - 1) && (i + a >= 0) && (j + b <= col - 1) && (j + b >= 0)) { //if the pixel is not a border pixel
/*here is a failure: Severity Code Description Project File Line Suppression State
Error (active) E0349 no operator "+=" */
sum1 += kernel * (float)(input.at<G>(i + a, j + b));
}
}
}
output.at<G>(i, j) = (G)sum1;
}
}
return output;
}
sum1 - float , kernel - Mat. You can't += for float and Mat. You need to append indices.
sum1 += kernel.at<float>(a+n,b+n) * (float)(input.at<G>(i + a, j + b));
I think.

Drawing the top and bottom of a cylinder

I'm trying to create a class that can procedurally create prisms (or cylinders if the precision is high enough) but only the sides of the 3d model are showing (not the top and bottom). This is using openGL and c++. Not going for efficiency, just modifying a previous class that made a sphere.
#define numSlices 2
Prism::Prism() {
init(3);
}
Prism::Prism(int prec) {
init(prec);
}
float Prism::toRadians(float degrees) { return (degrees * 2.0f * 3.14159f) / 360.0f; }
void Prism::init(int prec) {
prec = (prec < 3) ? 3 : prec;
numVertices = (prec + 1) * (numSlices+1);
numIndices = prec * numSlices * 6;
for (int i = 0; i < numVertices; i++) { vertices.push_back(glm::vec3()); }
for (int i = 0; i < numVertices; i++) { texCoords.push_back(glm::vec2()); }
for (int i = 0; i < numVertices; i++) { normals.push_back(glm::vec3()); }
for (int i = 0; i < numVertices; i++) { tangents.push_back(glm::vec3()); }
for (int i = 0; i < numIndices; i++) { indices.push_back(0); }
// calculate triangle vertices
for (int i = 0; i <= numSlices; i++) {
for (int j = 0; j <= prec; j++) {
float y = i;
float x = -(float)cos(toRadians(j * 360.0f / (float)prec));
float z = (float)sin(toRadians(j * 360.0f / (float)prec));
vertices[i * (prec + 1) + j] = glm::vec3(x, y, z);
texCoords[i * (prec + 1) + j] = glm::vec2(((float)j / prec), ((float)i / numSlices));
}
}
// calculate triangle indices
for (int i = 0; i < numSlices; i++) {
for (int j = 0; j < prec; j++) {
indices[6 * (i * prec + j) + 0] = i * (prec + 1) + j;
indices[6 * (i * prec + j) + 1] = i * (prec + 1) + j + 1;
indices[6 * (i * prec + j) + 2] = (i + 1) * (prec + 1) + j;
indices[6 * (i * prec + j) + 3] = i * (prec + 1) + j + 1;
indices[6 * (i * prec + j) + 4] = (i + 1) * (prec + 1) + j + 1;
indices[6 * (i * prec + j) + 5] = (i + 1) * (prec + 1) + j;
}
}
}
Any tips or solutions that stick closely to the code already written would much appreciated.
To render the top and bottom of the cylinder, you can create a "triangle fan" that starts from a vertex at the center of the top/bottom of the cylinder and creates one triangle for every side.
Adapting your code: (untested, I may have made mistakes against winding order)
int bottom_center = vertices.length(); vertices.push_back(glm::vec3(0,0,0));
int top_center = vertices.length(); vertices.push_back(glm::vec3(0,numSlices,0));
// Bottom
for (int j = 0; j < prec; j++) {
int base = 0;
indices.push_back(bottom_center);
indices.push_back(base+j);
indices.push_back(base+j+1);
}
// Top
for (int j = 0; j < prec; j++) {
int base = numSlices * (prec+1);
indices.push_back(top_center);
indices.push_back(base+j);
indices.push_back(base+j+1);
}
See http://www.songho.ca/opengl/gl_cylinder.html for a more worked-out example.

if statement runtime error

I originally had 3 equations: Pu, Pm & Pd. It ran fine.
Once I introduced the if statement, with variations on the 3 equations, depending on the loop iteration, I receive a runtime error.
Any help would be appreciated.
Cheers in advance.
#include <cmath>
#include <iostream>
#include <vector>
#include <iomanip>
int Rounding(double x)
{
int Integer = (int)x;
double Decimal = x - Integer;
if (Decimal > 0.49)
{
return (Integer + 1);
}
else
{
return Integer;
}
}
int main()
{
double a = 0.1;
double sigma = 0.01;
int delta_t = 1;
double M = -a * delta_t;
double V = sigma * sigma * delta_t;
double delta_r = sqrt(3 * V);
int count;
double PuValue;
double PmValue;
double PdValue;
int j_max;
int j_min;
j_max = Rounding(-0.184 / M);
j_min = -j_max;
std::vector<std::vector<double>> Pu((20), std::vector<double>(20));
std::vector<std::vector<double>> Pm((20), std::vector<double>(20));
std::vector<std::vector<double>> Pd((20), std::vector<double>(20));
std::cout << std::setprecision(10);
for (int i = 0; i <= 2; i++)
{
count = 0;
for (int j = i; j >= -i; j--)
{
count = count + 1;
if (j = j_max) // Exhibit 1C
{
PuValue = 7.0/6.0 + (j * j * M * M + 3 * j * M)/2.0;
PmValue = -1.0/3.0 - j * j * M * M - 2 * j * M;
PdValue = 1.0/6.0 + (j * j * M * M + j * M)/2.0;
}
else if (j = j_min) // Exhibit 1B
{
PuValue = 1.0/6.0 + (j * j * M * M - j * M)/2.0;
PmValue = -1.0/3.0 - j * j * M * M + 2 * j * M;
PdValue = 7.0/6.0 + (j * j * M * M - 3 * j * M)/2.0;
}
else
{
PuValue = 1.0/6.0 + (j * j * M * M + j * M)/2.0;
PmValue = 2.0/3.0 - j * j * M * M;
PdValue = 1.0/6.0 + (j * j * M * M - j * M)/2.0;
}
Pu[count][i] = PuValue;
Pm[count][i] = PmValue;
Pd[count][i] = PdValue;
std::cout << Pu[count][i] << ", ";
}
std::cout << std::endl;
}
return 0;
}
You are assigning instead of checking for equal: j_max to j in your if statements.
if (j = j_max)
// ^
else if (j = j_min)
// ^
Change if (j = j_max) to if (j == j_max),
And else if (j = j_min) to else if (j == j_min).
Correct the following if conditional check and all other instances of an if check
if(j=j_max)
with
if (j == j_max)
you are checking for an equality not assigning.
Your code was going into an infinite loop.

Laguerre interpolation algorithm, something's wrong with my implementation

This is a problem I have been struggling for a week, coming back just to give up after wasted hours...
I am supposed to find coefficents for the following Laguerre polynomial:
P0(x) = 1
P1(x) = 1 - x
Pn(x) = ((2n - 1 - x) / n) * P(n-1) - ((n - 1) / n) * P(n-2)
I believe there is an error in my implementation, because for some reason the coefficents I get seem way too big. This is the output this program generates:
a1 = -190.234
a2 = -295.833
a3 = 378.283
a4 = -939.537
a5 = 774.861
a6 = -400.612
Description of code (given below):
If you scroll the code down a little to the part where I declare array, you'll find given x's and y's.
The function polynomial just fills an array with values of said polynomial for certain x. It's a recursive function. I believe it works well, because I have checked the output values.
The gauss function finds coefficents by performing Gaussian elimination on output array. I think this is where the problems begin. I am wondering, if there's a mistake in this code or perhaps my method of veryfying results is bad? I am trying to verify them like that:
-190.234 * 1.5 ^ 5 - 295.833 * 1.5 ^ 4 ... - 400.612 = -3017,817625 =/= 2
Code:
#include "stdafx.h"
#include <conio.h>
#include <iostream>
#include <iomanip>
#include <math.h>
using namespace std;
double polynomial(int i, int j, double **tab)
{
double n = i;
double **array = tab;
double x = array[j][0];
if (i == 0) {
return 1;
} else if (i == 1) {
return 1 - x;
} else {
double minusone = polynomial(i - 1, j, array);
double minustwo = polynomial(i - 2, j, array);
double result = (((2.0 * n) - 1 - x) / n) * minusone - ((n - 1.0) / n) * minustwo;
return result;
}
}
int gauss(int n, double tab[6][7], double results[7])
{
double multiplier, divider;
for (int m = 0; m <= n; m++)
{
for (int i = m + 1; i <= n; i++)
{
multiplier = tab[i][m];
divider = tab[m][m];
if (divider == 0) {
return 1;
}
for (int j = m; j <= n; j++)
{
if (i == n) {
break;
}
tab[i][j] = (tab[m][j] * multiplier / divider) - tab[i][j];
}
for (int j = m; j <= n; j++) {
tab[i - 1][j] = tab[i - 1][j] / divider;
}
}
}
double s = 0;
results[n - 1] = tab[n - 1][n];
int y = 0;
for (int i = n-2; i >= 0; i--)
{
s = 0;
y++;
for (int x = 0; x < n; x++)
{
s = s + (tab[i][n - 1 - x] * results[n-(x + 1)]);
if (y == x + 1) {
break;
}
}
results[i] = tab[i][n] - s;
}
}
int _tmain(int argc, _TCHAR* argv[])
{
int num;
double **array;
array = new double*[5];
for (int i = 0; i <= 5; i++)
{
array[i] = new double[2];
}
//i 0 1 2 3 4 5
array[0][0] = 1.5; //xi 1.5 2 2.5 3.5 3.8 4.1
array[0][1] = 2; //yi 2 5 -1 0.5 3 7
array[1][0] = 2;
array[1][1] = 5;
array[2][0] = 2.5;
array[2][1] = -1;
array[3][0] = 3.5;
array[3][1] = 0.5;
array[4][0] = 3.8;
array[4][1] = 3;
array[5][0] = 4.1;
array[5][1] = 7;
double W[6][7]; //n + 1
for (int i = 0; i <= 5; i++)
{
for (int j = 0; j <= 5; j++)
{
W[i][j] = polynomial(j, i, array);
}
W[i][6] = array[i][1];
}
for (int i = 0; i <= 5; i++)
{
for (int j = 0; j <= 6; j++)
{
cout << W[i][j] << "\t";
}
cout << endl;
}
double results[6];
gauss(6, W, results);
for (int i = 0; i < 6; i++) {
cout << "a" << i + 1 << " = " << results[i] << endl;
}
_getch();
return 0;
}
I believe your interpretation of the recursive polynomial generation either needs revising or is a bit too clever for me.
given P[0][5] = {1,0,0,0,0,...}; P[1][5]={1,-1,0,0,0,...};
then P[2] is a*P[0] + convolution(P[1], { c, d });
where a = -((n - 1) / n)
c = (2n - 1)/n and d= - 1/n
This can be generalized: P[n] == a*P[n-2] + conv(P[n-1], { c,d });
In every step there is involved a polynomial multiplication with (c + d*x), which increases the degree by one (just by one...) and adding to P[n-1] multiplied with a scalar a.
Then most likely the interpolation factor x is in range [0..1].
(convolution means, that you should implement polynomial multiplication, which luckily is easy...)
[a,b,c,d]
* [e,f]
------------------
af,bf,cf,df +
ae,be,ce,de, 0 +
--------------------------
(= coefficients of the final polynomial)
The definition of P1(x) = x - 1 is not implemented as stated. You have 1 - x in the computation.
I did not look any further.