Speed up for loop with .push_back - C++

I'm wondering if we can speed up this loop using OpenMP or CUDA. Currently it runs fine with sequential processing, but I'm trying to optimize my code:
for (int curCol = 0; curCol < numRows; ++curCol){ // Long loop
    int lb = csc_colIndices[curCol];
    int ub = csc_colIndices[curCol + 1];
    // push back the diagonal value to the L matrix
    vec_L_val[curCol].push_back(1.0f);
    vec_L_indices[curCol].push_back(curCol);
    for (int curIndex = lb; curIndex < ub; ++curIndex){
        int curRow = csc_indices[curIndex];
        float curVal = csc_val[curIndex];
        if (!Equal(curVal, 0) && curRow <= curCol){ // U entry
            vec_U_val[curCol].push_back(curVal);
            vec_U_indices[curCol].push_back(curRow);
        }
        else if (!Equal(curVal, 0) && curRow > curCol){ // L entry
            vec_L_val[curCol].push_back(curVal);
            vec_L_indices[curCol].push_back(curRow);
        }
    }
}
In an effort to parallelise the processing I've tried the following with no effect:
#pragma omp parallel for private(curCol) shared(curIndex)
My suspicion is the use of .push_back, but I could be wrong...
How can I improve this code?

First, preallocate the whole thing:
for (int curCol = 0; curCol < numRows; ++curCol)
{
    // reserve (rather than resize) so the push_back calls below can
    // append without reallocating and without pre-filling dummy elements
    vec_L_val[curCol].reserve( SIZE_OF_THE_INNER_VECTOR );
    vec_L_indices[curCol].reserve( SIZE_OF_THE_INNER_VECTOR );
    vec_U_val[curCol].reserve( SIZE_OF_THE_INNER_VECTOR );
    vec_U_indices[curCol].reserve( SIZE_OF_THE_INNER_VECTOR );
}
Then your inner loops should run faster, as there is no need for reallocations inside the inner vector structures:
for (int curCol = 0; curCol < numRows; ++curCol){ // Long loop
    int lb = csc_colIndices[curCol];
    int ub = csc_colIndices[curCol + 1];
    // push back the diagonal value to the L matrix
    vec_L_val[curCol].push_back(1.0f);
    vec_L_indices[curCol].push_back(curCol);
    for (int curIndex = lb; curIndex < ub; ++curIndex){
        int curRow = csc_indices[curIndex];
        float curVal = csc_val[curIndex];
        if (!Equal(curVal, 0) && curRow <= curCol){ // U entry
            vec_U_val[curCol].push_back(curVal);
            vec_U_indices[curCol].push_back(curRow);
        }
        else if (!Equal(curVal, 0) && curRow > curCol){ // L entry
            vec_L_val[curCol].push_back(curVal);
            vec_L_indices[curCol].push_back(curRow);
        }
    }
}
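Since each curCol iteration writes only to its own column's vectors, the iterations are independent of one another, so the outer loop can also be handed to OpenMP once the capacity is reserved. A minimal sketch, assuming the outer vectors (vec_L_val, vec_L_indices, vec_U_val, vec_U_indices) are already sized to numRows and that Equal has no side effects:
#pragma omp parallel for
for (int curCol = 0; curCol < numRows; ++curCol){ // each thread handles its own columns
    int lb = csc_colIndices[curCol];
    int ub = csc_colIndices[curCol + 1];
    // only element [curCol] of the outer vectors is touched here,
    // so no two threads ever push_back into the same inner vector
    vec_L_val[curCol].push_back(1.0f);
    vec_L_indices[curCol].push_back(curCol);
    for (int curIndex = lb; curIndex < ub; ++curIndex){
        int curRow = csc_indices[curIndex];
        float curVal = csc_val[curIndex];
        if (!Equal(curVal, 0) && curRow <= curCol){ // U entry
            vec_U_val[curCol].push_back(curVal);
            vec_U_indices[curCol].push_back(curRow);
        }
        else if (!Equal(curVal, 0) && curRow > curCol){ // L entry
            vec_L_val[curCol].push_back(curVal);
            vec_L_indices[curCol].push_back(curRow);
        }
    }
}
If the number of nonzeros varies a lot from column to column, adding a schedule(dynamic) clause may balance the work better across threads.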

Related

Why does OMP nested parallelism execution output differently than linear execution?

I'm attempting to compare execution times when detecting the edges of an image in a linear way and in a parallel way. Everything works fine in the linear version, but in the parallel version the written image has too many white pixels in one part of the image. To better show what I mean, see the image below:
The left image is the output of the code executed linearly, and on the right is the one using parallelism. You can see the edges of the buildings in both images, and the bottom part of the right image, close to its border, doesn't have the same issue as the rest of it.
I cropped the "critical" part of the code that does this task, in the hope that someone may know what may be causing this.
omp_set_nested(1);
#pragma omp parallel
while(col < cols-1) {
    line = 1;
    #pragma omp parallel
    while(line < lines-1) {
        gradient_x = 0;
        gradient_y = 0;
        for(int m = 0; m < mask_size; m++) {
            for(int n = 0; n < mask_size; n++) {
                int np_x = line + (m - 1);
                int np_y = col + (n - 1);
                float v = img(np_y, np_x);
                int mask_index = (m*3) + n;
                gradient_x = gradient_x + (x_mask[mask_index] * v);
                gradient_y = gradient_y + (y_mask[mask_index] * v);
            }
        }
        float gradient_sum = sqrt((gradient_x * gradient_x) + (gradient_y * gradient_y));
        if(gradient_sum >= 255) {
            gradient_sum = 255;
        } else if(gradient_sum <= 0) {
            gradient_sum = 0;
        }
        output(line, col) = gradient_sum;
        #pragma omp critical
        line++;
    }
    #pragma omp critical
    col++;
}
I defined the line and col variables as critical because they are the ones used for both reading and writing data, and I believe everything else is working properly.
Without more context, it is hard to tell. Nonetheless, those two nested parallel regions do not make sense, because you are not distributing work among threads; instead, you are just executing the same code on multiple threads, with possible race conditions on the updates of the variables gradient_x and gradient_y, among others. Start with the following simpler parallel code:
omp_set_nested(0);
while(col < cols-1) {
    line = 1;
    while(line < lines-1) {
        gradient_x = 0;
        gradient_y = 0;
        #pragma omp parallel for reduction(+:gradient_x,gradient_y)
        for(int m = 0; m < mask_size; m++) {
            for(int n = 0; n < mask_size; n++) {
                int np_x = line + (m - 1);
                int np_y = col + (n - 1);
                float v = img(np_y, np_x);
                int mask_index = (m*3) + n;
                gradient_x = gradient_x + (x_mask[mask_index] * v);
                gradient_y = gradient_y + (y_mask[mask_index] * v);
            }
        }
        float gradient_sum = sqrt((gradient_x * gradient_x) + (gradient_y * gradient_y));
        if(gradient_sum >= 255) {
            gradient_sum = 255;
        } else if(gradient_sum <= 0) {
            gradient_sum = 0;
        }
        output(line, col) = gradient_sum;
        line++;
    }
    col++;
}
You can try the following:
#pragma omp parallel for collapse(2)
for(int col = 1; col < cols-1; col++) {
    for(int line = 1; line < lines-1; line++) {
        float gradient_x = 0;
        float gradient_y = 0;
        for(int m = 0; m < mask_size; m++) {
            for(int n = 0; n < mask_size; n++) {
                int np_x = line + (m - 1);
                int np_y = col + (n - 1);
                float v = img(np_y, np_x);
                int mask_index = (m*3) + n;
                gradient_x = gradient_x + (x_mask[mask_index] * v);
                gradient_y = gradient_y + (y_mask[mask_index] * v);
            }
        }
        float gradient_sum = sqrt((gradient_x * gradient_x) + (gradient_y * gradient_y));
        if(gradient_sum >= 255) {
            gradient_sum = 255;
        } else if(gradient_sum <= 0) {
            gradient_sum = 0;
        }
        output(line, col) = gradient_sum;
    }
}
Of course, you still need to check for race conditions in the code that you cropped out.
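Since the original goal is to compare the execution times of the linear and parallel versions, here is a minimal timing sketch using omp_get_wtime; edgeDetectSerial and edgeDetectParallel are hypothetical wrappers for the two loops above:
#include <cstdio>
#include <omp.h>

// Hypothetical wrappers: put the linear loop from the question in the first
// one and the collapse(2) loop from above in the second one.
void edgeDetectSerial()   { /* linear edge-detection loop */ }
void edgeDetectParallel() { /* parallel edge-detection loop */ }

int main() {
    double t0 = omp_get_wtime();
    edgeDetectSerial();
    double t1 = omp_get_wtime();
    edgeDetectParallel();
    double t2 = omp_get_wtime();
    std::printf("linear:   %.3f s\n", t1 - t0);
    std::printf("parallel: %.3f s\n", t2 - t1);
    return 0;
}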

Optimize outer loop with OpenMP and a reduction

I'm struggling a bit with a function. The calculation is wrong if I try to parallelize the outer loop with a
#pragma omp parallel reduction(+:det).
Can someone show me how to solve it and why it is failing?
// template<class T> using vector2D = std::vector<std::vector<T>>;
float Det(vector2DF &a, int n)
{
    vector2DF m(n - 1, vector1DF(n - 1, 0));
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    for (int i = 0; i < n; i++)
    {
        int l = 0;
        #pragma omp parallel for private(l)
        for (int j = 1; j < n; j++)
        {
            l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}
If you parallelize the outer loop, there is a race condition on this line:
m[j - 1][l] = a[j][k];
Also, you likely want a parallel for reduction instead of just a parallel reduction.
The issue is that m is shared, even though that wouldn't be necessary given that it is completely overwritten in the inner loop. Always declare variables as locally as possible; this avoids issues with wrongly shared variables, e.g.:
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    #pragma omp parallel for reduction(+:det)
    for (int i = 0; i < n; i++)
    {
        vector2DF m(n - 1, vector1DF(n - 1, 0));
        for (int j = 1; j < n; j++)
        {
            int l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}
Now that is correct, but since m can be expensive to allocate, performance could benefit from not doing it in each and every iteration. This can be done by splitting the parallel and for directives like so:
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    #pragma omp parallel reduction(+:det)
    {
        vector2DF m(n - 1, vector1DF(n - 1, 0));
        #pragma omp for
        for (int i = 0; i < n; i++)
        {
            for (int j = 1; j < n; j++)
            {
                int l = 0;
                for (int k = 0; k < n; k++)
                {
                    if (k == i) continue;
                    m[j - 1][l] = a[j][k];
                    l++;
                }
            }
            det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
        }
    }
    return det;
}
Now you could also just declare m as firstprivate, but that would assume that the copy constructor makes a completely independent deep copy and would thus make the code more difficult to reason about.
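For reference, a minimal sketch of that firstprivate variant, under the assumption stated above that the copy constructor produces an independent deep copy (which it does for nested std::vector):
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    vector2DF m(n - 1, vector1DF(n - 1, 0));
    // firstprivate gives every thread its own copy of m, so the writes
    // in the inner loops cannot race between threads
    #pragma omp parallel for reduction(+:det) firstprivate(m)
    for (int i = 0; i < n; i++)
    {
        for (int j = 1; j < n; j++)
        {
            int l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}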
Please be aware that you should always include expected output, actual output and a minimal complete and verifiable example.

C++ knapsack implementation

I have a problem with my knapsack algorithm. To be honest, I don't have any idea what is wrong. When I run the program once, everything works fine, but when I run it in a loop (for testing) I get a lot of problems.
For example:
Weight/Val in file: 100
Max knapsack capacity: 1000
First iteration:
Max profit: 2597
The resulting weight: 994/1000
And that's fine, but now another iteration.
Second iteration:
Max profit: 2538
The resulting weight: 1004/1000 <- and there is my problem, it's over my max capacity.
The 3rd and 4th iterations were okay, then the 5th was wrong (1355/1000), and so on.
My function where the problem possibly is:
void intoKnapsack(int k, float actual_profit, float actual_weight)
{
    if (actual_weight + weight[k] <= cap)
    {
        tmp[k] = 1;
        if (k <= number_items)
            intoKnapsack(k + 1, actual_profit + value[k], actual_weight + weight[k]);
        if (((actual_profit + value[k]) > final_profit) && (k == number_items))
        {
            final_profit = actual_profit + value[k];
            final_weight = actual_weight + weight[k];
            for (j = 0; j <= k; j++)
                knap[j] = tmp[j];
        }
    }
    else if ((bound(actual_profit, actual_weight, k) >= final_profit))
    {
        tmp[k] = 0;
        if (k <= number_items)
            intoKnapsack(k + 1, actual_profit, actual_weight);
        if ((actual_profit > final_profit) && (k == number_items))
        {
            final_profit = actual_profit;
            final_weight = actual_weight;
            for (j = 0; j <= k; j++)
                knap[j] = tmp[j];
        }
    }
}
Can someone help with my problem?
OK, so when I read the same N only once (like 100 in the example above), it works fine, but when I do it in a loop:
srand((unsigned int) time(NULL));
algorytm a;
fstream wynik;
wynik.open("result.txt", ios::out | ios::app);
for(int i = 0; i < how_test; i++){ // how many tests
    write(how_n);                  // how many n in my file, and create the file
    a.read();                      // read from the file (n, and weight / val)
    time_start();
    a.sort();                      // I sort it
    a.intoKnapsack(0, 0.0, 0.0);   // my function above; I pass three zeros here to run it properly over and over in the loop
    get_time();                    // stop time
    measurement += get_time();
    wynik << get_time() << " s." << endl; // just for
}
So when I do, for example, write(50) and then write(51) in the same program, and so on, it works fine; but when I do write(50) and then another write(50), the algorithm goes wrong.
Maybe the problem is that I do the sort without clearing the knapsack in another loop first, but on the other hand I first need to do the sort.
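If that suspicion is right, one thing worth trying is clearing the search state between test iterations. A hypothetical sketch (assuming final_profit, final_weight, tmp, and knap are members of the algorytm class, which is only a guess based on the calls above), to be called right before a.intoKnapsack(0, 0.0, 0.0) in the test loop:
// Hypothetical helper: reset everything the search accumulates,
// so one test iteration cannot leak results into the next
void algorytm::reset()
{
    final_profit = 0;
    final_weight = 0;
    for (int idx = 0; idx <= number_items; idx++) {
        tmp[idx] = 0;
        knap[idx] = 0;
    }
}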
Here is my sort function:
void algorytm::sort() {
    int a;
    int b;
    float c;
    for (i = 0; i < number_items; i++)
        factor[i] = (float) val[i] / (float) weight[i]; // to sort from best to worst
    for (i = 0; i < number_items - 1; i++) {
        for (j = i + 1; j < number_items; j++) {
            if (factor[i] < factor[j]) {
                c = factor[i];
                factor[i] = factor[j];
                factor[j] = c;
                a = val[i];
                val[i] = val[j];
                val[j] = a;
                b = weight[i];
                weight[i] = weight[j];
                weight[j] = b;
            }
        }
    }
}

Optimization of C++ code - std::vector operations

I have this function (RotateSlownessTop) and it's called about 800 times to compute the corresponding values. But the calculation is slow; is there a way I can make the computation faster?
The number of elements in X/Y is 7202 (a fairly large set).
I did the performance analysis and the screenshot has been attached.
void RotateSlownessTop(vector<double> &XR1, vector<double> &YR1, float theta = 0.0)
{
    Matrix2d a;
    a(0, 0) = cos(theta);
    a(0, 1) = -sin(theta);
    a(1, 0) = sin(theta);
    a(1, 1) = cos(theta);
    vector<double> XR2(7202), YR2(7202);
    for (size_t i = 0; i < X.size(); ++i)
    {
        XR2[i] = (a(0, 0)*X[i] + a(0, 1)*Y[i]);
        YR2[i] = (a(1, 0)*X[i] + a(1, 1)*Y[i]);
    }
    size_t i = 0;
    size_t j = 0;
    while (i < YR2.size())
    {
        if (i > 0)
            if ((XR2[i] > 0) && (XR2[i-1] < 0))
                j = i;
        if (YR2[i] > (-1e-10) && YR2[i] < 0.0)
            YR2[i] = 0.0;
        if (YR2[i] < (1e-10) && YR2[i] > 0.0)
            YR2[i] = -YR2[i];
        if (YR2[i] < 0.0)
        {
            YR2.erase(YR2.begin() + i);
            XR2.erase(XR2.begin() + i);
            --i;
        }
        ++i;
    }
    size_t k = 0;
    while (j < YR2.size())
    {
        YR1[k] = (YR2[j]);
        XR1[k] = (XR2[j]);
        YR2.erase(YR2.begin() + j);
        XR2.erase(XR2.begin() + j);
        ++k;
    }
    size_t l = 0;
    for (; k < XR1.size(); ++k)
    {
        XR1[k] = XR2[l];
        YR1[k] = YR2[l];
        l++;
    }
}
Edit 1: I have updated the code by replacing all push_back() calls with operator[], since I read somewhere that this is much faster.
However, the whole program is still slow. Any suggestions are appreciated.
If the size is large, you can improve the push_back by pre-allocating the space needed. Add this before the loop:
XR2.reserve(X.size());
YR2.reserve(X.size());
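As a concrete illustration, a minimal sketch of that suggestion applied to a push_back-based fill loop, assuming X and Y are the vectors the function reads from (as in the code above):
vector<double> XR2, YR2;
XR2.reserve(X.size());  // allocate the needed capacity once up front...
YR2.reserve(X.size());
for (size_t i = 0; i < X.size(); ++i)
{
    // ...so these push_back calls never trigger a reallocation
    XR2.push_back(a(0, 0)*X[i] + a(0, 1)*Y[i]);
    YR2.push_back(a(1, 0)*X[i] + a(1, 1)*Y[i]);
}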

Optimizing for loops in C++ code

Integer Range = 1;
for(Integer k = -Range; k <= Range; ++k)
{
    for(Integer j = -Range; j <= Range; ++j)
    {
        for(Integer i = -Range; i <= Range; ++i)
        {
            Integer MCID = GetCellID(&CONSTANT_BOUNDINGBOX, CIDX + i, CIDY + j, CIDZ + k);
            if(MCID < 0 || MCID >= c_CellNum)
            {
                continue;
            }
            unsigned int TriangleNum = c_daCell[MCID].m_TriangleNum;
            for(unsigned int l = 0; l < TriangleNum; ++l)
            {
                TriangleID = c_daCell[MCID].m_TriangleID[l];
                if(TriangleID >= 0 && TriangleID < c_TriangleNum && TriangleID != NearestID) // No need to calculate again for the same triangle
                {
                    CDistance Distance;
                    Distance.Magnitude = CalcDistance(&c_daTriangles[TriangleID], &TargetPosition, &Distance.Direction);
                    if(Distance.Magnitude < NearestDistance.Magnitude)
                    {
                        NearestDistance = Distance;
                        NearestID = TriangleID;
                    }
                }
            }
        }
    }
}
c_daSTLDistance[ID] = NearestDistance;
c_daSTLID[ID] = NearestID;
GetCellID is the function that returns the cell ID in the variable CID, given the position CIDX, CIDY, CIDZ along the three axes.
The code above calculates the distance, specifically the STL distance between a point and the triangles of the STL mesh. The code runs fine; however, the problem is that it is too slow, as it has a large number of loops. My concern now is to optimize the loops. Is there any technique for optimizing the loops within the code?
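One general technique, in the spirit of the OpenMP examples above, is to parallelize the candidate loop while each thread keeps its own best match, merging the per-thread results once at the end. The following is only a self-contained sketch of that pattern with simplified stand-in data (a flat array of candidate distances), not a drop-in replacement for the cell/triangle code above:
#include <cfloat>
#include <cstdio>
#include <omp.h>

int main() {
    const int numCandidates = 1000000;
    static float candidateDist[1000000];
    for (int t = 0; t < numCandidates; ++t)          // fake candidate distances
        candidateDist[t] = (float)((t * 2654435761u) % 100000) / 100.0f;

    float nearestDistance = FLT_MAX;
    int nearestID = -1;

    #pragma omp parallel
    {
        float localBest = FLT_MAX;                   // per-thread minimum
        int localID = -1;
        #pragma omp for nowait
        for (int t = 0; t < numCandidates; ++t) {
            if (candidateDist[t] < localBest) {
                localBest = candidateDist[t];
                localID = t;
            }
        }
        #pragma omp critical                         // merge once per thread
        {
            if (localBest < nearestDistance) {
                nearestDistance = localBest;
                nearestID = localID;
            }
        }
    }

    std::printf("nearest id %d, distance %f\n", nearestID, nearestDistance);
    return 0;
}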