OpenMP Race Condition when finding Closest Pair - C++

I'm doing an assignment to find the closest pair between two disjoint sets A and B. I'm using OpenMP to parallelize the recursion of the algorithm, but I am running into some data races. I am very new to OpenMP, so I think it has something to do with incorrect privatization/sharing of variables. I have put the full algorithm below:
float OMPParticleSim::efficient_closest_pair(int n, vector<Particle> & p, vector<Particle> & q)
{
// brute force
if(n <= 3) {
float m = numeric_limits<float>::max();
for(int i = 0; i < n - 2; i++) {
for(int j = i + 1; j < n - 1; j++) {
if((set_A.find(p[i].id) != set_A.end() && set_A.find(p[j].id) != set_A.end()) || (set_B.find(p[i].id) != set_B.end() && set_B.find(p[j].id) != set_B.end())) {
continue;
}
float distsq = pow(p[i].x - p[j].x, 2) + pow(p[i].y - p[j].y, 2) + pow(p[i].z - p[j].z, 2);
pair<pair<Particle, Particle>, float> pa = make_pair(make_pair(p[i], p[j]), sqrt(distsq));
#pragma omp critical
insert(pa);
m = min(m, distsq);
}
}
return sqrt(m);
}
// copy first ceil(n/2) points of p to pl
vector<Particle> pl;
int ceiling = ceil(n/2);
for(int i = 0; i < ceiling; i++) {
pl.push_back(p[i]);
}
// copy first ceil(n/2) points of q to ql
vector<Particle> ql;
for(int i = 0; i < ceiling; i++) {
ql.push_back(q[i]);
}
// copy remaining floor(n/2) points of p to pr
vector<Particle> pr;
for(int i = ceiling; i < p.size(); i++) {
pr.push_back(p[i]);
}
// copy remaining floor(n/2) points of q to qr
vector<Particle> qr;
for(int i = ceiling; i < q.size(); i++) {
qr.push_back(p[i]);
}
float dl, dr, d;
#pragma omp task firstprivate(pl, ql, p, q, n) private(dl) shared(closest_pairs)
dl = efficient_closest_pair(ceil(n / 2), pl, ql);
#pragma omp task firstprivate(pl, ql, p, q, n) private(dr) shared(closest_pairs)
dr = efficient_closest_pair(ceil(n / 2), pr, qr);
#pragma omp taskwait
d = min(dl, dr);
float m = p[ceil(n / 2) - 1].x;
vector<Particle> s;
for(int i = 0; i < q.size(); i++) {
if(fabs(q[i].x - m) < d) {
s.push_back(Particle(q[i]));
}
}
int num = s.size();
float dminsq = d * d;
for (int i = 0; i < num - 2; i++) {
int k = i + 1;
while(k <= num - 1 && pow(s[k].y - s[i].y, 2) < dminsq) {
if((set_A.find(s[i].id) != set_A.end() && set_A.find(s[k].id) != set_A.end()) || (set_B.find(s[i].id) != set_B.end() && set_B.find(s[k].id) != set_B.end())) {
k++;
continue;
}
float dist = pow(s[k].x - s[i].x, 2) + pow(s[k].y - s[i].y, 2) + pow(s[k].z - s[i].z, 2);
pair<pair<Particle, Particle>, float> pa = make_pair(make_pair(s[i], s[k]), sqrt(dist));
#pragma omp critical
insert(pa);
dminsq = min(dist, dminsq);
k++;
}
}
return sqrt(dminsq);
}
The insert method looks like this:
void OMPParticleSim::insert(pair<pair<Particle, Particle>, float> & pair) {
if(closest_pairs.size() == 0) {
closest_pairs.push_back(pair);
return;
}
for(int i = 0; i < closest_pairs.size(); ++i) {
if(closest_pairs[i].second > pair.second) {
closest_pairs.insert(closest_pairs.begin() + i, 1, pair);
break;
}
}
if(closest_pairs.size() > k) {
closest_pairs.pop_back();
}
}
The start of the parallel region is here:
void OMPParticleSim::do_closest_pair(int num_threads) {
vector<Particle> p = set;
// presort on x
sort(p.begin(), p.end(), sortxomp);
vector<Particle> q = p;
// presort on y
sort(q.begin(), q.end(), sortyomp);
float cp;
#pragma omp parallel num_threads(num_threads)
{
#pragma omp single
{
cp = efficient_closest_pair(set.size(), p, q);
}
}
sort(closest_pairs.begin(), closest_pairs.end(), sortpairsomp);
}
All of the results are stored in a list closest_pairs and output to a file. The reason I know there are data races is that some of the Particle ids are negative (all of them start positive), and running the program multiple times results in different values being written to the file. Any help would be great!

The error was that dl and dr should have been shared between the tasks: declared private, each task writes to its own uninitialized copy, so the values read after the taskwait are undefined.
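As a minimal, standalone illustration of that fix (this is not the poster's class; work() and the values are made up), the result variables are declared before the tasks and listed as shared, so the assignments made inside the tasks are visible to the parent after the taskwait:
#include <algorithm>
#include <cstdio>

static float work(float x) { return x * x; }

int main() {
    float dl = 0.0f, dr = 0.0f;
    #pragma omp parallel
    #pragma omp single
    {
        // shared(dl) / shared(dr): each task writes to the variable the parent
        // reads after the taskwait. With private(dl) the task would write to its
        // own uninitialized copy and the outer dl/dr would never receive the result.
        #pragma omp task shared(dl)
        dl = work(2.0f);
        #pragma omp task shared(dr)
        dr = work(3.0f);
        #pragma omp taskwait
        std::printf("min = %f\n", std::min(dl, dr));
    }
    return 0;
}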

How can I parallelize my code about deleting overlapping spheres?

I'm trying to parallelize a piece of code. What my code does is check whether some spheres (defined by their coordinates xcentro, ycentro, zcentro and their radii r) overlap each other or not. If they overlap, I must delete them, but as I don't know how to delete a component of a vector (it's a mess with the indices and such) I just set the radii to zero and do not take them into account later.
My problem comes when I try to parallelize the code. If I don't parallelize it, it works properly (although the code is not efficient at all and I need to run it with millions of spheres). If I try to parallelize it, I get several errors. For example, if I run the code exactly the way it is written below, I get a segmentation fault. If I eliminate the private(...) part, I don't get any error, but I don't obtain the same results as without parallelization.
What can I be doing wrong?
Here's the code:
vector<double> xcentro, ycentro, zcentro, r;
r.reserve(34000000);
xcentro.reserve(34000000);
ycentro.reserve(34000000);
zcentro.reserve(34000000);
... read files and fill up xcentro ycentro zcentro r with data ...
//#pragma omp parallel for private(i, j, xcentro, ycentro, zcentro, d) shared(r)
for (size_t i = 0; i < r.size() - 1; i++)
{
//#pragma omp parallel for private(i, j, xcentro, ycentro, zcentro, d) shared(r)
for (size_t j = i + 1; j < r.size() - 1; j++)
{
auto dist_square = (xcentro[i] - xcentro[j]) * (xcentro[i] - xcentro[j])
+ (ycentro[i] - ycentro[j]) * (ycentro[i] - ycentro[j])
+ (zcentro[i] - zcentro[j]) * (zcentro[i] - zcentro[j]);
if ( dist_square < (r[i]+r[j])*(r[i]+r[j]) )
{
//set the radius of the j-th sphere to 0
r[j] = 0;
//set the radius of the i-th sphere to 0
r[i] = 0;
}
}
}
Okay, let's first consider an algorithm which actually works, i.e. obtains the subset of spheres with no overlap. To this end, we don't remove a sphere (before checking whether it overlaps with another one) but merely record that it has overlaps.
struct sphere { double R,X,Y,Z; };
inline constexpr double square(double x) noexcept
{ return x*x; }
inline constexpr bool overlap(sphere const&a, sphere const&b) noexcept
{ return square(a.X-b.X)+square(a.Y-b.Y)+square(a.Z-b.Z) < square(a.R+b.R); } // overlap: centre distance smaller than the sum of radii
std::vector<sphere> keep_non_overlapping(std::vector<sphere> const&S)
{
std::vector<char> hasOverlap(S.size(), char(0));
vector<sphere> result;
for(size_t i=0; i<S.size(); ++i) {
for(size_t j=i+1; j<S.size(); ++j)
if((!hasOverlap[i] || !hasOverlap[j]) && overlap(S[i],S[j])) {
hasOverlap[i] = 1;
hasOverlap[j] = 1;
}
if(!hasOverlap[i])
result.push_back(S[i]);
}
return result;
}
This algorithm loops over every pair of spheres only once. Since the test between spheres k and l is done when i equals the smaller of the two and j the larger, the iterations of the loop over i are still not mutually independent: parallelizing it would introduce a race condition. This can be removed by looping over each pair of spheres twice:
std::vector<sphere> keep_non_overlapping(std::vector<sphere> const&S)
{
std::vector<char> hasOverlap(S.size(), char(0));
#pragma omp parallel for
for(size_t i=0; i<S.size(); ++i) {
bool overlapping = false;
for(size_t j=0; !overlapping && j<S.size(); ++j)
if(j!=i && overlap(S[i],S[j]))
overlapping = true;
hasOverlap[i] = overlapping;
}
vector<sphere> result;
for(size_t i=0; i<S.size(); ++i)
if(!hasOverlap[i])
result.push_back(S[i]);
return result;
}
Note also that, depending on the distribution of spheres, it can make the execution significantly faster if you first order the spheres by descending radius (largest spheres first), as in
std::sort(S.begin(), S.end(), [](sphere const&a, sphere const&b) { return a.R > b.R; });
Note further that this naive O(N^2) algorithm is not optimal. There is likely an O(N ln N) algorithm which first arranges the spheres in some data structure (perhaps a spatial tree) in O(N ln N) time and then finds whether a sphere is overlapping in no more than O(ln N) time for each sphere.
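A minimal sketch of that idea, using a uniform grid instead of a tree (the grid, the 2*maxR cell size and the function name mark_overlapping are choices of this sketch, not part of the answer above); the sphere struct and the overlap() test are repeated so the sketch stands alone:
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <map>
#include <tuple>
#include <vector>

// Same struct and overlap test as above, repeated for self-containedness.
struct sphere { double R, X, Y, Z; };
inline double square(double x) { return x * x; }
inline bool overlap(sphere const& a, sphere const& b)
{ return square(a.X - b.X) + square(a.Y - b.Y) + square(a.Z - b.Z) < square(a.R + b.R); }

// Flags every sphere that overlaps at least one other sphere.
std::vector<char> mark_overlapping(std::vector<sphere> const& S)
{
    std::vector<char> hasOverlap(S.size(), 0);
    double maxR = 0;
    for (auto const& s : S) maxR = std::max(maxR, s.R);
    const double cell = 2 * maxR;
    if (!(cell > 0)) return hasOverlap;  // all radii zero: nothing can overlap

    // Bucket the sphere indices by the grid cell containing their centre.
    auto cell_of = [cell](sphere const& s) {
        return std::make_tuple((long)std::floor(s.X / cell),
                               (long)std::floor(s.Y / cell),
                               (long)std::floor(s.Z / cell));
    };
    std::map<std::tuple<long, long, long>, std::vector<std::size_t>> grid;
    for (std::size_t i = 0; i < S.size(); ++i) grid[cell_of(S[i])].push_back(i);

    // Two spheres can only overlap if their centres are closer than 2*maxR,
    // so all overlap partners of sphere i sit in the 27 cells around its own.
    for (std::size_t i = 0; i < S.size(); ++i) {
        long cx, cy, cz;
        std::tie(cx, cy, cz) = cell_of(S[i]);
        for (long dx = -1; dx <= 1 && !hasOverlap[i]; ++dx)
            for (long dy = -1; dy <= 1 && !hasOverlap[i]; ++dy)
                for (long dz = -1; dz <= 1 && !hasOverlap[i]; ++dz) {
                    auto it = grid.find(std::make_tuple(cx + dx, cy + dy, cz + dz));
                    if (it == grid.end()) continue;
                    for (std::size_t j : it->second)
                        if (j != i && overlap(S[i], S[j])) { hasOverlap[i] = 1; break; }
                }
    }
    return hasOverlap;
}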
Here, I answer the question you asked in the comment:
How could I increase the speed of my program?
The best option is to completely change the algorithm (as already suggested), but if you do not wish to change it for any reason, you can gain ca. 20% speed by parallelizing the outer loop:
#pragma omp parallel for schedule(dynamic, r.size()/500)
for (size_t i = 0; i < r.size(); ++i)
{
for (size_t j = i + 1; j < r.size(); ++j)
{
if ((((xcentro[i] - xcentro[j]) * (xcentro[i] - xcentro[j]) + (ycentro[i] - ycentro[j]) * (ycentro[i] - ycentro[j]) + (zcentro[i] - zcentro[j]) * (zcentro[i] - zcentro[j])) < (r[i] + r[j]) * (r[i] + r[j])))
{
#pragma omp atomic write
overlaps[i] = 1;
#pragma omp atomic write
overlaps[j] = 1;
}
}
}
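For completeness, overlaps in the snippet above is assumed to be a zero-initialized vector the same length as r, declared before the loop (this declaration is not shown in the answer); note also that the dynamic chunk size r.size()/500 should be clamped to at least 1 for small inputs:
std::vector<char> overlaps(r.size(), 0);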
UPDATE:
Based on #Walter's response and code, I created a simple algorithm that is significantly faster than your code. The basic idea is as follows: sort the data according to x values and determine the largest radius. For a given x value, it is not necessary to go through the entire range; it is enough to examine those x values that are closer than twice the largest radius. Thus, the number of loop cycles can be significantly reduced and the speed of the algorithm is increased by orders of magnitude. I tested the speed difference between your code and the new algorithm with the code below, using arrays filled with data of randomly created spheres. I created the algorithm so that you don't have to change the rest of your program: the new_algorithm function takes the data from the xcentro, ycentro, zcentro, r arrays and returns the indexes of the overlapping spheres in the overlaps array passed to it (overlaps2 in the test below). On Compiler Explorer, a significant speed increase was observed:
size=20000
Runtime(your method)=1216 ms
Runtime(new algorithm)=13 ms
Note that this is a simple algorithm and it is easy to understand how it works, but better algorithms may be possible based on your real data. Here is the code:
#include <iostream>
#include <vector>
#include <chrono>
#include <omp.h>
#include <algorithm>
using namespace std;
constexpr size_t N=10000;
std::vector<double> xcentro, ycentro, zcentro, r;
struct sphere { double X,Y,Z,R; size_t index; };
std::vector<sphere> Spheres;
inline constexpr double square(double x) noexcept
{ return x*x; }
inline constexpr bool overlap(sphere const&a, sphere const&b) noexcept
{ return square(a.X-b.X)+square(a.Y-b.Y)+square(a.Z-b.Z) < square(a.R+b.R); }
void new_algorithm(const std::vector<double>& x, const std::vector<double>& y, const std::vector<double>& z, const std::vector<double>& r, std::vector<char>& overlaps)
{
const auto start = std::chrono::high_resolution_clock::now();
std::vector<sphere> S;
S.reserve(r.size());
for (size_t i = 0; i < r.size(); i++)
{
S.push_back(sphere{x[i],y[i],z[i],r[i], i});
}
//Sort ascending X
std::sort(S.begin(), S.end(), [](sphere const&a, sphere const&b) { return a.X < b.X; });
// Clear overlaps and determine maximum r value
double maxr=-1;
for (size_t i = 0; i < S.size(); i++)
{
overlaps[i]=0;
if(S[i].R>maxr) maxr=S[i].R;
}
//Create a vector for maximum indices
std::vector<size_t> max_index(S.size(),0);
//Determine maximum_index
size_t j=1;
for (size_t i = 0; i < S.size(); i++)
{
while(j<S.size() && S[j].X-S[i].X<2*maxr) // check the bound first to avoid reading past the end of S
{
j++;
}
max_index[i]=j;
}
#pragma omp parallel for
for(size_t i=0; i<S.size(); ++i)
{
for(size_t j=i+1; j<max_index[i]; ++j)
if(overlap(S[i],S[j]))
{
#pragma omp atomic write
overlaps[S[i].index] = 1;
#pragma omp atomic write
overlaps[S[j].index] = 1;
}
}
const auto stop = std::chrono::high_resolution_clock::now();
auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
std::cout << "Runtime(new algorithm)=" << diff.count() << " ms\n";
}
void your_algorithm(std::vector<char>& overlaps)
{
size_t i,j;
const auto start = std::chrono::high_resolution_clock::now();
#pragma omp parallel for
for(i=0; i<r.size(); i++)
{
overlaps[i]=0;
}
for (i = 0; i < r.size(); i++)
{
#pragma omp parallel for
for (j = i + 1; j < r.size(); j++)
{
if ((((xcentro[i] - xcentro[j]) * (xcentro[i] - xcentro[j]) + (ycentro[i] - ycentro[j]) * (ycentro[i] - ycentro[j]) + (zcentro[i] - zcentro[j]) * (zcentro[i] - zcentro[j])) < (r[i] + r[j]) * (r[i] + r[j])))
{
overlaps[i] = 1;
overlaps[j] = 1;
}
}
}
const auto stop = std::chrono::high_resolution_clock::now();
auto diff = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
std::cout << "Runtime(your method)=" << diff.count() << " ms" << std::endl;
}
int main() {
std::vector<char> overlaps1, overlaps2;
r.reserve(N);
xcentro.reserve(N);
ycentro.reserve(N);
zcentro.reserve(N);
overlaps1.resize(N); // resize rather than reserve: both algorithms index overlaps[i] directly
overlaps2.resize(N);
//fill the arrays with random numbers
for(size_t i=0; i<N; i++)
{
double x=(rand() % 1000)/10.0;
double y=(rand() % 1000)/10.0;
double z=(rand() % 1000)/10.0;
double R=(rand() % 10000)/((double)N ) + 0.1;
xcentro.push_back( x );
ycentro.push_back( y );
zcentro.push_back( z );
r.push_back(R);
}
std::cout << "size=" << r.size() << std::endl;
your_algorithm(overlaps1);
new_algorithm(xcentro,ycentro,zcentro,r,overlaps2);
// Check if array of overlap is the same for the 2 methods
for(size_t i=0; i<N; i++)
{
if(overlaps1[i]!=overlaps2[i])
{
cout << "error\n"; exit (-1);
}
}
cout << "OK\n";
}
UPDATE2: Here is the code mentioned in the comments (sort by R and remove only the bigger sphere of an overlapping pair):
std::vector<sphere> S;
S.reserve(r.size());
for (size_t i = 0; i < r.size(); i++)
{
overlaps[i]=0;
S.push_back(sphere{x[i],y[i],z[i],r[i], i});
}
//Sort descending R
std::sort(S.begin(), S.end(), [](sphere const&a, sphere const&b) { return a.R > b.R; });
#pragma omp parallel for
for(size_t i=0; i<S.size(); ++i)
{
for(size_t j=i+1; j<S.size(); ++j)
if(overlap(S[i],S[j]))
{
overlaps[S[i].index] = 1;
break;
}
}
Let us first improve your serial code a bit by avoiding looping over already deleted spheres:
for(size_t i = 0; i < r.size(); ++i)
if(r[i]>0) {
for(size_t j=i+1; j<r.size(); ++j)
if(r[j]>0 && (xcentro[i] - xcentro[j]) * (xcentro[i] - xcentro[j])
+ (ycentro[i] - ycentro[j]) * (ycentro[i] - ycentro[j])
+ (zcentro[i] - zcentro[j]) * (zcentro[i] - zcentro[j])
< (r[i]+r[j])*(r[i]+r[j]) ) {
r[j] = 0;
r[i] = 0;
}
}
This immediately shows you that the execution of the outer loop depends on all previous executions at smaller index, since these may have removed some of the spheres. This interdependence of the loop executions implies that your algorithm cannot be straightforwardly parallelized (in the way you attempted it).
Also, you have race conditions in the variables r[], which are read and written to. Your naive parallelization didn't take care of that problem either.
Just in case someone is still interested, I've improved my code and now it's more efficient (although not enough for me yet), and now it does eliminate overlapping spheres:
#pragma omp parallel for
for(i=0; i<r.size(); i++)
{
overlaps[i]=0;
}
cout << "overlaps igualados a cero..." << endl;
//What remains is to check which spheres overlap and eliminate them. First I check which spheres overlap,
//and afterwards I set the radius of those that overlap to zero.
double cero = 0.0;
for (i = 0; i < r.size(); i++)
{
contador=0;
#pragma omp parallel for reduction(+:contador)
for (j = i + 1; j < r.size(); j++)
{
if ((((xcentro[i] - xcentro[j]) * (xcentro[i] - xcentro[j]) + (ycentro[i] - ycentro[j]) * (ycentro[i] - ycentro[j]) + (zcentro[i] - zcentro[j]) * (zcentro[i] - zcentro[j])) < (r[i] + r[j]) * (r[i] + r[j])))
{
contador++;
overlaps[i] = contador;
overlaps[j]=contador;
}
}
}
#pragma omp parallel for
for(i=0; i<r.size(); i++)
{
if(overlaps[i]!=0)
{
r[i]=0;
}
}

Why does OMP nested parallelism execution outputs differently than linear execution?

I'm attempting to compare execution times when detecting the edges of an image in a linear way and in a parallel way. Everything works fine in the linear version, but in the parallel version the written image has too many white pixels in part of the image. To better show what I mean, see the image below:
The left image is the output of the code executed linearly, and the right one is from the parallel run. You can see the edges of the buildings in both images, and the bottom part of the right image close to its border doesn't have the same issue as the rest of it.
I cropped the "critical" part of the code that does this task, in the hope that someone may know what may be causing this.
omp_set_nested(1);
#pragma omp parallel
while(col<cols-1) {
line = 1;
#pragma omp parallel
while(line<lines-1) {
gradient_x = 0;
gradient_y = 0;
for(int m = 0; m < mask_size; m++) {
for(int n = 0; n < mask_size; n++) {
int np_x = line + (m - 1);
int np_y = col + (n - 1);
float v = img(np_y,np_x);
int mask_index = (m*3) + n;
gradient_x = gradient_x + (x_mask[mask_index] * v);
gradient_y = gradient_y + (y_mask[mask_index] * v);
}
}
float gradient_sum = sqrt((gradient_x * gradient_x) + (gradient_y * gradient_y));
if(gradient_sum >= 255) {
gradient_sum = 255;
} else if(gradient_sum <= 0) {
gradient_sum = 0;
}
output(line, col) = gradient_sum;
#pragma omp critical
line++;
}
#pragma omp critical
col++;
}
I put the line and col increments in critical sections because those variables are used for both reading and writing data, and I believe everything else is working properly.
Without more context, it is hard to tell. Nonetheless, those two nested parallel regions do not make sense, because you are not distributing work among threads; instead you are just executing the same code in multiple threads, with possible race conditions on the updates of the variables gradient_x and gradient_y, among others. Start with the following simpler parallel code:
omp_set_nested(0);
while(col<cols-1) {
line = 1;
while(line<lines-1) {
gradient_x = 0;
gradient_y = 0;
#pragma omp parallel for reduction(+:gradient_x,gradient_y)
for(int m = 0; m < mask_size; m++) {
for(int n = 0; n < mask_size; n++) {
int np_x = line + (m - 1);
int np_y = col + (n - 1);
float v = img(np_y,np_x);
int mask_index = (m*3) + n;
gradient_x = gradient_x + (x_mask[mask_index] * v);
gradient_y = gradient_y + (y_mask[mask_index] * v);
}
}
float gradient_sum = sqrt((gradient_x * gradient_x) + (gradient_y * gradient_y));
if(gradient_sum >= 255) {
gradient_sum = 255;
} else if(gradient_sum <= 0) {
gradient_sum = 0;
}
output(line, col) = gradient_sum;
line++;
}
col++;
}
You can try the following:
#pragma omp parallel for collapse(2)
for(int col = 0; col<cols-1; col++) {
for(int line = 1; line<lines-1; line++) {
float gradient_x = 0;
float gradient_y = 0;
for(int m = 0; m < mask_size; m++) {
for(int n = 0; n < mask_size; n++) {
int np_x = line + (m - 1);
int np_y = col + (n - 1);
float v = img(np_y,np_x);
int mask_index = (m*3) + n;
gradient_x = gradient_x + (x_mask[mask_index] * v);
gradient_y = gradient_y + (y_mask[mask_index] * v);
}
}
float gradient_sum = sqrt((gradient_x * gradient_x) +
(gradient_y * gradient_y));
if(gradient_sum >= 255) {
gradient_sum = 255;
} else if(gradient_sum <= 0) {
gradient_sum = 0;
}
output(line, col) = gradient_sum;
}
}
Of course, you need to check the race-condition in the code that you have cropped.

Low Accuracy of DNN

I've been implementing a NN recently based on http://neuralnetworksanddeeplearning.com/. I've made the whole algorithm for backprop and SGD almost the same way as the author of this book. The problem is that while he gets accuracy around 90% after one epoch, I get 30% after 5 epochs, even though I have the same hyperparameters. Do you have any idea what might be the cause?
Here's my repository.
https://github.com/PiPower/Deep-Neural-Network
Here is the part with the algorithm for backprop and SGD, implemented in Network.cpp:
void Network::Train(MatrixD_Array& TrainingData, MatrixD_Array& TrainingLabels, int BatchSize,int epochs, double LearningRate)
{
assert(TrainingData.size() == TrainingLabels.size() && CostFunc != nullptr && CostFuncDer != nullptr && LearningRate > 0);
std::vector<long unsigned int > indexes;
for (int i = 0; i < TrainingData.size(); i++) indexes.push_back(i);
std::random_device rd;
std::mt19937 g(rd());
std::vector<Matrix<double>> NablaWeights;
std::vector<Matrix<double>> NablaBiases;
NablaWeights.resize(Layers.size());
NablaBiases.resize(Layers.size());
for (int i = 0; i < Layers.size(); i++)
{
NablaWeights[i] = Matrix<double>(Layers[i].GetInDim(), Layers[i].GetOutDim());
NablaBiases[i] = Matrix<double>(1, Layers[i].GetOutDim());
}
//---- Epoch iterating
for (int i = 0; i < epochs; i++)
{
cout << "Epoch number: " << i << endl;
shuffle(indexes.begin(), indexes.end(), g);
// Batch iterating
for (int batch = 0; batch < TrainingData.size(); batch = batch + BatchSize)
{
for (int i = 0; i < Layers.size(); i++)
{
NablaWeights[i].Clear();
NablaBiases[i].Clear();
}
int i = 0;
while( i < BatchSize && (i+batch)< TrainingData.size())
{
std::vector<Matrix<double>> ActivationOutput;
std::vector<Matrix<double>> Z_Output;
ActivationOutput.resize(Layers.size() + 1);
Z_Output.resize(Layers.size());
ActivationOutput[0] = TrainingData[indexes[i + batch]];
int index = 0;
// Pushing values through
for (auto layer : Layers)
{
Z_Output[index] = layer.Mul(ActivationOutput[index]);
ActivationOutput[index + 1] = layer.ApplyActivation(Z_Output[index]);
index++;
}
// ---- Calculating Nabla that will later be divided by batch size element-wise
auto DeltaNabla = BackPropagation(ActivationOutput, Z_Output, TrainingLabels[indexes[i + batch]]);
for (int i = 0; i < Layers.size(); i++)
{
NablaWeights[i] = NablaWeights[i] + DeltaNabla.first[i];
NablaBiases[i] = NablaBiases[i] + DeltaNabla.second[i];
}
i++;
}
for (int g = 0; g < Layers.size(); g++)
{
Layers[g].Weights = Layers[g].Weights - NablaWeights[g] * LearningRate;
Layers[g].Biases = Layers[g].Biases - NablaBiases[g] * LearningRate;
}
// std::transform(NablaWeights.begin(), NablaWeights.end(), NablaWeights.begin(),[BatchSize, LearningRate](Matrix<double>& Weight) {return Weight * (LearningRate / BatchSize);});
//std::transform(NablaBiases.begin(), NablaBiases.end(), NablaBiases.begin(), [BatchSize, LearningRate](Matrix<double>& Bias) {return Bias * (LearningRate / BatchSize); });
}
}
}
std::pair<MatrixD_Array, MatrixD_Array> Network::BackPropagation( MatrixD_Array& ActivationOutput, MatrixD_Array& Z_Output,Matrix<double>& label)
{
MatrixD_Array NablaWeight;
MatrixD_Array NablaBias;
NablaWeight.resize(Layers.size());
NablaBias.resize(Layers.size());
auto zs = Layers[Layers.size() - 1].ActivationPrime(Z_Output[Z_Output.size() - 1]);
Matrix<double> Delta_L = Hadamard(CostFuncDer(ActivationOutput[ActivationOutput.size() - 1],label), zs);
NablaWeight[Layers.size() - 1] = Delta_L * ActivationOutput[ActivationOutput.size() - 2].Transpose();
NablaBias[Layers.size() - 1] = Delta_L;
for (int j = 2; j <= Layers.size() ; j++)
{
auto sp = Layers[Layers.size() - j].ActivationPrime(Z_Output[Layers.size() -j]);
Delta_L = Hadamard(Layers[Layers.size() - j+1 ].Weights.Transpose() * Delta_L, sp);
NablaWeight[Layers.size() - j] = Delta_L * ActivationOutput[ActivationOutput.size() -j-1].Transpose();
NablaBias[Layers.size() - j] = Delta_L;
}
return make_pair(NablaWeight, NablaBias);
}
It turned out that the MNIST loader didn't work correctly.
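A small, hypothetical sanity check (not taken from the repository) that can catch a broken loader early: read the standard MNIST IDX file headers directly and confirm the magic numbers, item counts and image dimensions before training. The file names here assume the usual uncompressed MNIST files in the working directory.
#include <cstdint>
#include <cstdio>
#include <fstream>

static uint32_t read_be32(std::ifstream& f) {
    // IDX headers store 32-bit integers in big-endian order
    unsigned char b[4];
    f.read(reinterpret_cast<char*>(b), 4);
    return (uint32_t(b[0]) << 24) | (uint32_t(b[1]) << 16) | (uint32_t(b[2]) << 8) | uint32_t(b[3]);
}

int main() {
    std::ifstream img("train-images-idx3-ubyte", std::ios::binary);
    std::ifstream lbl("train-labels-idx1-ubyte", std::ios::binary);
    if (!img || !lbl) { std::puts("MNIST files not found"); return 1; }
    uint32_t magic_i = read_be32(img), n_i = read_be32(img);
    uint32_t rows = read_be32(img), cols = read_be32(img);
    uint32_t magic_l = read_be32(lbl), n_l = read_be32(lbl);
    std::printf("images: magic=%u count=%u %ux%u\n", magic_i, n_i, rows, cols);
    std::printf("labels: magic=%u count=%u\n", magic_l, n_l);
    // The image file should report magic 2051 and 28x28 images,
    // the label file magic 2049, and the two counts must match.
    bool ok = magic_i == 2051 && magic_l == 2049 && n_i == n_l && rows == 28 && cols == 28;
    std::puts(ok ? "header check OK" : "header check FAILED");
    return ok ? 0 : 1;
}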

Optimize outer loop with OpenMP and a reduction

I struggle a bit with a function. The calculation is wrong if I try to parallelize the outer loop with a
#pragma omp parallel reduction(+:det).
Can someone show me how to solve it and why it is failing?
// template<class T> using vector2D = std::vector<std::vector<T>>;
float Det(vector2DF &a, int n)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
for (int i = 0; i < n; i++)
{
int l = 0;
#pragma omp parallel for private(l)
for (int j = 1; j < n; j++)
{
l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
return det;
}
If you parallelize the outer loop, there is a race condition on this line:
m[j - 1][l] = a[j][k];
Also you likely want a parallel for reduction instead of just a parallel reduction.
The issue is that m is shared, even though that wouldn't be necessary given that it is completely overwritten in the inner loop. Always declare variables as locally as possible; this avoids issues with wrongly shared variables, e.g.:
float Det(vector2DF &a, int n)
{
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
#pragma omp parallel for reduction(+:det)
for (int i = 0; i < n; i++)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
for (int j = 1; j < n; j++)
{
int l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
return det;
}
Now that is correct, but since m can be expensive to allocate, performance could benefit from not doing it in each and every iteration. This can be done by splitting the parallel and for directives as follows:
float Det(vector2DF &a, int n)
{
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
#pragma omp parallel reduction(+:det)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
#pragma omp for
for (int i = 0; i < n; i++)
{
for (int j = 1; j < n; j++)
{
int l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
}
return det;
}
Now you could also just declare m as firstprivate, but that would assume that the copy constructor makes a completely independent deep-copy and thus make the code more difficult to reason about.
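For reference, here is a sketch of that firstprivate variant, under the stated assumption that vector2DF's copy constructor makes an independent deep copy (true for nested std::vector); it reuses the vector2DF/vector1DF typedefs from the snippets above:
float Det(vector2DF &a, int n)
{
  if (n == 1) return a[0][0];
  if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
  float det = 0;
  // m is allocated once; firstprivate(m) gives every thread its own deep copy,
  // so the inner loops never write to a buffer shared between threads.
  vector2DF m(n - 1, vector1DF(n - 1, 0));
  #pragma omp parallel for reduction(+:det) firstprivate(m)
  for (int i = 0; i < n; i++)
  {
    for (int j = 1; j < n; j++)
    {
      int l = 0;
      for (int k = 0; k < n; k++)
      {
        if (k == i) continue;
        m[j - 1][l] = a[j][k];
        l++;
      }
    }
    det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
  }
  return det;
}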
Please be aware that you should always include expected output, actual output and a minimal complete and verifiable example.

MATLAB spdiags equivalent in Eigen C++

I'm searching for an equivalent of A = spdiags(B,d,N,N) in C++. This function creates the N-by-N sparse matrix A by taking the columns of B and placing them along the diagonals specified by the vector d.
I've searched in Eigen, but it seems that it does not exist.
Any ideas?
There's no built-in method as far as I know, but it's not too hard to do this by building a new matrix via indices. Notice that the kth diagonal runs from index (max(1, 1-k), max(1, 1-k)+k) to (min(m, n-k), min(m, n-k)+k).
template <typename Scalar>
Eigen::SparseMatrix<Scalar> spdiags(const Eigen::Matrix<Scalar, -1, -1>& B, const Eigen::Matrix<int, -1, 1>& d, size_t m, size_t n) {
Eigen::SparseMatrix<Scalar> A(m,n);
typedef Eigen::Triplet<Scalar> T;
std::vector<T> triplets;
triplets.reserve(std::min(m,n)*d.size());
for (int k = 0; k < d.size(); k++) {
int i_min = std::max(0, -d(k));
int i_max = std::min(m - 1, n - d(k) - 1);
int B_idx_start = m >= n ? d(k) : 0;
for (int i = i_min; i <= i_max; i++) {
triplets.push_back( T(i, i + d(k), B(B_idx_start + i, k)) ); // the column index is i plus the diagonal offset d(k)
}
}
A.setFromTriplets(triplets.begin(), triplets.end());
return A;
}
Note I haven't tested this but you get the idea. The first index into B is a little weird but I think it's right.
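A hypothetical usage example (the 5x5 size and the all-ones B are made up for illustration), assuming the template above is visible; it builds a tridiagonal matrix the way A = spdiags(B, d, N, N) would in MATLAB:
#include <Eigen/Dense>
#include <Eigen/Sparse>
#include <iostream>

int main() {
    const int N = 5;
    // Each column of B holds one diagonal; d gives the corresponding offsets.
    Eigen::MatrixXd B = Eigen::MatrixXd::Ones(N, 3);
    Eigen::VectorXi d(3);
    d << -1, 0, 1;                               // sub-, main and super-diagonal
    Eigen::SparseMatrix<double> A = spdiags(B, d, N, N);
    std::cout << Eigen::MatrixXd(A) << "\n";     // dense view of the tridiagonal result
}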
Other version, spdiags(A):
Eigen::MatrixXd spdiags(const Eigen::SparseMatrix<double>& A) {
// find nonzero diagonals by iterating over all nonzero elements
// d(i) = 1 if the ith diagonal of A contains a nonzero, 0 else
Eigen::VectorXi d = Eigen::VectorXi::Zero(A.rows() + A.cols() - 1);
for (int k=0; k < A.outerSize(); ++k) {
for (Eigen::SparseMatrix<double>::InnerIterator it(A,k); it; ++it) {
d(it.col() - it.row() + A.rows() - 1) = 1;
}
}
int num_diags = d.sum();
Eigen::MatrixXd B(std::min(A.cols(), A.rows()), num_diags);
// fill B with diagonals
int B_col_idx = 0;
int B_row_sign = A.rows() >= A.cols() ? 1 : -1;
for (int i = 1 - A.rows(); i <= A.cols() - 1; i++) {
if (d(i + A.rows() - 1)) {
const auto& diag = A.diagonal(i);
int B_row_start = std::max(0, B_row_sign * i);
B.block(B_row_start, B_col_idx, diag.size(), 1) = diag;
B_col_idx++;
}
}
return B;
}
Same disclaimer: I haven't tested it, but it should work. Replace double with template <typename Scalar> as before if you want.
Here is a solution I've made. I implemented the diagonal(i) extraction myself because this function is not available in my Eigen version (how can I know which version I use?). I obtain good results with this, but I don't know if it can be optimized further:
void spdiags(Eigen::SparseMatrix<double> A)
{
//Extraction of the diagonals before the main diagonal
vector<double> vec1; int flag=0;int l=0;
int i=0; int j=0; vector<vector<double> > diagD;
vector<vector<double> > diagG; int z=0; int z1=0;
for(int i=0;i<A.rows();i++)
{l=i;
do
{
if(A.coeff(l,j)!=0)
flag=1;
vec1.push_back(A.coeff(l,j));
l++;j++;
}while(l<A.rows() && j<A.cols());
if(flag==1) {diagG.resize(diagG.size()+1);diagG[z]=vec1; z++; }
vec1.clear(); l=0;j=0; flag=0; cout<<endl;
}
flag=0;z=0; vec1.clear();
// Extraction of the diagonals after the main diagonal
for(int i=1;i<A.cols();i++)
{l=i;
do
{
if(A.coeff(j,l)!=0)
flag=1;
vec1.push_back(A.coeff(j,l));
l++;j++;
}while(l<A.cols() && j<A.rows());
if(flag==1) {diagD.resize(diagD.size()+1);diagD[z]=vec1; z++; }
vec1.clear(); l=0;j=0; flag=0; cout<<endl;
}
// End extraction of the diagonals
Eigen::VectorXi d = Eigen::VectorXi::Zero(A.rows() + A.cols() - 1);
for (int k=0; k < A.outerSize(); ++k)
{
for (SparseMatrix<double>::InnerIterator it(A,k); it; ++it)
{
d(it.col() - it.row() + A.rows() - 1) = 1;
}
}
int num_diags = d.sum();
Eigen::MatrixXd B(std::min(A.cols(), A.rows()), num_diags);
// fill B with diagonals
Eigen::ArrayXd v;
int B_col_idx = 0;
int B_row_sign = A.rows() >= A.cols() ? 1 : -1;
int indG=diagG.size()-1; int indD=0;
for (int i = 1 - A.rows(); i <=A.cols() - 1; i++)
{
if (d(i + A.rows() - 1))
{
if(i<1)
{ v.resize(diagG[indG].size());
for(int i=0;i<diagG[indG].size();i++)
{
v(i)=diagG[indG][i];
}
int B_row_start = std::max(0, B_row_sign * i);
B.block(B_row_start, B_col_idx, diagG[indG].size(), 1) = v;
B_col_idx++;
indG--;
}
else
{
v.resize(diagD[indD].size());
for(int i=0;i<diagD[indD].size();i++)
{
v(i)=diagD[indD][i] ;
}
int B_row_start = std::max(0, B_row_sign * i);
B.block(B_row_start, B_col_idx, diagD[indD].size(), 1) = v;
B_col_idx++;
indD++;
}
}
}
cout<<B<<endl; //the result of the function
}//end of the function