C++ openmp parallel computation calculates wrong results - c++

I have an algorithm that I want to execute in parallel using openmp. I could validate the results for single-threaded execution (OMP_NUM_THREADS=1), but the results are slightly different as soon as I set the number of threads to 2 or higher. I also tried parallelizing the inner for-loops, but that doesn't yield correct results, either.
I'm fairly new to openmp and multi-threading in general. I suspect that my implementation shares variables between threads improperly somehow, but I can't figure it out.
extern "C" double *lomb_scargle(double *x, double *y, double *f, int NX, int NF) {
double *result = (double*) malloc(2 * NF * sizeof(double));
double w, tau, SS, SC, SST1, SST2, SCT1, SCT2, Ai, Bi;
int i, j;
#pragma omp parallel for
for (i=0; i<NF; i++) {
w = 2 * M_PI * f[i];
SS = 0.;
SC = 0.;
for (j=0; j<NX; j++) {
SS += sin(2*w*x[j]);
SC += cos(2*w*x[j]);
}
tau = atan2(SS, SC) / (2 * w);
SCT1 = 0.;
SCT2 = 0.;
SST1 = 0.;
SST2 = 0.;
for (j=0; j<NX; j++) {
SCT1 += y[j] * cos(w * (x[j] - tau));
SCT2 += pow(cos(w * (x[j] - tau)),2);
SST1 += y[j] * sin(w * (x[j] - tau));
SST2 += pow(sin(w * (x[j] - tau)),2);
}
Ai = SCT1 / SCT2;
Bi = SST1 / SST2;
// result contains the amplitude first, and then the phase
result[i] = sqrt(Ai*Ai + Bi*Bi);
result[i+NF] = - (atan2(Bi, Ai) + w * tau);
}
return result;
}
EDIT: typo

By default, OpenMP shares all variables declared in an outer scope between all workers. You need to move the temporary variables into the inner block (or declare them private).

Related

How to parallelize accumulative probability function with OpenMP?

I'm trying to make more efficient via parallelizing my code that calculates the accumulative probability function. I have a vector<double> of radii called r and I need to count how many elements there are with a radius bigger than a given one > R. In addition, I need to calculate the accumulative probability function for the volume.
The code I have is the following one:
int i, j;
double aux, contar, contar1, aux;
vector<double> r, contador, contador1, vol,
for (i = 0; i != r.size() - 1; i++)
{
aux = r[i];
contador[i] = 0;
contador1[i] = 0;
contar = 0;
contar1 = 0;
vol[i] = 0.0;
for (j = 0; j != r.size() - 1; j++)
{
if(aux <= r[j])
{
contar++;
#pragma omp atomic write
vol[i] = vol[i] + 4.0 * 3.141592653589793 * r[j] * r[j] * r[j] / 3.0;
}
if(aux==r[j])
{
contar1++;
}
}
#pragma omp atomic write
contador[i]=contar;
#pragma omp atomic write
contador1[i]=contar1;
}
but it's not efficient at all. Any help in order to make it more efficient with OpenMP?

Why omp version is slower than serial?

It's a follow-up question to this one
Now I have the code:
#include <iostream>
#include <cmath>
#include <omp.h>
#define max(a, b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 2000;
const int p = 4;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
double maxdiffA[p + 1];
int icol, jrow;
int main() {
omp_set_num_threads(p);
double h = 1.0 / (n + 1);
double start = omp_get_wtime();
#pragma omp parallel for private(icol) shared(x, y, v, _new)
for (icol = 0; icol <= n + 1; ++icol) {
x[icol] = y[icol] = icol * h;
_new[icol][0] = v[icol][0] = 6 - 2 * x[icol];
_new[n + 1][icol] = v[n + 1][icol] = 4 - 2 * y[icol];
_new[icol][n + 1] = v[icol][n + 1] = 3 - x[icol];
_new[0][icol] = v[0][icol] = 6 - 3 * y[icol];
}
const double eps = 0.01;
#pragma omp parallel private(icol, jrow) shared(_new, v, maxdiffA)
{
while (true) { //for [iters=1 to maxiters by 2]
#pragma omp single
for (int i = 0; i < p; i++) maxdiffA[i] = 0;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
_new[icol][jrow] =
(v[icol - 1][jrow] + v[icol + 1][jrow] + v[icol][jrow - 1] + v[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
v[icol][jrow] = (_new[icol - 1][jrow] + _new[icol + 1][jrow] + _new[icol][jrow - 1] +
_new[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
maxdiffA[omp_get_thread_num()] = max(maxdiffA[omp_get_thread_num()],
fabs(_new[icol][jrow] - v[icol][jrow]));
#pragma omp barrier
double maxdiff = 0.0;
for (int k = 0; k < p; ++k) {
maxdiff = max(maxdiff, maxdiffA[k]);
}
if (maxdiff < eps)
break;
#pragma omp barrier
//#pragma omp single
//std::cout << maxdiff << std::endl;
}
}
double end = omp_get_wtime();
printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
return 0;
}
But why it works 2-3 times slower (32sec vs 18sec) than serial analog:
#include <iostream>
#include <cmath>
#include <omp.h>
#define max(a,b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 2000;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
int main() {
double h = 1.0 / (n + 1);
double start = omp_get_wtime();
for (int i = 0; i <= n + 1; ++i) {
x[i] = y[i] = i * h;
_new[i][0]=v[i][0] = 6 - 2 * x[i];
_new[n + 1][i]=v[n + 1][i] = 4 - 2 * y[i];
_new[i][n + 1]=v[i][n + 1] = 3 - x[i];
_new[0][i]=v[0][i] = 6 - 3 * y[i];
}
const double eps=0.01;
while(true){ //for [iters=1 to maxiters by 2]
double maxdiff=0.0;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
_new[i][j]=(v[i-1][j]+v[i+1][j]+v[i][j-1]+v[i][j+1])/4;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
v[i][j]=(_new[i-1][j]+_new[i+1][j]+_new[i][j-1]+_new[i][j+1])/4;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
maxdiff=max(maxdiff, fabs(_new[i][j]-v[i][j]));
if(maxdiff<eps) break;
std::cout << maxdiff<<std::endl;
}
double end = omp_get_wtime();
printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
return 0;
}
Also interesting that it works SAME TIME as version (I can post it here if you say so) which looks like so
while(true){ //106 iteratins here!!!
#pragma omp paralell for
for(...)
#pragma omp paralell for
for(...)
#pragma omp paralell for
for(...)
}
But I thought that what making omp code slow is spawning threads inside while loop 106 times... But no! Then probably threads simultaneously write to the same array cells.. But where does it happen? I don't see it could you show me please?
Maybe it's because too much barriers? But Lecturer told me to implement the code like so and "analyse it" Maybe the answer is "Jacobi algorithm isn't meant to run well in parallel"? Or it's just my lame coding?
So the root of evel was
max(maxdiffA[w],fabs(_new[icol][jrow] - v[icol][jrow]))
because it's
#define max(a, b) (a)>(b)?(a):(b)
It's probably creating TOO much branching ('if's ) Without this thing parallel version works 8 times faster and loading CPU 68% instead of 99%..
The starange thing: same "max" doesn't affect serioal version
I am writing to make you aware of a few situations. It is not short to write in a comment, so I decided to write as an answer.
every time a thread is made, it takes some time for its creation. if your program's running time in a single core is short, then the creation of threads will make this time longer for multi-core.
plus using a barrier makes all your threads wait for others, which could somehow be slowed down in cpu. this way, even if all threads finish the job very fast, that last one will make the total run time longer.
try to run your program with bigger sized arrays where time is around 2 minutes for single threading. then make your way to multi-core.
then try to wrap your main code in a normal loop to run it a few times and prints the timings for each. the first run of the loop might be slow because of loading libraries, but the next runs should be faster to prove the increasing speed.
if above suggestions do not give a result, then it means your coding needs more editing.
EDIT:
To downvoters, If you don't like a post, please at least be polite and leave a comment. Or better, give your own answer so be helpful to community.

using OPENMP to parallelize a c++ code

I hav parallelized my c++ code using openmp as below:
....//some code
double S_theta = 0.0, S_x = 0.0, S_y = 0.0;
#pragma omp parallel for private(dx,dy,theta_new) reduction(+ : S_x,S_y,S_theta)
for(int j = 0; j < N; j++)
{
if (j==i) continue;
double delta_x = x[i]-x[j],
for(int ky = -1; ky<= 1; ky++)
{
for(int kx = -1; kx<= 1; kx++)
{
if (r_ij_square > l0_two)
{
double r_ij = sqrt(r_ij_square);
r_x_ij/= r_ij;
r_y_ij/= r_ij;
double rdote = r_x_ij * e_x[j] + r_y_ij * e_y[j];
S_theta += e_dot_e * ( e_cross_e - rdote * (e_x[i] * r_y_ij - e_y[i] * r_x_ij) ) / (r_ij_square * r_ij);
double S = rdote /r_ij_square;
S_x += r_x_ij * S;
S_y += r_y_ij * S;
}
}
}
double zeta = -eta / 2.0 ;
theta_new[i] += zeta + dt * dlthet;
}
....//some code
But the result changes each time when I run it! When I remove this line
#pragma omp parallel for reduction(+ : S_x,S_y,S_theta)
, the result is the same in different runs!
What is wrong with my code?
Your variable S is declared outside the loop, and will be shared among all the threads running your loop. Given how it is used, it can be declared as double S = rdote / r_ij_square; inside the loop where it is used.

Optimize log entropy calculation in sparse matrix

I have a 3007 x 1644 dimensional matrix of terms and documents. I am trying to assign weights to frequency of terms in each document so I'm using this log entropy formula http://en.wikipedia.org/wiki/Latent_semantic_indexing#Term_Document_Matrix (See entropy formula in the last row).
I'm successfully doing this but my code is running for >7 minutes.
Here's the code:
int N = mat.cols();
for(int i=1;i<=mat.rows();i++){
double gfi = sum(mat(i,colon()))(1,1); //sum of occurrence of terms
double g =0;
if(gfi != 0){// to avoid divide by zero error
for(int j = 1;j<=N;j++){
double tfij = mat(i,j);
double pij = gfi==0?0.0:tfij/gfi;
pij = pij + 1; //avoid log0
double G = (pij * log(pij))/log(N);
g = g + G;
}
}
double gi = 1 - g;
for(int j=1;j<=N;j++){
double tfij = mat(i,j) + 1;//avoid log0
double aij = gi * log(tfij);
mat(i,j) = aij;
}
}
Anyone have ideas how I can optimize this to make it faster? Oh and mat is a RealSparseMatrix from amlpp matrix library.
UPDATE
Code runs on Linux mint with 4gb RAM and AMD Athlon II dual core
Running time before change: > 7mins
After #Kereks answer: 4.1sec
Here's a very naive rewrite that removes some redundancies:
int const N = mat.cols();
double const logN = log(N);
for (int i = 1; i <= mat.rows(); ++i)
{
double const gfi = sum(mat(i, colon()))(1, 1); // sum of occurrence of terms
double g = 0;
if (gfi != 0)
{
for (int j = 1; j <= N; ++j)
{
double const pij = mat(i, j) / gfi + 1;
g += pij * log(pij);
}
g /= logN;
}
for (int j = 1; j <= N; ++j)
{
mat(i,j) = (1 - g) * log(mat(i, j) + 1);
}
}
Also make sure that the matrix data structure is sane (e.g. a flat array accessed in strides; not a bunch of dynamically allocated rows).
Also, I think the first + 1 is a bit silly. You know that x -> x * log(x) is continuous at zero with limit zero, so you should write:
double const pij = mat(i, j) / gfi;
if (pij != 0) { g += pij + log(pij); }
In fact, you might even write the first inner for loop like this, avoiding a division when it isn't needed:
for (int j = 1; j <= N; ++j)
{
if (double pij = mat(i, j))
{
pij /= gfi;
g += pij * log(pij);
}
}

DFT with Frequency Range

We need to change/reimplement standard DFT implementation in GSL, which is
int
FUNCTION(gsl_dft_complex,transform) (const BASE data[],
const size_t stride, const size_t n,
BASE result[],
const gsl_fft_direction sign)
{
size_t i, j, exponent;
const double d_theta = 2.0 * ((int) sign) * M_PI / (double) n;
/* FIXME: check that input length == output length and give error */
for (i = 0; i < n; i++)
{
ATOMIC sum_real = 0;
ATOMIC sum_imag = 0;
exponent = 0;
for (j = 0; j < n; j++)
{
double theta = d_theta * (double) exponent;
/* sum = exp(i theta) * data[j] */
ATOMIC w_real = (ATOMIC) cos (theta);
ATOMIC w_imag = (ATOMIC) sin (theta);
ATOMIC data_real = REAL(data,stride,j);
ATOMIC data_imag = IMAG(data,stride,j);
sum_real += w_real * data_real - w_imag * data_imag;
sum_imag += w_real * data_imag + w_imag * data_real;
exponent = (exponent + i) % n;
}
REAL(result,stride,i) = sum_real;
IMAG(result,stride,i) = sum_imag;
}
return 0;
}
In this implementation, GSL iterates over input vector twice for sample/input size. However, we need to construct for different frequency bins. For instance, we have 4096 samples, but we need to calculate DFT for 128 different frequencies. Could you help me to define or implement required DFT behaviour? Thanks in advance.
EDIT: We do not search for first m frequencies.
Actually, is below approach correct for finding DFT result with given frequency bin number?
N = sample size
B = frequency bin size
k = 0,...,127 X[k] = SUM(0,N){x[i]*exp(-j*2*pi*k*i/B)}
EDIT: I might have not explained the problem for DFT elaborately, nevertheless, I am happy to provide the answer below:
void compute_dft(const std::vector<double>& signal,
const std::vector<double>& frequency_band,
std::vector<double>& result,
const double sampling_rate)
{
if(0 == result.size() || result.size() != (frequency_band.size() << 1)){
result.resize(frequency_band.size() << 1, 0.0);
}
//note complex signal assumption
const double d_theta = -2.0 * PI * sampling_rate;
for(size_t k = 0; k < frequency_band.size(); ++k){
const double f_k = frequency_band[k];
double real_sum = 0.0;
double imag_sum = 0.0;
for(size_t n = 0; n < (signal.size() >> 1); ++n){
double theta = d_theta * f_k * (n + 1);
double w_real = cos(theta);
double w_imag = sin(theta);
double d_real = signal[2*n];
double d_imag = signal[2*n + 1];
real_sum += w_real * d_real - w_imag * d_imag;
imag_sum += w_real * d_imag + w_imag * d_real;
}
result[2*k] = real_sum;
result[2*k + 1] = imag_sum;
}
}
Assuming you just want the the first m output frequencies:
int
FUNCTION(gsl_dft_complex,transform) (const BASE data[],
const size_t stride,
const size_t n, // input size
const size_t m, // output size (m <= n)
BASE result[],
const gsl_fft_direction sign)
{
size_t i, j, exponent;
const double d_theta = 2.0 * ((int) sign) * M_PI / (double) n;
/* FIXME: check that m <= n and give error */
for (i = 0; i < m; i++) // for each of m output bins
{
ATOMIC sum_real = 0;
ATOMIC sum_imag = 0;
exponent = 0;
for (j = 0; j < n; j++) // for each of n input points
{
double theta = d_theta * (double) exponent;
/* sum = exp(i theta) * data[j] */
ATOMIC w_real = (ATOMIC) cos (theta);
ATOMIC w_imag = (ATOMIC) sin (theta);
ATOMIC data_real = REAL(data,stride,j);
ATOMIC data_imag = IMAG(data,stride,j);
sum_real += w_real * data_real - w_imag * data_imag;
sum_imag += w_real * data_imag + w_imag * data_real;
exponent = (exponent + i) % n;
}
REAL(result,stride,i) = sum_real;
IMAG(result,stride,i) = sum_imag;
}
return 0;
}