Removing single construct results in incorrect execution

Removing single construct results in incorrect execution - c++

This code works as expected:
#include <iostream>
#include <cmath>
#include <omp.h>
//https://stackoverflow.com/questions/37970024/jacobi-relaxation-in-mpi
#define max(a, b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 1500;
const int p = 4;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
double maxdiffA[p + 1];
int icol, jrow;
int main() {
omp_set_num_threads(p);
double h = 1.0 / (n + 1);
double start = omp_get_wtime();
#pragma omp parallel for private(icol) shared(x, y, v, _new)
for (icol = 0; icol <= n + 1; ++icol) {
x[icol] = y[icol] = icol * h;
_new[icol][0] = v[icol][0] = 6 - 2 * x[icol];
_new[n + 1][icol] = v[n + 1][icol] = 4 - 2 * y[icol];
_new[icol][n + 1] = v[icol][n + 1] = 3 - x[icol];
_new[0][icol] = v[0][icol] = 6 - 3 * y[icol];
}
const double eps = 0.01;
#pragma omp parallel private(icol, jrow) shared(_new, v, maxdiffA)
{
while (true) { //for [iters=1 to maxiters by 2]
#pragma omp single
for (int i = 0; i < p; i++) maxdiffA[i] = 0;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
_new[icol][jrow] =
(v[icol - 1][jrow] + v[icol + 1][jrow] + v[icol][jrow - 1] + v[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
v[icol][jrow] = (_new[icol - 1][jrow] + _new[icol + 1][jrow] + _new[icol][jrow - 1] +
_new[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
maxdiffA[omp_get_thread_num()] = max(maxdiffA[omp_get_thread_num()],
fabs(_new[icol][jrow] - v[icol][jrow]));
#pragma omp barrier
double maxdiff = 0.0;
for (int k = 0; k < p; ++k) {
maxdiff = max(maxdiff, maxdiffA[k]);
}
if (maxdiff < eps)
break;
#pragma omp single
std::cout << maxdiff << std::endl;
}
}
double end = omp_get_wtime();
printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
return 0;
}
It outputs
1.12454
<106 iterations here>
0.0100436
start = 1527366091.3069999217987061
end = 1527366110.8169999122619629
diff = 19.5099999904632568
But if I remove
#pragma omp single
std::cout << maxdiff << std::endl;
the program either seems to run infinitely long or I get
start = 1527368219.8810000419616699
end = 1527368220.5710000991821289
diff = 0.6900000572204590
Why is that so?

You overwrite maxdiffA at the beginning of the while loop - this must be isolated from reading maxdiffA at the end to check the condition. Otherwise one thread may already reset the values before another thread gets the chance to read them. The omp single construct at the end of the loop acts as isolation due to the implicit barrier at the end of omp single constructs. However there is no barrier at the beginning of omp single constructs. Also "a whole lot of code" is not a safe barrier. So if there is no valid implicit barrier, you must protect entry to the reset code with an #pragma omp barrier.
That said I highly recommend to restructure the code to have a shared exit condition that is also computed in a single construct. This makes it more clear that all threads process exit the while-loop at the same time. Otherwise the code is ill-defined.

Related

OpenMP with good performance in C function, but not in C++ class method

I have been trying to port a code for an university-related project from C to C++ while also adapting it to an object-oriented paradigm. The original code makes use of OpenMP, through pragmas to create a parallel variation of a function used to calculate certain values and matrices according to an input given by the user.
In C, I wrote a function that returns a struct data type I have defined myself like such. A note: the DoubVet1D/2D functions basically call calloc to initialize arrays of a size defined by their argument(s).
SaidaEstacionario getDDParallel(DadosEntrada e){
SaidaEstacionario dd;
/*
* Preenche os vetores com os dados do dom�nio
*/
int J = 0;
for (int i = 0; i < e.numReg; i++)
J += e.partReg[i];
int N = e.ordQuad;
dd.dimFi = J + 1;
//NNR = e.NNR;
//double Stop = e.erro;
//TipoContorno = e.CCETipo + e.CCDTipo;
double *St = DoubVet1D(J);
double *Ss = DoubVet1D(J);
double *Q = DoubVet1D(J);
double *h = DoubVet1D(J);
double *S = DoubVet1D(J);
//double *Fi = DoubVet1D(J + 1);
//double *x = DoubVet1D(J + 1);
//double *TaxaAbR = DoubVet1D(e.numReg);
double **psiOld = DoubVet2D(e.ordQuad, J + 1);
double **psi = DoubVet2D(e.ordQuad, J + 1);
double tmpErro = 0;
double erro = 0;
int k = 0;
dd.fi = DoubVet1D(J + 1);
dd.x = DoubVet1D(J + 1);
dd.x[0] = 0;
for (int i = 0; i < e.numReg; i++)
{
double temp = e.tamReg[i] / e.partReg[i];
for (int j = 0; j < e.partReg[i]; j++)
{
h[k] = temp;
Ss[k] = e.ssZon[e.zonReg[i]];
St[k] = e.stZon[e.zonReg[i]];
Q[k] = e.fontReg[i];
dd.x[k + 1] = dd.x[k] + temp;
k++;
}
}
/* Estabelecendo a condi��o de contorno */
for (int n = 0; n < e.ordQuad/2; n++)
{
if(e.cce < 0){
psi[n][0] = 0;
psiOld[n][0] = psi[n][0];
}
else{
psi[n][0] = e.cce;
psiOld[n][0] = psi[n][0];
}
if(e.ccd < 0){
psi[e.ordQuad/2 + n][J] = 0;
psiOld[e.ordQuad/2 + n][0] = psi[e.ordQuad/2 + n][0];
}
else{
psi[e.ordQuad/2 + n][J] = e.ccd;
psiOld[e.ordQuad/2 + n][0] = psi[e.ordQuad/2 + n][0];
}
}
/* Obtendo as quadraturas de Gauss-Legendre */
PLegendre pl = GaussLegendreAbsPes(e.ordQuad);
double *Mi = pl.mi;
double *W = pl.w;
//#############################################
double start;
dd.numInter = 0;
do{
dd.numInter++;
/* Calculando a fonte de espalhamento */
#pragma omp parallel for
for (int j = 0; j < J; j++)
{
S[j] = 0;
for (int n = 0; n < N; n++)
S[j] += 0.25 * Ss[j] * (psi[n][j + 1] + psi[n][j]) * W[n];
}
/* Varredura para a direita/esquerda */
#pragma omp sections
{
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Varredura para a direita */
psiOld[m][j + 1] = psi[m][j + 1];
psi[m][j + 1] = ((Mi[m] / h[j] - 0.5 * St[j]) * psi[m][j] + S[j] + Q[j]) / (Mi[m] / h[j] + 0.5 * St[j]);
}
}
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Varredura para a esquerda */
psiOld[m + N / 2][J - 1 - j] = psi[m + N / 2][J - 1 - j];
psi[m + N / 2][J - 1 - j] = ((Mi[m] / h[J - 1 - j] - 0.5 * St[J - 1 - j]) * psi[m + N / 2][J - j] + S[J - 1 - j] + Q[J - 1 - j]) / (Mi[m] / h[J - 1 - j] + 0.5 * St[J - 1 - j]);
}
}
} // Aqui h� uma barreira impl�cita
/* Estabelecendo a condi��o de contorno */
if(e.cce < 0 || e.ccd < 0){
#pragma omp parallel for
for (int n = 0; n < e.ordQuad/2; n++)
{
if(e.cce < 0){
psiOld[n][0] = psi[n][0];
psi[n][0] = psi[e.ordQuad/2 + n][ 0];
}
if(e.ccd < 0){
psiOld[e.ordQuad/2 + n][0] = psi[n][0];
psi[e.ordQuad/2 + n][J] = psi[n][J];
}
}
}
/* Calculando o erro para o crit�rio de parada*/
tmpErro = 0;
erro = 0;
for (int m = 0; m < N; m++)
for (int j = 0; j < J; j++){
tmpErro = Abs(psi[m][j] - psiOld[m][j])/psi[m][j];
if(tmpErro > erro) erro = tmpErro;
}
}while(erro > e.erro);
dd.tempoVar = omp_get_wtime() - start;
dd.psi = DoubVet2D(e.ordQuad, J + 1);
/* Calcula o fluxo escalar */
#pragma omp parallel for
for (int j = 0; j <= J; j++){
dd.fi[j] = 0;
for (int m = 0; m < N; m++){
dd.fi[j] += psi[m][j] * W[m];
dd.psi[m][j] = psi[m][j];
}
}
/*Calcula a taxa de absorção*/
int i = 0;
dd.taxaAbsorRegiao = DoubVet1D(e.numReg);
dd.taxaAbsorTotal = 0;
for (int r = 0; r < e.numReg; r++)
{
dd.taxaAbsorRegiao[r] = 0;
#pragma omp parallel for
for (int j = i; j < i + e.partReg[r]; j++)
dd.taxaAbsorRegiao[r] += 0.5 * (dd.fi[j] + dd.fi[j + 1]);
dd.taxaAbsorRegiao[r] *= (St[i] - Ss[i]) * h[i];
dd.taxaAbsorTotal += dd.taxaAbsorRegiao[r];
i += e.partReg[r];
}
return dd;
}
///Execu��o dos c�lculos do m�todo Diamod Difference DD ou Dg estacion�rio vers�o parallel
SaidaEstacionario getDgDDParallel(DadosEntrada e){
SaidaEstacionario saida;
/*
* Preenche os vetores com os dados do dom�nio
*/
int J = 0;
for (int i = 0; i < e.numReg; i++)
J += e.partReg[i];
int N = e.ordQuad;
saida.dimFi = J + 1;
//NNR = e.NNR;
//double Stop = e.erro;
//TipoContorno = e.CCETipo + e.CCDTipo;
double *St = DoubVet1D(J);
double *Ss = DoubVet1D(J);
double *Q = DoubVet1D(J);
double *h = DoubVet1D(J);
double *S = DoubVet1D(J);
//double *Fi = DoubVet1D(J + 1);
//double *x = DoubVet1D(J + 1);
//double *TaxaAbR = DoubVet1D(e.numReg);
double **psiOld = DoubVet2D(e.ordQuad, J + 1);
double **psi = DoubVet2D(e.ordQuad, J + 1);
double tmpErro = 0;
double erro = 0;
int k = 0;
saida.fi = DoubVet1D(J + 1);
saida.x = DoubVet1D(J + 1);
saida.x[0] = 0;
for (int i = 0; i < e.numReg; i++)
{
double temp = e.tamReg[i] / e.partReg[i];
for (int j = 0; j < e.partReg[i]; j++)
{
h[k] = temp;
Ss[k] = e.ssZon[e.zonReg[i]];
St[k] = e.stZon[e.zonReg[i]];
Q[k] = e.fontReg[i];
saida.x[k + 1] = saida.x[k] + temp;
k++;
}
}
/* Estabelecendo a condi��o de contorno */
for (int n = 0; n < e.ordQuad/2; n++)
{
if(e.cce < 0){
psi[n][0] = 0;
psiOld[n][0] = psi[n][0];
}
else{
psi[n][0] = e.cce;
psiOld[n][0] = psi[n][0];
}
if(e.ccd < 0){
psi[e.ordQuad/2 + n][J] = 0;
psiOld[e.ordQuad/2 + n][0] = psi[e.ordQuad/2 + n][0];
}
else{
psi[e.ordQuad/2 + n][J] = e.ccd;
psiOld[e.ordQuad/2 + n][0] = psi[e.ordQuad/2 + n][0];
}
}
/* Obtendo as quadraturas de Gauss-Legendre */
PLegendre pl = GaussLegendreAbsPes(e.ordQuad);
double *Mi = pl.mi;
double *W = pl.w;
//#############################################
double start;
start = omp_get_wtime();
saida.numInter = 0;
do{
saida.numInter++;
/* Calculando a fonte de espalhamento */
#pragma omp parallel for
for (int j = 0; j < J; j++)
{
S[j] = 0;
for (int n = 0; n < N; n++)
S[j] += 0.25 * Ss[j] * (psi[n][j + 1] + psi[n][j]) * W[n];
}
/* Varredura para a direita/esquerda */
#pragma omp sections
{
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Varredura para a direita */
psiOld[m][j + 1] = psi[m][j + 1];
psi[m][j + 1] = ((Mi[m] / h[j] - 0.5 * St[j] * (1 - e.teta)) * psi[m][j] + S[j] + Q[j]) / (Mi[m] / h[j] + 0.5 * St[j] * (1 + e.teta));
}
}
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Varredura para a esquerda */
psiOld[m + N / 2][J - 1 - j] = psi[m + N / 2][J - 1 - j];
psi[m + N / 2][J - 1 - j] = ((Mi[m] / h[J - 1 - j] - 0.5 * St[J - 1 - j] * (1 - e.teta)) * psi[m + N / 2][J - j] + S[J - 1 - j] + Q[J - 1 - j]) / (Mi[m] / h[J - 1 - j] + 0.5 * St[J - 1 - j] * (1 + e.teta));
}
}
} // Aqui h� uma barreira impl�cita
/* Estabelecendo a condi��o de contorno */
if(e.cce < 0 || e.ccd < 0){
#pragma omp parallel for
for (int n = 0; n < e.ordQuad/2; n++)
{
if(e.cce < 0){
psiOld[n][0] = psi[n][0];
psi[n][0] = psi[e.ordQuad/2 + n][ 0];
}
if(e.ccd < 0){
psiOld[e.ordQuad/2 + n][0] = psi[n][0];
psi[e.ordQuad/2 + n][J] = psi[n][J];
}
}
}
/* Calculando o erro para o crit�rio de parada*/
tmpErro = 0;
erro = 0;
for (int m = 0; m < N; m++)
for (int j = 0; j < J; j++){
tmpErro = Abs(psi[m][j] - psiOld[m][j])/psi[m][j];
if(tmpErro > erro) erro = tmpErro;
}
}while(erro > e.erro);
saida.tempoVar = omp_get_wtime() - start;
saida.psi = DoubVet2D(e.ordQuad, J + 1);
/* Calcula o fluxo escalar */
#pragma omp parallel for
for (int j = 0; j <= J; j++){
saida.fi[j] = 0;
for (int m = 0; m < N; m++){
saida.fi[j] += psi[m][j] * W[m];
saida.psi[m][j] = psi[m][j];
}
}
/*Calcula a taxa de absorção*/
int i = 0;
saida.taxaAbsorRegiao = DoubVet1D(e.numReg);
saida.taxaAbsorTotal = 0;
for (int r = 0; r < e.numReg; r++)
{
saida.taxaAbsorRegiao[r] = 0;
#pragma omp parallel for
for (int j = i; j < i + e.partReg[r]; j++)
saida.taxaAbsorRegiao[r] += 0.5 * (saida.fi[j] + saida.fi[j + 1]);
saida.taxaAbsorRegiao[r] *= (St[i] - Ss[i]) * h[i];
saida.taxaAbsorTotal += saida.taxaAbsorRegiao[r];
i += e.partReg[r];
}
return saida;
}
As you can see, I only use #pragma omp parallel for and #pragma omp parallel section. In C++, as part of the object orientation, I transformed the original SaidaEstacionario struct in two classes called Methodand StationaryMethod (which inherits from Method), and implemented the function above as a method of a subclass at the lower end of the inheritance, while also encapsulating the variables and creating getters/setters. The end result was a StationaryMethodclass, which inherits from StationaryMethod, with the following method:
void StationaryDD::calculateParallel(){
/*
* Preenche os vetores com os dados do dom�nio
*/
int J = 0;
for (int i = 0; i < this->inputData.getNumReg(); i++)
J += this->inputData.getPartReg()[i];
int N = this->inputData.getQuadOrder();
this->dimFi = J + 1;
//NNR = this->inputData.NNR;
//double Stop = this->inputData.erro;
//TipoContorno = this->inputData.getCcl()Tipo + this->inputData.getCcr()Tipo;
double *St = DoubVet1D(J);
double *Ss = DoubVet1D(J);
double *Q = DoubVet1D(J);
double *h = DoubVet1D(J);
double *S = DoubVet1D(J);
//double *Fi = DoubVet1D(J + 1);
//double *x = DoubVet1D(J + 1);
//double *TaxaAbR = DoubVet1D(this->inputData.numReg);
double **psiOld = DoubVet2D(this->inputData.getQuadOrder(), J + 1);
double **psi = DoubVet2D(this->inputData.getQuadOrder(), J + 1);
double tmpErro = 0;
double erro = 0;
int k = 0;
this->fi = DoubVet1D(J + 1);
this->x = DoubVet1D(J + 1);
this->x[0] = 0;
for (int i = 0; i < this->inputData.getNumReg(); i++)
{
double temp = this->inputData.getSizeReg()[i] / this->inputData.getPartReg()[i];
for (int j = 0; j < this->inputData.getPartReg()[i]; j++)
{
h[k] = temp;
Ss[k] = this->inputData.getSsZon()[this->inputData.getZonReg()[i]];
St[k] = this->inputData.getStZon()[this->inputData.getZonReg()[i]];
Q[k] = this->inputData.getSrcReg()[i];
this->x[k + 1] = this->x[k] + temp;
k++;
}
}
/* Estabelecendo a condi��o de contorno */
for (int n = 0; n < this->inputData.getQuadOrder()/2; n++)
{
if(this->inputData.getCcl() < 0){
psi[n][0] = 0;
psiOld[n][0] = psi[n][0];
}
else{
psi[n][0] = this->inputData.getCcl();
psiOld[n][0] = psi[n][0];
}
if(this->inputData.getCcr() < 0){
psi[this->inputData.getQuadOrder()/2 + n][J] = 0;
psiOld[this->inputData.getQuadOrder()/2 + n][0] = psi[this->inputData.getQuadOrder()/2 + n][0];
}
else{
psi[this->inputData.getQuadOrder()/2 + n][J] = this->inputData.getCcr();
psiOld[this->inputData.getQuadOrder()/2 + n][0] = psi[this->inputData.getQuadOrder()/2 + n][0];
}
}
/* Obtendo as quadraturas de Gauss-Legendre */
PLegendre pl = PLegendre(this->inputData.getQuadOrder());
double *Mi = pl.getMi();
double *W = pl.getW();
//#############################################
double start;
this->numIter = 0;
do{
this->numIter++;
/* Calculando a fonte de espalhamento */
#pragma omp parallel for
for (int j = 0; j < J; j++)
{
printf("T%d: j = %d\n", omp_get_thread_num(), j);
S[j] = 0;
for (int n = 0; n < N; n++)
S[j] += 0.25 * Ss[j] * (psi[n][j + 1] + psi[n][j]) * W[n];
}
/* Varredura para a direita/esquerda */
#pragma omp sections
{
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Varredura para a direita */
psiOld[m][j + 1] = psi[m][j + 1];
psi[m][j + 1] = ((Mi[m] / h[j] - 0.5 * St[j]) * psi[m][j] + S[j] + Q[j]) / (Mi[m] / h[j] + 0.5 * St[j]);
}
}
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Varredura para a esquerda */
psiOld[m + N / 2][J - 1 - j] = psi[m + N / 2][J - 1 - j];
psi[m + N / 2][J - 1 - j] = ((Mi[m] / h[J - 1 - j] - 0.5 * St[J - 1 - j]) * psi[m + N / 2][J - j] + S[J - 1 - j] + Q[J - 1 - j]) / (Mi[m] / h[J - 1 - j] + 0.5 * St[J - 1 - j]);
}
}
} // Aqui h� uma barreira impl�cita
/* Estabelecendo a condi��o de contorno */
if(this->inputData.getCcl() < 0 || this->inputData.getCcr() < 0){
#pragma omp parallel for
for (int n = 0; n < this->inputData.getQuadOrder()/2; n++)
{
if(this->inputData.getCcl() < 0){
psiOld[n][0] = psi[n][0];
psi[n][0] = psi[this->inputData.getQuadOrder()/2 + n][ 0];
}
if(this->inputData.getCcr() < 0){
psiOld[this->inputData.getQuadOrder()/2 + n][0] = psi[n][0];
psi[this->inputData.getQuadOrder()/2 + n][J] = psi[n][J];
}
}
}
/* Calculando o erro para o crit�rio de parada*/
tmpErro = 0;
erro = 0;
for (int m = 0; m < N; m++)
for (int j = 0; j < J; j++){
tmpErro = fabs(psi[m][j] - psiOld[m][j])/psi[m][j];
if(tmpErro > erro) erro = tmpErro;
}
}while(erro > this->inputData.getE());
this->tempoVar = omp_get_wtime() - start;
this->psi = DoubVet2D(this->inputData.getQuadOrder(), J + 1);
/* Calcula o fluxo escalar */
#pragma omp parallel for
for (int j = 0; j <= J; j++){
this->fi[j] = 0;
for (int m = 0; m < N; m++){
this->fi[j] += psi[m][j] * W[m];
this->psi[m][j] = psi[m][j];
}
}
/*Calcula a taxa de absorção*/
int i = 0;
this->taxaAbsorRegiao = DoubVet1D(this->inputData.getNumReg());
this->taxaAbsorTotal = 0;
for (int r = 0; r < this->inputData.getNumReg(); r++)
{
this->taxaAbsorRegiao[r] = 0;
#pragma omp parallel for
for (int j = i; j < i + this->inputData.getPartReg()[r]; j++)
this->taxaAbsorRegiao[r] += 0.5 * (this->fi[j] + this->fi[j + 1]);
this->taxaAbsorRegiao[r] *= (St[i] - Ss[i]) * h[i];
this->taxaAbsorTotal += this->taxaAbsorRegiao[r];
i += this->inputData.getPartReg()[r];
}
}
Which basically does the same as the function above, but instead of returning the struct data type, simply stores all the data in the object itself defined by the class.
However, when comparing the time spent in each method, the method implemented in a C++ class takes a LOT more time than the original C function to execute. It is extremely bad to a point that, where the original C method would result in a fair bit of time saved compared to a non-paralellized variation of the same method (that is, without using OpenMP pragmas), this one will be much worse than it, taking up to a minute to finish a calculation that would be done in a fraction of a second. At first, I thought that the encapsulation could be slowing things down, but simply making everything public and calling the attributes directly instead of getters did not work. Would anyone have any insight as to why this could be happening?

Why omp version is slower than serial?

It's a follow-up question to this one
Now I have the code:
#include <iostream>
#include <cmath>
#include <omp.h>
#define max(a, b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 2000;
const int p = 4;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
double maxdiffA[p + 1];
int icol, jrow;
int main() {
omp_set_num_threads(p);
double h = 1.0 / (n + 1);
double start = omp_get_wtime();
#pragma omp parallel for private(icol) shared(x, y, v, _new)
for (icol = 0; icol <= n + 1; ++icol) {
x[icol] = y[icol] = icol * h;
_new[icol][0] = v[icol][0] = 6 - 2 * x[icol];
_new[n + 1][icol] = v[n + 1][icol] = 4 - 2 * y[icol];
_new[icol][n + 1] = v[icol][n + 1] = 3 - x[icol];
_new[0][icol] = v[0][icol] = 6 - 3 * y[icol];
}
const double eps = 0.01;
#pragma omp parallel private(icol, jrow) shared(_new, v, maxdiffA)
{
while (true) { //for [iters=1 to maxiters by 2]
#pragma omp single
for (int i = 0; i < p; i++) maxdiffA[i] = 0;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
_new[icol][jrow] =
(v[icol - 1][jrow] + v[icol + 1][jrow] + v[icol][jrow - 1] + v[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
v[icol][jrow] = (_new[icol - 1][jrow] + _new[icol + 1][jrow] + _new[icol][jrow - 1] +
_new[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
maxdiffA[omp_get_thread_num()] = max(maxdiffA[omp_get_thread_num()],
fabs(_new[icol][jrow] - v[icol][jrow]));
#pragma omp barrier
double maxdiff = 0.0;
for (int k = 0; k < p; ++k) {
maxdiff = max(maxdiff, maxdiffA[k]);
}
if (maxdiff < eps)
break;
#pragma omp barrier
//#pragma omp single
//std::cout << maxdiff << std::endl;
}
}
double end = omp_get_wtime();
printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
return 0;
}
But why it works 2-3 times slower (32sec vs 18sec) than serial analog:
#include <iostream>
#include <cmath>
#include <omp.h>
#define max(a,b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 2000;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
int main() {
double h = 1.0 / (n + 1);
double start = omp_get_wtime();
for (int i = 0; i <= n + 1; ++i) {
x[i] = y[i] = i * h;
_new[i][0]=v[i][0] = 6 - 2 * x[i];
_new[n + 1][i]=v[n + 1][i] = 4 - 2 * y[i];
_new[i][n + 1]=v[i][n + 1] = 3 - x[i];
_new[0][i]=v[0][i] = 6 - 3 * y[i];
}
const double eps=0.01;
while(true){ //for [iters=1 to maxiters by 2]
double maxdiff=0.0;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
_new[i][j]=(v[i-1][j]+v[i+1][j]+v[i][j-1]+v[i][j+1])/4;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
v[i][j]=(_new[i-1][j]+_new[i+1][j]+_new[i][j-1]+_new[i][j+1])/4;
for (int i=1;i<=n;i++)
for (int j=1;j<=n;j++)
maxdiff=max(maxdiff, fabs(_new[i][j]-v[i][j]));
if(maxdiff<eps) break;
std::cout << maxdiff<<std::endl;
}
double end = omp_get_wtime();
printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
return 0;
}
Also interesting that it works SAME TIME as version (I can post it here if you say so) which looks like so
while(true){ //106 iteratins here!!!
#pragma omp paralell for
for(...)
#pragma omp paralell for
for(...)
#pragma omp paralell for
for(...)
}
But I thought that what making omp code slow is spawning threads inside while loop 106 times... But no! Then probably threads simultaneously write to the same array cells.. But where does it happen? I don't see it could you show me please?
Maybe it's because too much barriers? But Lecturer told me to implement the code like so and "analyse it" Maybe the answer is "Jacobi algorithm isn't meant to run well in parallel"? Or it's just my lame coding?

So the root of evel was
max(maxdiffA[w],fabs(_new[icol][jrow] - v[icol][jrow]))
because it's
#define max(a, b) (a)>(b)?(a):(b)
It's probably creating TOO much branching ('if's ) Without this thing parallel version works 8 times faster and loading CPU 68% instead of 99%..
The starange thing: same "max" doesn't affect serioal version

I am writing to make you aware of a few situations. It is not short to write in a comment, so I decided to write as an answer.
every time a thread is made, it takes some time for its creation. if your program's running time in a single core is short, then the creation of threads will make this time longer for multi-core.
plus using a barrier makes all your threads wait for others, which could somehow be slowed down in cpu. this way, even if all threads finish the job very fast, that last one will make the total run time longer.
try to run your program with bigger sized arrays where time is around 2 minutes for single threading. then make your way to multi-core.
then try to wrap your main code in a normal loop to run it a few times and prints the timings for each. the first run of the loop might be slow because of loading libraries, but the next runs should be faster to prove the increasing speed.
if above suggestions do not give a result, then it means your coding needs more editing.
EDIT:
To downvoters, If you don't like a post, please at least be polite and leave a comment. Or better, give your own answer so be helpful to community.

Optimize outer loop with OpenMP and a reduction

I struggle a bit with a function. The calculation is wrong if I try to parallelize the outer loop with a
#pragma omp parallel reduction(+:det).
Can someone show me how to solve it and why it is failing?
// template<class T> using vector2D = std::vector<std::vector<T>>;
float Det(vector2DF &a, int n)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
for (int i = 0; i < n; i++)
{
int l = 0;
#pragma omp parallel for private(l)
for (int j = 1; j < n; j++)
{
l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
return det;
}

If you parallelize the outer loop, there is a race condition on this line:
m[j - 1][l] = a[j][k];
Also you likely want a parallel for reduction instead of just a parallel reduction.
The issue is, that m is shared, even though that wouldn't be necessary given that it is completely overwritten in the inner loop. Always declare variables as locally as possible, this avoids issues with wrongly shared variables, e.g.:
float Det(vector2DF &a, int n)
{
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
#pragma omp parallel reduction(+:det)
for (int i = 0; i < n; i++)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
for (int j = 1; j < n; j++)
{
int l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
return det;
}
Now that is correct, but since m can be expensive to allocate, performance could benefit from not doing it in each and every iteration. This can be done by splitting parallel and for directives as such:
float Det(vector2DF &a, int n)
{
if (n == 1) return a[0][0];
if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
float det = 0;
#pragma omp parallel reduction(+:det)
{
vector2DF m(n - 1, vector1DF(n - 1, 0));
#pragma omp parallel for
for (int i = 0; i < n; i++)
{
for (int j = 1; j < n; j++)
{
int l = 0;
for (int k = 0; k < n; k++)
{
if (k == i) continue;
m[j - 1][l] = a[j][k];
l++;
}
}
det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
}
}
return det;
}
Now you could also just declare m as firstprivate, but that would assume that the copy constructor makes a completely independent deep-copy and thus make the code more difficult to reason about.
Please be aware that you should always include expected output, actual output and a minimal complete and verifiable example.

How to allocate big arrays on the stack?

I have implemented this code from this to vectorizing it:
int c=0;
for (int j=-halfHeight; j<=halfHeight; ++j)
{
#pragma omp simd
for(int i=-halfWidth; i<=halfWidth; ++i){
wx_[c] = ofsx + j * a12 + i * a11;
wy_[c] = ofsy + j * a22 + i * a21;
x_[c] = (int) floor(wx_[c]);
y_[c] = (int) floor(wy_[c]);
++c;
}
}
std::cout<<"First size="<<size<<std::endl;
float imat_1[size];
std::cout<<"imat1"<<std::endl;
float imat_2[size];
std::cout<<"imat2"<<std::endl;
float imat_3[size];
std::cout<<"imat3"<<std::endl;
float imat_4[size];`
std::cout<<"imat4"<<std::endl;
#pragma omp simd
for(int c=0; c<size; c++){
if (x_[c] >= 0 && y_[c] >= 0 && x_[c] < width && y_[c] < height){
wx_[c] -= x_[c];
wy_[c] -= y_[c];
imat_1[c] = im.at<float>(y_[c],x_[c]);
imat_2[c] = im.at<float>(y_[c],x_[c]+1);
imat_3[c] = im.at<float>(y_[c]+1,x_[c]);
imat_4[c] = im.at<float>(y_[c]+1,x_[c]+1);
}
else{
wx_[c] = 0;
wy_[c] = 0;
imat_1[c] = 0;
imat_2[c] = 0;
imat_3[c] = 0;
imat_4[c] = 0;
ret = true;
}
}
std::cout<<"Second"<<std::endl;
#pragma omp simd
for(int c=0; c<size; c++){
out[c] =
(1.0f - wy_[c]) * ((1.0f - wx_[c]) * imat_1[c] + wx_[c] * imat_2[c]) +
( wy_[c]) * ((1.0f - wx_[c]) * imat_3[c] + wx_[c] * imat_4[c]);
}
In particular, size can reach up to 275625. When I run this code, it goes in segmentation fault at line float imat_4[size];. In fact, this is solved by using float *imat_4 = (float*)malloc(sizeof(float)*size);
I think that this is because of this, so we run out of memory on the stack... But then, how can I solve this? I don't see much other possibilities for vectorizing this code.
notice that performance are crucial here, so allocating on the stack is less efficient (right?)

Open Mp nested parallelism

So I have outer parallel region with two inner parallel regions. Is it possible to put 2 threads into outer parallel and 4 threads into each inner ones? I made something like this, but it seems not work how i want it to. Any suggestions?
start_r = omp_get_wtime();
omp_set_nested(1);
omp_set_num_threads(2);
#pragma omp parallel
{
printf("Thread %d executes the outer parallel region\n",omp_get_thread_num());
omp_set_num_threads(4);
#pragma omp parellel for private(i,j,color)schedule(guided, chunk) default(shared)
{
// Blur
for (int i = 1; i < x-1; i++)
for (int j = 1; j < y-1; j++)
for (int k = 0; k < 3; k++)
{
wynik = 0;
wynik = ((color[(i-1)][((j - 1))][k] +
color[(i-1)][j][k] +
color[(i-1)][(j + 1)][k] +
color[i][(j - 1)][k] +
color[i][j][k] +
color[i][(j + 1)][k] +
color[(i+1)][(j - 1)][k] +
color[(i+1)][j][k] +
color[(i+1)][(j + 1)][k])/9);
if (wynik>255)wynik = 255;
if (wynik<0)wynik = 0;
color2[i][j][k] = wynik;
}
stop_r = omp_get_wtime();
cout << "Wyostrzenie zejelo : " << (stop_r-start_r) <<" sekund"<< endl;
cout<<omp_get_nested( )<<endl;
cout<<"Ilość wątków dla rozmycia : "<<omp_get_num_threads( )<<endl;
printf("Thread %d executes the inner parallel region\n",omp_get_thread_num());
}
omp_set_num_threads(4);
#pragma omp parellel for schedule(guided, chunk) privat(i,j,color) default(shared)
{
// Sharp
for (int i = 1; i < x - 1; i++)
for (int j = 1; j < y - 1; j++)
for (int k = 0; k < 3; k++)
{
wynik = 0;
wynik = (color[(i-1)][(j - 1)][k] * (0) +
color[(i-1)][j][k] * (-1) +
color[(i-1)][(j + 1)][k] * (0) +
color[i][(j - 1)][k] * (-1) +
color[i][j][k] * 20 +
color[i][(j + 1)][k] * (-1) +
color[(i+1)][(j - 1)][k] * (0) +
color[(i+1)][j][k] * (-1) +
color[(i+1)][(j + 1)][k] * (0))/16;
wynik = wynik % 255;
color3[i][j][k] = wynik;
}
cout<<omp_get_nested( )<<endl;
cout<<"Ilość wątków dla wyostrzenia : "<<omp_get_num_threads( )<<endl;
printf("Thread %d executes the inner parallel region\n",omp_get_thread_num());
}
}
for (int j = 0; j < y; j++)
for (int i = 0; i < x; i++)
{
fwrite(color2[i][j], 1, 3, fp2);
fwrite(color3[i][j], 1, 3, fp3);
}
fclose(fp);
fclose(fp2);
fclose(fp3);
system("PAUSE");
return 0;
}
}

This works in VS2012
example:
#include <iostream>
#include <omp.h>
int main()
{
omp_set_nested(2);
#pragma omp parallel num_threads( 2 )
{
int threadID1 = omp_get_thread_num();
#pragma omp parallel num_threads( 4 )
{
int threadID2 = omp_get_thread_num();
#pragma omp critical
{
std::cout << "tID1: " << threadID1 << std::endl;
std::cout << "tID2: " << threadID2 << std::endl;
std::cout << std::endl;
}
}
}
return EXIT_SUCCESS;
}
Output:
tID1: 0
tID2: 0
tID1: 0
tID2: 2
tID1: 0
tID2: 1
tID1: 0
tID2: 3
tID1: 1
tID2: 0
tID1: 1
tID2: 1
tID1: 1
tID2: 2
tID1: 1
tID2: 3

It is possible to set the number of threads on a loop with this:
#pragma parallel for num_threads(variable)
see also this post openmp difference beetween num_threads vs. omp_set_num_threads vs OMP_NUM_THREADS

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Removing single construct results in incorrect execution - c++

Related

OpenMP with good performance in C function, but not in C++ class method

Why omp version is slower than serial?

Optimize outer loop with OpenMP and a reduction

How to allocate big arrays on the stack?

Open Mp nested parallelism

Categories

Resources