OpenMP nested parallelism - C++

So I have an outer parallel region with two inner parallel regions. Is it possible to put 2 threads into the outer parallel region and 4 threads into each inner one? I made something like this, but it does not seem to work the way I want it to. Any suggestions?
start_r = omp_get_wtime();
omp_set_nested(1);
omp_set_num_threads(2);
#pragma omp parallel
{
    printf("Thread %d executes the outer parallel region\n", omp_get_thread_num());

    omp_set_num_threads(4);
    // Blur
    #pragma omp parallel for private(wynik) schedule(guided, chunk) default(shared)
    for (int i = 1; i < x-1; i++)
        for (int j = 1; j < y-1; j++)
            for (int k = 0; k < 3; k++)
            {
                wynik = (color[i-1][j-1][k] +
                         color[i-1][j][k] +
                         color[i-1][j+1][k] +
                         color[i][j-1][k] +
                         color[i][j][k] +
                         color[i][j+1][k] +
                         color[i+1][j-1][k] +
                         color[i+1][j][k] +
                         color[i+1][j+1][k]) / 9;
                if (wynik > 255) wynik = 255;
                if (wynik < 0) wynik = 0;
                color2[i][j][k] = wynik;
            }
    stop_r = omp_get_wtime();
    cout << "Sharpening took : " << (stop_r-start_r) << " seconds" << endl;
    cout << omp_get_nested() << endl;
    cout << "Number of threads for blur : " << omp_get_num_threads() << endl;
    printf("Thread %d executes the inner parallel region\n", omp_get_thread_num());

    omp_set_num_threads(4);
    // Sharpen
    #pragma omp parallel for private(wynik) schedule(guided, chunk) default(shared)
    for (int i = 1; i < x - 1; i++)
        for (int j = 1; j < y - 1; j++)
            for (int k = 0; k < 3; k++)
            {
                wynik = (color[i-1][j-1][k] * 0 +
                         color[i-1][j][k] * (-1) +
                         color[i-1][j+1][k] * 0 +
                         color[i][j-1][k] * (-1) +
                         color[i][j][k] * 20 +
                         color[i][j+1][k] * (-1) +
                         color[i+1][j-1][k] * 0 +
                         color[i+1][j][k] * (-1) +
                         color[i+1][j+1][k] * 0) / 16;
                wynik = wynik % 255;
                color3[i][j][k] = wynik;
            }
    cout << omp_get_nested() << endl;
    cout << "Number of threads for sharpening : " << omp_get_num_threads() << endl;
    printf("Thread %d executes the inner parallel region\n", omp_get_thread_num());
}
for (int j = 0; j < y; j++)
    for (int i = 0; i < x; i++)
    {
        fwrite(color2[i][j], 1, 3, fp2);
        fwrite(color3[i][j], 1, 3, fp3);
    }
fclose(fp);
fclose(fp2);
fclose(fp3);
system("PAUSE");
return 0;
}

This works in VS2012. Example:
#include <cstdlib>
#include <iostream>
#include <omp.h>

int main()
{
    omp_set_nested(1);
    #pragma omp parallel num_threads(2)
    {
        int threadID1 = omp_get_thread_num();
        #pragma omp parallel num_threads(4)
        {
            int threadID2 = omp_get_thread_num();
            #pragma omp critical
            {
                std::cout << "tID1: " << threadID1 << std::endl;
                std::cout << "tID2: " << threadID2 << std::endl;
                std::cout << std::endl;
            }
        }
    }
    return EXIT_SUCCESS;
}
Output:
tID1: 0
tID2: 0
tID1: 0
tID2: 2
tID1: 0
tID2: 1
tID1: 0
tID2: 3
tID1: 1
tID2: 0
tID1: 1
tID2: 1
tID1: 1
tID2: 2
tID1: 1
tID2: 3

It is possible to set the number of threads on a loop with this:
#pragma omp parallel for num_threads(variable)
See also this post: openmp difference between num_threads vs. omp_set_num_threads vs OMP_NUM_THREADS
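Putting the pieces together for the structure in the question, a minimal sketch could look like this (assuming the runtime supports nested parallelism; if it does not, each inner region simply runs with one thread). omp_get_ancestor_thread_num(1) reports which outer thread forked the inner team:
#include <cstdio>
#include <omp.h>

int main()
{
    omp_set_nested(1); // enable nested parallelism (newer OpenMP versions prefer omp_set_max_active_levels)
    #pragma omp parallel num_threads(2) // 2 threads in the outer region
    {
        printf("Thread %d executes the outer parallel region\n", omp_get_thread_num());
        #pragma omp parallel for num_threads(4) // each outer thread forks a 4-thread inner team
        for (int i = 0; i < 8; i++)
            printf("outer %d / inner %d handles i = %d\n",
                   omp_get_ancestor_thread_num(1), omp_get_thread_num(), i);
    }
    return 0;
}
The num_threads clauses keep the thread counts local to each region instead of relying on omp_set_num_threads calls made inside the outer region.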

Related

OpenMP with good performance in C function, but not in C++ class method

I have been trying to port code for a university-related project from C to C++ while also adapting it to an object-oriented paradigm. The original code uses OpenMP pragmas to create a parallel variant of a function that calculates certain values and matrices according to input given by the user.
In C, I wrote a function that returns a struct type I defined myself, as shown below. (A note: the DoubVet1D/2D functions basically call calloc to allocate arrays of the size given by their arguments.)
SaidaEstacionario getDDParallel(DadosEntrada e){
SaidaEstacionario dd;
/* Fill the vectors with the domain data */
int J = 0;
for (int i = 0; i < e.numReg; i++)
J += e.partReg[i];
int N = e.ordQuad;
dd.dimFi = J + 1;
//NNR = e.NNR;
//double Stop = e.erro;
//TipoContorno = e.CCETipo + e.CCDTipo;
double *St = DoubVet1D(J);
double *Ss = DoubVet1D(J);
double *Q = DoubVet1D(J);
double *h = DoubVet1D(J);
double *S = DoubVet1D(J);
//double *Fi = DoubVet1D(J + 1);
//double *x = DoubVet1D(J + 1);
//double *TaxaAbR = DoubVet1D(e.numReg);
double **psiOld = DoubVet2D(e.ordQuad, J + 1);
double **psi = DoubVet2D(e.ordQuad, J + 1);
double tmpErro = 0;
double erro = 0;
int k = 0;
dd.fi = DoubVet1D(J + 1);
dd.x = DoubVet1D(J + 1);
dd.x[0] = 0;
for (int i = 0; i < e.numReg; i++)
{
double temp = e.tamReg[i] / e.partReg[i];
for (int j = 0; j < e.partReg[i]; j++)
{
h[k] = temp;
Ss[k] = e.ssZon[e.zonReg[i]];
St[k] = e.stZon[e.zonReg[i]];
Q[k] = e.fontReg[i];
dd.x[k + 1] = dd.x[k] + temp;
k++;
}
}
/* Setting the boundary condition */
for (int n = 0; n < e.ordQuad/2; n++)
{
if(e.cce < 0){
psi[n][0] = 0;
psiOld[n][0] = psi[n][0];
}
else{
psi[n][0] = e.cce;
psiOld[n][0] = psi[n][0];
}
if(e.ccd < 0){
psi[e.ordQuad/2 + n][J] = 0;
psiOld[e.ordQuad/2 + n][0] = psi[e.ordQuad/2 + n][0];
}
else{
psi[e.ordQuad/2 + n][J] = e.ccd;
psiOld[e.ordQuad/2 + n][0] = psi[e.ordQuad/2 + n][0];
}
}
/* Obtaining the Gauss-Legendre quadratures */
PLegendre pl = GaussLegendreAbsPes(e.ordQuad);
double *Mi = pl.mi;
double *W = pl.w;
//#############################################
double start = omp_get_wtime();
dd.numInter = 0;
do{
dd.numInter++;
/* Computing the scattering source */
#pragma omp parallel for
for (int j = 0; j < J; j++)
{
S[j] = 0;
for (int n = 0; n < N; n++)
S[j] += 0.25 * Ss[j] * (psi[n][j + 1] + psi[n][j]) * W[n];
}
/* Right/left sweeps */
#pragma omp sections
{
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Sweep to the right */
psiOld[m][j + 1] = psi[m][j + 1];
psi[m][j + 1] = ((Mi[m] / h[j] - 0.5 * St[j]) * psi[m][j] + S[j] + Q[j]) / (Mi[m] / h[j] + 0.5 * St[j]);
}
}
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Sweep to the left */
psiOld[m + N / 2][J - 1 - j] = psi[m + N / 2][J - 1 - j];
psi[m + N / 2][J - 1 - j] = ((Mi[m] / h[J - 1 - j] - 0.5 * St[J - 1 - j]) * psi[m + N / 2][J - j] + S[J - 1 - j] + Q[J - 1 - j]) / (Mi[m] / h[J - 1 - j] + 0.5 * St[J - 1 - j]);
}
}
} // There is an implicit barrier here
/* Setting the boundary condition */
if(e.cce < 0 || e.ccd < 0){
#pragma omp parallel for
for (int n = 0; n < e.ordQuad/2; n++)
{
if(e.cce < 0){
psiOld[n][0] = psi[n][0];
psi[n][0] = psi[e.ordQuad/2 + n][ 0];
}
if(e.ccd < 0){
psiOld[e.ordQuad/2 + n][0] = psi[n][0];
psi[e.ordQuad/2 + n][J] = psi[n][J];
}
}
}
/* Computing the error for the stopping criterion */
tmpErro = 0;
erro = 0;
for (int m = 0; m < N; m++)
for (int j = 0; j < J; j++){
tmpErro = Abs(psi[m][j] - psiOld[m][j])/psi[m][j];
if(tmpErro > erro) erro = tmpErro;
}
}while(erro > e.erro);
dd.tempoVar = omp_get_wtime() - start;
dd.psi = DoubVet2D(e.ordQuad, J + 1);
/* Compute the scalar flux */
#pragma omp parallel for
for (int j = 0; j <= J; j++){
dd.fi[j] = 0;
for (int m = 0; m < N; m++){
dd.fi[j] += psi[m][j] * W[m];
dd.psi[m][j] = psi[m][j];
}
}
/* Compute the absorption rate */
int i = 0;
dd.taxaAbsorRegiao = DoubVet1D(e.numReg);
dd.taxaAbsorTotal = 0;
for (int r = 0; r < e.numReg; r++)
{
dd.taxaAbsorRegiao[r] = 0;
#pragma omp parallel for
for (int j = i; j < i + e.partReg[r]; j++)
dd.taxaAbsorRegiao[r] += 0.5 * (dd.fi[j] + dd.fi[j + 1]);
dd.taxaAbsorRegiao[r] *= (St[i] - Ss[i]) * h[i];
dd.taxaAbsorTotal += dd.taxaAbsorRegiao[r];
i += e.partReg[r];
}
return dd;
}
/// Runs the calculations of the stationary Diamond Difference (DD) or DG method, parallel version
SaidaEstacionario getDgDDParallel(DadosEntrada e){
SaidaEstacionario saida;
/* Fill the vectors with the domain data */
int J = 0;
for (int i = 0; i < e.numReg; i++)
J += e.partReg[i];
int N = e.ordQuad;
saida.dimFi = J + 1;
//NNR = e.NNR;
//double Stop = e.erro;
//TipoContorno = e.CCETipo + e.CCDTipo;
double *St = DoubVet1D(J);
double *Ss = DoubVet1D(J);
double *Q = DoubVet1D(J);
double *h = DoubVet1D(J);
double *S = DoubVet1D(J);
//double *Fi = DoubVet1D(J + 1);
//double *x = DoubVet1D(J + 1);
//double *TaxaAbR = DoubVet1D(e.numReg);
double **psiOld = DoubVet2D(e.ordQuad, J + 1);
double **psi = DoubVet2D(e.ordQuad, J + 1);
double tmpErro = 0;
double erro = 0;
int k = 0;
saida.fi = DoubVet1D(J + 1);
saida.x = DoubVet1D(J + 1);
saida.x[0] = 0;
for (int i = 0; i < e.numReg; i++)
{
double temp = e.tamReg[i] / e.partReg[i];
for (int j = 0; j < e.partReg[i]; j++)
{
h[k] = temp;
Ss[k] = e.ssZon[e.zonReg[i]];
St[k] = e.stZon[e.zonReg[i]];
Q[k] = e.fontReg[i];
saida.x[k + 1] = saida.x[k] + temp;
k++;
}
}
/* Setting the boundary condition */
for (int n = 0; n < e.ordQuad/2; n++)
{
if(e.cce < 0){
psi[n][0] = 0;
psiOld[n][0] = psi[n][0];
}
else{
psi[n][0] = e.cce;
psiOld[n][0] = psi[n][0];
}
if(e.ccd < 0){
psi[e.ordQuad/2 + n][J] = 0;
psiOld[e.ordQuad/2 + n][0] = psi[e.ordQuad/2 + n][0];
}
else{
psi[e.ordQuad/2 + n][J] = e.ccd;
psiOld[e.ordQuad/2 + n][0] = psi[e.ordQuad/2 + n][0];
}
}
/* Obtaining the Gauss-Legendre quadratures */
PLegendre pl = GaussLegendreAbsPes(e.ordQuad);
double *Mi = pl.mi;
double *W = pl.w;
//#############################################
double start;
start = omp_get_wtime();
saida.numInter = 0;
do{
saida.numInter++;
/* Computing the scattering source */
#pragma omp parallel for
for (int j = 0; j < J; j++)
{
S[j] = 0;
for (int n = 0; n < N; n++)
S[j] += 0.25 * Ss[j] * (psi[n][j + 1] + psi[n][j]) * W[n];
}
/* Right/left sweeps */
#pragma omp sections
{
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Sweep to the right */
psiOld[m][j + 1] = psi[m][j + 1];
psi[m][j + 1] = ((Mi[m] / h[j] - 0.5 * St[j] * (1 - e.teta)) * psi[m][j] + S[j] + Q[j]) / (Mi[m] / h[j] + 0.5 * St[j] * (1 + e.teta));
}
}
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Sweep to the left */
psiOld[m + N / 2][J - 1 - j] = psi[m + N / 2][J - 1 - j];
psi[m + N / 2][J - 1 - j] = ((Mi[m] / h[J - 1 - j] - 0.5 * St[J - 1 - j] * (1 - e.teta)) * psi[m + N / 2][J - j] + S[J - 1 - j] + Q[J - 1 - j]) / (Mi[m] / h[J - 1 - j] + 0.5 * St[J - 1 - j] * (1 + e.teta));
}
}
} // There is an implicit barrier here
/* Setting the boundary condition */
if(e.cce < 0 || e.ccd < 0){
#pragma omp parallel for
for (int n = 0; n < e.ordQuad/2; n++)
{
if(e.cce < 0){
psiOld[n][0] = psi[n][0];
psi[n][0] = psi[e.ordQuad/2 + n][ 0];
}
if(e.ccd < 0){
psiOld[e.ordQuad/2 + n][0] = psi[n][0];
psi[e.ordQuad/2 + n][J] = psi[n][J];
}
}
}
/* Computing the error for the stopping criterion */
tmpErro = 0;
erro = 0;
for (int m = 0; m < N; m++)
for (int j = 0; j < J; j++){
tmpErro = Abs(psi[m][j] - psiOld[m][j])/psi[m][j];
if(tmpErro > erro) erro = tmpErro;
}
}while(erro > e.erro);
saida.tempoVar = omp_get_wtime() - start;
saida.psi = DoubVet2D(e.ordQuad, J + 1);
/* Compute the scalar flux */
#pragma omp parallel for
for (int j = 0; j <= J; j++){
saida.fi[j] = 0;
for (int m = 0; m < N; m++){
saida.fi[j] += psi[m][j] * W[m];
saida.psi[m][j] = psi[m][j];
}
}
/* Compute the absorption rate */
int i = 0;
saida.taxaAbsorRegiao = DoubVet1D(e.numReg);
saida.taxaAbsorTotal = 0;
for (int r = 0; r < e.numReg; r++)
{
saida.taxaAbsorRegiao[r] = 0;
#pragma omp parallel for
for (int j = i; j < i + e.partReg[r]; j++)
saida.taxaAbsorRegiao[r] += 0.5 * (saida.fi[j] + saida.fi[j + 1]);
saida.taxaAbsorRegiao[r] *= (St[i] - Ss[i]) * h[i];
saida.taxaAbsorTotal += saida.taxaAbsorRegiao[r];
i += e.partReg[r];
}
return saida;
}
As you can see, I only use #pragma omp parallel for and #pragma omp sections. In C++, as part of the object orientation, I turned the original SaidaEstacionario struct into two classes called Method and StationaryMethod (the latter inherits from Method), and implemented the function above as a method of a subclass at the lower end of the inheritance chain, while also encapsulating the variables and creating getters/setters. The end result was a StationaryDD class, which inherits from StationaryMethod, with the following method:
void StationaryDD::calculateParallel(){
/* Fill the vectors with the domain data */
int J = 0;
for (int i = 0; i < this->inputData.getNumReg(); i++)
J += this->inputData.getPartReg()[i];
int N = this->inputData.getQuadOrder();
this->dimFi = J + 1;
//NNR = this->inputData.NNR;
//double Stop = this->inputData.erro;
//TipoContorno = this->inputData.getCcl()Tipo + this->inputData.getCcr()Tipo;
double *St = DoubVet1D(J);
double *Ss = DoubVet1D(J);
double *Q = DoubVet1D(J);
double *h = DoubVet1D(J);
double *S = DoubVet1D(J);
//double *Fi = DoubVet1D(J + 1);
//double *x = DoubVet1D(J + 1);
//double *TaxaAbR = DoubVet1D(this->inputData.numReg);
double **psiOld = DoubVet2D(this->inputData.getQuadOrder(), J + 1);
double **psi = DoubVet2D(this->inputData.getQuadOrder(), J + 1);
double tmpErro = 0;
double erro = 0;
int k = 0;
this->fi = DoubVet1D(J + 1);
this->x = DoubVet1D(J + 1);
this->x[0] = 0;
for (int i = 0; i < this->inputData.getNumReg(); i++)
{
double temp = this->inputData.getSizeReg()[i] / this->inputData.getPartReg()[i];
for (int j = 0; j < this->inputData.getPartReg()[i]; j++)
{
h[k] = temp;
Ss[k] = this->inputData.getSsZon()[this->inputData.getZonReg()[i]];
St[k] = this->inputData.getStZon()[this->inputData.getZonReg()[i]];
Q[k] = this->inputData.getSrcReg()[i];
this->x[k + 1] = this->x[k] + temp;
k++;
}
}
/* Setting the boundary condition */
for (int n = 0; n < this->inputData.getQuadOrder()/2; n++)
{
if(this->inputData.getCcl() < 0){
psi[n][0] = 0;
psiOld[n][0] = psi[n][0];
}
else{
psi[n][0] = this->inputData.getCcl();
psiOld[n][0] = psi[n][0];
}
if(this->inputData.getCcr() < 0){
psi[this->inputData.getQuadOrder()/2 + n][J] = 0;
psiOld[this->inputData.getQuadOrder()/2 + n][0] = psi[this->inputData.getQuadOrder()/2 + n][0];
}
else{
psi[this->inputData.getQuadOrder()/2 + n][J] = this->inputData.getCcr();
psiOld[this->inputData.getQuadOrder()/2 + n][0] = psi[this->inputData.getQuadOrder()/2 + n][0];
}
}
/* Obtaining the Gauss-Legendre quadratures */
PLegendre pl = PLegendre(this->inputData.getQuadOrder());
double *Mi = pl.getMi();
double *W = pl.getW();
//#############################################
double start = omp_get_wtime();
this->numIter = 0;
do{
this->numIter++;
/* Computing the scattering source */
#pragma omp parallel for
for (int j = 0; j < J; j++)
{
printf("T%d: j = %d\n", omp_get_thread_num(), j);
S[j] = 0;
for (int n = 0; n < N; n++)
S[j] += 0.25 * Ss[j] * (psi[n][j + 1] + psi[n][j]) * W[n];
}
/* Right/left sweeps */
#pragma omp sections
{
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Sweep to the right */
psiOld[m][j + 1] = psi[m][j + 1];
psi[m][j + 1] = ((Mi[m] / h[j] - 0.5 * St[j]) * psi[m][j] + S[j] + Q[j]) / (Mi[m] / h[j] + 0.5 * St[j]);
}
}
#pragma omp section
{
#pragma omp parallel for
for (int m = 0; m < N / 2; m++)
for (int j = 0; j < J; j++)
{
/* Sweep to the left */
psiOld[m + N / 2][J - 1 - j] = psi[m + N / 2][J - 1 - j];
psi[m + N / 2][J - 1 - j] = ((Mi[m] / h[J - 1 - j] - 0.5 * St[J - 1 - j]) * psi[m + N / 2][J - j] + S[J - 1 - j] + Q[J - 1 - j]) / (Mi[m] / h[J - 1 - j] + 0.5 * St[J - 1 - j]);
}
}
} // There is an implicit barrier here
/* Setting the boundary condition */
if(this->inputData.getCcl() < 0 || this->inputData.getCcr() < 0){
#pragma omp parallel for
for (int n = 0; n < this->inputData.getQuadOrder()/2; n++)
{
if(this->inputData.getCcl() < 0){
psiOld[n][0] = psi[n][0];
psi[n][0] = psi[this->inputData.getQuadOrder()/2 + n][ 0];
}
if(this->inputData.getCcr() < 0){
psiOld[this->inputData.getQuadOrder()/2 + n][0] = psi[n][0];
psi[this->inputData.getQuadOrder()/2 + n][J] = psi[n][J];
}
}
}
/* Computing the error for the stopping criterion */
tmpErro = 0;
erro = 0;
for (int m = 0; m < N; m++)
for (int j = 0; j < J; j++){
tmpErro = fabs(psi[m][j] - psiOld[m][j])/psi[m][j];
if(tmpErro > erro) erro = tmpErro;
}
}while(erro > this->inputData.getE());
this->tempoVar = omp_get_wtime() - start;
this->psi = DoubVet2D(this->inputData.getQuadOrder(), J + 1);
/* Compute the scalar flux */
#pragma omp parallel for
for (int j = 0; j <= J; j++){
this->fi[j] = 0;
for (int m = 0; m < N; m++){
this->fi[j] += psi[m][j] * W[m];
this->psi[m][j] = psi[m][j];
}
}
/* Compute the absorption rate */
int i = 0;
this->taxaAbsorRegiao = DoubVet1D(this->inputData.getNumReg());
this->taxaAbsorTotal = 0;
for (int r = 0; r < this->inputData.getNumReg(); r++)
{
this->taxaAbsorRegiao[r] = 0;
#pragma omp parallel for
for (int j = i; j < i + this->inputData.getPartReg()[r]; j++)
this->taxaAbsorRegiao[r] += 0.5 * (this->fi[j] + this->fi[j + 1]);
this->taxaAbsorRegiao[r] *= (St[i] - Ss[i]) * h[i];
this->taxaAbsorTotal += this->taxaAbsorRegiao[r];
i += this->inputData.getPartReg()[r];
}
}
This basically does the same as the function above, but instead of returning the struct, it stores all the data in the object itself.
However, when comparing the time spent in each version, the method implemented in the C++ class takes far longer than the original C function to execute. It is so bad that, where the original C function saved a fair bit of time over a non-parallelized variant (that is, one without the OpenMP pragmas), this one is much slower than even the serial version, taking up to a minute to finish a calculation that is otherwise done in a fraction of a second. At first I thought the encapsulation could be slowing things down, but simply making everything public and accessing the attributes directly instead of through getters did not help. Would anyone have any insight into why this could be happening?
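One difference is visible in the code itself, though it is not a definitive diagnosis: calculateParallel prints from inside its first parallel loop (the printf("T%d: j = %d\n", ...) line), which the C function does not do, and console I/O inside a hot parallel loop serializes the threads on the stream lock. A self-contained sketch (not the project's code) to measure that effect alone:
#include <cstdio>
#include <omp.h>

int main()
{
    const int n = 100000;
    double sink = 0;

    double t0 = omp_get_wtime();
    #pragma omp parallel for reduction(+:sink)
    for (int j = 0; j < n; j++) {
        printf("T%d: j = %d\n", omp_get_thread_num(), j); // mirrors the debug line in calculateParallel
        sink += j * 0.25;
    }
    double t1 = omp_get_wtime();

    #pragma omp parallel for reduction(+:sink)
    for (int j = 0; j < n; j++)
        sink += j * 0.25; // the same work without I/O
    double t2 = omp_get_wtime();

    printf("with printf: %f s, without: %f s (sink=%f)\n", t1 - t0, t2 - t1, sink);
    return 0;
}
If removing the debug printf closes the gap, the class design itself was never the problem.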

Why is the OpenMP version slower than the serial one?

It's a follow-up question to this one
Now I have the code:
#include <iostream>
#include <cmath>
#include <omp.h>
#define max(a, b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 2000;
const int p = 4;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
double maxdiffA[p + 1];
int icol, jrow;
int main() {
    omp_set_num_threads(p);
    double h = 1.0 / (n + 1);
    double start = omp_get_wtime();
    #pragma omp parallel for private(icol) shared(x, y, v, _new)
    for (icol = 0; icol <= n + 1; ++icol) {
        x[icol] = y[icol] = icol * h;
        _new[icol][0] = v[icol][0] = 6 - 2 * x[icol];
        _new[n + 1][icol] = v[n + 1][icol] = 4 - 2 * y[icol];
        _new[icol][n + 1] = v[icol][n + 1] = 3 - x[icol];
        _new[0][icol] = v[0][icol] = 6 - 3 * y[icol];
    }
    const double eps = 0.01;
    #pragma omp parallel private(icol, jrow) shared(_new, v, maxdiffA)
    {
        while (true) { //for [iters=1 to maxiters by 2]
            #pragma omp single
            for (int i = 0; i < p; i++) maxdiffA[i] = 0;
            #pragma omp for
            for (icol = 1; icol <= n; icol++)
                for (jrow = 1; jrow <= n; jrow++)
                    _new[icol][jrow] =
                        (v[icol - 1][jrow] + v[icol + 1][jrow] + v[icol][jrow - 1] + v[icol][jrow + 1]) / 4;
            #pragma omp for
            for (icol = 1; icol <= n; icol++)
                for (jrow = 1; jrow <= n; jrow++)
                    v[icol][jrow] = (_new[icol - 1][jrow] + _new[icol + 1][jrow] + _new[icol][jrow - 1] +
                                     _new[icol][jrow + 1]) / 4;
            #pragma omp for
            for (icol = 1; icol <= n; icol++)
                for (jrow = 1; jrow <= n; jrow++)
                    maxdiffA[omp_get_thread_num()] = max(maxdiffA[omp_get_thread_num()],
                                                         fabs(_new[icol][jrow] - v[icol][jrow]));
            #pragma omp barrier
            double maxdiff = 0.0;
            for (int k = 0; k < p; ++k) {
                maxdiff = max(maxdiff, maxdiffA[k]);
            }
            if (maxdiff < eps)
                break;
            #pragma omp barrier
            //#pragma omp single
            //std::cout << maxdiff << std::endl;
        }
    }
    double end = omp_get_wtime();
    printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
    return 0;
}
But why does it run 2-3 times slower (32 sec vs 18 sec) than the serial analog:
#include <iostream>
#include <cmath>
#include <omp.h>
#define max(a,b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 2000;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
int main() {
    double h = 1.0 / (n + 1);
    double start = omp_get_wtime();
    for (int i = 0; i <= n + 1; ++i) {
        x[i] = y[i] = i * h;
        _new[i][0] = v[i][0] = 6 - 2 * x[i];
        _new[n + 1][i] = v[n + 1][i] = 4 - 2 * y[i];
        _new[i][n + 1] = v[i][n + 1] = 3 - x[i];
        _new[0][i] = v[0][i] = 6 - 3 * y[i];
    }
    const double eps = 0.01;
    while (true) { //for [iters=1 to maxiters by 2]
        double maxdiff = 0.0;
        for (int i = 1; i <= n; i++)
            for (int j = 1; j <= n; j++)
                _new[i][j] = (v[i-1][j] + v[i+1][j] + v[i][j-1] + v[i][j+1]) / 4;
        for (int i = 1; i <= n; i++)
            for (int j = 1; j <= n; j++)
                v[i][j] = (_new[i-1][j] + _new[i+1][j] + _new[i][j-1] + _new[i][j+1]) / 4;
        for (int i = 1; i <= n; i++)
            for (int j = 1; j <= n; j++)
                maxdiff = max(maxdiff, fabs(_new[i][j] - v[i][j]));
        if (maxdiff < eps) break;
        std::cout << maxdiff << std::endl;
    }
    double end = omp_get_wtime();
    printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
    return 0;
}
Also interesting: it runs in the SAME time as a version (I can post it here if you ask) that looks like this:
while(true){ //106 iterations here!!!
    #pragma omp parallel for
    for(...)
    #pragma omp parallel for
    for(...)
    #pragma omp parallel for
    for(...)
}
But I thought that what makes OpenMP code slow is spawning threads inside the while loop 106 times... But no! Then perhaps threads simultaneously write to the same array cells... but where does that happen? I don't see it; could you show me, please?
Maybe there are too many barriers? But the lecturer told me to implement the code like this and "analyse it". Maybe the answer is "the Jacobi algorithm isn't meant to run well in parallel"? Or is it just my lame coding?
So the root of the evil was
max(maxdiffA[w], fabs(_new[icol][jrow] - v[icol][jrow]))
because it is
#define max(a, b) (a)>(b)?(a):(b)
It probably creates TOO much branching ('if's). Without it, the parallel version runs 8 times faster, loading the CPU at 68% instead of 99%.
The strange thing: the same "max" doesn't affect the serial version.
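For reference, that macro is risky beyond performance: it evaluates both arguments twice and has no outer parentheses, so it mis-parses under surrounding operators. A small self-contained demo (whether branching or double evaluation caused the slowdown here would still need profiling):
#include <algorithm>
#include <cstdio>

#define max_macro(a, b) (a)>(b)?(a):(b)

int main()
{
    int x = 3, y = 1;
    // The unparenthesized macro binds unexpectedly under an outer operator:
    int bad  = 2 * max_macro(x, y); // expands to 2*(x)>(y)?(x):(y), i.e. (2*x > y) ? x : y  ==> 3
    int good = 2 * std::max(x, y);  // ==> 6
    printf("macro: %d, std::max: %d\n", bad, good);
    return 0;
}
In the Jacobi loop the macro also means fabs(_new[icol][jrow] - v[icol][jrow]) is evaluated twice whenever it is the larger operand; std::max avoids both problems.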
I am writing to make you aware of a few situations. They are too long for a comment, so I decided to write them as an answer.
Every time a thread is created, its creation takes some time. If your program's single-core running time is short, thread creation will make the multi-core time longer.
In addition, a barrier makes all your threads wait for the others, and any one of them can be slowed down by the CPU; even if most threads finish the job very quickly, the last one makes the total run time longer.
Try to run your program with bigger arrays, where the single-threaded time is around 2 minutes, then make your way to multi-core.
Then wrap your main code in a normal loop to run it a few times and print the timing of each run, as sketched below. The first run of the loop might be slow because of loading libraries, but the following runs should be faster and demonstrate the speed-up.
If the above suggestions do not give a result, your code needs more editing.
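A rough sketch of that timing harness; the work function here is a hypothetical stand-in, not the Jacobi code:
#include <cstdio>
#include <omp.h>

// Hypothetical stand-in for the computation being timed.
static double work()
{
    double s = 0;
    #pragma omp parallel for reduction(+:s)
    for (int i = 0; i < 50000000; i++)
        s += i * 1e-9;
    return s;
}

int main()
{
    // Run several times: the first iteration may be slower (thread pool
    // creation, page faults); later runs show the steady-state timing.
    for (int rep = 0; rep < 5; rep++) {
        double t0 = omp_get_wtime();
        double r = work();
        printf("run %d: %.3f s (result %.3f)\n", rep, omp_get_wtime() - t0, r);
    }
    return 0;
}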
EDIT:
To the downvoters: if you don't like a post, please at least be polite and leave a comment. Better yet, give your own answer and be helpful to the community.

Removing single construct results in incorrect execution

This code works as expected:
#include <iostream>
#include <cmath>
#include <omp.h>
//https://stackoverflow.com/questions/37970024/jacobi-relaxation-in-mpi
#define max(a, b) (a)>(b)?(a):(b)
const int m = 2001;
const int n = 1500;
const int p = 4;
double v[m + 2][m + 2];
double x[m + 2];
double y[m + 2];
double _new[m + 2][m + 2];
double maxdiffA[p + 1];
int icol, jrow;
int main() {
omp_set_num_threads(p);
double h = 1.0 / (n + 1);
double start = omp_get_wtime();
#pragma omp parallel for private(icol) shared(x, y, v, _new)
for (icol = 0; icol <= n + 1; ++icol) {
x[icol] = y[icol] = icol * h;
_new[icol][0] = v[icol][0] = 6 - 2 * x[icol];
_new[n + 1][icol] = v[n + 1][icol] = 4 - 2 * y[icol];
_new[icol][n + 1] = v[icol][n + 1] = 3 - x[icol];
_new[0][icol] = v[0][icol] = 6 - 3 * y[icol];
}
const double eps = 0.01;
#pragma omp parallel private(icol, jrow) shared(_new, v, maxdiffA)
{
while (true) { //for [iters=1 to maxiters by 2]
#pragma omp single
for (int i = 0; i < p; i++) maxdiffA[i] = 0;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
_new[icol][jrow] =
(v[icol - 1][jrow] + v[icol + 1][jrow] + v[icol][jrow - 1] + v[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
v[icol][jrow] = (_new[icol - 1][jrow] + _new[icol + 1][jrow] + _new[icol][jrow - 1] +
_new[icol][jrow + 1]) / 4;
#pragma omp for
for (icol = 1; icol <= n; icol++)
for (jrow = 1; jrow <= n; jrow++)
maxdiffA[omp_get_thread_num()] = max(maxdiffA[omp_get_thread_num()],
fabs(_new[icol][jrow] - v[icol][jrow]));
#pragma omp barrier
double maxdiff = 0.0;
for (int k = 0; k < p; ++k) {
maxdiff = max(maxdiff, maxdiffA[k]);
}
if (maxdiff < eps)
break;
#pragma omp single
std::cout << maxdiff << std::endl;
}
}
double end = omp_get_wtime();
printf("start = %.16lf\nend = %.16lf\ndiff = %.16lf\n", start, end, end - start);
return 0;
}
It outputs
1.12454
<106 iterations here>
0.0100436
start = 1527366091.3069999217987061
end = 1527366110.8169999122619629
diff = 19.5099999904632568
But if I remove
#pragma omp single
std::cout << maxdiff << std::endl;
the program either seems to run forever, or I get
start = 1527368219.8810000419616699
end = 1527368220.5710000991821289
diff = 0.6900000572204590
Why is that so?
You overwrite maxdiffA at the beginning of the while loop - this must be isolated from reading maxdiffA at the end, where the exit condition is checked. Otherwise one thread may reset the values before another thread has had the chance to read them. The omp single construct at the end of the loop acts as isolation thanks to the implicit barrier at the end of omp single constructs. However, there is no barrier at the beginning of an omp single construct, and "a whole lot of code" is not a safe barrier either. So if there is no valid implicit barrier, you must protect entry to the reset code with a #pragma omp barrier.
That said, I highly recommend restructuring the code to have a shared exit condition that is also computed in a single construct. That makes it clearer that all threads exit the while loop at the same time; otherwise the code is ill-defined.
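A sketch of that restructuring, reusing the question's variables and eliding the three worksharing loops: one thread computes the condition into a shared flag, and the implicit barrier at the end of the single construct both publishes done and isolates the read of maxdiffA from the reset at the top of the next iteration:
bool done = false;
#pragma omp parallel private(icol, jrow) shared(_new, v, maxdiffA, done)
{
    while (true) {
        #pragma omp single
        for (int i = 0; i < p; i++) maxdiffA[i] = 0;

        // ... the three '#pragma omp for' loops exactly as in the question ...

        #pragma omp single
        {
            double maxdiff = 0.0;
            for (int k = 0; k < p; ++k)
                if (maxdiffA[k] > maxdiff) maxdiff = maxdiffA[k];
            done = (maxdiff < eps); // one thread decides for everyone
        } // implicit barrier: all threads see the same 'done'
        if (done)
            break; // every thread leaves the loop in the same iteration
    }
}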

Optimize outer loop with OpenMP and a reduction

I struggle a bit with a function. The calculation is wrong if I try to parallelize the outer loop with a
#pragma omp parallel reduction(+:det).
Can someone show me how to solve it and why it is failing?
// template<class T> using vector2D = std::vector<std::vector<T>>;
float Det(vector2DF &a, int n)
{
    vector2DF m(n - 1, vector1DF(n - 1, 0));
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    for (int i = 0; i < n; i++)
    {
        int l = 0;
        #pragma omp parallel for private(l)
        for (int j = 1; j < n; j++)
        {
            l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}
If you parallelize the outer loop, there is a race condition on this line:
m[j - 1][l] = a[j][k];
Also you likely want a parallel for reduction instead of just a parallel reduction.
The issue is that m is shared, even though that is not necessary, given that it is completely overwritten in the inner loop. Always declare variables as locally as possible; this avoids issues with wrongly shared variables, e.g.:
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    #pragma omp parallel for reduction(+:det)
    for (int i = 0; i < n; i++)
    {
        vector2DF m(n - 1, vector1DF(n - 1, 0));
        for (int j = 1; j < n; j++)
        {
            int l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k];
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}
Now that is correct, but since m can be expensive to allocate, performance could benefit from not doing so in each and every iteration. This can be done by splitting the parallel and for directives like so:
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    #pragma omp parallel reduction(+:det)
    {
        vector2DF m(n - 1, vector1DF(n - 1, 0));
        #pragma omp for
        for (int i = 0; i < n; i++)
        {
            for (int j = 1; j < n; j++)
            {
                int l = 0;
                for (int k = 0; k < n; k++)
                {
                    if (k == i) continue;
                    m[j - 1][l] = a[j][k];
                    l++;
                }
            }
            det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
        }
    }
    return det;
}
Now you could also just declare m as firstprivate, but that would assume that the copy constructor makes a completely independent deep copy, and it would thus make the code more difficult to reason about.
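For completeness, a sketch of that firstprivate variant, using the same helpers as above; it works here because std::vector's copy constructor does make an independent deep copy, but that property is exactly what you would have to convince yourself of:
float Det(vector2DF &a, int n)
{
    if (n == 1) return a[0][0];
    if (n == 2) return a[0][0] * a[1][1] - a[1][0] * a[0][1];
    float det = 0;
    vector2DF m(n - 1, vector1DF(n - 1, 0)); // copied once per thread by firstprivate
    #pragma omp parallel for reduction(+:det) firstprivate(m)
    for (int i = 0; i < n; i++)
    {
        for (int j = 1; j < n; j++)
        {
            int l = 0;
            for (int k = 0; k < n; k++)
            {
                if (k == i) continue;
                m[j - 1][l] = a[j][k]; // each thread fills its own copy of m
                l++;
            }
        }
        det += std::pow(-1.0, 1.0 + i + 1.0) * a[0][i] * Det(m, n - 1);
    }
    return det;
}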
Please be aware that you should always include the expected output, the actual output, and a minimal, complete and verifiable example.

Hourglass sum in 2D array

We are given a 6x6 2D array in which we have to find the largest hourglass sum.
For example, if we create an hourglass using the number 1 within an array full of zeros, it may look like this:
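An hourglass occupies seven cells of a 3x3 block in this pattern (it is the set of cells every solution below sums):
1 1 1
  1
1 1 1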
The sum of an hourglass is the sum of all the numbers within it. The sums for the hourglasses above are 7, 4, and 2, respectively.
I had written code for it as follows. It is basically a competitive programming question, and as I am new to the field, I wrote the code with very bad complexity; so much so that the program could not produce the desired output within the stipulated time. Below is my code:
int main() {
    vector< vector<int> > arr(6, vector<int>(6));
    for (int arr_i = 0; arr_i < 6; arr_i++)
    {
        for (int arr_j = 0; arr_j < 6; arr_j++)
        {
            cin >> arr[arr_i][arr_j];
        }
    } // numbers input
    int temp = 0;      // temporary sum-storing variable
    int sum = INT_MIN; // largest-sum-storing variable
    for (int i = 0; i + 2 < 6; i++) // check that at least 3 rows exist below
    {
        int c = 0; // starting column for the row-wise traversal
        while (c + 2 < 6) // three columns exist ahead of this index
        {
            int f = 0; // test-case variable
            while (f != 1)
            { // if the array does not meet the requirements, no need for more execution
                for (int j = c; j <= c + 2; j++)
                {
                    // 1st and 3rd row middle element is 0 and 2nd row is non 0:
                    // condition for the hourglass structure
                    if (((j - c) % 2 == 0 && arr[i+1][j] == 0) || ((j - c) % 2 == 1 && arr[i+1][j] != 0))
                        // storing the 3-element sub-array sum column-wise
                        temp += arr[i][j] + arr[i+1][j] + arr[i+2][j]; // sum storage
                    else
                        f = 1; // end traversing further on failure
                    if (sum < temp)
                        sum = temp;
                    f = 1; // exit condition
                } // for loop of the test variable
                temp = 0; // reset for the next sub-array
                c++; // begin traversal from one column further until the condition fails
            } // while loop of f
        } // while loop of c
    }
    cout << sum;
    return 0;
}
This is my implementation, which failed to finish within the time limit. Please suggest a better solution with regard to time complexity, and feel free to point out any mistakes on my side in understanding the problem. The question is from HackerRank.
Here is the link if you need it:
https://www.hackerrank.com/challenges/2d-array
The solution for your problem is:
#include <cstdio>
#include <iostream>
#include <climits>

int main() {
    int m[6][6];
    // Read 2D Matrix-Array
    for (int i = 0; i < 6; ++i) {
        for (int j = 0; j < 6; ++j) {
            std::cin >> m[i][j];
        }
    }
    // Compute the sum of hourglasses
    long temp_sum = 0, MaxSum = LONG_MIN;
    for (int i = 0; i < 6; ++i) {
        for (int j = 0; j < 6; ++j) {
            if (j + 2 < 6 && i + 2 < 6) {
                temp_sum = m[i][j] + m[i][j + 1] + m[i][j + 2] + m[i + 1][j + 1] + m[i + 2][j] + m[i + 2][j + 1] + m[i + 2][j + 2];
                if (temp_sum >= MaxSum) {
                    MaxSum = temp_sum;
                }
            }
        }
    }
    fprintf(stderr, "Max Sum: %ld\n", MaxSum);
    return 0;
}
The algorithm is simple: it sums all the hourglasses starting from the upper left corner; the last 2 columns and 2 rows are not processed because they cannot form hourglasses.
The above code is almost correct, but it does not work for negative array elements. We should not initialize the max sum to 0, since an array of negative numbers may never reach a total sum >= 0. In that case, initializing the max sum to INT_MIN is the better option.
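For instance, if every element is -9, the correct answer is -9 * 7 = -63; a maximum initialized to 0 would never be updated and would wrongly report 0.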
I solved it in Python 3 and passed all the test cases on HackerRank.
The idea, in just 3 simple steps:
1. Extract all 16 of the 3x3 sub-matrices of the 6x6 matrix.
2. Get each sub-matrix's sum.
3. Find the max of all the sub-matrix sums.
I initialized max as -1000 to handle negative values; you could also initialize it with the minimum integer value.
# Complete the hourglassSum function below.
def hourglassSum(arr):
    max = -1000
    s = []
    sub_array = []
    for m in range(4):  # move vertically down the rows (012, 123, 234, 345), taking 3 values horizontally
        for col in range(4):
            for row in range(3):
                sub_array.append(arr[row + m][col:col + 3])
            s = sub_array  # extracting all 16 3x3 matrices
            # mask for the hourglass indices: [[1,1,1],[0,1,0],[1,1,1]]
            hour_sum = sum_list(s[0]) + s[1][1] + sum_list(s[2])
            if max < hour_sum:
                max = hour_sum
            sub_array = []
    return max

def sum_list(list1):
    total = 0
    for ele in range(0, len(list1)):
        total = total + list1[ele]
    return total
"""
Extra: Try replacing this in your Spyder for lesser lines of code
Instead of
Existing: without numpy
hour_sum = sum_list(s[0])+s[1][1]+sum_list(s[2])//Mask array for hour_glass index[[1,1,1],[0,1,1],[1,1,1]]
if (max<hour_sum):
max = hour_sum
With numpy:
import numpy as np
import numpy.ma as ma
hour_glass = ma.array(sub_array, mask=mask)
sum = hour_glass.data.sum()
"""
Swift 4 version:
func hourglassSum(arr matrix: [[Int]]) -> Int {
    let h = matrix.count
    if h < 3 {
        return 0
    }
    let w = matrix[0].count
    if w < 3 {
        return 0
    }
    var maxSum: Int?
    for i in 0 ..< h - 2 {
        for j in 0 ..< w - 2 {
            // Considering matrix[i][j] as the top-left cell of the hourglass.
            let sum = matrix[i][j] + matrix[i][j+1] + matrix[i][j+2] +
                matrix[i+1][j+1] +
                matrix[i+2][j] + matrix[i+2][j+1] + matrix[i+2][j+2]
            // If the previous max is less than the current sum, store the new sum in maxSum.
            if let maxValue = maxSum {
                maxSum = max(maxValue, sum)
            } else {
                maxSum = sum
            }
        }
    }
    return maxSum ?? 0
}
JavaScript (Node.js):
function hourglassSum(arr) {
    // -63 is the smallest possible hourglass sum (seven cells of -9 each)
    let count = -63;
    for (let i = 0; i <= 3; i++) {
        for (let j = 0; j <= 3; j++) {
            let sum = arr[i][j] + arr[i][j+1] + arr[i][j+2] + arr[i+1][j+1]
                + arr[i+2][j] + arr[i+2][j+1] + arr[i+2][j+2];
            if (sum > count) {
                count = sum;
            }
        }
    }
    return count;
}
Here is a Python 2 implementation of this algorithm.
arr = []
for _ in xrange(6):
    arr.append(map(int, raw_input().rstrip().split()))
maxSum = -99999999
for row in range(len(arr)):
    tempSum = 0
    for col in range(len(arr[row])):
        if col + 2 >= len(arr[row]) or row + 2 >= len(arr[col]):
            continue
        tempSum = arr[row][col] + arr[row][col+1] + arr[row][col+2] + arr[row+1][col+1] + arr[row+2][col] + arr[row+2][col+1] + arr[row+2][col+2]
        if maxSum < tempSum:
            maxSum = tempSum
print(maxSum)
Basic solution for Java:
static int hourglassSum(int[][] arr) {
    int sum = 0;
    for (int i = 2; i < 6; i++) {
        for (int j = 2; j < 6; j++) {
            int up = arr[i-2][j-2] + arr[i-2][j-1] + arr[i-2][j];
            int mid = arr[i-1][j-1];
            int down = arr[i][j-2] + arr[i][j-1] + arr[i][j];
            if (up + mid + down > sum) {
                sum = up + mid + down;
            }
        }
    }
    return sum;
}
A clean and fast Python solution:
def hourglassSum(arr):
    arr_sum = -5000
    tmp_sum = 0
    for i in range(0, 6 - 2):
        for j in range(0, 6 - 2):
            tmp_sum = arr[i][j] + arr[i][j+1] + arr[i][j+2] \
                + arr[i+1][j+1] \
                + arr[i+2][j] + arr[i+2][j+1] + arr[i+2][j+2]
            if arr_sum < tmp_sum:
                arr_sum = tmp_sum
    return arr_sum
This version just avoids the extra for-loop iterations:
int main()
{
    int arr[6][6], max = -1, sum;
    for (int arr_i = 0; arr_i < 6; arr_i++) {
        for (int arr_j = 0; arr_j < 6; arr_j++) {
            scanf("%d", &arr[arr_i][arr_j]);
            if (arr[arr_i][arr_j] < -9 || arr[arr_i][arr_j] > 9)
                exit(0);
        }
    }
    for (int arr_i = 0; arr_i < 4; arr_i++)
    {
        sum = 0;
        for (int arr_j = 0; arr_j < 4; arr_j++) {
            sum = arr[arr_i][arr_j] + arr[arr_i][arr_j+1] + arr[arr_i][arr_j+2] + arr[arr_i+1][arr_j+1] + arr[arr_i+2][arr_j] + arr[arr_i+2][arr_j+1] + arr[arr_i+2][arr_j+2];
            if (sum > max)
                max = sum;
        }
    }
    printf("%d", max);
    return 0;
}
int main() {
    vector< vector<int> > arr(6, vector<int>(6));
    for (int arr_i = 0; arr_i < 6; arr_i++) {
        for (int arr_j = 0; arr_j < 6; arr_j++) {
            cin >> arr[arr_i][arr_j];
        }
    }
    int sum = -100, temp;
    for (int arr_i = 0; arr_i < 4; arr_i++) {
        for (int arr_j = 0; arr_j < 4; arr_j++) {
            temp = (arr[arr_i][arr_j] + arr[arr_i][arr_j+1] + arr[arr_i][arr_j+2] + arr[arr_i+1][arr_j+1] + arr[arr_i+2][arr_j] + arr[arr_i+2][arr_j+1] + arr[arr_i+2][arr_j+2]);
            if (temp > sum)
                sum = temp;
        }
    }
    cout << sum;
    return 0;
}
def hourglassSum(arr)
  maxHourGlass = -82
  counter = 0
  for i in 1..4
    for j in 1..4
      acc = arr[i][j]
      counter = counter + 1
      for x in -1..1
        acc = acc + arr[i-1][j+x] + arr[i+1][j+x]
      end
      maxHourGlass = acc if acc > maxHourGlass
    end
  end
  maxHourGlass
end
This is written in C++14 and passes all nine test cases. I think someone could improve it to use more C++14 features.
int hourglassSum(vector<vector<int>> arr)
{
    if (arr.size() < 3 || arr[0].size() < 3)
        return -1;
    int rowSize = arr[0].size();
    int sum = -9 * 7; // smallest possible hourglass sum (7 cells of -9 each)
    for (int i = 1; i < arr.size() - 1; i++)
    {
        int tmp_sum = 0;
        for (int j = 1; j < rowSize - 1; j++)
        {
            tmp_sum = (arr[i - 1][j - 1] + arr[i - 1][j] + arr[i - 1][j + 1]);
            tmp_sum += (arr[i][j]);
            tmp_sum += (arr[i + 1][j - 1] + arr[i + 1][j] + arr[i + 1][j + 1]);
            sum = max(tmp_sum, sum);
        }
    }
    return sum;
}
class Solution {
    static void Main(string[] args) {
        int[][] arr = new int[6][];
        for (int i = 0; i < 6; i++) {
            arr[i] = Array.ConvertAll(Console.ReadLine().Split(' '), arrTemp => Convert.ToInt32(arrTemp));
        }
        int[] sum = new int[16];
        int j;
        int count = 0;
        for (int i = 0; i < 4; i++)
        {
            for (j = 0; j < 4; j++)
            {
                if (count < 16)
                {
                    sum[count] = arr[i][j] + arr[i][j+1] + arr[i][j+2] + arr[i+1][j+1] + arr[i+2][j] + arr[i+2][j+1] + arr[i+2][j+2];
                    count++;
                }
            }
        }
        int max = sum.Max();
        Console.WriteLine(max);
    }
}
The smallest possible hourglass sum is -63, since no element can be less than -9 and an hourglass has seven cells, i.e. -9 * 7 = -63; so the running maximum can safely be initialized to -63.
C#
int max_hourglass_sum = -63;
for (int i = 0; i < arr.Length - 2; i++) { // row
    for (int j = 0; j < arr.Length - 2; j++) { // column
        int current_hourglass_sum = arr[i][j] + arr[i][j+1] + arr[i][j+2] // 1st row
            + arr[i+1][j+1]                                              // 2nd row
            + arr[i+2][j] + arr[i+2][j+1] + arr[i+2][j+2];               // 3rd row
        max_hourglass_sum = Math.Max(max_hourglass_sum, current_hourglass_sum);
    }
}
static int hourglassSum(int[][] arr) {
    int result = int.MinValue;
    int rowLength = arr.Length;    // number of rows
    int colLength = arr[0].Length; // number of columns
    for (int i = 0; i < rowLength - 2; i++)
    {
        for (int j = 0; j < colLength - 2; j++)
        {
            int sum = arr[i][j] + arr[i][j+1] + arr[i][j+2] + arr[i+1][j+1]
                + arr[i+2][j] + arr[i+2][j+1] + arr[i+2][j+2];
            result = Math.Max(result, sum);
        }
    }
    return result;
}
function hourglassSum(arr) {
    const hourGlass = [];
    for (let i = 0; i < 4; i++) {
        for (let x = 0; x < 4; x++) {
            let hourGlassSumValue = arr[i][x] + arr[i][x + 1] + arr[i][x + 2]
                + arr[i + 1][x + 1]
                + arr[i + 2][x] + arr[i + 2][x + 1] + arr[i + 2][x + 2];
            hourGlass.push(hourGlassSumValue);
        }
    }
    return Math.max(...hourGlass);
}
console.log(hourglassSum(arr)); // arr is the 6x6 input matrix