OpenMP slower than serial code - C++
I am trying to parallelize a serial preconditioned conjugate gradient solver for a 3D fire simulation using OpenMP (Intel compiler), but the performance does not improve.
The grid dimension is 79x81x79 and the solver converges after 565 iterations. The serial code takes 3.39 seconds and the OpenMP version needs 3.86 seconds on an Intel i7 2600 (OS: openSUSE 13.1).
Please help me check the code. Thanks a lot.
// preconditioned conjugate gradient solver ...
void PCGSSolver::solveNew(const Array3D<double>& sn, const Array3D<double>& ae, const Array3D<double>& aw,
                          const Array3D<double>& as, const Array3D<double>& an, const Array3D<double>& at, const Array3D<double>& ab,
                          const Array3D<double>& ap, Array3D<double>& ptmp)
{
    std::size_t dimX = sn.getDimI();
    std::size_t dimY = sn.getDimJ();
    std::size_t dimZ = sn.getDimK();
    Array3D<double> p1(dimX, dimY, dimZ, 0.0);
    Array3D<double> res(dimX, dimY, dimZ, 0.0);
    Array3D<double> d(dimX, dimY, dimZ, 0.0);
    Array3D<double> ain(dimX, dimY, dimZ, 0.0);
    double tiny = 1.0e-30;
    #pragma omp parallel
    {
        // Jacobi preconditioner
        #pragma omp for nowait
        for (std::size_t k = 1; k < dimZ-1; k++) {
            for (std::size_t j = 1; j < dimY-1; j++) {
                for (std::size_t i = 1; i < dimX-1; i++) {
                    d(i,j,k) = 1. / ap(i,j,k);
                }
            }
        }
        // initial residual
        #pragma omp for nowait
        for (std::size_t k = 1; k < dimZ-1; k++) {
            for (std::size_t j = 1; j < dimY-1; j++) {
                for (std::size_t i = 1; i < dimX-1; i++) {
                    res(i,j,k) = ae(i,j,k)*ptmp(i+1,j,k) + aw(i,j,k)*ptmp(i-1,j,k) + an(i,j,k)*ptmp(i,j+1,k) + as(i,j,k)*ptmp(i,j-1,k) +
                                 at(i,j,k)*ptmp(i,j,k+1) + ab(i,j,k)*ptmp(i,j,k-1) + sn(i,j,k) - ap(i,j,k)*ptmp(i,j,k);
                }
            }
        }
    }
    double big = 1.0e+30;
    double s1old = big;
    // start iteration
    for (std::size_t intswp = 0; intswp < this->nswpvr; intswp++) {
        double alpha = 0.0;
        double bbeta = 0.0;
        double s1 = 0.0;
        double s2 = 0.0;
        double testir = 0.0;
        #pragma omp parallel
        {
            #pragma omp for reduction(+:s1)
            for (std::size_t k = 1; k < dimZ-1; k++) {
                for (std::size_t j = 1; j < dimY-1; j++) {
                    for (std::size_t i = 1; i < dimX-1; i++) {
                        ain(i,j,k) = res(i,j,k) * d(i,j,k);
                        s1 += (res(i,j,k) * ain(i,j,k));
                    }
                }
            }
            #pragma omp single
            {
                bbeta = s1 / (s1old + tiny);
            }
            #pragma omp for
            for (std::size_t k = 1; k < dimZ-1; k++) {
                for (std::size_t j = 1; j < dimY-1; j++) {
                    for (std::size_t i = 1; i < dimX-1; i++) {
                        p1(i,j,k) = ain(i,j,k) + bbeta * p1(i,j,k);
                    }
                }
            }
            #pragma omp for reduction(+:s2)
            for (std::size_t k = 1; k < dimZ-1; k++) {
                for (std::size_t j = 1; j < dimY-1; j++) {
                    for (std::size_t i = 1; i < dimX-1; i++) {
                        ain(i,j,k) = ap(i,j,k)*p1(i,j,k) - ae(i,j,k)*p1(i+1,j,k) - aw(i,j,k)*p1(i-1,j,k) -
                                     an(i,j,k)*p1(i,j+1,k) - as(i,j,k)*p1(i,j-1,k) -
                                     at(i,j,k)*p1(i,j,k+1) - ab(i,j,k)*p1(i,j,k-1);
                        s2 += (p1(i,j,k) * ain(i,j,k));
                    }
                }
            }
            #pragma omp single
            {
                alpha = s1 / (s2 + tiny);
            }
            #pragma omp for reduction(+:testir)
            for (std::size_t k = 1; k < dimZ-1; k++) {
                for (std::size_t j = 1; j < dimY-1; j++) {
                    for (std::size_t i = 1; i < dimX-1; i++) {
                        ptmp(i,j,k) = ptmp(i,j,k) + alpha * p1(i,j,k);
                        res(i,j,k) = res(i,j,k) - alpha * ain(i,j,k);
                        testir += fabs(res(i,j,k));
                    }
                }
            }
        } //== OpenMP region end
        s1old = s1;
        // test stop criterion
        if (testir < ccvar) {
            std::cout << "PCGS solver converged at " << (intswp+1) << " iterations! " << std::scientific << testir << std::endl;
            return;
        }
    }
    std::cout << "PCGS solver did not converge" << std::endl;
}
Array3D is my 3-dimensional array class.
#ifndef ARRAY3D_H
#define ARRAY3D_H

#include <vector>
#include <algorithm>

template<typename T> class Array3D
{
public:
    typedef T value_type;

    Array3D() {
        dim_i = dim_j = dim_k = 0;
        dim_ij = 0;
    }
    Array3D(std::size_t size_i, std::size_t size_j, std::size_t size_k) {
        this->resize(size_i, size_j, size_k);
    }
    Array3D(std::size_t size_i, std::size_t size_j, std::size_t size_k, const value_type& defaultValue) {
        this->resize(size_i, size_j, size_k, defaultValue);
    }
    virtual ~Array3D() {}

    std::size_t getDimI() const {
        return this->dim_i;
    }
    std::size_t getDimJ() const {
        return this->dim_j;
    }
    std::size_t getDimK() const {
        return this->dim_k;
    }
    // check if the indices are valid
    bool checkIndices(std::size_t i, std::size_t j, std::size_t k) {
        return (i < this->dim_i) && (j < this->dim_j) && (k < this->dim_k);
    }
    void resize(std::size_t size_i, std::size_t size_j, std::size_t size_k, const value_type& defaultValue) {
        this->resize(size_i, size_j, size_k);
        this->fillValue(defaultValue);
    }
    // resize the array; the data will be erased
    void resize(std::size_t size_i, std::size_t size_j, std::size_t size_k) {
        this->dim_i = size_i;
        this->dim_j = size_j;
        this->dim_k = size_k;
        this->dim_ij = this->dim_i * this->dim_j;
        std::size_t totalSize = this->dim_i * this->dim_j * this->dim_k;
        this->data.resize(totalSize);
    }
    std::size_t size() const {
        return this->data.size();
    }
    void fillValue(const value_type& defaultValue) {
        std::fill(this->data.begin(), this->data.end(), defaultValue);
    }
    value_type minValue() const {
        return *(std::min_element(data.begin(), data.end()));
    }
    value_type maxValue() const {
        return *(std::max_element(data.begin(), data.end()));
    }
    // fill the array with the element-wise sum of two arrays
    void setValueSum(const Array3D& array1, const Array3D& array2) {
        size_t minSize = std::min(std::min(array1.data.size(), array2.data.size()), this->data.size());
        for (size_t i = 0; i < minSize; i++)
            this->data[i] = array1.data[i] + array2.data[i];
    }
    void clear() {
        dim_i = dim_j = dim_k = 0;
        dim_ij = 0;
        this->data.clear();
    }
    // get value reference at (i,j,k) or (x,y,z) or (u,v,w)...
    // note: std::vector::at() performs a bounds check on every access
    const value_type& operator()(std::size_t i, std::size_t j, std::size_t k) const {
        return this->data.at(this->calIndex(i, j, k));
    }
    value_type& operator()(std::size_t i, std::size_t j, std::size_t k) {
        return this->data.at(this->calIndex(i, j, k));
    }
    // access the raw data by 1D index (also bounds-checked via at())
    const value_type& operator[](std::size_t i) const {
        return this->data.at(i);
    }
    value_type& operator[](std::size_t i) {
        return this->data.at(i);
    }
    std::vector<value_type>* rawData() {
        return &(data);
    }
private:
    inline std::size_t calIndex(std::size_t i, std::size_t j, std::size_t k) const {
        return k * this->dim_ij + j * this->dim_i + i;
    }
private:
    // dimensions of the array: (i,j,k), (x,y,z), (u,v,w)...
    std::size_t dim_i, dim_j, dim_k;
    // raw data, storage order is I-J-K
    std::vector<value_type> data;
    // dim_i*dim_j
    std::size_t dim_ij;
};
#endif // ARRAY3D_H
I measure the time using a Timer class downloaded from the Internet.
timer.start();
PCGSSolver solver;
solver.setTolerance(this->ccvar);
solver.setIteNum(this->nswpp);
solver.solveNew(sn,ae,aw,as,an,at,ab,ap,ptmp);
timer.stop();
std::cout<<"PCGS time:"<<timer.getElapsedTimeInSec()<<"sec"<<std::endl;
Timer.h
//////////////////////////////////////////////////////////////////////////////
// Timer.h
// =======
// High Resolution Timer.
// This timer is able to measure the elapsed time with 1 micro-second accuracy
// in both Windows, Linux and Unix system
//
// AUTHOR: Song Ho Ahn (song.ahn#gmail.com)
// CREATED: 2003-01-13
// UPDATED: 2006-01-13
//
// Copyright (c) 2003 Song Ho Ahn
//////////////////////////////////////////////////////////////////////////////
#ifndef TIMER_H_DEF
#define TIMER_H_DEF
#ifdef WIN32 // Windows system specific
#include <windows.h>
#else // Unix based system specific
#include <sys/time.h>
#endif
class Timer
{
public:
    Timer();                             // default constructor
    ~Timer();                            // default destructor

    void start();                        // start timer
    void stop();                         // stop the timer
    double getElapsedTime();             // get elapsed time in seconds
    double getElapsedTimeInSec();        // get elapsed time in seconds (same as getElapsedTime)
    double getElapsedTimeInMilliSec();   // get elapsed time in milliseconds
    double getElapsedTimeInMicroSec();   // get elapsed time in microseconds

protected:

private:
    double startTimeInMicroSec;          // starting time in microseconds
    double endTimeInMicroSec;            // ending time in microseconds
    int stopped;                         // stop flag
#ifdef WIN32
    LARGE_INTEGER frequency;             // ticks per second
    LARGE_INTEGER startCount;
    LARGE_INTEGER endCount;
#else
    timeval startCount;
    timeval endCount;
#endif
};
#endif // TIMER_H_DEF
Timer.cpp
//////////////////////////////////////////////////////////////////////////////
// Timer.cpp
// =========
// High Resolution Timer.
// This timer is able to measure the elapsed time with 1 micro-second accuracy
// in both Windows, Linux and Unix system
//
// AUTHOR: Song Ho Ahn (song.ahn#gmail.com)
// CREATED: 2003-01-13
// UPDATED: 2006-01-13
//
// Copyright (c) 2003 Song Ho Ahn
//////////////////////////////////////////////////////////////////////////////
#include "Timer.h"
#include <stdlib.h>
///////////////////////////////////////////////////////////////////////////////
// constructor
///////////////////////////////////////////////////////////////////////////////
Timer::Timer()
{
#ifdef WIN32
    QueryPerformanceFrequency(&frequency);
    startCount.QuadPart = 0;
    endCount.QuadPart = 0;
#else
    startCount.tv_sec = startCount.tv_usec = 0;
    endCount.tv_sec = endCount.tv_usec = 0;
#endif
    stopped = 0;
    startTimeInMicroSec = 0;
    endTimeInMicroSec = 0;
}

///////////////////////////////////////////////////////////////////////////////
// destructor
///////////////////////////////////////////////////////////////////////////////
Timer::~Timer()
{
}

///////////////////////////////////////////////////////////////////////////////
// start timer.
// startCount will be set at this point.
///////////////////////////////////////////////////////////////////////////////
void Timer::start()
{
    stopped = 0; // reset stop flag
#ifdef WIN32
    QueryPerformanceCounter(&startCount);
#else
    gettimeofday(&startCount, NULL);
#endif
}

///////////////////////////////////////////////////////////////////////////////
// stop the timer.
// endCount will be set at this point.
///////////////////////////////////////////////////////////////////////////////
void Timer::stop()
{
    stopped = 1; // set timer stopped flag
#ifdef WIN32
    QueryPerformanceCounter(&endCount);
#else
    gettimeofday(&endCount, NULL);
#endif
}

///////////////////////////////////////////////////////////////////////////////
// compute elapsed time in micro-second resolution.
// the other getElapsedTime variants call this first, then convert to the
// corresponding resolution.
///////////////////////////////////////////////////////////////////////////////
double Timer::getElapsedTimeInMicroSec()
{
#ifdef WIN32
    if(!stopped)
        QueryPerformanceCounter(&endCount);
    startTimeInMicroSec = startCount.QuadPart * (1000000.0 / frequency.QuadPart);
    endTimeInMicroSec = endCount.QuadPart * (1000000.0 / frequency.QuadPart);
#else
    if(!stopped)
        gettimeofday(&endCount, NULL);
    startTimeInMicroSec = (startCount.tv_sec * 1000000.0) + startCount.tv_usec;
    endTimeInMicroSec = (endCount.tv_sec * 1000000.0) + endCount.tv_usec;
#endif
    return endTimeInMicroSec - startTimeInMicroSec;
}

///////////////////////////////////////////////////////////////////////////////
// divide elapsedTimeInMicroSec by 1000
///////////////////////////////////////////////////////////////////////////////
double Timer::getElapsedTimeInMilliSec()
{
    return this->getElapsedTimeInMicroSec() * 0.001;
}

///////////////////////////////////////////////////////////////////////////////
// divide elapsedTimeInMicroSec by 1000000
///////////////////////////////////////////////////////////////////////////////
double Timer::getElapsedTimeInSec()
{
    return this->getElapsedTimeInMicroSec() * 0.000001;
}

///////////////////////////////////////////////////////////////////////////////
// same as getElapsedTimeInSec()
///////////////////////////////////////////////////////////////////////////////
double Timer::getElapsedTime()
{
    return this->getElapsedTimeInSec();
}
A quick glance over your code shows a few areas where you can improve the performance. I'll leave the implementation up to you.
OMP Parallel For
Firstly, it's generally cheaper to use

#pragma omp parallel for
for (...) {
    ...
}

versus

#pragma omp parallel
{
    #pragma omp for
    for (...) {
        ...
    }
}

Not by much, but there is a slight improvement. See [1], the graphic at the end.
OMP SINGLE
The key benefit of using #pragma omp parallel for in this case is that it allows us to remove the #pragma omp single directives. When your program encounters a #pragma omp single directive, only one thread executes the enclosed block while every other thread waits at the implicit barrier at the end of the block. This can lead to a situation where several of your threads finish their chunk of work early and then sit idle waiting for the others before they can proceed.
Use of #pragma omp single and #pragma omp barrier is strongly discouraged in high-performing parallelised code.
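For instance, the body of the iteration loop could be restructured as separate parallel loops, with the scalar updates done serially in between, where all threads have already joined. The following is only a sketch of the first two steps of one iteration, reusing the question's variable names (the remaining loops follow the same pattern); like the question's code, it assumes an OpenMP 3.0 compiler for the std::size_t loop variables:

// step 1: apply the Jacobi preconditioner and form s1 = <res, ain>
s1 = 0.0;
#pragma omp parallel for reduction(+:s1)
for (std::size_t k = 1; k < dimZ-1; k++)
    for (std::size_t j = 1; j < dimY-1; j++)
        for (std::size_t i = 1; i < dimX-1; i++) {
            ain(i,j,k) = res(i,j,k) * d(i,j,k);
            s1 += res(i,j,k) * ain(i,j,k);
        }

// serial: all threads have joined at the end of the loop above,
// so no omp single is needed to compute the scalar
bbeta = s1 / (s1old + tiny);

// step 2: update the search direction
#pragma omp parallel for
for (std::size_t k = 1; k < dimZ-1; k++)
    for (std::size_t j = 1; j < dimY-1; j++)
        for (std::size_t i = 1; i < dimX-1; i++)
            p1(i,j,k) = ain(i,j,k) + bbeta * p1(i,j,k);

Each parallel for now opens its own parallel region, but most OpenMP runtimes keep the thread team alive between regions, so this is typically no more expensive than the barrier that omp single implies.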
Collapsing Loops (The Hard Way)
The next area you need to look at is collapsing your loops. The following
#pragma omp parallel for
for (int k = 0; k < o; ++k) {
    for (int j = 0; j < m; ++j) {
        for (int i = 0; i < n; ++i) {
            ...
        }
    }
}
will generally parallelise the outer loop for (int k = ...) but run the inner loops in serial on each thread. You can achieve parallelisation of the entire loop by unravelling them like
#pragma omp parallel for
for (int l = 0; l < o*m*n; ++l) {
    int i = l % n;
    int j = (l / n) % m;
    int k = ((l / n) / m) % o;
    ...
}
In most of your loops you can simply use l and the overloaded [] operator. Most Conjugate Gradient solvers will only need the l index and not the i, j and k indices as they operate on vectors. The only time when i, j and k are needed is when you are computing A*x (or A'*x). This change will increase the level of parallelisation in your code and should provide noticeable improvements.
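For example, the search-direction update in the question is a pure vector operation, so it could be written as a single flattened loop over the raw 1D index via Array3D's operator[]. A sketch, under the assumption that it is acceptable to touch the boundary cells too (in the question's code p1 and ain are zero-initialized and their boundary cells are never written, so the update leaves them at zero):

#pragma omp parallel for
for (std::size_t l = 0; l < p1.size(); l++)
    p1[l] = ain[l] + bbeta * p1[l];

Note that Array3D's operator[] still goes through std::vector::at(), so every access pays a bounds check inside the hot loop.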
Collapsing Loops (The Easy Way)
It should be mentioned that OpenMP, as of Version 3.0, supports the collapse(n) clause, which can be used to tell the compiler to automatically collapse the for() loops as I've described above. An example of this is
#pragma omp parallel for collapse(3)
for (int k = 0; k < o; ++k) {
    for (int j = 0; j < m; ++j) {
        for (int i = 0; i < n; ++i) {
            ...
        }
    }
}
which will cause the compiler to form a single for() loop and then parallelise it.
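Applied to one of the loops from the question, this could look as follows (a sketch; the loops must be perfectly nested, as they are here, and collapse again assumes an OpenMP 3.0 compiler):

#pragma omp parallel for collapse(3) reduction(+:s1)
for (std::size_t k = 1; k < dimZ-1; k++) {
    for (std::size_t j = 1; j < dimY-1; j++) {
        for (std::size_t i = 1; i < dimX-1; i++) {
            ain(i,j,k) = res(i,j,k) * d(i,j,k);
            s1 += res(i,j,k) * ain(i,j,k);
        }
    }
}

With only 77 iterations in the outer loop and up to 8 hardware threads on an i7 2600, collapsing also gives the scheduler a much finer-grained iteration space to balance.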
Reduction Clause
Lastly, probably the most costly element in your code is the reduction() clause. Edit: in my haste to finish the answer, I previously and incorrectly mentioned this could be removed after collapsing the loops.
Source [1]
I don't know precisely why the OpenMP parallelisation doesn't make the code faster, but what's obvious is that you've got all your loops in the wrong order.
So, first things first, start by swapping the i and k loops in your code, and I'm sure you'll see a dramatic performance boost. Then you can have a look at OpenMP.
Related
Getting speed improvement with OpenMP in nested for loops with dependencies
I am trying to implement a procedure in parallel processing form with OpenMP. It contains four levels of nested for loops (dependent) and has a variable sum_p to be updated in the innermost loop. In short, my question is regarding the parallel implementation of the following code snippet:

for (int i = (test_map.size() - 1); i >= 1; --i) {
    bin_i = test_map.at(i); // test_map is an STL map of vectors
    len_rank_bin_i = bin_i.size(); // bin_i is a vector
    for (int j = (i - 1); j >= 0; --j) {
        bin_j = test_map.at(j);
        len_rank_bin_j = bin_j.size();
        for (int u_i = 0; u_i < len_rank_bin_i; u_i++) {
            node_u = bin_i[u_i]; // node_u is a scalar
            for (int v_i = 0; v_i < len_rank_bin_j; v_i++) {
                node_v = bin_j[v_i];
                if (node_u > node_v)
                    sum_p += 1;
            }
        }
    }
}

The full program is given below:

#include <iostream>
#include <vector>
#include <omp.h>
#include <random>
#include <unordered_map>
#include <algorithm>
#include <functional>
#include <time.h>

int main(int argc, char* argv[]) {
    double time_temp;
    int test_map_size = 5000;
    std::unordered_map<unsigned int, std::vector<unsigned int> > test_map(test_map_size);

    // Fill the test map with random integers ---------------------------------
    std::random_device rd;
    std::mt19937 gen1(rd());
    std::uniform_int_distribution<int> dist(1, 5);
    auto gen = std::bind(dist, gen1);
    for (int i = 0; i < test_map_size; i++) {
        int vector_len = dist(gen1);
        std::vector<unsigned int> tt(vector_len);
        std::generate(begin(tt), end(tt), gen);
        test_map.insert({i, tt});
    }

    // Sequential implementation -----------------------------------------------
    time_temp = omp_get_wtime();
    std::vector<unsigned int> bin_i, bin_j;
    unsigned int node_v, node_u;
    unsigned int len_rank_bin_i;
    unsigned int len_rank_bin_j;
    int sum_s = 0;
    for (unsigned int i = (test_map_size - 1); i >= 1; --i) {
        bin_i = test_map.at(i);
        len_rank_bin_i = bin_i.size();
        for (unsigned int j = i; j-- > 0; ) {
            bin_j = test_map.at(j);
            len_rank_bin_j = bin_j.size();
            for (unsigned int u_i = 0; u_i < len_rank_bin_i; u_i++) {
                node_u = bin_i[u_i];
                for (unsigned int v_i = 0; v_i < len_rank_bin_j; v_i++) {
                    node_v = bin_j[v_i];
                    if (node_u > node_v) sum_s += 1;
                }
            }
        }
    }
    std::cout << "Estimated sum (seq): " << sum_s << std::endl;
    time_temp = omp_get_wtime() - time_temp;
    printf("Time taken for sequential implementation: %.2fs\n", time_temp);

    // Parallel implementation -----------------------------------------------
    time_temp = omp_get_wtime();
    int sum_p = 0;
    omp_set_num_threads(4);
    #pragma omp parallel
    {
        std::vector<unsigned int> bin_i, bin_j;
        unsigned int node_v, node_u;
        unsigned int len_rank_bin_i;
        unsigned int len_rank_bin_j;
        unsigned int i, u_i, v_i;
        int j;
        #pragma omp parallel for private(j,u_i,v_i) reduction(+:sum_p)
        for (i = (test_map_size - 1); i >= 1; --i) {
            bin_i = test_map.at(i);
            len_rank_bin_i = bin_i.size();
            #pragma omp parallel for private(u_i,v_i)
            for (j = (i - 1); j >= 0; --j) {
                bin_j = test_map.at(j);
                len_rank_bin_j = bin_j.size();
                #pragma omp parallel for private(v_i)
                for (u_i = 0; u_i < len_rank_bin_i; u_i++) {
                    node_u = bin_i[u_i];
                    #pragma omp parallel for
                    for (v_i = 0; v_i < len_rank_bin_j; v_i++) {
                        node_v = bin_j[v_i];
                        if (node_u > node_v) sum_p += 1;
                    }
                }
            }
        }
    }
    std::cout << "Estimated sum (parallel): " << sum_p << std::endl;
    time_temp = omp_get_wtime() - time_temp;
    printf("Time taken for parallel implementation: %.2fs\n", time_temp);
    return 0;
}

Running the code with the command g++-7 -fopenmp -std=c++11 -O3 -Wall -o so_qn so_qn.cpp on macOS 10.13.3 (i5 processor with four logical cores) gives the following output:

Estimated sum (seq): 38445750
Time taken for sequential implementation: 0.49s
Estimated sum (parallel): 38445750
Time taken for parallel implementation: 50.54s

The time taken for the parallel implementation is many times higher than for the sequential implementation. Do you think the code or logic can be converted to a parallel implementation? I have spent a few days trying to improve the terrible performance of my code, but to no avail. Any help is greatly appreciated.

Update

With the changes suggested by JimCownie, i.e., "using omp for, not omp parallel for" and removing the parallelism of the inner loops, the performance is greatly improved.

Estimated sum (seq): 42392944
Time taken for sequential implementation: 0.48s
Estimated sum (parallel): 42392944
Time taken for parallel implementation: 0.27s

My CPU has four logical cores (and I am using four threads), so now I am wondering whether there is any way to get four times better performance than the sequential implementation. I see a different problem when my map of vectors test_map is short but fat at each level, i.e., the map size is small but the vector size at each of the keys is very large. In such a case the performance of the sequential and parallel implementations are comparable, without much difference. It seems like we need to parallelize the inner loops too. Do you know how to achieve it in this context?
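Regarding that closing question: one possibility for the "few bins, fat vectors" case (a sketch only, untested, reusing the names from the program above and assuming an OpenMP 3.0 compiler for collapse) is to keep a single parallel region, let every thread walk the same (i, j) bin pairs, and work-share the large u_i x v_i iteration space inside each pair:

int sum_p = 0;
omp_set_num_threads(4);
#pragma omp parallel reduction(+:sum_p)
{
    // every thread executes the same (i, j) sequence, so all threads
    // reach each work-shared loop below in the same order
    for (unsigned int i = test_map_size - 1; i >= 1; --i) {
        const std::vector<unsigned int>& bin_i = test_map.at(i);
        for (unsigned int j = i; j-- > 0; ) {
            const std::vector<unsigned int>& bin_j = test_map.at(j);
            // split the pair comparisons of this (i, j) across the team;
            // the implicit barrier keeps the team on the same bin pair
            #pragma omp for collapse(2)
            for (unsigned int u_i = 0; u_i < bin_i.size(); u_i++) {
                for (unsigned int v_i = 0; v_i < bin_j.size(); v_i++) {
                    if (bin_i[u_i] > bin_j[v_i])
                        sum_p += 1;
                }
            }
        }
    }
}

This pays one barrier per bin pair, so it only makes sense when the bins are large, which is exactly the scenario described.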
“private variable cannot be reduction” although that variable is defined outside the SIMD block
I have a C++ project which uses OpenMP, and I try to compile it with LLVM on Blue Gene/Q. There is one function that, stripped down, looks like this:

template <typename FT, int veclen>
inline void xmyNorm2Spinor(FT *res, FT *x, FT *y, double &n2res, int n,
                           int n_cores, int n_simt, int n_blas_simt)
{
#if defined(__GNUG__) && !defined(__INTEL_COMPILER)
    double norm2res __attribute__((aligned(QPHIX_LLC_CACHE_ALIGN))) = 0;
#else
    __declspec(align(QPHIX_LLC_CACHE_ALIGN)) double norm2res = 0;
#endif
#pragma omp parallel shared(norm_array)
    {
        // […]
        if (smtid < n_blas_simt) {
            // […]
            double lnorm = 0;
            //#pragma prefetch x,y,res
            //#pragma vector aligned(x,y,res)
#pragma omp simd aligned(res, x, y : veclen) reduction(+ : lnorm)
            for (int i = low; i < hi; i++) {
                res[i] = x[i] - y[i];
                double tmpd = (double)res[i];
                lnorm += (tmpd * tmpd);
            }
            // […]
        }
    }
    // […]
}

The error is this right here:

In file included from /homec/hbn28/hbn28e/Sources/qphix/tests/timeDslashNoQDP.cc:6:
In file included from /homec/hbn28/hbn28e/Sources/qphix/include/qphix/blas.h:8:
/homec/hbn28/hbn28e/Sources/qphix/include/qphix/blas_c.h:156:54: error: private variable cannot be reduction
#pragma omp simd aligned(res,x,y:veclen) reduction(+:lnorm)
                                                     ^
/homec/hbn28/hbn28e/Sources/qphix/include/qphix/blas_c.h:151:12: note: predetermined as private
    double lnorm=0;
           ^

Due to the outer omp parallel block, the variable lnorm is defined for every thread. Then there is an additional SIMD section where each thread uses a SIMD lane. The reduction should be done within the thread, so the scope of the variables looks right. Still the compiler does not want it this way. What is wrong here?
The problem seems to be that the private attribute attached to the lnorm variable by the omp parallel block conflicts with the requirements imposed by the OpenMP reduction() clause on its argument variable (even though lnorm is not private with respect to the nested omp simd block to which the reduction() clause applies). You can try solving that problem by extracting the lnorm calculation code into a function of its own:

template <typename FT, int veclen>
inline double compute_res_and_lnorm(FT *res, FT *x, FT *y, int low, int hi)
{
    double lnorm = 0;
#pragma omp simd aligned(res, x, y : veclen) reduction(+ : lnorm)
    for (int i = low; i < hi; i++) {
        res[i] = x[i] - y[i];
        double tmpd = (double)res[i];
        lnorm += (tmpd * tmpd);
    }
    return lnorm;
}

template <typename FT, int veclen>
inline void xmyNorm2Spinor(FT *res, FT *x, FT *y, double &n2res, int n,
                           int n_cores, int n_simt, int n_blas_simt)
{
#if defined(__GNUG__) && !defined(__INTEL_COMPILER)
    double norm2res __attribute__((aligned(QPHIX_LLC_CACHE_ALIGN))) = 0;
#else
    __declspec(align(QPHIX_LLC_CACHE_ALIGN)) double norm2res = 0;
#endif
#pragma omp parallel shared(norm_array)
    {
        // […]
        if (smtid < n_blas_simt) {
            // […]
            // veclen cannot be deduced from the arguments, so pass it explicitly
            double lnorm = compute_res_and_lnorm<FT, veclen>(res, x, y, low, hi);
            // […]
        }
    }
    // […]
}
Using OpenMP to get the index of the minimum element in parallel
I tried to write this code:

float* theArray; // the array to find the minimum value
int index, i;
float thisValue, min;

index = 0;
min = theArray[0];
#pragma omp parallel for reduction(min:min_dist)
for (i = 1; i < size; i++) {
    thisValue = theArray[i];
    if (thisValue < min) { /* find the min and its array index */
        min = thisValue;
        index = i;
    }
}
return(index);

However, this one is not outputting correct answers. It seems the min is OK, but the correct index has been destroyed by the threads. I also tried some ways provided on the Internet and here (using parallel for for the outer loop and critical for the final comparison), but this caused a slowdown rather than a speedup. What should I do to make both the min value and its index correct? Thanks!
I don't know of an elegant way to do a minimum reduction and save an index. I do this by finding the local minimum and index for each thread and then the global minimum and index in a critical section.

index = 0;
min = theArray[0];
#pragma omp parallel
{
    int index_local = index;
    float min_local = min;
    #pragma omp for nowait
    for (i = 1; i < size; i++) {
        if (theArray[i] < min_local) {
            min_local = theArray[i];
            index_local = i;
        }
    }
    #pragma omp critical
    {
        if (min_local < min) {
            min = min_local;
            index = index_local;
        }
    }
}

With OpenMP 4.0 it's possible to use user-defined reductions. A user-defined minimum reduction can be defined like this:

struct Compare {
    float val;
    size_t index;
};
#pragma omp declare reduction(minimum : struct Compare : omp_out = omp_in.val < omp_out.val ? omp_in : omp_out)

Then the reduction can be done like this:

struct Compare min;
min.val = theArray[0];
min.index = 0;
#pragma omp parallel for reduction(minimum:min)
for (int i = 1; i < size; i++) {
    if (theArray[i] < min.val) {
        min.val = theArray[i];
        min.index = i;
    }
}

That works for C and C++. User-defined reductions have other advantages besides simplified code. There are multiple algorithms for doing reductions. For example, the merging can be done in O(number of threads) or O(log(number of threads)). The first solution I gave does this in O(number of threads); however, using user-defined reductions lets OpenMP choose the algorithm.
Basic Idea

This can be accomplished without any parallelization-breaking critical or atomic sections by creating a custom reduction. Basically, define an object that stores both the index and value, and then create a function that sorts two of these objects by only the value, not the index.

Details

An object to store an index and value together:

typedef std::pair<unsigned int, float> IndexValuePair;

You can access the index through the first member and the value through the second member, i.e.,

IndexValuePair obj(0, 2.345);
unsigned int ix = obj.first; // 0
float val = obj.second;      // 2.345

Define a function to sort two IndexValuePair objects:

IndexValuePair myMin(IndexValuePair a, IndexValuePair b) {
    return a.second < b.second ? a : b;
}

Then, construct a custom reduction following the guidelines in the OpenMP documentation:

#pragma omp declare reduction \
    (minPair : IndexValuePair : omp_out = myMin(omp_out, omp_in)) \
    initializer(omp_priv = IndexValuePair(0, 1000))

In this case, I've chosen to initialize the index to 0 and the value to 1000. The value should be initialized to some number larger than the largest value you expect to sort.

Functional Example

Finally, combine all these pieces with the parallel for loop!

// Compile with g++ -std=c++11 -fopenmp demo.cpp
#include <cstdlib>
#include <iostream>
#include <utility>
#include <vector>

typedef std::pair<unsigned int, float> IndexValuePair;

IndexValuePair myMin(IndexValuePair a, IndexValuePair b) {
    return a.second < b.second ? a : b;
}

int main() {
    std::vector<float> vals {10, 4, 6, 2, 8, 0, -1, 2, 3, 4, 4, 8};
    unsigned int i;
    IndexValuePair minValueIndex(0, 1000);

    #pragma omp declare reduction \
        (minPair : IndexValuePair : omp_out = myMin(omp_out, omp_in)) \
        initializer(omp_priv = IndexValuePair(0, 1000))

    #pragma omp parallel for reduction(minPair : minValueIndex)
    for (i = 0; i < vals.size(); i++) {
        if (vals[i] < minValueIndex.second) {
            minValueIndex.first = i;
            minValueIndex.second = vals[i];
        }
    }

    std::cout << "minimum value = " << minValueIndex.second << std::endl; // Should be -1
    std::cout << "index = " << minValueIndex.first << std::endl;          // Should be 6
    return EXIT_SUCCESS;
}
Because you're not only trying to find the minimal value (reduction(min:___)) but also retain the index, you need to make the check critical. This can significantly slow down the loop (as reported). In general, make sure that there is enough work so you don't encounter overhead, as in this question. An alternative would be to have each thread find the minimum and its index and save them to a unique variable, and have the master thread do a final check on those, as in the following program.

#include <iostream>
#include <vector>
#include <ctime>
#include <random>
#include <limits>
#include <omp.h>

using std::cout;
using std::vector;

void initializeVector(vector<double>& v)
{
    std::mt19937 generator(time(NULL));
    std::uniform_real_distribution<double> dis(0.0, 1.0);
    v.resize(100000000);
    for (int i = 0; i < v.size(); i++) {
        v[i] = dis(generator);
    }
}

int main()
{
    vector<double> vec;
    initializeVector(vec);

    float minVal = vec[0];
    int minInd = 0;

    int startTime = clock();
    for (int i = 1; i < vec.size(); i++) {
        if (vec[i] < minVal) {
            minVal = vec[i];
            minInd = i;
        }
    }
    int elapsedTime1 = clock() - startTime;

    // Change the number of threads accordingly
    vector<float> threadRes(4, std::numeric_limits<float>::max());
    vector<int> threadInd(4);

    startTime = clock();
    #pragma omp parallel for
    for (int i = 0; i < vec.size(); i++) {
        if (vec[i] < threadRes[omp_get_thread_num()]) {
            threadRes[omp_get_thread_num()] = vec[i];
            threadInd[omp_get_thread_num()] = i;
        }
    }

    float minVal2 = threadRes[0];
    int minInd2 = threadInd[0];
    for (int i = 1; i < threadRes.size(); i++) {
        if (threadRes[i] < minVal2) {
            minVal2 = threadRes[i];
            minInd2 = threadInd[i];
        }
    }
    int elapsedTime2 = clock() - startTime;

    cout << "Min " << minVal << " at " << minInd << " took " << elapsedTime1 << std::endl;
    cout << "Min " << minVal2 << " at " << minInd2 << " took " << elapsedTime2 << std::endl;
}

Please note that with optimizations on and nothing else to be done in the loop, the serial version seems to remain king. With optimizations turned off, OMP gains the upper hand.

P.S. You wrote reduction(min:min_dist) and then proceeded to use min instead of min_dist.
Actually, we can use the omp critical directive to make only one thread run the code inside the critical region at a time, so the index value won't be destroyed by other threads.

About the omp critical directive: it identifies a section of code that must be executed by a single thread at a time.

This code solves your issue:

#include <stdio.h>
#include <omp.h>

int main() {
    int i;
    int arr[10] = {11, 42, 53, 64, 55, 46, 47, 68, 59, 510};
    int index;
    float thisValue, min;
    index = 0;
    min = arr[0];
    int size = 10;
    #pragma omp parallel for private(thisValue)
    for (i = 1; i < size; i++) {
        thisValue = arr[i];
        #pragma omp critical
        if (thisValue < min) { /* find the min and its array index */
            min = thisValue;
            index = i;
        }
    }
    printf("min:%f index:%d", min, index);
    return 0;
}
Threads failing to affect performance
Below is a small program meant to parallelize the approximation of the 1/(n^2) series. Note the global parameter NUM_THREADS. My issue is that increasing the number of threads from 1 to 4 (the number of processors my computer has is 4) does not significantly affect the outcomes of the timing experiments. Do you see a logical flaw in ThreadFunction? Is there false sharing or misplaced blocking that ends up serializing the execution?

#include <iostream>
#include <thread>
#include <vector>
#include <mutex>
#include <string>
#include <future>
#include <chrono>

std::mutex sum_mutex;        // This mutex is for the sum vector
std::vector<double> sum_vec; // This is the sum vector
int NUM_THREADS = 1;
int UPPER_BD = 1000000;

/* Thread function */
void ThreadFunction(std::vector<double> &l, int beg, int end, int thread_num)
{
    double sum = 0;
    for (int i = beg; i < end; i++)
        sum += (1 / (l[i] * l[i]));
    std::unique_lock<std::mutex> lock1(sum_mutex, std::defer_lock);
    lock1.lock();
    sum_vec.push_back(sum);
    lock1.unlock();
}

void ListFill(std::vector<double> &l, int z)
{
    for (int i = 0; i < z; ++i)
        l.push_back(i);
}

int main()
{
    std::vector<double> l;
    std::vector<std::thread> thread_vec;
    ListFill(l, UPPER_BD);
    int len = l.size();

    int lower_bd = 1;
    int increment = (UPPER_BD - lower_bd) / NUM_THREADS;

    for (int j = 0; j < NUM_THREADS; ++j) {
        thread_vec.push_back(std::thread(ThreadFunction, std::ref(l), lower_bd, lower_bd + increment, j));
        lower_bd += increment;
    }
    for (auto &t : thread_vec)
        t.join();

    double big_sum = 0.0;
    for (double z : sum_vec)
        big_sum += z;

    std::cout << big_sum << std::endl;
    return 0;
}
From looking at your code, I suspect that ListFill is taking longer than ThreadFunction. Why pass a list of values to the thread instead of the bounds each thread should loop over? Something like:

void ThreadFunction(int beg, int end)
{
    double sum = 0.0;
    for (double i = beg; i < end; i++)
        sum += (1.0 / (i * i));
    std::unique_lock<std::mutex> lock1(sum_mutex);
    sum_vec.push_back(sum);
}

To maximize parallelism, you need to push as much work as possible onto the threads. See Amdahl's Law.
In addition to dohashi's nice improvement, you can remove the need for the mutex by populating sum_vec in advance in the main thread:

sum_vec.resize(4);

then writing directly to it in ThreadFunction:

sum_vec[thread_num] = sum;

Since each thread writes to a distinct element and doesn't modify the vector itself, there is no need to lock anything.
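Putting dohashi's change and this one together, a minimal self-contained sketch could look like the following (NUM_THREADS and UPPER_BD are fixed constants here for brevity):

#include <iostream>
#include <thread>
#include <vector>

std::vector<double> sum_vec; // one slot per thread, so no mutex is needed

void ThreadFunction(int beg, int end, int thread_num)
{
    double sum = 0.0;
    for (double i = beg; i < end; i++)
        sum += 1.0 / (i * i);
    sum_vec[thread_num] = sum; // each thread writes only its own element
}

int main()
{
    const int NUM_THREADS = 4;
    const int UPPER_BD = 1000000;
    sum_vec.resize(NUM_THREADS); // sized in the main thread before any worker starts

    std::vector<std::thread> thread_vec;
    int lower_bd = 1;
    const int increment = (UPPER_BD - lower_bd) / NUM_THREADS;
    for (int j = 0; j < NUM_THREADS; ++j) {
        thread_vec.push_back(std::thread(ThreadFunction, lower_bd, lower_bd + increment, j));
        lower_bd += increment;
    }
    for (auto& t : thread_vec)
        t.join();

    double big_sum = 0.0;
    for (double z : sum_vec)
        big_sum += z;
    std::cout << big_sum << std::endl; // approaches pi^2/6 ~ 1.6449
    return 0;
}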
pointers with OpenMP
I am trying to use OpenMP in my program (I am a newbie with OpenMP) and the program returns errors in two places. Here is an example code:

#include <iostream>
#include <cstdint>
#include <vector>
#include <boost/multi_array.hpp>
#include <omp.h>

class CNachbarn {
public:
    CNachbarn() { a = 0; }
    uint32_t Get_Next_Neighbor() { return a++; }
private:
    uint32_t a;
};

class CNetwork {
public:
    CNetwork(uint32_t num_elements_);
    ~CNetwork();
    void Validity();
    void Clean();
private:
    uint32_t num_elements;
    uint32_t nachbar;
    std::vector<uint32_t> remove_node_v;
    CNachbarn *Nachbar;
};

CNetwork::CNetwork(uint32_t num_elements_)
{
    num_elements = num_elements_;
    Nachbar = new CNachbarn();
    remove_node_v.reserve(num_elements);
}

CNetwork::~CNetwork()
{
    delete Nachbar;
}

inline void CNetwork::Validity()
{
    #pragma omp parallel for
    for (uint32_t i = 0; i < num_elements; i++) {
        #pragma omp critical
        remove_node_v.push_back(i);
    }
}

void CNetwork::Clean()
{
    #pragma omp parallel for
    for (uint8_t j = 0; j < 2; j++) {
        nachbar = Nachbar->Get_Next_Neighbor();
        std::cout << "i: " << i << ", neighbor: " << nachbar << std::endl;
    }
    remove_node_v.clear();
}

int main()
{
    uint32_t num_elements = 1u << 3;
    uint32_t i = 0;
    CNetwork Network(num_elements);
    do {
        Network.Validity();
        Network.Clean();
    } while (++i < 2);
    return 0;
}

I would like to know if #pragma omp critical is a good solution for push_back(). (Does it solve the problem?) Would it be better to define a separate vector for each thread and then combine them (using insert()), or to use some kind of lock? In my original code I get a runtime error at:

nachbar = Nachbar->Get_Next_Neighbor( &remove_node_v[i] );

but in this example not. Nevertheless, I would like OpenMP to use as many CNachbarn objects as there are cores, since CNachbarn is a recursive computation and should not be influenced by the other threads. The question is how to do it smartly. (I don't think it is smart to construct a CNachbarn each time I start the for loop, since I call this function more than a million times in my simulation and time is important.)
Concerning your first problem: your function Validity is a perfect way to achieve below-serial performance in a parallel loop. However, you already gave the correct answer. You should fill independent vectors for each thread and merge them afterwards.

inline void CNetwork::Validity()
{
    #pragma omp parallel for
    for (uint32_t i = 0; i < num_elements; i++) {
        #pragma omp critical
        remove_node_v.push_back(i);
    }
}

EDIT: A possible remedy could look like this (if you require serial access to your elements, you need to change the loop a bit):

inline void CNetwork::Validity()
{
    remove_node_v.reserve(num_elements);
    #pragma omp parallel
    {
        std::vector<uint32_t> remove_node_v_thread_local;
        uint32_t thread_id = omp_get_thread_num();
        uint32_t n_threads = omp_get_num_threads();
        for (uint32_t i = thread_id; i < num_elements; i += n_threads)
            remove_node_v_thread_local.push_back(i);
        #pragma omp critical
        remove_node_v.insert(remove_node_v.end(),
                             remove_node_v_thread_local.begin(),
                             remove_node_v_thread_local.end());
    }
}

Your second problem could be solved by defining an array of CNachbarn with the size of the maximum number of OMP threads possible, and accessing distinct elements of the array from each thread, like:

CNachbarn* meine_nachbarn = alle_meine_nachbarn[omp_get_thread_num()];
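A minimal sketch of that last idea, assuming the CNachbarn class from the question and using omp_get_max_threads() to size the array before the parallel region starts:

// allocate one CNachbarn per possible thread, once, outside the hot loop
std::vector<CNachbarn*> alle_meine_nachbarn(omp_get_max_threads());
for (std::size_t t = 0; t < alle_meine_nachbarn.size(); ++t)
    alle_meine_nachbarn[t] = new CNachbarn();

#pragma omp parallel
{
    // each thread only ever touches its own neighbor object,
    // so CNachbarn's internal state is never shared between threads
    CNachbarn* meine_nachbarn = alle_meine_nachbarn[omp_get_thread_num()];
    uint32_t nachbar = meine_nachbarn->Get_Next_Neighbor();
}

// remember to delete the objects (or store std::unique_ptr instead)
// when the simulation is done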