I am trying to parallelize a serial preconditioned conjugate gradient solver for a 3D fire simulation using OpenMP (Intel compiler), but the performance does not seem to improve.
The grid dimensions are 79x81x79 and the solver converges after 565 iterations. The serial code takes 3.39 seconds and the OpenMP version takes 3.86 seconds on an Intel i7 2600 (OS: openSUSE 13.1).
Please help me check the code. Thanks a lot.
// Preconditioned conjugate gradient solver (Jacobi preconditioner) for the
// 7-point stencil system whose residual is
//   res = ae*p(i+1) + aw*p(i-1) + an*p(j+1) + as*p(j-1) + at*p(k+1) + ab*p(k-1) + sn - ap*p
//
// sn          : source term, added into the residual
// ae..ab      : neighbour coefficients (east/west/south/north/top/bottom)
// ap          : central coefficient; its reciprocal is the preconditioner
// ptmp        : in/out - initial guess on entry, updated in place each iteration
//
// Only interior points (indices 1 .. dim-2 in every direction) are read-modified;
// the outermost layer of ptmp is read as neighbour data but never written.
// Runs at most this->nswpvr iterations and returns early once the sum of
// |res| over the interior drops below ccvar. Progress is reported on std::cout.
// NOTE(review): the loop bounds use dim-1 on std::size_t, so a dimension of 0
// would wrap around - presumably callers always pass grids of at least 2x2x2; confirm.
void PCGSSolver::solveNew(const Array3D<double>& sn, const Array3D<double>& ae, const Array3D<double>&aw,
const Array3D<double>& as, const Array3D<double>& an, const Array3D<double>&at, const Array3D<double>&ab,
const Array3D<double>& ap, Array3D<double>& ptmp){
std::size_t dimX=sn.getDimI();
std::size_t dimY=sn.getDimJ();
std::size_t dimZ=sn.getDimK();
// Work arrays, all zero-initialised: p1 = search direction, res = residual,
// d = preconditioner (1/ap), ain = scratch (preconditioned residual, then A*p1).
Array3D<double> p1(dimX,dimY,dimZ,0.0);
Array3D<double> res(dimX,dimY,dimZ,0.0);
Array3D<double> d(dimX,dimY,dimZ,0.0);
Array3D<double> ain(dimX,dimY,dimZ,0.0);
// Added to denominators below so that a zero divisor cannot occur.
double tiny=1.0e-30;
#pragma omp parallel
{
//Jacobi preconditioner
// nowait is safe here: the next loop does not read d.
#pragma omp for nowait
for(std::size_t k=1;k<dimZ-1; k++){
for(std::size_t j=1; j<dimY-1; j++){
for(std::size_t i=1; i<dimX-1; i++){
d(i,j,k)=1./ap(i,j,k);
}
}
}
// Initial residual from the incoming guess in ptmp. The nowait only skips the
// loop barrier; the end of the parallel region still synchronises all threads.
#pragma omp for nowait
for(std::size_t k=1;k<dimZ-1; k++){
for(std::size_t j=1; j<dimY-1; j++){
for(std::size_t i=1; i<dimX-1; i++){
res(i,j,k)=ae(i,j,k)*ptmp(i+1,j,k) + aw(i,j,k)*ptmp(i-1,j,k)+an(i,j,k)*ptmp(i,j+1,k)+as(i,j,k)*ptmp(i,j-1,k)+
at(i,j,k)*ptmp(i,j,k+1)+ab(i,j,k)*ptmp(i,j,k-1)+sn(i,j,k)-ap(i,j,k)*ptmp(i,j,k);
}
}
}
}
// s1old starts huge so that bbeta is effectively zero on the first iteration
// (p1 is all zeros then anyway).
double big =1.0e+30;
double s1old=big;
//start iteration
for(std::size_t intswp=0; intswp<this->nswpvr; intswp++){
double alpha=0.0;
double bbeta=0.0;
double s1=0.0;
double s2=0.0;
double testir=0.0;
// A new parallel region is opened on every iteration; the scalars above are
// shared, and every worksharing loop and single block below ends with an
// implicit barrier, giving five barriers per iteration.
#pragma omp parallel
{
// ain = preconditioned residual; s1 = <res, ain>
#pragma omp for reduction(+:s1)
for(std::size_t k=1;k<dimZ-1; k++){
for(std::size_t j=1; j<dimY-1; j++){
for(std::size_t i=1; i<dimX-1; i++){
ain(i,j,k)=res(i,j,k)*d(i,j,k);
s1+=(res(i,j,k)*ain(i,j,k));
}
}
}
// One thread computes the shared scalar; the barrier at the end of the single
// block makes it visible to all threads before the next loop.
#pragma omp single
{
bbeta=s1/(s1old+tiny);
}
// New search direction.
#pragma omp for
for(std::size_t k=1;k<dimZ-1; k++){
for(std::size_t j=1; j<dimY-1; j++){
for(std::size_t i=1; i<dimX-1; i++){
p1(i,j,k)=ain(i,j,k)+bbeta*p1(i,j,k);
}
}
}
// ain is reused to hold the stencil applied to p1; s2 = <p1, ain>.
// Neighbour values of p1 are read, so the barrier after the previous loop is required.
#pragma omp for reduction(+:s2)
for(std::size_t k=1;k<dimZ-1; k++){
for(std::size_t j=1; j<dimY-1; j++){
for(std::size_t i=1; i<dimX-1; i++){
ain(i,j,k)=ap(i,j,k)*p1(i,j,k)-ae(i,j,k)*p1(i+1,j,k)-aw(i,j,k)*p1(i-1,j,k)-
an(i,j,k)*p1(i,j+1,k)-as(i,j,k)*p1(i,j-1,k)-
at(i,j,k)*p1(i,j,k+1)-ab(i,j,k)*p1(i,j,k-1);
s2+=(p1(i,j,k)*ain(i,j,k));
}
}
}
// Step length along p1.
#pragma omp single
{
alpha=s1/(s2+tiny);
}
// Update the solution and the residual; testir accumulates the sum of |res|.
#pragma omp for reduction(+:testir)
for(std::size_t k=1;k<dimZ-1; k++){
for(std::size_t j=1; j<dimY-1; j++){
for(std::size_t i=1; i<dimX-1; i++){
ptmp(i,j,k)=ptmp(i,j,k)+alpha*p1(i,j,k);
res(i,j,k)=res(i,j,k)-alpha*ain(i,j,k);
testir+=fabs(res(i,j,k));
}
}
}
}//==openmp region end
s1old=s1;
//test stop criteria
if(testir < ccvar){
std::cout<<"PCGS solver coverage at "<<(intswp+1)<<" iterations!"<<std::scientific<<testir<<std::endl;
return;
}
}
// Reached only when the iteration limit is exhausted without meeting ccvar.
std::cout<<"PCGS solver can not coverage "<<std::endl;
}
Array3D is my own 3-dimensional array class.
#ifndef ARRAY3D_H
#define ARRAY3D_H
#include <vector>
#include <algorithm>
/**
 * Dense 3-D array of T stored in one contiguous std::vector.
 *
 * Element (i,j,k) lives at flat index k*dim_i*dim_j + j*dim_i + i, so i is the
 * fastest-varying index; loops that keep i innermost walk memory sequentially.
 *
 * Element access goes through std::vector::at(), which throws
 * std::out_of_range when the *flat* index is outside the storage. That check
 * costs a compare-and-branch on every access, and it does not catch an
 * individual index that is too large but still maps inside the storage
 * (use checkIndices() for a per-axis check).
 */
template<typename T> class Array3D
{
public:
    typedef T value_type;

    /// Creates an empty 0x0x0 array.
    Array3D() : dim_i(0), dim_j(0), dim_k(0), dim_ij(0) {}

    /// Creates a size_i x size_j x size_k array of value-initialised elements.
    Array3D(std::size_t size_i, std::size_t size_j, std::size_t size_k)
        : dim_i(0), dim_j(0), dim_k(0), dim_ij(0) {
        this->resize(size_i, size_j, size_k);
    }

    /// Creates a size_i x size_j x size_k array with every element set to defaultValue.
    Array3D(std::size_t size_i, std::size_t size_j, std::size_t size_k, const value_type& defaultValue)
        : dim_i(0), dim_j(0), dim_k(0), dim_ij(0) {
        this->resize(size_i, size_j, size_k, defaultValue);
    }

    virtual ~Array3D() {}

    std::size_t getDimI() const { return this->dim_i; }
    std::size_t getDimJ() const { return this->dim_j; }
    std::size_t getDimK() const { return this->dim_k; }

    /// Returns true when each index is inside its own dimension.
    bool checkIndices(std::size_t i, std::size_t j, std::size_t k) const {
        return (i < this->dim_i) && (j < this->dim_j) && (k < this->dim_k);
    }

    /// Resizes the array and sets every element to defaultValue.
    void resize(std::size_t size_i, std::size_t size_j, std::size_t size_k, const value_type& defaultValue) {
        this->setDims(size_i, size_j, size_k);
        this->data.assign(this->dim_ij * this->dim_k, defaultValue);
    }

    /// Resizes the array. Previous contents are erased: every element is
    /// reset to value_type(). (A plain vector::resize would keep old values at
    /// their old flat positions, which no longer match the new shape.)
    void resize(std::size_t size_i, std::size_t size_j, std::size_t size_k) {
        this->setDims(size_i, size_j, size_k);
        this->data.assign(this->dim_ij * this->dim_k, value_type());
    }

    /// Total number of stored elements (dim_i * dim_j * dim_k).
    std::size_t size() const { return this->data.size(); }

    /// Overwrites every element with defaultValue.
    void fillValue(const value_type& defaultValue) {
        std::fill(this->data.begin(), this->data.end(), defaultValue);
    }

    /// Smallest element. Throws std::out_of_range on an empty array: at()
    /// rejects the one-past-the-end position instead of dereferencing it.
    value_type minValue() const {
        return data.at(static_cast<std::size_t>(
            std::min_element(data.begin(), data.end()) - data.begin()));
    }

    /// Largest element. Throws std::out_of_range on an empty array.
    value_type maxValue() const {
        return data.at(static_cast<std::size_t>(
            std::max_element(data.begin(), data.end()) - data.begin()));
    }

    /// Sets this[n] = array1[n] + array2[n] over the flat range common to all
    /// three arrays; elements beyond that range are left untouched.
    void setValueSum(const Array3D& array1, const Array3D& array2) {
        const std::size_t common =
            std::min(std::min(array1.data.size(), array2.data.size()), this->data.size());
        for (std::size_t n = 0; n < common; ++n) {
            this->data[n] = array1.data[n] + array2.data[n];
        }
    }

    /// Drops all elements and resets the shape to 0x0x0.
    void clear() {
        dim_i = dim_j = dim_k = 0;
        dim_ij = 0;
        this->data.clear();
    }

    /// Element at (i,j,k) or (x,y,z) or (u,v,w); bounds-checked on the flat index.
    const value_type& operator()(std::size_t i, std::size_t j, std::size_t k) const {
        return this->data.at(this->calIndex(i, j, k));
    }
    value_type& operator()(std::size_t i, std::size_t j, std::size_t k) {
        return this->data.at(this->calIndex(i, j, k));
    }

    /// Element at flat (1-D) index i; bounds-checked.
    const value_type& operator[](std::size_t i) const { return this->data.at(i); }
    value_type& operator[](std::size_t i) { return this->data.at(i); }

    /// Pointer to the underlying storage. Resizing it directly bypasses the
    /// stored dimensions.
    std::vector<value_type>* rawData() { return &(data); }

private:
    /// Flat index of (i,j,k); i varies fastest.
    inline std::size_t calIndex(std::size_t i, std::size_t j, std::size_t k) const {
        return k * this->dim_ij + j * this->dim_i + i;
    }

    /// Records the new shape and the cached i-j plane size.
    void setDims(std::size_t size_i, std::size_t size_j, std::size_t size_k) {
        this->dim_i = size_i;
        this->dim_j = size_j;
        this->dim_k = size_k;
        this->dim_ij = size_i * size_j;
    }

private:
    // dimension of array (i,j,k)(x,y,z)(u,v,w)...
    std::size_t dim_i, dim_j, dim_k;
    // raw data, order is I-J-K
    std::vector<value_type> data;
    // dim_i*dim_j, cached for calIndex
    std::size_t dim_ij;
};
#endif // ARRAY3D_H
I measure the time using a Timer class downloaded from the internet.
// Time one complete solve. The measured interval also includes constructing
// and configuring the solver, not just the solveNew() call.
timer.start();
PCGSSolver solver;
solver.setTolerance(this->ccvar);
solver.setIteNum(this->nswpp);
solver.solveNew(sn,ae,aw,as,an,at,ab,ap,ptmp);
timer.stop();
// Elapsed time between start() and stop(), in seconds.
std::cout<<"PCGS time:"<<timer.getElapsedTimeInSec()<<"sec"<<std::endl;
Timer.h
//////////////////////////////////////////////////////////////////////////////
// Timer.h
// =======
// High Resolution Timer.
// This timer is able to measure the elapsed time with 1 micro-second accuracy
// in both Windows, Linux and Unix system
//
// AUTHOR: Song Ho Ahn (song.ahn#gmail.com)
// CREATED: 2003-01-13
// UPDATED: 2006-01-13
//
// Copyright (c) 2003 Song Ho Ahn
//////////////////////////////////////////////////////////////////////////////
#ifndef TIMER_H_DEF
#define TIMER_H_DEF
#ifdef WIN32 // Windows system specific
#include <windows.h>
#else // Unix based system specific
#include <sys/time.h>
#endif
// Stopwatch built on QueryPerformanceCounter (when WIN32 is defined) or
// gettimeofday (otherwise). Call start(), then stop(), then one of the
// getElapsedTime*() accessors; if stop() has not been called the accessors
// sample the current time instead.
class Timer
{
public:
Timer(); // zeroes all state; on WIN32 also queries the counter frequency
~Timer(); // nothing to release
void start(); // record the start point and clear the stopped flag
void stop(); // record the end point and set the stopped flag
double getElapsedTime(); // elapsed time in seconds (same as getElapsedTimeInSec)
double getElapsedTimeInSec(); // elapsed time in seconds
double getElapsedTimeInMilliSec(); // elapsed time in milliseconds
double getElapsedTimeInMicroSec(); // elapsed time in microseconds
protected:
private:
double startTimeInMicroSec; // start point converted to microseconds (set by getElapsedTimeInMicroSec)
double endTimeInMicroSec; // end point converted to microseconds (set by getElapsedTimeInMicroSec)
int stopped; // 1 after stop(), 0 after start() or construction
#ifdef WIN32
LARGE_INTEGER frequency; // ticks per second
LARGE_INTEGER startCount; // raw counter value captured by start()
LARGE_INTEGER endCount; // raw counter value captured by stop(), or on query while running
#else
timeval startCount; // time of day captured by start()
timeval endCount; // time of day captured by stop(), or on query while running
#endif
};
#endif // TIMER_H_DEF
Timer.cpp
//////////////////////////////////////////////////////////////////////////////
// Timer.cpp
// =========
// High Resolution Timer.
// This timer is able to measure the elapsed time with 1 micro-second accuracy
// in both Windows, Linux and Unix system
//
// AUTHOR: Song Ho Ahn (song.ahn#gmail.com)
// CREATED: 2003-01-13
// UPDATED: 2006-01-13
//
// Copyright (c) 2003 Song Ho Ahn
//////////////////////////////////////////////////////////////////////////////
#include "Timer.h"
#include <stdlib.h>
///////////////////////////////////////////////////////////////////////////////
// constructor
// Puts the timer into a zeroed, not-stopped state. On Windows the counter
// frequency is fetched once here so that later conversions can reuse it.
///////////////////////////////////////////////////////////////////////////////
Timer::Timer()
    : startTimeInMicroSec(0), endTimeInMicroSec(0), stopped(0)
{
#ifdef WIN32
    QueryPerformanceFrequency(&frequency);
    startCount.QuadPart = 0;
    endCount.QuadPart = 0;
#else
    startCount.tv_usec = 0;
    startCount.tv_sec = 0;
    endCount.tv_usec = 0;
    endCount.tv_sec = 0;
#endif
}
///////////////////////////////////////////////////////////////////////////////
// destructor
// Intentionally empty: the timer only stores plain values.
///////////////////////////////////////////////////////////////////////////////
Timer::~Timer()
{
}
///////////////////////////////////////////////////////////////////////////////
// start timer.
// startCount will be set at this point.
///////////////////////////////////////////////////////////////////////////////
void Timer::start()
{
stopped = 0; // reset stop flag
#ifdef WIN32
QueryPerformanceCounter(&startCount);
#else
gettimeofday(&startCount, NULL);
#endif
}
///////////////////////////////////////////////////////////////////////////////
// stop the timer.
// endCount will be set at this point.
///////////////////////////////////////////////////////////////////////////////
void Timer::stop()
{
stopped = 1; // set timer stopped flag
#ifdef WIN32
QueryPerformanceCounter(&endCount);
#else
gettimeofday(&endCount, NULL);
#endif
}
///////////////////////////////////////////////////////////////////////////////
// Elapsed time in microseconds.
// While the timer is still running the end point is sampled now; once
// stopped, the end point recorded by stop() is used. All other
// getElapsedTime*() accessors are derived from this value.
///////////////////////////////////////////////////////////////////////////////
double Timer::getElapsedTimeInMicroSec()
{
    const bool stillRunning = !stopped;
#ifdef WIN32
    if (stillRunning)
    {
        QueryPerformanceCounter(&endCount);
    }
    // Convert raw ticks to microseconds using the cached frequency.
    startTimeInMicroSec = startCount.QuadPart * (1000000.0 / frequency.QuadPart);
    endTimeInMicroSec = endCount.QuadPart * (1000000.0 / frequency.QuadPart);
#else
    if (stillRunning)
    {
        gettimeofday(&endCount, NULL);
    }
    // Whole seconds scaled to microseconds plus the sub-second remainder.
    startTimeInMicroSec = (startCount.tv_sec * 1000000.0) + startCount.tv_usec;
    endTimeInMicroSec = (endCount.tv_sec * 1000000.0) + endCount.tv_usec;
#endif
    const double elapsedMicros = endTimeInMicroSec - startTimeInMicroSec;
    return elapsedMicros;
}
///////////////////////////////////////////////////////////////////////////////
// Elapsed time in milliseconds (microseconds scaled by 0.001).
///////////////////////////////////////////////////////////////////////////////
double Timer::getElapsedTimeInMilliSec()
{
    const double micros = this->getElapsedTimeInMicroSec();
    return micros * 0.001;
}
///////////////////////////////////////////////////////////////////////////////
// Elapsed time in seconds (microseconds scaled by 0.000001).
///////////////////////////////////////////////////////////////////////////////
double Timer::getElapsedTimeInSec()
{
    const double micros = this->getElapsedTimeInMicroSec();
    return micros * 0.000001;
}
///////////////////////////////////////////////////////////////////////////////
// Alias for getElapsedTimeInSec(): elapsed time in seconds.
///////////////////////////////////////////////////////////////////////////////
double Timer::getElapsedTime()
{
    const double seconds = this->getElapsedTimeInSec();
    return seconds;
}
A quick glance over your code shows a few areas where you can improve the performance. I'll leave the implementation up to you.
OMP Parallel For
First, it's generally cheaper to use
#pragma omp parallel for
for (...) {
...
}
versus
#pragma omp parallel
{
#pragma omp for
for (...) {
...
}
}
Not by much but there is a slight improvement. See [1], the graphic at the end.
OMP SINGLE
The key benefit of using #pragma omp parallel for in this case is that it allows us to remove the #pragma omp single directive. A #pragma omp single block ends with an implicit barrier, so every thread waits there until the others have finished processing their chunk of work. This can lead to a situation where several of your threads finish early and have to wait for another thread before they can proceed.
Use of #pragma omp single and #pragma omp barrier is strongly discouraged in high-performance parallelised code.
Collapsing Loops (The Hard Way)
The next area you need to look at is collapsing your loops. The following
#pragma omp parallel for
for (int k = 0; k < o; ++k) {
for (int j = 0; j < m; ++j) {
for (int i = 0; i < n; ++i) {
...
}
}
}
will generally parallelise the outer loop for (int k = ...) but run the inner loops in serial on each thread. You can achieve parallelisation of the entire loop by unravelling them like
#pragma omp parallel for
for (int l = 0; l < o*m*n; ++l) {
int i = l % n;
int j = (l / n) % m;
int k = ((l / n) / m) % o;
...
}
In most of your loops you can simply use l and the overloaded [] operator. Most Conjugate Gradient solvers will only need the l index and not the i, j and k indices as they operate on vectors. The only time when i, j and k are needed is when you are computing A*x (or A'*x). This change will increase the level of parallelisation in your code and should provide noticeable improvements.
Collapsing Loops (The Easy Way)
It should be mentioned that OpenMP as of Version 3.0 supports the collapse(n) clause which can be used to tell the compiler to automatically collapse the for() loops as I've described above. An example of this is
#pragma omp parallel for collapse(3)
for (int k = 0; k < o; ++k) {
for (int j = 0; j < m; ++j) {
for (int i = 0; i < n; ++i) {
...
}
}
}
which will cause the compiler to form a single for() loop and then parallelise it.
Reduction Clause
Lastly, the reduction() clause is probably the most costly element in your code. Edit: in my haste to finish the answer, I previously and incorrectly said that it could be removed after collapsing the loops.
Source [1]
I don't know precisely why the OpenMP parallelisation doesn't make the code faster, but what's obvious is that you've got all your loops in the wrong order.
So, first things first: start by swapping the i and k loops in your code and I'm sure you'll see a dramatic performance boost. Then you can have a look at OpenMP.
I tried to write this code
// Question's attempt at a parallel "minimum value + its index" search.
// Kept exactly as posted; the notes below mark why it misbehaves.
float* theArray; // the array to find the minimum value
int index, i;
float thisValue, min;
index = 0;
min = theArray[0];
// NOTE(review): the reduction names min_dist, which is not declared anywhere
// in this snippet, while the loop body updates min.
// NOTE(review): index and thisValue are declared outside the parallel loop and
// are not listed in any clause, so all threads share and overwrite them.
#pragma omp parallel for reduction(min:min_dist)
for (i=1; i<size; i++) {
thisValue = theArray[i];
if (thisValue < min)
{ /* find the min and its array index */
min = thisValue;
index = i;
}
}
return(index);
However, this does not output correct answers. The min seems to be OK, but the index is corrupted by the threads.
I also tried some approaches found on the Internet and here (using parallel for on the outer loop and critical for the final comparison), but this causes a slowdown rather than a speedup.
What should I do to make both the min value and its index correct? Thanks!
I don't know of an elegant way to do a minimum reduction and save an index. I do this by finding the local minimum and index for each thread and then the global minimum and index in a critical section.
// Minimum value and its index: each thread scans its share of the array into
// private variables, then the per-thread results are merged one at a time.
index = 0;
min = theArray[0];
#pragma omp parallel
{
    // Seed the private copies from the array itself rather than from the
    // shared min/index: with nowait below, a fast thread may already be
    // writing those inside the critical section while a slower thread is
    // still starting up, and reading them here would be a data race.
    int index_local = 0;
    float min_local = theArray[0];
    // nowait: no barrier is needed before the merge, the critical section
    // already serialises it.
    #pragma omp for nowait
    for (i = 1; i < size; i++) {
        if (theArray[i] < min_local) {
            min_local = theArray[i];
            index_local = i;
        }
    }
    #pragma omp critical
    {
        // On equal values keep the lower index, so the answer does not depend
        // on the order in which threads reach this point and matches what a
        // serial scan would return.
        if (min_local < min || (min_local == min && index_local < index)) {
            min = min_local;
            index = index_local;
        }
    }
}
With OpenMP 4.0 it's possible to use user-defined reductions. A user-defined minimum reduction can be defined like this
/* Value/index pair carried through the reduction. */
struct Compare { float val; size_t index; };
/* Combiner keeps whichever side has the smaller value. The initializer makes
 * each thread's private copy start from the value the caller seeded the
 * reduction variable with; without it the private copies would not start from
 * that seed and could win the comparison with a value that is not in the array. */
#pragma omp declare reduction(minimum : struct Compare : omp_out = omp_in.val < omp_out.val ? omp_in : omp_out) initializer(omp_priv = omp_orig)
Then the reduction can be done like this
// Seed with the first element, then let the user-declared `minimum` reduction
// merge the per-thread results.
struct Compare min;
min.val = theArray[0];
min.index = 0;
#pragma omp parallel for reduction(minimum:min)
for(int i = 1; i<size; i++) {
    if(theArray[i]<min.val) {
        // Read from the same array that is being tested above.
        min.val = theArray[i];
        min.index = i;
    }
}
That works for C and C++. User-defined reductions have other advantages besides simplified code. There are multiple algorithms for doing reductions. For example, the merging can be done in O(number of threads) or O(log(number of threads)). The first solution I gave does this in O(number of threads); however, using user-defined reductions lets OpenMP choose the algorithm.
Basic Idea
This can be accomplished without any parallelization-breaking critical or atomic sections by creating a custom reduction. Basically, define an object that stores both the index and the value, and then create a function that orders two of these objects by the value only, not the index.
Details
An object to store an index and value together:
typedef std::pair<unsigned int, float> IndexValuePair;
You can access the index by accessing the first property and the value by accessing the second property, i.e.,
IndexValuePair obj(0, 2.345);
unsigned int ix = obj.first; // 0
float val = obj.second; // 2.345
Define a function to sort two IndexValuePair objects:
// Returns the pair with the smaller value (.second). When the values are
// equal the pair with the lower index (.first) wins, so the result does not
// depend on the order in which partial results are combined.
IndexValuePair myMin(IndexValuePair a, IndexValuePair b){
    if (a.second < b.second) return a;
    if (b.second < a.second) return b;
    return a.first <= b.first ? a : b;
}
Then, construct a custom reduction following the guidelines in the OpenMP documentation:
#pragma omp declare reduction \
(minPair:IndexValuePair:omp_out=myMin(omp_out, omp_in)) \
initializer(omp_priv = IndexValuePair(0, 1000))
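In this case, I've chosen to initialize the index to 0 and the value to 1000. The value should be initialized to some number larger than the largest value you expect to sort. If no such bound is known, use std::numeric_limits<float>::max(), or initialize from the data itself with initializer(omp_priv = omp_orig).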
Functional Example
Finally, combine all these pieces with the parallel for loop!
// Compile with g++ -std=c++11 -fopenmp demo.cpp
#include <iostream>
#include <utility>
#include <vector>
// (index, value) pair: .first is the position in the array, .second the value.
typedef std::pair<unsigned int, float> IndexValuePair;

// Returns the pair with the smaller value (.second). When the values are
// equal the pair with the lower index (.first) wins, so the result does not
// depend on the order in which partial results are combined.
IndexValuePair myMin(IndexValuePair a, IndexValuePair b){
    if (a.second < b.second) return a;
    if (b.second < a.second) return b;
    return a.first <= b.first ? a : b;
}
// Demo: find the minimum of a small vector together with its index using a
// user-declared OpenMP reduction over IndexValuePair.
int main(){
    std::vector<float> vals {10, 4, 6, 2, 8, 0, -1, 2, 3, 4, 4, 8};
    unsigned int i;

    // Seed from the data rather than from a fixed sentinel: a constant such
    // as 1000 silently gives the wrong answer as soon as every element is at
    // least that large.
    IndexValuePair minValueIndex(0, vals[0]);

    // omp_orig makes every thread's private copy start from the seed above.
    #pragma omp declare reduction \
        (minPair:IndexValuePair:omp_out=myMin(omp_out, omp_in)) \
        initializer(omp_priv = omp_orig)

    #pragma omp parallel for reduction(minPair:minValueIndex)
    for(i = 0; i < vals.size(); i++){
        if(vals[i] < minValueIndex.second){
            minValueIndex.first = i;
            minValueIndex.second = vals[i];
        }
    }

    std::cout << "minimum value = " << minValueIndex.second << std::endl;   // Should be -1
    std::cout << "index = " << minValueIndex.first << std::endl;    // Should be 6

    // Plain 0 instead of EXIT_SUCCESS: <cstdlib> is not among the includes.
    return 0;
}
Because you're not only trying to find the minimal value (reduction(min:___)) but also to retain its index, you need to make the check critical. This can significantly slow down the loop (as reported). In general, make sure that there is enough work so that you don't run into overhead problems as in this question. An alternative would be to have each thread find the minimum and its index, save them to a per-thread variable, and have the master thread do a final check on those, as in the following program.
#include <cstddef>
#include <ctime>
#include <iostream>
#include <limits>
#include <random>
#include <vector>
#include <omp.h>
using std::cout;
using std::vector;
// Resizes v to `count` elements and fills it with uniform random doubles in
// [0, 1). The generator is seeded from the current time, so contents differ
// between runs. `count` defaults to the 100,000,000 elements the benchmark uses.
void initializeVector(std::vector<double>& v, std::size_t count = 100000000)
{
    std::mt19937 generator(time(NULL));
    std::uniform_real_distribution<double> dis(0.0, 1.0);
    v.resize(count);
    // Range-for avoids comparing a signed counter against the unsigned size.
    for(double& element : v)
    {
        element = dis(generator);
    }
}
int main()
{
vector<double> vec;
initializeVector(vec);
float minVal = vec[0];
int minInd = 0;
int startTime = clock();
for(int i = 1; i < vec.size(); i++)
{
if(vec[i] < minVal)
{
minVal = vec[i];
minInd = i;
}
}
int elapsedTime1 = clock() - startTime;
// Change the number of threads accordingly
vector<float> threadRes(4, std::numeric_limits<float>::max());
vector<int> threadInd(4);
startTime = clock();
#pragma omp parallel for
for(int i = 0; i < vec.size(); i++)
{
{
if(vec[i] < threadRes[omp_get_thread_num()])
{
threadRes[omp_get_thread_num()] = vec[i];
threadInd[omp_get_thread_num()] = i;
}
}
}
float minVal2 = threadRes[0];
int minInd2 = threadInd[0];
for(int i = 1; i < threadRes.size(); i++)
{
if(threadRes[i] < minVal2)
{
minVal2 = threadRes[i];
minInd2 = threadInd[i];
}
}
int elapsedTime2 = clock() - startTime;
cout << "Min " << minVal << " at " << minInd << " took " << elapsedTime1 << std::endl;
cout << "Min " << minVal2 << " at " << minInd2 << " took " << elapsedTime2 << std::endl;
}
Please note that with optimizations on and nothing else to be done in the loop, the serial version seems to remain king. With optimizations turned off, OMP gains the upper hand.
P.S. You wrote reduction(min:min_dist) and then proceeded to use min instead of min_dist.
Actually, we can use the omp critical directive so that only one thread at a time runs the code inside the critical region. That way the index value won't be corrupted by other threads.
About omp critical directive:
The omp critical directive identifies a section of code that must be executed by a single thread at a time.
This code solves your issue:
#include <stdio.h>
#include <omp.h>
/* Finds the smallest element of a small array and its index, guarding the
 * shared result with an omp critical section. */
int main() {
    int i;
    int arr[10] = {11,42,53,64,55,46,47, 68, 59, 510};
    int size = 10;
    int index = 0;
    float min = arr[0];

    #pragma omp parallel for
    for (i = 1; i < size; i++) {
        /* Declared inside the loop so each thread has its own copy; a single
         * shared variable written outside the critical section is a data race. */
        float thisValue = arr[i];
        #pragma omp critical
        {
            /* Only one thread at a time may compare against and update the
             * shared min/index. On equal values the lower index wins so the
             * result does not depend on thread timing. */
            if (thisValue < min || (thisValue == min && i < index)) {
                min = thisValue;
                index = i;
            }
        }
    }
    /* min is a float: %d would be the wrong conversion for it. */
    printf("min:%g index:%d", (double)min, index);
    return 0;
}
When I use OpenMP with reduction(+ : sum) and no function call, the OpenMP version works fine.
#include <iostream>
#include <omp.h>
using namespace std;
// File-scope counter. This is the variable summation() modifies.
int sum = 0;
// Adds 1 to the file-scope sum. The read-modify-write is unsynchronised.
void summation()
{
sum = sum + 1;
}
// Question's test program, kept as posted: runs three parallel loops that each
// call summation() 1,000,000,000 times, then prints the local sum.
int main()
{
// NOTE(review): this local sum hides the file-scope sum and is never
// initialised. The reduction clauses below apply to this local variable,
// while summation() updates the file-scope one, which is therefore written
// by all threads at once without synchronisation.
int i,sum;
#pragma omp parallel for reduction (+ : sum)
for(i = 0; i < 1000000000; i++)
summation();
#pragma omp parallel for reduction (+ : sum)
for(i = 0; i < 1000000000; i++)
summation();
#pragma omp parallel for reduction (+ : sum)
for(i = 0; i < 1000000000; i++)
summation();
// Prints the local sum, not the file-scope counter that summation() changed.
std::cerr << "Sum is=" << sum << std::endl;
}
But when I call a summation function that operates on a global variable, the OpenMP version takes even more time than the sequential version.
I would like to know the reason for this and what changes should be made.
The summation function doesn't use the OMP shared variable that you are reducing to. Fix it:
#include <iostream>
#include <omp.h>
// Adds one to the counter passed in by reference, so the caller decides which
// variable (for example a reduction's private copy) is updated.
void summation(int& sum)
{
    sum += 1;
}
// Counts to 1,000,000,000 in parallel: every thread increments its own
// private copy of sum through summation(), and OpenMP adds the copies
// together at the end of the loop.
int main()
{
    // Must start at 0: a '+' reduction adds the threads' partial sums to the
    // value the variable held before the loop.
    int sum = 0;
    #pragma omp parallel for reduction (+ : sum)
    for(int i = 0; i < 1000000000; ++i)
        summation(sum);
    std::cerr << "Sum is=" << sum << '\n';
}
The time taken to synchronize access to this one variable will be far greater than what you gain by using multiple cores: the threads will all be endlessly waiting on each other, because there is only one variable and only one core can access it at a time. This design is not capable of concurrency, and all the synchronization you're paying for will just increase the run time.