OpenMP C++ (trouble with compiling) - c++

I decided to calculate e as the sum of a series, to get 2.718....
My code without OpenMP works perfectly, and I measured the time it takes for the calculation. However, when I used OpenMP to parallelize the calculation, I got an error. I am running the program on a Core i7 (8 cores: 4 physical and 4 logical). From what I have read, I should get roughly twice the speed of the version without OpenMP. Below is my code:
#include <iostream>
#include <time.h>
#include <math.h>
#include "fact.h"
#include <cstdlib>;
#include <conio.h>;
using namespace std;

int main()
{
    clock_t t1, t2;
    int n;
    long double exp = 0;
    long double y;
    int p;
    cout << "Enter n:";
    cin >> n;
    t1 = clock();
    #pragma omp parallel for num_threads(2);
    for (int i = 1; i < n; i++)
    {
        p = i + 1;
        exp = exp + (1 / ((fact(p))));
    }
    t2 = clock();
    double total_clock;
    total_clock = t2 - t1;
    long double total_exp;
    total_exp = exp + 2;
    cout << total_clock << "\n the time used for parallel calculations" << endl;
    cout << total_exp << endl;
    cin.get();
    getch();
    return 0;
}
fact() is the function used to calculate the factorial of a number:
long double fact(int N)
{
    if (N < 0)
        return 0;
    if (N == 0)
        return 1;
    else
        return N * fact(N - 1);
}
Error 3 error C3005: ;: unexpected token in directive OpenMP "parallel for" c:\users\александр\documents\visual studio 2012\projects\consoleapplication1\consoleapplication1\openmp.cpp 18

When using OpenMP pragmas, semicolons are not needed, hence:
"#pragma omp parallel for num_threads(2);"
should be "#pragma omp parallel for num_threads(2)"
without the ;
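Note that even with the semicolon removed, the loop updates the shared variables exp and p from multiple threads, which is a data race. A minimal corrected sketch of the loop (reusing the fact() helper from the question):

#pragma omp parallel for num_threads(2) reduction(+:exp)
for (int i = 1; i < n; i++)
{
    int p = i + 1;            // declared inside the loop, so each thread has its own copy
    exp = exp + 1 / fact(p);  // reduction(+:exp) gives each thread a private partial sum
}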


From serial to omp: no speedup

I'm new to OpenMP. I'm working on the All-Pairs Shortest Path algorithm, and here is the serial C++ routine I need to parallelize (complete code at the end of the post):
void mini(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
    size_t i, j;
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++)
            M[i][j] = min(rowk[j] + colk[i], M[i][j]);
}
At execution I get this:
$ time ./floyd
real 0m0,349s
user 0m0,349s
sys 0m0,000s
Now, I try to insert some directives:
void mini(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
    #pragma omp parallel
    {
        size_t i, j;
        #pragma omp parallel for
        for (i = 0; i < n; i++)
            for (j = 0; j < n; j++)
                M[i][j] = min(rowk[j] + colk[i], M[i][j]);
    }
}
Unfortunately, there is no speedup:
$ grep -c ^processor /proc/cpuinfo
4
$ time ./floyd
real 0m0,547s
user 0m2,073s
sys 0m0,004s
What am I doing wrong?
EDIT
Processor: Intel(R) Core(TM) i5-4590 CPU (4 hardware cores)
Complete code:
#include <cstdio>
#include <vector>
#include <limits>
#include <ctime>
#include <random>
#include <set>
#include <omp.h>
using namespace std;

typedef struct Wedge
{
    int a, b;
    double w;
} Wedge;

typedef pair<int, int> edge;

int randrange(int end, int start=0)
{
    random_device rd;
    mt19937 gen(rd());
    uniform_int_distribution<> dis(start, end-1);
    return dis(gen);
}

void relax_omp(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
    #pragma omp parallel
    {
        size_t i, j;
        #pragma omp parallel for
        for (i=0; i<n; i++)
            for (j=0; j<n; j++)
                M[i][j] = min(rowk[j]+colk[i], M[i][j]);
    }
}

void relax_serial(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
    size_t i, j;
    for (i=0; i<n; i++)
        for (j=0; j<n; j++)
            M[i][j] = min(rowk[j]+colk[i], M[i][j]);
}

void floyd(vector<vector<double>> &dist, bool serial)
{
    size_t i, k;
    size_t n {dist.size()};
    for (k=0; k<n; k++)
    {
        vector<double> rowk = dist[k];
        vector<double> colk(n);
        for (i=0; i<n; i++)
            colk[i] = dist[i][k];
        if (serial)
            relax_serial(dist, n, rowk, colk);
        else
            relax_omp(dist, n, rowk, colk);
    }
    for (i=0; i<n; i++)
        dist[i][i] = 0;
}

vector<Wedge> random_edges(int n, double density, double max_weight)
{
    int M {n*(n-1)/2};
    double m {density*M};
    set<edge> edges;
    vector<Wedge> wedges;
    while (edges.size() < m)
    {
        pair<int,int> L;
        L.first = randrange(n);
        L.second = randrange(n);
        if (L.first != L.second && edges.find(L) == edges.end())
        {
            double w = randrange(max_weight);
            Wedge wedge {L.first, L.second, w};
            wedges.push_back(wedge);
            edges.insert(L);
        }
    }
    return wedges;
}

vector<vector<double>> fill_distances(vector<Wedge> wedges, int n)
{
    double INF = std::numeric_limits<double>::infinity();
    size_t i, m = wedges.size();
    vector<vector<double>> dist(n, vector<double>(n, INF));
    int a, b;
    double w;
    for (i=0; i<m; i++)
    {
        a = wedges[i].a;
        b = wedges[i].b;
        w = wedges[i].w;
        dist[a][b] = w;
    }
    return dist;
}

int main(void)
{
    double density {0.33};
    double max_weight {200};
    int n {800};
    bool serial;
    int ntest = 10;
    double avge_serial = 0, avge_omp = 0;
    for (int i=0; i<ntest; i++)
    {
        vector<Wedge> wedges = random_edges(n, density, max_weight);
        vector<vector<double>> dist = fill_distances(wedges, n);
        double dtime;
        dtime = omp_get_wtime();
        serial = true;
        floyd(dist, serial);
        dtime = omp_get_wtime() - dtime;
        avge_serial += dtime;
        dtime = omp_get_wtime();
        serial = false;
        floyd(dist, serial);
        dtime = omp_get_wtime() - dtime;
        avge_omp += dtime;
    }
    printf("%d tests, n=%d\n", ntest, n);
    printf("Average serial : %.2lf\n", avge_serial/ntest);
    printf("Average openMP : %.2lf\n", avge_omp/ntest);
    return 0;
}
Output:
20 tests, n=800
Average serial : 0.31
Average openMP : 0.61
Command line:
g++ -std=c++11 -Wall -O2 -Wno-unused-result -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unused-parameter floyd.cpp -o floyd -lm -fopenmp
Your main issue is that you accidentally use nested parallelism:
#pragma omp parallel
{
    size_t i, j;
    #pragma omp parallel for
Since you are already inside a parallel region, the second directive should be
#pragma omp for
Otherwise, since an omp parallel for is equivalent to an omp parallel plus an omp for, you get two nested parallel regions, which is typically bad. Fixing this minor thing gets a ~2x speedup on a similar CPU.
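For clarity, here is the function with just that one directive changed:

void relax_omp(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
    #pragma omp parallel
    {
        size_t i, j;      // declared inside the parallel region, hence private per thread
        #pragma omp for   // work-sharing only: reuses the enclosing parallel region
        for (i = 0; i < n; i++)
            for (j = 0; j < n; j++)
                M[i][j] = min(rowk[j] + colk[i], M[i][j]);
    }
}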
There are several reasons why you are unlikely to get a full 4x speedup, including but not limited to:
Memory bandwidth as a bottleneck
Relative overhead due to the small amount of work done within the parallel loop
Lower clock frequencies with multiple threads in turbo mode
Edit:
By the way, the much more idiomatic way to write your code is the following:
void relax_omp(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
    #pragma omp parallel for
    for (size_t i = 0; i < n; i++) {
        for (size_t j = 0; j < n; j++) {
            M[i][j] = min(rowk[j] + colk[i], M[i][j]);
        }
    }
}
If you declare variables as locally as possible, OpenMP will almost always do the right thing, which in this case means that i and j are private. In general, it is much easier to reason about code written this way.
There could be many reasons for this, the most obvious being that the workload is too small to notice a speedup. The initial workload is 300 ms. I would suggest enclosing this in a serial outer loop that repeats the work at least 20 times; then you start with a serial time of (300 ms * 20) 6 seconds to test with.
The other factor is the availability of parallel cores on the machine you are running on. If your CPU has one core, multi-threading will cause a slowdown due to the cost of thread switching. 2 logical cores should show some speedup; 2 physical cores may show close to linear speedup.
Using pragma directives alone also does not guarantee that OpenMP is used. You have to compile with the -fopenmp command-line argument to guarantee that the OpenMP runtime is linked into your object code.
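For example, with g++ (matching the command line already shown in the question):

g++ -std=c++11 -O2 -fopenmp floyd.cpp -o floyd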
Edit
Looking at your code now, the factor that controls the amount of work seems to be N rather than the outer loop. The idea of the outer loop was to artificially increase the amount of work within the same timing period, but that can't be done here since you are trying to solve a specific problem. You can try parallelizing the nested loop as well (see the sketch below), but I think N = 800 is too low for parallelization to make a difference.
#pragma omp parallel for private(j) collapse(2)
j needs to be private to each iteration of the outer loop, hence private(j); otherwise j gets shared across all threads, leading to an inaccurate result.
Your loop body executes 640,000 times, which is not much for modern CPUs clocked at 3 GHz+; try something around N = 5000, which is 25M iterations.
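As a sketch, the collapsed version of the routine would look like this (with i and j declared outside the loops as in the original; note that collapse(2) requires OpenMP 3.0 or later):

void relax_omp(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
    size_t i, j;
    #pragma omp parallel for private(j) collapse(2)  // fuses both loops into one parallel iteration space
    for (i = 0; i < n; i++)
        for (j = 0; j < n; j++)
            M[i][j] = min(rowk[j] + colk[i], M[i][j]);
}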

task: OpenMP directive name required

Here is my code:
#include <stdio.h>
#include <cstdlib>
#include <locale>
#include <omp.h>
using namespace std;

typedef pair<int, int> pii;
typedef long long ll;

ll fib(int n) {
    if (n <= 1)
        return 1;
    ll a, b;
    #pragma omp task shared(a)
    a = fib(n - 1);
    #pragma omp task shared(b)
    b = fib(n - 2);
    #pragma omp taskwait
    return a + b;
}

int main(int argc, char* argv[]) {
    setlocale(LC_ALL, "");
    int n;
    scanf_s("%d", &n);
    printf("Result: %lld\n", fib(n));
    system("pause");
    return 0;
}
Visual Studio reports error C3001: "task: OpenMP directive name required".
If I comment out all the "pragma" lines, it works fine, so there must be a problem with OpenMP. Some other program with "#pragma omp parallel" works fine; the problem is only with the "task" directive.
What could be the problem?
Visual C++ supports only the OpenMP 2.0 standard.
OpenMP introduced tasks with OpenMP 3.0.
i.e., it's unsupported.
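For reference, on a compiler that implements OpenMP 3.0 or later (for example g++ with -fopenmp; the wrapper below is a sketch, not part of the original post), the task version is typically launched from a parallel region with a single creator thread:

#include <cstdio>
#include <omp.h>

long long fib(int n) {
    if (n <= 1)
        return 1;
    long long a, b;
    #pragma omp task shared(a)
    a = fib(n - 1);
    #pragma omp task shared(b)
    b = fib(n - 2);
    #pragma omp taskwait  // wait for both child tasks before combining
    return a + b;
}

int main() {
    long long result = 0;
    #pragma omp parallel
    {
        #pragma omp single  // only one thread creates the root tasks
        result = fib(30);
    }
    printf("Result: %lld\n", result);
    return 0;
}

In practice a sequential cutoff for small n is also needed, since creating a task per call quickly costs more than the work itself.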

OpenMP: count the number of iterations each thread performs in a loop

I decided to count the number of loop iterations each thread performs.
So I need to declare a variable and get the thread number on each iteration, right?
I got the thread numbers just fine (0, 1, 2, 3 for 4 threads), but when I created variables to compute the per-thread counts I got a problem.
#include <iostream>
#include <time.h>
#include <math.h>
#include "fact.h"
#include <cstdlib>;
#include <conio.h>;
#include <omp.h>
using namespace std;

int main()
{
    clock_t t1, t2;
    int n;
    long double exp = 0;
    long double y;
    int p;
    int axe;
    cout << "Enter n:";
    cin >> n;
    t1 = clock();
    int a = 0, b = 0, c = 0, d = 0;
    #pragma omp parallel for num_threads(4) reduction (+:exp)
    for (int i = 1; i < n; i++)
    {
        int l = omp_get_thread_num();
        cout << l << endl;
        if (l=0)      { a++; }
        else if (l=1) { b++; }
        else if (l=2) { c++; }
        else          { d++; }
        p = i + 1;
        exp = exp + (1 / ((fact(p))));
    }
    t2 = clock();
    double total_clock;
    total_clock = t2 - t1;
    long double total_exp;
    total_exp = exp + 2;
    cout << endl;
    cout << endl;
    cout << total_clock << "\t the time used for parallel calculations" << endl;
    cout << total_exp << endl;
    cout << a << " thread one" << endl;
    cout << b << " thread two" << endl;
    cout << c << " thread three" << endl;
    cout << d << " thread four" << endl;
    return 0;
}
I am not getting errors, but it shows the wrong number of iterations for each thread.
In this program I calculated the exponent, 2.71.
You need to use if (l == 0) etc. instead of if (l = 0). The latter assigns 0 to l rather than comparing l to 0.
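Note also that p is a shared variable written by every thread, which is another data race; declaring it inside the loop makes it private. A minimal corrected sketch of the loop:

#pragma omp parallel for num_threads(4) reduction(+:exp)
for (int i = 1; i < n; i++)
{
    int l = omp_get_thread_num();   // this thread's id
    if (l == 0)      { a++; }       // with ==, each counter is only touched by its own thread
    else if (l == 1) { b++; }
    else if (l == 2) { c++; }
    else             { d++; }
    int p = i + 1;                  // private copy: removes the race on the shared p
    exp = exp + 1 / fact(p);
}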

OpenMP - using functions

When I use OpenMP without function calls, with reduction(+ : sum), the OpenMP version works fine.
#include <iostream>
#include <omp.h>
using namespace std;

int sum = 0;

void summation()
{
    sum = sum + 1;
}

int main()
{
    int i, sum;
    #pragma omp parallel for reduction (+ : sum)
    for (i = 0; i < 1000000000; i++)
        summation();
    #pragma omp parallel for reduction (+ : sum)
    for (i = 0; i < 1000000000; i++)
        summation();
    #pragma omp parallel for reduction (+ : sum)
    for (i = 0; i < 1000000000; i++)
        summation();
    std::cerr << "Sum is=" << sum << std::endl;
}
But when I call the function summation on a global variable, the OpenMP version takes even more time than the sequential version.
I would like to know the reason for this and what changes should be made.
The summation function doesn't use the OpenMP reduction variable that you are reducing into. Fix it:
#include <iostream>
#include <omp.h>

void summation(int& sum) { sum++; }

int main()
{
    int sum = 0;  // initialize: the reduction combines the threads' partial sums with this initial value
    #pragma omp parallel for reduction (+ : sum)
    for (int i = 0; i < 1000000000; ++i)
        summation(sum);
    std::cerr << "Sum is=" << sum << '\n';
}
The time taken to synchronize access to this one variable will far exceed what you gain by using multiple cores: they will all be endlessly waiting on each other, because there is only one variable and only one core can access it at a time. This design is not capable of concurrency, and all the synchronization you are paying for just increases the run time.
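To illustrate (a hypothetical variant, not from the post): the shared global can be made correct with an atomic update, but then every increment synchronizes, which is exactly the cost described above:

#include <iostream>
#include <omp.h>

int sum = 0;  // one shared global

int main()
{
    #pragma omp parallel for
    for (int i = 0; i < 100000000; ++i)
    {
        #pragma omp atomic  // correct, but all threads contend on this single variable
        sum++;
    }
    std::cerr << "Sum is=" << sum << '\n';
    return 0;
}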

Couldn't get acceleration with OpenMP

I am writing a simple parallel program in C++ using OpenMP.
I am working on Windows 7 with Microsoft Visual Studio 2010 Ultimate.
I set the OpenMP Support option (under the Language properties of the project) to "Yes (/openmp)".
Here I provide the code:
#include <iostream>
#include <omp.h>
using namespace std;

double sum;
int i;
int n = 800000000;

int main(int argc, char *argv[])
{
    omp_set_dynamic(0);
    omp_set_num_threads(4);
    sum = 0;
    #pragma omp for reduction(+:sum)
    for (i = 0; i < n; i++)
        sum += i / (n / 10);
    cout << "sum=" << sum << endl;
    return EXIT_SUCCESS;
}
But I couldn't get any acceleration by changing the x in omp_set_num_threads(x).
It doesn't matter whether I use OpenMP or not; the calculation time is the same, about 7 seconds.
Does someone know what the problem is?
Your pragma statement is missing the parallel specifier:
#include <iostream>
#include <omp.h>
using namespace std;

double sum;
int i;
int n = 800000000;

int main(int argc, char *argv[])
{
    omp_set_dynamic(0);
    omp_set_num_threads(4);
    sum = 0;
    #pragma omp parallel for reduction(+:sum) // add "parallel"
    for (i = 0; i < n; i++)
        sum += i / (n / 10);
    cout << "sum=" << sum << endl;
    return EXIT_SUCCESS;
}
Sequential:
sum=3.6e+009
2.30071
Parallel:
sum=3.6e+009
0.618365
Here's a version that gets some speedup from Hyper-Threading. I had to increase the number of iterations by 10x and bump the data types to long long:
#include <iostream>
#include <cstdlib>
#include <omp.h>
using namespace std;

double sum;
long long i;
long long n = 8000000000;

int main(int argc, char *argv[])
{
    omp_set_dynamic(0);
    omp_set_num_threads(8);
    double start = omp_get_wtime();
    sum = 0;
    #pragma omp parallel for reduction(+:sum)
    for (i = 0; i < n; i++)
        sum += i / (n / 10);
    cout << "sum=" << sum << endl;
    double end = omp_get_wtime();
    cout << end - start << endl;
    system("pause");
    return EXIT_SUCCESS;
}
Threads: 1
sum=3.6e+010
13.0541
Threads: 2
sum=3.6e+010
6.62345
Threads: 4
sum=3.6e+010
3.85687
Threads: 8
sum=3.6e+010
3.285
Apart from the error pointed out by Mystical, you seem to assume that OpenMP can just do magic. It can at best use all the cores on your machine. If you have 2 cores, it may halve the execution time if you call omp_set_num_threads(np) with np >= 2, but for np much larger than the number of cores, the code will be inefficient due to parallelization overheads.
The example from Mystical was apparently run on at least 4 cores with np = 4.
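As a quick sanity check (a generic sketch, not from the original answers), you can print how many threads and processors a program actually gets:

#include <cstdio>
#include <omp.h>

int main()
{
    printf("max threads: %d\n", omp_get_max_threads());  // threads a parallel region would use
    printf("processors:  %d\n", omp_get_num_procs());    // logical processors available
    #pragma omp parallel
    {
        #pragma omp single  // print once from inside the team
        printf("team size:   %d\n", omp_get_num_threads());
    }
    return 0;
}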