I have the following loop that I'd like to accelerate using #pragma omp simd:
#define N 1024
double* data = new double[N];
// Generate data, not important how.
double mean = 0.0;
for (size_t i = 0; i < N; i++) {
mean += (data[i] - mean) / (i+1);
}
As I expected, just putting #pragma omp simd directly before the loop has no impact (I'm examining running times). I can tackle the multi-threaded case easily enough using #pragma omp parallel for reduction(...) with a custom reducer as shown below, but how do I put OpenMP SIMD to use here?
I'm using the following class for implementing the + and += operators for adding a double to a running mean as well as combining two running means:
class RunningMean {
private:
double mean;
size_t count;
public:
RunningMean(): mean(0), count(0) {}
RunningMean(double m, size_t c): mean(m), count(c) {}
RunningMean operator+(RunningMean& rhs) {
size_t c = this->count + rhs.count;
double m = (this->mean*this->count + rhs.mean*rhs.count) / c;
return RunningMean(m, c);
}
RunningMean operator+(double rhs) {
size_t c = this->count + 1;
double m = this->mean + (rhs - this->mean) / c;
return RunningMean(m, c);
}
RunningMean& operator+=(const RunningMean& rhs) {
this->mean = this->mean*this->count + rhs.mean*rhs.count;
this->count += rhs.count;
this->mean /= this->count;
return *this;
}
RunningMean& operator+=(double rhs) {
this->count++;
this->mean += (rhs - this->mean) / this->count;
return *this;
}
double getMean() { return mean; }
size_t getCount() { return count; }
};
The maths for this comes from http://prod.sandia.gov/techlib/access-control.cgi/2008/086212.pdf.
For multi-threaded, non-SIMD parallel reduction I do the following:
#pragma omp declare reduction (runningmean : RunningMean : omp_out += omp_in)
RunningMean mean;
#pragma omp parallel for reduction(runningmean:mean)
for (size_t i = 0; i < N; i++)
mean += data[i];
This gives me a 3.2X speedup on my Core i7 2600k using 8 threads.
If I were to implement the SIMD myself without OpenMP, I would just maintain 4 means in one vector and 4 counts in another (assuming the use of AVX instructions) and keep adding 4-element double-precision vectors using a vectorised version of operator+(double rhs). Once that is done, I would combine the resulting 4 pairs of means and counts using the maths from operator+=. How can I instruct OpenMP to do this?
The problem is that
mean += (data[i] - mean) / (i+1);
is not easily amenable to SIMD. However, by studying the math carefully it's possible to vectorize this without too much effort.
The key formula is
mean(n+m) = (n*mean(n) + m*mean(m))/(n+m)
which shows how to combine the mean of n numbers with the mean of m numbers. This is exactly what your operator definition RunningMean operator+(RunningMean& rhs) does, and it explains why your parallel code works. I think this is clearer if we unpack your C++ code:
double mean = 0.0;
int count = 0;
#pragma omp parallel
{
double mean_private = 0.0;
int count_private = 0;
#pragma omp for nowait
for(size_t i=0; i<N; i++) {
count_private ++;
mean_private += (data[i] - mean_private)/count_private;
}
#pragma omp critical
{
mean = (count_private*mean_private + count*mean);
count += count_private;
mean /= count;
}
}
We can use the same idea with SIMD, and then combine the two. But let's first do the SIMD-only part. Using AVX we can handle four parallel means at once. Each parallel mean will handle the following data elements:
mean 1 data elements: 0, 4, 8, 12,...
mean 2 data elements: 1, 5, 9, 13,...
mean 3 data elements: 2, 6, 10, 14,...
mean 4 data elements: 3, 7, 11, 15,...
Once we have looped over all the elements, we add the four parallel means together and divide by four (since each one runs over N/4 elements).
Here is the code to do this:
double mean = 0.0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
for(size_t i=0; i<N/4; i++) {
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean = _mm_cvtsd_f64(t3)/4;
int count = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count++;
mean2 += (data[i] - mean2)/count;
}
mean = (4*(N/4)*mean + count*mean2)/N;
Finally, we can combine this with OpenMP to get the full benefit of SIMD and MIMD like this
double mean = 0.0;
int count = 0;
#pragma omp parallel
{
double mean_private = 0.0;
int count_private = 0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
#pragma omp for nowait
for(size_t i=0; i<N/4; i++) {
count_private++;
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean_private = _mm_cvtsd_f64(t3)/4;
#pragma omp critical
{
mean = (count_private*mean_private + count*mean);
count += count_private;
mean /= count;
}
}
int count2 = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count2++;
mean2 += (data[i] - mean2)/count2;
}
mean = (4*(N/4)*mean + count2*mean2)/N;
And here is a working example (compile with -O3 -mavx -fopenmp)
#include <stdio.h>
#include <stdlib.h>
#include <x86intrin.h>
double mean_simd(double *data, const int N) {
double mean = 0.0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
for(size_t i=0; i<N/4; i++) {
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean = _mm_cvtsd_f64(t3)/4;
int count = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count++;
mean2 += (data[i] - mean2)/count;
}
mean = (4*(N/4)*mean + count*mean2)/N;
return mean;
}
double mean_simd_omp(double *data, const int N) {
double mean = 0.0;
int count = 0;
#pragma omp parallel
{
double mean_private = 0.0;
int count_private = 0;
__m256d mean4 = _mm256_set1_pd(0.0);
__m256d count4 = _mm256_set1_pd(0.0);
#pragma omp for nowait
for(size_t i=0; i<N/4; i++) {
count_private++;
count4 = _mm256_add_pd(count4,_mm256_set1_pd(1.0));
__m256d t1 = _mm256_loadu_pd(&data[4*i]);
__m256d t2 = _mm256_div_pd(_mm256_sub_pd(t1, mean4), count4);
mean4 = _mm256_add_pd(t2, mean4);
}
__m256d t1 = _mm256_hadd_pd(mean4,mean4);
__m128d t2 = _mm256_extractf128_pd(t1,1);
__m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
mean_private = _mm_cvtsd_f64(t3)/4;
#pragma omp critical
{
mean = (count_private*mean_private + count*mean);
count += count_private;
mean /= count;
}
}
int count2 = 0;
double mean2 = 0;
for(size_t i=4*(N/4); i<N; i++) {
count2++;
mean2 += (data[i] - mean2)/count2;
}
mean = (4*(N/4)*mean + count2*mean2)/N;
return mean;
}
int main() {
const int N = 1001;
double data[N];
for(int i=0; i<N; i++) data[i] = 1.0*rand()/RAND_MAX;
double sum = 0; for(int i=0; i<N; i++) sum += data[i]; sum /= N;
printf("mean %f\n", sum);
printf("mean_simd %f\n", mean_simd(data, N));
printf("mean_simd_omp %f\n", mean_simd_omp(data, N));
}
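As a side note on the original #pragma omp simd question: the same four-lane idea can also be written without intrinsics by keeping the lanes in a small array and marking only the lane loop as SIMD. The sketch below is my own illustration, not part of the tested code above (the name mean_simd_pragma is made up), and whether a given compiler actually vectorizes the lane loop is not guaranteed; the intrinsics version remains the explicit route.
double mean_simd_pragma(double *data, const int N) {
    double mean4[4] = {0.0, 0.0, 0.0, 0.0};
    double count = 0.0;
    for (int i = 0; i < N/4; i++) {
        count += 1.0;
        // Four independent running means, one per lane; the compiler may map
        // this loop onto a single AVX register if it honours the simd hint.
        #pragma omp simd
        for (int lane = 0; lane < 4; lane++)
            mean4[lane] += (data[4*i + lane] - mean4[lane]) / count;
    }
    // Each lane saw N/4 elements, so the combined mean is their plain average.
    double mean = (mean4[0] + mean4[1] + mean4[2] + mean4[3]) / 4;
    // Handle the remainder exactly as in mean_simd().
    int count2 = 0;
    double mean2 = 0;
    for (int i = 4*(N/4); i < N; i++) {
        count2++;
        mean2 += (data[i] - mean2) / count2;
    }
    return (4*(N/4)*mean + count2*mean2) / N;
}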
The KISS answer: Just calculate the mean outside the loop. Parallelize the following code:
double sum = 0.0;
for(size_t i = 0; i < N; i++) sum += data[i];
double mean = sum/N;
The sum is easily parallelizable, but you won't see any effect from SIMD optimization: it is purely memory bound, so the CPU will only be waiting for data from memory. If N is as small as 1024, there is little point even in parallelization; the synchronization overhead will eat up all the gains.
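If you do want both threading and vectorization on this sum, here is a hedged sketch (my addition) using the combined construct; as noted above, for N = 1024 the threading part is unlikely to pay off:
double sum = 0.0;
// Each thread and each SIMD lane keeps a private partial sum; OpenMP combines them.
#pragma omp parallel for simd reduction(+:sum)
for (size_t i = 0; i < N; i++)
    sum += data[i];
double mean = sum/N;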
The goal is to add OpenMP parallelization to the for (i = 0; i < n; i++) loop of a lower-triangular solver for systems of the form Ax=b. The expected result is exactly the same as the result with NO parallelization added to for (i = 0; i < n; i++).
vector<vector<double>> represents a 2-D matrix. makeMatrix(int m, int n) initializes a vector<vector<double>> of all zeroes of size mxn.
Two of the most prominent attempts have been left in the comments.
vector<vector<double>> lowerTriangleSolver(vector<vector<double>> A, vector<vector<double>> b)
{
vector<vector<double>> x = makeMatrix(A.size(), 1);
int i, j;
int n = A.size();
double s;
//#pragma omp parallel for reduction(+: s)
//#pragma omp parallel for shared(s)
for (i = 0; i < n; i++)
{
s = 0.0;
#pragma omp parallel for
for (j = 0; j < i; j++)
{
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}
return x;
}
You could try to distribute the outer loop iterations among the threads instead of the inner loop iterations. That way you increase the granularity of the parallel tasks and avoid the reduction on the s variable.
#pragma omp parallel for
for (int i = 0; i < n; i++){
double s = 0.0;
for (int j = 0; j < i; j++){
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}
Unfortunately, that is not possible, because there is a dependency between s = s + A[i][j] * x[j][0]; and x[i][0] = (b[i][0] - s) / A[i][i];: the x[j][0] read in iteration i is the x[i][0] written by an earlier iteration of the outer loop.
So you can try two approaches:
for (int i = 0; i < n; i++){
double s = 0.0;
#pragma omp parallel for reduction(+:s)
for (int j = 0; j < i; j++){
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}
or using SIMD:
for (int i = 0; i < n; i++){
double s = 0.0;
#pragma omp simd reduction(+:s)
for (int j = 0; j < i; j++){
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}
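Going one step further than the two snippets above (this combination is my own suggestion, not part of the answer): the thread-level and SIMD-level reductions can also be merged into a single combined construct. Keep in mind that opening a parallel region on every outer iteration has overhead, so for small i the SIMD-only version may well be faster.
for (int i = 0; i < n; i++){
    double s = 0.0;
    // Threads and SIMD lanes share the reduction on s.
    #pragma omp parallel for simd reduction(+:s)
    for (int j = 0; j < i; j++){
        s = s + A[i][j] * x[j][0];
    }
    x[i][0] = (b[i][0] - s) / A[i][i];
}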
I'm working on a factorial function. I have to write a parallel version of it using OpenMP.
double sequentialFactorial(const int N) {
double result = 1;
for(int i = 1; i <= N; i++) {
result *= i;
}
return result;
}
It is well known that this algorithm can be efficiently parallelized using the reduction technique.
I'm aware of the existence of the reduction clause (standard § 2.15.3.6).
double parallelAutomaticFactorial(const int N) {
double result = 1;
#pragma omp parallel for reduction(*:result)
for (int i=1; i <= N; i++)
result *= i;
return result;
}
However, I want to try to implement the reduction technique by hand.
double parallelHandmadeFactorial(const int N) {
// maximum number of threads
const int N_THREADS = omp_get_max_threads();
// table of partial results
double* partial = new double[N_THREADS];
for(int i = 0; i < N_THREADS; i++) {
partial[i] = 1;
}
// reduction technique
#pragma omp parallel for
for(int i = 1; i <= N; i++) {
int thread_index = omp_get_thread_num();
partial[thread_index] *= i;
}
// fold results
double result = 1;
for(int i = 0; i < N_THREADS; i++) {
result *= partial[i];
}
delete[] partial;
return result;
}
I expect the performance of the last two snippets to be very similar, and better than the first one. However, the average timings are:
Sequential Factorial 3500 ms
Parallel Handmade Factorial 6100 ms
Parallel Automatic Factorial 600 ms
Am I missing something?
Thanks to @Gilles and @P.W, this code works as expected:
double parallelNoWaitFactorial(const int N) {
double result = 1;
#pragma omp parallel
{
double my_local_result = 1;
// removing nowait does not change the performance
#pragma omp for nowait
for(int i = 1; i <= N; i++)
my_local_result *= i;
#pragma omp atomic
result *= my_local_result;
}
return result;
}
If array elements happen to share a cache line, this leads to false sharing, which in turn degrades performance.
To avoid this:
Use a private variable double partial instead of the double array partial.
Use the partial result of each thread to compute the final result in a critical region.
This final result should be a variable that is not private to the parallel region.
The critical region will look like this:
#pragma omp critical
result *= partial;
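Purely to illustrate the false-sharing diagnosis (this is not the fix recommended above, just a sketch of my own; the name parallelPaddedFactorial and the 64-byte cache-line size are assumptions), one can also keep the per-thread array but pad each slot onto its own cache line:
#include <omp.h>

double parallelPaddedFactorial(const int N) {
    const int N_THREADS = omp_get_max_threads();
    // Pad each partial result so no two threads write to the same cache line.
    struct Padded { double value; char pad[64 - sizeof(double)]; };
    Padded* partial = new Padded[N_THREADS];
    for (int i = 0; i < N_THREADS; i++) partial[i].value = 1;
    #pragma omp parallel for
    for (int i = 1; i <= N; i++)
        partial[omp_get_thread_num()].value *= i;
    double result = 1;
    for (int i = 0; i < N_THREADS; i++) result *= partial[i].value;
    delete[] partial;
    return result;
}
The private-variable-plus-critical approach described above remains the cleaner solution; the padded array is only meant to show that the slowdown really comes from the shared cache lines.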
I wrote code to test the performance of OpenMP on Windows (Win7 x64, Core i7 3.4 GHz) and on Mac (macOS 10.12.3, Core i7 2.7 GHz).
In Xcode I made a console application with the default compiler settings. I use LLVM 3.7 and OpenMP 5 (in omp.h I found #define KMP_VERSION_MAJOR 5, #define KMP_VERSION_MINOR 0 and KMP_VERSION_BUILD 20150701, libiomp5) on macOS 10.12.3 (Core i7, 2.7 GHz).
For Windows I use VS2010 SP1. Additionally I set C/C++ -> Optimization -> Optimization = Maximize Speed (/O2) and C/C++ -> Optimization -> Favor Size Or Speed = Favor Fast Code (/Ot).
If I run the application in a single thread, the time difference roughly corresponds to the ratio of the processor frequencies. But if I run 4 threads, the difference becomes dramatic: the Windows program is ~70 times faster than the Mac program.
#include <cmath>
#include <mutex>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <omp.h>
#include <boost/chrono/chrono.hpp>
static double ActionWithNumber(double number)
{
double sum = 0.0f;
for (std::uint32_t i = 0; i < 50; i++)
{
double coeff = sqrt(pow(std::abs(number), 0.1));
double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
sum += sqrt(res);
}
return sum;
}
static double TestOpenMP(void)
{
const std::uint32_t len = 4000000;
double *a;
double *b;
double *c;
double sum = 0.0;
std::mutex _mutex;
a = new double[len];
b = new double[len];
c = new double[len];
for (std::uint32_t i = 0; i < len; i++)
{
c[i] = 0.0;
a[i] = sin((double)i);
b[i] = cos((double)i);
}
boost::chrono::time_point<boost::chrono::system_clock> start, end;
start = boost::chrono::system_clock::now();
double k = 2.0;
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
c[i] = k*a[i] + b[i] + k;
if (c[i] > 0.0)
{
c[i] += ActionWithNumber(c[i]);
}
else
{
c[i] -= ActionWithNumber(c[i]);
}
std::lock_guard<std::mutex> scoped(_mutex);
sum += c[i];
}
end = boost::chrono::system_clock::now();
boost::chrono::duration<double> elapsed_time = end - start;
double sum2 = 0.0;
for (std::uint32_t i = 0; i < len; i++)
{
sum2 += c[i];
c[i] /= sum2;
}
if (std::abs(sum - sum2) > 0.01) printf("Incorrect result.\n");
delete[] a;
delete[] b;
delete[] c;
return elapsed_time.count();
}
int main()
{
double sum = 0.0;
const std::uint32_t steps = 5;
for (std::uint32_t i = 0; i < steps; i++)
{
sum += TestOpenMP();
}
sum /= (double)steps;
std::cout << "Elapsed time = " << sum;
return 0;
}
I specifically use a mutex here to compare the performance of OpenMP on the Mac and on Windows. On Windows the function returns a time of 0.39 seconds; on the Mac it returns 25 seconds, i.e. about 70 times slower.
What is the cause of this difference?
First of all, thanks for editing my post (I use a translator to write the text).
In the real app, I update the values of a huge matrix (20000x20000) in random order. Each thread determines a new value and writes it to a particular cell. I create a mutex for each row, since in most cases different threads write to different rows, but apparently when two threads write to the same row there is a long lock. At the moment I can't assign the rows to different threads, since the order of the writes is determined by the FEM elements.
Simply putting a critical section in there is not an option, as it would block writes to the entire matrix.
I wrote code similar to the real application.
static double ActionWithNumber(double number)
{
const unsigned int steps = 5000;
double sum = 0.0f;
for (unsigned int i = 0; i < steps; i++)
{
double coeff = sqrt(pow(std::abs(number), 0.1));
double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
sum += sqrt(res);
}
sum /= (double)steps;
return sum;
}
static double RealAppTest(void)
{
const unsigned int elementsNum = 10000;
double* matrix;
unsigned int* elements;
boost::mutex* mutexes;
elements = new unsigned int[elementsNum*3];
matrix = new double[elementsNum*elementsNum];
mutexes = new boost::mutex[elementsNum];
for (unsigned int i = 0; i < elementsNum; i++)
for (unsigned int j = 0; j < elementsNum; j++)
matrix[i*elementsNum + j] = (double)(rand() % 100);
for (unsigned int i = 0; i < elementsNum; i++) //build FEM element like Triangle
{
elements[3*i] = rand()%(elementsNum-1);
elements[3*i+1] = rand()%(elementsNum-1);
elements[3*i+2] = rand()%(elementsNum-1);
}
boost::chrono::time_point<boost::chrono::system_clock> start, end;
start = boost::chrono::system_clock::now();
omp_set_num_threads(4);
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
unsigned int* elems = &elements[3*i];
for (unsigned int j = 0; j < 3; j++)
{
//in here set mutex for row with index = elems[j];
boost::lock_guard<boost::mutex> lockup(mutexes[i]);
double res = 0.0;
for (unsigned int k = 0; k < 3; k++)
{
res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
}
for (unsigned int k = 0; k < 3; k++)
{
matrix[elems[j]*elementsNum + elems[k]] = res;
}
}
}
end = boost::chrono::system_clock::now();
boost::chrono::duration<double> elapsed_time = end - start;
delete[] elements;
delete[] matrix;
delete[] mutexes;
return elapsed_time.count();
}
int main()
{
double sum = 0.0;
const unsigned int steps = 5;
for (unsigned int i = 0; i < steps; i++)
{
sum += RealAppTest();
}
sum /= (double)steps;
std::cout<<"Elapsed time = " << sum;
return 0;
}
You're combining two different sets of threading/synchronization primitives - OpenMP, which is built into the compiler and has a runtime system, and manually creating a posix mutex with std::mutex. It's probably not surprising that there are some interoperability hiccups with some compiler/OS combinations.
My guess here is that in the slow case, the OpenMP runtime is going overboard to make sure that there are no interactions between higher-level ongoing OpenMP threading tasks and the manual mutex, and that doing so inside a tight loop causes the dramatic slowdown.
For mutex-like behaviour in the OpenMP framework, we can use critical sections:
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
//...
// replacing this: std::lock_guard<std::mutex> scoped(_mutex);
#pragma omp critical
sum += c[i];
}
or explicit locks:
omp_lock_t sumlock;
omp_init_lock(&sumlock);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
//...
// replacing this: std::lock_guard<std::mutex> scoped(_mutex);
omp_set_lock(&sumlock);
sum += c[i];
omp_unset_lock(&sumlock);
}
omp_destroy_lock(&sumlock);
We get much more reasonable timings:
$ time ./openmp-original
real 1m41.119s
user 1m15.961s
sys 1m53.919s
$ time ./openmp-critical
real 0m16.470s
user 1m2.313s
sys 0m0.599s
$ time ./openmp-locks
real 0m15.819s
user 1m0.820s
sys 0m0.276s
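As an aside that goes beyond what the question is measuring (it deliberately serializes the accumulation to compare locking), for this particular sum a reduction clause removes the per-iteration lock entirely; a hedged sketch:
#pragma omp parallel for reduction(+:sum)
for (int i = 0; i < len; i++)
{
    //...
    sum += c[i];   // each thread accumulates a private copy; OpenMP combines them at the end
}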
Updated: There's no problem with using an array of OpenMP locks in exactly the same way as the mutexes:
omp_lock_t sumlocks[elementsNum];
for (unsigned idx=0; idx<elementsNum; idx++)
omp_init_lock(&(sumlocks[idx]));
//...
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
unsigned int* elems = &elements[3*i];
for (unsigned int j = 0; j < 3; j++)
{
//in here set mutex for row with index = elems[j];
double res = 0.0;
for (unsigned int k = 0; k < 3; k++)
{
res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
}
omp_set_lock(&(sumlocks[i]));
for (unsigned int k = 0; k < 3; k++)
{
matrix[elems[j]*elementsNum + elems[k]] = res;
}
omp_unset_lock(&(sumlocks[i]));
}
}
for (unsigned idx=0; idx<elementsNum; idx++)
omp_destroy_lock(&(sumlocks[idx]));
I'm trying to compute an integral:
#include <iostream>
#include <omp.h>
using namespace std;
double my_exp(double x) {
double res = 1., term = 1.;
for(int n=1; n<=1000; n++) {
term *= x / n;
res += term;
}
return res;
}
double f(double x) {
return x*my_exp(x);
}
int main() {
double a=0., b=1., result = 0.;
int nsteps = 1000000;
double h = (b - a)/nsteps;
for(int i=1; i<nsteps; i++) result += f(a + i*h);
result = (result + .5*(f(a) + f(b)))*h;
cout.precision(15);
cout << "Result: " << result << endl;
return 0;
}
This program computes the integral and prints Result: 1.00000000000035, but the execution time is long.
I need to parallelize my program. I think I should add #pragma omp parallel for, but it doesn't work.
Change your main function:
#pragma omp parallel
{
double localresult = 0.0;
#pragma omp for
for(int i=1; i<nsteps; i++)
localresult += f(a + i*h);
#pragma omp critical
{
result += localresult;
}
}
result = (result + .5*(f(a) + f(b)))*h;
Edit: the much simpler solution, along the lines of muXXmit2X's answer, would be
#pragma omp parallel for reduction(+:result)
for(int i=1; i<nsteps; i++) result += f(a + i*h);
result = (result + .5*(f(a) + f(b)))*h;
I am trying to implement the Viterbi algorithm with the help of OpenMP. So far, my test shows that the execution time of the parallel program is approximately 4 times the execution time of the sequential program. Here is my code:
#include <omp.h>
#include <stdio.h>
#include <time.h>
#define K 39 // num states
#define T 1500 // num obs sequence
int states[K];
double transition[K][K];
double emission[K][K];
double init_prob[K];
int observation[T];
using namespace std;
void generateValues()
{
srand(time(NULL));
for(int i=0; i<T; i++)
{
observation[i] = rand() % K;
}
for(int i=0; i<K; i++)
{
states[i] = i;
init_prob[i] = (double)rand() / (double)RAND_MAX;
for(int j=0;j<K;j++)
{
transition[i][j] = (double)rand() / (double)RAND_MAX;
srand(time(NULL));
emission[i][j] = (double)rand() / (double)RAND_MAX;
}
}
}
int* viterbi(int *S, double *initp, int *Y, double A[][K], double B[][K])
{
double T1[K][T];
int T2[K][T];
#pragma omp parallel for
for(int i=0; i<K; ++i)
{
T1[i][0] = initp[i];
T2[i][0] = 0;
}
for(int i=1; i<T; ++i)
{
double max, temp;
int argmax;
#pragma omp parallel for private (max, temp, argmax)
for(int j=0; j<K; ++j)
{
max = -1;
#pragma omp parallel for
for(int k=0; k<K; ++k)
{
temp = T1[k][i-1] * A[k][j] * B[k][Y[i-1]];
if(temp > max)
{
max = temp;
argmax = k;
}
}
T1[j][i] = max;
T2[j][i] = argmax;
}
}
int Z[T];
static int X[T]; // static so the returned pointer remains valid after the function returns
double max = -1, temp;
#pragma omp parallel for
for(int k=0; k<K; ++k)
{
temp = T1[k][T-1];
if(temp > max)
{
max = temp;
Z[T-1] = k;
}
}
X[T-1] = S[Z[T-1]];
for(int i=T-1; i>0; --i)
{
Z[i-1] = T2[Z[i]][i];
X[i-1] = S[Z[i-1]];
}
return X;
}
int* viterbiNoOmp(int *S, double *initp, int *Y, double A[][K], double B[][K]); // same as viterbi(), minus the #pragma omp directives
int main()
{
clock_t tStart;
int *path;
generateValues();
double sumOmp = 0;
for(int i=0;i<6;i++)
{
double start = omp_get_wtime();
path = viterbi(states, init_prob, observation, transition, emission);
double end = omp_get_wtime();
sumOmp += end - start;
}
double sumNoOmp = 0;
for(int i=0;i<6;i++)
{
tStart = clock();
path = viterbiNoOmp(states, init_prob, observation, transition, emission);
sumNoOmp += ((double)(clock() - tStart)/CLOCKS_PER_SEC);
}
for (int i=0;i<T;i++)
{
printf("%d, ", path[i]);
}
printf("\n\ntime With Omp: %f\ntime without Omp: %f", sumOmp/6, sumNoOmp/6);
return 0;
}
What am I doing wrong?
First of all, you used the omp_get_wtime() function for your first measurement and clock() for your second.
Use omp_get_wtime() for both and you'll see a little improvement.
Secondly instead of using sumNoOmp += ((double)(clock() - tStart)/CLOCKS_PER_SEC);
just use sumNoOmp = ((double)(clock() - tStart)/CLOCKS_PER_SEC);
Now let's move on to your code:
Trying to parallelize nested loops is a little tricky.
Try using #pragma omp parallel for only on the outer loop and watch the result; a sketch of what that could look like follows below.
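A hedged sketch of what that could look like in viterbi(), assuming "the outer loop" here means the loop over the states j (the loop over the time steps i carries a dependency through T1[.][i-1] and has to stay sequential). max, temp and argmax are declared inside the parallel loop so every iteration gets its own copies. With K = 39 the per-time-step threading overhead may still dominate, so it is worth timing against the serial version.
for (int i = 1; i < T; ++i)            // time steps: must stay sequential
{
    #pragma omp parallel for            // states within a time step are independent
    for (int j = 0; j < K; ++j)
    {
        double max = -1;                // private by construction: declared inside the loop
        int argmax = 0;
        for (int k = 0; k < K; ++k)     // inner loop left serial
        {
            double temp = T1[k][i-1] * A[k][j] * B[k][Y[i-1]];
            if (temp > max)
            {
                max = temp;
                argmax = k;
            }
        }
        T1[j][i] = max;
        T2[j][i] = argmax;
    }
}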