Viterbi algorithm with OpenMP - c++

I am trying to implement the Viterbi algorithm with the help of OpenMP. So far, my test shows that the execution time of the parallel program is approximately 4 times the execution time of the sequential program. Here is my code:
#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define K 39   // number of states
#define T 1500 // length of the observation sequence

int states[K];
double transition[K][K];
double emission[K][K];
double init_prob[K];
int observation[T];

using namespace std;

void generateValues()
{
    srand(time(NULL));
    for(int i=0; i<T; i++)
    {
        observation[i] = rand() % K;
    }
    for(int i=0; i<K; i++)
    {
        states[i] = i;
        init_prob[i] = (double)rand() / (double)RAND_MAX;
        for(int j=0; j<K; j++)
        {
            transition[i][j] = (double)rand() / (double)RAND_MAX;
            srand(time(NULL));
            emission[i][j] = (double)rand() / (double)RAND_MAX;
        }
    }
}
int* viterbi(int *S, double *initp, int *Y, double A[][K], double B[][K])
{
    double T1[K][T];
    int T2[K][T];
    #pragma omp parallel for
    for(int i=0; i<K; ++i)
    {
        T1[i][0] = initp[i];
        T2[i][0] = 0;
    }
    for(int i=1; i<T; ++i)
    {
        double max, temp;
        int argmax;
        #pragma omp parallel for private(max, temp, argmax)
        for(int j=0; j<K; ++j)
        {
            max = -1;
            #pragma omp parallel for
            for(int k=0; k<K; ++k)
            {
                temp = T1[k][i-1] * A[k][j] * B[k][Y[i-1]];
                if(temp > max)
                {
                    max = temp;
                    argmax = k;
                }
            }
            T1[j][i] = max;
            T2[j][i] = argmax;
        }
    }
    int Z[T];
    int X[T];
    double max = -1, temp;
    #pragma omp parallel for
    for(int k=0; k<K; ++k)
    {
        temp = T1[k][T-1];
        if(temp > max)
        {
            max = temp;
            Z[T-1] = k;
        }
    }
    X[T-1] = S[Z[T-1]];
    for(int i=T-1; i>0; --i)
    {
        Z[i-1] = T2[Z[i]][i];
        X[i-1] = S[Z[i-1]];
    }
    return X;
}
int* viterbiNoOmp(int *S, double *initp, int *Y, double A[][K], double B[][K]); // the same as viterbi(), minus the #pragma omp directives
int main()
{
    clock_t tStart;
    int *path;
    generateValues();
    double sumOmp = 0;
    for(int i=0; i<6; i++)
    {
        double start = omp_get_wtime();
        path = viterbi(states, init_prob, observation, transition, emission);
        double end = omp_get_wtime();
        sumOmp += end - start;
    }
    double sumNoOmp = 0;
    for(int i=0; i<6; i++)
    {
        tStart = clock();
        path = viterbiNoOmp(states, init_prob, observation, transition, emission);
        sumNoOmp += ((double)(clock() - tStart)/CLOCKS_PER_SEC);
    }
    for(int i=0; i<T; i++)
    {
        printf("%d, ", path[i]);
    }
    printf("\n\ntime With Omp: %f\ntime without Omp: %f", sumOmp/6, sumNoOmp/6);
    return 0;
}
What am I doing wrong?

First of all, you used omp_get_wtime() for your first measurement but clock() for your second. clock() measures CPU time while omp_get_wtime() measures wall-clock time, so the two are not directly comparable. Use omp_get_wtime() for both and you'll see a little improvement.
Secondly, instead of using sumNoOmp += ((double)(clock() - tStart)/CLOCKS_PER_SEC);
just use sumNoOmp = ((double)(clock() - tStart)/CLOCKS_PER_SEC);
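Applied to the question's main(), the second timing loop becomes something like this (a sketch; note that with plain assignment sumNoOmp ends up holding a single run's time, so the final printf would print sumNoOmp rather than sumNoOmp/6):

for(int i=0; i<6; i++)
{
    double start = omp_get_wtime();
    path = viterbiNoOmp(states, init_prob, observation, transition, emission);
    sumNoOmp = omp_get_wtime() - start;
}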
Now let's move on to your code. Parallelizing nested loops is a little tricky; try using #pragma omp parallel for only on the outer loop and check the result.
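In the Viterbi recursion the time loop over i depends on step i-1, so the loop over states j is the outermost loop that can safely be parallelized. A minimal sketch of the recursion with the nested pragma removed (max, temp and argmax become locals, so each thread automatically gets its own copies):

for(int i=1; i<T; ++i)
{
    #pragma omp parallel for
    for(int j=0; j<K; ++j)
    {
        double max = -1, temp;
        int argmax = 0;
        for(int k=0; k<K; ++k)
        {
            temp = T1[k][i-1] * A[k][j] * B[k][Y[i-1]];
            if(temp > max)
            {
                max = temp;
                argmax = k;
            }
        }
        T1[j][i] = max;
        T2[j][i] = argmax;
    }
}

Also note that with K = 39 each parallel region does very little work, so thread startup overhead can still dominate; this is a sketch of the structure, not a guaranteed speedup.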

Related

Speedup when avoiding false sharing problem?

I am trying to add cache-line padding to avoid the false sharing problem, but I can't see a big difference in speedup. With padding it is only 1.2x faster. For testing, I run the code without padding and the code with padding with n = 700 million. Should I get more speedup than 1.2 times? Maybe I have missed something in my padding implementation? I am adding 15 ints of padding because I am assuming that the counters don't have to be allocated at the start of a cache line. Any tips appreciated.
Here is my code:
template <const int k>
void par_countingsort2(int *out, int const *in, const int n) {
    const int paddingAmount = cachelinesize / sizeof(int);
    const int kPadded = k + (paddingAmount - 1);
    printf("\n%d", kPadded);
    int counters[nproc][kPadded] = {}; // all zeros
    #pragma omp parallel
    {
        int *thcounters = counters[omp_get_thread_num()];
        #pragma omp for
        for (int i = 0; i < n; ++i)
            ++thcounters[in[i]];
        #pragma omp single
        {
            int tmp, sum = 0;
            for (int j = 0; j < k; ++j)
                for (int i = 0; i < nproc; ++i) {
                    tmp = counters[i][j];
                    counters[i][j] = sum;
                    sum += tmp;
                }
        }
        #pragma omp for
        for (int i = 0; i < n; ++i)
            out[thcounters[in[i]]++] = in[i];
    }
}
#define k 1000

int main(int argc, char *argv[]) {
    // init input
    int n = argc > 1 && atoi(argv[1]) > 0 ? atoi(argv[1]) : 0;
    int *in = (int*)malloc(sizeof(int) * n);
    int *out = (int*)malloc(sizeof(int) * n);
    for (int i = 0; i < n; ++i)
        in[i] = rand() % k;
    printf("n = %d\n", n);
    // print some parameters
    printf("nproc = %d\n", nproc);
    printf("cachelinesize = %d byte\n", cachelinesize);
    printf("k = %d\n", k);
    double tp2 = omp_get_wtime();
    par_countingsort2<k>(out, in, n);
    tp2 = omp_get_wtime() - tp2;
    printf("par2, elapsed time = %.3f seconds (%.1fx speedup from par1), check passed = %c\n", tp2, tp/tp2, checkreset(out,in,n) ? 'y' : 'n');
    // free mem
    free(in);
    free(out);
    return EXIT_SUCCESS;
}
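For reference, a common alternative to manual index padding is to give each thread's counters their own cache-line-aligned block with alignas. This is only a sketch under assumptions not stated in the question (a 64-byte cache line, 8 threads); it is not taken from the original code:

#include <cstddef>

constexpr int kBuckets = 1000;        // stand-in for the question's k
constexpr std::size_t CacheLine = 64; // assumed cache-line size

struct alignas(CacheLine) PaddedRow {
    int c[kBuckets]; // one thread's counters
};

// sizeof(PaddedRow) is rounded up to a multiple of CacheLine and each
// array element starts on a cache-line boundary, so no two threads'
// counter rows ever share a cache line.
PaddedRow counters[8]; // one row per thread (8 threads assumed)

It may also be worth noting that with k = 1000 each counter row already spans many cache lines, so only the counters near a row boundary can be falsely shared, which may explain why the measured speedup from padding is small.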

OpenMP code is aborted

I'm trying to perform matrix multiplication using OpenMP as follows, compiled with GCC: g++ -std=gnu++11 -g -Wall -fopenmp -o parallel_not_opt parallel_not_opt.cpp
But when I run parallel_not_opt.exe, it aborts with the typical Windows error parallel_not_opt.exe has stopped working...
Am I missing something?
#include "includes/stdafx.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
# include <omp.h>
#include <chrono>
#include <fstream>
#include <algorithm>
#include <immintrin.h>
#include <cfloat>
#include <limits>
#include <math.h>
using namespace std::chrono;
using namespace std;
//populate matrix with random values.
double** generateMatrix(int n){
double max = DBL_MAX;
double min = DBL_MIN;
double** matA = new double*[n];
for (int i = 0; i < n; i++) {
matA[i] = new double[n];
for (int j = 0; j < n; j++) {
double randVal = (double)rand() / RAND_MAX;
matA[i][j] = min + randVal * (max - min);
}
}
return matA;
}
//generate matrix for final result.
double** generateMatrixFinal(int n){
double** matA = new double*[n];
for (int i = 0; i < n; i++) {
matA[i] = new double[n];
for (int j = 0; j < n; j++) {
matA[i][j] = 0;
}
}
return matA;
}
//matrix multiplication - parallel
double matrixMultiplicationParallel(double** A, double** B, double** C, int n){
int i, j, k;
clock_t begin_time = clock();
# pragma omp parallel shared ( A,B,C,n ) // private ( i, j, k )
{
# pragma omp for
for (i = 0; i < n; i++) {
// cout<< i << ", " ;
for (j = 0; j < n; j++) {
for (k = 0; k < n; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
double t = float(clock() - begin_time);
return t;
}
int _tmain(int argc, _TCHAR* argv[])
{
    ofstream out("output.txt", ios::out | ios::app);
    out << "--------------STARTED--------------" << "\n";
    int start = 200, stop = 2000, step = 200;
    for (int n = start; n <= stop; n += step)
    {
        srand(time(NULL));
        cout << "\nn: " << n << "\n";
        double t1 = 0;
        int my_size = n;
        double **A = generateMatrix(my_size);
        double **B = generateMatrix(my_size);
        double **C = generateMatrixFinal(my_size);
        double single_sample_time = matrixMultiplicationParallel(A, B, C, n);
        t1 += single_sample_time;
        for (int i = 0; i < n; i++) {
            delete[] A[i];
            delete[] B[i];
            delete[] C[i];
        }
        delete[] A;
        delete[] B;
        delete[] C;
    }
    out << "-----------FINISHED-----------------" << "\n";
    out.close();
    return 0;
}
The private ( i, j, k ) declaration is not optional. Add it back, otherwise the inner loop variables j and k are shared, which completely messes up the inner loops.
It is better to declare variables as locally as possible. That makes reasoning about OpenMP code much easier:
clock_t begin_time = clock();
#pragma omp parallel
{
    #pragma omp for
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            for (int k = 0; k < n; k++) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
}
return float(clock() - begin_time);
In that case A, B, C and n are shared by default, since they come from outside the parallel region, while j and k are private because they are declared within it. The loop variable of a parallel for is always implicitly private.
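As a side note (an optional style choice, not something this answer requires), adding default(none) to the directive forces every variable's sharing to be declared explicitly, so any variable you forget to classify becomes a compile-time error instead of a silent race:

#pragma omp parallel default(none) shared(A, B, C, n)
{
    #pragma omp for
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            for (int k = 0; k < n; k++)
                C[i][j] += A[i][k] * B[k][j];
}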

Very slow mutex in LLVM/OpenMP

I wrote code to test the performance of OpenMP on Windows (Win7 x64, Core i7 3.4 GHz) and on Mac (10.12.3, Core i7 2.7 GHz).
In Xcode I made a console application with the default compiler settings. I use LLVM 3.7 and OpenMP 5 (in omp.h I found #define KMP_VERSION_MAJOR 5, #define KMP_VERSION_MINOR 0 and KMP_VERSION_BUILD = 20150701, libiomp5) on macOS 10.12.3 (CPU: Core i7 2.7 GHz).
For Windows I use VS2010 SP1. Additionally I set C/C++ -> Optimization -> Optimization = Maximize Speed (/O2) and C/C++ -> Optimization -> Favor Size Or Speed = Favor Fast Code (/Ot).
If I run the application in a single thread, the time difference corresponds (approximately) to the frequency ratio of the processors. But if I run 4 threads, the difference becomes dramatic: the Windows program is ~70 times faster than the Mac program.
#include <cmath>
#include <mutex>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <omp.h>
#include <boost/chrono/chrono.hpp>

static double ActionWithNumber(double number)
{
    double sum = 0.0f;
    for (std::uint32_t i = 0; i < 50; i++)
    {
        double coeff = sqrt(pow(std::abs(number), 0.1));
        double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
        sum += sqrt(res);
    }
    return sum;
}

static double TestOpenMP(void)
{
    const std::uint32_t len = 4000000;
    double *a;
    double *b;
    double *c;
    double sum = 0.0;
    std::mutex _mutex;

    a = new double[len];
    b = new double[len];
    c = new double[len];
    for (std::uint32_t i = 0; i < len; i++)
    {
        c[i] = 0.0;
        a[i] = sin((double)i);
        b[i] = cos((double)i);
    }

    boost::chrono::time_point<boost::chrono::system_clock> start, end;
    start = boost::chrono::system_clock::now();

    double k = 2.0;
    omp_set_num_threads(4);
    #pragma omp parallel for
    for (int i = 0; i < len; i++)
    {
        c[i] = k*a[i] + b[i] + k;
        if (c[i] > 0.0)
        {
            c[i] += ActionWithNumber(c[i]);
        }
        else
        {
            c[i] -= ActionWithNumber(c[i]);
        }
        std::lock_guard<std::mutex> scoped(_mutex);
        sum += c[i];
    }
    end = boost::chrono::system_clock::now();
    boost::chrono::duration<double> elapsed_time = end - start;

    double sum2 = 0.0;
    for (std::uint32_t i = 0; i < len; i++)
    {
        sum2 += c[i];
        c[i] /= sum2;
    }
    if (std::abs(sum - sum2) > 0.01) printf("Incorrect result.\n");

    delete[] a;
    delete[] b;
    delete[] c;
    return elapsed_time.count();
}

int main()
{
    double sum = 0.0;
    const std::uint32_t steps = 5;
    for (std::uint32_t i = 0; i < steps; i++)
    {
        sum += TestOpenMP();
    }
    sum /= (double)steps;
    std::cout << "Elapsed time = " << sum;
    return 0;
}
I specifically use a mutex here to compare the performance of OpenMP on the Mac and on Windows. On Windows the function returns a time of 0.39 seconds; on the Mac it returns 25 seconds, i.e. 70 times slower.
What is the cause of this difference?
First of all, thanks for editing my post (I use a translator to write this text).
In the real application, I update the values in a huge matrix (20000x20000) in random order. Each thread determines the new value and writes it to a particular cell. I create a mutex for each row, since in most cases different threads write to different rows, but apparently when two threads write to the same row there is a long lock. At the moment I can't divide the rows between threads, since the order of writes is determined by the FEM elements.
Simply putting a critical section in there won't do either, as it would block writes to the entire matrix.
I wrote code like in the real application:
typedef unsigned int u32; // the snippet uses u32 without defining it

static double ActionWithNumber(double number)
{
    const unsigned int steps = 5000;
    double sum = 0.0f;
    for (u32 i = 0; i < steps; i++)
    {
        double coeff = sqrt(pow(abs(number), 0.1));
        double res = number*(1.0-coeff)*number*(1.0-coeff) * 3.0;
        sum += sqrt(res);
    }
    sum /= (double)steps;
    return sum;
}

static double RealAppTest(void)
{
    const unsigned int elementsNum = 10000;
    double* matrix;
    unsigned int* elements;
    boost::mutex* mutexes;

    elements = new unsigned int[elementsNum*3];
    matrix = new double[elementsNum*elementsNum];
    mutexes = new boost::mutex[elementsNum];
    for (unsigned int i = 0; i < elementsNum; i++)
        for (unsigned int j = 0; j < elementsNum; j++)
            matrix[i*elementsNum + j] = (double)(rand() % 100);
    for (unsigned int i = 0; i < elementsNum; i++) // build FEM element like Triangle
    {
        elements[3*i]   = rand()%(elementsNum-1);
        elements[3*i+1] = rand()%(elementsNum-1);
        elements[3*i+2] = rand()%(elementsNum-1);
    }

    boost::chrono::time_point<boost::chrono::system_clock> start, end;
    start = boost::chrono::system_clock::now();

    omp_set_num_threads(4);
    #pragma omp parallel for
    for (int i = 0; i < elementsNum; i++)
    {
        unsigned int* elems = &elements[3*i];
        for (unsigned int j = 0; j < 3; j++)
        {
            // in here set mutex for row with index = elems[j];
            boost::lock_guard<boost::mutex> lockup(mutexes[i]);
            double res = 0.0;
            for (unsigned int k = 0; k < 3; k++)
            {
                res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
            }
            for (unsigned int k = 0; k < 3; k++)
            {
                matrix[elems[j]*elementsNum + elems[k]] = res;
            }
        }
    }
    end = boost::chrono::system_clock::now();
    boost::chrono::duration<double> elapsed_time = end - start;

    delete[] elements;
    delete[] matrix;
    delete[] mutexes;
    return elapsed_time.count();
}

int main()
{
    double sum = 0.0;
    const u32 steps = 5;
    for (u32 i = 0; i < steps; i++)
    {
        sum += RealAppTest();
    }
    sum /= (double)steps;
    std::cout << "Elapsed time = " << sum;
    return 0;
}
You're combining two different sets of threading/synchronization primitives: OpenMP, which is built into the compiler and has a runtime system, and a manually created POSIX-style mutex via std::mutex. It's probably not surprising that there are some interoperability hiccups with some compiler/OS combinations.
My guess here is that in the slow case, the OpenMP runtime is going overboard to make sure that there are no interactions between its higher-level ongoing threading tasks and the manual mutex, and that doing so inside a tight loop causes the dramatic slowdown.
For mutex-like behaviour in the OpenMP framework, we can use critical sections:
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
    //...
    // replacing this: std::lock_guard<std::mutex> scoped(_mutex);
    #pragma omp critical
    sum += c[i];
}
or explicit locks:
omp_lock_t sumlock;
omp_init_lock(&sumlock);
#pragma omp parallel for
for (int i = 0; i < len; i++)
{
    //...
    // replacing this: std::lock_guard<std::mutex> scoped(_mutex);
    omp_set_lock(&sumlock);
    sum += c[i];
    omp_unset_lock(&sumlock);
}
omp_destroy_lock(&sumlock);
We get much more reasonable timings:
$ time ./openmp-original
real 1m41.119s
user 1m15.961s
sys 1m53.919s
$ time ./openmp-critical
real 0m16.470s
user 1m2.313s
sys 0m0.599s
$ time ./openmp-locks
real 0m15.819s
user 1m0.820s
sys 0m0.276s
Update: there's no problem with using an array of OpenMP locks in exactly the same way as the mutexes:
omp_lock_t sumlocks[elementsNum];
for (unsigned idx = 0; idx < elementsNum; idx++)
    omp_init_lock(&(sumlocks[idx]));
//...
#pragma omp parallel for
for (int i = 0; i < elementsNum; i++)
{
    unsigned int* elems = &elements[3*i];
    for (unsigned int j = 0; j < 3; j++)
    {
        // in here set mutex for row with index = elems[j];
        double res = 0.0;
        for (unsigned int k = 0; k < 3; k++)
        {
            res += ActionWithNumber(matrix[elems[j]*elementsNum + elems[k]]);
        }
        omp_set_lock(&(sumlocks[i]));
        for (unsigned int k = 0; k < 3; k++)
        {
            matrix[elems[j]*elementsNum + elems[k]] = res;
        }
        omp_unset_lock(&(sumlocks[i]));
    }
}
for (unsigned idx = 0; idx < elementsNum; idx++)
    omp_destroy_lock(&(sumlocks[idx]));

Parallelize program to count integral using OpenMP C++

I'm trying to compute an integral:
#include <iostream>
#include <omp.h>

using namespace std;

double my_exp(double x) {
    double res = 1., term = 1.;
    for(int n=1; n<=1000; n++) {
        term *= x / n;
        res += term;
    }
    return res;
}

double f(double x) {
    return x*my_exp(x);
}

int main() {
    double a=0., b=1., result = 0.;
    int nsteps = 1000000;
    double h = (b - a)/nsteps;
    for(int i=1; i<nsteps; i++) result += f(a + i*h);
    result = (result + .5*(f(a) + f(b)))*h;
    cout.precision(15);
    cout << "Result: " << result << endl;
    return 0;
}
This program computes the integral and prints Result: 1.00000000000035, but the execution takes a long time.
I need to parallelize my program. I think I should add #pragma omp parallel for, but it doesn't work.
Change your main function as follows. A plain #pragma omp parallel for fails here because every thread updates result concurrently (a data race), so each thread needs its own accumulator:
#pragma omp parallel
{
    double localresult = 0.0;
    #pragma omp for
    for(int i=1; i<nsteps; i++)
        localresult += f(a + i*h);
    #pragma omp critical
    {
        result += localresult;
    }
}
result = (result + .5*(f(a) + f(b)))*h;
Edit: a much simpler solution, along the lines of muXXmit2X's suggestion, would be
#pragma omp parallel for reduction(+:result)
for(int i=1; i<nsteps; i++) result += f(a + i*h);
result = (result + .5*(f(a) + f(b)))*h;
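For completeness, a sketch of the whole main() with the reduction clause plus wall-clock timing (the omp_get_wtime() calls are an addition for illustration, not part of the original answer):

int main() {
    double a = 0., b = 1., result = 0.;
    int nsteps = 1000000;
    double h = (b - a)/nsteps;
    double t0 = omp_get_wtime();
    // each thread accumulates into its own private copy of result;
    // the copies are summed automatically when the loop ends
    #pragma omp parallel for reduction(+:result)
    for(int i=1; i<nsteps; i++) result += f(a + i*h);
    result = (result + .5*(f(a) + f(b)))*h;
    cout.precision(15);
    cout << "Result: " << result << " in " << omp_get_wtime() - t0 << " s" << endl;
    return 0;
}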

Loop tiling matrix multiplication using openmp in C++

I just started with OpenMP and am familiar with the basics.
The loop-tiled function runs faster when executed serially, but when I try to use OpenMP it becomes slower by a huge margin.
I studied loop tiling from the Wikipedia page on loop tiling and also from a video on MIT OCW.
I'd like to know how to implement this properly, and why my code is not working.
#include <iostream>
#include <stdio.h>
#include <omp.h>
#include <time.h>

using namespace std;

#define SIZE 10000
#define N 100
#define S 25

int n = N;
int s = S;
double a[SIZE], b[SIZE], c[SIZE];

// Initializing the matrices
void mat_init(double *a, double *b, int n)
{
    for(int i=0; i<n; i++)
        for(int j=0; j<n; j++)
            a[i*n + j] = 1;
    for(int i=0; i<n; i++)
        for(int j=0; j<n; j++)
            b[i*n + j] = 2;
}

void mat_multi(double *a, double *b, double *c, int n)
{
    //double start_t = omp_get_wtime();
    clock_t start = clock();
    int i, j, k;
    #pragma omp num_threads(5) for private(i,j,k)
    for( i=0; i<n; i++)
        for( j=0; j<n; j++)
            for( k=0; k<n; k++)
                c[i*n+j] += a[i*n+k]*b[k*n+j];
    start = clock() - start;
    double ms = ((double)(start)*1000)/CLOCKS_PER_SEC;
    //double stop_t = omp_get_wtime();
    cout<<"Naive multiplication requires "<<ms<<"ms"<<endl;
}

void mat_print(double *a, int n)
{
    cout<<endl<<endl<<endl<<"************************************************************"<<endl;
    for (int i = 0; i < n; ++i)
    {
        cout<<endl;
        for (int j = 0; j < n; ++j)
        {
            cout<<a[i*n+j]<<" ";
        }
    }
    cout<<endl<<endl<<endl<<"************************************************************"<<endl;
}

void mat_empty(double *a, int n)
{
    for (int i = 0; i < n; ++i)
    {
        for (int j = 0; j < n; ++j)
        {
            c[i*n+j] = 0;
        }
    }
}
void tiled_mat_multiply(double *a, double *b, double *c, int n)
{
    int i, j, k, i1, j1, k1, tid;
    clock_t start = clock();
    double start_t, stop_t;
    omp_set_nested(1);
    #pragma omp parallel shared(a,b,c) private(i1,j1,k1,i,j,k,tid) num_threads(omp_get_num_procs())
    {
        /*
        tid = omp_get_thread_num();
        if(tid == 0)
        {
            cout<<"Master thread encountered "<<endl<<endl;
            start_t = omp_get_wtime();
        }
        */
        #pragma omp for
        for ( i1 = 0; i1 < n; i1+=s)
            for ( j1 = 0; j1 < n; j1+=s)
                for ( k1 = 0; k1 < n; k1+=s)
                    for( i=i1; i<i1+s && i<n; i++)
                        for ( j=j1; j<j1+s && j<n; ++j)
                            for( k=k1; k<k1+s && k<n; ++k)
                                c[i*n+j] += a[i*n+k]*b[k*n+j];
    }
    /*if(tid==0)
    {
        stop_t = omp_get_wtime();
    }*/
    start = clock() - start;
    double ms = ((double)(start)*1000)/CLOCKS_PER_SEC;
    cout<<"Tiled matrix multiplication requires "<<ms<<"ms"<<endl;
}
int main()
{
    mat_init(a,b,n);
    mat_multi(a,b,c,n);
    mat_print(c,n);
    mat_empty(c,n);
    tiled_mat_multiply(a,b,c,n);
    mat_print(c,n);
    return 0;
}
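Two things stand out here, illustrated by the sketch below. First, #pragma omp num_threads(5) for private(i,j,k) in mat_multi is not a valid combined directive (it is missing the parallel keyword), so that loop may not be parallelized at all. Second, both functions time with clock(), which on many platforms sums CPU time across all threads and so makes parallel code look slower; omp_get_wtime() measures wall-clock time. A minimal sketch of the tiled kernel with a single well-formed directive on the outer tile loop (my own rewrite under these assumptions, not code from the original post):

void tiled_mat_multiply_omp(double *a, double *b, double *c, int n)
{
    double t0 = omp_get_wtime();
    // parallelize over row tiles: each thread owns distinct i1 ranges,
    // so no two threads ever write to the same c[i*n+j]
    #pragma omp parallel for
    for (int i1 = 0; i1 < n; i1 += s)
        for (int j1 = 0; j1 < n; j1 += s)
            for (int k1 = 0; k1 < n; k1 += s)
                for (int i = i1; i < i1+s && i < n; i++)
                    for (int j = j1; j < j1+s && j < n; ++j)
                        for (int k = k1; k < k1+s && k < n; ++k)
                            c[i*n+j] += a[i*n+k]*b[k*n+j];
    cout << "Tiled (OpenMP) multiplication requires "
         << (omp_get_wtime() - t0)*1000 << "ms" << endl;
}

Note also that with N = 100 the whole multiplication is tiny, so thread startup overhead can easily exceed any gain; larger matrices show the benefit of tiling and threading more clearly.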