I just started OpenMP and am familiar with the basics.
The loop tiled function works faster when executed serially but when i try to use OpenMP, it becomes slower by a huge margin.
The loop tiling is what I've studied from the wikipedia page on loop tiling and also from a video on MIT-OCW.
I'd like to know how to implement this properly and why my code is not working.
#include <iostream>
#include <stdio.h>
#include <omp.h>
#include <time.h>
using namespace std;
#define SIZE 10000
#define N 100
#define S 25
int n = N;
int s = S;
double a[SIZE],b[SIZE],c[SIZE];
// Initializing the matrices
void mat_init(double *a, double *b, int n)
{
for(int i=0; i<n; i++)
for(int j=0; j<n; j++)
a[i*n + j] = 1;
for(int i=0; i<n; i++)
for(int j=0; j<n; j++)
b[i*n + j] = 2;
}
void mat_multi(double *a, double *b, double *c, int n)
{
//double start_t = omp_get_wtime();
clock_t start=clock();
int i,j,k;
#pragma omp num_threads(5) for private(i,j,k)
for( i=0; i<n; i++)
for( j=0; j<n; j++)
for( k=0; k<n; k++)
c[i*n+j]+=a[i*n+k]*b[k*n+j];
start = clock() - start;
double ms = ((double)(start)*1000)/CLOCKS_PER_SEC;
//double stop_t = omp_get_wtime();
cout<<"Naive multiplication requires "<<ms<<"ms"<<endl;
}
void mat_print(double *a, int n)
{
cout<<endl<<endl<<endl<<"************************************************************"<<endl;
for (int i = 0; i < n; ++i)
{
cout<<endl;
for (int j = 0; j < n; ++j)
{
/* code */
cout<<a[i*n+j]<<" ";
}
}
cout<<endl<<endl<<endl<<"************************************************************"<<endl;
}
void mat_empty(double *a, int n)
{
for (int i = 0; i < n; ++i)
{
/* code */
for (int j = 0; j < n; ++j)
{
/* code */
c[i*n+j]=0;
}
}
}
void tiled_mat_multiply(double *a, double *b, double *c, int n)
{
int i,j,k,i1,j1,k1,tid;
clock_t start = clock();
double start_t,stop_t;
omp_set_nested(1);
#pragma omp parallel shared(a,b,c) private(i1,j1,k1,i,j,k,tid) num_threads(omp_get_num_procs())
{
/*
tid = omp_get_thread_num();
if(tid == 0)
{
cout<<"Master thread encountered "<<endl<<endl;
start_t = omp_get_wtime();
}
*/
#pragma omp for
for ( i1 = 0; i1 < n; i1+=s)
for ( j1 = 0; j1 < n; j1+=s)
for ( k1 = 0; k1 < n; k1+=s)
for( i=i1; i <i1+s && i<n; i++)
for ( j=j1; j< j1+s && j<n; ++j)
for( k=k1; k< k1+s && k<n; ++k)
c[i*n+j]+=a[i*n+k]*b[k*n+j];
}
/*if(tid==0)
{
stop_t = omp_get_wtime();
}*/
start = clock() - start;
double ms = ((double)(start)*1000)/CLOCKS_PER_SEC;
cout<<"Tiled matrix multiplication requires "<<ms<<"ms"<<endl;
}
int main()
{
mat_init(a,b,n);
mat_multi(a,b,c,n);
mat_print(c,n);
mat_empty(c,n);
tiled_mat_multiply(a,b,c,n);
mat_print(c,n);
return 0;
}
Related
I'm trying to perform matrix multiplication using openMP as follows and I compile it using GCC : g++ -std=gnu++11 -g -Wall -fopenmp -o parallel_not_opt parallel_not_opt.cpp
But when I try to run it by using parallel_not_opt.exe, it aborts giving the typical Windows error parallel_not_opt.exe has stopped working...
Am I missing something?
#include "includes/stdafx.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
# include <omp.h>
#include <chrono>
#include <fstream>
#include <algorithm>
#include <immintrin.h>
#include <cfloat>
#include <limits>
#include <math.h>
using namespace std::chrono;
using namespace std;
//populate matrix with random values.
double** generateMatrix(int n){
double max = DBL_MAX;
double min = DBL_MIN;
double** matA = new double*[n];
for (int i = 0; i < n; i++) {
matA[i] = new double[n];
for (int j = 0; j < n; j++) {
double randVal = (double)rand() / RAND_MAX;
matA[i][j] = min + randVal * (max - min);
}
}
return matA;
}
//generate matrix for final result.
double** generateMatrixFinal(int n){
double** matA = new double*[n];
for (int i = 0; i < n; i++) {
matA[i] = new double[n];
for (int j = 0; j < n; j++) {
matA[i][j] = 0;
}
}
return matA;
}
//matrix multiplication - parallel
double matrixMultiplicationParallel(double** A, double** B, double** C, int n){
int i, j, k;
clock_t begin_time = clock();
# pragma omp parallel shared ( A,B,C,n ) // private ( i, j, k )
{
# pragma omp for
for (i = 0; i < n; i++) {
// cout<< i << ", " ;
for (j = 0; j < n; j++) {
for (k = 0; k < n; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
double t = float(clock() - begin_time);
return t;
}
int _tmain(int argc, _TCHAR* argv[])
{
ofstream out("output.txt", ios::out | ios::app);
out << "--------------STARTED--------------" << "\n";
int start = 200, stop = 2000, step = 200;
for (int n = start; n <= stop; n += step)
{
srand(time(NULL));
cout << "\nn: " << n << "\n";
double t1 = 0;
int my_size = n;
double **A = generateMatrix(my_size);
double **B = generateMatrix(my_size);
double **C = generateMatrixFinal(my_size);
double single_sample_time = matrixMultiplicationParallel(A, B, C, n);
t1 += single_sample_time;
for (int i = 0; i < n; i++) {
delete[] A[i];
delete[] B[i];
delete[] C[i];
}
delete[] A;
delete[] B;
delete[] C;
}
out << "-----------FINISHED-----------------" << "\n";
out.close();
return 0;
}
The private ( i, j, k ) declaration is not optional. Add it back, otherwise the inner loop variables j and k are shared, which completely messes up the inner loops.
It is better to declare variables as locally as possible. That makes reasoning about OpenMP code much easier:
clock_t begin_time = clock();
# pragma omp parallel
{
# pragma omp for
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
for (int k = 0; k < n; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
return float(clock() - begin_time);
In that case, A,B,C will be shared by default - coming from the outside, and j,k are private because they are declared within the parallel scope. The loop variable of a parallel for is always implicitly private.
I am new to OpenMP and am trying desperately to learn. I have tried to write an example code in C++ in visual studio 2012 to implement matrix multiplication. I was hoping someone with OpenMP experience could take a look at this code and help me to obtain the ultimate speed / parallelization for this:
#include <iostream>
#include <stdlib.h>
#include <omp.h>
#include <random>
using namespace std;
#define NUM_THREADS 4
// Program Variables
double** A;
double** B;
double** C;
double t_Start;
double t_Stop;
int Am;
int An;
int Bm;
int Bn;
// Program Functions
void Get_Matrix();
void Mat_Mult_Serial();
void Mat_Mult_Parallel();
void Delete_Matrix();
int main()
{
printf("Matrix Multiplication Program\n\n");
cout << "Enter Size of Matrix A: ";
cin >> Am >> An;
cout << "Enter Size of Matrix B: ";
cin >> Bm >> Bn;
Get_Matrix();
Mat_Mult_Serial();
Mat_Mult_Parallel();
system("pause");
return 0;
}
void Get_Matrix()
{
A = new double*[Am];
B = new double*[Bm];
C = new double*[Am];
for ( int i=0; i<Am; i++ ){A[i] = new double[An];}
for ( int i=0; i<Bm; i++ ){B[i] = new double[Bn];}
for ( int i=0; i<Am; i++ ){C[i] = new double[Bn]; }
for ( int i=0; i<Am; i++ )
{
for ( int j=0; j<An; j++ )
{
A[i][j]= rand() % 10 + 1;
}
}
for ( int i=0; i<Bm; i++ )
{
for ( int j=0; j<Bn; j++ )
{
B[i][j]= rand() % 10 + 1;
}
}
printf("Matrix Create Complete.\n");
}
void Mat_Mult_Serial()
{
t_Start = omp_get_wtime();
for ( int i=0; i<Am; i++ )
{
for ( int j=0; j<Bn; j++ )
{
double temp = 0;
for ( int k=0; k<An; k++ )
{
temp += A[i][k]*B[k][j];
}
}
}
t_Stop = omp_get_wtime() - t_Start;
cout << "Serial Multiplication Time: " << t_Stop << " seconds" << endl;
}
void Mat_Mult_Parallel()
{
int i,j,k;
t_Start = omp_get_wtime();
omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for private(i,j,k) schedule(dynamic)
for ( i=0; i<Am; i++ )
{
for ( j=0; j<Bn; j++ )
{
//double temp = 0;
for ( k=0; k<An; k++ )
{
C[i][j] += A[i][k]*B[k][j];
}
}
}
t_Stop = omp_get_wtime() - t_Start;
cout << "Parallel Multiplication Time: " << t_Stop << " seconds." << endl;
}
void Delete_Matrix()
{
for ( int i=0; i<Am; i++ ){ delete [] A[i]; }
for ( int i=0; i<Bm; i++ ){ delete [] B[i]; }
for ( int i=0; i<Am; i++ ){ delete [] C[i]; }
delete [] A;
delete [] B;
delete [] B;
}
My examples are based on a matrix class I created for parallel teaching. If you are interested feel free to contact me.
There are several ways to speedup your matrix multiplication :
Storage
Use a one dimension array in row major order for accessing the element in a faster way.
You can access to A(i,j) with A[i * An + j]
Use loop invariant optimization
for (int i = 0; i < m; i ++)
for (int j = 0; j < p; j ++)
{
Scalar sigma = C(i, j);
for (int k = 0; k < n; k ++)
sigma += (*this)(i, k) * B(k, j);
C(i, j) = sigma;
}
This prevents to recompute C(i,j) several times in the most inner loop.
Change loop order "for k <-> for i"
for (int i = 0; i < m; i ++)
for (int k = 0; k < n; k ++)
{
Aik = (*this)(i, k);
for (int j = 0; j < p; j ++)
C(i, j) += Aik * B(k, j);
}
This allows to play with spatial data locality
Use loop blocking/tiling
for(int ii = 0; ii < m; ii += block_size)
for(int jj = 0; jj < p; jj += block_size)
for(int kk = 0; kk < n; kk += block_size)
#pragma omp parallel for // I think this is the best place for this case
for(int i = ii; i < ii + block_size; i ++)
for(int k = kk; k < kk + block_size; k ++)
{
Scalar Aik = (*this)(i, k);
for(int j = jj; j < jj + block_size; j ++)
C(i, j) += Aik * B(k, j);
}
This can use better temporal data locality. The optimal block_size depends on your architecture and matrix size.
Then parallelize !
Generally, the #pragma omp parallel for should be done a the most outter loop. Maybe using two parallel loop at the two first outter loops can give better results. It depends then on the architecture you use, the matrix size... You have to test !
Since the matrix multiplication has a static workload I would use a static schedule.
Moar optimization !
You can do loop nest optimization.
You can vectorize your code.
You can take look at how BLAS do it.
I am very new to OpenMP and this code is very instructive. However I found an error in the serial version that gives it an unfair speed advantage over the parallel version.
Instead of writing C[i][j] += A[i][k]*B[k][j]; as you do in the parallel version, you have written temp += A[i][k]*B[k][j]; in the serial version. This is much faster (but doesn't help you compute the C matrix). So you're not comparing apples to apples, which makes the parallel code seem slower by comparison. When I fixed this line and ran it on my laptop (which allows 2 threads), the parallel version was almost twice as fast. Not bad!
I am trying to implement the Viterbi algorithm with the help of OpenMP. So far, my test shows that the execution time of the parallel program is approximately 4 times the execution time of the sequential program. Here is my code:
#include <omp.h>
#include <stdio.h>
#include <time.h>
#define K 39 // num states
#define T 1500 // num obs sequence
int states[K];
double transition[K][K];
double emission[K][K];
double init_prob[K];
int observation[T];
using namespace std;
void generateValues()
{
srand(time(NULL));
for(int i=0; i<T; i++)
{
observation[i] = rand() % K;
}
for(int i=0; i<K; i++)
{
states[i] = i;
init_prob[i] = (double)rand() / (double)RAND_MAX;
for(int j=0;j<K;j++)
{
transition[i][j] = (double)rand() / (double)RAND_MAX;
srand(time(NULL));
emission[i][j] = (double)rand() / (double)RAND_MAX;
}
}
}
int* viterbi(int *S, double *initp, int *Y, double A[][K], double B[][K])
{
double T1[K][T];
int T2[K][T];
#pragma omp parallel for
for(int i=0; i<K; ++i)
{
T1[i][0] = initp[i];
T2[i][0] = 0;
}
for(int i=1; i<T; ++i)
{
double max, temp;
int argmax;
#pragma omp parallel for private (max, temp, argmax)
for(int j=0; j<K; ++j)
{
max = -1;
#pragma omp parallel for
for(int k=0; k<K; ++k)
{
temp = T1[k][i-1] * A[k][j] * B[k][Y[i-1]];
if(temp > max)
{
max = temp;
argmax = k;
}
}
T1[j][i] = max;
T2[j][i] = argmax;
}
}
int Z[T];
int X[T];
double max = -1, temp;
#pragma omp parallel for
for(int k=0; k<K; ++k)
{
temp = T1[k][T-1];
if(temp > max)
{
max = temp;
Z[T-1] = k;
}
}
X[T-1] = S[Z[T-1]];
for(int i=T-1; i>0; --i)
{
Z[i-1] = T2[Z[i]][i];
X[i-1] = S[Z[i-1]];
}
return X;
}
int* viterbiNoOmp(int *S, double *initp, int *Y, double A[][K], double B[][K]) // the same as before, minus the #pragma omp
int main()
{
clock_t tStart;
int *path;
generateValues();
double sumOmp = 0;
for(int i=0;i<6;i++)
{
double start = omp_get_wtime();
path = viterbi(states, init_prob, observation, transition, emission);
double end = omp_get_wtime();
sumOmp += end - start;
}
double sumNoOmp = 0;
for(int i=0;i<6;i++)
{
tStart = clock();
path = viterbiNoOmp(states, init_prob, observation, transition, emission);
sumNoOmp += ((double)(clock() - tStart)/CLOCKS_PER_SEC);
}
for (int i=0;i<T;i++)
{
printf("%d, ", path[i]);
}
printf("\n\ntime With Omp: %f\ntime without Omp: %f", sumOmp/6, sumNoOmp/6);
return 0;
}
What am I doing wrong?
First of all, you used for your first measurement the omp_get_wtime() function, and for your second, you used clock().
Use omp_get_wtime() for both and you'll see a little improvement
Secondly instead of using sumNoOmp += ((double)(clock() - tStart)/CLOCKS_PER_SEC);
just use sumNoOmp = ((double)(clock() - tStart)/CLOCKS_PER_SEC);
Now let's move on to your code:
trying to parallel nested loops is a little tricky
try using #pragma omp parallel for only for the outer loop and watch for the result
I am aware that there allready are similar questions here but no answer really helped me.
This is my problem:
I have given an array with 512x512 pixels in it. Each pixel has a value like 165.88009. ( I have to create a heatmap in GnuPlot later)
Now I want to "smoothen" it by creating the average of a variable block of pixels (like 4-16) and write it into a new 2D array and jump to the next block until it is done.
The size of the array should stay the same. So if I average 4 pixels those 4 pixels get the new value.
I made a function for this but it doesn't work properly.
Calculating the average is not my problem. The problem is that I want to have a variable width of pixels but I don't know how to make my algorithm jump to the next block.
Im not experienced in C++ and maybe I have to do it completely different.
So any help or inspiration is greatly appreciated :)
here is my code:
#include <iostream>
#include <fstream>
#include <string>
#include <iomanip>
using namespace std;
int i, j, m, n, k;
void Average(double **Data, int width) // width gets defined and initiated in main
{
double sum;
double avg;
fstream Output;
Output.open( "GemittelteWerte.dat", ios::out);
double** IV_Matrix = new double* [m];
for (int i=0; i<m; i++)
{
IV_Matrix[i] = new double [n];
}
for (int i=0; i<m; i++)
{
for (int j=0; j<n; j++)
{
IV_Matrix[i][j] = 1.0;
}
}
// Here start all my troubles:
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j+=width)
{
sum = 0.0;
k=j;
for( k; k<(j+width); k++)
{
sum+=Data[i][k];
}
avg=(sum/width);
for (int k; k<(j+width); k++)
{
IV_Matrix[i][k] = avg;
}
}
}
for(int i=0; i<n; i++)
{
for(int j=0; j<n; j++)
{
Output<<setprecision(10)<<IV_Matrix[i][j]<<"\t";
}
Output<<"\n";
}
Output.close();
}
Is this block a 2D block (4 = 2x2, 16 = 4x4)? You simply want to do a 2D convolution? Then better use odd widths with 3x3, 5x5, ... kernels.
// int x, y are the dimensions of your image
double get (double **img, int i, int j) // zero padding for areas outside image
{
if (i<0 || i>=x || j<0 || j>=y)
return 0;
else
return img[i][j];
}
void conv (double **img, double **result, int width2) // kernel is (2*width2+1)^2
{
double sum;
for (int i=0; i<x; i++)
for (int j=0; j<y; j++)
{
sum = 0;
for (int ii=-width2; ii<=width2; ii++)
for (int jj=-width2; jj<=width2; jj++)
sum += get(img,i+ii,j+jj) / ((2*width2+1)*(2*width2+1));
result[i][j] = sum;
}
}
This smoothes img to result. Its however the slow unseparated solution. For small images and kernels no problem.
Edit: then easier:
// x, y are the dimensions of your image (x rows, y colums)
void avg (double **img, double **result, int width) // width must be >= 1 and
{ // should be a divider of y
double sum;
for (int i=0; i<x; i++) // process all rows
{
for (int j=0; j<y; j+=width) // jump in block width through a row
{
sum = 0.0;
for (int w=0; w<width; w++) // calculate average of a block
{
sum += img[i][j+w] / width;
}
for (int b=0; b<width; b++) // write average in each pixel inside block
{
result[i][j+b]= sum;
}
}
}
//di means diagonal index
for(int di = 0; di < n/width; ++di) {
int sum = 0.0;
//we sum the values
for(int i = di*width; i < (di+1)*width; ++i)
{
for(int j = di*width; j < (di+1)*width; ++j)
{
sum += Data[i][j];
}
}
//Divide by the number of values
sum /= width*width;
//Spread the results
for(int i = di*width; i < (di+1)*width; ++i)
{
for(int j = di*width; j < (di+1)*width; ++j)
{
IV_Matrix[i][j];
}
}
}
//n might not be a multiple of width
if(n % width != 0) {
//we sum the values
for(int i = (n/width)*width; i < n; ++i)
{
for(int j = di*width; j < (di+1)*width; ++j)
{
sum += Data[i][j];
}
}
//Divide by the number of values
sum /= width*width;
//Spread the results
for(int i = (n/width)*width; i < n; ++i)
{
for(int j = (n/width)*width; j < n; ++j)
IV_Matrix[i][j];
}
}
}
what is wrong with my openMP code? it always takes only 1 thread and works the same time as non-parallel version
template <typename T>
Matrix<T>* Matrix<T>::OMPMultiplication(Matrix<T>* A, Matrix<T>* B){
if(A->ySize != B->xSize)
throw;
Matrix<T>* C = new Matrix<T>(A->xSize, B->ySize);
sizeType i, j, k;
T element;
#pragma omp parallel for private(i, j)
{
#pragma omp for private(i, j)
for( i = 0; i < A->xSize; i++ )
cout<<"There are "<<omp_get_num_threads()<<" threads"<<endl;
for(j = 0; j < B->ySize; j++){
C->matrix[i][j] = 0;
for(k = 0; k < A->ySize; k++){
C->matrix[i][j] += A->matrix[i][k] * B->matrix[k][j];
}
}
}
return C;
}
First of all, you are missing some {} for the i loop and the variable k needs to be made private to each iteration of the i loop. However, I think you have also mixed up how the parallel and for pragmas are combined. To successfully parallelize a for loop, you need to put it inside a parallel pragma and then inside a for pragma. To do this you could either change your code into
#pragma omp parallel private(i, j, k)
{
#pragma omp for
for( i = 0; i < A->xSize; i++ ) {
cout<<"There are "<<omp_get_num_threads()<<" threads"<<endl;
for(j = 0; j < B->ySize; j++) {
C->matrix[i][j] = 0;
for(k = 0; k < A->ySize; k++){
C->matrix[i][j] += A->matrix[i][k] * B->matrix[k][j];
}
}
}
}
or make use of the combined parallel for notation
#pragma omp parallel for private(i, j, k)
for( i = 0; i < A->xSize; i++ ) {
...
}
Also, make sure you are telling OpenMP to use more than 1 thread here. This can be done both with omp_set_num_threads(<number of threads here>) and by setting environment variables like OMP_NUM_THREADS.
Hope you get it parallelized. :)
I get slightly faster result with my 4 cores using this code:
omp_set_num_threads(4);
#pragma omp parallel for
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
c[i] += b[j] * a[j][i];
}
}
Full program
#include <stdio.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>
int main() {
int i, j, n, a[719][719], b[719], c[719];
clock_t start = clock();
n = 100; //Max 719
printf("Matrix A\n");
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
a[i][j] = 10;
printf("%d ", a[i][j]);
}
printf("\n");
}
printf("\nMatrix B\n");
#pragma omp parallel private(i) shared(b)
{
#pragma omp for
for (i = 0; i < n; ++i) {
b[i] = 5;
printf("%d\n", b[i]);
}
}
printf("\nA * B\n");
#pragma omp parallel private(i) shared(c)
{
#pragma omp for
for (i = 0; i < n; ++i) {
c[i] = 0;
}
}
#pragma omp parallel private(i,j) shared(n,a,b,c)
{
#pragma omp for schedule(dynamic)
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
c[i] += b[j] * a[j][i];
}
}
}
#pragma omp parallel private(i) shared(c)
{
#pragma omp for
for (i = 0; i < n; ++i) {
printf("%d\n", c[i]);
}
}
clock_t stop = clock();
double elapsed = (double) (stop - start) / CLOCKS_PER_SEC;
printf("\nTime elapsed: %.5f\n", elapsed);
start = clock();
printf("Matrix A\n");
for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) {
a[i][j] = 10;
printf("%d ", a[i][j]);
}
printf("\n");
}
printf("\nMatrix B\n");
#pragma omp parallel private(i) shared(b)
{
#pragma omp for
for (i = 0; i < n; ++i) {
b[i] = 5;
printf("%d\n", b[i]);
}
}
printf("\nA * B\n");
omp_set_num_threads(4);
#pragma omp parallel for
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
c[i] += b[j] * a[j][i];
}
}
stop = clock();
elapsed = (double) (stop - start) / CLOCKS_PER_SEC;
printf("\nTime elapsed: %.5f\n", elapsed);
return 0;
}
First method takes
Time elapsed: 0.03442
Second method
Time elapsed: 0.02630