I'm trying to perform matrix multiplication using openMP as follows and I compile it using GCC : g++ -std=gnu++11 -g -Wall -fopenmp -o parallel_not_opt parallel_not_opt.cpp
But when I try to run it by using parallel_not_opt.exe, it aborts giving the typical Windows error parallel_not_opt.exe has stopped working...
Am I missing something?
#include "includes/stdafx.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
# include <omp.h>
#include <chrono>
#include <fstream>
#include <algorithm>
#include <immintrin.h>
#include <cfloat>
#include <limits>
#include <math.h>
using namespace std::chrono;
using namespace std;
//populate matrix with random values.
double** generateMatrix(int n){
double max = DBL_MAX;
double min = DBL_MIN;
double** matA = new double*[n];
for (int i = 0; i < n; i++) {
matA[i] = new double[n];
for (int j = 0; j < n; j++) {
double randVal = (double)rand() / RAND_MAX;
matA[i][j] = min + randVal * (max - min);
}
}
return matA;
}
//generate matrix for final result.
double** generateMatrixFinal(int n){
double** matA = new double*[n];
for (int i = 0; i < n; i++) {
matA[i] = new double[n];
for (int j = 0; j < n; j++) {
matA[i][j] = 0;
}
}
return matA;
}
//matrix multiplication - parallel
double matrixMultiplicationParallel(double** A, double** B, double** C, int n){
int i, j, k;
clock_t begin_time = clock();
# pragma omp parallel shared ( A,B,C,n ) // private ( i, j, k )
{
# pragma omp for
for (i = 0; i < n; i++) {
// cout<< i << ", " ;
for (j = 0; j < n; j++) {
for (k = 0; k < n; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
double t = float(clock() - begin_time);
return t;
}
int _tmain(int argc, _TCHAR* argv[])
{
ofstream out("output.txt", ios::out | ios::app);
out << "--------------STARTED--------------" << "\n";
int start = 200, stop = 2000, step = 200;
for (int n = start; n <= stop; n += step)
{
srand(time(NULL));
cout << "\nn: " << n << "\n";
double t1 = 0;
int my_size = n;
double **A = generateMatrix(my_size);
double **B = generateMatrix(my_size);
double **C = generateMatrixFinal(my_size);
double single_sample_time = matrixMultiplicationParallel(A, B, C, n);
t1 += single_sample_time;
for (int i = 0; i < n; i++) {
delete[] A[i];
delete[] B[i];
delete[] C[i];
}
delete[] A;
delete[] B;
delete[] C;
}
out << "-----------FINISHED-----------------" << "\n";
out.close();
return 0;
}
The private ( i, j, k ) declaration is not optional. Add it back, otherwise the inner loop variables j and k are shared, which completely messes up the inner loops.
It is better to declare variables as locally as possible. That makes reasoning about OpenMP code much easier:
clock_t begin_time = clock();
# pragma omp parallel
{
# pragma omp for
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
for (int k = 0; k < n; k++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
return float(clock() - begin_time);
In that case, A,B,C will be shared by default - coming from the outside, and j,k are private because they are declared within the parallel scope. The loop variable of a parallel for is always implicitly private.
Related
I am confused as to why I am getting a segmentation fault when creating and firing off threads here. It happens in the t[j] = thread(getMax, A); line and I am very confused as to why this is happening. threadMax[] is the max of each thread. getMax() returns the maximum value of an array.
#include <iostream>
#include <stdlib.h>
#include <sys/time.h>
#include <thread>
#define size 10
#define numThreads 10
using namespace std;
int threadMax[numThreads] = {0};
int num =0;
void getMax(double *A){
num += 1;
double max = A[0];
double min = A[0];
for (int i =0; i<size; i++){
if(A[i] > max){
max = A[i];
}
}
threadMax[num] = max;
}
int main(){
int max =0;
double S,E;
double *A = new double[size];
srand(time(NULL));
thread t[numThreads];
//Assign random values to array
for(int i = 0; i<size; i++){
A[i] = (double(rand()%100));
}
//create Threads
for(int j =0; j <numThreads; j++){
cout << A[j] << " " << j << "\n";
t[j] = thread(getMax, A);
}
//join threads
for(int i =0; i< numThreads; i++){
t[i].join();
}
//Find Max from all threads
for(int i =0; i < numThreads; i++){
if(threadMax[i] > max){
max = threadMax[i];
}
}
cout <<max;
delete [] A;
return 0;
}
The behavior of this code is undefined:
void getMax(double *A){
num += 1;
double max = A[0];
double min = A[0];
for (int i =0; i<size; i++){
if(A[i] > max){
max = A[i];
}
}
threadMax[num] = max;
}
The num += 1 can allow multiple threads to attempt to modify num at the same time. Worse, when num is read in the threadMax[num] = max;, threads may see values of num modified by other threads while they were running.
You need to assign each thread a number in some safe way.
Here are three ways it can fail:
Two threads do num += 1; at exactly the same time and as a result, num only increments once.
Every thread does num += 1; before any thread does threadMax[num] = max;. All threads overwrite the same entry in the array. (Which, actually, is out of bounds!)
The code crashes because its behavior is undefined.
As others have stated, your num variable is not protected from race conditions inside of getMax(), which can lead to it being corrupted, thus causing getMax() to access the threadMax[] array out of bounds.
You can avoid that by simply getting rid of that num variable altogether and pass the array index as an input parameter to std::thread instead.
Try something more like this:
#include <iostream>
#include <vector>
#include <array>
#include <thread>
#include <algorithm>
#include <cstdlib>
#include <ctime>
using namespace std;
const size_t size = 10;
const size_t numThreads = 10;
double threadMax[numThreads] = {};
void getMax(int idx, double *A){
threadMax[idx] = *max_element(A, A + size);
}
int main(){
srand(time(nullptr));
vector<double> A(size);
array<thread, numThreads> t;
//Assign random values to array
generate_n(A.begin(), size, [](){ return double(rand() % 100); });
/* or:
for(double &d : A){
d = double(rand() % 100);
}
*/
//create Threads
for(int j = 0; j < numThreads; ++j){
cout << A[j] << " " << j << "\n";
t[j] = thread(getMax, j, A.data());
}
//join threads
for(thread &thd : t){
thd.join();
}
//Find Max from all threads
double max = *max_element(threadMax.begin(), threadMax.end());
cout << max;
return 0;
}
I put in my program two loops - one fills 2D array with one value N0, and next loop is generating random number. And my program does not work when I have loop for array. I get "Unhandled exception... (parameters: 0x00000003)". But without first loop it works correctly. Thanks for help.
#include <iostream>
#include <vector>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/uniform_int_distribution.hpp>
using namespace std;
const double czas = 1E9;
int main()
{
//Declaration of variables
const int k = 20;
const int L = 30;
double N0 = 7.9E9;
int t,i,j, WalkerAmount;
double excitation, ExcitationAmount;
double slab[30][600];
//Random number generator
boost::random::mt19937 gen;
boost::random::uniform_int_distribution<> numberGenerator(1, 4);
//Filling slab with excitation
for (int i = 0; i <= L; i++)
{
for (int j = 0; j <= k*L; j++) { slab[i][j] = N0; }
}
//Time loop
for (t = 0; t < czas; t++) {
WalkerAmount = 0;
ExcitationAmount = 0;
for (int i = 0; i <= L; i++)
{
for (int j = 0; j <= k*L; j++)
{
int r = numberGenerator(gen);
cout << r << endl;
}
}
}
system("pause");
return 0;
}
Arrays in C++ are indexed from 0 to n-1 where n is the capacity of the array. Then, the code following code is wrong.
int main()
{
//Declaration of variables
const int k = 20;
const int L = 30;
double N0 = 7.9E9;
double slab[30][600];
// [...]
for (int i = 0; i <= L; i++)
{
for (int j = 0; j <= k*L; j++) { slab[i][j] = N0; }
}
}
When you initialize your array, you always go one steep too far. As you consider the case where i == L and j == k*L you reach an area in the memory that out of your array.
The loop you want to execute is
for (int i = 0; i < L; i++)
for (int j = 0; j < k*L; j++)
// Initialize
I just started OpenMP and am familiar with the basics.
The loop tiled function works faster when executed serially but when i try to use OpenMP, it becomes slower by a huge margin.
The loop tiling is what I've studied from the wikipedia page on loop tiling and also from a video on MIT-OCW.
I'd like to know how to implement this properly and why my code is not working.
#include <iostream>
#include <stdio.h>
#include <omp.h>
#include <time.h>
using namespace std;
#define SIZE 10000
#define N 100
#define S 25
int n = N;
int s = S;
double a[SIZE],b[SIZE],c[SIZE];
// Initializing the matrices
void mat_init(double *a, double *b, int n)
{
for(int i=0; i<n; i++)
for(int j=0; j<n; j++)
a[i*n + j] = 1;
for(int i=0; i<n; i++)
for(int j=0; j<n; j++)
b[i*n + j] = 2;
}
void mat_multi(double *a, double *b, double *c, int n)
{
//double start_t = omp_get_wtime();
clock_t start=clock();
int i,j,k;
#pragma omp num_threads(5) for private(i,j,k)
for( i=0; i<n; i++)
for( j=0; j<n; j++)
for( k=0; k<n; k++)
c[i*n+j]+=a[i*n+k]*b[k*n+j];
start = clock() - start;
double ms = ((double)(start)*1000)/CLOCKS_PER_SEC;
//double stop_t = omp_get_wtime();
cout<<"Naive multiplication requires "<<ms<<"ms"<<endl;
}
void mat_print(double *a, int n)
{
cout<<endl<<endl<<endl<<"************************************************************"<<endl;
for (int i = 0; i < n; ++i)
{
cout<<endl;
for (int j = 0; j < n; ++j)
{
/* code */
cout<<a[i*n+j]<<" ";
}
}
cout<<endl<<endl<<endl<<"************************************************************"<<endl;
}
void mat_empty(double *a, int n)
{
for (int i = 0; i < n; ++i)
{
/* code */
for (int j = 0; j < n; ++j)
{
/* code */
c[i*n+j]=0;
}
}
}
void tiled_mat_multiply(double *a, double *b, double *c, int n)
{
int i,j,k,i1,j1,k1,tid;
clock_t start = clock();
double start_t,stop_t;
omp_set_nested(1);
#pragma omp parallel shared(a,b,c) private(i1,j1,k1,i,j,k,tid) num_threads(omp_get_num_procs())
{
/*
tid = omp_get_thread_num();
if(tid == 0)
{
cout<<"Master thread encountered "<<endl<<endl;
start_t = omp_get_wtime();
}
*/
#pragma omp for
for ( i1 = 0; i1 < n; i1+=s)
for ( j1 = 0; j1 < n; j1+=s)
for ( k1 = 0; k1 < n; k1+=s)
for( i=i1; i <i1+s && i<n; i++)
for ( j=j1; j< j1+s && j<n; ++j)
for( k=k1; k< k1+s && k<n; ++k)
c[i*n+j]+=a[i*n+k]*b[k*n+j];
}
/*if(tid==0)
{
stop_t = omp_get_wtime();
}*/
start = clock() - start;
double ms = ((double)(start)*1000)/CLOCKS_PER_SEC;
cout<<"Tiled matrix multiplication requires "<<ms<<"ms"<<endl;
}
int main()
{
mat_init(a,b,n);
mat_multi(a,b,c,n);
mat_print(c,n);
mat_empty(c,n);
tiled_mat_multiply(a,b,c,n);
mat_print(c,n);
return 0;
}
I am new to OpenMP and am trying desperately to learn. I have tried to write an example code in C++ in visual studio 2012 to implement matrix multiplication. I was hoping someone with OpenMP experience could take a look at this code and help me to obtain the ultimate speed / parallelization for this:
#include <iostream>
#include <stdlib.h>
#include <omp.h>
#include <random>
using namespace std;
#define NUM_THREADS 4
// Program Variables
double** A;
double** B;
double** C;
double t_Start;
double t_Stop;
int Am;
int An;
int Bm;
int Bn;
// Program Functions
void Get_Matrix();
void Mat_Mult_Serial();
void Mat_Mult_Parallel();
void Delete_Matrix();
int main()
{
printf("Matrix Multiplication Program\n\n");
cout << "Enter Size of Matrix A: ";
cin >> Am >> An;
cout << "Enter Size of Matrix B: ";
cin >> Bm >> Bn;
Get_Matrix();
Mat_Mult_Serial();
Mat_Mult_Parallel();
system("pause");
return 0;
}
void Get_Matrix()
{
A = new double*[Am];
B = new double*[Bm];
C = new double*[Am];
for ( int i=0; i<Am; i++ ){A[i] = new double[An];}
for ( int i=0; i<Bm; i++ ){B[i] = new double[Bn];}
for ( int i=0; i<Am; i++ ){C[i] = new double[Bn]; }
for ( int i=0; i<Am; i++ )
{
for ( int j=0; j<An; j++ )
{
A[i][j]= rand() % 10 + 1;
}
}
for ( int i=0; i<Bm; i++ )
{
for ( int j=0; j<Bn; j++ )
{
B[i][j]= rand() % 10 + 1;
}
}
printf("Matrix Create Complete.\n");
}
void Mat_Mult_Serial()
{
t_Start = omp_get_wtime();
for ( int i=0; i<Am; i++ )
{
for ( int j=0; j<Bn; j++ )
{
double temp = 0;
for ( int k=0; k<An; k++ )
{
temp += A[i][k]*B[k][j];
}
}
}
t_Stop = omp_get_wtime() - t_Start;
cout << "Serial Multiplication Time: " << t_Stop << " seconds" << endl;
}
void Mat_Mult_Parallel()
{
int i,j,k;
t_Start = omp_get_wtime();
omp_set_num_threads(NUM_THREADS);
#pragma omp parallel for private(i,j,k) schedule(dynamic)
for ( i=0; i<Am; i++ )
{
for ( j=0; j<Bn; j++ )
{
//double temp = 0;
for ( k=0; k<An; k++ )
{
C[i][j] += A[i][k]*B[k][j];
}
}
}
t_Stop = omp_get_wtime() - t_Start;
cout << "Parallel Multiplication Time: " << t_Stop << " seconds." << endl;
}
void Delete_Matrix()
{
for ( int i=0; i<Am; i++ ){ delete [] A[i]; }
for ( int i=0; i<Bm; i++ ){ delete [] B[i]; }
for ( int i=0; i<Am; i++ ){ delete [] C[i]; }
delete [] A;
delete [] B;
delete [] B;
}
My examples are based on a matrix class I created for parallel teaching. If you are interested feel free to contact me.
There are several ways to speedup your matrix multiplication :
Storage
Use a one dimension array in row major order for accessing the element in a faster way.
You can access to A(i,j) with A[i * An + j]
Use loop invariant optimization
for (int i = 0; i < m; i ++)
for (int j = 0; j < p; j ++)
{
Scalar sigma = C(i, j);
for (int k = 0; k < n; k ++)
sigma += (*this)(i, k) * B(k, j);
C(i, j) = sigma;
}
This prevents to recompute C(i,j) several times in the most inner loop.
Change loop order "for k <-> for i"
for (int i = 0; i < m; i ++)
for (int k = 0; k < n; k ++)
{
Aik = (*this)(i, k);
for (int j = 0; j < p; j ++)
C(i, j) += Aik * B(k, j);
}
This allows to play with spatial data locality
Use loop blocking/tiling
for(int ii = 0; ii < m; ii += block_size)
for(int jj = 0; jj < p; jj += block_size)
for(int kk = 0; kk < n; kk += block_size)
#pragma omp parallel for // I think this is the best place for this case
for(int i = ii; i < ii + block_size; i ++)
for(int k = kk; k < kk + block_size; k ++)
{
Scalar Aik = (*this)(i, k);
for(int j = jj; j < jj + block_size; j ++)
C(i, j) += Aik * B(k, j);
}
This can use better temporal data locality. The optimal block_size depends on your architecture and matrix size.
Then parallelize !
Generally, the #pragma omp parallel for should be done a the most outter loop. Maybe using two parallel loop at the two first outter loops can give better results. It depends then on the architecture you use, the matrix size... You have to test !
Since the matrix multiplication has a static workload I would use a static schedule.
Moar optimization !
You can do loop nest optimization.
You can vectorize your code.
You can take look at how BLAS do it.
I am very new to OpenMP and this code is very instructive. However I found an error in the serial version that gives it an unfair speed advantage over the parallel version.
Instead of writing C[i][j] += A[i][k]*B[k][j]; as you do in the parallel version, you have written temp += A[i][k]*B[k][j]; in the serial version. This is much faster (but doesn't help you compute the C matrix). So you're not comparing apples to apples, which makes the parallel code seem slower by comparison. When I fixed this line and ran it on my laptop (which allows 2 threads), the parallel version was almost twice as fast. Not bad!
In function Determininant i keep getting an error....
#include <iostream>
#include <fstream>
#include <cmath>
using namespace std;
const int maxsize = 10;
ifstream fin;
ofstream fout;
void transpose (double omatrix[][maxsize],double tmatrix [][maxsize], int array_size)
{
for(int i = 0; i < array_size; i++)
{
for(int j = 0; j < array_size; j++)
{
tmatrix[j][i] = omatrix[i][j];
}
}
}
void sub (double omatrix[][maxsize], double smatrix[][maxsize], int array_size, int i, int j)
{
int counter1 = 0, counter2 = 0;
for (int a = 0; a < array_size; a++)
{
if (a != i)
{
for (int b = 0; b < array_size; b++)
{
if (b != j)
{
smatrix[counter1][counter2] = omatrix[a][b];
counter2++;
}
}
counter1++;
}
}
}
double Determininant(double original_matrix[][maxsize], int array_size)
{
if(array_size == 1)
return original_matrix[0][0];
else if(array_size == 2)
return original_matrix[0][0] * original_matrix[1][1] - original_matrix[0][1] * original_matrix[1][0];
double d = 0.0;
double temp[maxsize][maxsize];
for(int i = 0; i < array_size; i++)
{
sub (original_matrix,temp,array_size, 0, i);
d += pow(-1.0,i) * original_matrix[0][i] * d(temp, array_size - 1);
}
return d;
}
void print (const double m[][maxsize], int array_size)
{
for(int i = 0; i < array_size; i++)
{
for(int j = 0; j < array_size; j++)
{
fout << m[i][j] << " ";
}
fout << "\n";
}
fout << "\n";
}
The error is error: 'd' cannot be used as a function.
Any ideas on whats wrong?
Exactly what the error message says: d is a double and you can't call it as a function. Perhaps you meant Determinant(temp, array_size - 1)?
It's the end of this line:
d += pow(-1.0,i) * original_matrix[0][i] * d(temp, array_size - 1);
As casablanca said, do you mean this?:
d += pow(-1.0,i) * original_matrix[0][i] * Determinant(temp, array_size - 1);
d += pow(-1.0,i) * original_matrix[0][i] * d(temp, array_size - 1);
The clause d(temp, array_size - 1) is telling C++ to call function d.