C++ openMP parallel matrix multiplication - c++

What is wrong with my OpenMP code? It always uses only 1 thread and takes the same time as the non-parallel version.
// Intended to compute the matrix product C = A * B in parallel with OpenMP.
// The result is allocated with `new` in this function and returned as a raw
// pointer; nothing in this function frees it.
// NOTE(review): this is the code exactly as posted in the question; the notes
// below mark the problems visible in it.
template <typename T>
Matrix<T>* Matrix<T>::OMPMultiplication(Matrix<T>* A, Matrix<T>* B){
// The inner dimensions must agree for the product to be defined.
// NOTE(review): a bare `throw;` with no exception currently being handled
// calls std::terminate instead of raising a catchable error.
if(A->ySize != B->xSize)
throw;
Matrix<T>* C = new Matrix<T>(A->xSize, B->ySize);
// All three indices are declared outside the parallel construct, so each one is
// shared between threads unless it is named in a private clause.
sizeType i, j, k;
// NOTE(review): `element` is never used in this function.
T element;
// NOTE(review): the combined `parallel for` construct has to be followed
// directly by a for loop, not by a `{` block; a plain `parallel` is the
// construct that opens a region able to contain the `omp for` below.
#pragma omp parallel for private(i, j)
{
// NOTE(review): `k` is missing from the private list. Also, the i loop below
// has no braces, so only the cout statement is its body; the j loop is a
// separate statement that runs after the i loop has finished.
#pragma omp for private(i, j)
for( i = 0; i < A->xSize; i++ )
cout<<"There are "<<omp_get_num_threads()<<" threads"<<endl;
for(j = 0; j < B->ySize; j++){
// Clear the cell, then accumulate the dot product of row i of A and column j of B.
C->matrix[i][j] = 0;
for(k = 0; k < A->ySize; k++){
C->matrix[i][j] += A->matrix[i][k] * B->matrix[k][j];
}
}
}
return C;
}

First of all, you are missing some {} for the i loop and the variable k needs to be made private to each iteration of the i loop. However, I think you have also mixed up how the parallel and for pragmas are combined. To successfully parallelize a for loop, you need to put it inside a parallel pragma and then inside a for pragma. To do this you could either change your code into
// Open one parallel region; every thread gets its own copy of i, j and k.
#pragma omp parallel private(i, j, k)
{
// Split the iterations of the i loop among the threads of the enclosing region.
#pragma omp for
for( i = 0; i < A->xSize; i++ ) {
// Diagnostic output: printed once per i iteration by whichever thread runs it.
cout<<"There are "<<omp_get_num_threads()<<" threads"<<endl;
for(j = 0; j < B->ySize; j++) {
// Each (i, j) cell is written only by the thread that owns iteration i.
C->matrix[i][j] = 0;
for(k = 0; k < A->ySize; k++){
C->matrix[i][j] += A->matrix[i][k] * B->matrix[k][j];
}
}
}
}
or make use of the combined parallel for notation
// Combined form: creates the parallel region and distributes the i loop in one
// directive. The directive must be followed immediately by the for loop.
#pragma omp parallel for private(i, j, k)
for( i = 0; i < A->xSize; i++ ) {
// `...` is a placeholder for the loop body; it is not valid code as written.
...
}
Also, make sure you are telling OpenMP to use more than 1 thread here. This can be done both with omp_set_num_threads(<number of threads here>) and by setting environment variables like OMP_NUM_THREADS.
Hope you get it parallelized. :)

I get slightly faster result with my 4 cores using this code:
// Accumulates c[i] += sum over j of b[j] * a[j][i], with the i loop split
// across 4 threads. c[i] must already hold 0 when this runs.
omp_set_num_threads(4);
// Only the index of the loop attached to the directive (i) is made private
// automatically. j is declared outside the region, so without private(j) all
// threads would share a single inner counter and race on it.
#pragma omp parallel for private(j)
for (i = 0; i < n; i++) {
    for (j = 0; j < n; j++) {
        c[i] += b[j] * a[j][i];
    }
}
Full program
#include <stdio.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>
/*
 * Times two OpenMP variants of the product c[i] = sum_j b[j] * a[j][i]
 * on an n x n matrix filled with 10 and a vector filled with 5.
 *
 * Method 1 uses separate parallel regions with explicit private/shared
 * clauses and a dynamic schedule; method 2 uses one combined `parallel for`
 * with 4 threads. Both timed sections include the printf calls they contain.
 *
 * NOTE(review): clock() reports processor time, which on many platforms is
 * summed over all threads; omp_get_wtime() would report wall-clock time.
 *
 * Returns 0.
 */
int main() {
    int i, j, n, a[719][719], b[719], c[719];
    clock_t start = clock();
    n = 100; //Max 719

    /* ---- Method 1 ---- */
    printf("Matrix A\n");
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            a[i][j] = 10;
            printf("%d ", a[i][j]);
        }
        printf("\n");
    }
    printf("\nMatrix B\n");
    #pragma omp parallel private(i) shared(b)
    {
        #pragma omp for
        for (i = 0; i < n; ++i) {
            b[i] = 5;
            printf("%d\n", b[i]);
        }
    }
    printf("\nA * B\n");
    /* Clear the accumulator before summing into it. */
    #pragma omp parallel private(i) shared(c)
    {
        #pragma omp for
        for (i = 0; i < n; ++i) {
            c[i] = 0;
        }
    }
    #pragma omp parallel private(i,j) shared(n,a,b,c)
    {
        #pragma omp for schedule(dynamic)
        for (i = 0; i < n; ++i) {
            for (j = 0; j < n; ++j) {
                c[i] += b[j] * a[j][i];
            }
        }
    }
    /* Printed from a parallel loop, so the output order is not guaranteed. */
    #pragma omp parallel private(i) shared(c)
    {
        #pragma omp for
        for (i = 0; i < n; ++i) {
            printf("%d\n", c[i]);
        }
    }
    clock_t stop = clock();
    double elapsed = (double) (stop - start) / CLOCKS_PER_SEC;
    printf("\nTime elapsed: %.5f\n", elapsed);

    /* ---- Method 2 ---- */
    start = clock();
    printf("Matrix A\n");
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            a[i][j] = 10;
            printf("%d ", a[i][j]);
        }
        printf("\n");
    }
    printf("\nMatrix B\n");
    #pragma omp parallel private(i) shared(b)
    {
        #pragma omp for
        for (i = 0; i < n; ++i) {
            b[i] = 5;
            printf("%d\n", b[i]);
        }
    }
    printf("\nA * B\n");
    /* c still holds method 1's result; reset it so method 2 computes the
       product from scratch instead of adding onto the previous values. */
    for (i = 0; i < n; ++i) {
        c[i] = 0;
    }
    omp_set_num_threads(4);
    /* j is declared at function scope, so it has to be listed as private;
       otherwise every thread shares one inner-loop counter (a data race). */
    #pragma omp parallel for private(j)
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            c[i] += b[j] * a[j][i];
        }
    }
    stop = clock();
    elapsed = (double) (stop - start) / CLOCKS_PER_SEC;
    printf("\nTime elapsed: %.5f\n", elapsed);
    return 0;
}
First method takes
Time elapsed: 0.03442
Second method
Time elapsed: 0.02630

Related

Is there a way to parallelize a lower triangle matrix solver?

The goal is to add OpenMP parallelization to for (i = 0; i < n; i++) for the lower triangle solver for the form Ax=b. Expected result is exactly same as the result when there is NO parallelization added to for (i = 0; i < n; i++).
vector<vector<double>> represents a 2-D matrix. makeMatrix(int m, int n) initializes a vector<vector<double>> of all zeroes of size mxn.
Two of the most prominent attempts have been left in the code as comments.
// Solves the lower-triangular system A x = b by forward substitution and
// returns x as an n-by-1 matrix created by makeMatrix.
// A and b are taken by value, so the caller's matrices are not modified.
// NOTE(review): code as posted in the question; see the note on the inner loop.
vector<vector<double>> lowerTriangleSolver(vector<vector<double>> A, vector<vector<double>> b)
{
vector<vector<double>> x = makeMatrix(A.size(), 1);
int i, j;
int n = A.size();
// Running sum of A[i][j] * x[j][0] for the current row. It is declared outside
// the loops, so it is shared by the threads of the parallel loop below.
double s;
// Two earlier attempts at parallelizing the outer loop, left disabled:
//#pragma omp parallel for reduction(+: s)
//#pragma omp parallel for shared(s)
for (i = 0; i < n; i++)
{
s = 0.0;
// NOTE(review): the inner sum is parallelized without a reduction clause, so
// all threads update the shared `s` concurrently (a data race).
#pragma omp parallel for
for (j = 0; j < i; j++)
{
s = s + A[i][j] * x[j][0];
}
// Row i can be solved only after x[0..i-1] are known.
x[i][0] = (b[i][0] - s) / A[i][i];
}
return x;
}
You could try to assign the outer loop iterations among threads, instead of the inner loop. In this way, you increase the granularity of the parallel tasks and avoid the reduction of the 's' variable.
// Suggested variant: distribute the outer (row) loop and keep `s` local to
// each iteration.
// NOTE(review): iteration i reads x[j][0] for every j < i, and those values are
// written by earlier iterations of this same loop, so the iterations are not
// independent and this loop cannot be run in parallel as written.
#pragma omp parallel for
for (int i = 0; i < n; i++){
double s = 0.0;
for (int j = 0; j < i; j++){
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}
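Unfortunately, that is not possible because there is a dependency between s = s + A[i][j] * x[j][0]; and x[i][0] = (b[i][0] - s) / A[i][i];. More precisely, iteration i reads x[j][0] for every j < i, and each of those values is written by an earlier iteration of the same outer loop, so the outer iterations must run in order.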
So you can try two approaches:
// Rows are processed in order; only the dot product within a row is parallel.
for (int i = 0; i < n; i++){
double s = 0.0;
// reduction(+:s): each thread sums into its own copy of s, and the copies are
// added into the original s when the loop finishes.
#pragma omp parallel for reduction(+:s)
for (int j = 0; j < i; j++){
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}
or using SIMD :
// Rows are processed in order on a single thread; the inner sum is marked for
// vectorization instead of being split across threads.
for (int i = 0; i < n; i++){
double s = 0.0;
// simd reduction(+:s): tells the compiler that the accumulation into s may be
// carried out in SIMD lanes and combined afterwards.
#pragma omp simd reduction(+:s)
for (int j = 0; j < i; j++){
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}

How to add OpenMp to triple nested for-loop

The goal is to add as much OpenMP as possible to the following Cholesky factor function to increase parallelization. So far, I only have one #pragma omp parallel for implemented correctly. vector<vector<double>> represents a 2-D matrix. I've already tried adding #pragma omp parallel for to the loops
for (int i = 0; i < n; ++i), for (int k = 0; k < i; ++k), and for (int j = 0; j < k; ++j), but the parallelization goes wrong. makeMatrix(n, n) initializes a vector<vector<double>> of all zeroes of size nxn.
// Computes a lower-triangular factor `result` of `input` row by row
// (Cholesky factorization) and returns it.
// `input` is taken by value; `result` starts as an n-by-n matrix from makeMatrix.
// NOTE(review): code as posted in the question.
vector<vector<double>> cholesky_factor(vector<vector<double>> input)
{
int n = input.size();
vector<vector<double>> result = makeMatrix(n, n);
for (int i = 0; i < n; ++i)
{
// Off-diagonal entries of row i; each one uses entries of row i and row k
// that were computed earlier.
for (int k = 0; k < i; ++k)
{
double value = input[i][k];
for (int j = 0; j < k; ++j)
{
value -= result[i][j] * result[k][j];
}
result[i][k] = value / result[k][k];
}
// Diagonal entry: input[i][i] minus the squared entries of row i, then sqrt.
double value = input[i][i];
// NOTE(review): `value` is declared outside this parallel loop and there is no
// reduction clause, so the threads update it concurrently (a data race).
#pragma omp parallel for
for (int j = 0; j < i; ++j)
{
value -= result[i][j] * result[i][j];
}
result[i][i] = std::sqrt(value);
}
return result;
}
I don't think you can parallelize much more than this with this algorithm, as the ith iteration of the outer loop depends on the results of the i - 1th iteration and the kth iteration of the inner loop depends on the results of the k - 1th iteration.
// Row-by-row Cholesky factorization: returns the lower-triangular factor of
// `input`. Rows and the columns within a row are processed in order, because
// each entry needs entries computed before it; only the innermost dot
// products are split across threads.
vector<vector<double>> cholesky_factor(vector<vector<double>> input)
{
    int dim = input.size();
    vector<vector<double>> lower = makeMatrix(dim, dim);

    for (int row = 0; row < dim; ++row)
    {
        // Entries to the left of the diagonal.
        for (int col = 0; col < row; ++col)
        {
            double acc = input[row][col];
            // With reduction(+: acc) every thread starts from a private zero,
            // subtracts its share of the products, and the private copies are
            // added to the original acc when the threads join.
            // (reduction(-: acc) behaves the same way.)
            #pragma omp parallel for reduction(+: acc)
            for (int t = 0; t < col; ++t)
            {
                acc -= lower[row][t] * lower[col][t];
            }
            lower[row][col] = acc / lower[col][col];
        }

        // Diagonal entry: subtract the squares of the row's entries, then take
        // the square root.
        double diag = input[row][row];
        #pragma omp parallel for reduction(+: diag)
        for (int t = 0; t < row; ++t)
        {
            diag -= lower[row][t] * lower[row][t];
        }
        lower[row][row] = std::sqrt(diag);
    }
    return lower;
}

openmp two consecutive loops, problem with reduction clause

There are two consecutive loops, and the second loop has a reduction clause.
// Code as posted in the question: two work-shared loops that are meant to run
// inside a single parallel region.
// NOTE(review): `opm` is a misspelling of `omp`; as written this line is an
// unknown pragma rather than an OpenMP directive.
#pragma opm parallel
{
#pragma omp for
for (size_t i = 0; i < N; ++i)
{
}
// NOTE(review): a work-shared `omp for` without `nowait` already ends with an
// implicit barrier, so this explicit barrier is redundant.
#pragma omp barrier
// NOTE(review): `sumj` is reset at the top of every i iteration and used only
// within that iteration, so it is a per-iteration temporary and not a value
// to be reduced across the i loop.
#pragma omp for reduction(+ \
: sumj)
for (size_t i = 0; i < N; ++i)
{
sumj = 0.0;
for (size_t j = 0; j < adjList[i].size(); ++j)
{
sumj += 0;
}
// NOTE(review): before C++23 the comma inside [] is the comma operator, so
// `Jac[i, i]` is `Jac[i]` — confirm that a 2-D access was intended.
Jac[i, i] = sumj;
}
}
To reduce the thread-creation overhead, I want to keep the threads alive and reuse them in the second loop, but I get the following error:
lib.cpp:131:17: error: reduction variable ‘sumj’ is private in outer context
#pragma omp for reduction(+ \
^~~
how to fix that?
I'm not sure what you are trying to do, but it seems that something like this would do what you expect:
// One parallel region shared by both loops, so the threads are created once.
#pragma omp parallel
{
#pragma omp for
for (size_t i = 0; i < N; ++i)
{
}
// NOTE(review): the `omp for` above already ends with an implicit barrier.
#pragma omp barrier
// No reduction clause: the accumulator is declared inside the loop body, so
// every iteration (and therefore every thread) has its own copy.
#pragma omp for
for (size_t i = 0; i < N; ++i)
{
double sumj = 0.0;
for (size_t j = 0; j < adjList[i].size(); ++j)
{
sumj += 0;
}
// NOTE(review): before C++23 the comma inside [] is the comma operator, so
// `Jac[i, i]` is `Jac[i]` — confirm that a 2-D access was intended.
Jac[i, i] = sumj;
}
}
Reduce would be useful in the case of an "omp for" in the interior loop.

Pragma omp parallel + ntl

I'm trying to use openmp to run the below code, but I get Segmentation Fault
// Allocates Prs as an array of m matrices with new[] and, for each i, sets
// Prs[i] to n-by-n dimensions and adds L2[i][j] * (L1_trans * Lst[i] * L1)
// into it for every j in [0, m).
// `m` and `n` are not declared in this function — presumably globals; TODO confirm.
// Nothing in this function frees Prs.
// NOTE(review): code as posted in the question (reported to crash with a
// segmentation fault).
void modKeyGenPrs(mat_GF2E *&Prs, mat_GF2E Lst[], mat_GF2E L1, mat_GF2E L2) {
Prs = new mat_GF2E[m];
mat_GF2E L1_trans = transpose(L1);
// NOTE(review): NTL presumably keeps the current GF2E modulus in per-thread
// state, in which case each worker thread would need the context restored
// before doing GF2E arithmetic — confirm against the NTL documentation.
#pragma omp parallel shared(L1_trans,L2,Lst,Prs,L1)
{
#pragma omp for
for (int i = 0; i < m; i++) {
(Prs[i]).SetDims(n, n);
for (int j = 0; j < m; j++) {
// The product L1_trans * Lst[i] * L1 does not depend on j, yet it is
// recomputed on every pass of this inner loop.
Prs[i] = Prs[i] + (L2[i][j] * (L1_trans * (Lst[i]) * L1));
}
}
}
}

openmp: increasing the number of threads decreases performance

I have this C++ code.
The loop goes through the matrix, finds the minimum element in each row, and subtracts it from every element of that row.
The variable myr is the sum of all the row minimums.
I am trying to parallelize the for loop:
// Code as posted in the question: subtract each row's minimum from that row of
// `temp` and accumulate the minimums in `myr`.
int min = 0;
int myr = 0;
int temp[SIZE][SIZE];
// Initialized to 0 here; presumably set to the real dimension by the elided
// initialization below — TODO confirm.
int size = 0;
...//some initialization
omp_set_num_threads(1);
start_time = omp_get_wtime();
// firstprivate gives each thread its own copy of min and size, initialized
// from the values above; reduction(+:myr) sums the per-thread totals.
#ifdef _OPENMP
#pragma omp parallel for firstprivate(min, size) reduction(+:myr)
#endif
for(int i = 0; i < size; i++){
// NOTE(review): `min` is an int, and INFINITY is a floating-point value that
// int cannot represent; that conversion is undefined behaviour.
min = INFINITY;
for(int j = 0; j < size; j++){
if (temp[i][j] < min)
min = temp[i][j];
}
myr+=min;
for(int j = 0; j < size; j++)
temp[i][j]-=min;
}
end_time = omp_get_wtime();
If I set omp_set_num_threads(2), this part of the code runs slower.
My processor has 2 cores.
Why does the code run slower with 2 threads?
There must be some aliasing or something going on. Make things simpler for OpenMP:
// Subtract each row's minimum from that row of `temp` and sum the minimums
// into `myr`. The bound is copied into a const local and the row pointer and
// minimum are loop-local, so nothing but `myr` is shared state.
int const size0 = size;
#ifdef _OPENMP
#pragma omp parallel for reduction(+:myr)
#endif
for(int i = 0; i < size0; i++){
    int * tmp = temp[i];
    // Seed the minimum from the row itself. The loop body only runs when
    // size0 >= 1, so tmp[0] is a valid element. This replaces
    // `int min = INFINITY`, an out-of-range float-to-int conversion
    // (undefined behaviour).
    int min = tmp[0];
    for(int j = 1; j < size0; j++){
        if (tmp[j] < min)
            min = tmp[j];
    }
    for(int j = 0; j < size0; j++)
        tmp[j]-=min;
    myr+=min;
}
That is, have most of the variables local and const if you may.
The parallel part can be reinterpreted as follows (I have used the snippet by @jens-gustedt, but in my experience it didn't make much difference):
// Hand-written equivalent of reduction(+:myr): each thread accumulates into its
// own myr_private and adds it to the shared myr once, inside a critical section.
#pragma omp parallel private(myr_private) shared(myr)
{
    myr_private = 0;
    #pragma omp for
    for(int i = 0; i < size; i++){
        int * tmp = temp[i];
        // Seed the minimum from the row itself. The loop body only runs when
        // size >= 1, so tmp[0] is a valid element. This replaces
        // `int min = INFINITY`, an out-of-range float-to-int conversion
        // (undefined behaviour).
        int min = tmp[0];
        for(int j = 1; j < size; j++){
            if (tmp[j] < min)
                min = tmp[j];
        }
        for(int j = 0; j < size; j++)
            tmp[j]-=min;
        myr_private+=min;
    }
    // One thread at a time folds its partial sum into the shared total.
    #pragma omp critical
    {
        myr+=myr_private;
    }
}
(This interpretation is straight from http://www.openmp.org/mp-documents/OpenMP3.1.pdf Example A.36.2c).
If the number of threads is n > 1, there is overhead when #pragma omp parallel creates the additional thread(s), and again at the critical section, which the threads have to pass through one at a time.
I have experimented with different matrix sizes and in my limited tests two threads are considerably faster with sizes above 1000, and start lagging behind with sizes below 500.