I have this C++ code.
Loop goes throgh the matrix, finds the min element in each row and subtracts it from each element of corresponding row.
Variable myr is a summ of all min elements
Trying to parallel for:
int min = 0;
int myr = 0;
int temp[SIZE][SIZE];
int size = 0;
...//some initialization
omp_set_num_threads(1);
start_time = omp_get_wtime();
#ifdef _OPENMP
#pragma omp parallel for firstprivate(min, size) reduction(+:myr)
#endif
for(int i = 0; i < size; i++){
min = INFINITY;
for(int j = 0; j < size; j++){
if (temp[i][j] < min)
min = temp[i][j];
}
myr+=min;
for(int j = 0; j < size; j++)
temp[i][j]-=min;
}
end_time = omp_get_wtime();
if I set omp_set_num_threads(2); this part of code starts working slower.
My proc has 2 cores
Why code works slower with 2 threads?
There must be some aliasing or something going on. Make things simpler for OpenMP:
int const size0 = size;
#ifdef _OPENMP
#pragma omp parallel for reduction(+:myr)
#endif
for(int i = 0; i < size0; i++){
int min = INFINITY;
int * tmp = temp[i];
for(int j = 0; j < size0; j++){
if (tmp[j] < min)
min = tmp[j];
}
for(int j = 0; j < size0; j++)
tmp[j]-=min;
myr+=min;
}
That is, have most of the variables local and const if you may.
The parallel part can be reinterpreted as follows (I have used the snippet by #jens-gustedt, but to my experience it didn't make much difference):
#pragma omp parallel private(myr_private) shared(myr)
{
myr_private = 0;
#pragma omp for
for(int i = 0; i < size; i++){
int min = INFINITY;
int * tmp = temp[i];
for(int j = 0; j < size; j++){
if (tmp[j] < min)
min = tmp[j];
}
for(int j = 0; j < size; j++)
tmp[j]-=min;
myr_private+=min;
}
#pragma omp critical
{
myr+=myr_private;
}
}
(This interpretation is straight from http://www.openmp.org/mp-documents/OpenMP3.1.pdf Example A.36.2c).
If number of threads is n>1, there is overhead when #pragma omp parallel creates additional thread(s) and then in critical section, which all of the threads should wait for.
I have experimented with different matrix sizes and in my limited tests two threads are considerably faster with sizes above 1000, and start lagging behind with sizes below 500.
Related
The goal is to add OpenMP parallelization to for (i = 0; i < n; i++) for the lower triangle solver for the form Ax=b. Expected result is exactly same as the result when there is NO parallelization added to for (i = 0; i < n; i++).
vector<vector<double>> represents a 2-D matrix. makeMatrix(int m, int n) initializes a vector<vector<double>> of all zeroes of size mxn.
Two of the most prominent tries have been left in comments.
vector<vector<double>> lowerTriangleSolver(vector<vector<double>> A, vector<vector<double>> b)
{
vector<vector<double>> x = makeMatrix(A.size(), 1);
int i, j;
int n = A.size();
double s;
//#pragma omp parallel for reduction(+: s)
//#pragma omp parallel for shared(s)
for (i = 0; i < n; i++)
{
s = 0.0;
#pragma omp parallel for
for (j = 0; j < i; j++)
{
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}
return x;
}
You could try to assign the outer loop iterations among threads, instead of the inner loop. In this way, you increase the granularity of the parallel tasks and avoid the reduction of the 's' variable.
#pragma omp parallel for
for (int i = 0; i < n; i++){
double s = 0.0;
for (int j = 0; j < i; j++){
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}
Unfortunately, that is not possible because there is a dependency between s = s + A[i][j] * x[j][0]; and x[i][0] = (b[i][0] - s) / A[i][i];, more precisely x[j][0] depends upon the x[i][0].
So you can try two approaches:
for (int i = 0; i < n; i++){
double s = 0.0;
#pragma omp parallel for reduction(+:s)
for (int j = 0; j < i; j++){
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}
or using SIMD :
for (int i = 0; i < n; i++){
double s = 0.0;
#pragma omp simd reduction(+:s)
for (int j = 0; j < i; j++){
s = s + A[i][j] * x[j][0];
}
x[i][0] = (b[i][0] - s) / A[i][i];
}
The goal is to add as much OpenMP to the following Cholesky factor function to increase parallelization. So far, I only have one #pragma omp parallel for implemented correctly. vector<vector<double>> represents a 2-D matrix. I've already tried adding #pragma omp parallel for for
for (int i = 0; i < n; ++i), for (int k = 0; k < i; ++k), and for (int j = 0; j < k; ++j) but the parallelization goes wrong. makeMatrix(n, n) initializes a vector<vector<double>> of all zeroes of size nxn.
vector<vector<double>> cholesky_factor(vector<vector<double>> input)
{
int n = input.size();
vector<vector<double>> result = makeMatrix(n, n);
for (int i = 0; i < n; ++i)
{
for (int k = 0; k < i; ++k)
{
double value = input[i][k];
for (int j = 0; j < k; ++j)
{
value -= result[i][j] * result[k][j];
}
result[i][k] = value / result[k][k];
}
double value = input[i][i];
#pragma omp parallel for
for (int j = 0; j < i; ++j)
{
value -= result[i][j] * result[i][j];
}
result[i][i] = std::sqrt(value);
}
return result;
}
I don't think you can parallelize much more than this with this algorithm, as the ith iteration of the outer loop depends on the results of the i - 1th iteration and the kth iteration of the inner loop depends on the results of the k - 1th iteration.
vector<vector<double>> cholesky_factor(vector<vector<double>> input)
{
int n = input.size();
vector<vector<double>> result = makeMatrix(n, n);
for (int i = 0; i < n; ++i)
{
for (int k = 0; k < i; ++k)
{
double value = input[i][k];
// reduction(-: value) does the same
// (private instances of value are initialized to zero and
// added to the initial instance of value when the threads are joining
#pragma omp parallel for reduction(+: value)
for (int j = 0; j < k; ++j)
{
value -= result[i][j] * result[k][j];
}
result[i][k] = value / result[k][k];
}
double value = input[i][i];
#pragma omp parallel for reduction(+: value)
for (int j = 0; j < i; ++j)
{
value -= result[i][j] * result[i][j];
}
result[i][i] = std::sqrt(value);
}
return result;
}
The time-determining step of my code is a tensor contraction of the following form
#pragma omp parallel for schedule(dynamic)
for(int i = 0; i < no; ++i){
for(int j = 0; j < no; ++j){
X.middleCols(i*nv,nv) += Y.middleCols(j*nv,nv) * this->getIJMatrix(j,i);
}
}
where X and Y are large matrices of dimension (nx,no*nv) and the function getIJMatrix(j,i) returns the (nv*nv) matrix for index pair ij of a rank-four tensor. Also, no < nv << nx. The parallelization here is straigthforward. However, I can exploit symmetry with respect to i and j
#pragma omp parallel for schedule(dynamic)
for(int i = 0; i < no; ++i){
for(int j = i; j < no; ++j){
auto ij = this->getIJMatrix(j,i);
X.middleCols(i*nv,nv) += Y.middleCols(j*nv,nv) * ij;
if(i!=j) X.middleCols(j*nv,nv) += Y.middleCols(i*nv,nv) * ij.transpose();
}
}
leaving me with a race condition. Since X is large, using a reduction here is not feasible.
If I understand it correctly, there is no way around each thread waiting for the other ones within the inner loop. What's a good practice for this which preferably is as fast as possible?
edit: corrected obvious errors
I've just started studying parallel programming with OpenMP, and there is a subtle point in the nested loop. I wrote a simple matrix multiplication code, and checked the result that is correct. But actually there are several ways to parallelize this for loop, which may be different in terms of low-level detail, and I wanna ask about it.
At first, I wrote code below, which multiply two matrix A, B and assign the result to C.
for(i = 0; i < N; i++)
{
for(j = 0; j < N; j++)
{
sum = 0;
#pragma omp parallel for reduction(+:sum)
for(k = 0; k < N; k++)
{
sum += A[i][k]*B[k][j];
}
C[i][j] = sum;
}
}
It works, but it takes really long time. And I find out that because of the location of parallel directive, it will construct the parallel region N2 time. I found it by huge increase in user time when I used linux time command.
Next time, I tried code below which also worked.
#pragma omp parallel for private(i, j, k, sum)
for(i = 0; i < N; i++)
{
for(j = 0; j < N; j++)
{
sum = 0;
for(k = 0; k < N; k++)
{
sum += A[i][k]*B[k][j];
}
C[i][j] = sum;
}
}
And the elapsed time is decreased from 72.720s in sequential execution to 5.782s in parallel execution with the code above. And it is the reasonable result because I executed it with 16 cores.
But the flow of the second code is not easily drawn in my mind. I know that if we privatize all loop variables, the program will consider that nested loop as one large loop with size N3. It can be easily checked by executing the code below.
#pragma omp parallel for private(i, j, k)
for(i = 0; i < N; i++)
{
for(j = 0; j < N; j++)
{
for(k = 0; k < N; k++)
{
printf("%d, %d, %d\n", i, j, k);
}
}
}
The printf was executed N3 times.
But in my second matrix multiplication code, there is sum right before and after the innermost loop. And It bothers me to unfold the loop in my mind easily. The third code I wrote is easily unfolded in my mind.
To summarize, I want to know what really happens behind the scene in my second matrix multiplication code, especially with the change of the value of sum. Or I'll really thank you for some recommendation of tools to observe the flow of multithreads program written with OpenMP.
omp for by default only applies to the next direct loop. The inner loops are not affected at all. This means, your can think about your second version like this:
// Example for two threads
with one thread execute
{
// declare private variables "locally"
int i, j, k;
for(i = 0; i < N / 2; i++) // loop range changed
{
for(j = 0; j < N; j++)
{
sum = 0;
for(k = 0; k < N; k++)
{
sum += A[i][k]*B[k][j];
}
C[i][j] = sum;
}
}
}
with the other thread execute
{
// declare private variables "locally"
int i, j, k;
for(i = N / 2; i < N; i++) // loop range changed
{
for(j = 0; j < N; j++)
{
sum = 0;
for(k = 0; k < N; k++)
{
sum += A[i][k]*B[k][j];
}
C[i][j] = sum;
}
}
}
You can simply all reasoning about variables with OpenMP by declaring them as locally as possible. I.e. instead of the explicit declaration use:
#pragma omp parallel for
for(int i = 0; i < N; i++)
{
for(int j = 0; j < N; j++)
{
int sum = 0;
for(int k = 0; k < N; k++)
{
sum += A[i][k]*B[k][j];
}
C[i][j] = sum;
}
}
This way you the private scope of variable more easily.
In some cases it can be beneficial to apply parallelism to multiple loops.
This is done by using collapse, i.e.
#pragma omp parallel for collapse(2)
for(int i = 0; i < N; i++)
{
for(int j = 0; j < N; j++)
You can imagine this works with a transformation like:
#pragma omp parallel for
for (int ij = 0; ij < N * N; ij++)
{
int i = ij / N;
int j = ij % N;
A collapse(3) would not work for this loop because of the sum = 0 in-between.
Now is one more detail:
#pragma omp parallel for
is a shorthand for
#pragma omp parallel
#pragma omp for
The first creates the threads - the second shares the work of a loop among all threads reaching this point. This may not be of importance for the understanding now, but there are use-cases for which it matters. For instance you could write:
#pragma omp parallel
for(int i = 0; i < N; i++)
{
#pragma omp for
for(int j = 0; j < N; j++)
{
I hope this sheds some light on what happens there from a logical point of view.
i've got a problem and a question.
I tried to make some matrix multiplication with omp.
If i create the matrices a, b, and c with more than one thread, the colum sizes aren't equal.
The problem remains even if i use critical for push_back.
I thought omp divide the for loop in equal sized pieces, so every thread should have his own column. Is the problem in ?
What is a good way to give every thread a vector?
And what is a good way to avoid shared memory problems without critical and atomic, e.g. if i'm generating data and want to save it somewhere.
Thanks.
P.S. I am working on my english. It 's far away from perfect, so please don't mind.
#include "stdafx.h"
#include <omp.h>
#include <iostream>
#include <ctime>
#include <vector>
#define NRA 300 /* number of rows in matrix A */
#define NCA 300 /* number of columns in matrix A */
#define NCB 300 /* number of columns in matrix B */
int main(int argc, char *argv[])
{
int i, j, k, chunk;
std::vector < std::vector<int> > a;
a.resize(NRA);
std::vector < std::vector<int> > b;
b.resize(NCA);
std::vector < std::vector<int> > c;
c.resize(NRA);
/*
double a[NRA][NCA];
double b[NCA][NCB];
double c[NRA][NCB];
*/
chunk = 10;
std::clock_t start; //Zeitmessung
double duration; //Zeitdauer der Parallelisierung
omp_set_num_threads(4);
#pragma omp parallel
{
#pragma omp for schedule (static, chunk)
for (i = 0; i < NRA; i++)
for (j = 0; j < NCA; j++)
a[i].push_back(i + j);
#pragma omp for schedule (static, chunk)
for (i = 0; i < NCA; i++)
for (j = 0; j < NCB; j++)
b[i].push_back(i*j);
#pragma omp for ordered schedule(static, chunk)
for (i = 0; i < NRA; i++)
for (j = 0; j < NCB; j++)
c[i].push_back(0);
}
for (int nthreads = 1; nthreads < 40; nthreads++)
{
start = std::clock();
omp_set_dynamic(0);
#pragma omp parallel shared(a,b,c,nthreads,chunk) private(i,j,k) num_threads(nthreads)
{
#pragma omp for schedule (static, chunk)
for ( i = 0; i < NRA; i++)
for (j = 0; j < NCB; j++)
c[i][j] = 0;
#pragma omp for ordered schedule (static, chunk)
for (i = 0; i < NRA; i++)
{
for ( j = 0; j < NCB; j++)
for (k = 0; k < NCA; k++)
c[i][j] += a[i][k] * b[k][j];
}
}
duration = (std::clock() - start) / (double)CLOCKS_PER_SEC;
//Time n threads need
std::cout << "Benoetigte Zeit fuer " << nthreads << " Threads betrug " << duration << " Sekunden." << std::endl;
}
std::cin.get();
}
push_back() definitely modifies vector’s metadata, especially size. Try to resize() the inner vectors like you do with the outer ones (a, b, c) and then just modify the elements (a[i] = i + j; etc.) in the parallel run.
Since you know the final count of elements in the beginning, you can use plain arrays instead of vectors to minimize overhead.
int a[NRA][NCA];
int b[NCA][NCB];
int c[NRA][NCB];
I wonder why you’ve commented out the similar part of your code. ;-)