I want to parallelize a program I have written that calculates a series involving matrix and vector products, with the result being a vector. Since intermediate values become both very small and very large, I use ARB (based on GMP, MPFR and FLINT) to prevent loss of significance.
Also, since the series elements are independent, the matrix dimensions are not big, and the series needs to be evaluated up to 50k elements, it makes sense to have a number of threads each compute a share of the elements, e.g. 5 threads could each compute 10k elements in parallel and then add up the resulting vectors.
The issue is that the ARB functions to add up vectors and matrices are not standard operations that can easily be used within an OpenMP reduction.
When I naively try to write a custom reduction, g++ complains about the void type, since operations in ARB do not have a return value:
void arb_add(arb_t z, const arb_t x, const arb_t y, slong prec)
will calculate and set z to z = x + y with a precision of prec bits, but arb_add itself is a void function.
As an example: a random for-loop for a similar problem looks like this (my actual program is different of course)
[...]
arb_mat_t RMa,RMb,RMI,RMP,RMV,RRes;
arb_mat_init(RMa,Nmax,Nmax); //3 matrices
arb_mat_init(RMb,Nmax,Nmax);
arb_mat_init(RMI,Nmax,Nmax);
arb_mat_init(RMV,Nmax,1); // 3 vectors
arb_mat_init(RMP,Nmax,1);
arb_mat_init(RRes,Nmax,1);
[...]
//Result= V + ABV +(AB)^2 V + (AB)^3 V + ...
//in my actual program A and B would be j- and k-dependent and would
//include more matrices and vectors
#pragma omp parallel for collapse(1) private(j)
for(j=0; j<jmax; j++){
arb_mat_one(RMI); //sets RMI to the identity matrix
for(k=0; k<j; k++){
Qmmd(RMI,RMI,RMa,Nmax,prec); //RMI=RMI*RMa
Qmmd(RMI,RMI,RMb,Nmax,prec); //RMI=RMI*RMb
cout << "j=" << j << ", k=" << k << "\r" << flush;
}
Qmvd(RMP,RMI,RMV,Nmax,prec); //RMP=RMI*RMV
arb_mat_add(RRes,RRes,RMP,prec); //Result=Result + RMP
}
[...]
which of course breaks down when using more than one thread, because I did not specify a reduction on RRes. Here Qmmd() and Qmvd() are self-written matrix-matrix and matrix-vector product functions, and RMa, RMb, and RMV are random matrices and vectors, respectively.
The idea is now to reduce RRes, such that each thread can compute a private version of RRes including a fraction of the final result, before adding them all up using arb_mat_add. I could write a function matrixadd(A,B) to compute A=A+B
void matrixadd(arb_mat_t A, arb_mat_t B) {
arb_mat_add(A,A,B,2000);
//A=A+B, the last value is the precision in bits used for that operation
}
and then eventually
#pragma omp declare reduction \
(myadd : void : matrixadd(omp_out, omp_in))
#pragma omp parallel for private(j) reduction(myadd:RRes)
for(j=0; j<jmax; j++){
arb_mat_one(RMI);
for(k=0; k<j; k++){
Qmmd(RMI,RMI,RMa,Nmax,prec);
Qmmd(RMI,RMI,RMb,Nmax,prec);
cout << "j=" << j << ", k=" << k << "\r" << flush;
}
Qmvd(RMP,RMI,RMV,Nmax,prec);
matrixadd(RRes,RMP);
}
GCC is not happy with this:
main.cpp: In function ‘int main()’:
main.cpp:503:46: error: invalid use of void expression
(myadd : void : matrixadd(omp_out, omp_in))
^
main.cpp:504:114: error: ‘RRes’ has invalid type for ‘reduction’
Can OpenMP understand my void reduction, and can it be made to work with ARB and GMP? If so, how? Thanks!
(Also, my program currently includes a convergence check with a break condition in the j-for-loop. If you also know how to easily implement such a thing, I would be very grateful, because for my current OpenMP tests I just removed the break and set a constant jmax.)
My question is very similar to this one.
Edit: Sorry, here is my attempt at a minimal, complete and verifiable example. Additional required packages are arb, flint, gmp, mpfr (available through package managers) and gmpfrxx.
#include <iostream>
#include <omp.h>
#include <cmath>
#include <ctime>
#include <gmp.h>
#include "gmpfrxx/gmpfrxx.h"
#include "arb.h"
#include "acb.h"
#include "arb_mat.h"
using namespace std;
void generate_matrixARBdeterministic(arb_mat_t Mat, int N, double w2) //generates some matrix
{
int i,j;
double what;
for(i=0;i<N;i++)
{
for(j=0;j<N;j++)
{
what=(i*j+30/w2)/((1+0.1*w2)*j+20-w2);
arb_set_d(arb_mat_entry(Mat,i,j),what);
}
}
}
void generate_vecARBdeterministic(arb_mat_t Mat, int N) //generates some vector
{
int i;
double what;
for(i=0;i<N;i++)
{
what=(4*i*i+40)/200;
arb_set_d(arb_mat_entry(Mat,i,0),what);
}
}
void Qmmd(arb_mat_t res, arb_mat_t MA, arb_mat_t MB, int NM, slong prec)
{ ///res=M*M=Matrix * Matrix
arb_t Qh1;
arb_mat_t QMh;
arb_init(Qh1);
arb_mat_init(QMh,NM,NM);
for (int i=0; i<NM; i++){
for(int j=0; j<NM; j++){
for (int k=0; k<NM; k++ ) {
arb_mul(Qh1,arb_mat_entry(MA, i, k),arb_mat_entry(MB, k, j),prec);
arb_add(arb_mat_entry(QMh, i, j),arb_mat_entry(QMh, i, j),Qh1,prec);
}
}
}
arb_mat_set(res,QMh);
arb_mat_clear(QMh);
arb_clear(Qh1);
}
void Qmvd(arb_mat_t res, arb_mat_t M, arb_mat_t V, int NM, slong prec) //res=M*V=Matrix * Vector
{ ///res=M*V
arb_t Qh,Qs;
arb_mat_t QMh;
arb_init(Qh);
arb_init(Qs);
arb_mat_init(QMh,NM,1);
arb_set_ui(Qh,0.0);
arb_set_ui(Qs,0.0);
arb_mat_zero(QMh);
for (int i=0; i<NM; i++){
arb_set_ui(Qs,0.0);
for(int j=0; j<NM; j++){
arb_mul(Qh,arb_mat_entry(M, i, j),arb_mat_entry(V, j, 0),prec);
arb_add(Qs,Qs,Qh,prec);
}
arb_set(arb_mat_entry(QMh, i, 0),Qs);
}
arb_mat_set(res,QMh);
arb_mat_clear(QMh);
arb_clear(Qh);
arb_clear(Qs);
}
void QPrintV(arb_mat_t A, int N){ //Prints Vector
for(int i=0;i<N;i++){
cout << arb_get_str(arb_mat_entry(A, i, 0),5,0) << endl; //ARB_STR_NO_RADIUS
}
}
void matrixadd(arb_mat_t A, arb_mat_t B) {
arb_mat_add(A,A,B,2000);
}
int main() {
int Nmax=10,jmax=300; //matrix dimension and max of j-loop
ulong prec=2000; //precision for arb
//initializations
arb_mat_t RMa,RMb,RMI,RMP,RMV,RRes;
arb_mat_init(RMa,Nmax,Nmax);
arb_mat_init(RMb,Nmax,Nmax);
arb_mat_init(RMI,Nmax,Nmax);
arb_mat_init(RMV,Nmax,1);
arb_mat_init(RMP,Nmax,1);
arb_mat_init(RRes,Nmax,1);
omp_set_num_threads(1);
cout << "Maximal threads is " << omp_get_max_threads() << endl;
generate_matrixARBdeterministic(RMa,Nmax,1.0); //generates some Matrix for RMa
arb_mat_set(RMb,RMa); // sets RMb=RMa
generate_vecARBdeterministic(RMV,Nmax); //generates some vector
double st=omp_get_wtime();
Qmmd(RMI,RMa,RMb,Nmax,prec);
int j,k=0;
#pragma omp declare reduction \
(myadd : void : matrixadd(omp_out, omp_in))
#pragma omp parallel for private(j) reduction(myadd:RRes)
for(j=0; j<jmax; j++){
arb_mat_one(RMI);
for(k=0; k<j; k++){
Qmmd(RMI,RMI,RMa,Nmax,prec);
Qmmd(RMI,RMI,RMb,Nmax,prec);
cout << "j=" << j << ", k=" << k << "\r" << flush;
}
Qmvd(RMP,RMI,RMV,Nmax,prec);
matrixadd(RRes,RMP);
}
QPrintV(RRes,Nmax);
double en=omp_get_wtime();
printf("\n Time it took was %lfs\n",en-st);
arb_mat_clear(RMa);
arb_mat_clear(RMb);
arb_mat_clear(RMV);
arb_mat_clear(RMP);
arb_mat_clear(RMI);
arb_mat_clear(RRes);
return 0;
}
and
g++ test.cpp -g -fexceptions -O3 -ltbb -fopenmp -lmpfr -lflint -lgmp -lgmpxx -larb -I../../PersonalLib -std=c++14 -lm -o b.out
You can do the reduction by hand like this
#pragma omp parallel
{
arb_mat_t RMI, RMP;
arb_mat_init(RMI,Nmax,Nmax); //allocate memory
arb_mat_init(RMP,Nmax,1); //allocate memory
#pragma omp for
for(int j=0; j<jmax; j++){
arb_mat_one(RMI);
for(int k=0; k<j; k++){
Qmmd(RMI,RMI,RMa,Nmax,prec);
Qmmd(RMI,RMI,RMb,Nmax,prec);
}
Qmvd(RMP,RMI,RMV,Nmax,prec);
#pragma omp critical
arb_mat_add(RRes,RRes,RMP,prec);
}
arb_mat_clear(RMI); //deallocate memory
arb_mat_clear(RMP); //deallocate memory
}
If you want to use declare reduction you need to make a C++ wrapper for arb_mat_t. Using declare reduction lets OpenMP decide how to do the reduction. But I highly doubt you will find a case where this gives better performance than the manual case.
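For reference, a minimal sketch of what such a wrapper could look like (untested; the names ArbMat and arbsum are mine, not part of ARB, and only documented ARB calls are used):
#include "arb_mat.h"
// RAII wrapper so OpenMP can create and destroy private reduction copies.
struct ArbMat {
    arb_mat_t m;
    slong prec;
    ArbMat(slong r, slong c, slong p) : prec(p) { arb_mat_init(m, r, c); arb_mat_zero(m); }
    ArbMat(const ArbMat &o) : prec(o.prec) {        // deep copy
        arb_mat_init(m, arb_mat_nrows(o.m), arb_mat_ncols(o.m));
        arb_mat_set(m, o.m);
    }
    ~ArbMat() { arb_mat_clear(m); }
    ArbMat &operator+=(const ArbMat &o) { arb_mat_add(m, m, o.m, prec); return *this; }
};
// Every private copy starts as a zero matrix of the same shape as the original.
#pragma omp declare reduction(arbsum : ArbMat : omp_out += omp_in) \
    initializer(omp_priv = ArbMat(arb_mat_nrows(omp_orig.m), arb_mat_ncols(omp_orig.m), omp_orig.prec))
The result vector would then be an ArbMat Res(Nmax, 1, prec), and the loop would use reduction(arbsum:Res).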
Alternatively, you can create an array of matrices, one per thread, to sum into. You simply replace matrixadd(RRes, RMP) with matrixadd(RRes[omp_get_thread_num()], RMP) and then sum all the per-thread RRes in the end, where RRes would now be a std::vector of matrices; a sketch of this is below.
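Note that arb_mat_t is a typedef for a one-element array, so it cannot be stored directly in a std::vector; storing arb_mat_struct elements and passing their addresses works around that. A rough, untested sketch of the per-thread idea, reusing the variable names from the question:
#include <vector>
std::vector<arb_mat_struct> partial(omp_get_max_threads());
for (auto &p : partial) { arb_mat_init(&p, Nmax, 1); arb_mat_zero(&p); } // one zero vector per thread
#pragma omp parallel
{
    arb_mat_t RMI, RMP;                 // thread-private work matrices
    arb_mat_init(RMI, Nmax, Nmax);
    arb_mat_init(RMP, Nmax, 1);
    int t = omp_get_thread_num();
    #pragma omp for
    for (int j = 0; j < jmax; j++) {
        arb_mat_one(RMI);
        for (int k = 0; k < j; k++) {
            Qmmd(RMI, RMI, RMa, Nmax, prec);
            Qmmd(RMI, RMI, RMb, Nmax, prec);
        }
        Qmvd(RMP, RMI, RMV, Nmax, prec);
        arb_mat_add(&partial[t], &partial[t], RMP, prec);   // no critical needed
    }
    arb_mat_clear(RMI);
    arb_mat_clear(RMP);
}
for (auto &p : partial) {               // serial final reduction
    arb_mat_add(RRes, RRes, &p, prec);
    arb_mat_clear(&p);
}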
Another option is to define an addition operator for a wrapper class; of course, you should be careful to avoid copying the entire matrix unnecessarily. This feels like more of a hassle, as you have to be a bit careful with the memory management (since you're using a library, you don't know exactly what it does unless you take the time to go through it all).
Related
I have a matrix (relatively big) that I need to transpose. For example, assume that my matrix is
a b c d e f
g h i j k l
m n o p q r
I want the result be as follows:
a g m
b h n
c i o
d j p
e k q
f l r
What is the fastest way to do this?
This is a good question. There are many reasons you would want to actually transpose the matrix in memory rather than just swap coordinates, e.g. in matrix multiplication and Gaussian smearing.
First let me list one of the functions I use for the transpose (EDIT: please see the end of my answer where I found a much faster solution)
void transpose(float *src, float *dst, const int N, const int M) {
#pragma omp parallel for
for(int n = 0; n<N*M; n++) {
int i = n/N;
int j = n%N;
dst[n] = src[M*j + i];
}
}
Now let's see why the transpose is useful. Consider matrix multiplication C = A*B. We could do it this way.
for(int i=0; i<N; i++) {
for(int j=0; j<K; j++) {
float tmp = 0;
for(int l=0; l<M; l++) {
tmp += A[M*i+l]*B[K*l+j];
}
C[K*i + j] = tmp;
}
}
That way, however, is going to have a lot of cache misses. A much faster solution is to take the transpose of B first
transpose(B);
for(int i=0; i<N; i++) {
for(int j=0; j<K; j++) {
float tmp = 0;
for(int l=0; l<M; l++) {
tmp += A[M*i+l]*B[K*j+l];
}
C[K*i + j] = tmp;
}
}
transpose(B);
Matrix multiplication is O(n^3) and the transpose is O(n^2), so taking the transpose should have a negligible effect on the computation time (for large n). In matrix multiplication, loop tiling is even more effective than taking the transpose, but that's much more complicated.
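To illustrate the tiling idea, here is a rough sketch (not a tuned kernel) using the same row-major layout as above, where A is NxM, B is MxK, and C is NxK and must be zeroed beforehand; BS is a tunable tile size:
#include <algorithm>
void gemm_tiled(const float *A, const float *B, float *C,
                const int N, const int M, const int K, const int BS) {
    for(int ii=0; ii<N; ii+=BS) {
        for(int ll=0; ll<M; ll+=BS) {
            for(int jj=0; jj<K; jj+=BS) {
                // each (ii,ll,jj) block works on tiles that fit in cache
                for(int i=ii; i<std::min(ii+BS, N); i++) {
                    for(int l=ll; l<std::min(ll+BS, M); l++) {
                        float a = A[M*i+l];
                        for(int j=jj; j<std::min(jj+BS, K); j++) {
                            C[K*i+j] += a*B[K*l+j];
                        }
                    }
                }
            }
        }
    }
}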
I wish I knew a faster way to do the transpose (Edit: I found a faster solution, see the end of my answer). When Haswell/AVX2 comes out in a few weeks it will have a gather function. I don't know if that will be helpful in this case, but I could imagine gathering a column and writing out a row. Maybe it will make the transpose unnecessary.
For Gaussian smearing, what you do is smear horizontally and then smear vertically. But smearing vertically has the cache problem, so what you do is:
Smear image horizontally
transpose output
Smear output horizontally
transpose output
Here is a paper by Intel explaining that
http://software.intel.com/en-us/articles/iir-gaussian-blur-filter-implementation-using-intel-advanced-vector-extensions
Lastly, what I actually do in matrix multiplication (and in Gaussian smearing) is not take exactly the transpose but take the transpose in widths of a certain vector size (e.g. 4 or 8 for SSE/AVX). Here is the function I use
void reorder_matrix(const float* A, float* B, const int N, const int M, const int vec_size) {
#pragma omp parallel for
for(int n=0; n<M*N; n++) {
int k = vec_size*(n/N/vec_size);
int i = (n/vec_size)%N;
int j = n%vec_size;
B[n] = A[M*i + k + j];
}
}
EDIT:
I tried several functions to find the fastest transpose for large matrices. In the end the fastest result is to use loop blocking with block_size=16 (Edit: I found a faster solution using SSE and loop blocking - see below). This code works for any NxM matrix (i.e. the matrix does not have to be square).
inline void transpose_scalar_block(float *A, float *B, const int lda, const int ldb, const int block_size) {
#pragma omp parallel for
for(int i=0; i<block_size; i++) {
for(int j=0; j<block_size; j++) {
B[j*ldb + i] = A[i*lda +j];
}
}
}
inline void transpose_block(float *A, float *B, const int n, const int m, const int lda, const int ldb, const int block_size) {
#pragma omp parallel for
for(int i=0; i<n; i+=block_size) {
for(int j=0; j<m; j+=block_size) {
transpose_scalar_block(&A[i*lda +j], &B[j*ldb + i], lda, ldb, block_size);
}
}
}
The values lda and ldb are the padded widths (leading dimensions) of the matrices. These need to be multiples of the block size. To find the values and allocate the memory for e.g. a 3000x1001 matrix, I do something like this
#define ROUND_UP(x, s) (((x)+((s)-1)) & -(s))
const int n = 3000;
const int m = 1001;
int lda = ROUND_UP(m, 16);
int ldb = ROUND_UP(n, 16);
float *A = (float*)_mm_malloc(sizeof(float)*lda*ldb, 64);
float *B = (float*)_mm_malloc(sizeof(float)*lda*ldb, 64);
For 3000x1001 this returns ldb = 3008 and lda = 1008
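Putting the pieces together, a call for that padded layout looks roughly like this (sketch): A holds the n x m source with row stride lda, and B receives the m x n transpose with row stride ldb.
transpose_block(A, B, n, m, lda, ldb, 16);
_mm_free(A);
_mm_free(B);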
Edit:
I found an even faster solution using SSE intrinsics:
inline void transpose4x4_SSE(float *A, float *B, const int lda, const int ldb) {
__m128 row1 = _mm_load_ps(&A[0*lda]);
__m128 row2 = _mm_load_ps(&A[1*lda]);
__m128 row3 = _mm_load_ps(&A[2*lda]);
__m128 row4 = _mm_load_ps(&A[3*lda]);
_MM_TRANSPOSE4_PS(row1, row2, row3, row4);
_mm_store_ps(&B[0*ldb], row1);
_mm_store_ps(&B[1*ldb], row2);
_mm_store_ps(&B[2*ldb], row3);
_mm_store_ps(&B[3*ldb], row4);
}
inline void transpose_block_SSE4x4(float *A, float *B, const int n, const int m, const int lda, const int ldb ,const int block_size) {
#pragma omp parallel for
for(int i=0; i<n; i+=block_size) {
for(int j=0; j<m; j+=block_size) {
int max_i2 = i+block_size < n ? i + block_size : n;
int max_j2 = j+block_size < m ? j + block_size : m;
for(int i2=i; i2<max_i2; i2+=4) {
for(int j2=j; j2<max_j2; j2+=4) {
transpose4x4_SSE(&A[i2*lda +j2], &B[j2*ldb + i2], lda, ldb);
}
}
}
}
}
This is going to depend on your application, but in general the fastest way to transpose a matrix is to invert your coordinates when you do a lookup; then you do not have to actually move any data.
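For example, something along these lines (a sketch; the Matrix class in the answer further down does the same thing in more detail):
// A "logical" transpose: only the indices are swapped on lookup, no data moves.
struct MatrixView {
    const float *data;
    int rows, cols;     // dimensions of the original row-major layout
    bool transposed;
    float at(int i, int j) const {
        return transposed ? data[j*cols + i] : data[i*cols + j];
    }
};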
Some details about transposing 4x4 square matrices of floats (I will discuss 32-bit integers later) with x86 hardware. It's helpful to start here in order to transpose larger square matrices such as 8x8 or 16x16.
_MM_TRANSPOSE4_PS(r0, r1, r2, r3) is implemented differently by different compilers. GCC and ICC (I have not checked Clang) use unpcklps, unpckhps, unpcklpd, unpckhpd whereas MSVC uses only shufps. We can actually combine these two approaches together like this.
t0 = _mm_unpacklo_ps(r0, r1);
t1 = _mm_unpackhi_ps(r0, r1);
t2 = _mm_unpacklo_ps(r2, r3);
t3 = _mm_unpackhi_ps(r2, r3);
r0 = _mm_shuffle_ps(t0,t2, 0x44);
r1 = _mm_shuffle_ps(t0,t2, 0xEE);
r2 = _mm_shuffle_ps(t1,t3, 0x44);
r3 = _mm_shuffle_ps(t1,t3, 0xEE);
One interesting observation is that two shuffles can be converted to one shuffle and two blends (SSE4.1) like this.
t0 = _mm_unpacklo_ps(r0, r1);
t1 = _mm_unpackhi_ps(r0, r1);
t2 = _mm_unpacklo_ps(r2, r3);
t3 = _mm_unpackhi_ps(r2, r3);
v = _mm_shuffle_ps(t0,t2, 0x4E);
r0 = _mm_blend_ps(t0,v, 0xC);
r1 = _mm_blend_ps(t2,v, 0x3);
v = _mm_shuffle_ps(t1,t3, 0x4E);
r2 = _mm_blend_ps(t1,v, 0xC);
r3 = _mm_blend_ps(t3,v, 0x3);
This effectively converted 4 shuffles into 2 shuffles and 4 blends. This uses 2 more instructions than the implementation of GCC, ICC, and MSVC. The advantage is that it reduces port pressure which may have a benefit in some circumstances.
Currently all the shuffles and unpacks can go only to one particular port whereas the blends can go to either of two different ports.
I tried using 8 shuffles like MSVC and converting that into 4 shuffles + 8 blends but it did not work. I still had to use 4 unpacks.
I used this same technique for an 8x8 float transpose (see towards the end of that answer):
https://stackoverflow.com/a/25627536/2542702. In that answer I still had to use 8 unpacks, but I managed to convert the 8 shuffles into 4 shuffles and 8 blends.
For 32-bit integers there is nothing like shufps (except for 128-bit shuffles with AVX512), so it can only be implemented with unpacks, which I don't think can be converted to blends (efficiently). With AVX512, vshufi32x4 acts effectively like shufps except for 128-bit lanes of 4 integers instead of 32-bit floats, so this same technique might be possible with vshufi32x4 in some cases. With Knights Landing, shuffles are four times slower (throughput) than blends.
If the sizes of the arrays are known beforehand, then we could use a union to help us, like this:
#include <bits/stdc++.h>
using namespace std;
union ua{
int arr[2][3];
int brr[3][2];
};
int main() {
union ua uav;
int karr[2][3] = {{1,2,3},{4,5,6}};
memcpy(uav.arr,karr,sizeof(karr));
for (int i=0;i<3;i++)
{
for (int j=0;j<2;j++)
cout<<uav.brr[i][j]<<" ";
cout<<'\n';
}
return 0;
}
Consider each row as a column and each column as a row: use [j][i] instead of [i][j].
demo: http://ideone.com/lvsxKZ
#include <iostream>
using namespace std;
int main ()
{
char A [3][3] =
{
{ 'a', 'b', 'c' },
{ 'd', 'e', 'f' },
{ 'g', 'h', 'i' }
};
cout << "A = " << endl << endl;
// print matrix A
for (int i=0; i<3; i++)
{
for (int j=0; j<3; j++) cout << A[i][j];
cout << endl;
}
cout << endl << "A transpose = " << endl << endl;
// print A transpose
for (int i=0; i<3; i++)
{
for (int j=0; j<3; j++) cout << A[j][i];
cout << endl;
}
return 0;
}
transposing without any overhead (class not complete):
class Matrix{
    double *data; //suppose this will point to the data
    double _get1(int i, int j){return data[i*M+j];} //used to access normally
    double _get2(int i, int j){return data[j*N+i];} //used when transposed
public:
    int M, N; //dimensions
    double (Matrix::*get_p)(int, int); //pointer to member function used to access elements
    Matrix(int _M,int _N):M(_M), N(_N){
        //allocate data
        get_p=&Matrix::_get1; // initialised with normal access
    }
    double get(int i, int j){
        //there should be a way to use get_p to call directly, but even this extra
        //call should not incur overhead because it is inline and the compiler
        //should be intelligent enough to remove it
        return (this->*get_p)(i,j);
    }
    void transpose(){ //transposing twice gives the original
        if(get_p==&Matrix::_get1) get_p=&Matrix::_get2;
        else get_p=&Matrix::_get1;
        std::swap(M,N);
    }
};
can be used like this:
Matrix M(100,200);
double x=M.get(17,45);
M.transpose();
x=M.get(17,45); // = original M(45,17)
Of course I didn't bother with the memory management here, which is crucial but a different topic.
template <class T>
void transpose( const std::vector< std::vector<T> > & a,
std::vector< std::vector<T> > & b,
int width, int height)
{
for (int i = 0; i < width; i++)
{
for (int j = 0; j < height; j++)
{
b[j][i] = a[i][j];
}
}
}
Modern linear algebra libraries include optimized versions of the most common operations. Many of them include dynamic CPU dispatch, which chooses the best implementation for the hardware at program execution time (without compromising on portability).
This is commonly a better alternative to manually optimizing your functions via vector extension intrinsics. The latter ties your implementation to a particular hardware vendor and model: if you decide to switch to a different vendor (e.g. Power, ARM) or to newer vector extensions (e.g. AVX512), you will need to re-implement it again to get the most out of them.
MKL, for example, includes the BLAS extensions function imatcopy for transposition. You can find it in other implementations such as OpenBLAS as well:
#include <mkl.h>
void transpose( float* a, int n, int m ) {
const char row_major = 'R';
const char transpose = 'T';
const float alpha = 1.0f;
mkl_simatcopy (row_major, transpose, n, m, alpha, a, n, n);
}
For a C++ project, you can make use of the Armadillo C++ library:
#include <armadillo>
void transpose( arma::mat &matrix ) {
arma::inplace_trans(matrix);
}
Intel MKL offers in-place and out-of-place transposition/copying of matrices; here is the link to the documentation. I would recommend trying the out-of-place implementation, as it is faster than in-place, and note that the documentation of the latest version of MKL contains some mistakes.
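A sketch of the out-of-place variant (the exact parameter types may differ between MKL versions, so check the documentation): B receives the m x n transpose of the n x m row-major matrix A.
#include <mkl.h>
void transpose_oop(const float* a, float* b, size_t n, size_t m) {
    mkl_somatcopy('R', 'T', n, m, 1.0f, a, m, b, n);
}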
I think the fastest way should not take more than O(n^2) time, and this way you can use just O(1) extra space:
The way to do that is to swap in pairs, because when you transpose a matrix what you do is M[i][j]=M[j][i]: store M[i][j] in temp, then M[i][j]=M[j][i], and as the last step M[j][i]=temp. This can be done in one pass, so it should take O(n^2).
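In code, the pairwise-swap idea for a square N x N row-major matrix looks roughly like this (sketch):
#include <utility>
void transpose_inplace(float* M, const int N) {
    for(int i=0; i<N; i++)
        for(int j=i+1; j<N; j++)    // only visit the upper triangle
            std::swap(M[i*N + j], M[j*N + i]);
}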
My answer is a transpose of a 3x3 matrix:
#include<iostream>
using namespace std;
int main()
{
int a[3][3];
int b[3];
cout<<"You must give us an array 3x3 and then we will give you Transposed it "<<endl;
for(int i=0;i<3;i++)
{
for(int j=0;j<3;j++)
{
cout<<"Enter a["<<i<<"]["<<j<<"]: ";
cin>>a[i][j];
}
}
cout<<"Matrix you entered is :"<<endl;
for (int e = 0 ; e < 3 ; e++ )
{
for ( int f = 0 ; f < 3 ; f++ )
cout << a[e][f] << "\t";
cout << endl;
}
cout<<"\nTransposed of matrix you entered is :"<<endl;
for (int c = 0 ; c < 3 ; c++ )
{
for ( int d = 0 ; d < 3 ; d++ )
cout << a[d][c] << "\t";
cout << endl;
}
return 0;
}
I'm new to OpenMP. I'm working on the All-Pairs Shortest Path algorithm, and here is the serial C++ routine I need to parallelize (complete code at the end of the post):
void mini(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
size_t i, j;
for ( i=0; i<n; i++)
for ( j=0; j<n; j++)
M[i][j]=min(rowk[j]+colk[i], M[i][j]);
}
At execution I get this :
$ time ./floyd
real 0m0,349s
user 0m0,349s
sys 0m0,000s
Now, I try to insert some directives:
void mini(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
#pragma omp parallel
{
size_t i, j;
#pragma omp parallel for
for ( i=0; i<n; i++)
for ( j=0; j<n; j++)
M[i][j]=min(rowk[j]+colk[i], M[i][j]);
}
}
Unfortunately, there is no speedup:
$ grep -c ^processor /proc/cpuinfo
4
$ time ./floyd
real 0m0,547s
user 0m2,073s
sys 0m0,004s
What am I doing wrong?
EDIT
Processor: Intel(R) Core(TM) i5-4590 CPU (4 hardware cores)
Complete code :
#include <cstdio>
#include <vector>
#include <limits>
#include <ctime>
#include <random>
#include <set>
#include <omp.h>
using namespace std;
typedef struct Wedge
{
int a, b;
double w;
} Wedge;
typedef pair<int, int> edge;
int randrange(int end, int start=0)
{
random_device rd;
mt19937 gen(rd());
uniform_int_distribution<> dis(start, end-1);
return dis(gen);
}
void relax_omp(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
#pragma omp parallel
{
size_t i, j;
#pragma omp parallel for
for (i=0; i<n; i++)
for ( j=0; j<n; j++)
M[i][j]=min(rowk[j]+colk[i], M[i][j]);
}
}
void relax_serial(vector<vector<double>> &M, size_t n, vector<double> &rowk, vector<double> &colk)
{
size_t i, j;
for (i=0; i<n; i++)
for ( j=0; j<n; j++)
M[i][j]=min(rowk[j]+colk[i], M[i][j]);
}
void floyd(vector<vector<double>> &dist, bool serial)
{
size_t i, k;
size_t n {dist.size()};
for (k=0; k<n; k++)
{
vector<double> rowk =dist[k];
vector<double> colk(n);
for (i=0; i<n; i++)
colk[i]=dist[i][k];
if (serial)
relax_serial(dist, n, rowk, colk);
else
relax_omp(dist, n, rowk, colk);
}
for (i=0; i<n; i++)
dist[i][i]=0;
}
vector<Wedge> random_edges(int n, double density, double max_weight)
{
int M{n*(n-1)/2};
double m{density*M};
set<edge> edges;
vector<Wedge> wedges;
while (edges.size()<m)
{
pair<int,int> L;
L.first=randrange(n);
L.second=randrange(n);
if (L.first!=L.second && edges.find(L) == edges.end())
{
double w=randrange(max_weight);
Wedge wedge{L.first, L.second, w};
wedges.push_back(wedge);
edges.insert(L);
}
}
return wedges;
}
vector<vector<double>> fill_distances(vector<Wedge> wedges, int n)
{
double INF = std::numeric_limits<double>::infinity();
size_t i, m=wedges.size();
vector<vector<double>> dist(n, vector<double>(n, INF));
int a, b;
double w;
for (i=0; i<m; i++)
{ a=wedges[i].a;
b=wedges[i].b;
w=wedges[i].w;
dist[a][b]=w;
}
return dist;
}
int main (void)
{
double density{0.33};
double max_weight{200};
int n{800};
bool serial;
int ntest=10;
double avge_serial=0, avge_omp=0;
for (int i=0; i<ntest; i++)
{
vector<Wedge> wedges=random_edges(n, density, max_weight);
vector<vector<double>> dist=fill_distances(wedges, n);
double dtime;
dtime = omp_get_wtime();
serial=true;
floyd(dist, serial);
dtime = omp_get_wtime() - dtime;
avge_serial+=dtime;
dtime = omp_get_wtime();
serial=false;
floyd(dist, serial);
dtime = omp_get_wtime() - dtime;
avge_omp+=dtime;
}
printf("%d tests, n=%d\n", ntest, n);
printf("Average serial : %.2lf\n", avge_serial/ntest);
printf("Average openMP : %.2lf\n", avge_omp/ntest);
return 0;
}
output :
20 tests, n=800
Average serial : 0.31
Average openMP : 0.61
command line:
g++ -std=c++11 -Wall -O2 -Wno-unused-result -Wno-unused-variable -Wno-unused-but-set-variable -Wno-unused-parameter floyd.cpp -o floyd -lm -fopenmp
Your main issue is that you accidentally use nested parallelism:
#pragma omp parallel
{
size_t i, j;
#pragma omp parallel for
Since you already are in a parallel region, your second line should be
#pragma omp for
Otherwise, since an omp parallel for is equivalent to an omp parallel plus an omp for, you have two nested parallel regions, which is typically bad. Fixing this minor thing gets an ~2x speedup on a similar CPU.
There are several reasons why you are unlikely to get a full 4x speedup, including but not limited to:
Memory bandwidth as a bottleneck
Relative overhead due to the small amount of work done within the parallel loop
Lower clock frequencies with multiple threads in turbo mode
Edit:
By the way, the much more idiomatic way to write your code is the following:
void relax_omp(...) {
#pragma omp parallel for
for (size_t i=0; i<n; i++) {
for (size_t j=0; j<n; j++) {
M[i][j]=min(rowk[j]+colk[i], M[i][j]);
}
}
}
If you declare variables as locally as possible, OpenMP will almost always do the right thing, which in this case means that i and j are private. In general, it is much easier to reason about code this way.
There could be many reasons for this, the most obvious being that the workload is too small to notice a speedup. The initial workload is 300 ms. I would suggest enclosing this in a serial outer loop that repeats this work at least 20 times; then you are starting with a serial time of (300 ms * 20) 6 seconds to test with.
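For instance, something along these lines (a sketch, reusing the names from the posted code) gives a measurement that is less dominated by startup and timer noise:
const int repeats = 20;
double t0 = omp_get_wtime();
for (int r = 0; r < repeats; r++)
    floyd(dist, /*serial=*/false);   // or the serial variant, for comparison
double elapsed = omp_get_wtime() - t0;
printf("average per run: %f s\n", elapsed / repeats);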
The other factor is the availability of parallel cores on the machine you are running this on. If your CPU has one core, multi-threading will cause a slowdown due to the cost of thread switching. Two logical cores should show some speedup; two physical cores may show close to linear speedup.
Using pragma directives alone also does not guarantee that OpenMP is used. You have to compile with the -fopenmp command-line argument to guarantee that the OpenMP library is linked into your object code.
Edit
Looking at your code now, the factor that controls the amount of work seems to be N rather than the outer loop. The idea of the outer loop was to artificially increase the amount of work within the same timing period but that can't be done here as you are trying to solve a specific problem. You can try parallelizing the nested loop as well but I think N = 800 is too low for parallelization to make a difference.
#pragma omp parallel for private(j) collapse(2)
j needs to be private to each iteration of the outer loop, hence private(j), otherwise j gets shared across all threads, leading to an inaccurate result.
Your loop is executed 640,000 times, which is not much for modern CPUs that clock 3 GHz+; try something around N = 5000, which is 25M iterations.
I tried to write this code
float* theArray; // the array to find the minimum value
int index, i;
float thisValue, min;
index = 0;
min = theArray[0];
#pragma omp parallel for reduction(min:min_dist)
for (i=1; i<size; i++) {
thisValue = theArray[i];
if (thisValue < min)
{ /* find the min and its array index */
min = thisValue;
index = i;
}
}
return(index);
However, this one is not outputting correct answers. It seems the min is OK, but the correct index has been destroyed by the threads.
I also tried some approaches suggested on the Internet and here (using parallel for for the outer loop and critical for the final comparison), but this causes a speed drop rather than a speedup.
What should I do to make both the min value and its index correct? Thanks!
I don't know of an elegant way to do a minimum reduction and save an index. I do this by finding the local minimum and index for each thread and then the global minimum and index in a critical section.
index = 0;
min = theArray[0];
#pragma omp parallel
{
int index_local = index;
float min_local = min;
#pragma omp for nowait
for (i = 1; i < size; i++) {
if (theArray[i] < min_local) {
min_local = theArray[i];
index_local = i;
}
}
#pragma omp critical
{
if (min_local < min) {
min = min_local;
index = index_local;
}
}
}
With OpenMP 4.0 it's possible to use user-defined reductions. A user-defined minimum reduction can be defined like this
struct Compare { float val; size_t index; };
#pragma omp declare reduction(minimum : struct Compare : omp_out = omp_in.val < omp_out.val ? omp_in : omp_out) initializer(omp_priv = omp_orig)
Then the reduction can be done like this
struct Compare min;
min.val = theArray[0];
min.index = 0;
#pragma omp parallel for reduction(minimum:min)
for(int i = 1; i<size; i++) {
if(theArray[i]<min.val) {
min.val = theArray[i];
min.index = i;
}
}
That works for C and C++. User-defined reductions have other advantages besides simplified code. There are multiple algorithms for doing reductions; for example, the merging can be done in O(number of threads) or O(log(number of threads)). The first solution I gave does this in O(number of threads), whereas using user-defined reductions lets OpenMP choose the algorithm.
Basic Idea
This can be accomplished without any parallelization-breaking critical or atomic sections by creating a custom reduction. Basically, define an object that stores both the index and value, and then create a function that compares two of these objects by the value only, not the index.
Details
An object to store an index and value together:
typedef std::pair<unsigned int, float> IndexValuePair;
You can access the index by accessing the first property and the value by accessing the second property, i.e.,
IndexValuePair obj(0, 2.345);
unsigned int ix = obj.first; // 0
float val = obj.second; // 2.345
Define a function that picks the smaller of two IndexValuePair objects:
IndexValuePair myMin(IndexValuePair a, IndexValuePair b){
return a.second < b.second ? a : b;
}
Then, construct a custom reduction following the guidelines in the OpenMP documentation:
#pragma omp declare reduction \
(minPair:IndexValuePair:omp_out=myMin(omp_out, omp_in)) \
initializer(omp_priv = IndexValuePair(0, 1000))
In this case, I've chosen to initialize the index to 0 and the value to 1000. The value should be initialized to some number larger than the largest value you expect to sort.
Functional Example
Finally, combine all these pieces with the parallel for loop!
// Compile with g++ -std=c++11 -fopenmp demo.cpp
#include <iostream>
#include <utility>
#include <vector>
typedef std::pair<unsigned int, float> IndexValuePair;
IndexValuePair myMin(IndexValuePair a, IndexValuePair b){
return a.second < b.second ? a : b;
}
int main(){
std::vector<float> vals {10, 4, 6, 2, 8, 0, -1, 2, 3, 4, 4, 8};
unsigned int i;
IndexValuePair minValueIndex(0, 1000);
#pragma omp declare reduction \
(minPair:IndexValuePair:omp_out=myMin(omp_out, omp_in)) \
initializer(omp_priv = IndexValuePair(0, 1000))
#pragma omp parallel for reduction(minPair:minValueIndex)
for(i = 0; i < vals.size(); i++){
if(vals[i] < minValueIndex.second){
minValueIndex.first = i;
minValueIndex.second = vals[i];
}
}
std::cout << "minimum value = " << minValueIndex.second << std::endl; // Should be -1
std::cout << "index = " << minValueIndex.first << std::endl; // Should be 6
return EXIT_SUCCESS;
}
Because you're not only trying to find the minimal value (reduction(min:___)) but also to retain its index, you need to make the check critical. This can significantly slow down the loop (as reported). In general, make sure that there is enough work so you don't run into overhead, as in this question. An alternative would be to have each thread find the minimum and its index and save them to a unique variable, and have the master thread do a final check on those, as in the following program.
#include <iostream>
#include <vector>
#include <ctime>
#include <random>
#include <omp.h>
using std::cout;
using std::vector;
void initializeVector(vector<double>& v)
{
std::mt19937 generator(time(NULL));
std::uniform_real_distribution<double> dis(0.0, 1.0);
v.resize(100000000);
for(int i = 0; i < v.size(); i++)
{
v[i] = dis(generator);
}
}
int main()
{
vector<double> vec;
initializeVector(vec);
float minVal = vec[0];
int minInd = 0;
int startTime = clock();
for(int i = 1; i < vec.size(); i++)
{
if(vec[i] < minVal)
{
minVal = vec[i];
minInd = i;
}
}
int elapsedTime1 = clock() - startTime;
// Change the number of threads accordingly
vector<float> threadRes(4, std::numeric_limits<float>::max());
vector<int> threadInd(4);
startTime = clock();
#pragma omp parallel for
for(int i = 0; i < vec.size(); i++)
{
{
if(vec[i] < threadRes[omp_get_thread_num()])
{
threadRes[omp_get_thread_num()] = vec[i];
threadInd[omp_get_thread_num()] = i;
}
}
}
float minVal2 = threadRes[0];
int minInd2 = threadInd[0];
for(int i = 1; i < threadRes.size(); i++)
{
if(threadRes[i] < minVal2)
{
minVal2 = threadRes[i];
minInd2 = threadInd[i];
}
}
int elapsedTime2 = clock() - startTime;
cout << "Min " << minVal << " at " << minInd << " took " << elapsedTime1 << std::endl;
cout << "Min " << minVal2 << " at " << minInd2 << " took " << elapsedTime2 << std::endl;
}
Please note that with optimizations on and nothing else to be done in the loop, the serial version seems to remain king. With optimizations turned off, OMP gains the upper hand.
P.S. you wrote reduction(min:min_dist) and then proceeded to use min instead of min_dist.
Actually, we can use the omp critical directive to make only one thread at a time run the code inside the critical region. So only one thread can run it, and the index value won't be destroyed by other threads.
About the omp critical directive:
The omp critical directive identifies a section of code that must be executed by a single thread at a time.
This code solves your issue:
#include <stdio.h>
#include <omp.h>
int main() {
int i;
int arr[10] = {11,42,53,64,55,46,47, 68, 59, 510};
int index;
float min;
index = 0;
min = arr[0];
int size=10;
#pragma omp parallel for
for (i=1; i<size; i++) {
float thisValue = arr[i]; /* private to each thread */
#pragma omp critical
if (thisValue < min)
{ /* find the min and its array index */
min = thisValue;
index = i;
}
}
printf("min:%d index:%d",min,index);
return 0;
}
This question has been asked a few times on here, but I am interested in a particular case. Is it possible to have the size of the passed array defined by an additional argument?
As an example, let's say I want a function to print a 2D array. However, the array may not have the same dimensions every time. It would be ideal if I could have additional arguments define the size of that array. I am aware that I could easily switch out the n for a number here as needed, but if I have more complex functions with separate header files, it seems silly to go and edit the header files every time a differently sized array comes along. The following results in the error use of parameter 'n' outside function body..., which I understand, but I would like to find some workaround. I also tried with g++ -std=c++11, but I still get the same error.
#include <iostream>
using namespace std;
void printArray(int n, int A[][n], int m) {
for(int i=0; i < m; i++){
for(int j=0; j<n; j++) {
cout << A[i][j] << " ";
}
cout << endl;
}
}
int main() {
int A[][3] = {
{1,2,3},
{4,5,6},
{7,8,9},
{10,11,12}
};
printArray(3, A, 4);
return 0;
}
Supposedly, this can be done with C99 and also mentioned in this question but I cannot figure out how with C++.
This works:
template<size_t N, size_t M>
void printArray( int (&arr)[M][N] ) {
    for(size_t i=0; i < M; i++){
        for(size_t j=0; j < N; j++) {
            std::cout << arr[i][j] << " ";
        }
        std::cout << std::endl;
    }
}
if you are willing to put the code in a header file. As a bonus, it deduces N and M for you.
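For example, the call site then reduces to this (sketch); both dimensions are deduced from the type of A, so no size arguments are needed:
int A[4][3] = { {1,2,3}, {4,5,6}, {7,8,9}, {10,11,12} };
printArray(A); // prints 4 rows of 3 columns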
When I use OpenMP with reduction(+ : sum) and without a function call, the OpenMP version works fine.
#include <iostream>
#include <omp.h>
using namespace std;
int sum = 0;
void summation()
{
sum = sum + 1;
}
int main()
{
int i,sum;
#pragma omp parallel for reduction (+ : sum)
for(i = 0; i < 1000000000; i++)
summation();
#pragma omp parallel for reduction (+ : sum)
for(i = 0; i < 1000000000; i++)
summation();
#pragma omp parallel for reduction (+ : sum)
for(i = 0; i < 1000000000; i++)
summation();
std::cerr << "Sum is=" << sum << std::endl;
}
But when I call the function summation on a global variable, the OpenMP version takes even more time than the sequential version.
I would like to know the reason for this and the changes that should be made.
The summation function doesn't use the variable that you are reducing: it updates the global sum, while the reduction applies to the local sum in main. Fix it by passing the reduction variable in:
#include <iostream>
#include <omp.h>
void summation(int& sum) { sum++; }
int main()
{
int sum = 0;
#pragma omp parallel for reduction (+ : sum)
for(int i = 0; i < 1000000000; ++i)
summation(sum);
std::cerr << "Sum is=" << sum << '\n';
}
The time taken to synchronize access to this one variable will be way in excess of what you gain by using multiple cores; they will all be endlessly waiting on each other, because there is only one variable and only one core can access it at a time. This design is not capable of concurrency, and all the synchronization you're paying for will just increase the run-time.
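To make that concrete: the only way to keep a single shared counter correct is to serialize every update, for example with an atomic (sketch):
// Correct but very slow: every increment is serialized, so the threads mostly
// wait on each other instead of computing.
void summation_atomic(int& sum)
{
    #pragma omp atomic
    sum = sum + 1;
}
With reduction(+ : sum), by contrast, each thread adds into its own private copy and the copies are combined only once at the end, which is why the fixed version above scales.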