I am new to OpenMP 2.0 with MSVC++ 2017. I'm working with a big data structure (referred to as bigMap), so I need to distribute the workload as evenly as possible when iterating over it. My attempt at doing so is:
// Map from an integer key to a set of (double, double) pairs.
std::map<int, std::set<std::pair<double, double>>> bigMap;
///thousands of values are added here
int k;
// Ask the runtime for its maximum thread count and request exactly that many.
int max_threads = omp_get_max_threads();
omp_set_num_threads(max_threads);
// NOTE(review): default(none) requires every variable referenced inside the
// region to be named in a data-sharing clause, but only k is listed here;
// bigMap, myMap, kMin and kMax are used below without one.
#pragma omp parallel default(none) private(k)
{
// Work-share the k iterations (kMax down to kMin + 1) across the team.
#pragma omp for
for(k = kMax; k > kMin; k--)
{
// NOTE(review): std::map::operator[] inserts an entry when k is absent, so
// this lookup can modify the shared map and it is not inside the critical section.
for (auto& myPair : bigMap[k])
{
// Diagnostic output: which thread is handling this element.
int pthread = omp_get_thread_num();
std::cout << "Thread " << pthread << std::endl;
// NOTE(review): per bigMap's declared type, myPair is a std::pair<double, double>,
// which is not a range; a range-based for over it does not compile.
for (auto& item : myPair)
{
// Insertions into myMap are serialized because every iteration writes to it.
#pragma omp critical
myMap[k-1].insert(std::make_pair(item, 0));
}
}
}
// NOTE(review): the brace that opens the parallel region is not closed in this snippet.
The output for "pthread" is always "0" and the execution time is the same as for the single-threaded version (so I assume no new threads are being created).
Why doesn't this code work, and which OMP directives / clauses / sections are wrong?
UPDATE:
OMP is now working, but the code below is not working as expected:
// Step 1: create an empty set for every key 0 .. map_size-1. The assignment is
// wrapped in a critical section, so the insertions run one at a time.
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < map_size; ++i) {
#pragma omp critical
bigMap[i] = std::set<int>();
}
// Step 2: only key 1 receives any elements.
bigMap[1] = { 10, 100, 1000 };
int i;
// Step 3: copy the contents of bigMap[i] into bigMap[i + 1].
// NOTE(review): iteration i + 1 reads what iteration i writes, so the
// iterations are not independent; when they run on different threads a later
// key may be read while it is still empty. The accesses to bigMap are also
// unsynchronized here.
// NOTE(review): thread_num and thread_count are defined outside this snippet —
// confirm they are loop-invariant, as the parallel for requires.
#pragma omp parallel for schedule(static) num_threads(8)
for (i = thread_num; i < map_size; i += thread_count)
{
for (auto it = bigMap[i].begin(); it != bigMap[i].end(); ++it)
{
int elem = *it;
bigMap[i + 1].insert(elem);
}
}
I expect the 3 elements from bigMap[1] to be inserted into all entries of bigMap; instead, they are inserted only once, into bigMap[2]. Why?
Little bug....
// Create an empty set for every key 0 .. map_size-1 (serialized by the critical section).
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < map_size; ++i) {
#pragma omp critical
bigMap[i] = std::set<int>();
}
// Only key 1 holds any elements.
bigMap[1] = { 10, 100, 1000 };
int i;
// NOTE(review): thread_num and thread_count are defined outside this snippet —
// confirm they are loop-invariant, as the parallel for requires.
#pragma omp parallel for schedule(static) num_threads(8)
for (i = thread_num; i < map_size; i += thread_count)
{
// The original loop iterated over bigMap[i], which is empty except for i == 1.
//for (auto it = bigMap[i].begin(); it != bigMap[i].end(); ++it)
// Always read from bigMap[1], the only populated entry, so every iteration
// copies the same three elements.
for (auto it = bigMap[1].begin(); it != bigMap[1].end(); ++it)
{
int elem = *it;
// NOTE(review): for i == map_size - 1 the key i + 1 was never created above,
// so operator[] inserts a new map entry here without synchronization — verify.
bigMap[i + 1].insert(elem);
}
}
Maybe you misunderstand what schedule(static) means.
Related
I'm trying to write the equivalent of this code using omp parallel, with omp critical to synchronize it:
/// Draws `n` values from randomVal() and counts how often each value in
/// [0, listSize) occurs.
///
/// @param n        number of random draws to perform
/// @param listSize number of counters; valid indices are 0 .. listSize-1
/// @return a vector of `listSize` counters, where element v is the number of
///         draws that were equal to v; draws outside [0, listSize) are ignored
std::vector<int> randValueCounter(int n, int listSize) {
    std::vector<int> list1(listSize, 0);
    for (int i = 0; i < n; ++i) {
        int value = randomVal();
        // list1 has indices 0 .. listSize-1 only. The previous test
        // (`rand <= listSize`) let value == listSize, and any negative value,
        // index past the ends of the vector.
        if (value >= 0 && value < listSize) {
            ++list1[value];
        }
    }
    return list1;
}
My attempt at parallelization + synchronization using OMP parallel and OMP critical:
/// Parallel counterpart of randValueCounter: draws `n` values from randomVal()
/// across an OpenMP team and counts how often each value in [0, listSize) occurs.
///
/// @param n        number of random draws to perform
/// @param listSize number of counters; valid indices are 0 .. listSize-1
/// @return a vector of `listSize` counters, where element v is the number of
///         draws that were equal to v; draws outside [0, listSize) are ignored
std::vector<int> randValueCounter2(int n, int listSize) {
    std::vector<int> list1(listSize, 0);
#pragma omp parallel for
    for (int i = 0; i < n; ++i) {
        // NOTE(review): randomVal() is called concurrently from several
        // threads here — confirm that it is thread-safe.
        int value = randomVal();
        // The range check only reads thread-local / read-only data, so it can
        // stay outside the critical section. `value <= listSize` previously
        // allowed an out-of-bounds write at index listSize.
        if (value >= 0 && value < listSize) {
            // Two threads may draw the same value; serialize the increment so
            // no update to a shared counter is lost.
#pragma omp critical
            {
                ++list1[value];
            }
        }
    }
    return list1;
}
I read that randValueCounter2 will have some overhead, but I was wondering whether, in this case, it would produce the same result as the first function?
There are two consecutive loops, and there is a reduction clause on the second loop.
// NOTE(review): "opm" is a misspelling of "omp"; as written this line is not
// an OpenMP directive, so the braces below are just an ordinary block.
#pragma opm parallel
{
// First work-shared loop (body omitted in this snippet).
#pragma omp for
for (size_t i = 0; i < N; ++i)
{
}
#pragma omp barrier
// Second work-shared loop, declared with a + reduction on sumj.
// NOTE(review): sumj's declaration is not shown; the reported compiler error
// suggests it is private in the enclosing parallel region — confirm.
#pragma omp for reduction(+ \
: sumj)
for (size_t i = 0; i < N; ++i)
{
// sumj is reset for every i and used as a per-row accumulator.
sumj = 0.0;
for (size_t j = 0; j < adjList[i].size(); ++j)
{
sumj += 0;
}
// NOTE(review): for built-in subscripting `Jac[i, i]` parses as the comma
// expression `Jac[(i, i)]`, i.e. `Jac[i]` — confirm against Jac's type.
Jac[i, i] = sumj;
}
}
To reduce the thread-creation overhead I want to keep the threads alive and reuse them in the second loop, but I get the following error:
lib.cpp:131:17: error: reduction variable ‘sumj’ is private in outer context
#pragma omp for reduction(+ \
^~~
how to fix that?
I'm not sure what you are trying to do, but it seems that something like this would do what you expect:
// One parallel region shared by both loops, so the thread team is created once.
#pragma omp parallel
{
// First work-shared loop (body omitted in this snippet).
#pragma omp for
for (size_t i = 0; i < N; ++i)
{
}
// NOTE(review): a work-sharing `for` without `nowait` already ends with an
// implicit barrier, so this explicit one looks redundant — confirm.
#pragma omp barrier
// Second work-shared loop: no reduction clause is used.
#pragma omp for
for (size_t i = 0; i < N; ++i)
{
// sumj is declared inside the loop body, so every iteration has its own
// accumulator and nothing is shared between threads.
double sumj = 0.0;
for (size_t j = 0; j < adjList[i].size(); ++j)
{
sumj += 0;
}
// NOTE(review): for built-in subscripting `Jac[i, i]` parses as the comma
// expression `Jac[(i, i)]`, i.e. `Jac[i]` — confirm against Jac's type.
Jac[i, i] = sumj;
}
}
A reduction would only be useful if the "omp for" were applied to the interior (j) loop, so that several threads accumulated into the same sumj.
I'm trying to learn OpenMP, but the professor moved on to a different subject and I feel like I haven't learned a whole lot (or understood).
After looking at some solved questions here on SO I wrote this bit of code:
Working code now looks like this:
// Runs NUM_ITERATIONS passes over the N x M matrices. Each pass fills B from A
// (diagonal entries doubled, all other entries tripled) with the rows shared
// among the OpenMP threads, then exchanges the A and B pointers so the next
// pass reads the values that were just written.
void many_iterations()
{
    for (int pass = 0; pass < NUM_ITERATIONS; ++pass)
    {
        // A fresh team is started for every pass; the row index is the
        // work-shared loop variable and the column index is local to each thread.
#pragma omp parallel for
        for (int row = 0; row < N; ++row)
        {
            for (int col = 0; col < M; ++col)
            {
                if (row != col)
                    B[row][col] = A[row][col] * 3;
                else
                    B[row][col] = A[row][col] * 2;
            }
        }

        // Back in sequential code: swap the roles of the two buffers.
        int **previous = A;
        A = B;
        B = previous;
    }
}
I also wrote a serial version (without the #pragma omp bits) and noticed that this version does not actually properly work (outputing A is different between the serial and this version). I then managed to change the two inner for loops to this working bit (correct output as far as I can tell):
for (index = 0; index < N * M; index++)
{
int i = index / M, j = index % M;
// rest of code here
This one does work, but I ran into a problem: running on two threads, it is just as fast as the serial version (with 2 inner fors) and when I tried running this with only one thread the execution time was a lot slower.
Reading online I understood that the parallel section should somehow start before the main for so that it reduces the overhead, but again, my output (A) is wrong.
So my issues are:
How do I set #pragma omp parallel before the first for without ruining the code?
Why is the serial version equal to the 2-thread version of the code with collapsed for loops?
How should I make the code actually more efficient when running on multiple threads?
As a side note, I tried running the serial version with collapsed for loops and I got it to run a lot slower (just like the "parallel" version with 1 thread).
Edit: Trying to use #pragma omp parallel before the it loop:
// Runs NUM_ITERATIONS passes over the N x M matrices inside a single parallel
// region, so the thread team is created only once. Each pass fills B from A
// (diagonal entries doubled, all other entries tripled), then one thread
// exchanges the A and B pointers for the next pass.
void many_iterations()
{
#pragma omp parallel
    {
        // The pass counter is declared inside the parallel region so that each
        // thread has its own copy. When it was declared outside, all threads
        // incremented one shared `it`, which is a data race and makes threads
        // disagree on how many passes (and work-sharing constructs) they execute.
        for (int it = 0; it < NUM_ITERATIONS; it++)
        {
            // Rows are divided among the team; i and j are per-thread.
#pragma omp for
            for (int i = 0; i < N; i++)
                for (int j = 0; j < M; j++)
                {
                    if (i == j) B[i][j] = A[i][j] * 2;
                    else B[i][j] = A[i][j] * 3;
                }
            // The implicit barrier at the end of the `for` guarantees that B is
            // completely written before the pointers are exchanged.
#pragma omp single
            {
                int **aux = A;
                A = B; B = aux;
            }
            // The implicit barrier at the end of `single` guarantees that every
            // thread sees the swapped pointers before starting the next pass.
        }
    }
}
What is wrong with my OpenMP code? It always uses only 1 thread and takes the same time as the non-parallel version.
/// Multiplies matrices A and B and returns the product in a newly allocated matrix.
///
/// @param A left operand; its ySize must equal B's xSize
/// @param B right operand
/// @return pointer to a Matrix created with `new` (A->xSize by B->ySize); the
///         function does not free it, so the caller receives ownership
template <typename T>
Matrix<T>* Matrix<T>::OMPMultiplication(Matrix<T>* A, Matrix<T>* B){
if(A->ySize != B->xSize)
// NOTE(review): a bare `throw;` with no exception being handled calls
// std::terminate rather than reporting a dimension mismatch.
throw;
Matrix<T>* C = new Matrix<T>(A->xSize, B->ySize);
sizeType i, j, k;
// NOTE(review): `element` is never used below.
T element;
// NOTE(review): `parallel for` must be followed directly by a for loop, not by
// a brace block; the nested `omp for` below then repeats the work-sharing.
#pragma omp parallel for private(i, j)
{
#pragma omp for private(i, j)
// NOTE(review): this i loop has no braces, so its body is only the cout
// statement; the j loop that follows is outside it.
for( i = 0; i < A->xSize; i++ )
cout<<"There are "<<omp_get_num_threads()<<" threads"<<endl;
// NOTE(review): k is not listed in any private clause, so it is shared
// between threads.
for(j = 0; j < B->ySize; j++){
C->matrix[i][j] = 0;
// Dot product of row i of A with column j of B.
for(k = 0; k < A->ySize; k++){
C->matrix[i][j] += A->matrix[i][k] * B->matrix[k][j];
}
}
}
return C;
}
First of all, you are missing some {} for the i loop and the variable k needs to be made private to each iteration of the i loop. However, I think you have also mixed up how the parallel and for pragmas are combined. To successfully parallelize a for loop, you need to put it inside a parallel pragma and then inside a for pragma. To do this you could either change your code into
// Start the thread team; every thread gets its own copies of the three loop indices.
#pragma omp parallel private(i, j, k)
{
// Divide the rows of the result (the i loop) among the threads of the team.
#pragma omp for
for( i = 0; i < A->xSize; i++ ) {
// Diagnostic output, printed once per row: the size of the current team.
cout<<"There are "<<omp_get_num_threads()<<" threads"<<endl;
for(j = 0; j < B->ySize; j++) {
C->matrix[i][j] = 0;
// Dot product of row i of A with column j of B.
for(k = 0; k < A->ySize; k++){
C->matrix[i][j] += A->matrix[i][k] * B->matrix[k][j];
}
}
}
}
or make use of the combined parallel for notation
// Combined form: starts the team and work-shares the i loop in one directive.
// The loop body is elided here.
#pragma omp parallel for private(i, j, k)
for( i = 0; i < A->xSize; i++ ) {
...
}
Also, make sure you are telling OpenMP to use more than 1 thread here. This can be done both with omp_set_num_threads(<number of threads here>) and by setting environment variables like OMP_NUM_THREADS.
Hope you get it parallelized. :)
I get slightly faster result with my 4 cores using this code:
/* Accumulates c[i] += sum over j of b[j] * a[j][i], with the i iterations
 * divided among 4 threads. c[i] is only added to here, so it must hold the
 * desired starting value (normally 0) beforehand. */
omp_set_num_threads(4);
/* j is declared outside the loop, so it would be shared by default and every
 * thread would race on the same inner counter; private(j) gives each thread
 * its own copy. i is the work-shared loop variable and is private already. */
#pragma omp parallel for private(j)
for (i = 0; i < n; i++) {
    for (j = 0; j < n; j++) {
        c[i] += b[j] * a[j][i];
    }
}
Full program
#include <stdio.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>
/*
 * Times two OpenMP variants of the product c[i] = sum_j b[j] * a[j][i] on an
 * n x n matrix filled with 10 and a vector filled with 5:
 *   1. separate parallel regions with a dynamically scheduled `omp for`;
 *   2. a single combined `parallel for` on 4 threads.
 * Each timed section includes filling and printing the inputs, as before.
 * Returns 0.
 */
int main() {
    enum { MAX_N = 719 };
    /* Static storage: the matrix alone is about 2 MB, which can exceed the
     * default stack size on some platforms when declared as a local array. */
    static int a[MAX_N][MAX_N], b[MAX_N], c[MAX_N];
    int i, j, n;
    double start, stop, elapsed;

    /* ---- First method ---- */
    /* omp_get_wtime() returns wall-clock time. clock() reports processor time,
     * which on POSIX systems is summed over all threads and therefore hides
     * any speed-up gained from running in parallel. */
    start = omp_get_wtime();
    n = 100; /* at most MAX_N */
    printf("Matrix A\n");
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            a[i][j] = 10;
            printf("%d ", a[i][j]);
        }
        printf("\n");
    }
    printf("\nMatrix B\n");
    #pragma omp parallel private(i) shared(b)
    {
        #pragma omp for
        for (i = 0; i < n; ++i) {
            b[i] = 5;
            printf("%d\n", b[i]);
        }
    }
    printf("\nA * B\n");
    #pragma omp parallel private(i) shared(c)
    {
        #pragma omp for
        for (i = 0; i < n; ++i) {
            c[i] = 0;
        }
    }
    #pragma omp parallel private(i,j) shared(n,a,b,c)
    {
        #pragma omp for schedule(dynamic)
        for (i = 0; i < n; ++i) {
            for (j = 0; j < n; ++j) {
                c[i] += b[j] * a[j][i];
            }
        }
    }
    #pragma omp parallel private(i) shared(c)
    {
        #pragma omp for
        for (i = 0; i < n; ++i) {
            printf("%d\n", c[i]);
        }
    }
    stop = omp_get_wtime();
    elapsed = stop - start;
    printf("\nTime elapsed: %.5f\n", elapsed);

    /* ---- Second method ---- */
    start = omp_get_wtime();
    printf("Matrix A\n");
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            a[i][j] = 10;
            printf("%d ", a[i][j]);
        }
        printf("\n");
    }
    printf("\nMatrix B\n");
    #pragma omp parallel private(i) shared(b)
    {
        #pragma omp for
        for (i = 0; i < n; ++i) {
            b[i] = 5;
            printf("%d\n", b[i]);
        }
    }
    printf("\nA * B\n");
    /* c still holds the first method's result; clear it so this product is
     * computed from zero instead of being added on top of the previous one. */
    for (i = 0; i < n; ++i) {
        c[i] = 0;
    }
    omp_set_num_threads(4);
    /* j is declared at function scope and would be shared by default, making
     * all threads race on one inner counter; private(j) gives each its own. */
    #pragma omp parallel for private(j)
    for (i = 0; i < n; i++) {
        for (j = 0; j < n; j++) {
            c[i] += b[j] * a[j][i];
        }
    }
    stop = omp_get_wtime();
    elapsed = stop - start;
    printf("\nTime elapsed: %.5f\n", elapsed);
    return 0;
}
First method takes
Time elapsed: 0.03442
Second method
Time elapsed: 0.02630
I want to declare a parallel for within a master region, like this:
// Outer parallel region: a team of threads is started here.
#pragma omp parallel
{
// Only the master thread of the team executes this block.
#pragma omp master
{
*many functions...*
// NOTE(review): this opens a nested parallel region from within the master
// block; whether it actually gets more than one thread depends on nested
// parallelism being enabled in the runtime — confirm.
#pragma omp parallel for
for (int i = 0; i < x; ++i){
// NOTE(review): `a` is updated by every iteration without a reduction clause
// or other synchronization.
a += i;
}
}
}
This is just an example code, I have hundreds of functions that I don't want to manually add the master clause in each of them, but is this possible to do? Or is there any other way to do what I want?
// One parallel region: the sequential work runs on the master thread only,
// then the whole team shares the loop.
#pragma omp parallel
{
    // master only
#pragma omp master
    {
        *many functions...*
    }
    // `master` has no implied barrier, so without this the other threads would
    // start the loop while the master is still running the functions above.
#pragma omp barrier
    // full team: a work-sharing `for`, not a nested `parallel for`
    // reduction(+:a): every thread adds into its own copy of `a` and the
    // copies are summed at the end; a plain shared `a += i` is a data race.
#pragma omp for reduction(+:a)
    for (int i = 0; i < x; ++i) {
        a += i;
    }
}
Just declare the for outside of the master block.
Or just do the sequential actions outside of the parallel section altogether:
// Sequential work first, outside any parallel region.
*many functions...*
// Then a single combined parallel loop. reduction(+:a) gives every thread its
// own partial sum and adds them together at the end; a plain shared `a += i`
// is a data race.
#pragma omp parallel for reduction(+:a)
for (int i = 0; i < x; ++i) {
    a += i;
}