parallelization and synchronization with omp - c++

I'm trying to write the equivalent of this code using omp parallel and omp critical to synchronize it:
std::vector<int> randValueCounter(int n, int listSize) {
    std::vector<int> list1(listSize, 0);
    for (int i = 0; i < n; ++i) {
        int rand = randomVal();
        if (rand < listSize) {  // rand == listSize would be out of bounds
            ++list1[rand];
        }
    }
    return list1;
}
My attempt at parallelization + synchronization using OMP parallel and OMP critical:
std::vector<int> randValueCounter2(int n, int listSize) {
    std::vector<int> list1(listSize, 0);
    #pragma omp parallel for
    for (int i = 0; i < n; ++i) {
        int rand = randomVal();
        #pragma omp critical
        {
            if (rand < listSize) {  // rand == listSize would be out of bounds
                ++list1[rand];
            }
        }
    }
    return list1;
}
I read that randValueCounter2 will have some overhead, but I was wondering: would it, in this case, accumulate the same result as the first function?
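Functionally the two should match, assuming randomVal() is thread-safe: the critical section serializes every read-modify-write, so the same multiset of increments is applied either way. The cost is that every iteration pays for the critical section. A common way to cut that overhead is to give each thread its own local histogram and merge once per thread at the end; a minimal sketch of that idea (not from the question; randomVal() is the same hypothetical function used above):

#include <omp.h>
#include <vector>

int randomVal();  // hypothetical, as in the question

std::vector<int> randValueCounter3(int n, int listSize) {
    std::vector<int> list1(listSize, 0);
    #pragma omp parallel
    {
        std::vector<int> local(listSize, 0);  // private histogram for this thread
        #pragma omp for nowait
        for (int i = 0; i < n; ++i) {
            int rand = randomVal();
            if (rand < listSize) {
                ++local[rand];
            }
        }
        // merge once per thread instead of once per iteration
        #pragma omp critical
        for (int k = 0; k < listSize; ++k) {
            list1[k] += local[k];
        }
    }
    return list1;
}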

Related

GPU array addition using OpenMP

I am trying out OpenMP offloading with an NVIDIA GPU, and I am trying to do some array calculations with it in C++.
Right now my output is not what I expect, as I am new to offloading calculations with OpenMP. I would appreciate it if someone could point me in the right direction.
Code Snippet:
#include <omp.h>
#include <iostream>
using namespace std;

int main(){
    int totalSum, ompSum;
    const int N = 1000;
    int array[N];
    for (int i=0; i<N; i++){
        array[i]=i;
    }
    #pragma omp target
    {
        #pragma omp parallal private(ompSum) shared(totalSum)
        {
            ompSum=0;
            #pragma omp parallel for
            for (int i=0; i<N; i++){
                ompSum += array[i];
            }
            #pragma omp critical
            totalSum += ompSum;
        }
        printf ( "Calculated sum should be %d but is %d\n", N*(N-1)/2, totalSum );
    }
    return 0;
}
Right now, I know that the sum should come out to 499500, but my machine is outputting extremely large numbers that are also negative.
You have some typos in the OpenMP constructs, namely:
1. #pragma omp parallal -> #pragma omp parallel;
2. #pragma omp parallel for -> #pragma omp for.
Regarding 2., you do not need another parallel because you are already inside a parallel region.
Try the following:
#include <omp.h>
#include <cstdio>
using namespace std;

int main(){
    int totalSum = 0, ompSum = 0;
    const int N = 1000;
    int array[N];
    for (int i=0; i<N; i++){
        array[i]=i;
    }
    #pragma omp target
    {
        #pragma omp parallel private(ompSum) shared(totalSum)
        {
            ompSum=0;
            #pragma omp for
            for (int i=0; i<N; i++){
                ompSum += array[i];
            }
            #pragma omp critical
            totalSum += ompSum;
        }
        printf ( "Calculated sum should be %d but is %d\n", N*(N-1)/2, totalSum );
    }
    return 0;
}
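For reference (not part of the answer above), the same computation can be written with a single combined construct and a reduction clause, which also distributes the loop across the GPU's teams. A minimal sketch, assuming a toolchain with OpenMP 4.5+ offloading support:

#include <cstdio>

int main() {
    const int N = 1000;
    int array[N];
    for (int i = 0; i < N; i++) {
        array[i] = i;
    }

    int totalSum = 0;
    // Combined construct: spawns teams of threads on the device, splits
    // the loop among them, and reduces totalSum across all of them; the
    // map clauses move the data to and from the GPU.
    #pragma omp target teams distribute parallel for \
            reduction(+ : totalSum) map(to : array[0:N]) map(tofrom : totalSum)
    for (int i = 0; i < N; i++) {
        totalSum += array[i];
    }

    printf("Calculated sum should be %d and is %d\n", N * (N - 1) / 2, totalSum);
    return 0;
}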

openmp two consecutive loops, problem with reduction clause

There are two consecutive loops, and there is a reduction clause in the second loop.
#pragma omp parallel
{
    #pragma omp for
    for (size_t i = 0; i < N; ++i)
    {
    }

    #pragma omp barrier

    #pragma omp for reduction(+ : sumj)
    for (size_t i = 0; i < N; ++i)
    {
        sumj = 0.0;
        for (size_t j = 0; j < adjList[i].size(); ++j)
        {
            sumj += 0;
        }
        Jac[i, i] = sumj;
    }
}
To reduce the thread-creation overhead I want to keep the threads and reuse them in the second loop, but I get the following error:
lib.cpp:131:17: error: reduction variable ‘sumj’ is private in outer context
#pragma omp for reduction(+ \
^~~
How can I fix that?
I'm not sure what you are trying to do, but it seems that something like this would do what you expect:
#pragma omp parallel
{
    #pragma omp for
    for (size_t i = 0; i < N; ++i)
    {
    }

    // note: this explicit barrier is redundant, since "omp for" already
    // ends with an implicit barrier
    #pragma omp barrier

    #pragma omp for
    for (size_t i = 0; i < N; ++i)
    {
        // declared inside the loop, sumj is private to each iteration,
        // so no reduction clause is needed
        double sumj = 0.0;
        for (size_t j = 0; j < adjList[i].size(); ++j)
        {
            sumj += 0;
        }
        // "Jac[i, i]" in the original invokes the comma operator and means
        // Jac[i]; Jac[i][i] is presumably what was intended
        Jac[i][i] = sumj;
    }
}
A reduction would be useful in the case of an "omp for" on the interior loop.
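As for the quoted error itself: a reduction variable must be shareable in the enclosing context, so declaring sumj before the #pragma omp parallel would make the clause legal, but since sumj is reset on every outer iteration, a reduction over the outer loop would still compute the wrong thing. What the last remark hints at is a reduction over the inner loop, roughly like this (a sketch; the types of adjList and Jac are assumptions, and this only pays off when the rows are long):

#include <vector>

void fillDiagonal(size_t N,
                  const std::vector<std::vector<double>>& adjList,
                  std::vector<std::vector<double>>& Jac)
{
    for (size_t i = 0; i < N; ++i)
    {
        double sumj = 0.0;
        // sumj is shared relative to this parallel construct,
        // so the reduction clause is legal here
        #pragma omp parallel for reduction(+ : sumj)
        for (size_t j = 0; j < adjList[i].size(); ++j)
        {
            sumj += 0;  // placeholder body, as in the question
        }
        Jac[i][i] = sumj;
    }
}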

Optimizing parallel nested loop with inner loop dependent on outer loop using openmp

How do I get better optimization for this piece of code using OpenMP?
The number of threads is 6, but I can't get better performance.
I have tried different scheduling options, but I can't get it to run any faster.
Is there a way of getting a better result?
int length = 40000;
int idx;
long *result = new long[ length ];
#pragma omp parallel for private(idx) schedule(dynamic)
for ( int i = 0; i < length; i++ ) {
    for ( int j = 0; j < i; j++ ) {
        idx = (int)( someCalculations( i, j ) );
        #pragma omp atomic
        result[ idx ] += 1;
    }
}
This piece of code does reduce the calculation time, but I still need a better result.
Thanks in advance.
Since OpenMP 4.0 you can write your own reduction.
The idea is:
in the for loop, you tell the compiler to reduce the array you modify in each iteration;
since OpenMP doesn't know how to reduce such an array, you must write your own adder my_add, which simply sums two arrays;
you tell OpenMP how to use it in your reduction declaration (myred).
#include <stdio.h>
#include <stdlib.h>

#define LEN 40000

int someCalculations(int i, int j)
{
    return i * j % 40000;
}

/* simple adder: sums y into x, element by element */
long *my_add(long *x, long *y)
{
    int i;
    #pragma omp parallel for private(i)
    for (i = 0; i < LEN; ++i)
    {
        x[i] += y[i];
    }
    free(y);
    return x;
}

/* reduction declaration:
   name, type, operation to be performed, initializer */
#pragma omp declare reduction(myred : long* : omp_out=my_add(omp_out,omp_in)) \
    initializer(omp_priv=calloc(LEN, sizeof(long)))

int main(void)
{
    int i, j;
    long *result = calloc(LEN, sizeof *result);
    // tell omp how to use it
    #pragma omp parallel for reduction(myred:result) private(i, j)
    for (i = 0; i < LEN; i++) {
        for (j = 0; j < i; j++) {
            int idx = someCalculations(i, j);
            result[idx] += 1;
        }
    }
    // simple display; I store it in a file and compare
    // result files with/without openmp to be sure it's correct...
    for (i = 0; i < LEN; ++i) {
        printf("%ld\n", result[i]);
    }
    return 0;
}
Without -fopenmp: real 0m3.727s
With -fopenmp: real 0m0.835s
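As a side note (my addition, not part of the answer above): if your compiler supports OpenMP 4.5 or later, you can skip the hand-written reduction entirely, because array sections may appear directly in a reduction clause. A minimal sketch of the same computation:

#include <cstdio>
#include <vector>

constexpr int LEN = 40000;

int main() {
    std::vector<long> result(LEN, 0);
    long *res = result.data();  // the reduction clause needs an array section

    // OpenMP 4.5+: each thread gets a private zero-initialized copy of
    // res[0..LEN-1] that the runtime sums back into res after the loop.
    #pragma omp parallel for reduction(+ : res[:LEN])
    for (int i = 0; i < LEN; i++) {
        for (int j = 0; j < i; j++) {
            res[i * j % LEN] += 1;  // same someCalculations as above, inlined
        }
    }

    std::printf("%ld\n", res[0]);
    return 0;
}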

OpenMP Handmade reduction directive

I'm working on a factorial function. I have to write a parallel version of it using OpenMP.
double sequentialFactorial(const int N) {
    double result = 1;
    for(int i = 1; i <= N; i++) {
        result *= i;
    }
    return result;
}
It is well known that this algorithm can be efficiently parallelized using the reduction technique.
I'm aware of the existence of the reduction clause (standard §§ 2.15.3.6).
double parallelAutomaticFactorial(const int N) {
    double result = 1;
    #pragma omp parallel for reduction(*:result)
    for (int i=1; i <= N; i++)
        result *= i;
    return result;
}
However, I want to try to implement the reduction technique "by hand".
double parallelHandmadeFactorial(const int N) {
    // maximum number of threads
    const int N_THREADS = omp_get_max_threads();

    // table of partial results
    double* partial = new double[N_THREADS];
    for(int i = 0; i < N_THREADS; i++) {
        partial[i] = 1;
    }

    // reduction technique
    #pragma omp parallel for
    for(int i = 1; i <= N; i++) {
        int thread_index = omp_get_thread_num();
        partial[thread_index] *= i;
    }

    // fold results
    double result = 1;
    for(int i = 0; i < N_THREADS; i++) {
        result *= partial[i];
    }
    delete[] partial;  // array form of delete; plain "delete" is undefined behavior here
    return result;
}
I expect the performance of the last two snippets to be very similar, and better than the first one. However, the average performance is:
Sequential Factorial 3500 ms
Parallel Handmade Factorial 6100 ms
Parallel Automatic Factorial 600 ms
Am I missing something?
Thanks to @Gilles and @P.W, this code works as expected:
double parallelNoWaitFactorial(const int N) {
    double result = 1;
    #pragma omp parallel
    {
        double my_local_result = 1;
        // removing nowait does not change the performance
        #pragma omp for nowait
        for(int i = 1; i <= N; i++)
            my_local_result *= i;
        #pragma omp atomic
        result *= my_local_result;
    }
    return result;
}
If array elements happen to share a cache line, this leads to false sharing, which in turn further degrades performance.
To avoid this:
Use a private variable double partial instead of the double array partial.
Use the partial result of each thread to compute the final result in a critical region.
This final result should be a variable that is not private to the parallel region.
The critical region will look like this:
#pragma omp critical
result *= partial;
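Another common way to avoid false sharing while keeping a per-thread table (my sketch, not part of the answer above) is to pad each slot so that every thread writes to its own cache line. A minimal sketch, assuming 64-byte cache lines and C++17 (for over-aligned allocation):

#include <omp.h>
#include <vector>

// Each slot occupies its own (assumed 64-byte) cache line, so threads
// updating adjacent slots no longer invalidate each other's lines.
struct alignas(64) PaddedDouble {
    double value = 1.0;
};

double parallelPaddedFactorial(const int N) {
    const int N_THREADS = omp_get_max_threads();
    std::vector<PaddedDouble> partial(N_THREADS);

    #pragma omp parallel for
    for (int i = 1; i <= N; i++) {
        partial[omp_get_thread_num()].value *= i;
    }

    // fold the padded partial results
    double result = 1;
    for (int i = 0; i < N_THREADS; i++) {
        result *= partial[i].value;
    }
    return result;
}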

OMP for nested loops in VS2017

I am new to using OpenMP 2.0 with MSVC++ 2017. I'm working with a big data structure (referenced as bigMap), so I need to distribute the workload as well as possible when iterating over it. My attempt at doing so is:
std::map<int, std::set<std::pair<double, double>>> bigMap;
///thousands of values are added here
int k;
int max_threads = omp_get_max_threads();
omp_set_num_threads(max_threads);
#pragma omp parallel default(none) private(k)
{
    #pragma omp for
    for(k = kMax; k > kMin; k--)
    {
        for (auto& myPair : bigMap[k])
        {
            int pthread = omp_get_thread_num();
            std::cout << "Thread " << pthread << std::endl;
            for (auto& item : myPair)
            {
                #pragma omp critical
                myMap[k-1].insert(std::make_pair(item, 0));
            }
        }
    }
}
The output for "pthread" is always "0" and the execution time is the same as for a single thread (so I assume no new threads are being created).
Why doesn't this code work, and which OMP directives / clauses / sections are wrong?
UPDATE:
OMP is now working, but the code below is not working as expected:
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < map_size; ++i) {
    #pragma omp critical
    bigMap[i] = std::set<int>();
}
bigMap[1] = { 10, 100, 1000 };

int i;
#pragma omp parallel for schedule(static) num_threads(8)
for (i = thread_num; i < map_size; i += thread_count)
{
    for (auto it = bigMap[i].begin(); it != bigMap[i].end(); ++it)
    {
        int elem = *it;
        bigMap[i + 1].insert(elem);
    }
}
I expect the 3 elements from bigMap[1] to be inserted across all entries of bigMap; instead, they're inserted only once, into bigMap[2]. Why?
Little bug....
#pragma omp parallel for schedule(static,1)
for (int i = 0; i < map_size; ++i) {
    #pragma omp critical
    bigMap[i] = std::set<int>();
}
bigMap[1] = { 10, 100, 1000 };

int i;
#pragma omp parallel for schedule(static) num_threads(8)
for (i = thread_num; i < map_size; i += thread_count)
{
    // here you loop over bigMap[i], which is empty except for i==1:
    // for (auto it = bigMap[i].begin(); it != bigMap[i].end(); ++it)
    for (auto it = bigMap[1].begin(); it != bigMap[1].end(); ++it)
    {
        int elem = *it;
        bigMap[i + 1].insert(elem);
    }
}
Maybe you misunderstand what static means.
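To spell that out (my addition): schedule(static) hands each thread a fixed block of iterations that all run concurrently, so a chain where bigMap[i+1] depends on what an earlier iteration put into bigMap[i] cannot propagate. That propagation is a loop-carried dependence and has to run sequentially, for example:

#include <map>
#include <set>

int main() {
    const int map_size = 8;
    std::map<int, std::set<int>> bigMap;
    for (int i = 0; i < map_size; ++i) {
        bigMap[i] = std::set<int>();
    }
    bigMap[1] = { 10, 100, 1000 };

    // Iteration i reads what iteration i-1 wrote (a loop-carried
    // dependence), so this loop must stay sequential.
    for (int i = 1; i + 1 < map_size; ++i) {
        for (int elem : bigMap[i]) {
            bigMap[i + 1].insert(elem);
        }
    }
    return 0;
}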