I am trying out OpenMP offloading to an NVIDIA GPU, and I am doing some array calculations with it in C++.
Right now my output is not correct, as I am new to offloading calculations with OpenMP. I would appreciate it if someone could point me in the right direction.
Code Snippet:
#include <omp.h>
#include <iostream>
using namespace std;
int main(){
int totalSum, ompSum;
const int N = 1000;
int array[N];
for (int i=0; i<N; i++){
array[i]=i;
}
#pragma omp target
{
#pragma omp parallal private(ompSum) shared(totalSum)
{
ompSum=0;
#pragma omp parallel for
for (int i=0; i<N; i++){
ompSum += array[i];
}
#pragma omp critical
totalSum += ompSum;
}
printf ( "Caculated sum should be %d but is %d\n", N*(N-1)/2, totalSum );
}
return 0;
}
Right now, I know that the sum should come out to 499500, but my machine is outputting extremely large negative numbers instead.
You have some typos in the OpenMP constructs, namely:
1. #pragma omp parallal -> #pragma omp parallel;
2. #pragma omp parallel for -> #pragma omp for
Regarding 2., you do not need the parallel because you are already inside a parallel region.
Try the following:
#include <omp.h>
#include <cstdio>
using namespace std;
int main(){
int totalSum = 0, ompSum = 0;
const int N = 1000;
int array[N];
for (int i=0; i<N; i++){
array[i]=i;
}
#pragma omp target
{
#pragma omp parallel private(ompSum) shared(totalSum)
{
ompSum=0;
#pragma omp for
for (int i=0; i<N; i++){
ompSum += array[i];
}
#pragma omp critical
totalSum += ompSum;
}
printf ( "Caculated sum should be %d but is %d\n", N*(N-1)/2, totalSum );
}
return 0;
}
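As a side note, if the goal is just the sum, a combined target construct with a reduction clause keeps the code much shorter. This is only a minimal sketch of that alternative (assuming a compiler with OpenMP 4.5+ offloading support), not something the corrected code above requires:

#include <cstdio>

int main() {
    const int N = 1000;
    int array[N];
    for (int i = 0; i < N; i++)
        array[i] = i;

    int totalSum = 0;
    // Offload the loop and let the runtime perform the reduction on the device.
    #pragma omp target teams distribute parallel for \
        map(to: array[0:N]) map(tofrom: totalSum) reduction(+:totalSum)
    for (int i = 0; i < N; i++)
        totalSum += array[i];

    printf("Calculated sum should be %d and is %d\n", N*(N-1)/2, totalSum);
    return 0;
}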
I'm trying to do the equivalent of this code with omp parallel and omp critical to synchronize it:
std::vector<int> randValueCounter(int n, int listSize) {
std::vector<int> list1(listSize, 0);
for (int i = 0; i < n; ++i) {
int rand = randomVal();
if(rand <= listSize){
++list1[rand];
}
}
return list1;
}
My attempt to use parallelization + synchronization using OMP parallel and OMP critical:
std::vector<int> randValueCounter2(int n, int listSize) {
std::vector<int> list1(listSize, 0);
#pragma omp parallel for
for (int i = 0; i < n; ++i) {
int rand = randomVal();
#pragma omp critical
{
if(rand <= listSize){
++list1[rand];
}
}
}
return list1;
}
I read that randValueCounter2 will have some overhead, but I was wondering whether, in this case, it would accumulate the same result as the first function?
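For what it's worth, a common way to keep the same counting behaviour without taking a critical section on every iteration is to let each thread fill its own local vector and merge once at the end. This is just an illustrative sketch that reuses the question's randomVal() helper (assumed to exist) and bounds-checks the index so it stays inside the vector:

#include <vector>

int randomVal(); // helper from the question, assumed to exist

std::vector<int> randValueCounter3(int n, int listSize) {
    std::vector<int> list1(listSize, 0);
    #pragma omp parallel
    {
        std::vector<int> local(listSize, 0); // per-thread counts
        #pragma omp for nowait
        for (int i = 0; i < n; ++i) {
            int rand = randomVal();
            if (rand < listSize) // keeps the index in range
                ++local[rand];
        }
        #pragma omp critical // merge once per thread, not once per iteration
        for (int j = 0; j < listSize; ++j)
            list1[j] += local[j];
    }
    return list1;
}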
There are two consecutive loops, and there is a reduction clause in the second loop.
#pragma opm parallel
{
#pragma omp for
for (size_t i = 0; i < N; ++i)
{
}
#pragma omp barrier
#pragma omp for reduction(+ \
: sumj)
for (size_t i = 0; i < N; ++i)
{
sumj = 0.0;
for (size_t j = 0; j < adjList[i].size(); ++j)
{
sumj += 0;
}
Jac[i, i] = sumj;
}
}
To reduce the thread-creation overhead, I want to keep the threads and use them in the second loop, but I get the following error:
lib.cpp:131:17: error: reduction variable ‘sumj’ is private in outer context
#pragma omp for reduction(+ \
^~~
How can I fix that?
I'm not sure what you are trying to do, but it seems that something like this would do what you expect:
#pragma omp parallel
{
#pragma omp for
for (size_t i = 0; i < N; ++i)
{
}
#pragma omp barrier
#pragma omp for
for (size_t i = 0; i < N; ++i)
{
double sumj = 0.0;
for (size_t j = 0; j < adjList[i].size(); ++j)
{
sumj += 0;
}
Jac[i, i] = sumj;
}
}
A reduction would be useful in the case of an "omp for" on the interior loop, as illustrated below.
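To make that last remark concrete, this is a minimal sketch (reusing N, adjList and Jac from the question) of the case where a reduction clause is actually needed, namely when the "omp for" is placed on the interior loop and all threads accumulate into the same sumj:

for (size_t i = 0; i < N; ++i)
{
    double sumj = 0.0;
    // The inner loop is split among the threads, so their partial
    // sums must be combined through the reduction clause.
    #pragma omp parallel for reduction(+ : sumj)
    for (size_t j = 0; j < adjList[i].size(); ++j)
    {
        sumj += 0; // placeholder contribution, as in the question
    }
    Jac[i, i] = sumj; // stored as in the question's code
}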
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <sys/time.h>
#define N 10000
int A[N][N];
int B[N][N];
int C[N][N];
int main(){
int i,j,k;
struct timeval tv1, tv2;
struct timezone tz;
double elapsed;
for (i= 0; i< N; i++){
for (j= 0; j< N; j++)
{
A[i][j] = 3;
B[i][j] = 3;
}
}
gettimeofday(&tv1, &tz);
omp_set_num_threads(4);
#pragma omp parallel default (private) shared (A,B,C,N) num_threads(4)
#pragma omp parallel for schedule(static)
for (i = 0; i < N; ++i){
for (j = 0; j < N; ++j){
C[i][j]=0;
for (k = 0; k < N; ++k){
C[i][j] += A[i][k] * B[k][j];
}
}
}
gettimeofday(&tv2, &tz);
elapsed = (double) (tv2.tv_sec-tv1.tv_sec) + (double) (tv2.tv_usec-tv1.tv_usec) * 1.e-6;
printf("elapsed time = %f seconds.\n", elapsed);
for (i= 0; i< N; i++){
for (j= 0; j< N; j++) {
printf("%d \t",C[i][j]);
}
printf("\n");
}
}
This code is not working, although I made sure that every "{" is put right! It is a matrix multiplication using the OpenMP library, and I am using Eclipse C++. Any idea of what's wrong with the code?
I get "expected declaration or statement at end of input } "
and it glows under printf("\n");
I tried to delete the whole thing that prints the matrix but it didn't work
I got the following errors on Wandbox:
prog.c: In function 'main':
prog.c:29:35: error: expected 'none' or 'shared' before 'private'
29 | #pragma omp parallel default (private) shared (A,B,C,N) num_threads(4)
| ^~~~~~~
prog.c:29:58: error: expected ')' before numeric constant
29 | #pragma omp parallel default (private) shared (A,B,C,N) num_threads(4)
| ~ ^
| )
Firstly, the OpenMP default clause doesn't have a private option.
You should remove the default clause and mark the inner-loop variables j and k as private via the private clause.
Secondly, N is defined as a macro and expands to 10000.
It is not a variable, and therefore it cannot appear in OpenMP clauses that expect variables.
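Putting both fixes together, the parallel region in the question could look roughly like this sketch (not tested on the asker's setup; i is implicitly private because it is the loop variable of the parallel for, and the two nested parallel pragmas are collapsed into one combined directive):

omp_set_num_threads(4);
#pragma omp parallel for private(j, k) shared(A, B, C) schedule(static)
for (i = 0; i < N; ++i){
    for (j = 0; j < N; ++j){
        C[i][j] = 0;
        for (k = 0; k < N; ++k){
            C[i][j] += A[i][k] * B[k][j];
        }
    }
}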
I'm working on a factorial function. I have to write a parallel version of it using OpenMP.
double sequentialFactorial(const int N) {
double result = 1;
for(int i = 1; i <= N; i++) {
result *= i;
}
return result;
}
It is well known that this algorithm can be efficiently parallelized using the reduction technique.
I'm aware of the existence of the reduction clause (standard § 2.15.3.6).
double parallelAutomaticFactorial(const int N) {
double result = 1;
#pragma omp parallel for reduction(*:result)
for (int i=1; i <= N; i++)
result *= i;
return result;
}
However, I want to try to implement the reduction technique "handmade".
double parallelHandmadeFactorial(const int N) {
// maximum number of threads
const int N_THREADS = omp_get_max_threads();
// table of partial results
double* partial = new double[N_THREADS];
for(int i = 0; i < N_THREADS; i++) {
partial[i] = 1;
}
// reduction technique
#pragma omp parallel for
for(int i = 1; i <= N; i++) {
int thread_index = omp_get_thread_num();
partial[thread_index] *= i;
}
// fold results
double result = 1;
for(int i = 0; i < N_THREADS; i++) {
result *= partial[i];
}
delete[] partial;
return result;
}
I expect the performance of the last two snippets to be very similar, and better than the first one. However, the average performance is:
Sequential Factorial 3500 ms
Parallel Handmade Factorial 6100 ms
Parallel Automatic Factorial 600 ms
Am I missing something?
Thanks to @Gilles and @P.W, this code works as expected:
double parallelNoWaitFactorial(const int N) {
double result = 1;
#pragma omp parallel
{
double my_local_result = 1;
// removing nowait does not change the performance
#pragma omp for nowait
for(int i = 1; i <= N; i++)
my_local_result *= i;
#pragma omp atomic
result *= my_local_result;
}
return result;
}
If array elements happen to share a cache line, this leads to false sharing which further leads to performance degradation.
To avoid this:
1. Use a private variable double partial instead of the double array partial.
2. Use the partial result of each thread to compute the final result in a critical region. This final result should be stored in a variable that is not private to the parallel region.
The critical region will look like this:
#pragma omp critical
result *= partial;
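Put together, the hand-made reduction described above would look something like this sketch (same idea as parallelNoWaitFactorial above, with a critical section instead of atomic; not benchmarked):

double parallelHandmadeFactorial2(const int N) {
    double result = 1; // shared, lives outside the parallel region
    #pragma omp parallel
    {
        double partial = 1; // private to each thread, so no false sharing
        #pragma omp for
        for (int i = 1; i <= N; i++)
            partial *= i;
        // combine the per-thread partial products one thread at a time
        #pragma omp critical
        result *= partial;
    }
    return result;
}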
I want to declare a parallel for within a master region, like this:
#pragma omp parallel
{
#pragma omp master
{
*many functions...*
#pragma omp parallel for
for (int i = 0; i < x; ++i){
a += i;
}
}
}
This is just example code; I have hundreds of functions, and I don't want to manually add the master clause to each of them. Is this possible to do? Or is there any other way to do what I want?
#pragma omp parallel
{
// master only
#pragma omp master
{
*many functions...*
}
// full team: just a for, not a parallel for
#pragma omp for reduction(+:a) // reduction avoids a race on a
for(int i = 0; i < x; ++i){
a += i;
}
}
Just declare the for outside of the master.
Or just do the sequential actions outside of the parallel section altogether:
*many functions...*
#pragma omp parallel for reduction(+:a)
for(int i = 0; i < x; ++i){
a += i;
}