I'm looking for a way to call a function in parallel.
For example, if I have 4 threads, I want to each of them to call the same function with their own thread id as an argument.
Because of the argument, no thread will work on the same data.
#pragma omp parallel
{
for(int p = 0; p < numberOfThreads; p++)
{
if(p == omp_get_thread_num())
parDF(p);
}
}
Thread 0 should run parDF(0)
Thread 1 should run parDF(1)
Thread 2 should run parDF(2)
Thread 3 should run parDF(3)
All this should be done at the same time...
This (obviously) doesn't work, but what is the right way to do parallel function calls?
EDIT: The actual code (This might be too much information... But it was asked for...)
From the function that calls parDF():
omp_set_num_threads(NUM_THREADS);
#pragma omp parallel
{
numberOfThreads = omp_get_num_threads();
//split nodeQueue
#pragma omp master
{
splitNodeQueue(numberOfThreads);
}
int tid = omp_get_thread_num();
//printf("Hello World from thread = %d\n", tid);
#pragma omp parallel for private(tid)
for(int i = 0; i < numberOfThreads; ++i)
{
parDF(tid, originalQueueSize, DFlevel);
}
}
The parDF function:
bool Tree::parDF(int id, int originalQueueSize, int DFlevel)
{
double possibilities[20];
double sequence[3];
double workingSequence[3];
int nodesToExpand = originalQueueSize/omp_get_num_threads();
int tenthsTicks = nodesToExpand/10;
int numPossibilities = 0;
int percentage = 0;
list<double>::iterator i;
list<TreeNode*>::iterator n;
cout << "My ID is: "<< omp_get_thread_num() << endl;
while(parNodeQueue[id].size() > 0 and parNodeQueue[id].back()->depth == DFlevel)
{
if(parNodeQueue[id].size()%tenthsTicks == 0)
{
cout << endl;
cout << percentage*10 << "% done..." << endl;
if(percentage == 10)
{
percentage = 0;
}
percentage++;
}
//countStartPoints++;
depthFirstQueue.push_back(parNodeQueue[id].back());
numPossibilities = 0;
for(i = parNodeQueue[id].back()->content.sortedPoints.begin(); i != parNodeQueue[id].back()->content.sortedPoints.end(); i++)
{
for(int j = 0; j < deltas; j++)
{
if(parNodeQueue[id].back()->content.doesPointExist((*i) + delta[j]))
{
for(int k = 0; k <= numPossibilities; k++)
{
if(fabs((*i) + delta[j] - possibilities[k]) < 0.01)
{
goto pointAlreadyAdded;
}
}
possibilities[numPossibilities] = ((*i) + delta[j]);
numPossibilities++;
pointAlreadyAdded:
continue;
}
}
}
// Out of the list of possible points. All combinations of 3 are added, building small subtrees in from the node.
// If a subtree succesfully breaks the lower bound, true is returned.
for(int i = 0; i < numPossibilities; i++)
{
for(int j = 0; j < numPossibilities; j++)
{
for(int k = 0; k < numPossibilities; k++)
{
if( k != j and j != i and i != k)
{
sequence[0] = possibilities[i];
sequence[1] = possibilities[j];
sequence[2] = possibilities[k];
//countSeq++;
if(addSequence(sequence, id))
{
//successes++;
workingSequence[0] = sequence[0];
workingSequence[1] = sequence[1];
workingSequence[2] = sequence[2];
parNodeQueue[id].back()->workingSequence[0] = sequence[0];
parNodeQueue[id].back()->workingSequence[1] = sequence[1];
parNodeQueue[id].back()->workingSequence[2] = sequence[2];
parNodeQueue[id].back()->live = false;
succesfulNodes.push_back(parNodeQueue[id].back());
goto nextNode;
}
else
{
destroySubtree(parNodeQueue[id].back());
}
}
}
}
}
nextNode:
parNodeQueue[id].pop_back();
}
Is this what you are after?
Live On Coliru
#include <omp.h>
#include <cstdio>
int main()
{
int nthreads, tid;
#pragma omp parallel private(tid)
{
tid = ::omp_get_thread_num();
printf("Hello World from thread = %d\n", tid);
/* Only master thread does this */
if (tid == 0) {
nthreads = ::omp_get_num_threads();
printf("Number of threads = %d\n", nthreads);
}
} /* All threads join master thread and terminate */
}
Output:
Hello World from thread = 0
Number of threads = 8
Hello World from thread = 4
Hello World from thread = 3
Hello World from thread = 5
Hello World from thread = 2
Hello World from thread = 1
Hello World from thread = 6
Hello World from thread = 7
You should be doing something like this :
#pragma omp parallel private(tid)
{
tid = omp_get_thread_num();
parDF(tid);
}
I think its quite straight forward.
There are two ways to achieve what you want:
Exactly the way you are describing it: each thread starts the function with it's own thread id:
#pragma omp parallel
{
int threadId = omp_get_thread_num();
parDF(threadId);
}
The parallel block starts as many threads as the system reports that it supports, and each of them executes the block. Since they differ in threadId, they will process different data. To force that starting of more threads you can add a numthreads(100) or whatever to the pragma.
The correct way to do what you want is to use a parallel for block.
#pragma omp parallel for
for (int i=0; i < numThreads; ++i) {
parDF(i);
}
This way each iteration of the loop (value of i) gets assigned to a thread, that executes it. As many iterations will be ran in parallel, as there are available threads.
Method 1. is not very general, and is inefficient because you have to have as many threads as you want function calls. Method 2. is the canonical (right) way to get your problem solved.
Related
I am trying to write code for finding if pairwise sums are even or not(among all possible pairs from 0 to 100000). I have written code using pthreads where the work allocation is done statically. Here is the code
#include<iostream>
#include<chrono>
#include<iomanip>
#include<pthread.h>
using namespace std;
#define MAX_THREAD 4
vector<long long> cnt(MAX_THREAD,0);
long long n = 100000;
int work_per_thread;
void *count_array(void* arg)
{
int t = *((int*)arg);
long long sum = 0;
int counter = 0;
for(int i = t*work_per_thread + 1; i <= (t+1)*work_per_thread; i++)
for(int j = i-1; j >= 0; j--)
{
sum = i + j;
if(sum%2 == 0)
counter++;
}
cnt[t] = counter;
cout<<"thread"<<t<<" finished work"<<endl;
return NULL;
}
int main()
{
pthread_t threads[MAX_THREAD];
vector<int> arr;
for(int i = 0; i < MAX_THREAD; i++)
arr.push_back(i);
long long total_count = 0;
work_per_thread = n/MAX_THREAD;
auto start = chrono::high_resolution_clock::now();
for (int i = 0; i < MAX_THREAD; i++)
pthread_create(&threads[i], NULL, count_array, &arr[i]);
for (int i = 0; i < MAX_THREAD; i++)
pthread_join(threads[i], NULL);
for (int i = 0; i < MAX_THREAD; i++)
total_count += cnt[i];
cout << "count is " << total_count << endl;
auto end = chrono::high_resolution_clock::now();
double time_taken = chrono::duration_cast<chrono::nanoseconds>(end - start).count();
time_taken *= 1e-9;
cout << "Time taken by program is : " << fixed << time_taken << setprecision(9)<<" secs"<<endl;
return 0;
}
Now I want to do the work allocation part dynamically. To be specific, let's say I have 5 threads. Initially I give the threads a certain range to work with, let's say thread1 works on all pairs from 0-1249, thread2 from 1250-2549 and so on. Now as soon as a thread completes its work I want to give it a new range to work on. This way no threads will be idle for most of the time, like was in the case of static allocation.
This is the classic usage of a thread pool. Typically you set up a synchronized queue that can be pushed and pulled by any number of threads. Then you start N threads, the "thread pool". These threads wait on a condition variable that locks a mutex. When you have work to do from the main thread, it pushes work into the queue (it can be as simple as a struct with a range) and then signals the condition variable, which will release one thread.
See this answer: https://codereview.stackexchange.com/questions/221617/thread-pool-c-implementation
I implemented a concurrent queue with two methods: add (enqueue) & remove (dequeue).
To test my implementation using 2 threads, I generated 10 (NUMBER_OF_OPERATIONS) random numbers between 0 and 1 in a method called getRandom(). This allows me to create different distribution of add and remove operations.
The doWork method splits up the work done by the number of threads.
PROBLEM: The threadID that I am passing in from the main function does not match the threadID that the doWork method receives. Here are some sample runs:
Output 1
Output 2
#define NUMBER_OF_THREADS 2
#define NUMBER_OF_OPERATIONS 10
int main () {
BoundedQueue<int> bQ;
std::vector<double> temp = getRandom();
double* randomNumbers = &temp[0];
std::thread myThreads[NUMBER_OF_THREADS];
for(int i = 0; i < NUMBER_OF_THREADS; i++) {
cout << "Thread " << i << " created.\n";
myThreads[i] = std::thread ( [&] { bQ.doWork(randomNumbers, i); });
}
cout << "Main Thread\n";
for(int i = 0; i < NUMBER_OF_THREADS; i++) {
if(myThreads[i].joinable()) myThreads[i].join();
}
return 0;
}
template <class T> void BoundedQueue<T>::doWork (double randomNumbers[], int threadID) {
cout << "Thread ID is " << threadID << "\n";
srand(time(NULL));
int split = NUMBER_OF_OPERATIONS / NUMBER_OF_THREADS;
for (int i = threadID * split; i < (threadID * split) + split; i++) {
if(randomNumbers[i] <= 0.5) {
int numToAdd = rand() % 10 + 1;
add(numToAdd);
}
else {
int numRemoved = remove();
}
}
}
In this line you're capturing i by reference:
myThreads[i] = std::thread ( [&] { bQ.doWork(randomNumbers, i); });
This means that when the other thread runs the lambda, it'll get the latest value of i, not the value when it was created. Capture it by value instead:
myThreads[i] = std::thread ( [&, i] { bQ.doWork(randomNumbers, i); });
Whats worse, as you've got unordered read and write to i, your current code has undefined behavoir. And the fact i may've gone out of scope on the main thread before the other thread reads it. This fix above fixes all these issues.
I wrote a OpenMP program in C++ which basically finds the suffix-prefix overlap of a given length. All my strings are stored in a vector and I have two for loops for checking the overlap (all against all). I am trying to make the for loop parallel, but it does not improve the time. Following is my program
vector<string> Reads; // contains all strings
vector<int> *AdjList = new vector<int>[Reads.size()];
vector<int> *OLL = new vector<int>[Reads.size()];
// int i,j;
/*# pragma omp parallel \
shared ( AdjList, OLL ) \
private ( i, j )*/
#pragma omp parallel for
for(int i=0; i<Reads.size(); i++){
string suff = Reads.at(i).substr(Reads.at(i).length() - minOLL, minOLL);
for(int j=0; j<Reads.size(); j++){
if(i != j){
size_t found = rabin_karp(suff, Reads.at(j));
if(found != -1){
string pref1 = Reads.at(j).substr(0, found);
string suff1 = Reads.at(i).substr(Reads.at(i).length() - minOLL - found, found);
if(pref1.compare(suff1) == 0){
AdjList[i].push_back(j);
OLL[i].push_back(found + minOLL);
}
}
}
}
}
I guess reduction might help, but I am clueless about how to use it
1.since size of strings may be different you may use schedule(dynamic) so the tasks dynamically assigned to threads. 2. you can split inner loop into two loops to get rid of if statement. 3. substr is not a good choice because leads to creation of new string so you may use and save character positions to speed the code. However below applied 1, 2 mentioned cases:
#pragma omp parallel for schedule(dynamic)
for(int i=0; i<Reads.size(); i++){
string suff = Reads.at(i).substr(Reads.at(i).length() - minOLL, minOLL);
for(int j=0; j< i; j++){
size_t found = rabin_karp(suff, Reads.at(j));
if(found != -1){
string pref1 = Reads.at(j).substr(0, found);
string suff1 = Reads.at(i).substr(Reads.at(i).length() - minOLL - found, found);
if(pref1.compare(suff1) == 0){
AdjList[i].push_back(j);
OLL[i].push_back(found + minOLL);
}
}
}
for(int j=i+1; j< Reads.size(); j++){
size_t found = rabin_karp(suff, Reads.at(j));
if(found != -1){
string pref1 = Reads.at(j).substr(0, found);
string suff1 = Reads.at(i).substr(Reads.at(i).length() - minOLL - found, found);
if(pref1.compare(suff1) == 0){
AdjList[i].push_back(j);
OLL[i].push_back(found + minOLL);
}
}
}
}
Here's an example.
#include "pch.h"
#include <iostream>
#include <iomanip>
#include <fstream>
#include <string>
#include <omp.h>
using namespace std;
class Monitorius {
private:
int M[50];
int count = 0;
int suma = 0;
public:
Monitorius()
{
for (int i = 0; i < 50; i++)
M[i] = 0; // nusinulinamas masyvas
}
// Funkcijos ---------------------------------------
void Prideti(int skaicius)
{
#pragma omp critical
{
M[count] = skaicius;
count++;
Suma();
}
}
void Suma()
{
suma = 0;
for (int i = 0; i < 50; i++) {
suma += M[i];
}
cout << "Suma: " << suma << endl;
}
};
void paleistiGijas(Monitorius *monitorius) {
#pragma omp parallel num_threads(5)
{
for (int i = 1; i <= 10; i++) {
monitorius->Prideti(i);
}
}
}
int main()
{
// Monitoriaus sukurimas bei giju paleidimas
Monitorius *monitorius = new Monitorius();
// Pradedamas giju darbas
paleistiGijas(monitorius);
// Atlaisvinama atmintis
delete(monitorius);
return 0;
}
I don't know how I can parallel this loops because I have a lot of dependent variables and I am very confused
can you help and guide me?
the number one is :
for (int a = 0; a < sigmaLen; ++a) {
int f = freq[a];
if (f >= sumFreqLB)
if (updateRemainingDistances(s, a, pos))
if (prunePassed(pos + 1)) {
lmer[pos] = a;
enumerateStrings(pos + 1, sumFreqLB - f);
}
}
The second one is :
void preprocessLowerBounds() {
int i = stackSz - 1;
int pairOffset = (i * (i - 1)) >> 1;
for (int k = L; k; --k) {
int *dsn = dist[k] + pairOffset;
int *ds = dist[k - 1] + pairOffset;
int *s = colS[k - 1];
char ci = s[i];
for (int j = 0; j < i; ++j) {
char cj = s[j];
*ds++ = (*dsn++) + (ci != cj);
}
}
Really another one is :
void enumerateSubStrings(int rowNumber, int remainQTolerance) {
int nItems = rowSize[rowNumber][stackSz];
if (shouldGenerateNeighborhood(rowNumber, nItems)) {
bruteForceIt(rowNumber, nItems);
} else {
indexType *row = rowItem[rowNumber];
for (int j = 0; j < nItems; ++j) {
indexType ind = row[j];
addString(lmers + ind);
preprocessLowerBounds();
uint threshold = maxLB[stackSz] - addMaxFreq();
if (hasSolution(0, threshold)) {
if (getValid<hasPreprocessedPairs, useQ>(rowNumber + 1,
(stackSz <= 2 ? n : smallN), threshold + LminusD,
ind, remainQTolerance)) {
enumerateSubStrings<hasPreprocessedPairs, useQ>(
rowNumber + 1, remainQTolerance);
}
}
removeLastString();
}
}
void addString(const char *t) {
int *mf = colMf[stackSz + 1];
for (int j = 0; j < L; ++j) {
int c = t[j];
colS[j][stackSz] = c;
mf[j] = colMaxFreq[j] + (colMaxFreq[j] == colFreq[j][c]++);
}
colMaxFreq = mf;
++stackSz;
}
void preprocessLowerBounds() {
int i = stackSz - 1;
int pairOffset = (i * (i - 1)) >> 1;
for (int k = L; k; --k) {
int *dsn = dist[k] + pairOffset;
int *ds = dist[k - 1] + pairOffset;
int *s = colS[k - 1];
char ci = s[i];
for (int j = 0; j < i; ++j) {
char cj = s[j];
*ds++ = (*dsn++) + (ci != cj);
}
}
}
void removeLastString() {
--stackSz;
for (int j = 0; j < L; ++j)
--colFreq[j][colS[j][stackSz]];
colMaxFreq = colMf[stackSz];
}
Ok, For OpenMP to parallelize a loop in your basically follow these two rules, the first never write in the same memory location from different threads and second rule never depend on the reading of a memory area that may modified another thread, Now in the first loop you just change the lmer variable and other operations are read-only variables that I assume are not changing at the same time from another part of your code, so the first loop would be as follows:
#pragma omp for private(s,a,pos) //According to my intuition these variables are global or belong to a class, so you must convert private to each thread, on the other hand sumFreqLB and freq not included because only these reading
for (int a = 0; a < sigmaLen; ++a) {
int f = freq[a];
if (f >= sumFreqLB)
if (updateRemainingDistances(s, a, pos))
if (prunePassed(pos + 1)) {
#pragma omp critical //Only one thread at a time can enter otherwise you will fail at runtime
{
lmer[pos] = a;
}
enumerateStrings(pos + 1, sumFreqLB - f);
}
}
In the second loop i could not understand how you're using the for, but you have no problems because you use only reads and only modified the thread local variables.
You must make sure that the functions updateRemainingDistances, prunePassed and enumerateStrings do not use static or global variables within.
In the following function you use most only read operations which can be done from multiple threads (if any thread modifying these variables) and write in local memory positions so just change the shape of the FOR for OpenMP can recognize that FOR.
void preprocessLowerBounds() {
int i = stackSz - 1;
int pairOffset = (i * (i - 1)) >> 1;
#pragma omp for
for (int var=0; var<=k-L; var++){
int newK=k-var;//This will cover the initial range and in the same order
int *dsn = dist[newK] + pairOffset;
int *ds = dist[newK - 1] + pairOffset;
int *s = colS[newK - 1];
char ci = s[i];
for (int j = 0; j < i; ++j) {
char cj = s[j];
*ds++ = (*dsn++) + (ci != cj);
}
}
In the last function you use many functions for which I do not know the source code and thus can not know if they are looking for parallelizable example below the following examples are wrong:
std::vector myVector;
void notParalelizable_1(int i){
miVector.push_back(i);
}
void notParalelizable_2(int i){
static int A=0;
A=A+i;
}
int varGlobal=0;
void notParalelizable_3(int i){
varGlobal=varGlobal+i;
}
void oneFunctionParalelizable(int i)
{
int B=i;
}
int main()
{
#pragma omp for
for(int i=0;i<10;i++)
{
notParalelizable_1(i);//Error because myVector is modified simultaneously from multiple threads, The error here is that myVector not store the values in ascending order as this necessarily being accesing by multiple threads, this more complex functions can generate erroneous results or even errors in run time.
}
#pragma omp for
for(int i=0;i<10;i++)
{
notParalelizable_2(i);//Error because A is modified simultaneously from multiple threads
}
#pragma omp for
for(int i=0;i<10;i++)
{
notParalelizable_3(i);//Error because varGlobal is modified simultaneously from multiple threads
}
#pragma omp for
for(int i=0;i<10;i++)
{
oneFunctionParalelizable(i);//no problem
}
//The following code is correct
int *vector=new int[10];
#pragma omp for
for(int i=0;i<10;i++)
{
vector[i]=i;//No problem because each thread writes to a different memory pocicion
}
//The following code is wrong
int k=2;
#pragma omp for
for(int i=0;i<10;i++)
{
k=k+i; //The result of the k variable at the end will be wrong as it is modified from different threads
}
return 0;
}
I'm trying to understand the Eisenberg-McGuire algorithm and I found this program which implements it but when I run the program I get a segmentation fault.
Segmentation fault: 11
Here is the program
/* Eisenberg-McGuire algorithm: a software approach to N-process
mutual exclusion.
For description of Eisenberg-McGuire algorithm, see page 261 of
"Concurrent Systems - Operating Systems, Database and Distributed
Systems: An Inegrated Approach / Jean Bacon -- 2nd Edition".
Copyrigh (c) 2001 Xiao Zhang */
#include <stdlib.h>
#include <pthread.h>
#include <iostream>
using namespace std;
/**********************************************************************/
/* Eisenberg-McGuire's algorithm for N-process mutual exclusion */
/**********************************************************************/
class eis_mcg_mutex_t {
private:
int n;
enum procphase { out_cr, want_cr, claim_cr } *procphase;
int turn;
public:
/* Initialize the mutex data shared by N processes */
eis_mcg_mutex_t(int nproc)
{
n = nproc;
procphase = new enum procphase [n];
srand(time(0));
turn = (int) (1.0 * n * rand() / (RAND_MAX + 1.0));
for (int i = 0; i < n; i++)
procphase[i] = out_cr;
}
/* Entry protocol for process i */
void mutex_lock(int i) {
procphase[i] = want_cr;
int j = turn;
do
{
while (j != i)
{
if (procphase[j] == out_cr)
j = (j + 1) % n;
else
j = turn;
}
procphase[i] = claim_cr;
j = (j + 1) % n;
while (procphase[j] != claim_cr)
j = (j + 1) % n;
} while (!(j == i && (turn == i || procphase[turn] == out_cr)));
turn = i;
}
/* Exit protocol for process i */
void mutex_unlock(int i)
{
int j = (turn + 1) % n;
while (procphase[j] == out_cr)
j = (j + 1) % n;
turn = j;
procphase[i] = out_cr;
}
};
/**********************************************************************/
/* To test the Eisenberg-McGuire's algorithm, we write a simple */
/* program that creates N threads (processes) and then has each */
/* thread increment a global variable `counter' NLOOP times. The */
/* final value of `counter' is expected to be N * NLOOP. */
/**********************************************************************/
#define N 4 /* number of threads */
#define NLOOP 1000 /* number of times each thread loops */
int counter; /* this is cremented by the threads */
eis_mcg_mutex_t counter_in_use(N);
void *doit(void *arg)
{
int i, val;
int tid = *(int *)arg;
/* Each thread fetches, prints and increments the counter NLOOP times.
The value of the counter should increase monotonically. */
for (i = 0; i < NLOOP; i++) {
/* Replace pthread_mutex_lock() with Eisenberg-McGuire's
enter-critical-section procedure. */
counter_in_use.mutex_lock(tid);
/* Here is critical section */
val = counter;
counter = val + 1;
cout << tid << ": " << counter << endl;
/* Replace pthread_mutex_unlock() with Eisenberg-McGuire's
leave-critical-section procedure. */
counter_in_use.mutex_unlock(tid);
}
return NULL;
}
int main()
{
pthread_t tid[N];
int i;
for (i = 0; i < N; i++) pthread_create(&tid[i], NULL, doit, (void *)i);
for (i = 0; i < N; i++) pthread_join(tid[i], NULL);
return 0;
}
I can't understand what is causing the segmentation fault. Any help is appreciated. Thank You.
Fixed it.
for (i = 0; i < N; i++) pthread_create(&tid[i], NULL, doit, (void *)i);
should be
for (i = 0; i < N; i++) pthread_create(&tid[i], NULL, doit, (void *)&i);
missed the amperson operator for address.
Update:
I didn't pass the address now.
for (i = 0; i < N; i++) pthread_create(&tid[i], NULL, doit, (void *)i);
and in int doit(void *arg), changed int tid = *((int*)(&arg));
It works perfectly now.