How to get tasks working with TBB - c++

The code below compiles but appears to get stuck somewhere in the tasks I make with Intel TBB. It simply runs and displays nothing and I have to kill the program to end it. Basically, I modelled this after an example in a book and I probably did it incorrectly. What am I doing incorrectly with these tasks? I am using g++ 4.8.4 and think I am using TBB 3.9.
/*
g++ test0.cpp -o test0.out -std=c++11 -ltbb
*/
#include <iostream>
#include "tbb/task_scheduler_init.h"
#include "tbb/task.h"
using namespace tbb;
long serial_fibo(long n) {
if(n < 2) {
return n;
} else {
return serial_fibo(n - 1) + serial_fibo(n - 2);
}
}
class Fibo_Task: public task {
public:
const long n;
long* const sum;
Fibo_Task(long _n_, long* _sum_) :
n(_n_), sum(_sum_) {}
// override virtual function task::execute
task *execute() {
if(n < 4) {
*sum = serial_fibo(n);
} else {
long x = 0, y = 0;
// references x
Fibo_Task& a =
*new(task::allocate_root())
Fibo_Task(n - 1, &x);
// references y
Fibo_Task& b =
*new(task::allocate_root())
Fibo_Task(n - 2, &y);
// two children and another to wait
set_ref_count(3);
spawn(a);
spawn_and_wait_for_all(b);
*sum = x + y;
}
return NULL;
}
};
long parallel_fibo(long n) {
long sum;
Fibo_Task& a =
*new(task::allocate_root())
Fibo_Task(n, &sum);
task::spawn_root_and_wait(a);
return sum;
}
int main() {
task_scheduler_init init;
long number = 8;
long first = serial_fibo(number);
long second = parallel_fibo(number);
std::cout << "first: " << first << "\n";
std::cout << "second: " << second << "\n";
return 0;
}

You allocated 'root' tasks instead of 'child' tasks. The difference is that allocate_root() creates independent task which does not point to anything as its successor. And thus the wait_for_all() does not receive corresponding signals that the tasks are completed and thus hangs.
You can find the correct original example in the TBB documentation here.
Or you can fix yours by adding a... and b.set_parent(this) which effectively fixes the difference between allocate_root() and allocate_child() as I implemented here.

Related

Vector processing issues in multi threading

I'm implement about the data process in multi thread.
I want to process data in class DataProcess and merge the data in class DataStorage.
My problem is when the data is add to the vector sometimes occurs the exception error.
In my opinions, there have a different address class
Is it a problem to create a new data handling class and process each data?
Here is my code.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <thread>
#include <vector>
#include <mutex>
using namespace::std;
static std::mutex m;
class DataStorage
{
private :
std::vector<long long> vecData;
public:
DataStorage()
{
}
~DataStorage()
{
}
void SetDataVectorSize(int size)
{
vecData.clear();
vecData.resize(size);
}
void DataInsertLoop(void* Data, int start, int end)
{
m.lock();
std::vector<long long> const * _v1 = static_cast<std::vector<long long> const *>(Data);
long long num = 0;
for (int idx = start; idx < _v1->size(); ++idx)
{
vecData[idx] = _v1->at(idx);
}
m.unlock();
}
};
class DataProcess
{
private:
int m_index;
long long m_startIndex;
long long m_endIndex;
int m_coreNum;
long long num;
DataStorage* m_mainStorage;
std::vector<long long> m_vecData;
public :
DataProcess(int pindex, long long startindex, long long endindex)
: m_index(pindex), m_startIndex(startindex), m_endIndex(endindex),
m_coreNum(0),m_mainStorage(NULL), num(0)
{
m_vecData.clear();
}
~DataProcess()
{
}
void SetMainAdrr(DataStorage* const mainstorage)
{
m_mainStorage = mainstorage;
}
void SetCoreInCPU(int num)
{
m_coreNum = num;
}
void DataRun()
{
for (long long idx = m_startIndex; idx < m_endIndex; ++idx)
{
num += rand();
m_vecData.push_back(num); //<- exception error position
}
m_mainStorage->DataInsertLoop(&m_vecData, m_startIndex, m_endIndex);
}
};
int main()
{
//auto beginTime = std::chrono::high_resolution_clock::now();
clock_t beginTime, endTime;
DataStorage* main = new DataStorage();
beginTime = clock();
long long totalcount = 200000000;
long long halfdata = totalcount / 2;
std::thread t1,t2;
for (int t = 0; t < 2; ++t)
{
DataProcess* clsDP = new DataProcess(1, 0, halfdata);
clsDP->SetCoreInCPU(2);
clsDP->SetMainAdrr(main);
if (t == 0)
{
t1 = std::thread([&]() {clsDP->DataRun(); });
}
else
{
t2 = std::thread([&]() {clsDP->DataRun(); });
}
}
t1.join(); t2.join();
endTime = clock();
double resultTime = (double)(endTime - beginTime);
std::cout << "Multi Thread " << resultTime / 1000 << " sec" << std::endl;
printf("--------------------\n");
int value = getchar();
}
Interestingly, if none of your threads accesses portions of vecData accessed by another thread, DataInsertLoop::DataInsertLoop should not need to be synchonized at all. That should make processsing much faster. That is, after all bugs are fixed... This also means, you should not need a mutex at all.
There are other issues with your code... The most easily spotted is a memory leak.
In main:
DataStorage* main = new DataStorage(); // you call new, but never call delete...
// that's a memory leak. Avoid caling
// new() directly.
//
// Also: 'main' is kind of a reserved
// name, don't use it except for the
// program entry point.
// How about this, instead ?
DataStorage dataSrc; // DataSrc has a very small footprint (a few pointers).
// ...
std::thread t1,t2; // why not use an array ?
// as in:
std::vector<std::tread> thrds;
// ...
// You forgot to set the size of your data set before starting, by calling:
dataSrc.SetDataVectorSize(200000000);
for (int t = 0; t < 2; ++t)
{
// ...
// Calling new again, and not delete... Use a smart pointer type
DataProcess* clsDP = new DataProcess(1, 0, halfdata);
// Also, fix the start and en indices (NOTE: code below works for t < 2, but
// probably not for t < 3)
auto clsDP = std::make_unique<DataProcess>(t, t * halfdata, (t + 1) * halfdata);
// You need to keep a reference to these pointers
// Either by storing them in an array, or by passing them to
// the threads. As in, for example:
thrds.emplace_back([dp = std::move(clsDP)]() {clsDP->DataRun(); });
}
//...
std::for_each(thrds.begin(), thrds.end(), [](auto& t) { t.join(); });
//...
More...
You create a mutex on your very first line of executable code. That's good... somewhat...
static std::mutex m; // a one letter name is a terrible choice for a variable with
// file scope.
Apart form the name, it's not in the right scope... If you want to use a mutex to protect DataStorage::vecData, this mutex should be declared in the same scope as DataStorage::vecData.
One last thing. Have you considered using iterators (aka pointers) as arguments to DataProcess::DataProcess() ? This would simplify the code quite a bit, and it would very likely run faster.

Displaying results as soon as they are ready with std::async

I'm trying to discover asynchronous programming in C++. Here's a toy example I've been using:
#include <iostream>
#include <future>
#include <vector>
#include <chrono>
#include <thread>
#include <random>
// For simplicity
using namespace std;
int called_from_async(int m, int n)
{
this_thread::sleep_for(chrono::milliseconds(rand() % 1000));
return m * n;
}
void test()
{
int m = 12;
int n = 42;
vector<future<int>> results;
for(int i = 0; i < 10; i++)
{
for(int j = 0; j < 10; j++)
{
results.push_back(async(launch::async, called_from_async, i, j));
}
}
for(auto& f : results)
{
cout << f.get() << endl;
}
}
Now, the example is not really interesting, but it raises a question that is, to me, interesting. Let's say I want to display results as they "arrive" (I don't know what will be ready first, since the delay is random), how should I do it?
What I'm doing here is obviously wrong, since I wait for all the tasks in the order in which I created them - so I'll wait for the first to finish even if it's longer than the others.
I thought about the following idea: for each future, using wait_for on a small time and if it's ready, display the value. But I feel weird doing that:
while (any_of(results.begin(), results.end(), [](const future<int>& f){
return f.wait_for(chrono::seconds(0)) != future_status::ready;
}))
{
cout << "Loop" << endl;
for(auto& f : results)
{
auto result = f.wait_for(std::chrono::milliseconds(20));
if (result == future_status::ready)
cout << f.get() << endl;
}
}
This brings another issue: we'd call get several times on some futures, which is illegal:
terminate called after throwing an instance of 'std::future_error' what(): std::future_error: No associated state
So I don't really know what to do here, please suggest!
Use valid() to skip the futures for which you have already called get().
bool all_ready;
do {
all_ready = true;
for(auto& f : results) {
if (f.valid()) {
auto result = f.wait_for(std::chrono::milliseconds(20));
if (result == future_status::ready) {
cout << f.get() << endl;
}
else {
all_ready = false;
}
}
}
}
while (!all_ready);

c++ threading, duplicate/missing threads

I'm trying to write a program that concurrently add and removes items from a "storehouse". I have a "Monitor" class that handles the "storehouse" operations:
class Monitor
{
private:
mutex m;
condition_variable cv;
vector<Storage> S;
int counter = 0;
bool busy = false;;
public:
void add(Computer c, int index) {
unique_lock <mutex> lock(m);
if (busy)
cout << "Thread " << index << ": waiting for !busy " << endl;
cv.wait(lock, [&] { return !busy; });
busy = true;
cout << "Thread " << index << ": Request: add " << c.CPUFrequency << endl;
for (int i = 0; i < counter; i++) {
if (S[i].f == c.CPUFrequency) {
S[i].n++;
busy = false; cv.notify_one();
return;
}
}
Storage s;
s.f = c.CPUFrequency;
s.n = 1;
// put the new item in a sorted position
S.push_back(s);
counter++;
busy = false; cv.notify_one();
}
}
The threads are created like this:
void doThreadStuff(vector<Computer> P, vector <Storage> R, Monitor &S)
{
int Pcount = P.size();
vector<thread> myThreads;
myThreads.reserve(Pcount);
for (atomic<size_t> i = 0; i < Pcount; i++)
{
int index = i;
Computer c = P[index];
myThreads.emplace_back([&] { S.add(c, index); });
}
for (size_t i = 0; i < Pcount; i++)
{
myThreads[i].join();
}
// printing results
}
Running the program produced the following results:
I'm familiar with race conditions, but this doesn't look like one to me. My bet would be on something reference related, because in the results we can see that for every "missing thread" (threads 1, 3, 10, 25) I get "duplicate threads" (threads 2, 9, 24, 28).
I have tried to create local variables in functions and loops but it changed nothing.
I have heard about threads sharing memory regions, but my previous work should have produced similar results, so I don't think that's the case here, but feel free to prove me wrong.
I'm using Visual Studio 2017
Here you catch local variables by reference in a loop, they will be destroyed in every turn, causing undefined behavior:
for (atomic<size_t> i = 0; i < Pcount; i++)
{
int index = i;
Computer c = P[index];
myThreads.emplace_back([&] { S.add(c, index); });
}
You should catch index and c by value:
myThreads.emplace_back([&S, index, c] { S.add(c, index); });
Another approach would be to pass S, i and c as arguments instead of capturing them by defining the following non-capturing lambda, th_func:
auto th_func = [](Monitor &S, int index, Computer c){ S.add(c, index); };
This way you have to explicitly wrap the arguments that must be passed by reference to the thread's callable object with std::reference_wrapper by means of the function template std::ref(). In your case, only S:
for (atomic<size_t> i = 0; i < Pcount; i++) {
int index = i;
Computer c = P[index];
myThreads.emplace_back(th_func, std::ref(S), index, c);
}
Failing to wrap with std::reference_wrapper the arguments that must be passed by reference will result in a compile-time error. That is, the following won't compile:
myThreads.emplace_back(th_func, S, index, c); // <-- it should be std::ref(S)
See also this question.

How to limit a decrement?

There is a initial game difficulty which is
game_difficulty=5 //Initial
Every 3 times if you get it right, your difficulty goes up to infinity but every 3 times you get it wrong, your difficulty goes down but not below 5. So, in this code for ex:
if(user_words==words) win_count+=1;
else() incorrect_count+=1;
if(win_count%3==0) /*increase diff*/;
if(incorrect_count%3==0) /*decrease difficulty*/;
How should I go about doing this?
Simple answer:
if(incorrect_count%3==0) difficulty = max(difficulty-1, 5);
But personally I would wrap it up in a small class then you can contain all the logic and expand it as you go along, something such as:
class Difficulty
{
public:
Difficulty() {};
void AddWin()
{
m_IncorrectCount = 0; // reset because we got one right?
if (++m_WinCount % 3)
{
m_WinCount = 0;
++m_CurrentDifficulty;
}
}
void AddIncorrect()
{
m_WinCount = 0; // reset because we got one wrong?
if (++m_IncorrectCount >= 3 && m_CurrentDifficulty > 5)
{
m_IncorrectCount = 0;
--m_CurrentDifficulty;
}
}
int GetDifficulty()
{
return m_CurrentDifficulty;
}
private:
int m_CurrentDifficulty = 5;
int m_WinCount = 0;
int m_IncorrectCount = 0;
};
You could just add this as a condition:
if (user words==words) {
win_count += 1;
if (win_count %3 == 0) {
++diff;
}
} else {
incorrect_count += 1;
if (incorrect_count % 3 == 0 && diff > 5) {
--diff
}
}
For example:
if(win_count%3==0) difficulty++;
if(incorrect_count%3==0 && difficulty > 5) difficulty--;
This can be turned into a motivating example for custom data types.
Create a class which wraps the difficulty int as a private member variable, and in the public member functions make sure that the so-called contract is met. You will end up with a value which is always guaranteed to meet your specifications. Here is an example:
class Difficulty
{
public:
// initial values for a new Difficulty object:
Difficulty() :
right_answer_count(0),
wrong_answer_count(0),
value(5)
{}
// called when a right answer should be taken into account:
void GotItRight()
{
++right_answer_count;
if (right_answer_count == 3)
{
right_answer_count = 0;
++value;
}
}
// called when a wrong answer should be taken into account:
void GotItWrong()
{
++wrong_answer_count;
if (wrong_answer_count == 3)
{
wrong_answer_count = 0;
--value;
if (value < 5)
{
value = 5;
}
}
}
// returns the value itself
int Value() const
{
return value;
}
private:
int right_answer_count;
int wrong_answer_count;
int value;
};
And here is how you would use the class:
Difficulty game_difficulty;
// six right answers:
for (int count = 0; count < 6; ++count)
{
game_difficulty.GotItRight();
}
// check wrapped value:
std::cout << game_difficulty.Value() << "\n";
// three wrong answers:
for (int count = 0; count < 3; ++count)
{
game_difficulty.GotItWrong();
}
// check wrapped value:
std::cout << game_difficulty.Value() << "\n";
// one hundred wrong answers:
for (int count = 0; count < 100; ++count)
{
game_difficulty.GotItWrong();
}
// check wrapped value:
std::cout << game_difficulty.Value() << "\n";
Output:
7
6
5
Once you have a firm grasp on how such types are created and used, you can start to look into operator overloading so that the type can be used more like a real int, i.e. with +, - and so on.
How should I go about doing this?
You have marked this question as C++. IMHO the c++ way is to create a class encapsulating all your issues.
Perhaps something like:
class GameDifficulty
{
public:
GameDifficulty () :
game_difficulty (5), win_count(0), incorrect_count(0)
{}
~GameDifficulty () {}
void update(const T& words)
{
if(user words==words) win_count+=1;
else incorrect_count+=1;
// modify game_difficulty as you desire
if(win_count%3 == 0)
game_difficulty += 1 ; // increase diff no upper limit
if((incorrect_count%3 == 0) && (game_difficulty > 5))
game_difficulty -= 1; //decrease diff;
}
inline int gameDifficulty() { return (game_difficulty); }
// and any other access per needs of your game
private:
int game_difficulty;
int win_count;
int incorrect_count;
}
// note - not compiled or tested
usage would be:
// instantiate
GameDiffculty gameDifficulty;
// ...
// use update()
gameDifficulty.update(word);
// ...
// use access
gameDifficulty.gameDifficulty();
Advantage: encapsulation
This code is in one place, not polluting elsewhere in your code.
You can change these policies in this one place, with no impact to the rest of your code.

openmpi/c++: defining a mpi data type for class with members of variable length (pointers pointing to malloced memory)?

i am currently learning to use openmpi, my aim is to parallelize a simple program whose code i will post bellow.
The program is for testing my concept of paralleling a much bigger program, i hope to learn all i need to know for my actual problem if i succeed with this.
Basically it is a definition of a simple c++ class for lists. A list consists of two arrays, one integer and one double. Entries with the same indicies belong together, in a way that the integer entry is some kind of list entry identifier (maybe an object ID) and the double entry is some kind of quantifier (maybe the weight if an object).
The basic purpose of the program is to add lists together (this is the task i want to parallelize). Adding works as follows: For each entry in one list it is checked if there is the same integer entry in the the other list, if so then the double entry gets added to the double entry in the other list, if there is no such entry in the other list then both the integer and the double entries gets added to the end of the list.
Basically each summand in this list addition represents a storage and each entry is a type of object with a given amount (int is the type and double is the amount), so adding two lists means putting the stuff from the second storage to the first.
The order of the list entries is irrelevant, this means that the addition of lists is not only associative but commutative too!
My plan is to add a very large number of such lists (a few billions) so parallelizing could be to let each thread add a subset of lists first and when this is finished distribute all such sublists (one for each thread) to all of the threads.
My current understanding of openmpi is that only the last step (distributing of finished sublists) needs any special non standard stuff. Basically i need a AllReduce but with a custom data type and a custom operaton.
The first problem i have is understanding how to create a fitting MPI data type. I came to the conclusion that i probably need MPI_Type_create_struct to create a struct type.
I found this site with a nice example: http://mpi.deino.net/mpi_functions/MPI_Type_create_struct.html
from which i learned a lot but the problem is, that in this case there are fixed member arrays. In my case i have lists with arbitrary sized member variables or better with pointers pointing to memory blocks of arbitrary size. So doing it like in the example would lead to creating a new MPI datatype for each list size (using fixed sized lists could help but only in this minimalistic case, but i want to learn how to do it with arbitrary sized lists are preparation for my actual problem).
So my question is: how to create a data type for this special case? What is the best way?
I even thought to maybe write some non mpi code to serialize my class/object, (which would be a lot of work for my real problem but in this example it should be easy) to a single block of bits. Then i could simply use a MPI function to distribute those blocks to all threads and then i just have to translate it back to the actual object, and then i could let each thread simply add the "number-of-threads" lists together to have the same full reduced list on all threads (because the operation is commutative it is not important if the order is the same on each thread in the end).
The problem is that i do not know which MPI function to use to distribute a such memory blocks to each thread so that in the end each thread has an array of "number-of-threads" such blocks (similar like AllReduce but with blocks).
But thats just another idea, i would like to hear from you whats the best way.
Thank you, here is my fully working example program (ignore the MPI parts thats just preparation, you can simply compile with: g++)
As you can see, i needed to create custom copy constructors because standard of the pointer members. I hope thats not a problem for MPI?
#include <iostream>
#include <cstdlib>
#if (CFG_MPI > 0)
#include <mpi.h>
#else
#define MPI_Barrier(xxx) // dummy code if not parallel
#endif
class list {
private:
int *ilist;
double *dlist;
int n;
public:
list(int n, int *il, double *dl) {
int i;
if (n>0) {
this->ilist = (int*)malloc(n*sizeof(int));
this->dlist = (double*)malloc(n*sizeof(double));
if (!ilist || !dlist) std::cout << "ERROR: malloc in constructor failed!" << std::endl;
} else {
this->ilist = NULL;
this->dlist = NULL;
}
for (i=0; i<n; i++) {
this->ilist[i] = il[i];
this->dlist[i] = dl[i];
}
this->n = n;
}
~list() {
free(ilist);
free(dlist);
ilist = NULL;
dlist = NULL;
this->n=0;
}
list(const list& cp) {
int i;
this->n = cp.n;
this->ilist = NULL;
this->dlist = NULL;
if (this->n > 0) {
this->ilist = (int*)malloc(this->n*sizeof(int));
this->dlist = (double*)malloc(this->n*sizeof(double));
if (!ilist || !dlist) std::cout << "ERROR: malloc in copy constructor failed!" << std::endl;
}
for (i=0; i<this->n; i++) {
this->ilist[i] = cp.ilist[i];
this->dlist[i] = cp.dlist[i];
}
}
list& operator=(const list& cp) {
if(this == &cp) return *this;
this->~list();
int i;
this->n = cp.n;
if (this->n > 0) {
this->ilist = (int*)malloc(this->n*sizeof(int));
this->dlist = (double*)malloc(this->n*sizeof(double));
if (!ilist || !dlist) std::cout << "ERROR: malloc in copy constructor failed!" << std::endl;
} else {
this->ilist = NULL;
this->dlist = NULL;
}
for (i=0; i<this->n; i++) {
this->ilist[i] = cp.ilist[i];
this->dlist[i] = cp.dlist[i];
}
return *this;
}
void print() {
int i;
for (i=0; i<this->n; i++)
std::cout << i << " : " << "[" << this->ilist[i] << " - " << (double)dlist[i] << "]" << std::endl;
}
list& operator+=(const list& cp) {
int i,j;
if(this == &cp) {
for (i=0; i<this->n; i++)
this->dlist[i] *= 2;
return *this;
}
double *dl;
int *il;
il = (int *) realloc(this->ilist, (this->n+cp.n)*sizeof(int));
dl = (double *) realloc(this->dlist, (this->n+cp.n)*sizeof(double));
if (!il || !dl)
std::cout << "ERROR: 1st realloc in operator += failed!" << std::endl;
else {
this->ilist = il;
this->dlist = dl;
il = NULL;
dl = NULL;
}
for (i=0; i<cp.n; i++) {
for (j=0; j<this->n; j++) {
if (this->ilist[j] == cp.ilist[i]) {
this->dlist[j] += cp.dlist[i];
break;
}
} if (j == this->n) {// no matching entry found in this
this->ilist[this->n] = cp.ilist[i];
this->dlist[this->n] = cp.dlist[i];
this->n++;
}
}
il = (int *) realloc(this->ilist, (this->n)*sizeof(int));
dl = (double *) realloc(this->dlist, (this->n)*sizeof(double));
if (!il || !dl)
std::cout << "ERROR: 2nd realloc in operator += failed!" << std::endl;
else {
this->ilist = il;
this->dlist = dl;
}
return *this;
}
};
int main(int argc, char **argv) {
int npe, myid;
#if (CFG_MPI > 0)
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD,&npe);
MPI_Comm_rank(MPI_COMM_WORLD,&myid);
#else
npe=1;
myid=0;
#endif
if (!myid) // reduce output
std::cout << "NPE = " << npe << " MYID = " << myid << std::endl;
int ilist[5] = {14,17,4,29,0};
double dlist[5] = {0.0, 170.0, 0.0, 0.0, 24.523};
int ilist2[6] = {14,117,14,129,0, 34};
double dlist2[6] = {0.5, 170.5, 0.5, 0.5, 24.0, 1.2};
list tlist(5, ilist, dlist);
list tlist2(6, ilist2, dlist2);
if (!myid) {
tlist.print();
tlist2.print();
}
tlist +=tlist2;
if (myid) tlist.print();
#if (CFG_MPI > 0)
MPI_Finalize();
#endif
return 0;
}