move() data back from thread - c++

I have a class ('Buffer') that contains a < vector > of data and an analyze() function that does some computation on the data ('raw' and 'optimised').
class Buffer
{
public:
Buffer(){};
vector<Element> raw;
vector<Element> optimised; // DATA CALCULATED BASED ON RAW
[...]
void analyze() // COMPUTE THE OPTIMISED BUFFER
{
for(std::vector<Element>::iterator it = raw.begin(); it != raw.end(); ++it)
{
optimised.push_back(Element(it->a,it->p));
// DO SOME COMPUTATIONALLY INTENSIVE CALCULATIONS
for(i=1; i<9999999; i++)
t = 9./i;
}
};
};
Because I need to create a series of the above 'Buffer' objects and keep interactive framerates, I am running the analyze() function for each buffer object onto a separate thread.
The only working solution I found for this is to use unique_ptr for keeping my collection of buffer objects
std::unique_ptr<dynamica::Buffer> buffer; //TEMPORARY COLLECTOR
std::vector<std::unique_ptr<dynamica::Buffer>> queue; //COLLECTION OF ALL PREVIOUS BUFFERS
and hence use move() when passing each of these objects to the respective thread.
queue.push_back(move(buffer));
//RUN 'ANALYZE' IN PARALLEL IN BACKGROUND
std::thread t(&Buffer::analyze, move(queue.back()));
t.detach();
My problem (I suspect) is that after I move() my object to a new thread for doing the analyze() computation I can no loner access the variable and methods in it (once the parallel thread has finished) from the main thread.
//from the main thread, once I know the parallel thread doing analyze() has finished
queue[0]->someFunction() // CRASH! ERROR! BOOM!
PS: Here (C++ multiple threads and vectors) is a reference to how I managed to get the threading working - and why it's the only solution.

This isn't really a threading issue. This is a misunderstanding of move semantics I think, you have handed the queue over to the detached thread and thus no longer own it.
Edit (Example code) :
#include <vector>
#include <iostream>
#include <memory>
#include <thread>
class Element
{
public:
int a;
float p;
Element(int _a, float _p=1.0): a(_a), p(_p){};
};
class Buffer
{
public:
Buffer(){};
std::vector<Element> raw;
std::vector<Element> optimised; // DATA CALCULATED BASED ON RAW
void addElement(int _a,float _p=1.0) // FILL THE RAW BUFFER
{
raw.push_back(Element(_a,_p));
}
void compute() // COMPUTE THE OPTIMISED BUFFER
{
float t;
int i;
for(std::vector<Element>::iterator it = raw.begin(); it != raw.end(); ++it)
{
optimised.push_back(Element(it->a,it->p));
// DO SOME COMPUTATIONALLY INTENSIVE CALCULATIONS
std::cout << "Performing calculations..." << std::endl;
for(i=1; i<9999999; i++)
t = 9./i;
}
};
void clear() // ERASE BOTH BUFFERS
{
raw.clear();
optimised.clear();
}
const std::pair<std::vector<Element>::const_iterator, std::vector<Element>::const_iterator> someFunction()
{
return std::make_pair<std::vector<Element>::const_iterator, std::vector<Element>::const_iterator>(begin(optimised), end(optimised));
}
~Buffer() { clear(); }
};
int _tmain(int argc, _TCHAR* argv[])
{
Buffer origin;
auto buffer = std::make_shared<Buffer *>(&origin);
std::vector<decltype(buffer)> queue;
for(int i = 0; i < 10; i++ )
(*buffer)->addElement(i);
queue.push_back(buffer);
std::thread t(&Buffer::compute, *queue.back());
t.join(); // wait for results
auto result = (*queue.back())->someFunction();
for( auto ele = result.first; ele != result.second; ele++ )
{
std::cout << (*ele).a << std::endl;
}
return 0;
}

Related

Vector processing issues in multi threading

I'm implement about the data process in multi thread.
I want to process data in class DataProcess and merge the data in class DataStorage.
My problem is when the data is add to the vector sometimes occurs the exception error.
In my opinions, there have a different address class
Is it a problem to create a new data handling class and process each data?
Here is my code.
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <thread>
#include <vector>
#include <mutex>
using namespace::std;
static std::mutex m;
class DataStorage
{
private :
std::vector<long long> vecData;
public:
DataStorage()
{
}
~DataStorage()
{
}
void SetDataVectorSize(int size)
{
vecData.clear();
vecData.resize(size);
}
void DataInsertLoop(void* Data, int start, int end)
{
m.lock();
std::vector<long long> const * _v1 = static_cast<std::vector<long long> const *>(Data);
long long num = 0;
for (int idx = start; idx < _v1->size(); ++idx)
{
vecData[idx] = _v1->at(idx);
}
m.unlock();
}
};
class DataProcess
{
private:
int m_index;
long long m_startIndex;
long long m_endIndex;
int m_coreNum;
long long num;
DataStorage* m_mainStorage;
std::vector<long long> m_vecData;
public :
DataProcess(int pindex, long long startindex, long long endindex)
: m_index(pindex), m_startIndex(startindex), m_endIndex(endindex),
m_coreNum(0),m_mainStorage(NULL), num(0)
{
m_vecData.clear();
}
~DataProcess()
{
}
void SetMainAdrr(DataStorage* const mainstorage)
{
m_mainStorage = mainstorage;
}
void SetCoreInCPU(int num)
{
m_coreNum = num;
}
void DataRun()
{
for (long long idx = m_startIndex; idx < m_endIndex; ++idx)
{
num += rand();
m_vecData.push_back(num); //<- exception error position
}
m_mainStorage->DataInsertLoop(&m_vecData, m_startIndex, m_endIndex);
}
};
int main()
{
//auto beginTime = std::chrono::high_resolution_clock::now();
clock_t beginTime, endTime;
DataStorage* main = new DataStorage();
beginTime = clock();
long long totalcount = 200000000;
long long halfdata = totalcount / 2;
std::thread t1,t2;
for (int t = 0; t < 2; ++t)
{
DataProcess* clsDP = new DataProcess(1, 0, halfdata);
clsDP->SetCoreInCPU(2);
clsDP->SetMainAdrr(main);
if (t == 0)
{
t1 = std::thread([&]() {clsDP->DataRun(); });
}
else
{
t2 = std::thread([&]() {clsDP->DataRun(); });
}
}
t1.join(); t2.join();
endTime = clock();
double resultTime = (double)(endTime - beginTime);
std::cout << "Multi Thread " << resultTime / 1000 << " sec" << std::endl;
printf("--------------------\n");
int value = getchar();
}
Interestingly, if none of your threads accesses portions of vecData accessed by another thread, DataInsertLoop::DataInsertLoop should not need to be synchonized at all. That should make processsing much faster. That is, after all bugs are fixed... This also means, you should not need a mutex at all.
There are other issues with your code... The most easily spotted is a memory leak.
In main:
DataStorage* main = new DataStorage(); // you call new, but never call delete...
// that's a memory leak. Avoid caling
// new() directly.
//
// Also: 'main' is kind of a reserved
// name, don't use it except for the
// program entry point.
// How about this, instead ?
DataStorage dataSrc; // DataSrc has a very small footprint (a few pointers).
// ...
std::thread t1,t2; // why not use an array ?
// as in:
std::vector<std::tread> thrds;
// ...
// You forgot to set the size of your data set before starting, by calling:
dataSrc.SetDataVectorSize(200000000);
for (int t = 0; t < 2; ++t)
{
// ...
// Calling new again, and not delete... Use a smart pointer type
DataProcess* clsDP = new DataProcess(1, 0, halfdata);
// Also, fix the start and en indices (NOTE: code below works for t < 2, but
// probably not for t < 3)
auto clsDP = std::make_unique<DataProcess>(t, t * halfdata, (t + 1) * halfdata);
// You need to keep a reference to these pointers
// Either by storing them in an array, or by passing them to
// the threads. As in, for example:
thrds.emplace_back([dp = std::move(clsDP)]() {clsDP->DataRun(); });
}
//...
std::for_each(thrds.begin(), thrds.end(), [](auto& t) { t.join(); });
//...
More...
You create a mutex on your very first line of executable code. That's good... somewhat...
static std::mutex m; // a one letter name is a terrible choice for a variable with
// file scope.
Apart form the name, it's not in the right scope... If you want to use a mutex to protect DataStorage::vecData, this mutex should be declared in the same scope as DataStorage::vecData.
One last thing. Have you considered using iterators (aka pointers) as arguments to DataProcess::DataProcess() ? This would simplify the code quite a bit, and it would very likely run faster.

Error occurred when using thread_local to maintain a concurrent memory buffer

In the following code, I want to create a memory buffer that allows multiple threads to read/write it concurrently. At a time, all threads will read this buffer in parallel, and later they will write to the buffer in parallel. But there will be no read/write operation at the same time.
To do this, I use a vector of shared_ptr<vector<uint64_t>>. When a new thread arrives, it will be allocated with a new vector<uint64_t> and only write to it. Two threads will not write to the same vector.
I use thread_local to track the vector index and offset the current thread will write to. When I need to add a new buffer to the memory_ variable, I use a mutex to protect it.
class TestBuffer {
public:
thread_local static uint32_t index_;
thread_local static uint32_t offset_;
thread_local static bool ready_;
vector<shared_ptr<vector<uint64_t>>> memory_;
mutex lock_;
void init() {
if (!ready_) {
new_slab();
ready_ = true;
}
}
void new_slab() {
std::lock_guard<mutex> lock(lock_);
index_ = memory_.size();
memory_.push_back(make_shared<vector<uint64_t>>(1000));
offset_ = 0;
}
void put(uint64_t value) {
init();
if (offset_ == 1000) {
new_slab();
}
if(memory_[index_] == nullptr) {
cout << "Error" << endl;
}
*(memory_[index_]->data() + offset_) = value;
offset_++;
}
};
thread_local uint32_t TestBuffer::index_ = 0;
thread_local uint32_t TestBuffer::offset_ = 0;
thread_local bool TestBuffer::ready_ = false;
int main() {
TestBuffer buffer;
vector<std::thread> threads;
for (int i = 0; i < 10; ++i) {
thread t = thread([&buffer, i]() {
for (int j = 0; j < 10000; ++j) {
buffer.put(i * 10000 + j);
}
});
threads.emplace_back(move(t));
}
for (auto &t: threads) {
t.join();
}
}
The code does not behave as expected, and reports error is in the put function. The root cause is that memory_[index_] sometimes return nullptr. However, I do not understand why this is possible as I think I have set the values properly. Thanks for the help!
You have a race condition in put caused by new_slab(). When new_slab calls memory_.push_back() the _memory vector may need to resize itself, and if another thread is executing put while the resize is in progress, memory_[index_] might access stale data.
One solution is to protect the _memory vector by locking the mutex:
{
std::lock_guard<mutex> lock(lock_);
if(memory_[index_] == nullptr) {
cout << "Error" << endl;
}
*(memory_[index_]->data() + offset_) = value;
}
Another is to reserve the space you need in the memory_ vector ahead of time.

std::atomic_flag to stop multiple threads

I'm trying to stop multiple worker threads using a std::atomic_flag. Starting from Issue using std::atomic_flag with worker thread the following works:
#include <iostream>
#include <atomic>
#include <chrono>
#include <thread>
std::atomic_flag continueFlag;
std::thread t;
void work()
{
while (continueFlag.test_and_set(std::memory_order_relaxed)) {
std::cout << "work ";
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
void start()
{
continueFlag.test_and_set(std::memory_order_relaxed);
t = std::thread(&work);
}
void stop()
{
continueFlag.clear(std::memory_order_relaxed);
t.join();
}
int main()
{
std::cout << "Start" << std::endl;
start();
std::this_thread::sleep_for(std::chrono::milliseconds(200));
std::cout << "Stop" << std::endl;
stop();
std::cout << "Stopped." << std::endl;
return 0;
}
Trying to rewrite into multiple worker threads:
#include <iostream>
#include <atomic>
#include <chrono>
#include <thread>
#include <vector>
#include <memory>
struct thread_data {
std::atomic_flag continueFlag;
std::thread thread;
};
std::vector<thread_data> threads;
void work(int threadNum, std::atomic_flag &continueFlag)
{
while (continueFlag.test_and_set(std::memory_order_relaxed)) {
std::cout << "work" << threadNum << " ";
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
}
void start()
{
const unsigned int numThreads = 2;
for (int i = 0; i < numThreads; i++) {
////////////////////////////////////////////////////////////////////
//PROBLEM SECTOR
////////////////////////////////////////////////////////////////////
thread_data td;
td.continueFlag.test_and_set(std::memory_order_relaxed);
td.thread = std::thread(&work, i, td.continueFlag);
threads.push_back(std::move(td));
////////////////////////////////////////////////////////////////////
//PROBLEM SECTOR
////////////////////////////////////////////////////////////////////
}
}
void stop()
{
//Flag stop
for (auto &data : threads) {
data.continueFlag.clear(std::memory_order_relaxed);
}
//Join
for (auto &data : threads) {
data.thread.join();
}
threads.clear();
}
int main()
{
std::cout << "Start" << std::endl;
start();
std::this_thread::sleep_for(std::chrono::milliseconds(200));
std::cout << "Stop" << std::endl;
stop();
std::cout << "Stopped." << std::endl;
return 0;
}
My issue is "Problem Sector" in above. Namely creating the threads. I cannot wrap my head around how to instantiate the threads and passing the variables to the work thread.
The error right now is referencing this line threads.push_back(std::move(td)); with error Error C2280 'thread_data::thread_data(const thread_data &)': attempting to reference a deleted function.
Trying to use unique_ptr like this:
auto td = std::make_unique<thread_data>();
td->continueFlag.test_and_set(std::memory_order_relaxed);
td->thread = std::thread(&work, i, td->continueFlag);
threads.push_back(std::move(td));
Gives error std::atomic_flag::atomic_flag(const std::atomic_flag &)': attempting to reference a deleted function at line td->thread = std::thread(&work, i, td->continueFlag);. Am I fundamentally misunderstanding the use of std::atomic_flag? Is it really both immovable and uncopyable?
Your first approach was actually closer to the truth. The problem is that it passed a reference to an object within the local for loop scope to each thread, as a parameter. But, of course, once the loop iteration ended, that object went out of scope and got destroyed, leaving each thread with a reference to a destroyed object, resulting in undefined behavior.
Nobody cared about the fact that you moved the object into the std::vector, after creating the thread. The thread received a reference to a locally-scoped object, and that's all it knew. End of story.
Moving the object into the vector first, and then passing to each thread a reference to the object in the std::vector will not work either. As soon as the vector internally reallocates, as part of its natural growth, you'll be in the same pickle.
What needs to happen is to have the entire threads array created first, before actually starting any std::threads. If the RAII principle is religiously followed, that means nothing more than a simple call to std::vector::resize().
Then, in a second loop, iterate over the fully-cooked threads array, and go and spawn off a std::thread for each element in the array.
I was almost there with my unique_ptr solution. I just needed to pass the call as a std::ref() as such:
std::vector<std::unique_ptr<thread_data>> threads;
void start()
{
const unsigned int numThreads = 2;
for (int i = 0; i < numThreads; i++) {
auto td = std::make_unique<thread_data>();
td->continueFlag.test_and_set(std::memory_order_relaxed);
td->thread = std::thread(&work, i, std::ref(td->continueFlag));
threads.push_back(std::move(td));
}
}
However, inspired by Sam above I also figured a non-pointer way:
std::vector<thread_data> threads;
void start()
{
const unsigned int numThreads = 2;
//create new vector, resize doesn't work as it tries to assign/copy which atomic_flag
//does not support
threads = std::vector<thread_data>(numThreads);
for (int i = 0; i < numThreads; i++) {
auto& t = threads.at(i);
t.continueFlag.test_and_set(std::memory_order_relaxed);
t.thread = std::thread(&work, i, std::ref(t.continueFlag));
}
}

How to apply a concurrent solution to a Producer-Consumer like situation

I have a XML file with a sequence of nodes. Each node represents an element that I need to parse and add in a sorted list (the order must be the same of the nodes found in the file).
At the moment I am using a sequential solution:
struct Graphic
{
bool parse()
{
// parsing...
return parse_outcome;
}
};
vector<unique_ptr<Graphic>> graphics;
void producer()
{
for (size_t i = 0; i < N_GRAPHICS; i++)
{
auto g = new Graphic();
if (g->parse())
graphics.emplace_back(g);
else
delete g;
}
}
So, only if the graphic (that actually is an instance of a class derived from Graphic, a Line, a Rectangle and so on, that is why the new) can be properly parse, it will be added to my data structure.
Since I only care about the order in which thes graphics are added to my list, I though to call the parse method asynchronously, such that the producer has the task of read each node from the file and add this graphic to the data structure, while the consumer has the task of parse each graphic whenever a new graphic is ready to be parsed.
Now I have several consumer threads (created in the main) and my code looks like the following:
queue<pair<Graphic*, size_t>> q;
mutex m;
atomic<size_t> n_elements;
void producer()
{
for (size_t i = 0; i < N_GRAPHICS; i++)
{
auto g = new Graphic();
graphics.emplace_back(g);
q.emplace(make_pair(g, i));
}
n_elements = graphics.size();
}
void consumer()
{
pair<Graphic*, size_t> item;
while (true)
{
{
std::unique_lock<std::mutex> lk(m);
if (n_elements == 0)
return;
n_elements--;
item = q.front();
q.pop();
}
if (!item.first->parse())
{
// here I should remove the item from the vector
assert(graphics[item.second].get() == item.first);
delete item.first;
graphics[item.second] = nullptr;
}
}
}
I run the producer first of all in my main, so that when the first consumer starts the queue is already completely full.
int main()
{
producer();
vector<thread> threads;
for (auto i = 0; i < N_THREADS; i++)
threads.emplace_back(consumer);
for (auto& t : threads)
t.join();
return 0;
}
The concurrent version seems to be at least twice as faster as the original one.
The full code has been uploaded here.
Now I am wondering:
Are there any (synchronization) errors in my code?
Is there a way to achieve the same result faster (or better)?
Also, I noticed that on my computer I get the best result (in terms of elapsed time) if I set the number of thread equals to 8. More (or less) threads give me worst results. Why?
Blockquote
There isn't synchronization errors, but I think that the memory managing could be better, since your code leaked if parse() throws an exception.
There isn't synchronization errors, but I think that your memory managing could be better, since you will have leaks if parse() throw an exception.
Blockquote
Is there a way to achieve the same result faster (or better)?
Probably. You could use a simple implementation of a thread pool and a lambda that do the parse() for you.
The code below illustrate this approach. I use the threadpool implementation
here
#include <iostream>
#include <stdexcept>
#include <vector>
#include <memory>
#include <chrono>
#include <utility>
#include <cassert>
#include <ThreadPool.h>
using namespace std;
using namespace std::chrono;
#define N_GRAPHICS (1000*1000*1)
#define N_THREADS 8
struct Graphic;
using GPtr = std::unique_ptr<Graphic>;
static vector<GPtr> graphics;
struct Graphic
{
Graphic()
: status(false)
{
}
bool parse()
{
// waste time
try
{
throw runtime_error("");
}
catch (runtime_error)
{
}
status = true;
//return false;
return true;
}
bool status;
};
int main()
{
auto start = system_clock::now();
auto producer_unit = []()-> GPtr {
std::unique_ptr<Graphic> g(new Graphic);
if(!g->parse()){
g.reset(); // if g don't parse, return nullptr
}
return g;
};
using ResultPool = std::vector<std::future<GPtr>>;
ResultPool results;
// ThreadPool pool(thread::hardware_concurrency());
ThreadPool pool(N_THREADS);
for(int i = 0; i <N_GRAPHICS; ++i){
// Running async task
results.emplace_back(pool.enqueue(producer_unit));
}
for(auto &t : results){
auto value = t.get();
if(value){
graphics.emplace_back(std::move(value));
}
}
auto duration = duration_cast<milliseconds>(system_clock::now() - start);
cout << "Elapsed: " << duration.count() << endl;
for (size_t i = 0; i < graphics.size(); i++)
{
if (!graphics[i]->status)
{
cerr << "Assertion failed! (" << i << ")" << endl;
break;
}
}
cin.get();
return 0;
}
It is a bit faster (1s) on my machine, more readable, and removes the necessity of shared datas (synchronization is evil, avoid it or hide it in a reliable and efficient way).

Extend the life of threads with synchronization (C++11)

I have a program with a function which takes a pointer as arg, and a main. The main is creating n threads, each of them running the function on different memory areas depending on the passed arg. Threads are then joined, the main performs some data mixing between the area and creates n new threads which do the the same operation as the old ones.
To improve the program I would like to keep the threads alive, removing the long time necessary to create them. Threads should sleep when the main is working and notified when they have to come up again. At the same way the main should wait when threads are working as it did with join.
I cannot end up with a strong implementation of this, always falling in a deadlock.
Simple baseline code, any hints about how to modify this would be much appreciated
#include <thread>
#include <climits>
...
void myfunc(void * p) {
do_something(p);
}
int main(){
void * myp[n_threads] {a_location, another_location,...};
std::thread mythread[n_threads];
for (unsigned long int j=0; j < ULONG_MAX; j++) {
for (unsigned int i=0; i < n_threads; i++) {
mythread[i] = std::thread(myfunc, myp[i]);
}
for (unsigned int i=0; i < n_threads; i++) {
mythread[i].join();
}
mix_data(myp);
}
return 0;
}
Here is a possible approach using only classes from the C++11 Standard Library. Basically, each thread you create has an associated command queue (encapsulated in std::packaged_task<> objects) which it continuously check. If the queue is empty, the thread will just wait on a condition variable (std::condition_variable).
While data races are avoided through the use of std::mutex and std::unique_lock<> RAII wrappers, the main thread can wait for a particular job to be terminated by storing the std::future<> object associated to each submitted std::packaged_tast<> and call wait() on it.
Below is a simple program that follows this design. Comments should be sufficient to explain what it does:
#include <thread>
#include <iostream>
#include <sstream>
#include <future>
#include <queue>
#include <condition_variable>
#include <mutex>
// Convenience type definition
using job = std::packaged_task<void()>;
// Some data associated to each thread.
struct thread_data
{
int id; // Could use thread::id, but this is filled before the thread is started
std::thread t; // The thread object
std::queue<job> jobs; // The job queue
std::condition_variable cv; // The condition variable to wait for threads
std::mutex m; // Mutex used for avoiding data races
bool stop = false; // When set, this flag tells the thread that it should exit
};
// The thread function executed by each thread
void thread_func(thread_data* pData)
{
std::unique_lock<std::mutex> l(pData->m, std::defer_lock);
while (true)
{
l.lock();
// Wait until the queue won't be empty or stop is signaled
pData->cv.wait(l, [pData] () {
return (pData->stop || !pData->jobs.empty());
});
// Stop was signaled, let's exit the thread
if (pData->stop) { return; }
// Pop one task from the queue...
job j = std::move(pData->jobs.front());
pData->jobs.pop();
l.unlock();
// Execute the task!
j();
}
}
// Function that creates a simple task
job create_task(int id, int jobNumber)
{
job j([id, jobNumber] ()
{
std::stringstream s;
s << "Hello " << id << "." << jobNumber << std::endl;
std::cout << s.str();
});
return j;
}
int main()
{
const int numThreads = 4;
const int numJobsPerThread = 10;
std::vector<std::future<void>> futures;
// Create all the threads (will be waiting for jobs)
thread_data threads[numThreads];
int tdi = 0;
for (auto& td : threads)
{
td.id = tdi++;
td.t = std::thread(thread_func, &td);
}
//=================================================
// Start assigning jobs to each thread...
for (auto& td : threads)
{
for (int i = 0; i < numJobsPerThread; i++)
{
job j = create_task(td.id, i);
futures.push_back(j.get_future());
std::unique_lock<std::mutex> l(td.m);
td.jobs.push(std::move(j));
}
// Notify the thread that there is work do to...
td.cv.notify_one();
}
// Wait for all the tasks to be completed...
for (auto& f : futures) { f.wait(); }
futures.clear();
//=================================================
// Here the main thread does something...
std::cin.get();
// ...done!
//=================================================
//=================================================
// Posts some new tasks...
for (auto& td : threads)
{
for (int i = 0; i < numJobsPerThread; i++)
{
job j = create_task(td.id, i);
futures.push_back(j.get_future());
std::unique_lock<std::mutex> l(td.m);
td.jobs.push(std::move(j));
}
// Notify the thread that there is work do to...
td.cv.notify_one();
}
// Wait for all the tasks to be completed...
for (auto& f : futures) { f.wait(); }
futures.clear();
// Send stop signal to all threads and join them...
for (auto& td : threads)
{
std::unique_lock<std::mutex> l(td.m);
td.stop = true;
td.cv.notify_one();
}
// Join all the threads
for (auto& td : threads) { td.t.join(); }
}
The concept you want is the threadpool. This SO question deals with existing implementations.
The idea is to have a container for a number of thread instances. Each instance is associated with a function which polls a task queue, and when a task is available, pulls it and run it. Once the task is over (if it terminates, but that's another problem), the thread simply loop over to the task queue.
So you need a synchronized queue, a thread class which implements the loop on the queue, an interface for the task objects, and maybe a class to drive the whole thing (the pool class).
Alternatively, you could make a very specialized thread class for the task it has to perform (with only the memory area as a parameter for instance). This requires a notification mechanism for the threads to indicate that they are done with the current iteration.
The thread main function would be a loop on that specific task, and at the end of one iteration, the thread signals its end, and wait on condition variables to start the next loop. In essence, you would be inlining the task code within the thread, dropping the need of a queue altogether.
using namespace std;
// semaphore class based on C++11 features
class semaphore {
private:
mutex mMutex;
condition_variable v;
int mV;
public:
semaphore(int v): mV(v){}
void signal(int count=1){
unique_lock lock(mMutex);
mV+=count;
if (mV > 0) mCond.notify_all();
}
void wait(int count = 1){
unique_lock lock(mMutex);
mV-= count;
while (mV < 0)
mCond.wait(lock);
}
};
template <typename Task>
class TaskThread {
thread mThread;
Task *mTask;
semaphore *mSemStarting, *mSemFinished;
volatile bool mRunning;
public:
TaskThread(Task *task, semaphore *start, semaphore *finish):
mTask(task), mRunning(true),
mSemStart(start), mSemFinished(finish),
mThread(&TaskThread<Task>::psrun){}
~TaskThread(){ mThread.join(); }
void run(){
do {
(*mTask)();
mSemFinished->signal();
mSemStart->wait();
} while (mRunning);
}
void finish() { // end the thread after the current loop
mRunning = false;
}
private:
static void psrun(TaskThread<Task> *self){ self->run();}
};
classcMyTask {
public:
MyTask(){}
void operator()(){
// some code here
}
};
int main(){
MyTask task1;
MyTask task2;
semaphore start(2), finished(0);
TaskThread<MyTask> t1(&task1, &start, &finished);
TaskThread<MyTask> t2(&task2, &start, &finished);
for (int i = 0; i < 10; i++){
finished.wait(2);
start.signal(2);
}
t1.finish();
t2.finish();
}
The proposed (crude) implementation above relies on the Task type which must provide the operator() (ie. a functor like class). I said you could incorporate the task code directly in the thread function body earlier, but since I don't know it, I kept it as abstract as I could. There's one condition variable for the start of threads, and one for their end, both encapsulated in semaphore instances.
Seeing the other answer proposing the use of boost::barrier, I can only support this idea: make sure to replace my semaphore class with that class if possible, the reason being that it is better to rely on well tested and maintained external code rather than a self implemented solution for the same feature set.
All in all, both approaches are valid, but the former gives up a tiny bit of performance in favor of flexibility. If the task to be performed takes a sufficiently long time, the management and queue synchronization cost becomes negligible.
Update: code fixed and tested. Replaced a simple condition variable by a semaphore.
It can easily be achieved using a barrier (just a convenience wrapper over a conditional variable and a counter). It basically blocks until all N threads have reached the "barrier". It then "recycles" again. Boost provides an implementation.
void myfunc(void * p, boost::barrier& start_barrier, boost::barrier& end_barrier) {
while (!stop_condition) // You'll need to tell them to stop somehow
{
start_barrier.wait ();
do_something(p);
end_barrier.wait ();
}
}
int main(){
void * myp[n_threads] {a_location, another_location,...};
boost::barrier start_barrier (n_threads + 1); // child threads + main thread
boost::barrier end_barrier (n_threads + 1); // child threads + main thread
std::thread mythread[n_threads];
for (unsigned int i=0; i < n_threads; i++) {
mythread[i] = std::thread(myfunc, myp[i], start_barrier, end_barrier);
}
start_barrier.wait (); // first unblock the threads
for (unsigned long int j=0; j < ULONG_MAX; j++) {
end_barrier.wait (); // mix_data must not execute before the threads are done
mix_data(myp);
start_barrier.wait (); // threads must not start new iteration before mix_data is done
}
return 0;
}
The following is a simple compiling and working code performing some random stuffs. It implements aleguna's concept of barrier. The task length of each thread is different so it is really necessary to have a strong synchronization mechanism. I will try to do a pool on the same tasks and benchmark the result, and then maybe with futures as pointed out by Andy Prowl.
#include <iostream>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <chrono>
#include <complex>
#include <random>
const unsigned int n_threads=4; //varying this will not (almost) change the total amount of work
const unsigned int task_length=30000/n_threads;
const float task_length_variation=task_length/n_threads;
unsigned int rep=1000; //repetitions of tasks
class t_chronometer{
private:
std::chrono::steady_clock::time_point _t;
public:
t_chronometer(): _t(std::chrono::steady_clock::now()) {;}
void reset() {_t = std::chrono::steady_clock::now();}
double get_now() {return std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - _t).count();}
double get_now_ms() {return
std::chrono::duration_cast<std::chrono::duration<double,std::milli>>(std::chrono::steady_clock::now() - _t).count();}
};
class t_barrier {
private:
std::mutex m_mutex;
std::condition_variable m_cond;
unsigned int m_threshold;
unsigned int m_count;
unsigned int m_generation;
public:
t_barrier(unsigned int count):
m_threshold(count),
m_count(count),
m_generation(0) {
}
bool wait() {
std::unique_lock<std::mutex> lock(m_mutex);
unsigned int gen = m_generation;
if (--m_count == 0)
{
m_generation++;
m_count = m_threshold;
m_cond.notify_all();
return true;
}
while (gen == m_generation)
m_cond.wait(lock);
return false;
}
};
using namespace std;
void do_something(complex<double> * c, unsigned int max) {
complex<double> a(1.,0.);
complex<double> b(1.,0.);
for (unsigned int i = 0; i<max; i++) {
a *= polar(1.,2.*M_PI*i/max);
b *= polar(1.,4.*M_PI*i/max);
*(c)+=a+b;
}
}
bool done=false;
void task(complex<double> * c, unsigned int max, t_barrier* start_barrier, t_barrier* end_barrier) {
while (!done) {
start_barrier->wait ();
do_something(c,max);
end_barrier->wait ();
}
cout << "task finished" << endl;
}
int main() {
t_chronometer t;
std::default_random_engine gen;
std::normal_distribution<double> dis(.0,1000.0);
complex<double> cpx[n_threads];
for (unsigned int i=0; i < n_threads; i++) {
cpx[i] = complex<double>(dis(gen), dis(gen));
}
t_barrier start_barrier (n_threads + 1); // child threads + main thread
t_barrier end_barrier (n_threads + 1); // child threads + main thread
std::thread mythread[n_threads];
unsigned long int sum=0;
for (unsigned int i=0; i < n_threads; i++) {
unsigned int max = task_length + i * task_length_variation;
cout << i+1 << "th task length: " << max << endl;
mythread[i] = std::thread(task, &cpx[i], max, &start_barrier, &end_barrier);
sum+=max;
}
cout << "total task length " << sum << endl;
complex<double> c(0,0);
for (unsigned long int j=1; j < rep+1; j++) {
start_barrier.wait (); //give to the threads the missing call to start
if (j==rep) done=true;
end_barrier.wait (); //wait for the call from each tread
if (j%100==0) cout << "cycle: " << j << endl;
for (unsigned int i=0; i<n_threads; i++) {
c+=cpx[i];
}
}
for (unsigned int i=0; i < n_threads; i++) {
mythread[i].join();
}
cout << "result: " << c << " it took: " << t.get_now() << " s." << endl;
return 0;
}