Efficiently waiting for all tasks in a threadpool to finish - c++

I currently have a program with x workers in my threadpool. During the main loop y tasks are assigned to the workers to complete, but after the tasks are sent out I must wait for all tasks to finish before proceeding with the program. I believe my current solution is inefficient; there must be a better way to wait for all tasks to finish, but I am not sure how to go about this
// called in main after all tasks are enqueued to
// std::deque<std::function<void()>> tasks
void ThreadPool::waitFinished()
{
while(!tasks.empty()) //check if there are any tasks in queue waiting to be picked up
{
//do literally nothing
}
}
More information:
threadpool structure
//worker thread objects
// Callable executed by each pool thread; only stores a reference back to
// the owning ThreadPool. The task-pulling loop lives in operator()
// (defined out of line, not shown here).
class Worker {
public:
    // Keep a reference to the pool so the thread can reach the shared
    // queue, mutex and condition variable.
    Worker(ThreadPool& s): pool(s) {}
    void operator()();  // thread entry point: pops and runs tasks
private:
    ThreadPool &pool;   // non-owning; the pool outlives its workers
};
//thread pool
// Owns the worker threads and the shared task queue. Workers (class
// Worker above) pull std::function<void()> jobs off `tasks`, synchronized
// by queue_mutex/condition; `stop` ends the workers at destruction time.
class ThreadPool {
public:
    ThreadPool(size_t);      // launch the given number of workers
    template<class F>
    void enqueue(F f);       // queue a job and wake one worker
    void waitFinished();     // block until the queue is drained
    ~ThreadPool();
private:
    friend class Worker;     // workers access the queue/sync members directly
    //keeps track of threads so we can join
    std::vector< std::thread > workers;
    //task queue
    std::deque< std::function<void()> > tasks;
    //sync
    std::mutex queue_mutex;            // guards `tasks` and `stop`
    std::condition_variable condition; // signals "work available / stopping"
    bool stop;
};
or here's a gist of my threadpool.hpp
example of what I want to use waitFinished() for:
while(running)
//....
for all particles alive
push particle position function to threadpool
end for
threadPool.waitFinished();
push new particle position data into openGL buffer
end while
so this way I can send hundreds of thousands of particle position tasks to be done in parallel, wait for them to finish and put the new data inside the openGL position buffers

This is one way to do what you're trying. Using two condition variables on the same mutex is not for the light-hearted unless you know what is going on internally. I didn't need the atomic processed member other than my desire to demonstrate how many items were finished between each run.
The sample workload function in this generates one hundred thousand random int values, then sorts them (gotta heat my office one way or another). waitFinished will not return until the queue is empty and no threads are busy.
#include <algorithm>
#include <atomic>
#include <condition_variable>
#include <cstdlib>
#include <deque>
#include <functional>
#include <iostream>
#include <iterator>
#include <mutex>
#include <random>
#include <thread>
#include <vector>
//thread pool
// Fixed-size pool with completion tracking: cv_task wakes workers when
// jobs arrive, cv_finished wakes waitFinished() when the queue is empty
// and no worker is mid-task. Both condition variables share queue_mutex.
class ThreadPool
{
public:
    ThreadPool(unsigned int n = std::thread::hardware_concurrency());
    template<class F> void enqueue(F&& f);   // queue a job, wake one worker
    void waitFinished();                     // block until all queued work completes
    ~ThreadPool();
    unsigned int getProcessed() const { return processed; }
private:
    std::vector< std::thread > workers;
    std::deque< std::function<void()> > tasks;
    std::mutex queue_mutex;              // guards tasks, busy and stop
    std::condition_variable cv_task;     // "work available / stopping"
    std::condition_variable cv_finished; // "queue drained, nothing in flight"
    std::atomic_uint processed;          // total tasks completed (stats only)
    unsigned int busy;                   // popped-but-unfinished tasks (guarded by queue_mutex)
    bool stop;                           // guarded by queue_mutex
    void thread_proc();
};
// Construct the pool and start n worker threads (default: one per
// hardware thread).
// Fixes vs. the original: the initializer list is now in declaration
// order (processed, busy, stop) — members are initialized in declaration
// order regardless, and the mismatch earns a -Wreorder warning — and the
// std::bind object is replaced by std::thread's native member-function
// constructor.
ThreadPool::ThreadPool(unsigned int n)
    : processed()
    , busy()
    , stop()
{
    workers.reserve(n);
    for (unsigned int i=0; i<n; ++i)
        workers.emplace_back(&ThreadPool::thread_proc, this);
}
// Shut the pool down: raise `stop`, wake every worker, join them all.
ThreadPool::~ThreadPool()
{
    // Set the stop flag under the lock so no worker can observe a stale
    // value between testing its wait predicate and blocking on cv_task.
    {
        std::unique_lock<std::mutex> latch(queue_mutex);
        stop = true;
    }
    // Notify after releasing the lock: woken threads can acquire the
    // mutex immediately instead of bouncing off one we still hold.
    cv_task.notify_all();
    // all threads observe stop with an empty queue and terminate; join.
    for (auto& t : workers)
        t.join();
}
// Worker loop: pop tasks and run them outside the lock.
// `busy` counts tasks popped but not yet finished, so waitFinished() can
// distinguish "queue empty" from "all work actually done".
void ThreadPool::thread_proc()
{
    while (true)
    {
        std::unique_lock<std::mutex> latch(queue_mutex);
        cv_task.wait(latch, [this](){ return stop || !tasks.empty(); });
        if (!tasks.empty())
        {
            // got work. set busy.
            ++busy;
            // move (not copy) the task out of the queue
            auto fn = std::move(tasks.front());
            tasks.pop_front();
            // release lock. run async
            latch.unlock();
            // run function outside context
            // NOTE(review): an exception escaping fn() terminates the
            // program (thread_proc has no handler) and leaves `busy`
            // incremented; wrap in try/catch if tasks may throw.
            fn();
            ++processed;
            latch.lock();
            --busy;
            cv_finished.notify_one();
        }
        else if (stop)
        {
            break;
        }
    }
}
// generic function push
// Accepts any callable convertible to std::function<void()>, perfect-
// forwards it into the queue, then wakes one worker.
template<class F>
void ThreadPool::enqueue(F&& f)
{
    {
        std::unique_lock<std::mutex> lock(queue_mutex);
        tasks.emplace_back(std::forward<F>(f));
    }
    // Notify after unlocking so the woken worker doesn't immediately
    // block on the mutex we would otherwise still hold.
    cv_task.notify_one();
}
// waits until the queue is empty.
// More precisely: blocks until the queue is empty AND no worker is
// mid-task (busy == 0), i.e. every task enqueued so far has completed.
void ThreadPool::waitFinished()
{
    std::unique_lock<std::mutex> lock(queue_mutex);
    cv_finished.wait(lock, [this](){ return tasks.empty() && (busy == 0); });
}
// a cpu-busy task.
// Generates kCount random ints and sorts them descending — pure busy
// work used to exercise the pool; the result is discarded.
void work_proc()
{
    std::random_device rd;
    std::mt19937 rng(rd());
    // build a vector of random numbers
    constexpr std::size_t kCount = 100000;
    std::vector<int> data;
    data.reserve(kCount);
    // Generate exactly kCount values. The original passed
    // data.capacity(), but reserve() only guarantees capacity is *at
    // least* the request — the count could silently differ.
    std::generate_n(std::back_inserter(data), kCount, [&](){ return rng(); });
    std::sort(data.begin(), data.end(), std::greater<int>());
}
// Drive the pool: five batches of 100 tasks, printing the cumulative
// completed-task count after each batch drains.
int main()
{
    ThreadPool tp;
    constexpr int kBatches = 5;
    constexpr int kTasksPerBatch = 100;
    for (int batch = 0; batch < kBatches; ++batch)
    {
        // queue one batch of work
        for (int task = 0; task < kTasksPerBatch; ++task)
            tp.enqueue(work_proc);
        // block until the batch has fully completed
        tp.waitFinished();
        std::cout << tp.getProcessed() << '\n';
    }
    // destructor will close down thread pool
    return EXIT_SUCCESS;
}
Output
100
200
300
400
500
Best of luck.

Related

Thread Pool join hangs when oversubscribing threads

I'm having an issue with a thread hanging when joining my threads in a thread pool I have created. The issue only occurs if I loop over the thread pool execution a large number of times.
I have a thread pool class like the following;
#include <queue>
#include <mutex>
#include <condition_variable>
#include <functional>
#include <atomic>
#include <vector>
#include <thread>
#include <iostream>
// Simple fixed-size thread pool.
//
// Bug fix vs. the original (the reported join() hang): waitFinished()
// stored m_shutdown *without* holding m_jobMutex. A worker could evaluate
// its wait predicate (queue empty, shutdown false) while holding the
// mutex, then the main thread would store the flag and call notify_all()
// before the worker actually blocked — the notification was lost and the
// worker slept forever, so join() never returned. Setting the flag under
// the same mutex the workers wait on closes that window.
class ThreadPool
{
public:
    // Pool with a single worker.
    ThreadPool()
    {
        m_shutdown.store(false, std::memory_order_relaxed);
        createThreads(1);
    }
    // Pool with numThreads workers.
    ThreadPool(std::size_t numThreads)
    {
        m_shutdown.store(false, std::memory_order_relaxed);
        createThreads(numThreads);
    }
    // Join any workers still running, so ~thread never fires
    // std::terminate if waitFinished() was not called.
    ~ThreadPool()
    {
        if (!m_threads.empty())
            waitFinished();
    }
    // Queue a job and wake one worker.
    void add_job(std::function<void()> new_job)
    {
        {
            std::scoped_lock<std::mutex> lock(m_jobMutex);
            m_jobQueue.push(std::move(new_job));
        }
        m_notifier.notify_one();
    }
    // Wait for the queue to drain, then shut the workers down and join.
    void waitFinished()
    {
        {
            std::unique_lock<std::mutex> lock(m_jobMutex);
            m_finished.wait(lock, [this] { return m_jobQueue.empty(); });
            // FIX: store the flag while holding m_jobMutex so no worker
            // can miss the shutdown notification (see class comment).
            m_shutdown.store(true, std::memory_order_relaxed);
        }
        m_notifier.notify_all();
        for (std::thread& th : m_threads)
        {
            th.join();
        }
        m_threads.clear();
    }
private:
    using Job = std::function<void()>;
    std::vector<std::thread> m_threads;
    std::queue<Job> m_jobQueue;
    std::condition_variable m_notifier;  // workers: "job available / shutdown"
    std::condition_variable m_finished;  // waitFinished: "queue drained"
    std::mutex m_jobMutex;               // guards m_jobQueue and the shutdown handshake
    std::atomic<bool> m_shutdown;

    void createThreads(std::size_t numThreads)
    {
        m_threads.reserve(numThreads);
        // size_t index: no signed/unsigned comparison warning
        for (std::size_t i = 0; i != numThreads; ++i)
        {
            m_threads.emplace_back([this]()
            {
                // Worker loop: consume tasks until shutdown.
                while (true)
                {
                    Job job;
                    {
                        std::unique_lock<std::mutex> lock(m_jobMutex);
                        m_notifier.wait(lock, [this] { return !m_jobQueue.empty() || m_shutdown.load(std::memory_order_relaxed); });
                        if (m_shutdown.load(std::memory_order_relaxed) || m_jobQueue.empty())
                        {
                            break;
                        }
                        job = std::move(m_jobQueue.front());
                        m_jobQueue.pop();
                    }
                    job();  // run outside the lock
                    m_finished.notify_one();
                }
            });
        }
    }
};
I run this in a simple manner, like the following;
// Minimal work item: prints its index (stdout is its only effect).
void threader (int x) {
    std::cout<<"In threaded function: "<<x<<std::endl;
}
// Stress test: repeatedly build a pool, push 48 trivial jobs, wait.
int main()
{
    //outer loop
    for (auto i = 0; i < 10000; i++) {
        //Thread pool
        int num_threads = std::thread::hardware_concurrency();
        ThreadPool test_pool(num_threads);
        // Assign work
        for (int j = 0; j < 48; j++) {
            test_pool.add_job(std::bind(threader, j));
        }
        // waitFinished() both drains the queue and joins the workers, so
        // each iteration tears the pool fully down before the next one.
        test_pool.waitFinished();
        std::cout<<"Thread Pool Done"<<std::endl;
    }
}
After a number of outer loop iterations the join in waitFinished hangs for a thread. The error only seems to occur once the number of outer loop iterations is large enough. I have investigated this and can see that the threader function gets called 48 times, so it looks like all the jobs complete. It seems to be the joining of threads in the waitFinished function of the ThreadPool that is causing the hang.
Is there something obvious I'm doing wrong ?
Many thanks!

clearing a grid with multhithreading

I am trying to clear a (game) grid, but whenever I multithread it, the time it takes to clear the grid increases with 3 seconds.
To my own logic this should not be the case since each Y value of the array hold a lot of X values (the X values store a class) which then should iterate through a so called objects property and perform objects.clear() on it, which also iterates through every element.
My code:
// Pool sized at 2x the hardware thread count (deliberate oversubscription).
// NOTE(review): `pool` is allocated with raw new and never deleted, and
// `threads` is a mutable global reused across calls — a std::unique_ptr
// (or a static object) and a local future list would be safer.
const int NUM_OF_AVAIL_THREADS = std::thread::hardware_concurrency() * 2;
ThreadPool* pool = new ThreadPool(NUM_OF_AVAIL_THREADS);
vector<future<void>> threads;
// Empties the `objects` list of every tile, one grid row per pool task,
// then blocks until all rows are done.
void Terrain::clear_grid()
{
    for (int y = 0; y < tiles.size(); y++)
    {
        // FIX: capture the row index BY VALUE ([&, y]). The original
        // captured everything by reference, so every task read `y`
        // through a reference while this loop kept incrementing it —
        // rows could be skipped or cleared twice, and a task could even
        // observe y == tiles.size() after the loop finished, making
        // tiles.at(y) throw std::out_of_range.
        threads.push_back(pool->enqueue([&, y]()
        {
            array<TerrainTile, terrain_width>& h = tiles.at(y);
            for (int x = 0; x < h.size(); x++)
            {
                h.at(x).objects.clear();
            }
        }));
    }
    pool->wait_and_clear_threads(threads);
}
TerrainTile looks like this:
class TerrainTile
{
public:
//TerrainTile *up, *down, *left, *right;
vector<TerrainTile*> exits;
bool visited = false;
size_t position_x;
size_t position_y;
TileType tile_type;
vector<TerrainTile*> neighbors;
vector<MovingAsset*> objects;
vector<Tank*> tanks;
vector<MovingAsset*> beams;
vector<MovingAsset*> get_collidable_assets();
void add_collidable_assets(MovingAsset* asset);
void add_neighbor(TerrainTile* neighbor);
};
How the tiles array looks like:
static constexpr size_t terrain_width = 80;
static constexpr size_t terrain_height = 45;
std::array<std::array<TerrainTile, terrain_width>, terrain_height> tiles;
am I missing out on something crucial here, or does the cost of creating a thread simply outweigh the time it takes to iterate through the arrays?
EDIT: THIS IS THE THREADPOOL
#pragma once
namespace Tmpl8
{
class ThreadPool; //Forward declare
class Worker;
// Callable run by each pool thread; pulls tasks from the owning pool.
class Worker
{
public:
    //Instantiate the worker class by passing and storing the threadpool as a reference
    Worker(ThreadPool& s) : pool(s) {}
    inline void operator()();  // task loop, defined below the pool
private:
    ThreadPool& pool;          // non-owning back-reference
};
// Packaged-task pool: enqueue() wraps any nullary callable and returns a
// future for its result; Worker::operator() (below) drains `tasks`.
//
// Fix vs. the original: the destructor wrote `stop` without holding
// queue_mutex. Workers read `stop` under that mutex inside their wait
// predicate, so the unlocked write was (a) a data race and (b) could land
// between a worker's predicate check and its block on `condition`, in
// which case the notify_all was missed and the join below hung.
class ThreadPool
{
public:
    // Launch numThreads workers, each running Worker::operator()().
    ThreadPool(size_t numThreads) : stop(false)
    {
        for (size_t i = 0; i < numThreads; ++i)
            workers.push_back(std::thread(Worker(*this)));
    }
    ~ThreadPool()
    {
        // FIX: set the flag under the same mutex the workers wait on.
        {
            std::unique_lock<std::mutex> lock(queue_mutex);
            stop = true; // stop all threads
        }
        condition.notify_all();
        for (auto& thread : workers)
            thread.join();
    }
    // Block until every future in `threads` is ready, then drop them all.
    void wait_and_clear_threads(vector<future<void>>& threads)
    {
        for (future<void>& t : threads)
        {
            t.wait();
        }
        threads.clear();
    }
    template <class T>
    auto enqueue(T task) -> std::future<decltype(task())>
    {
        //Wrap the function in a packaged_task so we can return a future object
        auto wrapper = std::make_shared<std::packaged_task<decltype(task())()>>(std::move(task));
        //Scope to restrict critical section
        {
            //lock our queue and add the given task to it
            std::unique_lock<std::mutex> lock(queue_mutex);
            tasks.push_back([=]
            {
                (*wrapper)();
            });
        }
        //Wake up a thread to start this task
        condition.notify_one();
        return wrapper->get_future();
    }
private:
    friend class Worker; //Gives access to the private variables of this class
    std::vector<std::thread> workers;
    std::deque<std::function<void()>> tasks;
    std::condition_variable condition; //Wakes up a thread when work is available
    std::mutex queue_mutex; //Lock for our queue
    bool stop = false; // guarded by queue_mutex
};
// Thread entry point: repeatedly pull one task off the shared queue and
// execute it. Terminates once the pool's `stop` flag is raised.
inline void Worker::operator()()
{
    std::function<void()> task;
    while (true)
    {
        //Scope to restrict critical section
        //This is important because we don't want to hold the lock while executing the task,
        //because that would make it so only one task can be run simultaneously (aka sequential)
        {
            std::unique_lock<std::mutex> locker(pool.queue_mutex);
            //Wait until some work is ready or we are stopping the threadpool
            //Because of spurious wakeups we need to check if there is actually a task available or we are stopping
            pool.condition.wait(locker, [=] { return pool.stop || !pool.tasks.empty(); });
            // NOTE(review): this breaks even when tasks remain queued, so
            // pending tasks are silently dropped at shutdown.
            if (pool.stop) break;
            task = pool.tasks.front();
            pool.tasks.pop_front();
        }
        task(); // run outside the lock so workers execute in parallel
    }
}
} // namespace Tmpl8

C++ function does not exit at the end of main after return 0

I was implementing a multi-threading database using a thread pool. Everything was fine and the functions can execute all the codes before return 0.
However, the function does not end after the return 0 in main(). I used _Exit(0) to force an exit, which is rather nasty. I ran it with valgrind and there is no memory leak up to the end of the function.
As you might see in the picture, all the threads are detached, so I shouldn't be running into trouble with threads. But what could go wrong such that the program can't stop?
auto rwfunc = [](int &id,struct rwinfo &_rwinfo){
Qtable.vec_qpvec[_rwinfo.tableid][id].iswriter?
Writer(id,_rwinfo):Reader(id,_rwinfo);};
//my lambda function to insert my function into the thread
this_thread::yield();
if (COPYFLAG){
for (unsigned int i = 0; i < Qtable.tablenum; ++i) {
for (int j = 0; j < info_vec[i].vecsize; ++j) {
pool.push(rwfunc,j,info_vec[i]);
}
}
}
//pushing function into the pool
Minimal reproducible example Definition and Thread pool
#include <getopt.h>
#include <fstream>
#include <iostream>
#include <string>
#include <mutex>
#include <thread>
#include <condition_variable>
#include <sstream>
#include <iostream>
#include <semaphore.h>
#include <queue>
#include <functional>
//#pragma once
#include<thread>
#include<vector>
#include<queue>
#include<mutex>
#include<condition_variable>
#include<functional>
#include<future>
//////////////////////////
// NOTE(review): the macro expansion ends in ';', so `int n = MAX_THREADS;`
// silently compiles to a statement followed by an empty statement.
#define MAX_THREADS std::thread::hardware_concurrency() - 1;
// NOTE(review): plain bools written by main and read by detached pool
// threads without synchronization — formally a data race; these should
// be std::atomic<bool>.
bool EXITFLAG = false;
bool COPYFLAG = false;
//portable way to null the copy and assignment operators
#define NULL_COPY_AND_ASSIGN(T) \
T(const T& other) {(void)other;} \
void operator=(const T& other) { (void)other; }
/* ThreadPool class
It is a singleton. To prevent spawning
tons of threads, I made it a singleton */
// Workers run threadManager() until shutdown. Fixes vs. the original:
//  * the wait predicate now also tests EXITFLAG, so workers can actually
//    be woken for shutdown instead of sleeping on the condition variable
//    forever — the reason the process could not exit after `return 0`;
//  * waitfinish() performs a real shutdown handshake (flag + notify_all)
//    instead of looping over nothing;
//  * jobs are executed AFTER releasing JobMutex. The original ran
//    execute() while holding the lock, which serialized every job and is
//    why the pool was fastest with a single thread.
class ThreadPool{
public:
    //getInstance to allow the second constructor to be called
    static ThreadPool& getInstance(int numThreads){
        static ThreadPool instance(numThreads);
        return instance;
    }
    // Signal the workers to exit and wake them all. The threads are
    // detached, so they cannot be joined; we only drop our handles.
    void waitfinish(){
        {
            std::unique_lock<std::mutex> lock(JobMutex);
            EXITFLAG = true; // set under the queue mutex so no worker misses it
        }
        thread.notify_all();
        Pool.clear();
    }
    //add any arg # function to queue
    template <typename Func, typename... Args >
    inline auto push(Func&& f, Args&&... args){
        //get return type of the function
        typedef decltype(f(args...)) retType;
        //package the task
        std::packaged_task<retType()> task(std::move(std::bind(f, args...)));
        // lock jobqueue mutex, add job to the job queue
        std::unique_lock<std::mutex> lock(JobMutex);
        //get the future from the task before the task is moved into the jobqueue
        std::future<retType> future = task.get_future();
        //place the job into the queue
        JobQueue.emplace( std::make_shared<AnyJob<retType> > (std::move(task)) );
        //notify a thread that there is a new job
        thread.notify_one();
        //return the future for the function so the user can get the return value
        return future;
    }
    inline int getThreadCount(){
        return numThreads;
    }
private:
    //used polymorphism to store any type of function in the job queue
    class Job {
    private:
        std::packaged_task<void()> func;
    public:
        virtual ~Job() {}
        virtual void execute() = 0;
    };
    template <typename RetType>
    class AnyJob : public Job {
    private:
        std::packaged_task<RetType()> func;
    public:
        AnyJob(std::packaged_task<RetType()> func) : func(std::move(func)) {}
        void execute() {
            func();
        }
    };
    // end member classes
    //member variables
    int numThreads; // number of threads in the pool
    std::vector<std::thread> Pool; //the actual thread pool
    std::queue<std::shared_ptr<Job>> JobQueue;
    std::condition_variable thread;// used to notify threads about available jobs
    std::mutex JobMutex; // used to push/pop jobs to/from the queue
    //end member variables
    /* infinite loop function */
    inline void threadManager() {
        while (true) {
            std::unique_lock<std::mutex> lock(JobMutex);
            // FIX: wait for "job available OR shutting down"; the original
            // predicate only tested the queue, so a parked worker could
            // never be released by EXITFLAG.
            thread.wait(lock, [this] {return !JobQueue.empty() || EXITFLAG; });
            if (EXITFLAG)
                break;
            if (JobQueue.empty())
                continue;
            // FIX: pop the job, release the lock, THEN execute, so jobs
            // actually run in parallel.
            auto job = std::move(JobQueue.front());
            JobQueue.pop();
            lock.unlock();
            job->execute();
        }
        std::cerr<<"thread end!"<<std::endl;
    }
    /* Constructors */
    ThreadPool(); //prevent default constructor from being called
    //real constructor that is used
    inline explicit ThreadPool(int _numThreads) : numThreads(_numThreads) {
        int tmp = MAX_THREADS;
        if(numThreads > tmp){
            numThreads = tmp;
        }
        Pool.reserve(numThreads);
        std::cerr<<"Thread pool core num: "<<numThreads<<std::endl;
        for(int i = 0; i != numThreads; ++i){
            Pool.emplace_back(std::thread(&ThreadPool::threadManager, this));
            // NOTE(review): detached threads can outlive main and touch
            // this singleton during static destruction; prefer keeping
            // them joinable and joining in waitfinish().
            Pool.back().detach();
        }
    }
    /* end constructors */
    NULL_COPY_AND_ASSIGN(ThreadPool);
}; /* end ThreadPool Class */
using namespace std;
// Shared by all workers; id_mtx serializes console output and COUNTER.
int COUNTER = 0;
mutex id_mtx;
// Per-task context: carries (by reference) the semaphore that main
// blocks on until enough jobs have completed.
struct rwinfo{
    sem_t &FINISHED;
    rwinfo(sem_t &finished):
        FINISHED(finished)
    {}
};
// One unit of "work": logs, bumps the global COUNTER, posts FINISHED
// once 10 jobs are done, then simulates half a second of processing.
void work_todo(int &id,struct rwinfo &_rwinfo){
    {
        // RAII lock instead of manual lock()/unlock(): can't be leaked
        // if an insertion below throws.
        std::lock_guard<std::mutex> guard(id_mtx);
        cout<<"Job "<<id<<" is done."<<endl;
        COUNTER++;
        cerr<<"COUNTER is now : "<<COUNTER<<endl;
        if (COUNTER==10){
            sem_post(&_rwinfo.FINISHED);
        }
    }
    // FIX: sleep AFTER releasing the mutex. The original slept while
    // holding id_mtx, serializing every worker behind a single 500 ms
    // nap — exactly why the pool ran fastest with one thread.
    std::this_thread::sleep_for(500ms);
}
// Global singleton pool (8 requested; clamped to MAX_THREADS inside).
ThreadPool& pool = ThreadPool::getInstance(8);
int main(int argc, char *argv[]) {
    std::ios_base::sync_with_stdio(false);
    // Semaphore posted by the 10th completed job (see work_todo).
    sem_t FINISHED;
    sem_init(&FINISHED,1,0);
    auto mylambdafunc = [](int &i,struct rwinfo &_rwinfo){work_todo(i,_rwinfo);};
    auto A = rwinfo(FINISHED);
    for (int i = 0; i < 10; ++i) {
        pool.push(mylambdafunc,i,A);
    }
    cerr<<"Start waiting"<<endl;
    sem_wait(&FINISHED);
    cerr<<"wake up"<<endl;
    // NOTE(review): nothing notifies the pool's condition variable after
    // this flag is set, so detached workers blocked in wait() never
    // observe it and can still be parked during static destruction —
    // the likely cause of the hang after `return 0`. Calling
    // pool.waitfinish() here (or notifying the cv after setting the
    // flag) is the fix.
    EXITFLAG = true;
    cerr<<"Leaving"<<endl;
    return 0;
}
Main
// Duplicate of the main() shown above (posted twice in the question).
int main(int argc, char *argv[]) {
    std::ios_base::sync_with_stdio(false);
    // Semaphore posted by the 10th completed job (see work_todo).
    sem_t FINISHED;
    sem_init(&FINISHED,1,0);
    auto mylambdafunc = [](int &i,struct rwinfo &_rwinfo){work_todo(i,_rwinfo);};
    auto A = rwinfo(FINISHED);
    for (int i = 0; i < 10; ++i) {
        pool.push(mylambdafunc,i,A);
    }
    cerr<<"Start waiting"<<endl;
    sem_wait(&FINISHED);
    cerr<<"wake up"<<endl;
    // NOTE(review): setting the flag without notifying the pool's
    // condition variable never wakes parked workers (see note above).
    EXITFLAG = true;
    cerr<<"Leaving"<<endl;
    return 0;
}
This example can sometimes get stuck and sometimes return correctly. I guess this is because there are far fewer jobs here and each job is a lot faster.
Also,another question. I was writing a mutithreading database. Some queries will change the data table and some will not. I treated it as the classic reader writer problem and queued every query as a job into the job queue of the thread pool.
The strange thing is that, The program runs actually fastest when there is only 1 thread in the thread pool. When it has 8 threads, it works a lot slower. I'm sure that the thread number in the thread pool is at most std::thread::hardware_concurrency() -1
What could be the possible reason?

C++ STL Producer multiple consumer where producer waits for free consumer before producing next value

My little consumer-producer problem had me stumped for some time. I didn't want an implementation where one producer pushes some data round-robin to the consumers, filling up their queues of data respectively.
I wanted to have one producer, x consumers, but the producer waits with producing new data until a consumer is free again. In my example there are 3 consumers so the producer creates a maximum of 3 objects of data at any given time. Since I don't like polling, the consumers were supposed to notify the producer when they are done. Sounds simple, but the solution I found doesn't please me. First the code.
#include "stdafx.h"
#include <mutex>
#include <iostream>
#include <future>
#include <map>
#include <atomic>
// Global tally of items consumed across all consumer threads.
std::atomic_int totalconsumed;
// Produces up to 3 concurrent consumer tasks; consumers call notify()
// when done so the producer can launch the next value.
class producer {
    using runningmap_t = std::map<int, std::pair<std::future<void>, bool>>;
    // Secure the map of futures.
    std::mutex mutex_;
    runningmap_t running_;   // value -> (future, finished?)
    // Used for finished notification
    std::mutex waitermutex_;
    std::condition_variable waiter_;
    // The magic number to limit the producer.
    std::atomic<int> count_; // consumers currently in flight
    bool can_run();
    void clean();
    // Fake a source, e.g. filesystem scan.
    int fakeiter;
    int next();
    bool has_next() const;
public:
    producer() : fakeiter(50) {}
    void run();              // main production loop (blocks until source drained)
    void notify(int value);  // called by consumers on completion
    void wait();             // block until every consumer finished
};
// Simulates consuming one value (42 ms of "work"), then reports back so
// the producer can reuse the slot.
class consumer {
    producer& producer_;
public:
    consumer(producer& producer) : producer_(producer) {}
    void run(int value) {
        std::this_thread::sleep_for(std::chrono::milliseconds(42));
        std::cout << "Consumed " << value << " on (" << std::this_thread::get_id() << ")" << std::endl;
        totalconsumed++;
        producer_.notify(value);  // free our slot and wake the producer
    }
};
// Only if less than three threads are active, another gets to run.
bool producer::can_run() { return count_.load() < 3; }
// Verify if there's something to consume
bool producer::has_next() const { return 0 != fakeiter; }
// Produce the next value for consumption.
// Pre-decrements, so the values handed out run 49 down to 0.
int producer::next() { return --fakeiter; }
// Remove the futures that have reported to be finished.
// Caller must hold mutex_ (both run() and wait() do).
void producer::clean()
{
    for (auto it = running_.begin(); it != running_.end(); ) {
        if (it->second.second) {
            it = running_.erase(it);  // erase returns the next valid iterator
        }
        else {
            ++it;
        }
    }
}
// Runs the producer. Creates a new consumer for every produced value. Max 3 at a time.
void producer::run()
{
while (has_next()) {
if (can_run()) {
auto c = next();
count_++;
auto future = std::async(&consumer::run, consumer(*this), c);
std::unique_lock<std::mutex> lock(mutex_);
running_[c] = std::make_pair(std::move(future), false);
clean();
}
else {
std::unique_lock<std::mutex> lock(waitermutex_);
waiter_.wait(lock);
}
}
}
// Consumers diligently tell the producer that they are finished.
void producer::notify(int value)
{
count_--;
mutex_.lock();
running_[value].second = true;
mutex_.unlock();
std::unique_lock<std::mutex> waiterlock(waitermutex_);
waiter_.notify_all();
}
// Wait for all consumers to finish.
// Polls until every future has been reaped.
// FIX: the emptiness test now happens under mutex_ — the original read
// running_.empty() without the lock, racing with notify()'s writes to
// the same map.
void producer::wait()
{
    for (;;) {
        {
            std::lock_guard<std::mutex> guard(mutex_);
            clean();
            if (running_.empty())
                return;
        }
        // sleep outside the lock so consumers can report in meanwhile
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }
}
// Looks like the application entry point.
int main()
{
    producer p;
    std::thread pthread(&producer::run, &p);
    // run() returns once every value has been handed to a consumer...
    pthread.join();
    // ...and wait() then reaps the still-running consumer futures.
    p.wait();
    std::cout << std::endl << std::endl << "Total consumed " << totalconsumed.load() << std::endl;
    return 0;
}
The part I don't like is the list of values mapped to the futures, called running_. I need to keep the future around until the consumer is actually done. I can't remove the future from the map in the notify method or else I'll kill the thread that is currently calling notify.
Am I missing something that could simplify this construct?
// A value of type T tagged with the index I of the slot it occupies.
template<class T>
struct slotted_data {
    std::size_t I;
    T t;
};
// A consumer callback: anything that accepts a T.
template<class T>
using sink = std::function<void(T)>;
// N-slot mailbox. produce() fills a specific empty slot; consume() blocks
// until *any* slot is full and hands the value to a callback while still
// holding the lock (continuation-passing style), making consumption
// atomic. finish() shuts the whole thing down.
template<class T, std::size_t N>
struct async_slots {
    // Deposit data into slot data.I. Returns false (and drops the data)
    // when terminated, the index is out of range, or the slot is
    // occupied. NOTE(review): `terminate` is read here without the lock;
    // a stale read only delays the refusal by one call.
    bool produce( slotted_data<T> data ) {
        if (terminate || data.I>=N) return false;
        {
            auto l = lock();
            if (slots[data.I]) return false;
            slots[data.I] = std::move(data.t);
        }
        cv.notify_one();
        return true;
    }
    // rare use of non-lambda cv.wait in the wild!
    // Scan for a full slot; if one is found, empty it and invoke f UNDER
    // the lock, otherwise sleep on cv and rescan. Returns false only
    // after finish() with nothing left to hand out.
    bool consume(sink<slotted_data<T>> f) {
        auto l = lock();
        while(!terminate) {
            for (auto& slot:slots) {
                if (slot) {
                    auto r = std::move(*slot);
                    slot = std::nullopt;
                    // slot index reconstructed from the element address
                    f({std::size_t(&slot-slots.data()), std::move(r)}); // invoke in lock
                    return true;
                }
            }
            cv.wait(l);
        }
        return false;
    }
    // easier and safer version:
    // Same as above, but returns the value instead of calling back.
    std::optional<slotted_data<T>> consume() {
        std::optional<slotted_data<T>> r;
        bool worked = consume([&](auto&& data) { r = std::move(data); });
        if (!worked) return {};
        return r;
    }
    // Wake all waiters and refuse any further produce/consume.
    void finish() {
        {
            auto l = lock();
            terminate = true;
        }
        cv.notify_all();
    }
private:
    auto lock() { return std::unique_lock<std::mutex>(m); }
    std::mutex m;
    std::condition_variable cv;
    std::array< std::optional<T>, N > slots;  // empty optional == free slot
    bool terminate = false;                   // guarded by m (see produce)
};
async_slots provides a fixed number of slots and an awaitable consume. If you try to produce two things in the same slot, the producer function returns false and ignores you.
consume invokes the sink of the data inside the mutex in a continuation passing style. This permits atomic consumption.
We want to invert producer and consumer:
// Inversion of async_slots: the stored T is itself a callback. A
// consumer parks a "feed me" sink in its own slot and blocks; the
// producer pops any parked sink and feeds it one value, blocking (inside
// slots.consume) until some consumer is waiting.
template<class T, std::size_t N>
struct slotted_consumer {
    // Park in slot I and wait for the producer to deliver, then pass the
    // value to `sink`. The local mutex/cv/optional trio is a one-shot
    // rendezvous: the lambda stored in the slot runs on the *producer*
    // thread, while `sink` itself runs here — outside async_slots'
    // internal mutex, as required.
    bool consume( std::size_t I, sink<T> sink ) {
        std::optional<T> data;
        std::condition_variable cv;
        std::mutex m;
        bool worked = slots.produce(
            {
                I,
                [&](auto&& t){
                    {
                        std::unique_lock<std::mutex> l(m);
                        data.emplace(std::move(t));
                    }
                    cv.notify_one();
                }
            }
        );
        if (!worked) return false;  // shut down or slot already taken
        std::unique_lock<std::mutex> l(m);
        cv.wait(l, [&]()->bool{
            return (bool)data;
        });
        sink( std::move(*data) );
        return true;
    }
    // Hand one value to whichever consumer is parked; blocks until one is.
    bool produce( T t ) {
        return slots.consume(
            [&](auto&& f) {
                // f is slotted_data<sink<T>>; f.t is the consumer's callback
                f.t( std::move(t) );
            }
        );
    }
    void finish() {
        slots.finish();
    }
private:
    async_slots< sink<T>, N > slots;  // each slot holds a waiting consumer's callback
};
we have to take some care to execute sink in a context where we are not holding the mutex of async_slots, which is why consume above is so strange.
Live example.
You share a slotted_consumer< int, 3 > slots. The producing thread repeatedly calls slots.produce(42);. It blocks until a new consumer lines up.
Consumer #2 calls slots.consume( 2, [&](int x){ /* code to consume x */ } ), and #1 and #0 pass their slot numbers as well.
All 3 consumers can be waiting for the next production. The above system defaults to feeding #0 first if it is waiting for more work; we could make it "fair" at a cost of keeping a bit more state.

How to use the same thread pool batch by batch

I found a good implementation of boost based thread pool which is an improvement over this and this . it is very easy to understand and test. It looks like this:
#include <boost/thread/thread.hpp>
#include <boost/asio.hpp>
// the actual thread pool
// the actual thread pool
// io_service-based pool: the `work` guard keeps io_service::run() from
// returning while the queue is empty; each thread in `threads` runs
// io_service::run() and executes posted handlers.
struct ThreadPool {
    ThreadPool(std::size_t);
    template<class F>
    void enqueue(F f);
    ~ThreadPool();
    // the io_service we are wrapping
    boost::asio::io_service io_service;
    // dont let io_service stop
    boost::shared_ptr<boost::asio::io_service::work> work;
    //the threads
    boost::thread_group threads;
};
// the constructor just launches some amount of workers
ThreadPool::ThreadPool(size_t nThreads)
    :io_service()
    ,work(new boost::asio::io_service::work(io_service)) // keeps run() from returning while idle
{
    for ( std::size_t i = 0; i < nThreads; ++i ) {
        threads.create_thread(boost::bind(&boost::asio::io_service::run, &io_service));
    }
}
// add new work item to the pool
// Any copyable nullary callable; runs on whichever pool thread is free.
template<class F>
void ThreadPool::enqueue(F f) {
    io_service.post(f);
}
// the destructor joins all threads
// NOTE(review): despite the comment, nothing here joins `threads`.
// Resetting `work` lets the workers' run() calls return once the queue
// drains, and the extra io_service.run() drains remaining handlers on
// *this* thread; the pool threads themselves are left to thread_group's
// destructor. A threads.join_all() after work.reset() would be the
// explicit fix — verify against the Boost.Asio docs.
ThreadPool::~ThreadPool() {
    work.reset();
    io_service.run();
}
//tester:
// Sample task: prints, sleeps 300 ms, prints again.
void f(int i)
{
    std::cout << "hello " << i << std::endl;
    boost::this_thread::sleep(boost::posix_time::milliseconds(300));
    std::cout << "world " << i << std::endl;
}
//it can be tested via:
int main() {
    // create a thread pool of 4 worker threads
    ThreadPool pool(4);
    // queue a bunch of "work items"
    for( int i = 0; i < 8; ++i ) {
        std::cout << "task " << i << " created" << std::endl;
        pool.enqueue(boost::bind(&f,i));
    }
    // ~ThreadPool (work.reset + run) drains the queue before main exits
}
g++ ThreadPool-4.cpp -lboost_system -lboost_thread
Now the question:
I need to know how I can modify the implementation to be able to use this thread pool batch by batch- only when the first set of my work is fully completed by the thread pool, I need to supply the second set and so on. I tried to play with .run() and .reset() (found in the destructor) between the batch jobs but no luck:
//adding methods to the tread pool :
//reset the asio work and thread
// NOTE(review): this *adds* nThreads new threads to the existing
// thread_group without removing the old (already-exited) thread objects
// — which is why the group size reported later is 8, not 4.
void ThreadPool::reset(size_t nThreads){
    work.reset(new boost::asio::io_service::work(io_service));
    for ( std::size_t i = 0; i < nThreads; ++i ) {
        threads.create_thread(boost::bind(&boost::asio::io_service::run, &io_service));
    }
    std::cout << "group size : " << threads.size() << std::endl;
}
//join, and even , interrupt
// NOTE(review): join_all() runs before interrupt_all(), so the interrupt
// can only ever reach threads that have already terminated — it is a
// no-op here.
void ThreadPool::joinAll(){
    threads.join_all();
    threads.interrupt_all();
}
//tester
int main() {
    // create a thread pool of 4 worker threads
    ThreadPool pool(4);
    // queue a bunch of "work items"
    for( int i = 0; i < 20; ++i ) {
        std::cout << "task " << i << " created" << std::endl;
        pool.enqueue(boost::bind(&f,i));
    }
    //here i play with the asio work , io_service and and the thread group
    pool.work.reset();
    pool.io_service.run();
    std::cout << "after run" << std::endl;
    pool.joinAll();
    std::cout << "after join all" << std::endl;
    // NOTE(review): once run() has returned, the io_service is in the
    // "stopped" state. The fresh threads created below enter run(), see
    // the stopped flag and return immediately — the likely reason the
    // second batch never executes. io_service.reset() must be called
    // before run() is invoked again; verify against the Boost.Asio docs.
    pool.reset(4);
    std::cout << "new thread group size: " << pool.threads.size() << std::endl;///btw: new threa group size is 8. I expected 4!
    // second batch... never completes
    for( int i = 20; i < 30; ++i ) {
        pool.enqueue(boost::bind(&f,i));
    }
}
The second batch doesn't complete. I will appreciate if you help me fix this.
thank you
UPDATE- Solution:
based on a solution by Nik, I developed a solution using condition variable. Just add the following code to the original class:
// add new work item to the pool
// Counts the task under the lock, then posts a wrapper() call so the
// pool can decrement and notify when the task really finishes.
template<class F>
void ThreadPool::enqueue(F f) {
    {
        boost::unique_lock<boost::mutex> lock(mutex_);
        nTasks ++;
    }
    //forwarding the job to wrapper()
    void (ThreadPool::*ff)(boost::tuple<F>) = &ThreadPool::wrapper<F>;
    io_service.post(boost::bind(ff, this, boost::make_tuple(f))); //using a tuple seems to be the only practical way. it is mentioned in boost examples.
}
//run+notify
// Executes the queued task, then decrements the outstanding count and
// signals wait().
template<class F>
void ThreadPool::wrapper(boost::tuple<F> f) {
    boost::get<0>(f)();//this is the task (function and its argument) that has to be executed by a thread
    {
        boost::unique_lock<boost::mutex> lock(mutex_);
        nTasks --;
        cond.notify_one();
    }
}
// Block until every task enqueued so far has run to completion.
void ThreadPool::wait(){
    boost::unique_lock<boost::mutex> lock(mutex_);
    while(nTasks){
        cond.wait(lock);   // predicate loop: immune to spurious wakeups
    }
}
Now you may call wait() method between batches of work.
one problem however:
Even after the last batch, I have to call pool.wait() because the thread pool's scope will end after that and thread pool's destructor will be invoked. During destruction, some of the jobs are done and it will be the time to call the .notify(). As the Threadpool::mutex during destruction is invalidated, exceptions occur during locking. your suggestion will be appreciated.
A condition variable could be used to achieve desired result.
Implement a function responsible for calling enqueue the tasks and wait on a condition variable.
Condition variable is notified when all tasks assigned to the pool are complete.
Every thread checks if the jobs are complete or not. Once all the jobs are complete condition variable is notified.
//An example of what you could try, this just an hint for what could be explored.
// Enqueue all jobs, then sleep until some thread signals completion.
// NOTE(review): a bare wait() with no predicate can (a) miss a
// notification sent before we reach the wait, and (b) wake spuriously
// before the jobs are done — track a pending-job counter and wait with a
// predicate on it instead (as the asker's "Solution" code above does).
void jobScheduler()
{
    int jobs = numberOfJobs; //this could vary and can be made shared memory
    // queue a bunch of "work items"
    for( int i = 0; i < jobs; ++i )
    {
        std::cout << "task " << i << " created" << std::endl;
        pool.enqueue(boost::bind(&f,i));
    }
    //wait on a condition variable
    boost::mutex::scoped_lock lock(the_mutex);
    conditionVariable.wait(lock); //Have this variable notified from any thread which realizes that all jobs are complete.
}
Solution 2
I have a new working solution, with some assumption about syntax of functions being called back, but that could be changed as per requirement.
Continuing on the lines of above I use condition variable for managing my tasks but with a difference.
Create a queue of jobs.
A Manager which waits for new JOBS in the queue.
Once a job is received a notification is sent to waiting manager about the same.
Worker maintains a handle to Manager. When all the tasks assigned are complete Manger is informed.
Manager on getting a call for end, stops waiting for new JOBS in queue and exits.
#include <iostream>
#include <queue>
#include <boost/thread/thread.hpp>
#include <boost/asio.hpp>
#include <boost/tuple/tuple.hpp>
#include <boost/tuple/tuple_io.hpp>
#include <boost/function.hpp>
///JOB Queue hold all jobs required to be executed
// Thread-safe FIFO: push() notifies one waiter; waitAndPop() blocks
// until an element is available.
template<typename Job>
class JobQueue
{
private:
    std::queue<Job> _queue;
    mutable boost::mutex _mutex;  // mutable: locked inside const empty()
    boost::condition_variable _conditionVariable;
public:
    void push(Job const& job)
    {
        boost::mutex::scoped_lock lock(_mutex);
        _queue.push(job);
        lock.unlock();  // release before notify so the waiter can lock at once
        _conditionVariable.notify_one();
    }
    bool empty() const
    {
        boost::mutex::scoped_lock lock(_mutex);
        return _queue.empty();
    }
    // Non-blocking pop; returns false if the queue was empty.
    bool tryPop(Job& poppedValue)
    {
        boost::mutex::scoped_lock lock(_mutex);
        if(_queue.empty())
        {
            return false;
        }
        poppedValue = _queue.front();
        _queue.pop();
        return true;
    }
    // Blocking pop: sleeps on the condition variable until push() signals.
    void waitAndPop(Job& poppedValue)
    {
        boost::mutex::scoped_lock lock(_mutex);
        while(_queue.empty())
        {
            _conditionVariable.wait(lock);  // loop guards against spurious wakeups
        }
        poppedValue = _queue.front();
        _queue.pop();
    }
};
///Thread pool for posting jobs to io service
class ThreadPool
{
public :
    ThreadPool( int noOfThreads = 1) ;
    ~ThreadPool() ;
    // Post any nullary callable onto the pool's io_service.
    template< class func >
    void post( func f ) ;
    boost::asio::io_service &getIoService() ;
private :
    boost::asio::io_service _ioService;
    boost::asio::io_service::work _work ; // keeps run() alive while the queue is empty
    boost::thread_group _threads;
};
// Start noOfThreads workers, each running the io_service event loop.
inline ThreadPool::ThreadPool( int noOfThreads )
    : _work( _ioService )
{
    for(int i = 0; i < noOfThreads ; ++i) // 4
        _threads.create_thread(boost::bind(&boost::asio::io_service::run, &_ioService));
}
// Stop the event loop (abandoning any queued handlers) and join workers.
inline ThreadPool::~ThreadPool()
{
    _ioService.stop() ;
    _threads.join_all() ;
}
inline boost::asio::io_service &ThreadPool::getIoService()
{
    return _ioService ;
}
template< class func >
void ThreadPool::post( func f )
{
    _ioService.post( f ) ;
}
// Forward declaration: Worker (below) holds a Manager<T>* handle.
template<typename T>
class Manager;
///Worker doing some work.
// Executes one unit of work per job() call and tells the Manager to shut
// down once its task budget reaches zero.
template<typename T>
class Worker{
    T _data;
    int _taskList;        // remaining task budget
    boost::mutex _mutex;  // guards _data/_taskList across pool threads
    Manager<T>* _hndl;    // non-owning handle back to the managing object
public:
    Worker(T data, int task, Manager<T>* hndle):
        _data(data),
        _taskList(task),
        _hndl(hndle)
    {
    }
    // Perform one task; returns true when this was the final task.
    bool job()
    {
        boost::mutex::scoped_lock lock(_mutex);
        std::cout<<"...Men at work..."<<++_data<<std::endl;
        --_taskList;
        bool done = taskDone();
        if(done)
            _hndl->end();
        // FIX: the original fell off the end of this bool function —
        // undefined behaviour. Report whether the work is complete.
        return done;
    }
    // True once the task budget is exhausted.
    bool taskDone()
    {
        std::cout<<"Tasks "<<_taskList<<std::endl<<std::endl;
        if(_taskList == 0)
        {
            std::cout<<"Tasks done "<<std::endl;
            return true;
        }
        // FIX: the original wrote `else false;` — a no-op expression
        // statement, again leaving the function without a return (UB).
        return false;
    }
};
///Job handler waits for new jobs and
///execute them as when a new job is received using Thread Pool.
//Once all jobs are done hndler exits.
// NOTE(review): _pool is raw-new'd and never deleted, and jobRunner is a
// local boost::thread whose handle is dropped at the end of the ctor —
// what happens when a joinable boost::thread is destroyed (detach vs.
// terminate) depends on the Boost version; confirm before relying on it.
template<typename T>
class Manager{
public:
    typedef boost::function< bool (Worker<T>*)> Func;
    Manager(int threadCount):
        _threadCount(threadCount),
        _isWorkCompleted(false)
    {
        _pool = new ThreadPool(_threadCount);
        // background dispatcher that drains _jobQueue into the pool
        boost::thread jobRunner(&Manager::execute, this);
    }
    // Queue one (worker, member-function) pair for dispatch.
    void add(Func f, Worker<T>* instance)
    {
        Job job(instance, f);
        _jobQueue.push(job);
    }
    // Called by a Worker when its budget hits zero: flags completion and
    // pushes a null sentinel so execute() wakes up and exits.
    void end()
    {
        boost::mutex::scoped_lock lock(_mutex);
        _isWorkCompleted = true;
        //send a dummy job
        add( NULL, NULL);
    }
    void workComplete()
    {
        std::cout<<"Job well done."<<std::endl;
    }
    bool isWorkDone()
    {
        boost::mutex::scoped_lock lock(_mutex);
        if(_isWorkCompleted)
            return true;
        return false;
    }
    // Dispatcher loop: blocks on the queue, posts real jobs to the pool,
    // stops on the null sentinel or once work is flagged complete.
    void execute()
    {
        Job job;
        while(!isWorkDone())
        {
            _jobQueue.waitAndPop(job);
            Func f = boost::get<1>(job);
            Worker<T>* ptr = boost::get<0>(job);
            if(f)
            {
                _pool->post(boost::bind(f, ptr));
            }
            else
                break;  // null sentinel pushed by end()
        }
        std::cout<<"Complete"<<std::endl;
    }
private:
    ThreadPool *_pool;      // owned but never freed (see class note)
    int _threadCount;
    typedef boost::tuple<Worker<T>*, Func > Job;
    JobQueue<Job> _jobQueue;
    bool _isWorkCompleted;  // guarded by _mutex
    boost::mutex _mutex;
};
typedef boost::function< bool (Worker<int>*)> IntFunc;
typedef boost::function< bool (Worker<char>*)> CharFunc;
// Demo: two managers (int and char flavours), four jobs each.
int main()
{
    boost::asio::io_service ioService;  // NOTE(review): unused — each pool owns its own io_service
    Manager<int> jobHndl(2);
    Worker<int> wrk1(0,4, &jobHndl);
    IntFunc f= &Worker<int>::job;
    jobHndl.add(f, &wrk1);
    jobHndl.add(f, &wrk1);
    jobHndl.add(f, &wrk1);
    jobHndl.add(f, &wrk1);
    Manager<char> jobHndl2(2);
    Worker<char> wrk2(0,'a', &jobHndl2);
    CharFunc f2= &Worker<char>::job;
    jobHndl2.add(f2, &wrk2);
    jobHndl2.add(f2, &wrk2);
    jobHndl2.add(f2, &wrk2);
    jobHndl2.add(f2, &wrk2);
    ioService.run();
    // NOTE(review): this busy-wait keeps the process alive (and a core
    // hot) forever; joining the managers or waiting on a semaphore would
    // let it terminate cleanly.
    while(1){}
    return 0;
}
The third solution is the best (easiest IMHO), the one from the asio father;
You have to understand that you will stay blocked on "Threads.join_all()" statement while there is still a thread alive. Then you can call again with other work to do.
Maybe an alternative is to use taskqueue, "A task queue that uses a thread pool to complete tasks in parallel": you fill up the queue with your work items, and it ensures that no more than 'x' tasks are working in parallel.
Sample is easy to understand.
Maybe you need to add this member function to the TaskQueue class in order to solve your "pool.wait()" issue:
// Spin until the task queue drains and all pool threads have gone idle,
// blocking inside wait_for_any between checks.
// NOTE(review): relies on members of the TaskQueue class referenced in
// the text (NumPendingTasks, threads_, futures_) that are not visible
// here — confirm their semantics against that class.
void WaitForEmpty(){
    while( NumPendingTasks() || threads_.size() ){
        boost::wait_for_any(futures_.begin(), futures_.end());
    }
}
Enjoy !