Boost thread_group: maximal number of parallel threads - C++

I want to apply boost::thread_group in my program with a maximal number of threads. For example:
int maxNumberOfThreads;
boost::thread_group group;
for (int i = 0; i < N; ++i)
// create a new thread only if group.size() is smaller than the maximal number of threads
group.create_thread(Worker);
group.join_all();
Does someone have an idea how I can realize this? It would be very inefficient to start N threads at once.
Thank you for your help.

What you seem to want is a thread pool.
You can use boost::thread::hardware_concurrency() to determine the number of (logical) cores available on your particular system.
Here's one I rolled for an answer last week:
#include <boost/thread.hpp>
#include <boost/phoenix.hpp>
#include <boost/optional.hpp>
#include <boost/atomic.hpp>
#include <boost/bind.hpp>
#include <deque>
#include <iostream>
using namespace boost;
using namespace boost::phoenix::arg_names;
boost::atomic_size_t counter(0ul);
class thread_pool
{
private:
mutex mx;
condition_variable cv;
typedef function<void()> job_t;
std::deque<job_t> _queue;
thread_group pool;
boost::atomic_bool shutdown;
static void worker_thread(thread_pool& q)
{
while (optional<job_t> job = q.dequeue())
(*job)();
}
public:
thread_pool() : shutdown(false) {
for (unsigned i = 0; i < boost::thread::hardware_concurrency(); ++i)
pool.create_thread(bind(worker_thread, ref(*this)));
}
void enqueue(job_t job)
{
lock_guard<mutex> lk(mx);
_queue.push_back(job);
cv.notify_one();
}
optional<job_t> dequeue()
{
unique_lock<mutex> lk(mx);
namespace phx = boost::phoenix;
cv.wait(lk, phx::ref(shutdown) || !phx::empty(phx::ref(_queue)));
if (_queue.empty())
return none;
job_t job = _queue.front();
_queue.pop_front();
return job;
}
~thread_pool()
{
shutdown = true;
{
lock_guard<mutex> lk(mx);
cv.notify_all();
}
pool.join_all();
}
};
A typical way to use it, also from that answer:
static const size_t bignumber = 1 << 20;
class myClass
{
thread_pool pool; // uses 1 thread per core
public:
void launch_jobs()
{
std::cout << "enqueuing jobs... " << std::flush;
for(size_t i=0; i<bignumber; ++i)
{
for(int j=0; j<2; ++j) {
pool.enqueue(bind(&myClass::myFunction, this, j, i));
}
}
std::cout << "done\n";
}
private:
void myFunction(int i, int j)
{
boost::this_thread::sleep_for(boost::chrono::milliseconds(1));
counter += 1;
}
};
int main()
{
myClass instance;
instance.launch_jobs();
size_t last = 0;
while (counter < (2*bignumber))
{
boost::this_thread::sleep_for(boost::chrono::milliseconds(100));
if ((counter >> 4u) > last)
{
std::cout << "Progress: " << counter << "/" << (bignumber*2) << "\n";
last = counter >> 4u;
}
}
}
As a bonus, in the comments to another answer at that question, I also posted an equivalent solution based on a lock-free job queue implementation:
boost thread throwing exception "thread_resource_error: resource temporarily unavailable"
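The gist of that lock-free variant, as a hedged sketch under my own assumptions: boost::lockfree::queue requires trivially copyable elements, so the function objects travel by pointer, and the capacity of 128 is an arbitrary choice.
#include <boost/function.hpp>
#include <boost/lockfree/queue.hpp>
typedef boost::function<void()> job_t;
boost::lockfree::queue<job_t*> job_queue(128); // bounded; push returns false when full
void enqueue(job_t job)
{
    job_queue.push(new job_t(job)); // real code should handle a full queue
}
// Workers call this in a loop, yielding or spinning when it returns false.
bool try_run_one()
{
    job_t* job = 0;
    if (!job_queue.pop(job))
        return false;
    (*job)();
    delete job;
    return true;
}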

This is my (imperfect) implementation:
/**
* \author Christophe Dumeunier
* \brief Extension of boost::thread_group managing a maximum number of threads running in parallel
*/
class thread_group_max : public boost::thread_group
{
public:
/**
* \brief Instantiate a group of threads
* \param max_running_threads Maximum number of threads running in parallel, if 0 use the number of cores
* \param max_sleeping_time Maximum sleeping time (seconds) between two checks for finished threads (must be > sleeping_time_start)
* \param sleeping_time_grow Coefficient increasing sleeping time while waiting for finished threads (must be > 1)
* \param sleeping_time_start Initial sleeping time (must be > 0)
*/
explicit thread_group_max(std::size_t max_running_threads = 0, float max_sleeping_time = 1.0f,
float sleeping_time_grow = 1.1f, float sleeping_time_start = 0.001f);
/**
* \brief Destroy the group
* \note Doesn't join the unterminated threads
*/
~thread_group_max();
/** \brief Wait for an available slot and then create a new thread and launch it */
template<typename F>
boost::thread* create_thread(F f);
private:
std::size_t maxRunningThreads; //!< Maximum number of running threads
float maxSleepingTime; //!< Maximum sleeping time between two checks for finished threads
float sleepingTimeStart; //!< Initial sleeping time
float sleepingTimeGrow; //!< Coefficient increasing sleeping time while waiting for finished threads
std::set<boost::thread*> runningThreads; //!< Pointers to running or finished-but-not-removed-yet threads
};
thread_group_max::thread_group_max(std::size_t max_running_threads, float max_sleeping_time, float sleeping_time_grow, float sleeping_time_start) :
boost::thread_group(),
maxRunningThreads(max_running_threads == 0 ? std::max(boost::thread::hardware_concurrency(), 1u) : max_running_threads),
maxSleepingTime(max_sleeping_time),
sleepingTimeStart(sleeping_time_start),
sleepingTimeGrow(sleeping_time_grow),
runningThreads()
{
assert(this->maxRunningThreads > 0);
assert(this->maxSleepingTime >= this->sleepingTimeStart);
assert(this->sleepingTimeStart > 0.0f);
assert(this->sleepingTimeGrow > 1.0f);
}
thread_group_max::~thread_group_max()
{}
template<typename F>
boost::thread* thread_group_max::create_thread(F f)
{
// First, try to clean already finished threads
if(this->runningThreads.size() >= this->maxRunningThreads)
{
for(std::set<boost::thread*>::iterator it = this->runningThreads.begin(); it != this->runningThreads.end();)
{
const std::set<boost::thread*>::iterator jt = it++;
if((*jt)->timed_join(boost::posix_time::milliseconds(0))) /// #todo timed_join is deprecated
this->runningThreads.erase(jt);
}
}
// If no finished thread found, wait for it
if(this->runningThreads.size() >= this->maxRunningThreads)
{
float sleeping_time = this->sleepingTimeStart;
do
{
boost::this_thread::sleep(boost::posix_time::milliseconds((long int)(1000.0f * sleeping_time)));
for(std::set<boost::thread*>::iterator it = this->runningThreads.begin(); it != this->runningThreads.end();)
{
const std::set<boost::thread*>::iterator jt = it++;
if((*jt)->timed_join(boost::posix_time::milliseconds(0))) /// #todo timed_join is deprecated
this->runningThreads.erase(jt);
}
if(sleeping_time < this->maxSleepingTime)
{
sleeping_time *= this->sleepingTimeGrow;
if(sleeping_time > this->maxSleepingTime)
sleeping_time = this->maxSleepingTime;
}
} while(this->runningThreads.size() >= this->maxRunningThreads);
}
// Now, at least 1 slot is available, use it
return *this->runningThreads.insert(this->boost::thread_group::create_thread(f)).first;
}
Example of use:
thread_group_max group(num_threads);
for(std::size_t i = 0; i < jobs.size(); ++i)
group.create_thread(boost::bind(&my_run_job_function, boost::ref(jobs[i])));
group.join_all();

Related

Thread pool with job queue gets stuck

I want to split jobs among multiple std::thread workers and continue once they are all done.
To do so, I implemented a thread pool class mainly based on this SO answer.
I noticed, however, that my benchmarks can get stuck, running forever, without any errors thrown.
I wrote a minimal reproducing code, enclosed at the end.
Based on terminal output, the issue seems to occur when the jobs are being queued.
I checked videos (1, 2), documentation (3) and blog posts (4).
I tried replacing the type of the locks, using atomics.
I could not find the underlying cause.
Here is the snippet to replicate the issue.
The program repeatedly counts the odd elements in the test vector.
#include <atomic>
#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>
class Pool {
public:
const int worker_count;
bool to_terminate = false;
std::atomic<int> unfinished_tasks = 0;
std::mutex mutex;
std::condition_variable condition;
std::vector<std::thread> threads;
std::queue<std::function<void()>> jobs;
void thread_loop()
{
while (true) {
std::function<void()> job;
{
std::unique_lock<std::mutex> lock(mutex);
condition.wait(lock, [&] { return (!jobs.empty()) || to_terminate; });
if (to_terminate)
return;
job = jobs.front();
jobs.pop();
}
job();
unfinished_tasks -= 1;
}
}
public:
Pool(int size) : worker_count(size)
{
if (size < 1)
throw std::invalid_argument("Worker count needs to be a positive integer");
for (int i = 0; i < worker_count; ++i)
threads.push_back(std::thread(&Pool::thread_loop, this));
};
~Pool()
{
{
std::unique_lock lock(mutex);
to_terminate = true;
}
condition.notify_all();
for (auto &thread : threads)
thread.join();
threads.clear();
};
void queue_job(const std::function<void()> &job)
{
{
std::unique_lock<std::mutex> lock(mutex);
jobs.push(job);
unfinished_tasks += 1;
// std::cout << unfinished_tasks;
}
condition.notify_one();
}
void wait()
{
while (unfinished_tasks) {
; // spinlock
};
}
};
int main()
{
constexpr int worker_count = 8;
constexpr int vector_size = 1 << 10;
Pool pool = Pool(worker_count);
std::vector<int> test_vector;
test_vector.reserve(vector_size);
for (int i = 0; i < vector_size; ++i)
test_vector.push_back(i);
std::vector<int> worker_odd_counts(worker_count, 0);
std::function<void(int)> worker_task = [&](int thread_id) {
int chunk_size = vector_size / (worker_count) + 1;
int my_start = thread_id * chunk_size;
int my_end = std::min(my_start + chunk_size, vector_size);
int local_odd_count = 0;
for (int ii = my_start; ii < my_end; ++ii)
if (test_vector[ii] % 2 != 0)
++local_odd_count;
worker_odd_counts[thread_id] = local_odd_count;
};
for (int iteration = 0;; ++iteration) {
std::cout << "Jobs.." << std::flush;
for (int i = 0; i < worker_count; ++i)
pool.queue_job([&worker_task, i] { worker_task(i); });
std::cout << "..queued. " << std::flush;
pool.wait();
int odd_count = 0;
for (auto elem : worker_odd_counts)
odd_count += elem;
std::cout << "Iter:" << iteration << ". Odd:" << odd_count << '\n';
}
}
Here is the terminal output of one specific run:
[...]
Jobs....queued. Iter:2994. Odd:512
Jobs....queued. Iter:2995. Odd:512
Jobs..
Edit:
The error occurs using GCC 12.2.0 x86_64-w64-mingw32 on Windows 10 with an AMD Ryzen 4750U CPU; I do not get past 15k iterations.
Using Visual Studio Community 2022, I got past 1.5M iterations (and stopped it myself). Thanks @IgorTandetnik for pointing out the latter.
MinGW doesn't natively support multithreading on Windows. It supports threads in its C++ standard library over the POSIX API, with the winpthreads compatibility layer implementing that API on top of the Windows OS threads.
I think your error is not in the C++ code, but in the computer setup. Do the following.
Use the compiler from the x86_64-12.2.0-release-posix-seh-ucrt-rt_v10-rev2.7z archive, there.
Don't forget that a binary built that way depends on a bunch of DLL files provided by the compiler: libgcc_s_seh-1.dll, libwinpthread-1.dll and libstdc++-6.dll. You must use exactly the same versions of these DLLs which were shipped with MinGW. If you have some other versions of these DLLs anywhere in your %PATH%, expect all kinds of failures.
A couple of general notes.
Linux-first C++ compilers like GCC have issues on Windows. The path of least resistance is using Visual C++ instead. If you want your software to build on other platforms as well, consider CMake to abstract away the compiler.
Windows already includes a thread pool implementation, since Vista. The API is easy to use, you only need 4 functions: CreateThreadpoolWork, SubmitThreadpoolWork, WaitForThreadpoolWorkCallbacks, and CloseThreadpoolWork. Example.
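For illustration, a minimal hedged sketch of those four calls; the callback and its integer payload are invented for the example:
#include <windows.h>
#include <stdio.h>
// Runs on a pool thread; the context pointer carries the payload.
VOID CALLBACK work_callback(PTP_CALLBACK_INSTANCE, PVOID context, PTP_WORK)
{
    int* value = static_cast<int*>(context);
    *value += 1;
}
int main()
{
    int data = 0;
    PTP_WORK work = CreateThreadpoolWork(work_callback, &data, NULL);
    if (!work) return 1;
    SubmitThreadpoolWork(work);                  // queue one execution of the callback
    WaitForThreadpoolWorkCallbacks(work, FALSE); // block until it has run
    CloseThreadpoolWork(work);
    printf("%d\n", data); // prints 1
    return 0;
}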
The first thing you should do is split the queue from the thread pool. They are both tricky enough; writing them commingled in one class is asking for trouble.
This also allows you to unit test the queue without the pool (a sketch of such a test follows the queue code below).
#include <condition_variable>
#include <deque>
#include <mutex>
#include <optional>
template<class Payload>
class MutexQueue {
public:
std::optional<Payload> wait_and_pop();
void push(Payload);
void terminate_queue();
bool queue_is_terminated() const;
private:
mutable std::mutex m;
std::condition_variable cv;
std::deque<Payload> q;
bool terminated = false;
std::unique_lock<std::mutex> lock() const {
return std::unique_lock<std::mutex>(m);
}
};
This is a bit easier to write than the thread pool.
void push(Payload p) {
{
auto l = lock();
if (terminated) return;
q.push_back(std::move(p));
}
cv.notify_one();
}
void terminate_queue() {
{
auto l = lock(); // YOU CANNOT SKIP THIS LOCK, even if terminated is atomic
terminated = true;
q.clear();
}
cv.notify_all();
}
bool queue_is_terminated() const {
auto l = lock(); // if you make terminated atomic, you CAN skip this lock
return terminated;
}
std::optional<Payload> wait_and_pop() {
auto l = lock();
cv.wait(l, [&]{ return terminated || !q.empty(); });
if (terminated) return std::nullopt;
auto r = std::move(q.front());
q.pop_front();
return r;
}
There we go.
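As promised, the queue can now be unit tested without the pool. A minimal sketch of such a test, assuming the methods above are folded into the MutexQueue class:
#include <cassert>
#include <thread>
int main()
{
    MutexQueue<int> q;
    std::thread producer([&q] {
        for (int i = 0; i < 3; ++i)
            q.push(i);
        q.terminate_queue();
    });
    int count = 0;
    while (auto v = q.wait_and_pop()) // nullopt signals termination
        ++count;
    producer.join();
    assert(count <= 3); // terminate_queue() clears pending items, so some may be dropped
}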
Now our thread pool is simpler.
#include <functional>
#include <future>
#include <thread>
#include <vector>
struct ThreadPool {
explicit ThreadPool(std::size_t n) {
create_threads(n);
}
std::future<void> push_task(std::function<void()> f) {
std::packaged_task<void()> p(std::move(f)); // packaged_task's converting constructor is explicit
auto r = p.get_future();
q.push( std::move(p) );
return r;
}
void terminate_pool() {
q.terminate_queue();
terminate_threads();
}
~ThreadPool() {
terminate_pool();
}
private:
MutexQueue<std::packaged_task<void()>> q;
std::vector<std::thread> threads;
void terminate_threads() {
for(auto& thread:threads)
thread.join();
threads.clear();
}
static void thread_task( MutexQueue<std::packaged_task<void()>>* pq ) {
if (!pq) return;
while (auto task = pq->wait_and_pop()) {
(*task)();
}
}
void create_threads(std::size_t n) {
for (std::size_t i = 0; i < n; ++i) {
threads.push_back( std::thread( thread_task, &q ) );
}
}
};
I cannot spot an error in your code. But with the above, you can test a split of the queue from the pool.
The queue will work with pthreads or other primitives.
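Here is a hedged usage sketch of the split design, assuming the MutexQueue and ThreadPool above. Waiting on the futures that push_task returns replaces the spinning wait() from the question:
#include <atomic>
#include <iostream>
int main()
{
    ThreadPool pool(4);
    std::atomic<int> sum{0};
    std::vector<std::future<void>> results;
    for (int i = 0; i < 100; ++i)
        results.push_back(pool.push_task([&sum] { sum += 1; }));
    for (auto& r : results)
        r.wait(); // blocks until every queued job has run; no spinlock needed
    std::cout << sum << '\n'; // 100
}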

Clearing a grid with multithreading

I am trying to clear a (game) grid, but whenever I multithread it, the time it takes to clear the grid increases by 3 seconds.
To my own logic this should not be the case, since each Y value of the array holds a lot of X values (the X values store a class), each of which should iterate through its so-called objects property and perform objects.clear() on it, which in turn iterates through every element.
My code:
const int NUM_OF_AVAIL_THREADS = std::thread::hardware_concurrency() * 2;
ThreadPool* pool = new ThreadPool(NUM_OF_AVAIL_THREADS);
vector<future<void>> threads;
void Terrain::clear_grid()
{
for (int y = 0; y < tiles.size(); y++)
{
threads.push_back(pool->enqueue([&, y]() // y captured by value: capturing it by reference races with the loop
{
array<TerrainTile, terrain_width>& h = tiles.at(y);
for (int x = 0; x < h.size(); x++)
{
h.at(x).objects.clear();
}
}));
}
pool->wait_and_clear_threads(threads);
}
TerrainTile looks like this:
class TerrainTile
{
public:
//TerrainTile *up, *down, *left, *right;
vector<TerrainTile*> exits;
bool visited = false;
size_t position_x;
size_t position_y;
TileType tile_type;
vector<TerrainTile*> neighbors;
vector<MovingAsset*> objects;
vector<Tank*> tanks;
vector<MovingAsset*> beams;
vector<MovingAsset*> get_collidable_assets();
void add_collidable_assets(MovingAsset* asset);
void add_neighbor(TerrainTile* neighbor);
};
How the tiles array looks like:
static constexpr size_t terrain_width = 80;
static constexpr size_t terrain_height = 45;
std::array<std::array<TerrainTile, terrain_width>, terrain_height> tiles;
Am I missing out on something crucial here, or does the cost of creating a thread simply outweigh the time it takes to iterate through the arrays?
EDIT: THIS IS THE THREADPOOL
#pragma once
namespace Tmpl8
{
class ThreadPool; //Forward declare
class Worker;
class Worker
{
public:
//Instantiate the worker class by passing and storing the threadpool as a reference
Worker(ThreadPool& s) : pool(s) {}
inline void operator()();
private:
ThreadPool& pool;
};
class ThreadPool
{
public:
ThreadPool(size_t numThreads) : stop(false)
{
for (size_t i = 0; i < numThreads; ++i)
workers.push_back(std::thread(Worker(*this)));
}
~ThreadPool()
{
{
// set the flag under the lock: otherwise a worker can check the predicate
// and miss the notification (a lost wakeup)
std::unique_lock<std::mutex> lock(queue_mutex);
stop = true; // stop all threads
}
condition.notify_all();
for (auto& thread : workers)
thread.join();
}
void wait_and_clear_threads(vector<future<void>>& threads)
{
for (future<void>& t : threads)
{
t.wait();
}
threads.clear();
}
template <class T>
auto enqueue(T task) -> std::future<decltype(task())>
{
//Wrap the function in a packaged_task so we can return a future object
auto wrapper = std::make_shared<std::packaged_task<decltype(task())()>>(std::move(task));
//Scope to restrict critical section
{
//lock our queue and add the given task to it
std::unique_lock<std::mutex> lock(queue_mutex);
tasks.push_back([=]
{
(*wrapper)();
});
}
//Wake up a thread to start this task
condition.notify_one();
return wrapper->get_future();
}
private:
friend class Worker; //Gives access to the private variables of this class
std::vector<std::thread> workers;
std::deque<std::function<void()>> tasks;
std::condition_variable condition; //Wakes up a thread when work is available
std::mutex queue_mutex; //Lock for our queue
bool stop = false;
};
inline void Worker::operator()()
{
std::function<void()> task;
while (true)
{
//Scope to restrict critical section
//This is important because we don't want to hold the lock while executing the task,
//because that would make it so only one task can be run simultaneously (aka sequential)
{
std::unique_lock<std::mutex> locker(pool.queue_mutex);
//Wait until some work is ready or we are stopping the threadpool
//Because of spurious wakeups we need to check if there is actually a task available or we are stopping
pool.condition.wait(locker, [=] { return pool.stop || !pool.tasks.empty(); });
if (pool.stop) break;
task = pool.tasks.front();
pool.tasks.pop_front();
}
task();
}
}
} // namespace Tmpl8

std::condition_variable calling notify_all more than once

First, let me introduce you to my problem.
My code looks like this:
#include <iostream>
#include <thread>
#include <condition_variable>
std::mutex mtx;
std::mutex cvMtx;
std::mutex mtx2;
bool ready{false};
std::condition_variable cv;
int threadsFinishedCurrentLevel{0};
void tfunc() {
for(int i = 0; i < 5; i++) {
//do something
for (int j = 0; j < 10000; j++) {
std::cout << j << std::endl;
}
//this is i-th level
mtx2.lock();
threadsFinishedCurrentLevel++;
if (threadsFinishedCurrentLevel == 2) {
//this is last thread in current level
threadsFinishedCurrentLevel = 0;
cvMtx.unlock();
}
mtx2.unlock();
{
//wait for notify
std::unique_lock<std::mutex> lck(mtx);
while (!ready) cv.wait(lck);
}
}
}
int main() {
cvMtx.lock(); //init
std::thread t1(tfunc);
std::thread t2(tfunc);
for (int i = 0; i < 5; i++) {
cvMtx.lock();
{
std::unique_lock<std::mutex> lck(mtx);
ready = true;
cv.notify_all();
}
}
t1.join();
t2.join();
return 0;
}
I have 2 threads. My computation consists of levels (for this example, let's say we have 5 levels). On the same level, the computation can be divided among threads; each thread then calculates part of the problem. When I want to step to the next (higher) level, the lower level must first be finished. So my idea is something like this: when the last thread on the current level is done, it unlocks the main thread, so that it can notify all of the threads to continue to the next level. But this notify has to be called more than once, because there are plenty of these levels. Can this condition_variable be restarted or something? Or do I need one condition_variable per level? So for example, when I have 1000 levels, do I need to dynamically allocate 1000 condition_variables?
Is it just me, or are you trying to block the main thread with a mutex (as your way of notifying it when all threads are done)? That's not the task of a mutex; that's where the condition variable should be used.
// New condition_variable, to notify main thread when a child is done with a level
std::condition_variable cv2;
// When a child is done, it will update this counter
int counter = 0; // This is already protected by cvMtx, otherwise it could be atomic.
// This is to sync cout
std::mutex cout_mutex;
void tfunc()
{
for (int i = 0; i < 5; i++)
{
{
std::lock_guard<std::mutex> l(cout_mutex);
std::cout << "Level " << i + 1 << " " << std::this_thread::get_id() << std::endl;
}
{
std::lock_guard<std::mutex> l(cvMtx);
counter++; // update counter &
}
cv2.notify_all(); // notify main thread we are done.
{
//wait for notify
std::unique_lock<std::mutex> lck(mtx);
cv.wait(lck);
// Note that I've removed the "ready" flag here
// That's because u would need multiple ready flags to make that work
}
}
}
int main()
{
std::thread t1(tfunc);
std::thread t2(tfunc);
for (int i = 0; i < 5; i++)
{
{
std::unique_lock<std::mutex> lck(cvMtx);
// Wait takes a predicate which u can take advantage of
cv2.wait(lck, [] { return (counter == 2); });
counter = 0;
// This thread will get notified multiple times
// But it only will wake up when counter matches 2
// Which equals to how many threads we've created.
}
// Sleeping a bit to know the code is working
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
// Wake up all threads and continue to next level.
std::unique_lock<std::mutex> lck(mtx);
cv.notify_all();
}
t1.join();
t2.join();
return 0;
}
The synchronization can be done with a single counter, threads increment the counter under lock and check for the counter to reach a multiple of the number of concurrent threads. This greatly simplifies the logic. I've made this change and also grouped the shared variables into a class, and provided member functions to access them. To avoid false sharing I've ensured that variables that are read-only are separate from those that are read-write by the threads, and also separated read-write variables by usage. The use of global variables is discouraged, see C++ Core Guidelines for this and other good advice.
The simplified code follows, you can see it live in ideone. Note: it looks like there isn't true concurrency in ideone, you'll have to run this on a multi-core environment to actually test hardware concurrency.
//http://stackoverflow.com/questions/35318942/stdcondition-variable-calling-notify-all-more-than-once
#include <iostream>
#include <functional>
#include <thread>
#include <mutex>
#include <vector>
#include <condition_variable>
static constexpr size_t CACHE_LINE_SIZE = 64;
static constexpr size_t NTHREADS = 2;
static constexpr size_t NLEVELS = 5;
static constexpr size_t NITERATIONS = 100;
class Synchronize
{
alignas(CACHE_LINE_SIZE) // read/write while threads are busy working
std::mutex mtx_std_cout;
alignas(CACHE_LINE_SIZE) // read/write while threads are synchronizing at level
std::mutex cvMtx;
std::condition_variable cv;
size_t threadsFinished{0};
alignas(CACHE_LINE_SIZE) // read-only parameters
const size_t n_threads;
const size_t n_levels;
public: // class Synchronize owns unique resources:
// - must be explicitly constructed
// - disallow default ctor,
// - disallow copy/move ctor and
// - disallow copy/move assignment
Synchronize( Synchronize const& ) = delete;
Synchronize & operator=( Synchronize const& ) = delete;
explicit Synchronize( size_t nthreads, size_t nlevels )
: n_threads{nthreads}, n_levels{nlevels}
{}
size_t nlevels() const { return n_levels; }
std::mutex & std_cout_mutex() { return mtx_std_cout; }
void level_done_wait_all( size_t level )
{
std::unique_lock<std::mutex> lk(cvMtx);
threadsFinished++;
cv.wait(lk, [&]{return threadsFinished >= n_threads * (level+1);});
cv.notify_all();
}
};
void tfunc( Synchronize & sync )
{
for(size_t i = 0; i < sync.nlevels(); i++)
{
//do something
for (size_t j = 0; j < NITERATIONS; j++) {
std::unique_lock<std::mutex> lck(sync.std_cout_mutex());
if (j == 0) std::cout << '\n';
std::cout << ' ' << i << ',' << j;
}
sync.level_done_wait_all(i);
}
}
int main() {
Synchronize sync{ NTHREADS, NLEVELS };
std::vector<std::thread*> threads(NTHREADS,nullptr);
for(auto&t:threads) t = new std::thread(tfunc,std::ref(sync));
for(auto t:threads) {
t->join();
delete t;
}
std::cout << std::endl;
return 0;
}

C++0x has no semaphores? How to synchronize threads?

Is it true that C++0x will come without semaphores? There are already some questions on Stack Overflow regarding the use of semaphores. I use them (posix semaphores) all the time to let a thread wait for some event in another thread:
void thread0(...)
{
doSomething0();
event1.wait();
...
}
void thread1(...)
{
doSomething1();
event1.post();
...
}
If I were to do that with a mutex:
void thread0(...)
{
doSomething0();
event1.lock(); event1.unlock();
...
}
void thread1(...)
{
event1.lock();
doSomething1();
event1.unlock();
...
}
Problem: It's ugly, and it's not guaranteed that thread1 locks the mutex first (given that the same thread should lock and unlock a mutex, you also can't lock event1 before thread0 and thread1 have started).
So since boost doesn't have semaphores either, what is the simplest way to achieve the above?
You can easily build one from a mutex and a condition variable:
#include <mutex>
#include <condition_variable>
class semaphore {
std::mutex mutex_;
std::condition_variable condition_;
unsigned long count_ = 0; // Initialized as locked.
public:
void release() {
std::lock_guard<decltype(mutex_)> lock(mutex_);
++count_;
condition_.notify_one();
}
void acquire() {
std::unique_lock<decltype(mutex_)> lock(mutex_);
while(!count_) // Handle spurious wake-ups.
condition_.wait(lock);
--count_;
}
bool try_acquire() {
std::lock_guard<decltype(mutex_)> lock(mutex_);
if(count_) {
--count_;
return true;
}
return false;
}
};
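With that class, the event pattern from the question becomes the following sketch (doSomething0/doSomething1 stand in for the question's work):
semaphore event1; // count_ starts at 0, i.e. "not signalled"
void thread0()
{
    // doSomething0();
    event1.acquire(); // blocks until thread1 calls release()
}
void thread1()
{
    // doSomething1();
    event1.release(); // wakes thread0
}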
Based on Maxim Yegorushkin's answer, I tried to make the example in C++11 style.
#include <mutex>
#include <condition_variable>
class Semaphore {
public:
Semaphore (int count_ = 0)
: count(count_) {}
inline void notify()
{
std::unique_lock<std::mutex> lock(mtx);
count++;
cv.notify_one();
}
inline void wait()
{
std::unique_lock<std::mutex> lock(mtx);
while(count == 0){
cv.wait(lock);
}
count--;
}
private:
std::mutex mtx;
std::condition_variable cv;
int count;
};
I decided to write the most robust/generic C++11 semaphore I could, in the style of the standard as much as I could (note using semaphore = ..., you normally would just use the name semaphore similar to normally using string not basic_string):
template <typename Mutex, typename CondVar>
class basic_semaphore {
public:
using native_handle_type = typename CondVar::native_handle_type;
explicit basic_semaphore(size_t count = 0);
basic_semaphore(const basic_semaphore&) = delete;
basic_semaphore(basic_semaphore&&) = delete;
basic_semaphore& operator=(const basic_semaphore&) = delete;
basic_semaphore& operator=(basic_semaphore&&) = delete;
void notify();
void wait();
bool try_wait();
template<class Rep, class Period>
bool wait_for(const std::chrono::duration<Rep, Period>& d);
template<class Clock, class Duration>
bool wait_until(const std::chrono::time_point<Clock, Duration>& t);
native_handle_type native_handle();
private:
Mutex mMutex;
CondVar mCv;
size_t mCount;
};
using semaphore = basic_semaphore<std::mutex, std::condition_variable>;
template <typename Mutex, typename CondVar>
basic_semaphore<Mutex, CondVar>::basic_semaphore(size_t count)
: mCount{count}
{}
template <typename Mutex, typename CondVar>
void basic_semaphore<Mutex, CondVar>::notify() {
std::lock_guard<Mutex> lock{mMutex};
++mCount;
mCv.notify_one();
}
template <typename Mutex, typename CondVar>
void basic_semaphore<Mutex, CondVar>::wait() {
std::unique_lock<Mutex> lock{mMutex};
mCv.wait(lock, [&]{ return mCount > 0; });
--mCount;
}
template <typename Mutex, typename CondVar>
bool basic_semaphore<Mutex, CondVar>::try_wait() {
std::lock_guard<Mutex> lock{mMutex};
if (mCount > 0) {
--mCount;
return true;
}
return false;
}
template <typename Mutex, typename CondVar>
template<class Rep, class Period>
bool basic_semaphore<Mutex, CondVar>::wait_for(const std::chrono::duration<Rep, Period>& d) {
std::unique_lock<Mutex> lock{mMutex};
auto finished = mCv.wait_for(lock, d, [&]{ return mCount > 0; });
if (finished)
--mCount;
return finished;
}
template <typename Mutex, typename CondVar>
template<class Clock, class Duration>
bool basic_semaphore<Mutex, CondVar>::wait_until(const std::chrono::time_point<Clock, Duration>& t) {
std::unique_lock<Mutex> lock{mMutex};
auto finished = mCv.wait_until(lock, t, [&]{ return mCount > 0; });
if (finished)
--mCount;
return finished;
}
template <typename Mutex, typename CondVar>
typename basic_semaphore<Mutex, CondVar>::native_handle_type basic_semaphore<Mutex, CondVar>::native_handle() {
return mCv.native_handle();
}
In accordance with POSIX semaphores, I would add
class semaphore
{
...
bool trywait()
{
boost::mutex::scoped_lock lock(mutex_);
if(count_)
{
--count_;
return true;
}
else
{
return false;
}
}
};
And I much prefer using a synchronisation mechanism at a convenient level of abstraction, rather than always copy-pasting a stitched-together version using more basic operations.
C++20 finally has semaphores - std::counting_semaphore<max_count>.
These have (at least) the following methods:
acquire() (blocking)
try_acquire() (non-blocking, returns immediately)
try_acquire_for() (non-blocking, takes a duration)
try_acquire_until() (non-blocking, takes a time at which to stop trying)
release()
You can read these CppCon 2019 presentation slides, or watch the video. There's also the official proposal P0514R4, but it may not be up-to-date with actual C++20.
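A minimal sketch of the question's event pattern with the C++20 type (std::binary_semaphore is an alias for std::counting_semaphore<1>):
#include <semaphore>
#include <thread>
std::counting_semaphore<1> event1(0); // zero permits: acquire() blocks until release()
void thread0() { event1.acquire(); /* runs only after thread1 posts */ }
void thread1() { event1.release(); }
int main()
{
    std::thread t0(thread0), t1(thread1);
    t0.join();
    t1.join();
}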
You can also check out cpp11-on-multicore - it has a portable and optimal semaphore implementation.
The repository also contains other threading goodies that complement C++11 threading.
You can work with mutex and condition variables. You gain exclusive access with the mutex, check whether you want to continue or need to wait for the other end. If you need to wait, you wait in a condition. When the other thread determines that you can continue, it signals the condition.
There is a short example in the boost::thread library that you can most probably just copy (the C++0x and boost thread libs are very similar).
An RAII semaphore wrapper can also be useful in threaded code:
class ScopedSemaphore
{
public:
explicit ScopedSemaphore(Semaphore& sem) : m_Semaphore(sem) { m_Semaphore.Wait(); }
ScopedSemaphore(const ScopedSemaphore&) = delete;
~ScopedSemaphore() { m_Semaphore.Notify(); }
ScopedSemaphore& operator=(const ScopedSemaphore&) = delete;
private:
Semaphore& m_Semaphore;
};
Usage example in multithread app:
boost::ptr_vector<std::thread> threads;
Semaphore semaphore;
for (...)
{
...
auto t = new std::thread([..., &semaphore]
{
ScopedSemaphore scopedSemaphore(semaphore);
...
}
);
threads.push_back(t);
}
for (auto& t : threads)
t.join();
I found that shared_ptr and weak_ptr, along with a list, did the job I needed. My issue was, I had several clients wanting to interact with a host's internal data. Typically, the host updates the data on its own; however, if a client requests it, the host needs to stop updating until no clients are accessing the host data. At the same time, a client could ask for exclusive access, so that no other clients, nor the host, could modify that host data.
How I did this was, I created a struct:
struct UpdateLock
{
typedef std::shared_ptr< UpdateLock > ptr;
};
Each client would have a member of such:
UpdateLock::ptr m_myLock;
Then the host would have a weak_ptr member for exclusivity, and a list of weak_ptrs for non-exclusive locks:
std::weak_ptr< UpdateLock > m_exclusiveLock;
std::list< std::weak_ptr< UpdateLock > > m_locks;
There is a function to enable locking, and another function to check if the host is locked:
UpdateLock::ptr LockUpdate( bool exclusive );
bool IsUpdateLocked( bool exclusive ) const;
I test for locks in LockUpdate, IsUpdateLocked, and periodically in the host's Update routine. Testing for a lock is as simple as checking whether the weak_ptrs have expired, and removing any expired ones from the m_locks list (I only do this during the host update); then I can check if the list is empty. At the same time, I get automatic unlocking when a client resets the shared_ptr it is hanging onto, which also happens when a client is destroyed automatically.
The overall effect is: since clients rarely need exclusivity (typically reserved for additions and deletions only), most of the time a request to LockUpdate(false), that is to say non-exclusive, succeeds so long as (!m_exclusiveLock). And a LockUpdate(true), a request for exclusivity, succeeds only when both (!m_exclusiveLock) and (m_locks.empty()).
A queue could be added to mitigate between exclusive and non-exclusive locks, however, I have had no collisions thus far, so I intend to wait until that happens to add the solution (mostly so I have a real-world test condition).
So far this is working well for my needs; I can imagine the need to expand this, and some issues that might arise over expanded use, however, this was quick to implement, and required very little custom code.
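Based on that description, here is a hedged sketch of what LockUpdate and IsUpdateLocked might look like; the Host struct and the exact bodies are my reconstruction, not the original code:
#include <list>
#include <memory>
struct UpdateLock
{
    typedef std::shared_ptr<UpdateLock> ptr;
};
struct Host
{
    std::weak_ptr<UpdateLock> m_exclusiveLock;
    std::list<std::weak_ptr<UpdateLock>> m_locks;
    bool IsUpdateLocked(bool exclusive) const
    {
        if (!m_exclusiveLock.expired())
            return true; // an exclusive lock blocks everyone
        if (!exclusive)
            return false; // shared access coexists with other shared locks
        for (const auto& weak : m_locks)
            if (!weak.expired())
                return true; // live shared locks block an exclusive request
        return false;
    }
    UpdateLock::ptr LockUpdate(bool exclusive)
    {
        if (IsUpdateLocked(exclusive))
            return UpdateLock::ptr(); // a conflicting lock is held; the caller may retry
        UpdateLock::ptr lock = std::make_shared<UpdateLock>();
        if (exclusive)
            m_exclusiveLock = lock;
        else
            m_locks.push_back(lock);
        return lock; // expires automatically when the client drops its shared_ptr
    }
};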
This is an old question, but I would like to offer another solution.
It seems you need not a semaphore but an event, like Windows Events.
Very effective events can be implemented as follows:
#ifdef _MSC_VER
#include <concrt.h>
#else
// pthread implementation
#include <cstddef>
#include <cstdint>
#include <shared_mutex>
namespace Concurrency
{
const unsigned int COOPERATIVE_TIMEOUT_INFINITE = (unsigned int)-1;
const size_t COOPERATIVE_WAIT_TIMEOUT = SIZE_MAX;
class event
{
public:
event();
~event();
size_t wait(unsigned int timeout = COOPERATIVE_TIMEOUT_INFINITE);
void set();
void reset();
static size_t wait_for_multiple(event** _PPEvents, size_t _Count, bool _FWaitAll, unsigned int _Timeout = COOPERATIVE_TIMEOUT_INFINITE);
static const unsigned int timeout_infinite = COOPERATIVE_TIMEOUT_INFINITE;
private:
int d;
std::shared_mutex guard;
};
};
namespace concurrency = Concurrency;
#include <unistd.h>
#include <errno.h>
#include <sys/eventfd.h>
#include <sys/epoll.h>
#include <chrono>
#include "../HandleHolder.h"
typedef CommonHolder<int, close> fd_holder;
namespace Concurrency
{
int watch(int ep_fd, int fd)
{
epoll_event ep_event;
ep_event.events = EPOLLIN;
ep_event.data.fd = fd;
return epoll_ctl(ep_fd, EPOLL_CTL_ADD, fd, &ep_event);
}
event::event()
: d(eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK))
{
}
event::~event()
{
std::unique_lock<std::shared_mutex> lock(guard);
close(d);
d = -1;
}
size_t event::wait(unsigned int timeout)
{
fd_holder ep_fd(epoll_create1(EPOLL_CLOEXEC));
{
std::shared_lock<std::shared_mutex> lock(guard);
if (d == -1 || watch(ep_fd.GetHandle(), d) < 0)
return COOPERATIVE_WAIT_TIMEOUT;
}
epoll_event ep_event;
return epoll_wait(ep_fd.GetHandle(), &ep_event, 1, timeout) == 1 && (ep_event.events & EPOLLIN) ? 0 : COOPERATIVE_WAIT_TIMEOUT;
}
void event::set()
{
uint64_t count = 1;
write(d, &count, sizeof(count));
}
void event::reset()
{
uint64_t count;
read(d, &count, sizeof(count));
}
size_t event::wait_for_multiple(event** _PPEvents, size_t _Count, bool _FWaitAll, unsigned int _Timeout)
{
if (_FWaitAll) // not implemented
std::abort();
const auto deadline = _Timeout != COOPERATIVE_TIMEOUT_INFINITE ? std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now().time_since_epoch()).count() + _Timeout : COOPERATIVE_TIMEOUT_INFINITE;
fd_holder ep_fd(epoll_create1(EPOLL_CLOEXEC));
int fds[_Count];
for (int i = 0; i < _Count; ++i)
{
std::shared_lock<std::shared_mutex> lock(_PPEvents[i]->guard);
fds[i] = _PPEvents[i]->d;
if (fds[i] != -1 && watch(ep_fd.GetHandle(), fds[i]) < 0)
fds[i] = -1;
}
epoll_event ep_events[_Count];
// The epoll_wait call can be interrupted by a signal. We wait out the whole timeout, the same as on Windows.
int res = 0;
while (true)
{
res = epoll_wait(ep_fd.GetHandle(), &ep_events[0], _Count, _Timeout);
if (res == -1 && errno == EINTR && std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now().time_since_epoch()).count() < deadline)
continue;
break;
}
for (int i = 0; i < _Count; ++i)
{
if (fds[i] == -1)
continue;
for (int j = 0; j < res; ++j)
if (ep_events[j].data.fd == fds[i] && (ep_events[j].events & EPOLLIN))
return i;
}
return COOPERATIVE_WAIT_TIMEOUT;
}
};
#endif
And then just use concurrency::event
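For example, a usage sketch mirroring the question's two threads (wait/set follow the Windows event semantics described above):
concurrency::event event1; // picks the concrt or the epoll implementation
void thread0()
{
    // doSomething0();
    event1.wait(); // blocks until thread1 calls set()
}
void thread1()
{
    // doSomething1();
    event1.set(); // releases thread0
}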
Different from other answers, I propose a new version which:
Unblocks all waiting threads before being deleted. In this case, deleting the semaphore will wake up all waiting threads, and only after everybody has woken up will the semaphore destructor exit.
Has a parameter on the wait() call to automatically unlock the calling thread after the timeout in milliseconds has passed.
Has an option on the constructor to limit the available resources count to the count the semaphore was initialized with. This way, calling notify() too many times will not increase the number of resources the semaphore has.
#include <stdio.h>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <iostream>
std::recursive_mutex g_sync_mutex;
#define sync(x) do { \
std::unique_lock<std::recursive_mutex> lock(g_sync_mutex); \
x; \
} while (false);
class Semaphore {
int _count;
bool _limit;
int _all_resources;
int _wakedup;
std::mutex _mutex;
std::condition_variable_any _condition_variable;
public:
/**
* count - how many resources this semaphore holds
* limit - limit notify() calls only up to the count value (available resources)
*/
Semaphore (int count, bool limit)
: _count(count),
_limit(limit),
_all_resources(count),
_wakedup(count)
{
}
/**
* Unlock all waiting threads before destructing the semaphore (to avoid their segfault later)
*/
virtual ~Semaphore () {
std::unique_lock<std::mutex> lock(_mutex);
_wakeup(lock);
}
void _wakeup(std::unique_lock<std::mutex>& lock) {
int lastwakeup = 0;
while( _wakedup < _all_resources ) {
lock.unlock();
notify();
lock.lock();
// avoids 100% CPU usage if someone is not waking up properly
if (lastwakeup == _wakedup) {
std::this_thread::sleep_for( std::chrono::milliseconds(10) );
}
lastwakeup = _wakedup;
}
}
// Mutex and condition variables are not movable and there is no need for smart pointers yet
Semaphore(const Semaphore&) = delete;
Semaphore& operator =(const Semaphore&) = delete;
Semaphore(const Semaphore&&) = delete;
Semaphore& operator =(const Semaphore&&) = delete;
/**
* Release one acquired resource.
*/
void notify()
{
std::unique_lock<std::mutex> lock(_mutex);
// sync(std::cerr << getTime() << "Calling notify(" << _count << ", " << _limit << ", " << _all_resources << ")" << std::endl);
_count++;
if (_limit && _count > _all_resources) {
_count = _all_resources;
}
_condition_variable.notify_one();
}
/**
* This function never blocks!
* Returns false if it would block when acquiring a resource. Otherwise acquires a resource and returns true.
*/
bool try_acquire() {
std::unique_lock<std::mutex> lock(_mutex);
// sync(std::cerr << getTime() << "Calling try_acquire(" << _count << ", " << _limit << ", " << _all_resources << ")" << std::endl);
if(_count <= 0) {
return false;
}
_count--;
return true;
}
/**
* Return true if the timeout expired, otherwise return false.
* timeout - how many milliseconds to wait before automatically unlocking the wait() call.
*/
bool wait(int timeout = 0) {
std::unique_lock<std::mutex> lock(_mutex);
// sync(std::cerr << getTime() << "Calling wait(" << _count << ", " << _limit << ", " << _all_resources << ")" << std::endl);
_count--;
_wakedup--;
try {
std::chrono::time_point<std::chrono::system_clock> timenow = std::chrono::system_clock::now();
while(_count < 0) {
if (timeout < 1) {
_condition_variable.wait(lock);
}
else {
std::cv_status status = _condition_variable.wait_until(lock, timenow + std::chrono::milliseconds(timeout));
if ( std::cv_status::timeout == status) {
_count++;
_wakedup++;
return true;
}
}
}
}
catch (...) {
_count++;
_wakedup++;
throw;
}
_wakedup++;
return false;
}
/**
* Return true if calling wait() will block the calling thread
*/
bool locked() {
std::unique_lock<std::mutex> lock(_mutex);
return _count <= 0;
}
/**
* Return true if the semaphore has all its resources available (as many as when it was created)
*/
bool freed() {
std::unique_lock<std::mutex> lock(_mutex);
return _count >= _all_resources;
}
/**
* Return how many resources are available:
* - 0 means no free resources, and calling wait() will block the calling thread
* - a negative value means there are several threads being blocked
* - a positive value means there are no threads waiting
*/
int count() {
std::unique_lock<std::mutex> lock(_mutex);
return _count;
}
/**
* Wake everybody who is waiting and reset the semaphore to its initial value.
*/
void reset() {
std::unique_lock<std::mutex> lock(_mutex);
if(_count < 0) {
_wakeup(lock);
}
_count = _all_resources;
}
};
Utility to print the current timestamp:
std::string getTime() {
char buffer[20];
#if defined( WIN32 )
SYSTEMTIME wlocaltime;
GetLocalTime(&wlocaltime);
::snprintf(buffer, sizeof buffer, "%02d:%02d:%02d.%03d ", wlocaltime.wHour, wlocaltime.wMinute, wlocaltime.wSecond, wlocaltime.wMilliseconds);
#else
std::chrono::time_point< std::chrono::system_clock > now = std::chrono::system_clock::now();
auto duration = now.time_since_epoch();
auto hours = std::chrono::duration_cast< std::chrono::hours >( duration );
duration -= hours;
auto minutes = std::chrono::duration_cast< std::chrono::minutes >( duration );
duration -= minutes;
auto seconds = std::chrono::duration_cast< std::chrono::seconds >( duration );
duration -= seconds;
auto milliseconds = std::chrono::duration_cast< std::chrono::milliseconds >( duration );
duration -= milliseconds;
time_t theTime = time( NULL );
struct tm* aTime = localtime( &theTime );
::snprintf(buffer, sizeof buffer, "%02d:%02d:%02d.%03ld ", aTime->tm_hour, aTime->tm_min, aTime->tm_sec, milliseconds.count());
#endif
return buffer;
}
Example program using this semaphore:
// g++ -o test -Wall -Wextra -ggdb -g3 -pthread test.cpp && gdb --args ./test
// valgrind --leak-check=full --show-leak-kinds=all --track-origins=yes --verbose ./test
// procdump -accepteula -ma -e -f "" -x c:\ myexe.exe
int main(int argc, char* argv[]) {
std::cerr << getTime() << "Creating Semaphore" << std::endl;
Semaphore* semaphore = new Semaphore(1, false);
semaphore->wait(1000);
semaphore->wait(1000);
std::cerr << getTime() << "Auto Unlocking Semaphore wait" << std::endl;
std::this_thread::sleep_for( std::chrono::milliseconds(5000) );
delete semaphore;
std::cerr << getTime() << "Exiting after 10 seconds..." << std::endl;
return 0;
}
Example output:
11:03:01.012 Creating Semaphore
11:03:02.012 Auto Unlocking Semaphore wait
11:03:07.012 Exiting after 10 seconds...
Extra function which uses a EventLoop to unlock the semaphores after some time:
std::shared_ptr<std::atomic<bool>> autowait(Semaphore* semaphore, int timeout, EventLoop<std::function<void()>>& eventloop, const char* source) {
std::shared_ptr<std::atomic<bool>> waiting(std::make_shared<std::atomic<bool>>(true));
sync(std::cerr << getTime() << "autowait '" << source << "'..." << std::endl);
if (semaphore->try_acquire()) {
eventloop.enqueue( timeout, [waiting, source, semaphore]{
if ( (*waiting).load() ) {
sync(std::cerr << getTime() << "Timeout '" << source << "'..." << std::endl);
semaphore->notify();
}
} );
}
else {
semaphore->wait(timeout);
}
return waiting;
}
Semaphore semaphore(1, false);
EventLoop<std::function<void()>>* eventloop = new EventLoop<std::function<void()>>(true);
std::shared_ptr<std::atomic<bool>> waiting_something = autowait(&semaphore, 45000, eventloop, "waiting_something");
In case someone is interested in the atomic version, here is the implementation. The performance is expected to be better than the mutex & condition variable version.
class semaphore_atomic
{
public:
void notify() {
count_.fetch_add(1, std::memory_order_release);
}
// Note: this wait() busy-spins until a resource becomes available.
void wait() {
while (true) {
int count = count_.load(std::memory_order_relaxed);
if (count > 0) {
if (count_.compare_exchange_weak(count, count-1, std::memory_order_acq_rel, std::memory_order_relaxed)) {
break;
}
}
}
}
bool try_wait() {
int count = count_.load(std::memory_order_relaxed);
if (count > 0) {
if (count_.compare_exchange_strong(count, count-1, std::memory_order_acq_rel, std::memory_order_relaxed)) {
return true;
}
}
return false;
}
private:
std::atomic_int count_{0};
};

How to make boost::thread_group execute a fixed number of parallel threads

This is the code to create a thread_group and execute all threads in parallel:
boost::thread_group group;
for (int i = 0; i < 15; ++i)
group.create_thread(aFunctionToExecute);
group.join_all();
This code will execute all threads at once. What I want to do is to execute them all, but 4 maximum in parallel. When one is terminated, another one is executed, until there are no more to execute.
Another, more efficient solution would be to have each thread call back to the primary thread when it finishes, and the handler on the primary thread could launch a new thread each time. This prevents the repetitive calls to timed_join, as the primary thread won't do anything until the callback is triggered.
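A hedged sketch of that callback idea using a condition variable; the names and structure are my own, not tested production code:
#include <boost/bind.hpp>
#include <boost/function.hpp>
#include <boost/thread.hpp>
#include <vector>
boost::mutex mx;
boost::condition_variable cv;
int active = 0;
// Each worker runs its job, then "calls back" by signalling the primary thread.
void run_and_signal(boost::function<void()> job)
{
    job();
    boost::lock_guard<boost::mutex> lk(mx);
    --active;
    cv.notify_one();
}
void run_throttled(const std::vector<boost::function<void()> >& jobs, int max_parallel)
{
    boost::thread_group group;
    boost::unique_lock<boost::mutex> lk(mx);
    for (std::size_t i = 0; i < jobs.size(); ++i)
    {
        while (active >= max_parallel)
            cv.wait(lk); // the primary thread sleeps until a worker signals
        ++active;
        group.create_thread(boost::bind(&run_and_signal, jobs[i]));
    }
    lk.unlock();
    group.join_all();
}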
I have something like this:
boost::mutex mutex_;
boost::condition_variable condition_;
const size_t throttle_;
size_t size_;
bool wait_;
template <typename Env, class F>
void eval_(const Env &env, const F &f) {
{
boost::unique_lock<boost::mutex> lock(mutex_);
size_ = std::min(size_+1, throttle_);
while (throttle_ <= size_) condition_.wait(lock);
}
f.eval(env);
{
boost::lock_guard<boost::mutex> lock(mutex_);
--size_;
}
condition_.notify_one();
}
I think you are looking for a thread_pool implementation, which is available here.
Additionally, I have noticed that if you create a vector of std::future and store the futures of many std::async tasks in it, and you do not have any blocking code in the function passed to the thread, VS2013 (at least from what I can confirm) will launch exactly the appropriate number of threads your machine can handle. It reuses the threads once created.
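A hedged sketch of that observation; with the default launch policy, the implementation decides how the tasks map onto threads:
#include <future>
#include <iostream>
#include <vector>
int work(int i) { return i * i; }
int main()
{
    std::vector<std::future<int>> futures;
    for (int i = 0; i < 15; ++i)
        futures.push_back(std::async(work, i)); // default policy: async or deferred
    int sum = 0;
    for (auto& f : futures)
        sum += f.get(); // runs deferred tasks here or waits for async ones
    std::cout << sum << '\n';
}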
I created my own simplified interface of boost::thread_group to do this job:
class ThreadGroup : public boost::noncopyable
{
private:
boost::thread_group group;
std::size_t maxSize;
float sleepStart;
float sleepCoef;
float sleepMax;
std::set<boost::thread*> running;
public:
ThreadGroup(std::size_t max_size = 0,
float max_sleeping_time = 1.0f,
float sleeping_time_coef = 1.5f,
float sleeping_time_start = 0.001f) :
boost::noncopyable(),
group(),
maxSize(max_size),
sleepStart(sleeping_time_start),
sleepCoef(sleeping_time_coef),
sleepMax(max_sleeping_time),
running()
{
if(max_size == 0)
this->maxSize = (std::size_t)std::max(boost::thread::hardware_concurrency(), 1u);
assert(max_sleeping_time >= sleeping_time_start);
assert(sleeping_time_start > 0.0f);
assert(sleeping_time_coef > 1.0f);
}
~ThreadGroup()
{
this->joinAll();
}
template<typename F> boost::thread* createThread(F f)
{
float sleeping_time = this->sleepStart;
while(this->running.size() >= this->maxSize)
{
for(std::set<boost::thread*>::iterator it = running.begin(); it != running.end();)
{
const std::set<boost::thread*>::iterator jt = it++;
if((*jt)->timed_join(boost::posix_time::milliseconds((long int)(1000.0f * sleeping_time))))
running.erase(jt);
}
if(sleeping_time < this->sleepMax)
{
sleeping_time *= this->sleepCoef;
if(sleeping_time > this->sleepMax)
sleeping_time = this->sleepMax;
}
}
return *this->running.insert(this->group.create_thread(f)).first;
}
void joinAll()
{
this->group.join_all();
}
void interruptAll()
{
#ifdef BOOST_THREAD_PROVIDES_INTERRUPTIONS
this->group.interrupt_all();
#endif
}
std::size_t size() const
{
return this->group.size();
}
};
Here is an example of use, very similar to boost::thread_group, with the main difference that the creation of a thread is a waiting point:
{
ThreadGroup group(4);
for(int i = 0; i < 15; ++i)
group.createThread(aFunctionToExecute);
} // join all at destruction