Why cannot my c++ thread pool accelerate my program? - c++

I tried to implement a c++ thread pool according to some notes made by others, the code is like this:
#include <vector>
#include <queue>
#include <functional>
#include <future>
#include <atomic>
#include <condition_variable>
#include <thread>
#include <mutex>
#include <memory>
#include <glog/logging.h>
#include <iostream>
#include <chrono>
using std::cout;
using std::endl;
class ThreadPool {
public:
ThreadPool(const ThreadPool&) = delete;
ThreadPool(ThreadPool&&) = delete;
ThreadPool& operator=(const ThreadPool&) = delete;
ThreadPool& operator=(ThreadPool&&) = delete;
ThreadPool(uint32_t capacity=std::thread::hardware_concurrency(),
uint32_t n_threads=std::thread::hardware_concurrency()
): capacity(capacity), n_threads(n_threads) {
init(capacity, n_threads);
}
~ThreadPool() noexcept {
shutdown();
}
void init(uint32_t capacity, uint32_t n_threads) {
CHECK_GT(capacity, 0) << "task queue capacity should be greater than 0";
CHECK_GT(n_threads, 0) << "thread pool capacity should be greater than 0";
for (int i{0}; i < n_threads; ++i) {
pool.emplace_back(std::thread([this] {
std::function<void(void)> task;
while (!this->stop) {
{
std::unique_lock<std::mutex> lock(this->q_mutex);
task_q_empty.wait(lock, [&] {return this->stop | !task_q.empty();});
if (this->stop) break;
task = this->task_q.front();
this->task_q.pop();
task_q_full.notify_one();
}
// auto id = std::this_thread::get_id();
// std::cout << "thread id is: " << id << std::endl;
task();
}
}));
}
}
void shutdown() {
stop = true;
task_q_empty.notify_all();
task_q_full.notify_all();
for (auto& thread : pool) {
if (thread.joinable()) {
thread.join();
}
}
}
template<typename F, typename...Args>
auto submit(F&& f, Args&&... args) -> std::future<decltype(f(args...))> {
using res_type = decltype(f(args...));
std::function<res_type(void)> func = std::bind(std::forward<F>(f), std::forward<Args>(args)...);
auto task_ptr = std::make_shared<std::packaged_task<res_type()>>(func);
{
std::unique_lock<std::mutex> lock(q_mutex);
task_q_full.wait(lock, [&] {return this->stop | task_q.size() <= capacity;});
CHECK (this->stop == false) << "should not add task to stopped queue\n";
task_q.emplace([task_ptr]{(*task_ptr)();});
}
task_q_empty.notify_one();
return task_ptr->get_future();
}
private:
std::vector<std::thread> pool;
std::queue<std::function<void(void)>> task_q;
std::condition_variable task_q_full;
std::condition_variable task_q_empty;
std::atomic<bool> stop{false};
std::mutex q_mutex;
uint32_t capacity;
uint32_t n_threads;
};
int add(int a, int b) {return a + b;}
int main() {
auto t1 = std::chrono::steady_clock::now();
int n_threads = 1;
ThreadPool tp;
tp.init(n_threads, 1024);
std::vector<std::future<int>> res;
for (int i{0}; i < 1000000; ++i) {
res.push_back(tp.submit(add, i, i+1));
}
auto t2 = std::chrono::steady_clock::now();
for (auto &el : res) {
el.get();
// cout << el.get() << endl;
}
tp.shutdown();
cout << "processing: "
<< std::chrono::duration<double, std::milli>(t2 - t1).count()
<< endl;
return 0;
}
The problem is that, when I set n_threads=1, the program takes the same length of time as I set n_threads=4. Since my gpu has 72 kernels (from the htop command), I believe the 4 thread would be faster than the 1 thread settings. What is the problem with this implementation of the thread pool please?

I found few issues:
1) Use ORing instead of the bitwise operation in the both conditional-variable waits:
Replace this - `task_q_empty.wait(lock, [&] {return this->stop | !task_q.empty();});`
By - `task_q_empty.wait(lock, [&] {return this->stop || !task_q.empty();});`
2) Use notify_all() in place of notify_one() in init() and submit().
3) Two condition_variables is unnecessary here, use only task_q_empty.
4) Your use case is not ideal. Switching of the threads may outweigh adding of two integers, it may appear more the threads longer the execution time. Test in optimized mode. Try scenario like this to simulate longer process:
int add(int a, int b) { this_thread::sleep_for(chrono::milliseconds(200)); return a + b; }

Related

Determining function time using a wrapper

I'm looking for a generic way of measuring a functions timing like Here, but for c++.
My main goal is to not have cluttered code like this piece everywhere:
auto t1 = std::chrono::high_resolution_clock::now();
function(arg1, arg2);
auto t2 = std::chrono::high_resolution_clock::now();
auto tDur = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
But rather have a nice wrapper around the function.
What I got so far is:
timing.hpp:
#pragma once
#include <chrono>
#include <functional>
template <typename Tret, typename Tin1, typename Tin2> unsigned int getDuration(std::function<Tret(Tin1, Tin2)> function, Tin1 arg1, Tin2 arg2, Tret& retValue)
{
auto t1 = std::chrono::high_resolution_clock::now();
retValue = function(arg1, arg2);
auto t2 = std::chrono::high_resolution_clock::now();
auto tDur = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
return tDur.count();
}
main.cpp:
#include "timing.hpp"
#include "matrix.hpp"
constexpr int G_MATRIXSIZE = 2000;
int main(int argc, char** argv)
{
CMatrix<double> myMatrix(G_MATRIXSIZE);
bool ret;
// this call is quite ugly
std::function<bool(int, std::vector<double>)> fillRow = std::bind(&CMatrix<double>::fillRow, &myMatrix, 0, fillVec);
auto duration = getDuration(fillRow, 5, fillVec, ret );
std::cout << "duration(ms): " << duration << std::endl;
}
in case sb wants to test the code, matrix.hpp:
#pragma once
#include <iostream>
#include <string>
#include <sstream>
#include <vector>
template<typename T> class CMatrix {
public:
// ctor
CMatrix(int size) :
m_size(size)
{
m_matrixData = new std::vector<std::vector<T>>;
createUnityMatrix();
}
// dtor
~CMatrix()
{
std::cout << "Destructor of CMatrix called" << std::endl;
delete m_matrixData;
}
// print to std::out
void printMatrix()
{
std::ostringstream oss;
for (int i = 0; i < m_size; i++)
{
for (int j = 0; j < m_size; j++)
{
oss << m_matrixData->at(i).at(j) << ";";
}
oss << "\n";
}
std::cout << oss.str() << std::endl;
}
bool fillRow(int index, std::vector<T> row)
{
// checks
if (!indexValid(index))
{
return false;
}
if (row.size() != m_size)
{
return false;
}
// data replacement
for (int j = 0; j < m_size; j++)
{
m_matrixData->at(index).at(j) = row.at(j);
}
return true;
}
bool fillColumn(int index, std::vector<T> column)
{
// checks
if (!indexValid(index))
{
return false;
}
if (column.size() != m_size)
{
return false;
}
// data replacement
for (int j = 0; j < m_size; j++)
{
m_matrixData->at(index).at(j) = column.at(j);
}
return true;
}
private:
// variables
std::vector<std::vector<T>>* m_matrixData;
int m_size;
bool indexValid(int index)
{
if (index + 1 > m_size)
{
return false;
}
return true;
}
// functions
void createUnityMatrix()
{
for (int i = 0; i < m_size; i++)
{
std::vector<T> _vector;
for (int j = 0; j < m_size; j++)
{
if (i == j)
{
_vector.push_back(1);
}
else
{
_vector.push_back(0);
}
}
m_matrixData->push_back(_vector);
}
}
};
The thing is, this code is still quite ugly due to the std::function usage. Is there a better and/or simpler option ?
(+ also I'm sure I messed sth up with the std::bind, I think I need to use std::placeholders since I want to set the arguments later on.)
// edit, correct use of placeholder in main:
std::function<bool(int, std::vector<double>)> fillRow = std::bind(&CMatrix<double>::fillRow, &myMatrix, std::placeholders::_1, std::placeholders::_2);
auto duration = getDuration(fillRow, 18, fillVec, ret );
You can utilize RAII to implement a timer that records the execution time of a code block and a template function that wraps the function you would like to execute with the timer.
#include<string>
#include<chrono>
#include <unistd.h>
struct Timer
{
std::string fn, title;
std::chrono::time_point<std::chrono::steady_clock> start;
Timer(std::string fn, std::string title)
: fn(std::move(fn)), title(std::move(title)), start(std::chrono::steady_clock::now())
{
}
~Timer()
{
const auto elapsed =
std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
printf("%s: function=%s; elasepd=%f ms\n", title.c_str(), fn.c_str(), elapsed / 1000.0);
}
};
#ifndef ENABLE_BENCHMARK
static constexpr inline void dummy_fn() { }
#define START_BENCHMARK_TIMER(...) dummy_fn()
#else
#define START_BENCHMARK_TIMER(title) bench::Timer timer(__FUNCTION__, title)
#endif
template<typename F, typename ...Args>
auto time_fn(F&& fn, Args&&... args) {
START_BENCHMARK_TIMER("wrapped fn");
return fn(std::forward<Args>(args)...);
}
int foo(int i) {
usleep(70000);
return i;
}
int main()
{
printf("%d\n", time_fn(foo, 3));
}
stdout:
wrapped fn: function=time_fn; elasepd=71.785000 ms
3
General Idea:
time_fn is a simple template function that calls START_BENCHMARK_TIMER and calls fn with the provided arguments
START_BENCHMARK_TIMER then creates a Timer object. It will record the current time in start. Do note that __FUNCTION__ will be replaced with the function that was called.
When the
provided fn returns or throws an exception, the Timer object from (1) will be destroyed and the destructor will be called. The destructor will then calculate the time difference between the current time and the recorded start time and prints it to stdout
Note:
Even though declaring start and end in time_fn instead of the RAII timer will work, having an RAII timer will allow you to cleanly handle the situation when fn throws an exception
If you are on c++11, you will need to change time_fn declaration to typename std::result_of<F &&(Args &&...)>::type time_fn(F&& fn, Args&&... args).
Edit: Updated the response to include a wrapper function approach.

Concurrent program compiled with clang runs fine, but hangs with gcc

I wrote a class to share a limited number of resources (for instance network interfaces) between a larger number of threads. The resources are pooled and, if not in use, they are borrowed out to the requesting thread, which otherwise waits on a condition_variable.
Nothing really exotic: apart for the fancy scoped_lock which requires c++17, it should be good old c++11.
Both gcc10.2 and clang11 compile the test main fine, but while the latter produces an executable which does pretty much what expected, the former hangs without consuming CPU (deadlock?).
With the help of https://godbolt.org/ I tried older versions of gcc and also icc (passing options -O3 -std=c++17 -pthread), all reproducing the bad result, while even there clang confirms the proper behavior.
I wonder if I made a mistake or if the code triggers some compiler misbehavior and in case how to work around that.
#include <iostream>
#include <vector>
#include <stdexcept>
#include <mutex>
#include <condition_variable>
template <typename T>
class Pool {
///////////////////////////
class Borrowed {
friend class Pool<T>;
Pool<T>& pool;
const size_t id;
T * val;
public:
Borrowed(Pool & p, size_t i, T& v): pool(p), id(i), val(&v) {}
~Borrowed() { release(); }
T& get() const {
if (!val) throw std::runtime_error("Borrowed::get() this resource was collected back by the pool");
return *val;
}
void release() { pool.collect(*this); }
};
///////////////////////////
struct Resource {
T val;
bool available = true;
Resource(T v): val(std::move(v)) {}
};
///////////////////////////
std::vector<Resource> vres;
size_t hint = 0;
std::condition_variable cv;
std::mutex mtx;
size_t available_cnt;
public:
Pool(std::initializer_list<T> l): available_cnt(l.size()) {
vres.reserve(l.size());
for (T t: l) {
vres.emplace_back(std::move(t));
}
std::cout << "Pool has size " << vres.size() << std::endl;
}
~Pool() {
for ( auto & res: vres ) {
if ( ! res.available ) {
std::cerr << "WARNING Pool::~Pool resources are still in use\n";
}
}
}
Borrowed borrow() {
std::unique_lock<std::mutex> lk(mtx);
cv.wait(lk, [&](){return available_cnt > 0;});
if ( vres[hint].available ) {
// quick path, if hint points to an available resource
std::cout << "hint good" << std::endl;
vres[hint].available = false;
--available_cnt;
Borrowed b(*this, hint, vres[hint].val);
if ( hint + 1 < vres.size() ) ++hint;
return b; // <--- gcc seems to hang here
} else {
// full scan to find the available resource
std::cout << "hint bad" << std::endl;
for ( hint = 0; hint < vres.size(); ++hint ) {
if ( vres[hint].available ) {
vres[hint].available = false;
--available_cnt;
return Borrowed(*this, hint, vres[hint].val);
}
}
}
throw std::runtime_error("Pool::borrow() no resource is available - internal logic error");
}
void collect(Borrowed & b) {
if ( &(b.pool) != this )
throw std::runtime_error("Pool::collect() trying to collect resource owned by another pool!");
if ( b.val ) {
b.val = nullptr;
{
std::scoped_lock<std::mutex> lk(mtx);
hint = b.id;
vres[hint].available = true;
++available_cnt;
}
cv.notify_one();
}
}
};
///////////////////////////////////////////////////////////////////
#include <thread>
#include <chrono>
int main() {
Pool<std::string> pool{"hello","world"};
std::vector<std::thread> vt;
for (int i = 10; i > 0; --i) {
vt.emplace_back( [&pool, i]()
{
auto res = pool.borrow();
std::this_thread::sleep_for(std::chrono::milliseconds(i*300));
std::cout << res.get() << std::endl;
}
);
}
for (auto & t: vt) t.join();
return 0;
}
You're running into undefined behavior since you effectively relock an already acquired lock. With MSVC I obtained a helpful callstack to distinguish this. Here is a working fixed example (I suppose, works now for me, see the changes within the borrow() method, might be further re-designed since locking inside a destructor might be questioned):
#include <iostream>
#include <vector>
#include <stdexcept>
#include <mutex>
#include <condition_variable>
template <typename T>
class Pool {
///////////////////////////
class Borrowed {
friend class Pool<T>;
Pool<T>& pool;
const size_t id;
T * val;
public:
Borrowed(Pool & p, size_t i, T& v) : pool(p), id(i), val(&v) {}
~Borrowed() { release(); }
T& get() const {
if (!val) throw std::runtime_error("Borrowed::get() this resource was collected back by the pool");
return *val;
}
void release() { pool.collect(*this); }
};
///////////////////////////
struct Resource {
T val;
bool available = true;
Resource(T v) : val(std::move(v)) {}
};
///////////////////////////
std::vector<Resource> vres;
size_t hint = 0;
std::condition_variable cv;
std::mutex mtx;
size_t available_cnt;
public:
Pool(std::initializer_list<T> l) : available_cnt(l.size()) {
vres.reserve(l.size());
for (T t : l) {
vres.emplace_back(std::move(t));
}
std::cout << "Pool has size " << vres.size() << std::endl;
}
~Pool() {
for (auto & res : vres) {
if (!res.available) {
std::cerr << "WARNING Pool::~Pool resources are still in use\n";
}
}
}
Borrowed borrow() {
std::unique_lock<std::mutex> lk(mtx);
while (available_cnt == 0) cv.wait(lk);
if (vres[hint].available) {
// quick path, if hint points to an available resource
std::cout << "hint good" << std::endl;
vres[hint].available = false;
--available_cnt;
Borrowed b(*this, hint, vres[hint].val);
if (hint + 1 < vres.size()) ++hint;
lk.unlock();
return b; // <--- gcc seems to hang here
}
else {
// full scan to find the available resource
std::cout << "hint bad" << std::endl;
for (hint = 0; hint < vres.size(); ++hint) {
if (vres[hint].available) {
vres[hint].available = false;
--available_cnt;
lk.unlock();
return Borrowed(*this, hint, vres[hint].val);
}
}
}
throw std::runtime_error("Pool::borrow() no resource is available - internal logic error");
}
void collect(Borrowed & b) {
if (&(b.pool) != this)
throw std::runtime_error("Pool::collect() trying to collect resource owned by another pool!");
if (b.val) {
b.val = nullptr;
{
std::scoped_lock<std::mutex> lk(mtx);
hint = b.id;
vres[hint].available = true;
++available_cnt;
cv.notify_one();
}
}
}
};
///////////////////////////////////////////////////////////////////
#include <thread>
#include <chrono>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int main()
{
try
{
Pool<std::string> pool{ "hello","world" };
std::vector<std::thread> vt;
for (int i = 10; i > 0; --i) {
vt.emplace_back([&pool, i]()
{
auto res = pool.borrow();
std::this_thread::sleep_for(std::chrono::milliseconds(i * 300));
std::cout << res.get() << std::endl;
}
);
}
for (auto & t : vt) t.join();
return 0;
}
catch(const std::exception& e)
{
std::cout << "exception occurred: " << e.what();
}
return 0;
}
Locking destructor coupled with missed NRVO caused the issue (credits to Secundi for pointing this out in the comments).
If the compiler skips NRVO, the few lines below if will call the destructor of b. The destructor tries to acquire the mutex before this gets released by the unique_lock, resulting in a deadlock.
Borrowed b(*this, hint, vres[hint].val);
if ( hint + 1 < vres.size() ) ++hint;
return b; // <--- gcc seems to hang here
It is of crucial importance here to avoid destroying b. In fact, even if manually releasing the unique_lock before returning will avoid the deadlock, the destructor of b will mark the pooled resource as available, while this is just being borrowed out, making the code wrong.
A possible fix consists in replacing the lines above with:
const auto tmp = hint;
if ( hint + 1 < vres.size() ) ++hint;
return Borrowed(*this, tmp, vres[tmp].val);
Another possibility (which does not exclude the former) is to delete the (evil) copy ctor of Borrowed and only provide a move ctor:
Borrowed(const Borrowed &) = delete;
Borrowed(Borrowed && b): pool(b.pool), id(b.id), val(b.val) { b.val = nullptr; }

Boost thread pool join tasks without closing the pool [duplicate]

Consider the functions
#include <iostream>
#include <boost/bind.hpp>
#include <boost/asio.hpp>
void foo(const uint64_t begin, uint64_t *result)
{
uint64_t prev[] = {begin, 0};
for (uint64_t i = 0; i < 1000000000; ++i)
{
const auto tmp = (prev[0] + prev[1]) % 1000;
prev[1] = prev[0];
prev[0] = tmp;
}
*result = prev[0];
}
void batch(boost::asio::thread_pool &pool, const uint64_t a[])
{
uint64_t r[] = {0, 0};
boost::asio::post(pool, boost::bind(foo, a[0], &r[0]));
boost::asio::post(pool, boost::bind(foo, a[1], &r[1]));
pool.join();
std::cerr << "foo(" << a[0] << "): " << r[0] << " foo(" << a[1] << "): " << r[1] << std::endl;
}
where foo is a simple "pure" function that performs a calculation on begin and writes the result to the pointer *result.
This function gets called with different inputs from batch. Here dispatching each call to another CPU core might be beneficial.
Now assume the batch function gets called several 10 000 times. Therefore a thread pool would be nice which is shared between all the sequential batch calls.
Trying this with (for the sake of simplicity only 3 calls)
int main(int argn, char **)
{
boost::asio::thread_pool pool(2);
const uint64_t a[] = {2, 4};
batch(pool, a);
const uint64_t b[] = {3, 5};
batch(pool, b);
const uint64_t c[] = {7, 9};
batch(pool, c);
}
leads to the result
foo(2): 2 foo(4): 4
foo(3): 0 foo(5): 0
foo(7): 0 foo(9): 0
Where all three lines appear at the same time, while the computation of foo takes ~3s.
I assume that only the first join really waits for the pool to complete all jobs.
The others have invalid results. (The not initialized values)
What is the best practice here to reuse the thread pool?
The best practice is not to reuse the pool (what would be the use of pooling, if you keep creating new pools?).
If you want to be sure you "time" the batches together, I'd suggest using when_all on futures:
Live On Coliru
#define BOOST_THREAD_PROVIDES_FUTURE_WHEN_ALL_WHEN_ANY
#include <iostream>
#include <boost/bind.hpp>
#include <boost/asio.hpp>
#include <boost/thread.hpp>
uint64_t foo(uint64_t begin) {
uint64_t prev[] = {begin, 0};
for (uint64_t i = 0; i < 1000000000; ++i) {
const auto tmp = (prev[0] + prev[1]) % 1000;
prev[1] = prev[0];
prev[0] = tmp;
}
return prev[0];
}
void batch(boost::asio::thread_pool &pool, const uint64_t a[2])
{
using T = boost::packaged_task<uint64_t>;
T tasks[] {
T(boost::bind(foo, a[0])),
T(boost::bind(foo, a[1])),
};
auto all = boost::when_all(
tasks[0].get_future(),
tasks[1].get_future());
for (auto& t : tasks)
post(pool, std::move(t));
auto [r0, r1] = all.get();
std::cerr << "foo(" << a[0] << "): " << r0.get() << " foo(" << a[1] << "): " << r1.get() << std::endl;
}
int main() {
boost::asio::thread_pool pool(2);
const uint64_t a[] = {2, 4};
batch(pool, a);
const uint64_t b[] = {3, 5};
batch(pool, b);
const uint64_t c[] = {7, 9};
batch(pool, c);
}
Prints
foo(2): 2 foo(4): 4
foo(3): 503 foo(5): 505
foo(7): 507 foo(9): 509
I would consider
generalizing
message queuing
Generalized
Make it somewhat more flexible by not hardcoding batch sizes. After all, the pool size is already fixed, we don't need to "make sure batches fit" or something:
Live On Coliru
#define BOOST_THREAD_PROVIDES_FUTURE_WHEN_ALL_WHEN_ANY
#include <iostream>
#include <boost/bind.hpp>
#include <boost/asio.hpp>
#include <boost/thread.hpp>
#include <boost/thread/future.hpp>
struct Result { uint64_t begin, result; };
Result foo(uint64_t begin) {
uint64_t prev[] = {begin, 0};
for (uint64_t i = 0; i < 1000000000; ++i) {
const auto tmp = (prev[0] + prev[1]) % 1000;
prev[1] = prev[0];
prev[0] = tmp;
}
return { begin, prev[0] };
}
void batch(boost::asio::thread_pool &pool, std::vector<uint64_t> const a)
{
using T = boost::packaged_task<Result>;
std::vector<T> tasks;
tasks.reserve(a.size());
for(auto begin : a)
tasks.emplace_back(boost::bind(foo, begin));
std::vector<boost::unique_future<T::result_type> > futures;
for (auto& t : tasks) {
futures.push_back(t.get_future());
post(pool, std::move(t));
}
for (auto& fut : boost::when_all(futures.begin(), futures.end()).get()) {
auto r = fut.get();
std::cerr << "foo(" << r.begin << "): " << r.result << " ";
}
std::cout << std::endl;
}
int main() {
boost::asio::thread_pool pool(2);
batch(pool, {2});
batch(pool, {4, 3, 5});
batch(pool, {7, 9});
}
Prints
foo(2): 2
foo(4): 4 foo(3): 503 foo(5): 505
foo(7): 507 foo(9): 509
Generalized2: Variadics Simplify
Contrary to popular believe (and honestly, what usually happens) this time we can leverage variadics to get rid of all the intermediate vectors (every single one of them):
Live On Coliru
void batch(boost::asio::thread_pool &pool, T... a)
{
auto launch = [&pool](uint64_t begin) {
boost::packaged_task<Result> pt(boost::bind(foo, begin));
auto fut = pt.get_future();
post(pool, std::move(pt));
return fut;
};
for (auto& r : {launch(a).get()...}) {
std::cerr << "foo(" << r.begin << "): " << r.result << " ";
}
std::cout << std::endl;
}
If you insist on outputting the results in time, you can still add when_all into the mix (requiring a bit more heroics to unpack the tuple):
Live On Coliru
template <typename...T>
void batch(boost::asio::thread_pool &pool, T... a)
{
auto launch = [&pool](uint64_t begin) {
boost::packaged_task<Result> pt(boost::bind(foo, begin));
auto fut = pt.get_future();
post(pool, std::move(pt));
return fut;
};
std::apply([](auto&&... rfut) {
Result results[] {rfut.get()...};
for (auto& r : results) {
std::cerr << "foo(" << r.begin << "): " << r.result << " ";
}
}, boost::when_all(launch(a)...).get());
std::cout << std::endl;
}
Both still print the same result
Message Queuing
This is very natural to boost, and sort of skips most complexity. If you also want to report per batched group, you'd have to coordinate:
Live On Coliru
#include <iostream>
#include <boost/asio.hpp>
#include <memory>
struct Result { uint64_t begin, result; };
Result foo(uint64_t begin) {
uint64_t prev[] = {begin, 0};
for (uint64_t i = 0; i < 1000000000; ++i) {
const auto tmp = (prev[0] + prev[1]) % 1000;
prev[1] = prev[0];
prev[0] = tmp;
}
return { begin, prev[0] };
}
using Group = std::shared_ptr<size_t>;
void batch(boost::asio::thread_pool &pool, std::vector<uint64_t> begins) {
auto group = std::make_shared<std::vector<Result> >(begins.size());
for (size_t i=0; i < begins.size(); ++i) {
post(pool, [i,begin=begins.at(i),group] {
(*group)[i] = foo(begin);
if (group.unique()) {
for (auto& r : *group) {
std::cout << "foo(" << r.begin << "): " << r.result << " ";
std::cout << std::endl;
}
}
});
}
}
int main() {
boost::asio::thread_pool pool(2);
batch(pool, {2});
batch(pool, {4, 3, 5});
batch(pool, {7, 9});
pool.join();
}
Note this is having concurrent access to group, which is safe due to the limitations on element accesses.
Prints:
foo(2): 2
foo(4): 4 foo(3): 503 foo(5): 505
foo(7): 507 foo(9): 509
I just ran into this advanced executor example which is hidden from the documentation:
I realized just now that Asio comes with a fork_executor example which does exactly this: you can "group" tasks and join the executor (which represents that group) instead of the pool. I've missed this for the longest time since none of the executor examples are listed in the HTML documentation – sehe 21 mins ago
So without further ado, here's that sample applied to your question:
Live On Coliru
#define BOOST_BIND_NO_PLACEHOLDERS
#include <boost/asio/thread_pool.hpp>
#include <boost/asio/ts/executor.hpp>
#include <condition_variable>
#include <memory>
#include <mutex>
#include <queue>
#include <thread>
// A fixed-size thread pool used to implement fork/join semantics. Functions
// are scheduled using a simple FIFO queue. Implementing work stealing, or
// using a queue based on atomic operations, are left as tasks for the reader.
class fork_join_pool : public boost::asio::execution_context {
public:
// The constructor starts a thread pool with the specified number of
// threads. Note that the thread_count is not a fixed limit on the pool's
// concurrency. Additional threads may temporarily be added to the pool if
// they join a fork_executor.
explicit fork_join_pool(std::size_t thread_count = std::thread::hardware_concurrency()*2)
: use_count_(1), threads_(thread_count)
{
try {
// Ask each thread in the pool to dequeue and execute functions
// until it is time to shut down, i.e. the use count is zero.
for (thread_count_ = 0; thread_count_ < thread_count; ++thread_count_) {
boost::asio::dispatch(threads_, [&] {
std::unique_lock<std::mutex> lock(mutex_);
while (use_count_ > 0)
if (!execute_next(lock))
condition_.wait(lock);
});
}
} catch (...) {
stop_threads();
threads_.join();
throw;
}
}
// The destructor waits for the pool to finish executing functions.
~fork_join_pool() {
stop_threads();
threads_.join();
}
private:
friend class fork_executor;
// The base for all functions that are queued in the pool.
struct function_base {
std::shared_ptr<std::size_t> work_count_;
void (*execute_)(std::shared_ptr<function_base>& p);
};
// Execute the next function from the queue, if any. Returns true if a
// function was executed, and false if the queue was empty.
bool execute_next(std::unique_lock<std::mutex>& lock) {
if (queue_.empty())
return false;
auto p(queue_.front());
queue_.pop();
lock.unlock();
execute(lock, p);
return true;
}
// Execute a function and decrement the outstanding work.
void execute(std::unique_lock<std::mutex>& lock,
std::shared_ptr<function_base>& p) {
std::shared_ptr<std::size_t> work_count(std::move(p->work_count_));
try {
p->execute_(p);
lock.lock();
do_work_finished(work_count);
} catch (...) {
lock.lock();
do_work_finished(work_count);
throw;
}
}
// Increment outstanding work.
void
do_work_started(const std::shared_ptr<std::size_t>& work_count) noexcept {
if (++(*work_count) == 1)
++use_count_;
}
// Decrement outstanding work. Notify waiting threads if we run out.
void
do_work_finished(const std::shared_ptr<std::size_t>& work_count) noexcept {
if (--(*work_count) == 0) {
--use_count_;
condition_.notify_all();
}
}
// Dispatch a function, executing it immediately if the queue is already
// loaded. Otherwise adds the function to the queue and wakes a thread.
void do_dispatch(std::shared_ptr<function_base> p,
const std::shared_ptr<std::size_t>& work_count) {
std::unique_lock<std::mutex> lock(mutex_);
if (queue_.size() > thread_count_ * 16) {
do_work_started(work_count);
lock.unlock();
execute(lock, p);
} else {
queue_.push(p);
do_work_started(work_count);
condition_.notify_one();
}
}
// Add a function to the queue and wake a thread.
void do_post(std::shared_ptr<function_base> p,
const std::shared_ptr<std::size_t>& work_count) {
std::lock_guard<std::mutex> lock(mutex_);
queue_.push(p);
do_work_started(work_count);
condition_.notify_one();
}
// Ask all threads to shut down.
void stop_threads() {
std::lock_guard<std::mutex> lock(mutex_);
--use_count_;
condition_.notify_all();
}
std::mutex mutex_;
std::condition_variable condition_;
std::queue<std::shared_ptr<function_base>> queue_;
std::size_t use_count_;
std::size_t thread_count_;
boost::asio::thread_pool threads_;
};
// A class that satisfies the Executor requirements. Every function or piece of
// work associated with a fork_executor is part of a single, joinable group.
class fork_executor {
public:
fork_executor(fork_join_pool& ctx)
: context_(ctx), work_count_(std::make_shared<std::size_t>(0)) {}
fork_join_pool& context() const noexcept { return context_; }
void on_work_started() const noexcept {
std::lock_guard<std::mutex> lock(context_.mutex_);
context_.do_work_started(work_count_);
}
void on_work_finished() const noexcept {
std::lock_guard<std::mutex> lock(context_.mutex_);
context_.do_work_finished(work_count_);
}
template <class Func, class Alloc>
void dispatch(Func&& f, const Alloc& a) const {
auto p(std::allocate_shared<exFun<Func>>(
typename std::allocator_traits<Alloc>::template rebind_alloc<char>(a),
std::move(f), work_count_));
context_.do_dispatch(p, work_count_);
}
template <class Func, class Alloc> void post(Func f, const Alloc& a) const {
auto p(std::allocate_shared<exFun<Func>>(
typename std::allocator_traits<Alloc>::template rebind_alloc<char>(a),
std::move(f), work_count_));
context_.do_post(p, work_count_);
}
template <class Func, class Alloc>
void defer(Func&& f, const Alloc& a) const {
post(std::forward<Func>(f), a);
}
friend bool operator==(const fork_executor& a, const fork_executor& b) noexcept {
return a.work_count_ == b.work_count_;
}
friend bool operator!=(const fork_executor& a, const fork_executor& b) noexcept {
return a.work_count_ != b.work_count_;
}
// Block until all work associated with the executor is complete. While it
// is waiting, the thread may be borrowed to execute functions from the
// queue.
void join() const {
std::unique_lock<std::mutex> lock(context_.mutex_);
while (*work_count_ > 0)
if (!context_.execute_next(lock))
context_.condition_.wait(lock);
}
private:
template <class Func> struct exFun : fork_join_pool::function_base {
explicit exFun(Func f, const std::shared_ptr<std::size_t>& w)
: function_(std::move(f)) {
work_count_ = w;
execute_ = [](std::shared_ptr<fork_join_pool::function_base>& p) {
Func tmp(std::move(static_cast<exFun*>(p.get())->function_));
p.reset();
tmp();
};
}
Func function_;
};
fork_join_pool& context_;
std::shared_ptr<std::size_t> work_count_;
};
// Helper class to automatically join a fork_executor when exiting a scope.
class join_guard {
public:
explicit join_guard(const fork_executor& ex) : ex_(ex) {}
join_guard(const join_guard&) = delete;
join_guard(join_guard&&) = delete;
~join_guard() { ex_.join(); }
private:
fork_executor ex_;
};
//------------------------------------------------------------------------------
#include <algorithm>
#include <iostream>
#include <random>
#include <vector>
#include <boost/bind.hpp>
static void foo(const uint64_t begin, uint64_t *result)
{
uint64_t prev[] = {begin, 0};
for (uint64_t i = 0; i < 1000000000; ++i) {
const auto tmp = (prev[0] + prev[1]) % 1000;
prev[1] = prev[0];
prev[0] = tmp;
}
*result = prev[0];
}
void batch(fork_join_pool &pool, const uint64_t (&a)[2])
{
uint64_t r[] = {0, 0};
{
fork_executor fork(pool);
join_guard join(fork);
boost::asio::post(fork, boost::bind(foo, a[0], &r[0]));
boost::asio::post(fork, boost::bind(foo, a[1], &r[1]));
// fork.join(); // or let join_guard destructor run
}
std::cerr << "foo(" << a[0] << "): " << r[0] << " foo(" << a[1] << "): " << r[1] << std::endl;
}
int main() {
fork_join_pool pool;
batch(pool, {2, 4});
batch(pool, {3, 5});
batch(pool, {7, 9});
}
Prints:
foo(2): 2 foo(4): 4
foo(3): 503 foo(5): 505
foo(7): 507 foo(9): 509
Things to note:
executors can overlap/nest: you can use several joinable fork_executors on a single fork_join_pool and they will join the distinct groups of tasks for each executor
You can get that sense easily when looking at the library example (which does a recursive divide-and-conquer merge sort).
I had a similar problem and ended up using latches. In this case the code would would be (I also switched from bind to lambdas):
void batch(boost::asio::thread_pool &pool, const uint64_t a[])
{
uint64_t r[] = {0, 0};
boost::latch latch(2);
boost::asio::post(pool, [&](){ foo(a[0], &r[0]); latch.count_down();});
boost::asio::post(pool, [&](){ foo(a[1], &r[1]); latch.count_down();});
latch.wait();
std::cerr << "foo(" << a[0] << "): " << r[0] << " foo(" << a[1] << "): " << r[1] << std::endl;
}
https://godbolt.org/z/oceP6jjs7

How to wait until all threads from the pool ends their work?

I am trying to implement simple thread pool using boost library.
Here is code:
//boost::asio::io_service ioService;
//boost::thread_group pool;
//boost::asio::io_service::work* worker;
ThreadPool::ThreadPool(int poolSize /*= boost::thread::hardware_concurrency()*/)
{
if (poolSize >= 1 && poolSize <= boost::thread::hardware_concurrency())
threadAmount = poolSize;
else
threadAmount = 1;
worker = NULL;
}
ThreadPool::~ThreadPool()
{
if (worker != NULL && !ioService.stopped())
{
_shutdown();
delete worker;
worker = NULL;
}
}
void ThreadPool::start()
{
if (worker != NULL)
{
return;
}
worker = new boost::asio::io_service::work(ioService);
for (int i = 0; i < threadAmount; ++i)
{
pool.create_thread(boost::bind(&boost::asio::io_service::run, &ioService));
}
}
template<class F, class...Args>
void ThreadPool::execute(F f, Args&&... args)
{
ioService.post(boost::bind(f, std::forward<Args>(args)...));
}
void ThreadPool::shutdown()
{
pool.interrupt_all();
_shutdown();
}
void ThreadPool::join_all()
{
// wait for all threads before continue
// in other words - barier for all threads when they finished all jobs
// and to be able re-use them in futur.
}
void ThreadPool::_shutdown()
{
ioService.reset();
ioService.stop();
}
In my program i assign to thread pool some tasks that needs to be done, and going further with main thread. At some point i need to wait for all threads to finished all tasks before i could proceed calculations. Is there any way to do this ?
Thanks a lot.
As others have pointed out, the main culprit is the work instance.
I'd much simplify the interface (there's really no reason to split shutdown into shutdown, _shutdown, join_all and some random logic in the destructor as well. That just makes it hard to know what responsibility is where.
The interface should be a Pit Of Success - easy to use right, hard to use wrong.
At the same time it makes it much easier to implement it correctly.
Here's a first stab:
Live On Coliru
#include <boost/asio.hpp>
#include <boost/thread.hpp>
namespace ba = boost::asio;
struct ThreadPool {
ThreadPool(unsigned poolSize = boost::thread::hardware_concurrency());
~ThreadPool();
void start();
template <typename F, typename... Args>
void execute(F f, Args&&... args) {
ioService.post(std::bind(f, std::forward<Args>(args)...));
}
private:
unsigned threadAmount;
ba::io_service ioService;
boost::thread_group pool;
std::unique_ptr<ba::io_service::work> work;
void shutdown();
};
ThreadPool::ThreadPool(
unsigned poolSize /*= boost::thread::hardware_concurrency()*/) {
threadAmount = std::max(1u, poolSize);
threadAmount = std::min(boost::thread::hardware_concurrency(), poolSize);
}
ThreadPool::~ThreadPool() {
shutdown();
}
void ThreadPool::start() {
if (!work) {
work = std::make_unique<ba::io_service::work>(ioService);
for (unsigned i = 0; i < threadAmount; ++i) {
pool.create_thread(
boost::bind(&ba::io_service::run, &ioService));
}
}
}
void ThreadPool::shutdown() {
work.reset();
pool.interrupt_all();
ioService.stop();
pool.join_all();
ioService.reset();
}
#include <iostream>
using namespace std::chrono_literals;
int main() {
auto now = std::chrono::high_resolution_clock::now;
auto s = now();
{
ThreadPool p(10);
p.start();
p.execute([] { std::this_thread::sleep_for(1s); });
p.execute([] { std::this_thread::sleep_for(600ms); });
p.execute([] { std::this_thread::sleep_for(400ms); });
p.execute([] { std::this_thread::sleep_for(200ms); });
p.execute([] { std::this_thread::sleep_for(10ms); });
}
std::cout << "Total elapsed: " << (now() - s) / 1.0s << "s\n";
}
Which on most multi-core systems will print something like on mine:
Total elapsed: 1.00064s
It looks like you had an error in calculating threadAmount where you'd take 1 if poolSize was more than hardware_concurrency.
To be honest, why have the bind in the implementation? It really doesn't add a lot, you can leave it up to the caller, and they can choose whether they use bind, and if so, whether it's boost::bind, std::bind or some other way of composing calleables:
template <typename F>
void execute(F f) { ioService.post(f); }
You're missing exception handling around io_service::run calls (see Should the exception thrown by boost::asio::io_service::run() be caught?).
If you're using recent boost version, you can use the newer io_context and thread_pool interfaces, greatly simplifying things:
Live On Coliru
#include <boost/asio.hpp>
struct ThreadPool {
ThreadPool(unsigned poolSize)
: pool(std::clamp(poolSize, 1u, std::thread::hardware_concurrency()))
{ }
template <typename F>
void execute(F f) { post(pool, f); }
private:
boost::asio::thread_pool pool;
};
This still has 99% of the functionality¹, but in 10 LoC.
In fact, the class has become a trivial wrapper, so we could just write:
Live On Coliru
#include <boost/asio.hpp>
#include <iostream>
using namespace std::chrono_literals;
using C = std::chrono::high_resolution_clock;
static void sleep_for(C::duration d) { std::this_thread::sleep_for(d); }
int main() {
auto s = C::now();
{
boost::asio::thread_pool pool;
post(pool, [] { sleep_for(1s); });
post(pool, [] { sleep_for(600ms); });
// still can bind if you want
post(pool, std::bind(sleep_for, 400ms));
post(pool, std::bind(sleep_for, 200ms));
post(pool, std::bind(sleep_for, 10ms));
//pool.join(); // implicit in destructor
}
std::cout << "Total elapsed: " << (C::now() - s) / 1.0s << "s\n";
}
Main difference is the default pool size: it is 2*hardware concurrency (but also calculated more safely, because not all platforms have a reliable hardware_concurrency() - it could be zero, e.g.).
¹ It doesn't currently exercise interruptions points

Using condition_variable::notify_all to notify multiple threads

I have been trying to code the dining philosophers as a way to get better with multithreading programming. In my code, I have a condition_variable that stops the thread until all the threads have been created. However, it seems that when I call condition_variable::notify_all to notify that all the threads have been created and to start 'eating', only one thread is notified. For example:
I have a Philosophers class which has these member variables:
static std::condition_variable start;
static std::mutex start_mutex;
And these member function.
static void start_eating() {
start.notify_all();
}
void dine() {
signal(SIGINT, ctrl_c_catch);
std::unique_lock lk{ start_mutex };
start.wait(lk);
std::cout << id << "started\n";
// see end for complete class...
Each thread waits on the condition_variable start and won't continue until I call start_eating(). The problem is that when I call start.notify_all();, only one of the threads gets notified and continues. However, when I change the code to unlock the mutex after waiting, everything runs OK (All the threads continue):
std::unique_lock lk{ start_mutex };
start.wait(lk);
lk.unlock();
I was dont understand what is going on here. Why do I need to unlock the mutex?
The full code:
#include <chrono>
#include <mutex>
#include <vector>
#include <thread>
#include <condition_variable>
#include <atomic>
#include <signal.h>
#include <iostream>
#include <shared_mutex>
#include <ctime>
namespace clk = std::chrono;
const auto EAT_SLEEP_TIME = clk::milliseconds{1}; // 5 seconds
const auto NUM_SEATS = 5U;
using Fork = std::mutex; // is the fork being used or not
std::mutex cout_mutex;
void ctrl_c_catch(int dummy);
class Philosopher {
Fork& left;
Fork& right;
unsigned id;
unsigned times_eaten;
static std::condition_variable start;
static std::mutex start_mutex;
static std::atomic_bool end;
public:
Philosopher(Fork& l, Fork& r, unsigned i) : left{ l }, right{ r }, id{ i }, times_eaten{} {}
static void start_eating() {
start.notify_all();
}
static void stop_eating() {
end = true;
}
void dine() {
signal(SIGINT, ctrl_c_catch);
std::unique_lock lk{ start_mutex };
start.wait(lk);
// lk.unlock(); // uncommenting this fixes the issue
std::cout << id << " started\n";
while (!end) {
if (&right < &left) {
right.lock();
left.lock();
} else {
left.lock();
right.lock();
}
cout_mutex.lock();
std::clog << id << " got both forks, eating\n";
cout_mutex.unlock();
++times_eaten;
std::this_thread::sleep_for(EAT_SLEEP_TIME * (rand() % 50));
right.unlock();
left.unlock();
std::this_thread::sleep_for(EAT_SLEEP_TIME * (rand() % 50));
}
cout_mutex.lock();
std::cout << id << " stopped, terminating thread. Eaten " << times_eaten << "\n";
cout_mutex.unlock();
delete this;
}
};
std::atomic_bool Philosopher::end = false;
std::condition_variable Philosopher::start{};
std::mutex Philosopher::start_mutex{};
template <size_t N, typename T = unsigned>
constexpr std::array<T, N> range(T b = 0, T s = 1) {
std::array<T, N> ret{};
for (auto& i : ret) {
i = b;
b += s;
}
return ret;
}
void ctrl_c_catch(int dummy) {
std::cout << "Caught ctrl-c or stop\nStoping Philosophers\n";
Philosopher::stop_eating();
std::this_thread::sleep_for(clk::seconds{5});
exit(0);
}
int main() {
srand(time(NULL));
signal(SIGINT, ctrl_c_catch);
std::vector<Fork> forks{ NUM_SEATS }; // 5 forks
std::vector<std::thread> phil; // vector of philosophers
for (unsigned i : range<NUM_SEATS - 1>()) {
auto p = new Philosopher{forks[i], forks[i + 1], i};
phil.emplace_back(&Philosopher::dine, p);
}
auto p = new Philosopher{forks[NUM_SEATS - 1], forks[0], NUM_SEATS - 1};
phil.emplace_back(&Philosopher::dine, p);
std::clog << "Waiting for 5 seconds\n";
std::this_thread::sleep_for(clk::seconds{10});
std::clog << "Starting Philosophers\n Type 'stop' to stop\n";
Philosopher::start_eating();
for (auto& t : phil)
t.detach();
std::this_thread::sleep_for(clk::seconds{15});
ctrl_c_catch(0);
std::string dummy;
std::cin >> dummy;
if (dummy == "stop")
ctrl_c_catch(0);
return 0;
}
As explained here, calling std::condition_variable::wait releases the lock, waits, and after waking up, the lock is reacquired. So you need to unlock it manually (or automatically using RAII) to allow other threads to lock it. Condition variables in C++ have similar semantics to non-blocking monitors, so you can read up on that to get a better intuitive understanding. Also, because of spurious unblocking, which is impossible to prevent, you should use the other version of the function, the one that uses a predicate (more info in above link).