Concurrent program compiled with clang runs fine, but hangs with gcc - c++

I wrote a class to share a limited number of resources (for instance network interfaces) between a larger number of threads. The resources are pooled and, if not in use, they are borrowed out to the requesting thread, which otherwise waits on a condition_variable.
Nothing really exotic: apart for the fancy scoped_lock which requires c++17, it should be good old c++11.
Both gcc10.2 and clang11 compile the test main fine, but while the latter produces an executable which does pretty much what expected, the former hangs without consuming CPU (deadlock?).
With the help of https://godbolt.org/ I tried older versions of gcc and also icc (passing options -O3 -std=c++17 -pthread), all reproducing the bad result, while even there clang confirms the proper behavior.
I wonder if I made a mistake or if the code triggers some compiler misbehavior and in case how to work around that.
#include <iostream>
#include <vector>
#include <stdexcept>
#include <mutex>
#include <condition_variable>
template <typename T>
class Pool {
///////////////////////////
class Borrowed {
friend class Pool<T>;
Pool<T>& pool;
const size_t id;
T * val;
public:
Borrowed(Pool & p, size_t i, T& v): pool(p), id(i), val(&v) {}
~Borrowed() { release(); }
T& get() const {
if (!val) throw std::runtime_error("Borrowed::get() this resource was collected back by the pool");
return *val;
}
void release() { pool.collect(*this); }
};
///////////////////////////
struct Resource {
T val;
bool available = true;
Resource(T v): val(std::move(v)) {}
};
///////////////////////////
std::vector<Resource> vres;
size_t hint = 0;
std::condition_variable cv;
std::mutex mtx;
size_t available_cnt;
public:
Pool(std::initializer_list<T> l): available_cnt(l.size()) {
vres.reserve(l.size());
for (T t: l) {
vres.emplace_back(std::move(t));
}
std::cout << "Pool has size " << vres.size() << std::endl;
}
~Pool() {
for ( auto & res: vres ) {
if ( ! res.available ) {
std::cerr << "WARNING Pool::~Pool resources are still in use\n";
}
}
}
Borrowed borrow() {
std::unique_lock<std::mutex> lk(mtx);
cv.wait(lk, [&](){return available_cnt > 0;});
if ( vres[hint].available ) {
// quick path, if hint points to an available resource
std::cout << "hint good" << std::endl;
vres[hint].available = false;
--available_cnt;
Borrowed b(*this, hint, vres[hint].val);
if ( hint + 1 < vres.size() ) ++hint;
return b; // <--- gcc seems to hang here
} else {
// full scan to find the available resource
std::cout << "hint bad" << std::endl;
for ( hint = 0; hint < vres.size(); ++hint ) {
if ( vres[hint].available ) {
vres[hint].available = false;
--available_cnt;
return Borrowed(*this, hint, vres[hint].val);
}
}
}
throw std::runtime_error("Pool::borrow() no resource is available - internal logic error");
}
void collect(Borrowed & b) {
if ( &(b.pool) != this )
throw std::runtime_error("Pool::collect() trying to collect resource owned by another pool!");
if ( b.val ) {
b.val = nullptr;
{
std::scoped_lock<std::mutex> lk(mtx);
hint = b.id;
vres[hint].available = true;
++available_cnt;
}
cv.notify_one();
}
}
};
///////////////////////////////////////////////////////////////////
#include <thread>
#include <chrono>
int main() {
Pool<std::string> pool{"hello","world"};
std::vector<std::thread> vt;
for (int i = 10; i > 0; --i) {
vt.emplace_back( [&pool, i]()
{
auto res = pool.borrow();
std::this_thread::sleep_for(std::chrono::milliseconds(i*300));
std::cout << res.get() << std::endl;
}
);
}
for (auto & t: vt) t.join();
return 0;
}

You're running into undefined behavior since you effectively relock an already acquired lock. With MSVC I obtained a helpful callstack to distinguish this. Here is a working fixed example (I suppose, works now for me, see the changes within the borrow() method, might be further re-designed since locking inside a destructor might be questioned):
#include <iostream>
#include <vector>
#include <stdexcept>
#include <mutex>
#include <condition_variable>
template <typename T>
class Pool {
///////////////////////////
class Borrowed {
friend class Pool<T>;
Pool<T>& pool;
const size_t id;
T * val;
public:
Borrowed(Pool & p, size_t i, T& v) : pool(p), id(i), val(&v) {}
~Borrowed() { release(); }
T& get() const {
if (!val) throw std::runtime_error("Borrowed::get() this resource was collected back by the pool");
return *val;
}
void release() { pool.collect(*this); }
};
///////////////////////////
struct Resource {
T val;
bool available = true;
Resource(T v) : val(std::move(v)) {}
};
///////////////////////////
std::vector<Resource> vres;
size_t hint = 0;
std::condition_variable cv;
std::mutex mtx;
size_t available_cnt;
public:
Pool(std::initializer_list<T> l) : available_cnt(l.size()) {
vres.reserve(l.size());
for (T t : l) {
vres.emplace_back(std::move(t));
}
std::cout << "Pool has size " << vres.size() << std::endl;
}
~Pool() {
for (auto & res : vres) {
if (!res.available) {
std::cerr << "WARNING Pool::~Pool resources are still in use\n";
}
}
}
Borrowed borrow() {
std::unique_lock<std::mutex> lk(mtx);
while (available_cnt == 0) cv.wait(lk);
if (vres[hint].available) {
// quick path, if hint points to an available resource
std::cout << "hint good" << std::endl;
vres[hint].available = false;
--available_cnt;
Borrowed b(*this, hint, vres[hint].val);
if (hint + 1 < vres.size()) ++hint;
lk.unlock();
return b; // <--- gcc seems to hang here
}
else {
// full scan to find the available resource
std::cout << "hint bad" << std::endl;
for (hint = 0; hint < vres.size(); ++hint) {
if (vres[hint].available) {
vres[hint].available = false;
--available_cnt;
lk.unlock();
return Borrowed(*this, hint, vres[hint].val);
}
}
}
throw std::runtime_error("Pool::borrow() no resource is available - internal logic error");
}
void collect(Borrowed & b) {
if (&(b.pool) != this)
throw std::runtime_error("Pool::collect() trying to collect resource owned by another pool!");
if (b.val) {
b.val = nullptr;
{
std::scoped_lock<std::mutex> lk(mtx);
hint = b.id;
vres[hint].available = true;
++available_cnt;
cv.notify_one();
}
}
}
};
///////////////////////////////////////////////////////////////////
#include <thread>
#include <chrono>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int main()
{
try
{
Pool<std::string> pool{ "hello","world" };
std::vector<std::thread> vt;
for (int i = 10; i > 0; --i) {
vt.emplace_back([&pool, i]()
{
auto res = pool.borrow();
std::this_thread::sleep_for(std::chrono::milliseconds(i * 300));
std::cout << res.get() << std::endl;
}
);
}
for (auto & t : vt) t.join();
return 0;
}
catch(const std::exception& e)
{
std::cout << "exception occurred: " << e.what();
}
return 0;
}

Locking destructor coupled with missed NRVO caused the issue (credits to Secundi for pointing this out in the comments).
If the compiler skips NRVO, the few lines below if will call the destructor of b. The destructor tries to acquire the mutex before this gets released by the unique_lock, resulting in a deadlock.
Borrowed b(*this, hint, vres[hint].val);
if ( hint + 1 < vres.size() ) ++hint;
return b; // <--- gcc seems to hang here
It is of crucial importance here to avoid destroying b. In fact, even if manually releasing the unique_lock before returning will avoid the deadlock, the destructor of b will mark the pooled resource as available, while this is just being borrowed out, making the code wrong.
A possible fix consists in replacing the lines above with:
const auto tmp = hint;
if ( hint + 1 < vres.size() ) ++hint;
return Borrowed(*this, tmp, vres[tmp].val);
Another possibility (which does not exclude the former) is to delete the (evil) copy ctor of Borrowed and only provide a move ctor:
Borrowed(const Borrowed &) = delete;
Borrowed(Borrowed && b): pool(b.pool), id(b.id), val(b.val) { b.val = nullptr; }

Related

Determining function time using a wrapper

I'm looking for a generic way of measuring a functions timing like Here, but for c++.
My main goal is to not have cluttered code like this piece everywhere:
auto t1 = std::chrono::high_resolution_clock::now();
function(arg1, arg2);
auto t2 = std::chrono::high_resolution_clock::now();
auto tDur = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
But rather have a nice wrapper around the function.
What I got so far is:
timing.hpp:
#pragma once
#include <chrono>
#include <functional>
template <typename Tret, typename Tin1, typename Tin2> unsigned int getDuration(std::function<Tret(Tin1, Tin2)> function, Tin1 arg1, Tin2 arg2, Tret& retValue)
{
auto t1 = std::chrono::high_resolution_clock::now();
retValue = function(arg1, arg2);
auto t2 = std::chrono::high_resolution_clock::now();
auto tDur = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
return tDur.count();
}
main.cpp:
#include "timing.hpp"
#include "matrix.hpp"
constexpr int G_MATRIXSIZE = 2000;
int main(int argc, char** argv)
{
CMatrix<double> myMatrix(G_MATRIXSIZE);
bool ret;
// this call is quite ugly
std::function<bool(int, std::vector<double>)> fillRow = std::bind(&CMatrix<double>::fillRow, &myMatrix, 0, fillVec);
auto duration = getDuration(fillRow, 5, fillVec, ret );
std::cout << "duration(ms): " << duration << std::endl;
}
in case sb wants to test the code, matrix.hpp:
#pragma once
#include <iostream>
#include <string>
#include <sstream>
#include <vector>
template<typename T> class CMatrix {
public:
// ctor
CMatrix(int size) :
m_size(size)
{
m_matrixData = new std::vector<std::vector<T>>;
createUnityMatrix();
}
// dtor
~CMatrix()
{
std::cout << "Destructor of CMatrix called" << std::endl;
delete m_matrixData;
}
// print to std::out
void printMatrix()
{
std::ostringstream oss;
for (int i = 0; i < m_size; i++)
{
for (int j = 0; j < m_size; j++)
{
oss << m_matrixData->at(i).at(j) << ";";
}
oss << "\n";
}
std::cout << oss.str() << std::endl;
}
bool fillRow(int index, std::vector<T> row)
{
// checks
if (!indexValid(index))
{
return false;
}
if (row.size() != m_size)
{
return false;
}
// data replacement
for (int j = 0; j < m_size; j++)
{
m_matrixData->at(index).at(j) = row.at(j);
}
return true;
}
bool fillColumn(int index, std::vector<T> column)
{
// checks
if (!indexValid(index))
{
return false;
}
if (column.size() != m_size)
{
return false;
}
// data replacement
for (int j = 0; j < m_size; j++)
{
m_matrixData->at(index).at(j) = column.at(j);
}
return true;
}
private:
// variables
std::vector<std::vector<T>>* m_matrixData;
int m_size;
bool indexValid(int index)
{
if (index + 1 > m_size)
{
return false;
}
return true;
}
// functions
void createUnityMatrix()
{
for (int i = 0; i < m_size; i++)
{
std::vector<T> _vector;
for (int j = 0; j < m_size; j++)
{
if (i == j)
{
_vector.push_back(1);
}
else
{
_vector.push_back(0);
}
}
m_matrixData->push_back(_vector);
}
}
};
The thing is, this code is still quite ugly due to the std::function usage. Is there a better and/or simpler option ?
(+ also I'm sure I messed sth up with the std::bind, I think I need to use std::placeholders since I want to set the arguments later on.)
// edit, correct use of placeholder in main:
std::function<bool(int, std::vector<double>)> fillRow = std::bind(&CMatrix<double>::fillRow, &myMatrix, std::placeholders::_1, std::placeholders::_2);
auto duration = getDuration(fillRow, 18, fillVec, ret );
You can utilize RAII to implement a timer that records the execution time of a code block and a template function that wraps the function you would like to execute with the timer.
#include<string>
#include<chrono>
#include <unistd.h>
struct Timer
{
std::string fn, title;
std::chrono::time_point<std::chrono::steady_clock> start;
Timer(std::string fn, std::string title)
: fn(std::move(fn)), title(std::move(title)), start(std::chrono::steady_clock::now())
{
}
~Timer()
{
const auto elapsed =
std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
printf("%s: function=%s; elasepd=%f ms\n", title.c_str(), fn.c_str(), elapsed / 1000.0);
}
};
#ifndef ENABLE_BENCHMARK
static constexpr inline void dummy_fn() { }
#define START_BENCHMARK_TIMER(...) dummy_fn()
#else
#define START_BENCHMARK_TIMER(title) bench::Timer timer(__FUNCTION__, title)
#endif
template<typename F, typename ...Args>
auto time_fn(F&& fn, Args&&... args) {
START_BENCHMARK_TIMER("wrapped fn");
return fn(std::forward<Args>(args)...);
}
int foo(int i) {
usleep(70000);
return i;
}
int main()
{
printf("%d\n", time_fn(foo, 3));
}
stdout:
wrapped fn: function=time_fn; elasepd=71.785000 ms
3
General Idea:
time_fn is a simple template function that calls START_BENCHMARK_TIMER and calls fn with the provided arguments
START_BENCHMARK_TIMER then creates a Timer object. It will record the current time in start. Do note that __FUNCTION__ will be replaced with the function that was called.
When the
provided fn returns or throws an exception, the Timer object from (1) will be destroyed and the destructor will be called. The destructor will then calculate the time difference between the current time and the recorded start time and prints it to stdout
Note:
Even though declaring start and end in time_fn instead of the RAII timer will work, having an RAII timer will allow you to cleanly handle the situation when fn throws an exception
If you are on c++11, you will need to change time_fn declaration to typename std::result_of<F &&(Args &&...)>::type time_fn(F&& fn, Args&&... args).
Edit: Updated the response to include a wrapper function approach.

C++20 coroutines. When yield is called empty value is retrieved

I watched the Björn Fahller - Asynchronous I/O and coroutines for smooth data streaming - Meeting C++ online talk. Following up this presentation, I gave a try to execute a similar example myself. There is a bug in my code and when yield is called , the value that is printed is zero. Debugging the code , I detected that the yield_value comparing with await_resume, is called from different promise object.
I am confused, and I do not know how to call the yield_value using the correct promise object.
#include <iostream>
#include <coroutine>
#include <optional>
#include <string>
#include <memory>
using namespace std;
template<typename T>
struct promise;
template<typename T>
struct task
{
using promise_type = promise<T>;
auto operator co_await() const noexcept
{
struct awaitable
{
awaitable(promise<T> & promise)
:m_promise(promise)
{
}
bool await_ready() const noexcept
{
return m_promise.isready();
}
void await_suspend(coroutine_handle<promise_type> next)
{
m_promise.m_continuation = next;
}
T await_resume() const
{
std::cout << "await_resume m_promise::" << &m_promise << std::endl;
return m_promise.get();
}
promise<T> & m_promise;
};
return awaitable(_coroutine.promise());
}
task(promise_type& promise) : _coroutine(coroutine_handle<promise_type>::from_promise(promise))
{
promise.m_continuation = _coroutine;
}
task() = default;
task(task const&) = delete;
task& operator=(task const&) = delete;
task(task && other) : _coroutine(other._coroutine)
{
other._coroutine = nullptr;
}
task& operator=(task&& other)
{
if (&other != this) {
_coroutine = other._coroutine;
other._coroutine = nullptr;
}
return *this;
}
static task<T> make()
{
std::cout << "Enter make" << std::endl;
co_await suspend_always{};
std::cout << "Enter exit" << std::endl;
}
auto get_promise()
{
std::cout << "get_promise " << &_coroutine.promise() << std::endl;
return _coroutine.promise();
}
~task()
{
if (_coroutine) {
_coroutine.destroy();
}
}
private:
friend class promise<T>;
coroutine_handle<promise_type> _coroutine;
};
template<typename T>
struct promise
{
task<T> get_return_object() noexcept
{
return {*this};
}
suspend_never initial_suspend() noexcept{return {};}
suspend_always final_suspend() noexcept{return {};}
bool isready() const noexcept
{
return m_value.has_value();
}
T get()
{
return m_value.has_value()? m_value.value(): 0;
}
void unhandled_exception()
{
auto ex = std::current_exception();
std::rethrow_exception(ex);
//// MSVC bug? should be possible to rethrow with "throw;"
//// rethrow exception immediately
// throw;
}
template<typename U>
suspend_always yield_value(U && u)
{
std::cout << "yield_value::" << &m_continuation.promise() << std::endl;
m_value.emplace(std::forward<U>(u));
m_continuation.resume();
//m_continuation.
return {};
}
void return_void(){}
coroutine_handle<promise<T>> m_continuation;
optional<T> m_value;
};
template<typename T>
task<T> print_all(task<T> & values)
{
std::cout << "print all" << std::endl;
for(;;)
{
auto v = co_await values;
std::cout << v << "\n" << std::flush;
}
}
int main(int argc, const char * argv[]) {
auto incoming = task<int>::make();
auto h = print_all(incoming);
auto promise = incoming.get_promise();
promise.yield_value(4);
}
Any help?
demo
This is returning a copy of the promise:
auto get_promise()
{
std::cout << "get_promise " << &_coroutine.promise() << std::endl;
return _coroutine.promise();
}
So instead of calling into the promise for the task, you're calling into just some other, unrelated promise object.
Once you fix that, you'll find that your code has an infinite loop. Your promise is "ready" when it has a value. But once it has a value, it always has a value - it's always ready. One way to fix this is to ensure that await_resume consumes the value. For instance, by changing get() to:
T get()
{
assert(m_value.has_value());
T v = *std::move(m_value);
m_value.reset();
return v;
}
That ensures that the next co_await actually suspends.

Future with Coroutines co_await

Watching a c++ lecture (https://youtu.be/DLLt4anKXKU?t=1589), I tried to understand how future work with co_await; example:
auto compute = []() -> std::future<int> {
int fst = co_await std::async(get_first);
int snd = co_await std::async(get_second);
co_return fst + snd;
};
auto f = compute();
/* some heavy task */
f.get();
I can't understand how and when co_await std::async(get_first) returns control to compute. i.e how std::future implements an awaitable interface (type).
how std::future implements an awaitable interface
Well as far as C++20 is concerned, it doesn't. C++20 provides co_await and its attendant language functionality, but it doesn't provide any actual awaitable types.
How std::future could implement the awaitable interface is basically the same as how std::experimental::future from the Concurrency TS implements future::then. then takes a function to be continued when the future's value becomes available. The return value of then is a new future<U> (the old future<T> now becomes non-functional), where U is the new value that the given continuation function returns. That new future will only have a U available when the original value is available and when the continuation has processed it into the new value. In that order.
The exact details about how .then works depend entirely on how future is implemented. And it may depend on how the specific future was created, as futures from std::async have special properties that other futures don't.
co_await just makes this process much more digestible visually. A co_awaitable future would simply shove the coroutine handle into future::then, thereby altering the future.
Here there is a full program that can await futures with C++20 coroutines. I did it myself these days to learn.
#include <cassert>
#include <coroutine>
#include <future>
#include <iostream>
#include <optional>
#include <thread>
using namespace std::literals;
template <class T>
class FutureAwaitable {
public:
template <class U> struct BasicPromiseType {
auto get_return_object() {
return FutureAwaitable<T>(CoroHandle::from_promise(*this));
}
std::suspend_always initial_suspend() noexcept {
std::cout << "Initial suspend\n";
return {};
}
std::suspend_never final_suspend() noexcept {
std::cout << "Final suspend\n";
return {};
}
template <class V>
requires std::is_convertible_v<V, T>
void return_value(V v) { _value = v; }
void unhandled_exception() { throw; }
std::optional<T> _value;
};
using promise_type = BasicPromiseType<FutureAwaitable<T>>;
using CoroHandle = std::coroutine_handle<promise_type>;
explicit FutureAwaitable(CoroHandle h) : _parent(h) { }
~FutureAwaitable() {
}
bool is_ready() const {
auto & fut = std::get<FutureAwaitable<T> *>(&_parent);
return fut->wait_for(std::chrono::seconds(0)) != std::future_status::ready;
}
FutureAwaitable(std::future<T> && f) {
_f = &f;
}
T get() const { return promise()._value.value(); }
std::future<T> & std_future() const {
assert(_f->valid());
return *_f;
}
bool await_ready() {
if (!(_f->wait_for(std::chrono::seconds(0)) == std::future_status::ready)) {
std::cout << "Await ready IS ready\n";
return true;
}
else
std::cout << "Await ready NOT ready\n";
return false;
}
auto await_resume() {
std::cout << "Await resume" << std::endl;
return std_future().get();
}
bool await_suspend(CoroHandle parent) {
_parent = parent;
std::cout << "Await suspend\n";
return true;
}
void resume() {
assert(_parent);
_parent.resume();
}
auto parent() const { return _parent; }
bool done() const noexcept {
return _parent.done();
}
private:
auto & promise() const noexcept { return _parent.promise(); }
CoroHandle _parent = nullptr;
std::future<T> * _f = nullptr;
};
template <class T> auto operator co_await(std::future<T> &&f) {
return FutureAwaitable<T>(std::forward<std::future<T>>(f));
}
template <class T> auto operator co_await(std::future<T> & f) {
return FutureAwaitable<T>(std::forward<std::future<T>>(f));
}
FutureAwaitable<int> coroutine() {
std::promise<int> p;
auto fut = p.get_future();
p.set_value(31);
std::cout << "Entered func()" << std::endl;
auto res = co_await std::move(fut);
std::cout << "Continue func(): " << res << std::endl;
auto computation = co_await std::async(std::launch::async, [] {
int j = 0;
for (int i = 0; i < 1000; ++i) {
j += i;
}
return j;
});
auto computation2 = std::async(std::launch::async, [] {
int j = 0;
std::this_thread::sleep_for(20s);
for (int i = 0; i < 1000; ++i) {
j += i;
}
return j;
});
auto computation3 = std::async(std::launch::async, [] {
int j = 0;
std::this_thread::sleep_for(20s);
for (int i = 0; i < 1000; ++i) {
j += i;
}
return j;
});
co_await computation2;
co_await computation3;
std::cout << "Computation result is " << computation << std::endl;
co_return computation;
}
#define ASYNC_MAIN(coro) \
int main() { \
FutureAwaitable<int> c = coro(); \
do { c.resume(); } while (!c.done()); \
std::cout << "The coroutine returned " << c.get(); \
return 0; \
}
ASYNC_MAIN(coroutine)

Why cannot my c++ thread pool accelerate my program?

I tried to implement a c++ thread pool according to some notes made by others, the code is like this:
#include <vector>
#include <queue>
#include <functional>
#include <future>
#include <atomic>
#include <condition_variable>
#include <thread>
#include <mutex>
#include <memory>
#include <glog/logging.h>
#include <iostream>
#include <chrono>
using std::cout;
using std::endl;
class ThreadPool {
public:
ThreadPool(const ThreadPool&) = delete;
ThreadPool(ThreadPool&&) = delete;
ThreadPool& operator=(const ThreadPool&) = delete;
ThreadPool& operator=(ThreadPool&&) = delete;
ThreadPool(uint32_t capacity=std::thread::hardware_concurrency(),
uint32_t n_threads=std::thread::hardware_concurrency()
): capacity(capacity), n_threads(n_threads) {
init(capacity, n_threads);
}
~ThreadPool() noexcept {
shutdown();
}
void init(uint32_t capacity, uint32_t n_threads) {
CHECK_GT(capacity, 0) << "task queue capacity should be greater than 0";
CHECK_GT(n_threads, 0) << "thread pool capacity should be greater than 0";
for (int i{0}; i < n_threads; ++i) {
pool.emplace_back(std::thread([this] {
std::function<void(void)> task;
while (!this->stop) {
{
std::unique_lock<std::mutex> lock(this->q_mutex);
task_q_empty.wait(lock, [&] {return this->stop | !task_q.empty();});
if (this->stop) break;
task = this->task_q.front();
this->task_q.pop();
task_q_full.notify_one();
}
// auto id = std::this_thread::get_id();
// std::cout << "thread id is: " << id << std::endl;
task();
}
}));
}
}
void shutdown() {
stop = true;
task_q_empty.notify_all();
task_q_full.notify_all();
for (auto& thread : pool) {
if (thread.joinable()) {
thread.join();
}
}
}
template<typename F, typename...Args>
auto submit(F&& f, Args&&... args) -> std::future<decltype(f(args...))> {
using res_type = decltype(f(args...));
std::function<res_type(void)> func = std::bind(std::forward<F>(f), std::forward<Args>(args)...);
auto task_ptr = std::make_shared<std::packaged_task<res_type()>>(func);
{
std::unique_lock<std::mutex> lock(q_mutex);
task_q_full.wait(lock, [&] {return this->stop | task_q.size() <= capacity;});
CHECK (this->stop == false) << "should not add task to stopped queue\n";
task_q.emplace([task_ptr]{(*task_ptr)();});
}
task_q_empty.notify_one();
return task_ptr->get_future();
}
private:
std::vector<std::thread> pool;
std::queue<std::function<void(void)>> task_q;
std::condition_variable task_q_full;
std::condition_variable task_q_empty;
std::atomic<bool> stop{false};
std::mutex q_mutex;
uint32_t capacity;
uint32_t n_threads;
};
int add(int a, int b) {return a + b;}
int main() {
auto t1 = std::chrono::steady_clock::now();
int n_threads = 1;
ThreadPool tp;
tp.init(n_threads, 1024);
std::vector<std::future<int>> res;
for (int i{0}; i < 1000000; ++i) {
res.push_back(tp.submit(add, i, i+1));
}
auto t2 = std::chrono::steady_clock::now();
for (auto &el : res) {
el.get();
// cout << el.get() << endl;
}
tp.shutdown();
cout << "processing: "
<< std::chrono::duration<double, std::milli>(t2 - t1).count()
<< endl;
return 0;
}
The problem is that, when I set n_threads=1, the program takes the same length of time as I set n_threads=4. Since my gpu has 72 kernels (from the htop command), I believe the 4 thread would be faster than the 1 thread settings. What is the problem with this implementation of the thread pool please?
I found few issues:
1) Use ORing instead of the bitwise operation in the both conditional-variable waits:
Replace this - `task_q_empty.wait(lock, [&] {return this->stop | !task_q.empty();});`
By - `task_q_empty.wait(lock, [&] {return this->stop || !task_q.empty();});`
2) Use notify_all() in place of notify_one() in init() and submit().
3) Two condition_variables is unnecessary here, use only task_q_empty.
4) Your use case is not ideal. Switching of the threads may outweigh adding of two integers, it may appear more the threads longer the execution time. Test in optimized mode. Try scenario like this to simulate longer process:
int add(int a, int b) { this_thread::sleep_for(chrono::milliseconds(200)); return a + b; }

Start remaining futures without blocking

I have a loop over a set where I have to perform an expensive calculation. I want to do this in parallel using the future class. As far as I understand this, async either starts the thread or defers it and starts it only when I call get() or wait(). So, when I have threads not started and try to get the result, I block the main thread an get a sequential processing. Is there a way to start the remaining deferred processes, so everything is calculated in parallel and will not block when I call get().
// do the calculations
std::vector<std::future<class>> futureList;
for (auto elem : container)
{
futureList.push_back(std::async(fct, elem));
}
// start remaining processes
// use the results
for (auto elem : futureList)
{
processResult(elem.get())
}
Thanks for your help.
You might use:
std::async(std::launch::async, fct, elem)
Sample:
#include <iostream>
#include <future>
#include <chrono>
#include <vector>
#include <stdexcept>
bool work() {
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
if( ! (std::rand() % 2)) throw std::runtime_error("Exception");
return true;
}
int main() {
const unsigned Elements = 10;
typedef std::vector<std::future<bool>> future_container;
future_container futures;
for(unsigned i = 0; i < Elements; ++i)
{
futures.push_back(std::async(std::launch::async, work));
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
while( ! futures.empty()) {
future_container::iterator f = futures.begin();
while(f != futures.end())
{
if(f->wait_for(std::chrono::milliseconds(100)) == std::future_status::timeout) ++f;
else {
// Note:: Exception resulting due to the invokation of
// the thread are thrown here.
// (See 30.6.6 Class template future)
try {
std::cout << f->get() << '\n';
}
catch(const std::exception& e) {
std::cout << e.what() << '\n';
}
f = futures.erase(f);
}
}
}
return 0;
}
You may do something like : (http://coliru.stacked-crooked.com/a/005c7d2345ad791c)
Create this function:
void processResult_async(std::future<myClass>& f) { processResult(f.get()); }
And then
// use the results
std::vector<std::future<void>> results;
for (auto& elem : futureList)
{
results.push_back(std::async(std::launch::async, processResult_async, std::ref(elem)));
}