Determining function execution time using a wrapper - C++

I'm looking for a generic way of measuring a function's timing, like Here, but for C++.
My main goal is to not have cluttered code like this piece everywhere:
auto t1 = std::chrono::high_resolution_clock::now();
function(arg1, arg2);
auto t2 = std::chrono::high_resolution_clock::now();
auto tDur = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
But rather have a nice wrapper around the function.
What I got so far is:
timing.hpp:
#pragma once
#include <chrono>
#include <functional>
template <typename Tret, typename Tin1, typename Tin2> unsigned int getDuration(std::function<Tret(Tin1, Tin2)> function, Tin1 arg1, Tin2 arg2, Tret& retValue)
{
auto t1 = std::chrono::high_resolution_clock::now();
retValue = function(arg1, arg2);
auto t2 = std::chrono::high_resolution_clock::now();
auto tDur = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
return tDur.count();
}
main.cpp:
#include "timing.hpp"
#include "matrix.hpp"
constexpr int G_MATRIXSIZE = 2000;
int main(int argc, char** argv)
{
CMatrix<double> myMatrix(G_MATRIXSIZE);
bool ret;
// this call is quite ugly
std::function<bool(int, std::vector<double>)> fillRow = std::bind(&CMatrix<double>::fillRow, &myMatrix, 0, fillVec);
auto duration = getDuration(fillRow, 5, fillVec, ret );
std::cout << "duration(ms): " << duration << std::endl;
}
In case somebody wants to test the code, matrix.hpp:
#pragma once
#include <iostream>
#include <string>
#include <sstream>
#include <vector>
template<typename T> class CMatrix {
public:
// ctor
CMatrix(int size) :
m_size(size)
{
m_matrixData = new std::vector<std::vector<T>>;
createUnityMatrix();
}
// dtor
~CMatrix()
{
std::cout << "Destructor of CMatrix called" << std::endl;
delete m_matrixData;
}
// print to std::out
void printMatrix()
{
std::ostringstream oss;
for (int i = 0; i < m_size; i++)
{
for (int j = 0; j < m_size; j++)
{
oss << m_matrixData->at(i).at(j) << ";";
}
oss << "\n";
}
std::cout << oss.str() << std::endl;
}
bool fillRow(int index, std::vector<T> row)
{
// checks
if (!indexValid(index))
{
return false;
}
if (row.size() != m_size)
{
return false;
}
// data replacement
for (int j = 0; j < m_size; j++)
{
m_matrixData->at(index).at(j) = row.at(j);
}
return true;
}
bool fillColumn(int index, std::vector<T> column)
{
// checks
if (!indexValid(index))
{
return false;
}
if (column.size() != m_size)
{
return false;
}
// data replacement
for (int j = 0; j < m_size; j++)
{
m_matrixData->at(index).at(j) = column.at(j);
}
return true;
}
private:
// variables
std::vector<std::vector<T>>* m_matrixData;
int m_size;
bool indexValid(int index)
{
if (index + 1 > m_size)
{
return false;
}
return true;
}
// functions
void createUnityMatrix()
{
for (int i = 0; i < m_size; i++)
{
std::vector<T> _vector;
for (int j = 0; j < m_size; j++)
{
if (i == j)
{
_vector.push_back(1);
}
else
{
_vector.push_back(0);
}
}
m_matrixData->push_back(_vector);
}
}
};
The thing is, this code is still quite ugly due to the std::function usage. Is there a better and/or simpler option?
(Also, I'm sure I messed something up with the std::bind; I think I need to use std::placeholders since I want to set the arguments later on.)
// edit, correct use of placeholder in main:
std::function<bool(int, std::vector<double>)> fillRow = std::bind(&CMatrix<double>::fillRow, &myMatrix, std::placeholders::_1, std::placeholders::_2);
auto duration = getDuration(fillRow, 18, fillVec, ret );

You can use RAII to implement a timer that records the execution time of a code block, plus a template function that wraps the function you want to time.
#include <chrono>
#include <cstdio>
#include <string>
#include <unistd.h>
struct Timer
{
std::string fn, title;
std::chrono::time_point<std::chrono::steady_clock> start;
Timer(std::string fn, std::string title)
: fn(std::move(fn)), title(std::move(title)), start(std::chrono::steady_clock::now())
{
}
~Timer()
{
const auto elapsed =
std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
printf("%s: function=%s; elapsed=%f ms\n", title.c_str(), fn.c_str(), elapsed / 1000.0);
}
};
#ifndef ENABLE_BENCHMARK
static constexpr inline void dummy_fn() { }
#define START_BENCHMARK_TIMER(...) dummy_fn()
#else
#define START_BENCHMARK_TIMER(title) Timer timer(__FUNCTION__, title)
#endif
template<typename F, typename ...Args>
auto time_fn(F&& fn, Args&&... args) {
START_BENCHMARK_TIMER("wrapped fn");
return fn(std::forward<Args>(args)...);
}
int foo(int i) {
usleep(70000);
return i;
}
int main()
{
printf("%d\n", time_fn(foo, 3));
}
stdout:
wrapped fn: function=time_fn; elapsed=71.785000 ms
3
General Idea:
time_fn is a simple template function that invokes START_BENCHMARK_TIMER and then calls fn with the provided arguments.
START_BENCHMARK_TIMER creates a Timer object, which records the current time in start. Note that __FUNCTION__ expands to the name of the enclosing function (here time_fn).
When the provided fn returns or throws an exception, the Timer object created above is destroyed and its destructor runs. The destructor computes the difference between the current time and the recorded start time and prints it to stdout.
Note:
Even though declaring start and end directly in time_fn instead of using the RAII timer would work, the RAII timer lets you cleanly handle the case where fn throws an exception.
If you are on C++11, you will need to change the time_fn declaration to typename std::result_of<F &&(Args &&...)>::type time_fn(F&& fn, Args&&... args).
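For reference, a minimal sketch of that C++11-compatible signature (assuming the same START_BENCHMARK_TIMER macro as above; std::result_of comes from <type_traits>):
#include <type_traits>
#include <utility>

template <typename F, typename... Args>
typename std::result_of<F&&(Args&&...)>::type time_fn(F&& fn, Args&&... args) {
    START_BENCHMARK_TIMER("wrapped fn");
    return std::forward<F>(fn)(std::forward<Args>(args)...);
}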
Edit: Updated the response to include a wrapper function approach.


How can I find a beautiful function wrapper

typedef void (*void_proc)(void* parameter);
void* parallel_init(void* dummy, int core_number);
int parallel_addtask(void* parallel_monitor, void_proc process, void *parameter);
int parallel_waittask(void* parallel_monitor, int task_id);
int parallel_uninit(void* parallel_monitor);
struct parallel_parameter {
int end;
int begin;
};
void process(void* parameter) {
auto p = reinterpret_cast<parallel_parameter*>(parameter);
// your_function_name(p->begin, p->end);
}
Above is a parallel library (C style) which I would like to use. Every time you call it, you have to define a specific struct parameter, which is so annoying that I want to implement a template function to simplify the calls. I have tried several approaches to achieve this, but failed.
template<typename _function, typename... _parameter>
int parallel_executor(_function&& function, _parameter&&... parameter) {
auto res = 0;
parallel_parameter p[8]{0};
auto body = [](void* para) -> void {
auto p = reinterpret_cast<parallel_parameter*>(para);
function(p->begin, p->end, std::forward<_parameter>(parameter)...)
};
auto parallel_handle = parallel_init(nullptr, 8);
do {
for (int i = 0;i < 8; ++i) {
res = parallel_addtask(parallel_handle, body, static_cast<void*>(&p[i]));
if (res != 0) break;
}
for (int i = 0; i < 8; ++i) {
res = parallel_waittask(parallel_handle, i);
if (res != 0) break;
}
} while (false);
parallel_uninit(parallel_handle);
return res;
}
This call is just a simple one to show my dilemma. When I use parallel_executor, it turns out the enclosing variables cannot be accessed, because I did not specify a capture; but when I change the body to the style below, parallel_addtask will not accept the body function.
auto body = [&](void* para) -> void {
auto p = reinterpret_cast<parallel_parameter*>(para);
function(p->begin, p->end, std::forward<_parameter>(parameter)...)
};
I have been stuck in this awkward position for a while now. Below is the call style which I would prefer:
auto ret = parallel_executor(
[](int begin, int end, int parameter_1, int parameter_2) {
std::cout << begin << " ==> " << end << " ==> " << parameter_1 << std::endl;
},
100, // parameter_1
200 // parameter_2
);
Regarding the issue, I hope I have made myself clear. Any suggestion is appreciated.
A wrapper might look like:
class ParrallelWrapper
{
public:
ParrallelWrapper(int core_number) :
parallel_monitor(parallel_init(nullptr, core_number))
{}
ParrallelWrapper(const ParrallelWrapper&) = delete;
ParrallelWrapper& operator= (const ParrallelWrapper&) = delete;
~ParrallelWrapper() { parallel_uninit(parallel_monitor); }
int AddTask(std::function<void()> f) {
auto run_function = *[](void* f){
(*reinterpret_cast<std::function<void()>*>(f))();
};
functions.push_back(std::make_unique<std::function<void()>>(f));
return parallel_addtask(parallel_monitor, run_function, functions.back().get());
}
int Wait(int task_id) { return parallel_waittask(parallel_monitor, task_id); }
private:
void* parallel_monitor = nullptr;
// Ensure lifetime, and pointer consistence.
std::vector<std::unique_ptr<std::function<void()>>> functions;
};
Demo
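For illustration, a hypothetical usage sketch of the wrapper above (my_work, the eight 100-element slices, and the parameter values 100/200 are made-up placeholders; AddTask returns the library's error code and Wait takes the task index, as in the original question):
void my_work(int begin, int end, int parameter_1, int parameter_2); // hypothetical user function

int run_in_parallel() {
    ParrallelWrapper pw(8); // parallel_init(nullptr, 8); parallel_uninit on scope exit
    for (int i = 0; i < 8; ++i) {
        const int begin = i * 100, end = (i + 1) * 100; // made-up partitioning
        // Arguments are captured by the lambda, so no per-call parameter struct is needed.
        if (const int res = pw.AddTask([=] { my_work(begin, end, 100, 200); }))
            return res;
    }
    for (int i = 0; i < 8; ++i)
        if (const int res = pw.Wait(i))
            return res;
    return 0;
}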
With appropriate blanks for specifying begin, end, and the number of tasks, you can use something like
struct parallel_deleter {
void operator()(void *m) const {parallel_uninit(m);}
};
template<class F,class ...TT>
int parallel_executor(F f,TT &&...tt) {
constexpr auto p=+f; // require captureless
constexpr int n=/*...*/;
std::unique_ptr<void,parallel_deleter> m(parallel_init(nullptr,n));
struct arg {
int begin,end;
std::tuple<TT...> user;
};
std::vector<arg> v(n,{0,0,{tt...}});
for(auto &x : v) {
x.begin=/*...*/;
x.end=/*...*/;
if(const int res=parallel_addtask(m.get(),[](void *v) {
const auto &a=*static_cast<arg*>(v);
std::apply([&a](auto &...aa) {p(a.begin,a.end,aa...);},a.user);
},&x)) return res;
}
for(int i=0;i<n;++i)
if(const int res=parallel_waittask(m.get(),i)) return res;
return parallel_uninit(m.release());
}
This design relies on a captureless lambda being passed (so that p can be used inside the task lambda without capturing anything); if you need to support any callable, Jarod42's solution based on std::function is superior.

Concurrent program compiled with clang runs fine, but hangs with gcc

I wrote a class to share a limited number of resources (for instance network interfaces) between a larger number of threads. The resources are pooled and, if not in use, they are borrowed out to the requesting thread, which otherwise waits on a condition_variable.
Nothing really exotic: apart from the fancy scoped_lock, which requires C++17, it should be good old C++11.
Both gcc 10.2 and clang 11 compile the test main fine, but while the latter produces an executable that does pretty much what is expected, the former hangs without consuming CPU (a deadlock?).
With the help of https://godbolt.org/ I tried older versions of gcc and also icc (passing options -O3 -std=c++17 -pthread), all reproducing the bad result, while even there clang confirms the proper behavior.
I wonder if I made a mistake or if the code triggers some compiler misbehavior and in case how to work around that.
#include <iostream>
#include <vector>
#include <stdexcept>
#include <mutex>
#include <condition_variable>
template <typename T>
class Pool {
///////////////////////////
class Borrowed {
friend class Pool<T>;
Pool<T>& pool;
const size_t id;
T * val;
public:
Borrowed(Pool & p, size_t i, T& v): pool(p), id(i), val(&v) {}
~Borrowed() { release(); }
T& get() const {
if (!val) throw std::runtime_error("Borrowed::get() this resource was collected back by the pool");
return *val;
}
void release() { pool.collect(*this); }
};
///////////////////////////
struct Resource {
T val;
bool available = true;
Resource(T v): val(std::move(v)) {}
};
///////////////////////////
std::vector<Resource> vres;
size_t hint = 0;
std::condition_variable cv;
std::mutex mtx;
size_t available_cnt;
public:
Pool(std::initializer_list<T> l): available_cnt(l.size()) {
vres.reserve(l.size());
for (T t: l) {
vres.emplace_back(std::move(t));
}
std::cout << "Pool has size " << vres.size() << std::endl;
}
~Pool() {
for ( auto & res: vres ) {
if ( ! res.available ) {
std::cerr << "WARNING Pool::~Pool resources are still in use\n";
}
}
}
Borrowed borrow() {
std::unique_lock<std::mutex> lk(mtx);
cv.wait(lk, [&](){return available_cnt > 0;});
if ( vres[hint].available ) {
// quick path, if hint points to an available resource
std::cout << "hint good" << std::endl;
vres[hint].available = false;
--available_cnt;
Borrowed b(*this, hint, vres[hint].val);
if ( hint + 1 < vres.size() ) ++hint;
return b; // <--- gcc seems to hang here
} else {
// full scan to find the available resource
std::cout << "hint bad" << std::endl;
for ( hint = 0; hint < vres.size(); ++hint ) {
if ( vres[hint].available ) {
vres[hint].available = false;
--available_cnt;
return Borrowed(*this, hint, vres[hint].val);
}
}
}
throw std::runtime_error("Pool::borrow() no resource is available - internal logic error");
}
void collect(Borrowed & b) {
if ( &(b.pool) != this )
throw std::runtime_error("Pool::collect() trying to collect resource owned by another pool!");
if ( b.val ) {
b.val = nullptr;
{
std::scoped_lock<std::mutex> lk(mtx);
hint = b.id;
vres[hint].available = true;
++available_cnt;
}
cv.notify_one();
}
}
};
///////////////////////////////////////////////////////////////////
#include <thread>
#include <chrono>
int main() {
Pool<std::string> pool{"hello","world"};
std::vector<std::thread> vt;
for (int i = 10; i > 0; --i) {
vt.emplace_back( [&pool, i]()
{
auto res = pool.borrow();
std::this_thread::sleep_for(std::chrono::milliseconds(i*300));
std::cout << res.get() << std::endl;
}
);
}
for (auto & t: vt) t.join();
return 0;
}
You're running into undefined behavior since you effectively relock an already acquired lock. With MSVC I obtained a helpful call stack to diagnose this. Here is a fixed example (it works for me now; see the changes within the borrow() method; it might be further redesigned, since locking inside a destructor is questionable):
#include <iostream>
#include <vector>
#include <stdexcept>
#include <mutex>
#include <condition_variable>
template <typename T>
class Pool {
///////////////////////////
class Borrowed {
friend class Pool<T>;
Pool<T>& pool;
const size_t id;
T * val;
public:
Borrowed(Pool & p, size_t i, T& v) : pool(p), id(i), val(&v) {}
~Borrowed() { release(); }
T& get() const {
if (!val) throw std::runtime_error("Borrowed::get() this resource was collected back by the pool");
return *val;
}
void release() { pool.collect(*this); }
};
///////////////////////////
struct Resource {
T val;
bool available = true;
Resource(T v) : val(std::move(v)) {}
};
///////////////////////////
std::vector<Resource> vres;
size_t hint = 0;
std::condition_variable cv;
std::mutex mtx;
size_t available_cnt;
public:
Pool(std::initializer_list<T> l) : available_cnt(l.size()) {
vres.reserve(l.size());
for (T t : l) {
vres.emplace_back(std::move(t));
}
std::cout << "Pool has size " << vres.size() << std::endl;
}
~Pool() {
for (auto & res : vres) {
if (!res.available) {
std::cerr << "WARNING Pool::~Pool resources are still in use\n";
}
}
}
Borrowed borrow() {
std::unique_lock<std::mutex> lk(mtx);
while (available_cnt == 0) cv.wait(lk);
if (vres[hint].available) {
// quick path, if hint points to an available resource
std::cout << "hint good" << std::endl;
vres[hint].available = false;
--available_cnt;
Borrowed b(*this, hint, vres[hint].val);
if (hint + 1 < vres.size()) ++hint;
lk.unlock();
return b; // <--- gcc seems to hang here
}
else {
// full scan to find the available resource
std::cout << "hint bad" << std::endl;
for (hint = 0; hint < vres.size(); ++hint) {
if (vres[hint].available) {
vres[hint].available = false;
--available_cnt;
lk.unlock();
return Borrowed(*this, hint, vres[hint].val);
}
}
}
throw std::runtime_error("Pool::borrow() no resource is available - internal logic error");
}
void collect(Borrowed & b) {
if (&(b.pool) != this)
throw std::runtime_error("Pool::collect() trying to collect resource owned by another pool!");
if (b.val) {
b.val = nullptr;
{
std::scoped_lock<std::mutex> lk(mtx);
hint = b.id;
vres[hint].available = true;
++available_cnt;
cv.notify_one();
}
}
}
};
///////////////////////////////////////////////////////////////////
#include <thread>
#include <chrono>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int main()
{
try
{
Pool<std::string> pool{ "hello","world" };
std::vector<std::thread> vt;
for (int i = 10; i > 0; --i) {
vt.emplace_back([&pool, i]()
{
auto res = pool.borrow();
std::this_thread::sleep_for(std::chrono::milliseconds(i * 300));
std::cout << res.get() << std::endl;
}
);
}
for (auto & t : vt) t.join();
return 0;
}
catch(const std::exception& e)
{
std::cout << "exception occurred: " << e.what();
}
return 0;
}
A locking destructor coupled with missed NRVO caused the issue (credits to Secundi for pointing this out in the comments).
If the compiler skips NRVO, the code below copies b into the return value and then calls the destructor of b. The destructor tries to acquire the mutex before it is released by the unique_lock, resulting in a deadlock.
Borrowed b(*this, hint, vres[hint].val);
if ( hint + 1 < vres.size() ) ++hint;
return b; // <--- gcc seems to hang here
It is of crucial importance here to avoid destroying b. In fact, even though manually releasing the unique_lock before returning avoids the deadlock, the destructor of b would mark the pooled resource as available while it is just being borrowed out, making the code wrong.
A possible fix consists in replacing the lines above with:
const auto tmp = hint;
if ( hint + 1 < vres.size() ) ++hint;
return Borrowed(*this, tmp, vres[tmp].val);
Another possibility (which does not exclude the former) is to delete the (evil) copy ctor of Borrowed and only provide a move ctor:
Borrowed(const Borrowed &) = delete;
Borrowed(Borrowed && b): pool(b.pool), id(b.id), val(b.val) { b.val = nullptr; }

Future with Coroutines co_await

Watching a C++ lecture (https://youtu.be/DLLt4anKXKU?t=1589), I tried to understand how futures work with co_await; example:
auto compute = []() -> std::future<int> {
int fst = co_await std::async(get_first);
int snd = co_await std::async(get_second);
co_return fst + snd;
};
auto f = compute();
/* some heavy task */
f.get();
I can't understand how and when co_await std::async(get_first) returns control to compute, i.e. how std::future implements an awaitable interface (type).
how std::future implements an awaitable interface
Well as far as C++20 is concerned, it doesn't. C++20 provides co_await and its attendant language functionality, but it doesn't provide any actual awaitable types.
How std::future could implement the awaitable interface is basically the same as how std::experimental::future from the Concurrency TS implements future::then. then takes a continuation function to be invoked when the future's value becomes available. The return value of then is a new future<U> (the old future<T> becomes non-functional), where U is the type the continuation returns. That new future will only have a U available once the original value is available and the continuation has processed it into the new value. In that order.
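As a hedged illustration of that interface (using std::experimental::future from the Concurrency TS where available; plain std::future has no then member):
#include <experimental/future>
#include <string>

void demo(std::experimental::future<int> f) {
    // The continuation runs once the int is available; u is a future<std::string>,
    // and f itself becomes non-functional after the call.
    std::experimental::future<std::string> u =
        f.then([](std::experimental::future<int> ready) {
            return std::to_string(ready.get()); // T = int is processed into U = std::string
        });
    (void)u;
}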
The exact details about how .then works depend entirely on how future is implemented. And it may depend on how the specific future was created, as futures from std::async have special properties that other futures don't.
co_await just makes this process much more digestible visually. A co_awaitable future would simply shove the coroutine handle into future::then, thereby altering the future.
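A minimal sketch of that idea, again assuming a Concurrency TS style std::experimental::future with then() and is_ready() (future_awaiter is a made-up name, and lifetime and error handling are glossed over):
#include <coroutine>
#include <experimental/future>
#include <optional>
#include <utility>

template <class T>
struct future_awaiter {
    std::experimental::future<T> fut; // the future being awaited
    std::optional<T> value;           // filled by the continuation before resuming

    bool await_ready() { return fut.is_ready(); } // no need to suspend if the value is already there
    void await_suspend(std::coroutine_handle<> h) {
        // "Shove the coroutine handle into future::then": once the value arrives,
        // stash it and resume the suspended coroutine.
        fut.then([this, h](std::experimental::future<T> ready) {
            value = ready.get();
            h.resume();
        });
    }
    T await_resume() { return value ? std::move(*value) : fut.get(); }
};

template <class T>
auto operator co_await(std::experimental::future<T> f) {
    return future_awaiter<T>{std::move(f)};
}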
Here is a full program that can await futures with C++20 coroutines. I wrote it myself recently in order to learn.
#include <cassert>
#include <coroutine>
#include <future>
#include <iostream>
#include <optional>
#include <thread>
using namespace std::literals;
template <class T>
class FutureAwaitable {
public:
template <class U> struct BasicPromiseType {
auto get_return_object() {
return FutureAwaitable<T>(CoroHandle::from_promise(*this));
}
std::suspend_always initial_suspend() noexcept {
std::cout << "Initial suspend\n";
return {};
}
std::suspend_always final_suspend() noexcept {
std::cout << "Final suspend\n";
// Suspend at the final point so the frame stays alive and main can still call done() and get().
return {};
}
template <class V>
requires std::is_convertible_v<V, T>
void return_value(V v) { _value = v; }
void unhandled_exception() { throw; }
std::optional<T> _value;
};
using promise_type = BasicPromiseType<FutureAwaitable<T>>;
using CoroHandle = std::coroutine_handle<promise_type>;
explicit FutureAwaitable(CoroHandle h) : _parent(h) { }
~FutureAwaitable() {
}
bool is_ready() const {
return _f && _f->wait_for(std::chrono::seconds(0)) == std::future_status::ready;
}
FutureAwaitable(std::future<T> && f) {
_f = &f;
}
T get() const { return promise()._value.value(); }
std::future<T> & std_future() const {
assert(_f->valid());
return *_f;
}
bool await_ready() {
if (_f->wait_for(std::chrono::seconds(0)) == std::future_status::ready) {
std::cout << "Await ready IS ready\n";
return true;
}
std::cout << "Await ready NOT ready\n";
return false;
}
auto await_resume() {
std::cout << "Await resume" << std::endl;
return std_future().get();
}
bool await_suspend(CoroHandle parent) {
_parent = parent;
std::cout << "Await suspend\n";
return true;
}
void resume() {
assert(_parent);
_parent.resume();
}
auto parent() const { return _parent; }
bool done() const noexcept {
return _parent.done();
}
private:
auto & promise() const noexcept { return _parent.promise(); }
CoroHandle _parent = nullptr;
std::future<T> * _f = nullptr;
};
template <class T> auto operator co_await(std::future<T> &&f) {
return FutureAwaitable<T>(std::forward<std::future<T>>(f));
}
template <class T> auto operator co_await(std::future<T> & f) {
return FutureAwaitable<T>(std::forward<std::future<T>>(f));
}
FutureAwaitable<int> coroutine() {
std::promise<int> p;
auto fut = p.get_future();
p.set_value(31);
std::cout << "Entered func()" << std::endl;
auto res = co_await std::move(fut);
std::cout << "Continue func(): " << res << std::endl;
auto computation = co_await std::async(std::launch::async, [] {
int j = 0;
for (int i = 0; i < 1000; ++i) {
j += i;
}
return j;
});
auto computation2 = std::async(std::launch::async, [] {
int j = 0;
std::this_thread::sleep_for(20s);
for (int i = 0; i < 1000; ++i) {
j += i;
}
return j;
});
auto computation3 = std::async(std::launch::async, [] {
int j = 0;
std::this_thread::sleep_for(20s);
for (int i = 0; i < 1000; ++i) {
j += i;
}
return j;
});
co_await computation2;
co_await computation3;
std::cout << "Computation result is " << computation << std::endl;
co_return computation;
}
#define ASYNC_MAIN(coro) \
int main() { \
FutureAwaitable<int> c = coro(); \
do { c.resume(); } while (!c.done()); \
std::cout << "The coroutine returned " << c.get(); \
return 0; \
}
ASYNC_MAIN(coroutine)

Parallel calling a function in std::vector

I have an std::vector of std::function<void()> like this:
std::map<Event, std::vector<std::function<void()>>> observers_;
calling each function like this:
for (const auto& obs : observers_.at(event)) obs();
I want to turn this into a parallel for loop. Since I am using C++14 and don't have access to the C++17 parallel execution policies (std::execution::par), I found a little library that allows me to create a ThreadPool.
How do I turn for (const auto& obs : observers_.at(event)) obs(); into a version that calls each function in observers_ in parallel? I can't seem to get the syntax correct. I tried, but this doesn't work.
std::vector<std::function<void()>> vec = observers_.at(event);
ThreadPool::ParallelFor(0, vec.size(), [&](int i)
{
vec.at(i);
});
The example program that uses the library below:
#include <iostream>
#include <mutex>
#include "ThreadPool.hpp"
////////////////////////////////////////////////////////////////////////////////
int main()
{
std::mutex critical;
ThreadPool::ParallelFor(0, 16, [&] (int i)
{
std::lock_guard<std::mutex> lock(critical);
std::cout << i << std::endl;
});
return 0;
}
The ThreadPool library.
#ifndef THREADPOOL_HPP_INCLUDED
#define THREADPOOL_HPP_INCLUDED
////////////////////////////////////////////////////////////////////////////////
#include <thread>
#include <vector>
#include <cmath>
////////////////////////////////////////////////////////////////////////////////
class ThreadPool {
public:
template<typename Index, typename Callable>
static void ParallelFor(Index start, Index end, Callable func) {
// Estimate number of threads in the pool
const static unsigned nb_threads_hint = std::thread::hardware_concurrency();
const static unsigned nb_threads = (nb_threads_hint == 0u ? 8u : nb_threads_hint);
// Size of a slice for the range functions
Index n = end - start + 1;
Index slice = (Index) std::round(n / static_cast<double> (nb_threads));
slice = std::max(slice, Index(1));
// [Helper] Inner loop
auto launchRange = [&func] (int k1, int k2) {
for (Index k = k1; k < k2; k++) {
func(k);
}
};
// Create pool and launch jobs
std::vector<std::thread> pool;
pool.reserve(nb_threads);
Index i1 = start;
Index i2 = std::min(start + slice, end);
for (unsigned i = 0; i + 1 < nb_threads && i1 < end; ++i) {
pool.emplace_back(launchRange, i1, i2);
i1 = i2;
i2 = std::min(i2 + slice, end);
}
if (i1 < end) {
pool.emplace_back(launchRange, i1, end);
}
// Wait for jobs to finish
for (std::thread &t : pool) {
if (t.joinable()) {
t.join();
}
}
}
// Serial version for easy comparison
template<typename Index, typename Callable>
static void SequentialFor(Index start, Index end, Callable func) {
for (Index i = start; i < end; i++) {
func(i);
}
}
};
#endif // THREADPOOL_HPP_INCLUDED
It seems that you should simply change:
vec.at(i); // Only returns a reference to the element at index i
into:
vec.at(i)(); // The second () calls the function
--- OR ---
vec[i](); // Same
Hint: What does this do?
vec.at(i);
What do you want it to do?
Unrelatedly, you're using at() when you mean [].
This works:
ThreadPool::ParallelFor(0, (int)vec.size(), [&] (int i)
{
vec[i]();
});

Using std::thread and std::function from a std::bind with a function with arguments and a non-void return

Let's say we have a function odd which is a bool(int) function. I'd like to execute this function in parallel, but with different parameters (different numbers).
bool odd(int i) { return (((i&1)==1)?true:false); }
Here's the code I'm trying to use (which works but has a wart).
std::size_t num = 256;
std::vector<bool> results(num);
std::vector<std::function<bool(int)>> funcs(num);
std::vector<std::packaged_task<bool(int)>> tasks(num);
std::vector<std::future<bool>> futures(num);
std::vector<std::thread> threads(num);
for (std::size_t i = 0; i < num; i++) {
results[i] = false;
funcs[i] = std::bind(odd, static_cast<int>(i));
tasks[i] = std::packaged_task<bool(int)>(funcs[i]);
futures[i] = tasks[i].get_future();
threads[i] = std::thread(std::move(tasks[i]),0); // args ignored
}
for (std::size_t i = 0; i < num; i++) {
results[i] = futures[i].get();
threads[i].join();
}
for (std::size_t i = 0; i < num; i++) {
printf("odd(%d)=%s\n", i, (results[i]?"true":"false"));
}
I'd like to get rid of the arguments to the thread creation, as they are dependent on the argument types of the function bool(int). I'd like to make a function template of this code and be able to make a massive parallel function executor.
template <typename _returnType, typename ..._argTypes>
void exec_and_collect(std::vector<_returnType>& results,
std::vector<std::function<_returnType(_argTypes...)>> funcs) {
std::size_t numTasks = (funcs.size() > results.size() ? results.size() : funcs.size());
std::vector<std::packaged_task<_returnType(_argTypes...)>> tasks(numTasks);
std::vector<std::future<_returnType>> futures(numTasks);
std::vector<std::thread> threads(numTasks);
for (std::size_t h = 0; h < numTasks; h++) {
tasks[h] = std::packaged_task<_returnType(_argTypes...)>(funcs[h]);
futures[h] = tasks[h].get_future();
threads[h] = std::thread(std::move(tasks[h]), 0); // zero is a wart
}
// threads are now running, collect results
for (std::size_t h = 0; h < numTasks; h++) {
results[h] = futures[h].get();
threads[h].join();
}
}
Then called like this:
std::size_t num = 8;
std::vector<bool> results(num);
std::vector<std::function<bool(int)>> funcs(num);
for (std::size_t i = 0; i < num; i++) {
funcs[i] = std::bind(odd, static_cast<int>(i));
}
exec_and_collect<bool,int>(results, funcs);
I'd like to get rid of the zero in the std::thread(std::move(task), 0); line, since it's completely ignored by the thread. If I remove it completely, the compiler can't find the arguments to pass to the thread creation and it fails.
You could simply avoid micromanaging in the generic code: just take any returnType() task and let the caller handle the binding of arguments:
Live On Coliru
#include <thread>
#include <future>
#include <iostream>
#include <vector>
#include <functional>
bool odd(int i) { return (((i&1)==1)?true:false); }
template <typename _returnType>
void exec_and_collect(std::vector<_returnType>& results,
std::vector<std::function<_returnType()>> funcs
) {
std::size_t numTasks = std::min(funcs.size(), results.size());
std::vector<std::packaged_task<_returnType()>> tasks(numTasks);
std::vector<std::future<_returnType>> futures(numTasks);
std::vector<std::thread> threads(numTasks);
for (std::size_t h = 0; h < numTasks; h++) {
tasks[h] = std::packaged_task<_returnType()>(funcs[h]);
futures[h] = tasks[h].get_future();
threads[h] = std::thread(std::move(tasks[h]));
}
// threads are now running, collect results
for (std::size_t h = 0; h < numTasks; h++) {
results[h] = futures[h].get();
threads[h].join();
}
}
int main() {
std::size_t num = 8;
std::vector<bool> results(num);
std::vector<std::function<bool()>> funcs(num);
for (std::size_t i = 0; i < num; i++) {
funcs[i] = std::bind(odd, static_cast<int>(i));
}
exec_and_collect<bool>(results, funcs);
}
Note this is a quick job; quite a few things here are still overly specific.
In particular, all the temporary collections are just paperweight (you even move each tasks[h] out of the vector before moving on to the next task, so why keep a vector of dead bits?).
There's no scheduling at all; you just create new threads willy-nilly. That's not going to scale (also, you want pluggable pooling models; see the Executor specifications and Boost Async's implementation of these).
UPDATE
A somewhat more cleaned up version that demonstrates what unneeded dependencies can be shed:
no temporary vectors of packaged tasks/threads
no assumption/requirement to have std::function<> wrapped tasks (this removes dynamic allocations and virtual dispatch internally in the implementation)
no requirement that the results must be in a vector (in fact, you can collect them anywhere you want using a custom output iterator)
move-awareness (this is arguably a "complicated" part of the code, seeing that there is no std::move_transform, so go the extra mile using std::make_move_iterator)
Live On Coliru
#include <thread>
#include <future>
#include <iostream>
#include <vector>
#include <algorithm>
#include <boost/range.hpp>
bool odd(int i) { return (((i&1)==1)?true:false); }
template <typename Range, typename OutIt>
void exec_and_collect(OutIt results, Range&& tasks) {
using namespace std;
using T = typename boost::range_value<Range>::type;
using R = decltype(declval<T>()());
auto tb = std::make_move_iterator(boost::begin(tasks)),
te = std::make_move_iterator(boost::end(tasks));
vector<future<R>> futures;
transform(
tb, te,
back_inserter(futures), [](auto&& t) {
std::packaged_task<R()> task(std::forward<decltype(t)>(t));
auto future = task.get_future();
thread(std::move(task)).detach();
return future;
});
// threads are now running, collect results
transform(begin(futures), end(futures), results, [](auto& fut) { return fut.get(); });
}
#include <boost/range/irange.hpp>
#include <boost/range/adaptors.hpp>
using namespace boost::adaptors;
int main() {
std::vector<bool> results;
exec_and_collect(
std::back_inserter(results),
boost::irange(0, 8) | transformed([](int i) { return [i] { return odd(i); }; })
);
std::copy(results.begin(), results.end(), std::ostream_iterator<bool>(std::cout << std::boolalpha, "; "));
}
Output
false; false; false; false; false; false; false; false;
Note that you could indeed write
exec_and_collect(
std::ostream_iterator<bool>(std::cout << std::boolalpha, "; "),
boost::irange(0, 8) | transformed([](int i) { return [i] { return odd(i); }; })
);
and do without any results container :)