Parallel calling a function in std::vector - c++

I have an std::vector of std::function<void()> like this:
std::map<Event, std::vector<std::function<void()>>> observers_;
calling each function like this:
for (const auto& obs : observers_.at(event)) obs();
I want to turn this into a parallel for loop. Since I am using C++14, and don't have access to the std::execution::parallel of C++17, I found a little library that allows me to create a ThreadPool.
How do I turn for (const auto& obs : observers_.at(event)) obs(); into a version that calls each function in observers_ in parallel? I can't seem to get the syntax correct. I tried, but this doesn't work.
std::vector<std::function<void()>> vec = observers_.at(event);
ThreadPool::ParallelFor(0, vec.size(), [&](int i)
{
vec.at(i);
});
The example program that uses the library below:
#include <iostream>
#include <mutex>
#include "ThreadPool.hpp"
////////////////////////////////////////////////////////////////////////////////
int main()
{
std::mutex critical;
ThreadPool::ParallelFor(0, 16, [&] (int i)
{
std::lock_guard<std::mutex> lock(critical);
std::cout << i << std::endl;
});
return 0;
}
The ThreadPool library.
#ifndef THREADPOOL_HPP_INCLUDED
#define THREADPOOL_HPP_INCLUDED
////////////////////////////////////////////////////////////////////////////////
#include <thread>
#include <vector>
#include <cmath>
////////////////////////////////////////////////////////////////////////////////
class ThreadPool {
public:
template<typename Index, typename Callable>
static void ParallelFor(Index start, Index end, Callable func) {
// Estimate number of threads in the pool
const static unsigned nb_threads_hint = std::thread::hardware_concurrency();
const static unsigned nb_threads = (nb_threads_hint == 0u ? 8u : nb_threads_hint);
// Size of a slice for the range functions
Index n = end - start + 1;
Index slice = (Index) std::round(n / static_cast<double> (nb_threads));
slice = std::max(slice, Index(1));
// [Helper] Inner loop
auto launchRange = [&func] (int k1, int k2) {
for (Index k = k1; k < k2; k++) {
func(k);
}
};
// Create pool and launch jobs
std::vector<std::thread> pool;
pool.reserve(nb_threads);
Index i1 = start;
Index i2 = std::min(start + slice, end);
for (unsigned i = 0; i + 1 < nb_threads && i1 < end; ++i) {
pool.emplace_back(launchRange, i1, i2);
i1 = i2;
i2 = std::min(i2 + slice, end);
}
if (i1 < end) {
pool.emplace_back(launchRange, i1, end);
}
// Wait for jobs to finish
for (std::thread &t : pool) {
if (t.joinable()) {
t.join();
}
}
}
// Serial version for easy comparison
template<typename Index, typename Callable>
static void SequentialFor(Index start, Index end, Callable func) {
for (Index i = start; i < end; i++) {
func(i);
}
}
};
#endif // THREADPOOL_HPP_INCLUDED

It seems that you should simply change:
vec.at(i); // Only returns a reference to the element at index i
into:
vec.at(i)(); // The second () calls the function
--- OR ---
vec[i](); // Same

Hint: What does this do?
vec.at(i);
What do you want it to do?
Unrelatedly, you're using at() when you mean [].

This works:
ThreadPool::ParallelFor(0, (int)vec.size(), [&] (int i)
{
vec[i]();
});

Related

Determining function time using a wrapper

I'm looking for a generic way of measuring a functions timing like Here, but for c++.
My main goal is to not have cluttered code like this piece everywhere:
auto t1 = std::chrono::high_resolution_clock::now();
function(arg1, arg2);
auto t2 = std::chrono::high_resolution_clock::now();
auto tDur = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
But rather have a nice wrapper around the function.
What I got so far is:
timing.hpp:
#pragma once
#include <chrono>
#include <functional>
template <typename Tret, typename Tin1, typename Tin2> unsigned int getDuration(std::function<Tret(Tin1, Tin2)> function, Tin1 arg1, Tin2 arg2, Tret& retValue)
{
auto t1 = std::chrono::high_resolution_clock::now();
retValue = function(arg1, arg2);
auto t2 = std::chrono::high_resolution_clock::now();
auto tDur = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
return tDur.count();
}
main.cpp:
#include "timing.hpp"
#include "matrix.hpp"
constexpr int G_MATRIXSIZE = 2000;
int main(int argc, char** argv)
{
CMatrix<double> myMatrix(G_MATRIXSIZE);
bool ret;
// this call is quite ugly
std::function<bool(int, std::vector<double>)> fillRow = std::bind(&CMatrix<double>::fillRow, &myMatrix, 0, fillVec);
auto duration = getDuration(fillRow, 5, fillVec, ret );
std::cout << "duration(ms): " << duration << std::endl;
}
in case sb wants to test the code, matrix.hpp:
#pragma once
#include <iostream>
#include <string>
#include <sstream>
#include <vector>
template<typename T> class CMatrix {
public:
// ctor
CMatrix(int size) :
m_size(size)
{
m_matrixData = new std::vector<std::vector<T>>;
createUnityMatrix();
}
// dtor
~CMatrix()
{
std::cout << "Destructor of CMatrix called" << std::endl;
delete m_matrixData;
}
// print to std::out
void printMatrix()
{
std::ostringstream oss;
for (int i = 0; i < m_size; i++)
{
for (int j = 0; j < m_size; j++)
{
oss << m_matrixData->at(i).at(j) << ";";
}
oss << "\n";
}
std::cout << oss.str() << std::endl;
}
bool fillRow(int index, std::vector<T> row)
{
// checks
if (!indexValid(index))
{
return false;
}
if (row.size() != m_size)
{
return false;
}
// data replacement
for (int j = 0; j < m_size; j++)
{
m_matrixData->at(index).at(j) = row.at(j);
}
return true;
}
bool fillColumn(int index, std::vector<T> column)
{
// checks
if (!indexValid(index))
{
return false;
}
if (column.size() != m_size)
{
return false;
}
// data replacement
for (int j = 0; j < m_size; j++)
{
m_matrixData->at(index).at(j) = column.at(j);
}
return true;
}
private:
// variables
std::vector<std::vector<T>>* m_matrixData;
int m_size;
bool indexValid(int index)
{
if (index + 1 > m_size)
{
return false;
}
return true;
}
// functions
void createUnityMatrix()
{
for (int i = 0; i < m_size; i++)
{
std::vector<T> _vector;
for (int j = 0; j < m_size; j++)
{
if (i == j)
{
_vector.push_back(1);
}
else
{
_vector.push_back(0);
}
}
m_matrixData->push_back(_vector);
}
}
};
The thing is, this code is still quite ugly due to the std::function usage. Is there a better and/or simpler option ?
(+ also I'm sure I messed sth up with the std::bind, I think I need to use std::placeholders since I want to set the arguments later on.)
// edit, correct use of placeholder in main:
std::function<bool(int, std::vector<double>)> fillRow = std::bind(&CMatrix<double>::fillRow, &myMatrix, std::placeholders::_1, std::placeholders::_2);
auto duration = getDuration(fillRow, 18, fillVec, ret );
You can utilize RAII to implement a timer that records the execution time of a code block and a template function that wraps the function you would like to execute with the timer.
#include<string>
#include<chrono>
#include <unistd.h>
struct Timer
{
std::string fn, title;
std::chrono::time_point<std::chrono::steady_clock> start;
Timer(std::string fn, std::string title)
: fn(std::move(fn)), title(std::move(title)), start(std::chrono::steady_clock::now())
{
}
~Timer()
{
const auto elapsed =
std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start).count();
printf("%s: function=%s; elasepd=%f ms\n", title.c_str(), fn.c_str(), elapsed / 1000.0);
}
};
#ifndef ENABLE_BENCHMARK
static constexpr inline void dummy_fn() { }
#define START_BENCHMARK_TIMER(...) dummy_fn()
#else
#define START_BENCHMARK_TIMER(title) bench::Timer timer(__FUNCTION__, title)
#endif
template<typename F, typename ...Args>
auto time_fn(F&& fn, Args&&... args) {
START_BENCHMARK_TIMER("wrapped fn");
return fn(std::forward<Args>(args)...);
}
int foo(int i) {
usleep(70000);
return i;
}
int main()
{
printf("%d\n", time_fn(foo, 3));
}
stdout:
wrapped fn: function=time_fn; elasepd=71.785000 ms
3
General Idea:
time_fn is a simple template function that calls START_BENCHMARK_TIMER and calls fn with the provided arguments
START_BENCHMARK_TIMER then creates a Timer object. It will record the current time in start. Do note that __FUNCTION__ will be replaced with the function that was called.
When the
provided fn returns or throws an exception, the Timer object from (1) will be destroyed and the destructor will be called. The destructor will then calculate the time difference between the current time and the recorded start time and prints it to stdout
Note:
Even though declaring start and end in time_fn instead of the RAII timer will work, having an RAII timer will allow you to cleanly handle the situation when fn throws an exception
If you are on c++11, you will need to change time_fn declaration to typename std::result_of<F &&(Args &&...)>::type time_fn(F&& fn, Args&&... args).
Edit: Updated the response to include a wrapper function approach.

thread racing when working with vector of data in the thread function

Being an early stage c++/thread coder I am having some hard time with thread racing in one of my test functions and would truly appreciate some feedback.
My parent() function takes in as input a rather large vector of images (cv::Mat from openCV) and the task is to compute an operator on each one separately (e.g. dilation). I wrote a loop that creates threads using a worker() function and passes on each thread a subset of my input vector.
The result from each thread is to be stored on that input subset vector. My problem is that I cannot retrieve it back from within the parent().
As an alternative I passed the entire vector to worker() with start and end indices for each thread but then I run into some serious thread racing issues consuming more time than the serial approach.
Please see my code below.
std::vector<cv::Mat> worker(std::vector<cv::Mat>& ctn);
std::vector<cv::Mat> worker(std::vector<cv::Mat>& ctn) {
int erosion_type = cv::MORPH_RECT;
int erosion_size = 5;
cv::Mat element = cv::getStructuringElement( erosion_type,
cv::Size( 2*erosion_size + 1, 2*erosion_size+1 ),
cv::Point( erosion_size, erosion_size ) );
this_mutex.lock();
for(uint it=0; it<ctn.size(); ++it) {
cv::erode(ctn[it], ctn[it], element);
}
this_mutex.unlock();
return ctn;
}
void parent(std::vector<cv::Mat>& imageSet) {
auto start = std::chrono::steady_clock::now();
const auto processor_count = std::thread::hardware_concurrency();
std::vector<std::thread> threads;
const int grainsize = imageSet.size() / processor_count;
uint work_iter = 0;
std::vector<cv::Mat> target; // holds the output vector
// create the threads
for(uint it=0; it<processor_count-1; ++it) {
std::vector<cv::Mat> subvec(imageSet.begin() + work_iter, imageSet.begin() + work_iter + grainsize);
threads.emplace_back([&,it]() {
std::vector<cv::Mat> tmp = worker(subvec);
target.insert(target.end(), tmp.begin(), tmp.end());
});
work_iter += grainsize;
}
// create the last thread for the remainder of the vector elements
std::vector<cv::Mat> subvec(imageSet.begin() + work_iter, imageSet.end());
int it = processor_count-1;
threads.emplace_back([&,it]() {
std::vector<cv::Mat> tmp = worker(subvec);
target.insert(target.end(), tmp.begin(), tmp.end());
});
// join the threads
for(int i=0; i<threads.size(); ++i) {
threads[i].join();
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
std::cout << "elapsed time: " << elapsed_seconds.count() << "s\n";
// try to reconstruct the output
imageSet.clear();
for(int i=0; i<target.size(); ++i) {
imageSet.push_back(target[i]);
}
}
In this code the statement target.insert(target.end(), tmp.begin(), tmp.end()) is meant to concatenate the target[ ] vector with the result of each thread but it does not execute in time thus I get an empty target[] at the end.
Any ideas how to get target[] to collect all tmp[]s?
Where you thinking something like this?
This processes them all individually, but you can chunk it up however you want and return a vector from the lamda if you want.
Note: This is in C++11, since that is what you tagged. If you have access to 17, this becomes a whole lot simpler.
#include <vector>
#include <algorithm>
#include <numeric>
#include <future>
#include <iostream>
int main()
{
std::vector<int> input{0,1,2,3,4,5,6,7,8,9,10};
for(const auto& item : input)
{
std::cout << item << " ";
}
std::cout << std::endl;
std::vector<std::future<int>> threads{};
for(const auto& item : input)
{
threads.push_back(std::async(std::launch::async, [&item]{
return item * 100;
}));
}
std::vector<int> output{};
for(auto& thread : threads)
{
output.push_back(thread.get());
}
for(const auto& item : output)
{
std::cout << item << " ";
}
return 0;
}
One result (res) for each thread.
#include <iostream>
#include <thread>
#include <vector>
#include <algorithm>
#include <cassert>
void threadFunction (std::vector<int> &speeds, int start, int end, std::vector<int>& res);
int main()
{
std::vector<int> images (100000);
auto processor_count = std::thread::hardware_concurrency();
auto step = images.size() / processor_count;
auto startFrom = 0;
// one result vector (res) for each thread (t).
std::vector<std::thread>t;
std::vector<std::vector<int>>res (processor_count);
// Start the threads
for (auto i = 0; i < processor_count; ++i)
{
auto th = std::thread(threadFunction, std::ref(images), startFrom, startFrom+step, std::ref(res[i]));
t.push_back(std::move(th));
startFrom += step;
}
// Join
std::for_each(begin(t), end(t), [](std::thread &t)
{
assert(t.joinable());
t.join();
});
// Results here. Each thread puts the results in res[i];
return 0;
}
void threadFunction (std::vector<int> &images, int start, int end, std::vector<int>& res)
{
for (int i = start; i <= end; ++i)
res.push_back(images[i]);
}

Efficiently process each unique permutation of a vector when number of unique elements in vector is much smaller than vector size

In a program I need to apply a function in parallel to each unique permutation of a vector. The size of the vector is around N=15
I already have a function void parallel_for_each_permutation which I can use in combination with a std::set to only process each unique permutation exactly once.
This all works well for the general case. However, in my use case the number of unique elements k per vector is very limited, usually around k=4. This means that I'm currently wasting time constructing the same unique permutation over and over again, just to throw it away because it has already been processed.
Is it possible to process all unique permutations in this special case, without constructing all N! permutations?
Example use-case:
#include <algorithm>
#include <thread>
#include <vector>
#include <mutex>
#include <numeric>
#include <set>
#include <iostream>
template<class Container1, class Container2>
struct Comp{
//compare element-wise less than
bool operator()(const Container1& l, const Container2& r) const{
auto pair = std::mismatch(l.begin(), l.end(), r.begin());
if(pair.first == l.end() && pair.second == r.end())
return false;
return *(pair.first) < *(pair.second);
}
};
template<class Container, class Func>
void parallel_for_each_permutation(const Container& container, int num_threads, Func func){
auto ithPermutation = [](int n, size_t i) -> std::vector<size_t>{
// https://stackoverflow.com/questions/7918806/finding-n-th-permutation-without-computing-others
std::vector<size_t> fact(n);
std::vector<size_t> perm(n);
fact[0] = 1;
for(int k = 1; k < n; k++)
fact[k] = fact[k-1] * k;
for(int k = 0; k < n; k++){
perm[k] = i / fact[n-1-k];
i = i % fact[n-1-k];
}
for(int k = n-1; k > 0; k--){
for(int j = k-1; j >= 0; j--){
if(perm[j] <= perm[k])
perm[k]++;
}
}
return perm;
};
size_t totalNumPermutations = 1;
for(size_t i = 1; i <= container.size(); i++)
totalNumPermutations *= i;
std::vector<std::thread> threads;
for(int threadId = 0; threadId < num_threads; threadId++){
threads.emplace_back([&, threadId](){
const size_t firstPerm = size_t(float(threadId) * totalNumPermutations / num_threads);
const size_t last_excl = std::min(totalNumPermutations, size_t(float(threadId+1) * totalNumPermutations / num_threads));
Container permutation(container);
auto permIndices = ithPermutation(container.size(), firstPerm);
size_t count = firstPerm;
do{
for(int i = 0; i < int(permIndices.size()); i++){
permutation[i] = container[permIndices[i]];
}
func(threadId, permutation);
std::next_permutation(permIndices.begin(), permIndices.end());
++count;
}while(count < last_excl);
});
}
for(auto& thread : threads)
thread.join();
}
template<class Container, class Func>
void parallel_for_each_unique_permutation(const Container& container, Func func){
using Comparator = Comp<Container, Container>;
constexpr int numThreads = 4;
std::set<Container, Comparator> uniqueProcessedPermutations(Comparator{});
std::mutex m;
parallel_for_each_permutation(
container,
numThreads,
[&](int threadId, const auto& permutation){
{
std::lock_guard<std::mutex> lg(m);
if(uniqueProcessedPermutations.count(permutation) > 0){
return;
}else{
uniqueProcessedPermutations.insert(permutation);
}
}
func(permutation);
}
);
}
int main(){
std::vector<int> vector1{1,1,1,1,2,3,2,2,3,3,1};
auto func = [](const auto& vec){return;};
parallel_for_each_unique_permutation(vector1, func);
}
The permutations you have to work with are known in the field of combinatorics as multiset permutations.
They are described for example on The Combinatorial Object Server
with more detailed explanations in this paper by professor Tadao Takaoka.
You have some related Python code and some C++ code in the FXT open source library.
You might consider adding the "multiset" and "combinatorics" tags to your question.
One possibility is to borrow the (header-only) algorithmic code from the FXT library, which provides a simple generator class for those multiset permutations.
Performance level:
Using the FXT algorithm on a test vector of 15 objects, {1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}, one can generate all associated 12,612,600 "permutations" in less than 2 seconds on a plain vanilla Intel x86-64 machine; this is without diagnostics text I/O and without any attempt at optimization.
The algorithm generates exactly those "permutations" that are required, nothing more. So there is no longer a need to generate all 15! "raw" permutations nor to use mutual exclusion to update a shared data structure for filtering purposes.
An adaptor class for generating the permutations:
I will try below to provide code for an adaptor class, which allows your application to use the FXT algorithm while containing the dependency into a single implementation file. That way, the code will hopefully fit better into your application. Think FXT's ulong type and use of raw pointers, versus std::vector<std::size_t> in your code. Besides, FXT is a very extensive library.
Header file for the "adaptor" class:
// File: MSetPermGen.h
#ifndef MSET_PERM_GEN_H
#define MSET_PERM_GEN_H
#include <iostream>
#include <vector>
class MSetPermGenImpl; // from algorithmic backend
using IntVec = std::vector<int>;
using SizeVec = std::vector<std::size_t>;
// Generator class for multiset permutations:
class MSetPermGen {
public:
MSetPermGen(const IntVec& vec);
std::size_t getCycleLength() const;
bool forward(size_t incr);
bool next();
const SizeVec& getPermIndices() const;
const IntVec& getItems() const;
const IntVec& getItemValues() const;
private:
std::size_t cycleLength_;
MSetPermGenImpl* genImpl_; // implementation generator
IntVec itemValues_; // only once each
IntVec items_; // copy of ctor argument
SizeVec freqs_; // repetition counts
SizeVec state_; // array of indices in 0..n-1
};
#endif
The class constructor takes exactly the argument type provided in your main program. Of course, the key method is next(). You can also move the automaton by several steps at once using the forward(incr)method.
Example client program:
// File: test_main.cpp
#include <cassert>
#include "MSetPermGen.h"
using std::cout;
using std::cerr;
using std::endl;
// utility functions:
std::vector<int> getMSPermutation(const MSetPermGen& mspg)
{
std::vector<int> res;
auto indices = mspg.getPermIndices(); // always between 0 and n-1
auto values = mspg.getItemValues(); // whatever the user put in
std::size_t n = indices.size();
assert( n == items.size() );
res.reserve(n);
for (std::size_t i=0; i < n; i++) {
auto xi = indices[i];
res.push_back(values[xi]);
}
return res;
}
void printPermutation(const std::vector<int>& p, std::ostream& fh)
{
std::size_t n = p.size();
for (size_t i=0; i < n; i++)
fh << p[i] << " ";
fh << '\n';
}
int main(int argc, const char* argv[])
{
std::vector<int> vec0{1,1, 2,2,2}; // N=5
std::vector<int> vec1{1,1, 1,1, 2, 3, 2,2, 3,3, 1}; // N=11
std::vector<int> vec2{1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}; // N=15
MSetPermGen pg0{vec0};
MSetPermGen pg1{vec1};
MSetPermGen pg2{vec2};
auto pg = &pg0; // choice of 0, 1, 2 for sizing
auto cl = pg->getCycleLength();
auto permA = getMSPermutation(*pg);
printPermutation(permA, cout);
for (std::size_t pi=0; pi < (cl-1); pi++) {
pg->next();
auto permB = getMSPermutation(*pg);
printPermutation(permB, cout);
}
return EXIT_SUCCESS;
}
Text output from the above small program:
1 1 2 2 2
1 2 1 2 2
1 2 2 1 2
1 2 2 2 1
2 1 1 2 2
2 1 2 1 2
2 1 2 2 1
2 2 1 1 2
2 2 1 2 1
2 2 2 1 1
You get only 10 items from vector {1,1, 2,2,2}, because 5! / (2! * 3!) = 120/(2*6) = 10.
The implementation file for the adaptor class, MSetPermGen.cpp, consists of two parts. The first part is FXT code with minimal adaptations. The second part is the MSetPermGen class proper.
First part of implementation file:
// File: MSetPermGen.cpp - part 1 of 2 - FXT code
// -------------- Beginning of header-only FXT combinatorics code -----------
// This file is part of the FXT library.
// Copyright (C) 2010, 2012, 2014 Joerg Arndt
// License: GNU General Public License version 3 or later,
// see the file COPYING.txt in the main directory.
//-- https://www.jjj.de/fxt/
//-- https://fossies.org/dox/fxt-2018.07.03/mset-perm-lex_8h_source.html
#include <cstddef>
using ulong = std::size_t;
inline void swap2(ulong& xa, ulong& xb)
{
ulong save_xb = xb;
xb = xa;
xa = save_xb;
}
class mset_perm_lex
// Multiset permutations in lexicographic order, iterative algorithm.
{
public:
ulong k_; // number of different sorts of objects
ulong *r_; // number of elements '0' in r[0], '1' in r[1], ..., 'k-1' in r[k-1]
ulong n_; // number of objects
ulong *ms_; // multiset data in ms[0], ..., ms[n-1], sentinels at [-1] and [-2]
private: // have pointer data
mset_perm_lex(const mset_perm_lex&); // forbidden
mset_perm_lex & operator = (const mset_perm_lex&); // forbidden
public:
explicit mset_perm_lex(const ulong *r, ulong k)
{
k_ = k;
r_ = new ulong[k];
for (ulong j=0; j<k_; ++j) r_[j] = r[j]; // get buckets
n_ = 0;
for (ulong j=0; j<k_; ++j) n_ += r_[j];
ms_ = new ulong[n_+2];
ms_[0] = 0; ms_[1] = 1; // sentinels: ms[0] < ms[1]
ms_ += 2; // nota bene
first();
}
void first()
{
for (ulong j=0, i=0; j<k_; ++j)
for (ulong h=r_[j]; h!=0; --h, ++i)
ms_[i] = j;
}
~mset_perm_lex()
{
ms_ -= 2;
delete [] ms_;
delete [] r_;
}
const ulong * data() const { return ms_; }
ulong next()
// Return position of leftmost change,
// return n with last permutation.
{
// find rightmost pair with ms[i] < ms[i+1]:
const ulong n1 = n_ - 1;
ulong i = n1;
do { --i; } while ( ms_[i] >= ms_[i+1] ); // can read sentinel
if ( (long)i < 0 ) return n_; // last sequence is falling seq.
// find rightmost element ms[j] less than ms[i]:
ulong j = n1;
while ( ms_[i] >= ms_[j] ) { --j; }
swap2(ms_[i], ms_[j]);
// Here the elements ms[i+1], ..., ms[n-1] are a falling sequence.
// Reverse order to the right:
ulong r = n1;
ulong s = i + 1;
while ( r > s ) { swap2(ms_[r], ms_[s]); --r; ++s; }
return i;
}
};
// -------------- End of header-only FXT combinatorics code -----------
Second part of the class implementation file:
// Second part of file MSetPermGen.cpp: non-FXT code
#include <cassert>
#include <tuple>
#include <map>
#include <iostream>
#include <cstdio>
#include "MSetPermGen.h"
using std::cout;
using std::cerr;
using std::endl;
class MSetPermGenImpl { // wrapper class
public:
MSetPermGenImpl(const SizeVec& freqs) : fg(freqs.data(), freqs.size())
{}
private:
mset_perm_lex fg;
friend class MSetPermGen;
};
static std::size_t fact(size_t n)
{
std::size_t f = 1;
for (std::size_t i = 1; i <= n; i++)
f = f*i;
return f;
}
MSetPermGen::MSetPermGen(const IntVec& vec) : items_(vec)
{
std::map<int,int> ma;
for (int i: vec) {
ma[i]++;
}
int item, freq;
for (const auto& p : ma) {
std::tie(item, freq) = p;
itemValues_.push_back(item);
freqs_.push_back(freq);
}
cycleLength_ = fact(items_.size());
for (auto i: freqs_)
cycleLength_ /= fact(i);
// create FXT-level generator:
genImpl_ = new MSetPermGenImpl(freqs_);
for (std::size_t i=0; i < items_.size(); i++)
state_.push_back(genImpl_->fg.ms_[i]);
}
std::size_t MSetPermGen::getCycleLength() const
{
return cycleLength_;
}
bool MSetPermGen::forward(size_t incr)
{
std::size_t n = items_.size();
std::size_t rc = 0;
// move forward state by brute force, could be improved:
for (std::size_t i=0; i < incr; i++)
rc = genImpl_->fg.next();
for (std::size_t j=0; j < n; j++)
state_[j] = genImpl_->fg.ms_[j];
return (rc != n);
}
bool MSetPermGen::next()
{
return forward(1);
}
const SizeVec& MSetPermGen::getPermIndices() const
{
return (this->state_);
}
const IntVec& MSetPermGen::getItems() const
{
return (this->items_);
}
const IntVec& MSetPermGen::getItemValues() const
{
return (this->itemValues_);
}
Adapting the parallel application:
Regarding your multithreaded application, given that generating the "permutations" is cheap, you can afford to create one generator object per thread.
Before launching the actual computation, you forward each generator to its appropriate initial position, that is at step thread_id * (cycleLength / num_threads).
I have tried to adapt your code to this MSetPermGen class along these lines. See code below.
With 3 threads, an input vector {1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4} of size 15 (giving 12,612,600 permutations) and all diagnostics enabled, your modified parallel program runs in less than 10 seconds; less than 2 seconds with all diagnostics switched off.
Modified parallel program:
#include <algorithm>
#include <thread>
#include <vector>
#include <atomic>
#include <mutex>
#include <numeric>
#include <set>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cstdlib>
#include "MSetPermGen.h"
using std::cout;
using std::endl;
// debug and instrumentation:
static std::atomic<size_t> permCounter;
static bool doManagePermCounter = true;
static bool doThreadLogfiles = true;
static bool doLogfileHeaders = true;
template<class Container, class Func>
void parallel_for_each_permutation(const Container& container, int numThreads, Func mfunc) {
MSetPermGen gen0(container);
std::size_t totalNumPermutations = gen0.getCycleLength();
std::size_t permShare = totalNumPermutations / numThreads;
if ((totalNumPermutations % numThreads) != 0)
permShare++;
std::cout << "totalNumPermutations: " << totalNumPermutations << std::endl;
std::vector<std::thread> threads;
for (int threadId = 0; threadId < numThreads; threadId++) {
threads.emplace_back([&, threadId]() {
// generate some per-thread logfile name
std::ostringstream fnss;
fnss << "thrlog_" << threadId << ".txt";
std::string fileName = fnss.str();
std::ofstream fh(fileName);
MSetPermGen thrGen(container);
const std::size_t firstPerm = permShare * threadId;
thrGen.forward(firstPerm);
const std::size_t last_excl = std::min(totalNumPermutations,
(threadId+1) * permShare);
if (doLogfileHeaders) {
fh << "MSG threadId: " << threadId << '\n';
fh << "MSG firstPerm: " << firstPerm << '\n';
fh << "MSG lastExcl : " << last_excl << '\n';
}
Container permutation(container);
auto values = thrGen.getItemValues();
auto permIndices = thrGen.getPermIndices();
auto nsz = permIndices.size();
std::size_t count = firstPerm;
do {
for (std::size_t i = 0; i < nsz; i++) {
permutation[i] = values[permIndices[i]];
}
mfunc(threadId, permutation);
if (doThreadLogfiles) {
for (std::size_t i = 0; i < nsz; i++)
fh << permutation[i] << ' ';
fh << '\n';
}
thrGen.next();
permIndices = thrGen.getPermIndices();
++count;
if (doManagePermCounter) {
permCounter++;
}
} while (count < last_excl);
fh.close();
});
}
for(auto& thread : threads)
thread.join();
}
template<class Container, class Func>
void parallel_for_each_unique_permutation(const Container& container, Func func) {
constexpr int numThreads = 3;
parallel_for_each_permutation(
container,
numThreads,
[&](int threadId, const auto& permutation){
// no longer need any mutual exclusion
func(permutation);
}
);
}
int main()
{
std::vector<int> vector1{1,1,1,1,2,3,2,2,3,3,1}; // N=11
std::vector<int> vector0{1,1, 2,2,2}; // N=5
std::vector<int> vector2{1,1,1, 2,2,2, 3,3,3,3, 4,4,4,4,4}; // N=15
auto func = [](const auto& vec) { return; };
permCounter.store(0);
parallel_for_each_unique_permutation(vector2, func);
auto finalPermCounter = permCounter.load();
cout << "FinalPermCounter = " << finalPermCounter << endl;
}

Thread unable to join for_each parallel c++

I wrote a sample code to run parallel instances of for_each
I am unable to join the threads, in the below code. I am little early to concurrent programming so im not sure if i have done everything right.
template <typename Iterator, typename F>
class for_each_block
{
public :
void operator()(Iterator start, Iterator end, F f) {
cout << this_thread::get_id << endl;
this_thread::sleep_for(chrono::seconds(5));
for_each(start, end, [&](auto& x) { f(x); });
}
};
typedef unsigned const long int ucli;
template <typename Iterator, typename F>
void for_each_par(Iterator first, Iterator last, F f)
{
ucli size = distance(first, last);
if (!size)
return;
ucli min_per_thread = 4;
ucli max_threads = (size + min_per_thread - 1) / min_per_thread;
ucli hardware_threads = thread::hardware_concurrency();
ucli no_of_threads = min(max_threads, hardware_threads != 0 ? hardware_threads : 4);
ucli block_size = size / no_of_threads;
vector<thread> vf(no_of_threads);
Iterator block_start = first;
for (int i = 0; i < (no_of_threads - 1); i++)
{
Iterator end = first;
advance(end, block_size);
vf.push_back(std::move(thread(for_each_block<Iterator, F>(),first,end,f)));
first = end;
}
vf.push_back(std::move(thread(for_each_block<Iterator, F>(), first, last, f)));
cout << endl;
cout << vf.size() << endl;
for(auto& x: vf)
{
if (x.joinable())
x.join();
else
cout << "threads not joinable " << endl;
}
this_thread::sleep_for(chrono::seconds(100));
}
int main()
{
vector<int> v1 = { 1,8,12,5,4,9,20,30,40,50,10,21,34,33 };
for_each_par(v1.begin(), v1.end(), print_type<int>);
return 0;
}
In the above code i am getting threads not joinable. I have also tried with async futures still i get the same. Am i missing something here?
Any help is greatly appreciated ,
Thank you in advance ..
vector<thread> vf(no_of_threads);
This creates a vector with no_of_threads default-initialized threads. Since they're default initialized, none of them will be joinable. You probably meant to do:
vector<thread> vf;
vf.reserve(no_of_threads);
P.S.: std::move on a temporary is redundant :); consider changing this:
vf.push_back(std::move(thread(for_each_block<Iterator, F>(), first, last, f)));
to this:
vf.emplace_back(for_each_block<Iterator, F>(), first, last, f);
This may or may not be interesting. I had a go at refactoring the code to use what I think is a more idiomatic approach. I'm not saying that your approach is wrong, but since you're learning thread management I thought you may be interested in what else is possible.
Feel free to flame/question as appropriate. Comments inline:
#include <vector>
#include <chrono>
#include <thread>
#include <mutex>
#include <iomanip>
#include <future>
using namespace std;
//
// provide a means of serialising writing to a stream.
//
struct locker
{
locker() : _lock(mutex()) {}
static std::mutex& mutex() { static std::mutex m; return m; }
std::unique_lock<std::mutex> _lock;
};
std::ostream& operator<<(std::ostream& os, const locker& l) {
return os;
}
//
// fill in the missing work function
//
template<class T>
void print_type(const T& t) {
std::cout << locker() << hex << std::this_thread::get_id() << " : " << dec << t << std::endl;
}
// put this in your personable library.
// the standards committee really should have given us ranges by now...
template<class I1, class I2>
struct range_impl
{
range_impl(I1 i1, I2 i2) : _begin(i1), _end(i2) {};
auto begin() const { return _begin; }
auto end() const { return _end; }
I1 _begin;
I2 _end;
};
// distinct types because sometimes dissimilar iterators are comparable
template<class I1, class I2>
auto range(I1 i1, I2 i2) {
return range_impl<I1, I2>(i1, i2);
}
//
// lets make a helper function so we can auto-deduce template args
//
template<class Iterator, typename F>
auto make_for_each_block(Iterator start, Iterator end, F&& f)
{
// a lambda gives all the advantages of a function object with none
// of the boilerplate.
return [start, end, f = std::move(f)] {
cout << locker() << this_thread::get_id() << endl;
this_thread::sleep_for(chrono::seconds(1));
// let's keep loops simple. for_each is a bit old-skool.
for (auto& x : range(start, end)) {
f(x);
}
};
}
template <typename Iterator, typename F>
void for_each_par(Iterator first, Iterator last, F f)
{
if(auto size = distance(first, last))
{
std::size_t min_per_thread = 4;
std::size_t max_threads = (size + min_per_thread - 1) / min_per_thread;
std::size_t hardware_threads = thread::hardware_concurrency();
auto no_of_threads = min(max_threads, hardware_threads != 0 ? hardware_threads : 4);
auto block_size = size / no_of_threads;
// futures give us two benefits:
// 1. they automatically transmit exceptions
// 2. no need for if(joinable) join. get is sufficient
//
vector<future<void>> vf;
vf.reserve(no_of_threads - 1);
for (auto count = no_of_threads ; --count ; )
{
//
// I was thinking of refactoring this into std::generate_n but actually
// it was less readable.
//
auto end = std::next(first, block_size);
vf.push_back(async(launch::async, make_for_each_block(first, end, f)));
first = end;
}
cout << locker() << endl << "threads: " << vf.size() << " (+ main thread)" << endl;
//
// why spawn a thread for the remaining block? we may as well use this thread
//
/* auto partial_sum = */ make_for_each_block(first, last, f)();
// join the threads
// note that if the blocks returned a partial aggregate, we could combine them
// here by using the values in the futures.
for (auto& f : vf) f.get();
}
}
int main()
{
vector<int> v1 = { 1,8,12,5,4,9,20,30,40,50,10,21,34,33 };
for_each_par(v1.begin(), v1.end(), print_type<int>);
return 0;
}
sample output:
0x700000081000
0x700000104000
threads: 3 (+ main thread)
0x700000187000
0x100086000
0x700000081000 : 1
0x700000104000 : 5
0x700000187000 : 20
0x100086000 : 50
0x700000081000 : 8
0x700000104000 : 4
0x700000187000 : 30
0x100086000 : 10
0x700000081000 : 12
0x700000104000 : 9
0x700000187000 : 40
0x100086000 : 21
0x100086000 : 34
0x100086000 : 33
Program ended with exit code: 0
please explain std::move here: [start, end, f = std::move(f)] {...};
This is a welcome language feature that was made available in c++14. f = std::move(f) inside the capture block is equivalent to: decltype(f) new_f = std::move(f) except that the new variable is called f and not new_f. It allows us to std::move objects into lambdas rather than copy them.
For most function objects it won't matter - but some can large and this gives the compiler the opportunity to use a move rather than a copy if available.

Using std::thread and std::function from a std::bind with a function with arguments and a non-void return

Let's say we have a function odd which is a bool(int) function. I'd like to execute this function in parallel but with different parameter (differ numbers).
bool odd(int i) { return (((i&1)==1)?true:false); }
Here's the code I'm trying to use (which works but has a wart).
std::size_t num = 256;
std::vector<bool> results(num);
std::vector<std::function<bool(int)>> funcs(num);
std::vector<std::packaged_task<bool(int)>> tasks(num);
std::vector<std::future<bool>> futures(num);
std::vector<std::thread> threads(num);
for (std::size_t i = 0; i < num; i++) {
results[i] = false;
funcs[i] = std::bind(odd, static_cast<int>(i));
tasks[i] = std::packaged_task<bool(int)>(funcs[i]);
futures[i] = tasks[i].get_future();
threads[i] = std::thread(std::move(tasks[i]),0); // args ignored
}
for (std::size_t i = 0; i < num; i++) {
results[i] = futures[i].get();
threads[i].join();
}
for (std::size_t i = 0; i < num; i++) {
printf("odd(%d)=%s\n", i, (results[i]?"true":"false"));
}
I'd like to get rid of the arguments to the thread creation, as they are dependent on the argument types of the function bool(int). I'd like to make a function template of this code and be able to make a massive parallel function executor.
template <typename _returnType, typename ..._argTypes>
void exec_and_collect(std::vector<_returnType>& results,
std::vector<std::function<_returnType(_argTypes...)>> funcs) {
std::size_t numTasks = (funcs.size() > results.size() ? results.size() : funcs.size());
std::vector<std::packaged_task<_returnType(_argTypes...)>> tasks(numTasks);
std::vector<std::future<_returnType>> futures(numTasks);
std::vector<std::thread> threads(numTasks);
for (std::size_t h = 0; h < numTasks; h++) {
tasks[h] = std::packaged_task<_returnType(_argTypes...)>(funcs[h]);
futures[h] = tasks[h].get_future();
threads[h] = std::thread(std::move(tasks[h]), 0); // zero is a wart
}
// threads are now running, collect results
for (std::size_t h = 0; h < numTasks; h++) {
results[h] = futures[h].get();
threads[h].join();
}
}
Then called like this:
std::size_t num = 8;
std::vector<bool> results(num);
std::vector<std::function<bool(int)>> funcs(num);
for (std::size_t i = 0; i < num; i++) {
funcs[i] = std::bind(odd, static_cast<int>(i));
}
exec_and_collect<bool,int>(results, funcs);
I'd to remove the zero in the std::thread(std::move(task), 0); line since it's completely ignored by the thread. If I do completely remove it, the compiler can't find the arguments to pass to the thread create and it fails.
You could just not be micromanaging/control freak in the generic code. Just take any task returntype() and let the caller handle the binding of arguments:
Live On Coliru
#include <thread>
#include <future>
#include <iostream>
#include <vector>
#include <functional>
bool odd(int i) { return (((i&1)==1)?true:false); }
template <typename _returnType>
void exec_and_collect(std::vector<_returnType>& results,
std::vector<std::function<_returnType()>> funcs
) {
std::size_t numTasks = std::min(funcs.size(), results.size());
std::vector<std::packaged_task<_returnType()>> tasks(numTasks);
std::vector<std::future<_returnType>> futures(numTasks);
std::vector<std::thread> threads(numTasks);
for (std::size_t h = 0; h < numTasks; h++) {
tasks[h] = std::packaged_task<_returnType()>(funcs[h]);
futures[h] = tasks[h].get_future();
threads[h] = std::thread(std::move(tasks[h]));
}
// threads are now running, collect results
for (std::size_t h = 0; h < numTasks; h++) {
results[h] = futures[h].get();
threads[h].join();
}
}
int main() {
std::size_t num = 8;
std::vector<bool> results(num);
std::vector<std::function<bool()>> funcs(num);
for (std::size_t i = 0; i < num; i++) {
funcs[i] = std::bind(odd, static_cast<int>(i));
}
exec_and_collect<bool>(results, funcs);
}
Note this is a quick job, I've seen quite a few things that are overly specific here still.
In particular all the temporary collections are just paper weight (you even move each tasks[h] out of the vector even before moving to the next task, so why keep a vector of dead bits?)
There's no scheduling at all; you just create new threads willy nilly. That's not gonna scale (also, you want pluggable pooling models; see the Executor specifications and Boost Async's implementation of these)
UPDATE
A somewhat more cleaned up version that demonstrates what unneeded dependencies can be shed:
no temporary vectors of packaged tasks/threads
no assumption/requirement to have std::function<> wrapped tasks (this removes dynamic allocations and virtual dispatch internally in the implementation)
no requirement that the results must be in a vector (in fact, you can collect them anywhere you want using a custom output iterator)
move-awareness (this is arguably a "complicated" part of the code seeing that there is no std::move_transform, so go the extra mile using std::make_move_iterator
Live On Coliru
#include <thread>
#include <future>
#include <iostream>
#include <vector>
#include <algorithm>
#include <boost/range.hpp>
bool odd(int i) { return (((i&1)==1)?true:false); }
template <typename Range, typename OutIt>
void exec_and_collect(OutIt results, Range&& tasks) {
using namespace std;
using T = typename boost::range_value<Range>::type;
using R = decltype(declval<T>()());
auto tb = std::make_move_iterator(boost::begin(tasks)),
te = std::make_move_iterator(boost::end(tasks));
vector<future<R>> futures;
transform(
tb, te,
back_inserter(futures), [](auto&& t) {
std::packaged_task<R()> task(std::forward<decltype(t)>(t));
auto future = task.get_future();
thread(std::move(task)).detach();
return future;
});
// threads are now running, collect results
transform(begin(futures), end(futures), results, [](auto& fut) { return fut.get(); });
}
#include <boost/range/irange.hpp>
#include <boost/range/adaptors.hpp>
using namespace boost::adaptors;
int main() {
std::vector<bool> results;
exec_and_collect(
std::back_inserter(results),
boost::irange(0, 8) | transformed([](int i) { return [i] { return odd(i); }; })
);
std::copy(results.begin(), results.end(), std::ostream_iterator<bool>(std::cout << std::boolalpha, "; "));
}
Output
false; false; false; false; false; false; false; false;
Note that you could indeed write
exec_and_collect(
std::ostream_iterator<bool>(std::cout << std::boolalpha, "; "),
boost::irange(0, 8) | transformed([](int i) { return [i] { return odd(i); }; })
);
and do without any results container :)