What am I doing wrong with my C++ threading?

I'm trying to solve the following problem:
Given a vector V[] of integers, both positive and negative. A number N is paired with its negative counterpart, -N. Now, if there are pairs of such numbers in the given vector V[], take the positive integer of each pair and push it to a returned result vector.
Example:
If input is V = [1,-1,0,2,-3,3]
return [1,3]
I tried to solve this problem in 3 flavors:
Single threaded | Runtime: 404000 ns
Multithreaded, coarse-grained lock | Runtime: 39882000 ns
Multithreaded, fine-grained lock | Runtime: 43921000 ns
My idea with fine-grained locking is to update memory at discrete memory locations chosen based on the input value.
I see that my coarse-grained multithreaded version performs worse than the single-threaded one (which is kind of expected). But what I don't understand is why the fine-grained version performs, most of the time, even worse than the coarse-grained one, and poorly compared to the single-threaded version. I expected the fine-grained version to perform better than the single-threaded one.
What is wrong with my implementation, and how can I improve the performance of this code with multithreading?
#include <iostream>
#include <unordered_map>
#include <vector>
#include <atomic>   // was missing: needed for vector<atomic<uint32_t>> below
#include <mutex>
#include <thread>
#include <chrono>
#include <cstdlib>
#include <memory>

using namespace std;

class Solution
{
private:
    const static uint32_t THREAD_N = 5;
    unordered_map<uint32_t, int32_t> records;
    vector<uint32_t> results;
    vector<atomic<uint32_t>> atm_results;
    mutex mut[THREAD_N];
    mutex mutrec;
    bool bzero;
public:
    Solution(): bzero(true) {
        records.reserve(100);
    }
    void InsertVal(const vector<int32_t> &vin)
    {
        for (auto iter : vin) {
            if (iter < 0)
            {
                if (records[0-iter] > 0) results.emplace_back(0-iter);
                records[0-iter]--;
            }
            else if (iter > 0)
            {
                if (records[iter] < 0) results.emplace_back(iter);
                records[iter]++;
            }
            else
            {
                bzero = !bzero;
                if (bzero) {
                    results.emplace_back(0);
                }
            }
        }
    }
    void InsertValEach(const int32_t &val)
    {
        lock_guard<mutex> lock(mutrec); // single block of lock
        if (val < 0)
        {
            if (records[0-val] > 0) results.emplace_back(0-val);
            records[0-val]--;
        }
        else if (val > 0)
        {
            if (records[val] < 0) results.emplace_back(val);
            records[val]++;
        }
        else
        {
            bzero = !bzero;
            if (bzero) {
                results.emplace_back(0);
            }
        }
    }
    void InsertValEachFree(const int32_t &val)
    {
        if (val < 0)
        {
            lock_guard<mutex> lock(mut[(0-val)%THREAD_N]); // finer lock based on input
            if (records[0-val] > 0)
            {
                lock_guard<mutex> l(mutrec); // yet another finer lock to update results
                results.emplace_back(0-val);
            }
            records[0-val]--;
        }
        else if (val > 0)
        {
            lock_guard<mutex> lock(mut[(val)%THREAD_N]);
            if (records[val] < 0)
            {
                lock_guard<mutex> l(mutrec);
                results.emplace_back(val);
            }
            records[val]++;
        }
        else
        {
            lock_guard<mutex> lock(mut[0]);
            bzero = !bzero;
            if (bzero) {
                lock_guard<mutex> l(mutrec);
                results.emplace_back(0);
            }
        }
    }
    vector<uint32_t> GetResult()
    {
        lock_guard<mutex> l(mutrec);
        return results;
    }
    void reset()
    {
        lock_guard<mutex> l(mutrec);
        results = vector<uint32_t>();
    }
};
void Display(Solution &s)
{
    auto v = s.GetResult();
    // for (auto &iter : v) {
    //     cout << iter << " ";
    // }
    cout << v.size() << "\n";
}

size_t SingleThread(Solution &s, const vector<int32_t> &vec)
{
    chrono::time_point<chrono::system_clock> start, stop;
    start = chrono::system_clock::now();
    s.InsertVal(vec);
    stop = chrono::system_clock::now();
    chrono::duration<double> elapse_time = stop - start;
    Display(s);
    s.reset();
    return chrono::duration_cast<chrono::nanoseconds>(elapse_time).count();
}

size_t CourseGrainLock(Solution &s, const vector<int32_t> &vec)
{
    chrono::time_point<chrono::system_clock> start, stop;
    vector<thread> vthreads;
    auto vsize = vec.size();
    start = chrono::system_clock::now();
    for (int32_t iter = 0; iter < vsize; iter++) {
        vthreads.push_back(thread(&Solution::InsertValEach, &s, vec[iter]));
    }
    stop = chrono::system_clock::now();
    for (auto &th : vthreads) {
        th.join();
    }
    chrono::duration<double> elapse_time = stop - start;
    Display(s);
    s.reset();
    return chrono::duration_cast<chrono::nanoseconds>(elapse_time).count();
}

size_t FineGrainLock(Solution &s, const vector<int32_t> &vec)
{
    chrono::time_point<chrono::system_clock> start, stop;
    vector<thread> vthreads;
    auto vsize = vec.size();
    start = chrono::system_clock::now();
    for (int32_t iter = 0; iter < vsize; iter++) {
        vthreads.push_back(thread(&Solution::InsertValEachFree, &s, vec[iter]));
    }
    stop = chrono::system_clock::now();
    for (auto &th : vthreads) {
        th.join();
    }
    chrono::duration<double> elapse_time = stop - start;
    Display(s);
    s.reset();
    return chrono::duration_cast<chrono::nanoseconds>(elapse_time).count();
}

int main(int argc, const char * argv[]) {
    vector<int32_t> vec;
    int count = 1000;
    while (count--)
    {
        vec.emplace_back(rand() % 50);
        vec.emplace_back(0 - (rand() % 50));
    }
    Solution s;
    auto nanosec = SingleThread(s, vec);
    cout << "Time of Execution (nano) Single Thread: " << nanosec << "\n";
    nanosec = CourseGrainLock(s, vec);
    cout << "Time of Execution (nano) Course Grain: " << nanosec << "\n";
    nanosec = FineGrainLock(s, vec);
    cout << "Time of Execution (nano) Fine Grain: " << nanosec << "\n";
    return 0;
}

You're creating one thread for each number in vec. There is a considerable cost to creating a thread. You should create a few threads (no more than the number of execution units your hardware has) and have each thread process multiple entries of the vector. main can process one share of the entries itself, avoiding the creation of one thread.
With the locking in CourseGrainLock (in InsertValEach), since the first thing each thread does is grab a lock that is not released until the function is done, your code is effectively single-threaded, but with the cost of creating all those threads on top.
The locking in your FineGrainLock (in InsertValEachFree) is not much better. You have several locks, but you make changes to the shared map records in multiple threads under different locks. Adding elements to an unordered_map (which you do with records[val] or records[0-val]) is not thread safe, and you risk undefined behavior with that code.
A reasonable approach here is to have each thread keep track of its own results independently, thus avoiding the need for locks at all, and combine them into the main results once all the threads are done.
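For illustration, here is a minimal sketch of that idea (the function name, the histogram-merge scheme, and the chunking are mine, not the asker's; zero handling and the original output order are omitted for brevity). Each thread counts positive and negative occurrences in its own slice without any locks, and the main thread merges the per-thread counts and emits one entry per matched pair:

#include <algorithm>
#include <cstdint>
#include <thread>
#include <unordered_map>
#include <vector>

std::vector<uint32_t> CountPairs(const std::vector<int32_t>& vin, uint32_t nthreads)
{
    // One private histogram pair per thread: no shared writes, no locks.
    std::vector<std::unordered_map<int32_t, uint32_t>> pos(nthreads), neg(nthreads);
    std::vector<std::thread> workers;
    const size_t chunk = (vin.size() + nthreads - 1) / nthreads;
    for (uint32_t t = 0; t < nthreads; ++t) {
        workers.emplace_back([&, t] {
            const size_t begin = t * chunk;
            const size_t end = std::min(vin.size(), begin + chunk);
            for (size_t i = begin; i < end; ++i) {
                if (vin[i] > 0)      ++pos[t][vin[i]];
                else if (vin[i] < 0) ++neg[t][-vin[i]];
            }
        });
    }
    for (auto& w : workers) w.join();

    // Single-threaded merge once all workers are done.
    std::unordered_map<int32_t, uint32_t> all_pos, all_neg;
    for (uint32_t t = 0; t < nthreads; ++t) {
        for (auto& kv : pos[t]) all_pos[kv.first] += kv.second;
        for (auto& kv : neg[t]) all_neg[kv.first] += kv.second;
    }
    std::vector<uint32_t> results;
    for (auto& kv : all_pos) {
        auto it = all_neg.find(kv.first);
        if (it != all_neg.end())
            results.insert(results.end(), std::min(kv.second, it->second),
                           static_cast<uint32_t>(kv.first));
    }
    return results;
}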

You probably can't improve this with multithreading. All of the threads have to access the same shared input vector and result vector. The tremendous slowdown that you see versus the single-threaded solution is the overhead of serializing access to the shared structures.
Multithreading is not a panacea. If you need to do something like this to an array, just do it in a single thread.

The major issue I see with your code is that the majority of the work is being done inside the mutex. That completely blocks the other threads, so there is no benefit. Even the fine-grained version does only a very small calculation outside the mutex compared to the cost of updating the map and the output vector.
I'm not even totally convinced your fine-grained locking is completely thread safe: indexing the map creates a node for a value that hasn't been seen before, and a concurrent insertion can invalidate any other thread's simultaneous search. You could use a separate map for each locked value range, I think.
But to be honest, I think you are just doing too little work in each thread. Try creating a smaller number of threads and having each one process a range of the input values, calling the existing code for each entry in that range; a sketch follows.
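A rough sketch of that chunking (reusing the asker's Solution::InsertValEach unchanged; the wrapper function and thread count are my own assumptions):

void CoarseGrainChunked(Solution& s, const std::vector<int32_t>& vec)
{
    std::vector<std::thread> vthreads;
    const size_t n = vec.size();
    const size_t nthreads = 5;  // a handful of threads instead of one per element
    for (size_t t = 0; t < nthreads; ++t) {
        vthreads.emplace_back([&s, &vec, t, n, nthreads] {
            const size_t begin = n * t / nthreads;
            const size_t end = n * (t + 1) / nthreads;
            for (size_t i = begin; i < end; ++i)
                s.InsertValEach(vec[i]);  // same per-element code as before
        });
    }
    for (auto& th : vthreads)
        th.join();  // 5 thread creations total instead of one per element
}

Note that with InsertValEach's whole-function lock the work itself is still serialized; the sketch only removes the per-element thread-creation overhead.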

Related

C++ Reusing a vector of threads that call the same function

I would like to reuse a vector of threads that call the same function several times with different parameters. There is no writing (with the exception of an atomic parameter), so no need for a mutex. To depict the idea, I created a basic example of parallelized code that finds the maximum value of a vector. There are clearly better ways to find the max of a vector, but for the sake of the explanation, and to avoid getting into further details of the real code I am writing, I am going with this silly example.
The code finds the maximum number of a vector by calling a function pFind that checks whether the vector contains the number k (k is initialized with an upper bound). If it does, the execution stops; otherwise k is reduced by one and the process repeats.
The code below generates a vector of threads that parallelize the search for k in the vector. The issue is that, for every value of k, the vector of threads is regenerated and each time the new threads are joined.
Generating the vector of threads and joining them every time comes with an overhead that I want to avoid.
I am wondering if there is a way of generating a vector (a pool) of threads only once and reusing them for the new executions. Any other speedup tips will be appreciated.
void pFind(
    vector<int>& a,
    int n,
    std::atomic<bool>& flag,
    int k,
    int numTh,
    int val
) {
    int i = k;
    while (i < n) {
        if (a[i] == val) {
            flag = true;
            break;
        } else
            i += numTh;
    }
}

int main() {
    vector<int> a(100000, 0); // placeholder; the real data comes from elsewhere in the asker's code
    int size = (int)a.size();
    std::atomic<bool> flag;
    flag = false;
    int numTh = 8;
    int val = 1000;
    int pos = 0;
    while (!flag) {
        vector<thread> threads;
        for (int i = 0; i < numTh; i++) {
            thread th(&pFind, std::ref(a), size, std::ref(flag), i, numTh, val);
            threads.push_back(std::move(th));
        }
        for (thread& th : threads)
            th.join();
        if (flag)
            break;
        val--;
    }
    cout << val << "\n";
    return 0;
}
There is no way to assign a different execution function (closure) to a std::thread after construction. This is generally true of all thread abstractions, though implementations often try to memoize or cache lower-level resources internally to make thread fork and join fast, so that simply constructing new threads remains viable. There is a debate in systems programming circles about whether creating a new thread should be incredibly lightweight or whether clients should be written not to fork threads so frequently. (Given that this debate has been ongoing for a very long time, it should be clear there are a lot of tradeoffs involved.)
There are a lot of other abstractions which try to do what you really want. They have names such as "threadpools," "task executors" (or just "executors"), and "futures." All of them tend to map onto threads by creating some set of threads, often related to the number of hardware cores in the system, and then having each of those threads loop and look for requests.
As the comments indicated, the main way you would do this yourself is to have threads with a top-level loop that accepts execution requests, processes them, and then posts the results. To do this you will need to use other synchronization methods such as mutexes and condition variables. It is generally faster to do things this way if there are a lot of requests and requests are not incredibly large.
As much as standard C++ concurrency support is a good thing, it is also rather significantly lacking for real-world, high-performance work. Something like Intel's TBB is far more of an industrial-strength solution.
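For a flavor of what that looks like, here is a small sketch against the classic TBB headers (the include paths and the per-element work are placeholders; newer oneTBB releases use the oneapi/tbb.h umbrella header):

#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <vector>

void process(std::vector<int>& a)
{
    // TBB splits [0, a.size()) into sub-ranges and schedules them
    // onto its internal worker pool; no manual thread management.
    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, a.size()),
        [&](const tbb::blocked_range<std::size_t>& r) {
            for (std::size_t i = r.begin(); i != r.end(); ++i)
                a[i] *= 2;  // placeholder per-element work
        });
}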
By piecing together some code from different online searches, the following works, but it is not as fast as the approach that regenerates the threads at each iteration of the while loop.
Perhaps someone can comment on this approach.
The following class describes the thread pool
class ThreadPool {
public:
    ThreadPool(int threads) : shutdown_(false) {
        threads_.reserve(threads);
        for (int i = 0; i < threads; ++i)
            threads_.emplace_back(std::bind(&ThreadPool::threadEntry, this, i));
    }

    ~ThreadPool() {
        {
            // Unblock any threads and tell them to stop
            std::unique_lock<std::mutex> l(lock_);
            shutdown_ = true;
            condVar_.notify_all();
        }
        // Wait for all threads to stop
        std::cerr << "Joining threads" << std::endl;
        for (auto& thread : threads_) thread.join();
    }

    void doJob(std::function<void(void)> func) {
        // Place a job on the queue and unblock a thread
        std::unique_lock<std::mutex> l(lock_);
        jobs_.emplace(std::move(func));
        condVar_.notify_one();
    }

    void threadEntry(int i) {
        std::function<void(void)> job;
        while (1) {
            {
                std::unique_lock<std::mutex> l(lock_);
                while (!shutdown_ && jobs_.empty()) condVar_.wait(l);
                if (jobs_.empty()) {
                    // No jobs to do and we are shutting down
                    std::cerr << "Thread " << i << " terminates" << std::endl;
                    return;
                }
                std::cerr << "Thread " << i << " does a job" << std::endl;
                job = std::move(jobs_.front());
                jobs_.pop();
            }
            // Do the job without holding any locks
            job();
        }
    }

private:
    // These member declarations were missing from the snippet as posted:
    std::mutex lock_;
    std::condition_variable condVar_;
    bool shutdown_;
    std::queue<std::function<void(void)>> jobs_;
    std::vector<std::thread> threads_;
};
Here is the rest of the code
void pFind(
    vector<int>& a,
    int n,
    std::atomic<bool>& flag,
    int k,
    int numTh,
    int val,
    std::atomic<int>& completed) {
    int i = k;
    while (i < n) {
        if (a[i] == val) {
            flag = true;
            break;
        } else
            i += numTh;
    }
    completed++;
}

int main() {
    vector<int> a(100000, 0); // placeholder; the real data comes from elsewhere in the asker's code
    int size = (int)a.size();
    std::atomic<bool> flag;
    flag = false;
    int numTh = 8;
    int val = 1000;
    int pos = 0;
    std::atomic<int> completed;
    completed = 0;
    ThreadPool p(numTh); // was "numThreads", which is never declared
    while (!flag) {
        for (int i = 0; i < numTh; i++) {
            p.doJob(std::bind(pFind, std::ref(a), size, std::ref(flag), i, numTh, val, std::ref(completed)));
        }
        while (completed < numTh) {} // busy-wait until all jobs of this round finish
        if (flag) {
            break;
        } else {
            completed = 0;
            val--;
        }
    }
    cout << val << "\n";
    return 0;
}
Watch out for data races: if flag were a plain bool, concurrent writes to it would be undefined behavior; it needs to stay a std::atomic<bool> (or std::atomic_flag), as in your code.
To answer your question: you're recreating the threads vector each iteration of the loop, which you can avoid by moving its declaration outside the loop body. Reusing the threads themselves is a much more complex topic that's hard to get right or describe concisely.
vector<thread> threads;
threads.reserve(numTh);
while (!flag) {
    for (size_t i = 0; i < numTh; ++i)
        threads.emplace_back(pFind, std::ref(a), size, std::ref(flag), i, numTh, val); // std::ref needed: atomics can't be copied
    for (auto &th : threads)
        th.join();
    threads.clear();
}

How can I refactor this code into a multi-threaded version?

There is a loop that takes quite a long time, and I'm considering refactoring this code into a multi-threaded version. Here is the model:

Photon photon;
for (int i = 0; i < 1000000; ++i) {
    func() {
        photon.launch(args...) {
            // do something
        }
    }
}

I have to call this function thousands and thousands of times. So I was wondering how I can create some threads to run this function at the same time. But the photon has to be an individual instance every single time.
The index i can be converted to this:

atomic<int> i{0};
while (i < 1000000) {
    func() {
        photon.launch(args...) {
            // do something
            ++i;
        }
    }
}
With threading you have to pay attention to object lifetime and sharing far more than normal.
But the basic solution is
#include <cstddef>
#include <functional>
#include <future>
#include <thread>
#include <vector>

void do_tasks( std::size_t count, std::function<void( std::size_t start, std::size_t finish )> task ) {
    auto thread_count = std::thread::hardware_concurrency();
    if (thread_count <= 0) thread_count = 1;
    std::vector<std::future<void>> threads( thread_count-1 );
    auto get_task = [=](std::size_t index) {
        auto start = count * index / thread_count;
        auto finish = count * (index+1) / thread_count;
        // std::cout << "from " << start << " to " << finish << "\n";
        return [task, start, finish]{ task(start, finish); };
    };
    for( auto& thread : threads ) {
        auto index = &thread - threads.data();
        thread = std::async( std::launch::async, get_task(index) );
    }
    get_task( threads.size() )(); // run the last chunk on the calling thread
    for (auto& thread : threads) {
        thread.get();
    }
}
This is a little multithreading library.
You use it like this:

do_tasks( 100, [&](size_t start, size_t finish) {
    // do subtasks starting at index start, up to and not including finish
});

There are other, more complex threading libraries, but writing a small half-decent one isn't hard, so I did it.
To be explicit:

Photon photon;
do_tasks( 1000000, [&](size_t start, size_t finish) {
    for (size_t i = start; i < finish; ++i) {
        photon.launch(/* args... */);
    }
});

but you'll have to be extremely careful making sure there is no unsafe data sharing between the threads, and that you aren't just blocking each thread on a common mutex.
Live example
An awful lot depends on how, and to what extent, photon.launch() can be parallelised.
The code below divides a range into (approximately) equal segments and then executes each segment in a separate thread.
As stated whether that helps will depend on how much of photon.launch() can be done in parallel. If it spends most of its time modifying a shared state and essentially has the form:
void launch(int index){
    std::lock_guard<std::mutex> guard{m};
    //.....
}
Where m is a member of Photon then little if anything will be gained.
If (at the other extreme) the individual calls to launch never contend for the same data then it can be parallelised up to the number of cores the system can provide.
#include <thread>
#include <vector>

class Photon {
public:
    void launch(int index){
        //... what goes here matters a lot...
    }
};

void photon_launch(Photon& photon, int from, int to){
    for(auto i = from; i <= to; ++i){
        photon.launch(i);
    }
}

int main() {
    const size_t loop_count = 100000; //How big is the loop?
    const size_t thread_count = 4;    //How many threads can we utilize?
    std::vector< std::thread > threads;
    Photon photon;
    int from = 1;
    for(size_t i = 1; i <= thread_count; ++i){
        //If loop_count isn't divisible by thread_count this evens out the remainder.
        int to = (loop_count * i) / thread_count;
        threads.emplace_back(photon_launch, std::ref(photon), from, to);
        from = to + 1;
    }
    //Now the threads are launched we block until they all finish.
    //If we don't, the program may (will?) finish before the threads.
    for(auto& curr : threads){
        curr.join();
    }
    return 0;
}

How to apply a concurrent solution to a Producer-Consumer like situation

I have an XML file with a sequence of nodes. Each node represents an element that I need to parse and add to a sorted list (the order must be the same as that of the nodes found in the file).
At the moment I am using a sequential solution:
struct Graphic
{
    bool parse()
    {
        // parsing...
        return parse_outcome;
    }
};

vector<unique_ptr<Graphic>> graphics;

void producer()
{
    for (size_t i = 0; i < N_GRAPHICS; i++)
    {
        auto g = new Graphic();
        if (g->parse())
            graphics.emplace_back(g);
        else
            delete g;
    }
}
So, only if the graphic (which is actually an instance of a class derived from Graphic, a Line, a Rectangle and so on; that is why the new) can be properly parsed will it be added to my data structure.
Since I only care about the order in which these graphics are added to my list, I thought of calling the parse method asynchronously, such that the producer has the task of reading each node from the file and adding the graphic to the data structure, while the consumers have the task of parsing each graphic whenever a new one is ready to be parsed.
Now I have several consumer threads (created in the main) and my code looks like the following:
queue<pair<Graphic*, size_t>> q;
mutex m;
atomic<size_t> n_elements;

void producer()
{
    for (size_t i = 0; i < N_GRAPHICS; i++)
    {
        auto g = new Graphic();
        graphics.emplace_back(g);
        q.emplace(make_pair(g, i));
    }
    n_elements = graphics.size();
}

void consumer()
{
    pair<Graphic*, size_t> item;
    while (true)
    {
        {
            std::unique_lock<std::mutex> lk(m);
            if (n_elements == 0)
                return;
            n_elements--;
            item = q.front();
            q.pop();
        }
        if (!item.first->parse())
        {
            // here I should remove the item from the vector
            assert(graphics[item.second].get() == item.first);
            graphics[item.second] = nullptr; // the unique_ptr deletes the Graphic; a manual delete here would double-free
        }
    }
}
I run the producer first of all in my main, so that when the first consumer starts the queue is already completely full.
int main()
{
    producer();
    vector<thread> threads;
    for (auto i = 0; i < N_THREADS; i++)
        threads.emplace_back(consumer);
    for (auto& t : threads)
        t.join();
    return 0;
}
The concurrent version seems to be at least twice as fast as the original one.
The full code has been uploaded here.
Now I am wondering:
Are there any (synchronization) errors in my code?
Is there a way to achieve the same result faster (or better)?
Also, I noticed that on my computer I get the best result (in terms of elapsed time) if I set the number of threads equal to 8. More (or fewer) threads give me worse results. Why?
Are there any (synchronization) errors in my code?

There are no synchronization errors, but the memory management could be better: you will have leaks if parse() throws an exception.

Is there a way to achieve the same result faster (or better)?

Probably. You could use a simple implementation of a thread pool and a lambda that does the parse() for you.
The code below illustrates this approach; I use the threadpool implementation from here.
#include <iostream>
#include <stdexcept>
#include <vector>
#include <memory>
#include <chrono>
#include <future>
#include <utility>
#include <cassert>
#include <ThreadPool.h>

using namespace std;
using namespace std::chrono;

#define N_GRAPHICS (1000*1000*1)
#define N_THREADS 8

struct Graphic;
using GPtr = std::unique_ptr<Graphic>;
static vector<GPtr> graphics;

struct Graphic
{
    Graphic()
        : status(false)
    {
    }
    bool parse()
    {
        // waste time
        try
        {
            throw runtime_error("");
        }
        catch (const runtime_error&)
        {
        }
        status = true;
        //return false;
        return true;
    }
    bool status;
};

int main()
{
    auto start = system_clock::now();
    auto producer_unit = []() -> GPtr {
        std::unique_ptr<Graphic> g(new Graphic);
        if (!g->parse()) {
            g.reset(); // if g doesn't parse, return nullptr
        }
        return g;
    };

    using ResultPool = std::vector<std::future<GPtr>>;
    ResultPool results;

    // ThreadPool pool(thread::hardware_concurrency());
    ThreadPool pool(N_THREADS);
    for (int i = 0; i < N_GRAPHICS; ++i) {
        // Running async task
        results.emplace_back(pool.enqueue(producer_unit));
    }

    for (auto& t : results) {
        auto value = t.get();
        if (value) {
            graphics.emplace_back(std::move(value));
        }
    }

    auto duration = duration_cast<milliseconds>(system_clock::now() - start);
    cout << "Elapsed: " << duration.count() << endl;

    for (size_t i = 0; i < graphics.size(); i++)
    {
        if (!graphics[i]->status)
        {
            cerr << "Assertion failed! (" << i << ")" << endl;
            break;
        }
    }
    cin.get();
    return 0;
}
It is a bit faster (about 1 s) on my machine, more readable, and removes the need for shared data (synchronization is evil; avoid it, or hide it in a reliable and efficient way).

How to multithread reading a file in c++11?

I have a big file, and I have to read it chunk by chunk. Each time I read a chunk, I have to do some time-consuming operation, so I think multithreaded reading might help: each thread reads a chunk, one after another, and does its operation. Here is my C++11 code:
#include <iostream>
#include <fstream>
#include <condition_variable>
#include <mutex>
#include <thread>
#include <chrono>   // was missing: needed for chrono::milliseconds
#include <cstdio>   // was missing: needed for sprintf

using namespace std;

const int CHAR_PER_FILE = 1e8;
const int NUM_THREAD = 2;
int order = -1;
bool is_reading = false;
mutex mtx;
condition_variable file_not_reading;

void partition(ifstream& is)
{
    while (is.peek() != EOF)
    {
        unique_lock<mutex> lock(mtx);
        while (is_reading)
            file_not_reading.wait(lock);
        is_reading = true;
        char *c = new char[CHAR_PER_FILE];
        is.read(c, CHAR_PER_FILE);
        order++;
        is_reading = false;
        file_not_reading.notify_all();
        lock.unlock();
        char oc[3];
        sprintf(oc, "%d", order);
        this_thread::sleep_for(chrono::milliseconds(2000)); //some operations that take a long time
        ofstream os(oc, ios::binary);
        os.write(c, CHAR_PER_FILE);
        delete[] c;
        os.close();
    }
}

int main()
{
    ifstream is("bigfile.txt", ios::binary);
    thread threads[NUM_THREAD];
    for (int i = 0; i < NUM_THREAD; i++)
        threads[i] = thread(partition, ref(is));
    for (int i = 0; i < NUM_THREAD; i++)
        threads[i].join();
    is.close();
    system("pause");
    return 0;
}
But my code didn't work; it only created 4 files instead of bigfilesize/CHAR_PER_FILE files, and the threads seem to get stuck. How can I make it work?
Is there any C++11 multithreaded file-reading implementation or example?
Thanks.
My advice:
Use one thread to read chunks from the file. Every time a chunk is read, post it to a request queue. It is not worth reading multithreaded, as there will be internal locks/blocking when reading a common resource.
Use a pool of threads. Each of them reads from the queue, retrieves a chunk, executes the expensive operation and goes back to waiting for a new request.
The queue must be mutex protected.
Don't use more threads than the number of processing units (CPUs/cores/hyperthreads) you have.
The main caveat of the above is that it will not guarantee the processing order. You will probably need to post the results to a central place that can reorder (again, a central place, which must be mutex protected). A sketch of this layout follows.
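A minimal sketch of that layout (all names are mine; error handling and the reordering step are left out):

#include <condition_variable>
#include <fstream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

std::queue<std::vector<char>> chunks;  // the mutex-protected request queue
std::mutex m;
std::condition_variable cv;
bool reading_done = false;

void reader(std::ifstream& is, std::size_t chunk_size)
{
    // The single reader: pulls chunks off the file and posts them.
    while (is) {
        std::vector<char> c(chunk_size);
        is.read(c.data(), chunk_size);
        c.resize(is.gcount());
        if (c.empty()) break;
        {
            std::lock_guard<std::mutex> lk(m);
            chunks.push(std::move(c));
        }
        cv.notify_one();
    }
    {
        std::lock_guard<std::mutex> lk(m);
        reading_done = true;
    }
    cv.notify_all();
}

void worker()
{
    // Pool thread: waits for a chunk, processes it, repeats.
    for (;;) {
        std::vector<char> c;
        {
            std::unique_lock<std::mutex> lk(m);
            cv.wait(lk, [] { return !chunks.empty() || reading_done; });
            if (chunks.empty()) return;  // queue drained and reader finished
            c = std::move(chunks.front());
            chunks.pop();
        }
        // ... run the expensive operation on c, with no locks held ...
    }
}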
You could use task-based parallelism with std::async:
class result; // result of expensive operation

result expensive_operation(std::vector<char> const& data)
{
    result r = // long computation
    return r;
}

std::vector<char>::size_type BLOCK_SIZE = 4096;

std::vector<std::future<result>> partition(std::ifstream& in)
{
    std::vector<std::future<result>> tasks;
    while (!in.eof() && !in.fail())
    {
        std::vector<char> c(BLOCK_SIZE);
        in.read(c.data(), BLOCK_SIZE); // was "is.read"; the parameter is named "in"
        c.resize(in.gcount());
        tasks.push_back( std::async( [](std::vector<char> data)
            {
                return expensive_operation(data);
            },
            std::move(c) ) );
    }
    return tasks;
}

int main()
{
    std::ifstream is("bigfile.txt", std::ios::binary);
    auto results = partition(is);
    // iterate over results and do something with it
}
Does the file have to be read in "sequential" order, i.e. do the chunks have to be operated on in a special order? Otherwise you could, e.g., make 4 threads and let each thread read 1/4 of the file (you could do this by using tellg and saving the positions in e.g. a vector or variable); a sketch of this follows. That way you wouldn't have to use locks.
Maybe you could tell us how the data you read in has to be evaluated.
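A sketch of that idea, assuming the chunks really are independent (each thread opens its own stream and seeks to its quarter, so no stream is shared; real code would also have to align the split points to record boundaries):

#include <fstream>
#include <string>
#include <thread>
#include <vector>

void read_part(const std::string& path, std::streamoff begin, std::streamoff length)
{
    std::ifstream in(path, std::ios::binary); // private stream per thread: no locks needed
    in.seekg(begin);
    std::vector<char> buf(static_cast<std::size_t>(length));
    in.read(buf.data(), length);
    // ... process buf ...
}

int main()
{
    const std::string path = "bigfile.txt";
    std::ifstream probe(path, std::ios::binary | std::ios::ate);
    const std::streamoff size = probe.tellg(); // total file size
    const int n = 4;
    std::vector<std::thread> threads;
    for (int i = 0; i < n; ++i) {
        std::streamoff begin = size * i / n;
        std::streamoff end = size * (i + 1) / n;
        threads.emplace_back(read_part, path, begin, end - begin);
    }
    for (auto& t : threads)
        t.join();
}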
Perhaps...

void partition(ifstream& is)
{
    unique_lock<mutex> lock(mtx);
    std::vector<char> c(CHAR_PER_FILE);
    is.read(c.data(), CHAR_PER_FILE);
    if (is.fail() && !is.eof()) return;
    size_t num_bytes_read = is.gcount();
    int my_order = ++order; // take this chunk's sequence number while still holding the lock
    lock.unlock();
    std::ostringstream oc;
    oc << my_order;
    this_thread::sleep_for(chrono::milliseconds(2000)); //some operation that takes a long time
    std::ofstream os(oc.str(), ios::binary);
    if (os)
        os.write(c.data(), num_bytes_read);
}
Notes:
The mutex serialises the reads already; no need for a condition variable.
I've added a little input-error and bytes-read handling; you should check after os.write() too, add an else for failed ofstream creation, etc.

Thread pooling in C++11

Relevant questions:
About C++11:
C++11: std::thread pooled?
Will async(launch::async) in C++11 make thread pools obsolete for avoiding expensive thread creation?
About Boost:
C++ boost thread reusing threads
boost::thread and creating a pool of them!
How do I get a pool of threads to send tasks to, without creating and deleting them over and over again? In other words, I want persistent threads that I can resynchronize with, without joining them.
I have code that looks like this:

#include <algorithm>
#include <thread>
#include <vector>

namespace {
    std::vector<std::thread> workers;
    int total = 4;
    int arr[5] = {0}; // element 4 holds the minimum (int arr[4] would make arr[4] out of bounds)

    void each_thread_does(int i) {
        arr[i] += 2;
    }
}

int main(int argc, char *argv[]) {
    for (int i = 0; i < 8; ++i) { // for 8 iterations,
        for (int j = 0; j < 4; ++j) {
            workers.push_back(std::thread(each_thread_does, j));
        }
        for (std::thread &t : workers) {
            if (t.joinable()) {
                t.join();
            }
        }
        arr[4] = *std::min_element(arr, arr + 4); // min_element returns an iterator; dereference it
    }
    return 0;
}
Instead of creating and joining threads each iteration, I'd prefer to send tasks to my worker threads each iteration and only create them once.
This is adapted from my answer to another very similar post.
Let's build a ThreadPool class:
class ThreadPool {
public:
    void Start();
    void QueueJob(const std::function<void()>& job);
    void Stop();
    bool busy(); // declared void in the original post, but it returns a value below

private:
    void ThreadLoop();

    bool should_terminate = false;           // Tells threads to stop looking for jobs
    std::mutex queue_mutex;                  // Prevents data races to the job queue
    std::condition_variable mutex_condition; // Allows threads to wait on new jobs or termination
    std::vector<std::thread> threads;
    std::queue<std::function<void()>> jobs;
};
ThreadPool::Start
For an efficient threadpool implementation, once threads are created according to num_threads, it's better not to create new ones or destroy old ones (by joining). There will be a performance penalty, and it might even make your application go slower than the serial version. Thus, we keep a pool of threads that can be used at any time (if they aren't already running a job).
Each thread should be running its own infinite loop, constantly waiting for new tasks to grab and run.
void ThreadPool::Start() {
    const uint32_t num_threads = std::thread::hardware_concurrency(); // Max # of threads the system supports
    threads.resize(num_threads);
    for (uint32_t i = 0; i < num_threads; i++) {
        threads.at(i) = std::thread(&ThreadPool::ThreadLoop, this); // a member function needs the object pointer
    }
}
ThreadPool::ThreadLoop
The infinite loop function. This is a while (true) loop waiting for the task queue to open up.
void ThreadPool::ThreadLoop() {
    while (true) {
        std::function<void()> job;
        {
            std::unique_lock<std::mutex> lock(queue_mutex);
            mutex_condition.wait(lock, [this] {
                return !jobs.empty() || should_terminate;
            });
            if (should_terminate) {
                return;
            }
            job = jobs.front();
            jobs.pop();
        }
        job();
    }
}
ThreadPool::QueueJob
Add a new job to the pool; use a lock so that there isn't a data race.
void ThreadPool::QueueJob(const std::function<void()>& job) {
    {
        std::unique_lock<std::mutex> lock(queue_mutex);
        jobs.push(job);
    }
    mutex_condition.notify_one();
}
To use it:
thread_pool->QueueJob([] { /* ... */ });
ThreadPool::busy

bool ThreadPool::busy() {
    bool poolbusy;
    {
        std::unique_lock<std::mutex> lock(queue_mutex);
        poolbusy = !jobs.empty(); // busy while jobs are still queued (was "jobs.empty()", which is inverted)
    }
    return poolbusy;
}

The busy() function can be used in a while loop, such that the main thread can wait for the thread pool to complete all the tasks before calling the thread pool destructor.
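For instance, a crude wait loop might look like this (my sketch, not part of the original answer; note that busy() only reports queued jobs, not jobs currently executing):

while (thread_pool.busy()) {
    std::this_thread::sleep_for(std::chrono::milliseconds(1)); // poll until the queue drains
}
thread_pool.Stop();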
ThreadPool::Stop
Stop the pool.
void ThreadPool::Stop() {
    {
        std::unique_lock<std::mutex> lock(queue_mutex);
        should_terminate = true;
    }
    mutex_condition.notify_all();
    for (std::thread& active_thread : threads) {
        active_thread.join();
    }
    threads.clear();
}
Once you integrate these ingredients, you have your own dynamic threading pool. These threads always run, waiting for jobs to do.
I apologize if there are some syntax errors; I typed this code from memory, and my memory is bad. Sorry that I cannot provide you the complete thread pool code; that would violate my job integrity.
Notes:
The anonymous code blocks are used so that when they are exited, the std::unique_lock variables created within them go out of scope, unlocking the mutex.
ThreadPool::Stop will not terminate any currently running jobs; it just waits for them to finish via active_thread.join().
You can use the C++ Thread Pool Library, https://github.com/vit-vit/ctpl.
Then the code you wrote can be replaced with the following:

#include <ctpl.h> // or <ctpl_stl.h> if you do not have the Boost library

int main(int argc, char *argv[]) {
    ctpl::thread_pool p(2 /* two threads in the pool */);
    int arr[5] = {0}; // element 4 holds the minimum, as in the question
    std::vector<std::future<void>> results(4);
    for (int i = 0; i < 8; ++i) { // for 8 iterations,
        for (int j = 0; j < 4; ++j) {
            results[j] = p.push([&arr, j](int) { arr[j] += 2; });
        }
        for (int j = 0; j < 4; ++j) {
            results[j].get();
        }
        arr[4] = *std::min_element(arr, arr + 4); // min_element returns an iterator
    }
}

You will get the desired number of threads and will not create and delete them over and over again across the iterations.
A pool of threads means that all your threads are running, all the time – in other words, the thread function never returns. To give the threads something meaningful to do, you have to design a system of inter-thread communication, both for the purpose of telling the thread that there's something to do, as well as for communicating the actual work data.
Typically this will involve some kind of concurrent data structure, and each thread would presumably sleep on some kind of condition variable, which would be notified when there's work to do. Upon receiving the notification, one or several of the threads wake up, recover a task from the concurrent data structure, process it, and store the result in an analogous fashion.
The thread would then go on to check whether there's even more work to do, and if not go back to sleep.
The upshot is that you have to design all this yourself, since there isn't a natural notion of "work" that's universally applicable. It's quite a bit of work, and there are some subtle issues you have to get right. (You can program in Go if you like a system which takes care of thread management for you behind the scenes.)
A threadpool is at core a set of threads all bound to a function working as an event loop. These threads will endlessly wait for a task to be executed, or their own termination.
The threadpool job is to provide an interface to submit jobs, define (and perhaps modify) the policy of running these jobs (scheduling rules, thread instantiation, size of the pool), and monitor the status of the threads and related resources.
So for a versatile pool, one must start by defining what a task is, how it is launched, interrupted, what is the result (see the notion of promise and future for that question), what sort of events the threads will have to respond to, how they will handle them, how these events shall be discriminated from the ones handled by the tasks. This can become quite complicated as you can see, and impose restrictions on how the threads will work, as the solution becomes more and more involved.
The current tooling for handling events is fairly barebones(*): primitives like mutexes, condition variables, and a few abstractions on top of those (locks, barriers). But in some cases these abstractions may turn out to be unfit (see this related question), and one must revert to using the primitives.
Other problems have to be managed too:
signal
i/o
hardware (processor affinity, heterogenous setup)
How would these play out in your setting?
This answer to a similar question points to an existing implementation meant for boost and the stl.
I offered a very crude implementation of a threadpool for another question, which doesn't address many problems outlined above. You might want to build up on it. You might also want to have a look of existing frameworks in other languages, to find inspiration.
(*) I don't see that as a problem, quite to the contrary. I think it's the very spirit of C++ inherited from C.
Following PhD EcE's suggestion (https://stackoverflow.com/users/3818417/phd-ece), I implemented the thread pool:
function_pool.h
#pragma once
#include <queue>
#include <functional>
#include <mutex>
#include <condition_variable>
#include <atomic>
#include <cassert>

class Function_pool
{
private:
    std::queue<std::function<void()>> m_function_queue;
    std::mutex m_lock;
    std::condition_variable m_data_condition;
    std::atomic<bool> m_accept_functions;

public:
    Function_pool();
    ~Function_pool();
    void push(std::function<void()> func);
    void done();
    void infinite_loop_func();
};
function_pool.cpp
#include "function_pool.h"
Function_pool::Function_pool() : m_function_queue(), m_lock(), m_data_condition(), m_accept_functions(true)
{
}
Function_pool::~Function_pool()
{
}
void Function_pool::push(std::function<void()> func)
{
std::unique_lock<std::mutex> lock(m_lock);
m_function_queue.push(func);
// when we send the notification immediately, the consumer will try to get the lock , so unlock asap
lock.unlock();
m_data_condition.notify_one();
}
void Function_pool::done()
{
std::unique_lock<std::mutex> lock(m_lock);
m_accept_functions = false;
lock.unlock();
// when we send the notification immediately, the consumer will try to get the lock , so unlock asap
m_data_condition.notify_all();
//notify all waiting threads.
}
void Function_pool::infinite_loop_func()
{
std::function<void()> func;
while (true)
{
{
std::unique_lock<std::mutex> lock(m_lock);
m_data_condition.wait(lock, [this]() {return !m_function_queue.empty() || !m_accept_functions; });
if (!m_accept_functions && m_function_queue.empty())
{
//lock will be release automatically.
//finish the thread loop and let it join in the main thread.
return;
}
func = m_function_queue.front();
m_function_queue.pop();
//release the lock
}
func();
}
}
main.cpp
#include "function_pool.h"
#include <string>
#include <iostream>
#include <mutex>
#include <functional>
#include <thread>
#include <vector>
Function_pool func_pool;
class quit_worker_exception : public std::exception {};
void example_function()
{
std::cout << "bla" << std::endl;
}
int main()
{
std::cout << "stating operation" << std::endl;
int num_threads = std::thread::hardware_concurrency();
std::cout << "number of threads = " << num_threads << std::endl;
std::vector<std::thread> thread_pool;
for (int i = 0; i < num_threads; i++)
{
thread_pool.push_back(std::thread(&Function_pool::infinite_loop_func, &func_pool));
}
//here we should send our functions
for (int i = 0; i < 50; i++)
{
func_pool.push(example_function);
}
func_pool.done();
for (unsigned int i = 0; i < thread_pool.size(); i++)
{
thread_pool.at(i).join();
}
}
You can use thread_pool from the Boost library:

void my_task() { ... }

int main() {
    int threadNumbers = thread::hardware_concurrency();
    boost::asio::thread_pool pool(threadNumbers);

    // Submit a function to the pool.
    boost::asio::post(pool, my_task);

    // Submit a lambda object to the pool.
    boost::asio::post(pool, []() {
        ...
    });
}
You can also use threadpool from the open source community:

void first_task() { ... }
void second_task() { ... }

int main() {
    int threadNumbers = thread::hardware_concurrency();
    pool tp(threadNumbers);

    // Add some tasks to the pool.
    tp.schedule(&first_task);
    tp.schedule(&second_task);
}
Something like this might help (taken from a working app).
#include <memory>
#include <boost/asio.hpp>
#include <boost/thread.hpp>

struct thread_pool {
    typedef std::unique_ptr<boost::asio::io_service::work> asio_worker;

    thread_pool(int threads) : service(), service_worker(new asio_worker::element_type(service)) {
        for (int i = 0; i < threads; ++i) {
            auto worker = [this] { return service.run(); };
            grp.add_thread(new boost::thread(worker));
        }
    }

    template<class F>
    void enqueue(F f) {
        service.post(f);
    }

    ~thread_pool() {
        service_worker.reset();
        grp.join_all();
        service.stop();
    }

private:
    boost::asio::io_service service;
    asio_worker service_worker;
    boost::thread_group grp;
};
You can use it like this:

thread_pool pool(2);

pool.enqueue([] {
    std::cout << "Hello from Task 1\n";
});

pool.enqueue([] {
    std::cout << "Hello from Task 2\n";
});
Keep in mind that reinventing an efficient asynchronous queuing mechanism is not trivial.
Boost::asio::io_service is a very efficient implementation; actually, it is a collection of platform-specific wrappers (e.g. it wraps I/O completion ports on Windows).
Edit: This now requires C++17 and concepts. (As of 9/12/16, only g++ 6.0+ is sufficient.)
The template deduction is a lot more accurate because of it, though, so it's worth the effort of getting a newer compiler. I've not yet found a function that requires explicit template arguments.
It also now takes any appropriate callable object (and is still statically typesafe!!!).
It also now includes an optional green threading priority thread pool using the same API. This class is POSIX only, though. It uses the ucontext_t API for userspace task switching.
I created a simple library for this. An example of usage is given below. (I'm answering this because it was one of the things I found before I decided it was necessary to write it myself.)
bool is_prime(int n){
    // Determine if n is prime.
}

int main(){
    thread_pool pool(8); // 8 threads

    list<future<bool>> results;
    for(int n = 2; n < 10000; n++){
        // Submit a job to the pool.
        results.emplace_back(pool.async(is_prime, n));
    }

    int n = 2;
    for(auto i = results.begin(); i != results.end(); i++, n++){
        // i is an iterator pointing to a future representing the result of is_prime(n)
        cout << n << " ";
        bool prime = i->get(); // Wait for the task is_prime(n) to finish and get the result.
        if(prime)
            cout << "is prime";
        else
            cout << "is not prime";
        cout << endl;
    }
}
You can pass async any function with any (or void) return value and any (or no) arguments and it will return a corresponding std::future. To get the result (or just wait until a task has completed) you call get() on the future.
Here's the github: https://github.com/Tyler-Hardin/thread_pool.
It looks like a thread pool is a very popular problem/exercise :-)
I recently wrote one in modern C++; it is owned by me and publicly available here: https://github.com/yurir-dev/threadpool
It supports templated return values, core pinning, and ordering of some tasks. The whole implementation is in two .h files.
So, the original question would look something like this:
#include "tp/threadpool.h"
int arr[5] = { 0 };
concurency::threadPool<void> tp;
tp.start(std::thread::hardware_concurrency());
std::vector<std::future<void>> futures;
for (int i = 0; i < 8; ++i) { // for 8 iterations,
for (int j = 0; j < 4; ++j) {
futures.push_back(tp.push([&arr, j]() {
arr[j] += 2;
}));
}
}
// wait until all pushed tasks are finished.
for (auto& f : futures)
f.get();
// or just tp.end(); // will kill all the threads
arr[4] = *std::min_element(arr, arr + 4);
I found that pending tasks' future.get() calls hang on the caller side if the thread pool gets terminated and leaves some tasks inside the task queue. How can I set a future exception inside the thread pool when the task is only wrapped in a std::function?
template <class F, class... Args>
std::future<std::result_of_t<F(Args...)>> enqueue(F &&f, Args &&...args) {
    using return_type = std::result_of_t<F(Args...)>; // this alias was missing from the snippet
    auto task = std::make_shared<std::packaged_task<return_type()>>(
        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
    std::future<return_type> res = task->get_future();
    {
        std::unique_lock<std::mutex> lock(_mutex);
        _tasks.push([task]() -> void { (*task)(); });
    }
    return res;
}

class StdThreadPool {
    std::vector<std::thread> _workers;
    std::priority_queue<TASK> _tasks;
    ...
};

struct TASK {
    //int _func_return_value;
    std::function<void()> _func;
    int priority;
    ...
};
The Stroika library has a threadpool implementation.
Stroika ThreadPool.h

ThreadPool p;
p.AddTask([]() { doIt(); });

Stroika's thread library also supports (cooperative) cancellation, so that when the ThreadPool above goes out of scope it cancels any running tasks (similar to C++20's jthread).