what I want to do is to push integers to my threadSafe queue implementation with multiple threads and concurrently with another series of threads pop away the inserted numbers. All of this operation has to be thread safe, but another option that I want to have is that the size of the queue must be fixed, just like a buffer. If the buffer is full all the push threads must wait the pop threads to free some slot.
This is my implementation of the queue/buffer, it seems to work but after few iterations it stops and remains blocked without any error.
#include <queue>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <iostream>
template <typename T>
class Queue
{
private:
std::queue<T> queue_;
std::mutex mutex_;
std::condition_variable cond_;
public:
T pop()
{
std::unique_lock<std::mutex> mlock(mutex_);
cond_.wait(mlock, [this]{return !queue_.empty();});
auto val = queue_.front();
queue_.pop();
return val;
}
void pop(T& item)
{
std::unique_lock<std::mutex> mlock(mutex_);
cond_.wait(mlock, [this]{return !queue_.empty();});
item = queue_.front();
queue_.pop();
}
void push(const T& item, int buffer)
{
std::unique_lock<std::mutex> mlock(mutex_);
while (queue_.size() >= buffer)
{
cond_.wait(mlock);
}
queue_.push(item);
mlock.unlock();
cond_.notify_one();
}
Queue()=default;
Queue(const Queue&) = delete; // disable copying
Queue& operator=(const Queue&) = delete; // disable assignment
};
The size of the buffer is defined in the push function with the variable buffer. This is an example of usage:
void prepare(Queue<int>& loaded, int buffer, int num_frames)
{
for (int i = 0; i < num_frames; i++)
{
cout<< "push "<<i<<endl;
loaded.push(i, buffer);
}
}
void load (vector<Frame>& movie, Queue<int>& loaded, int num_frames,
int num_points, int buffer, int height, int width)
{
for (int i = 0; i < num_frames; i++)
{
int num = loaded.pop();
cout<< "pop "<<num<<endl;
}
}
int main()
{
srand(time(NULL));
int num_threadsXstage = 4;
int width = 500;
int height = 500;
int num_points = width * height;
int num_frames = 100;
int frames_thread = num_frames/num_threadsXstage;
int preset = 3;
int buffer = 10;
//Vectors of threads
vector<thread> loader;
//Final vector
vector<Frame> movie;
movie.resize(num_frames);
//Working queues
Queue<int> loaded;
//Prepare loading queue task
thread preparator(prepare, ref(loaded), buffer, num_frames);
for (int i = 0; i < num_threadsXstage; i++)
{
//stage 1
loader.push_back(thread(&load, ref(movie), ref(loaded), frames_thread,
num_points, buffer, height, width));
}
// JOIN
preparator.join();
join_all(loader);
return 0;
}
Your pop functions could allow a thread waiting to push to make forward progress, but they don't call any notify function. You must call the appropriate notify function any time you may make it possible for a thread blocked on the condition variable to make forward progress.
Although it's quite complex to explain why, you should either call notify_all or call notify_one while still holding the lock. It is theoretically possible to "wake the wrong thread" otherwise because you are using the same condition variable for two predicates (the queue is not empty and the queue is not full).
To avoid very hard to understand failure modes, always do one of these three things:
Do not use the same condition variable to handle more than one predicate. For example, use one condition variable for "not empty" and another for "not full";
Always use notify_all, never notify_one; or
Always call notify functions while holding the mutex.
So long as you follow at least one of these three rules, you will avoid an obscure failure mode where you wake only a thread that chose to sleep after you released the mutex while leaving the only thread that could handle the condition still blocked.
Related
For study purposes, I’m comparing implementations of single producer single consumer queues. So I compared a condition variable implementation with a C++20 counting semaphore implementation. I would have guessed that the semaphore implementation would be faster, but that is not the case. Under Windows, MSVC, on my computer, the semaphore implementation is about 25% slower. I’ve included both implementations below.
The condition variable implementation has a small functional advantage: aborting operations can be achieved with the done() API function, while the semaphore implementation requires a special ‘stop’ value to be queued to unlock and exit the pulling thread.
In my imagination, a single producer single consumer queue was a typical application for semaphores, but apparently not.
Now I wonder:
Did I do something not clever so that my semaphore implementation is needlessly slow?
Is possibly the Microsoft counting semaphore implementation too slow?
Or do requirements in the C++ standard make the semaphore slow in general?
Am I just mistaken that a queue is proper application for semaphores?
When a queue is not a proper application, for what other application does the semaphore outperform the condition variable?
Condition variable implementation:
#include <array>
#include <mutex>
#include <condition_variable>
/*
* locked_single_producer_single_consumer_queue_T is responsible for locked packet communication
* between 2 threads. One thread pushes, the other thread pulls.
*/
template<class T, int N = 16> // N must be a power 2
class locked_single_producer_single_consumer_queue_T
{
public:
/* When packet fits in the queue, then push shall return immediatelly. Otherwise it will block until it can push the packet. */
void push(T const& packet)
{
std::unique_lock<std::mutex> lock(m_mutex);
m_cv.wait(lock, [this] {return ((m_tail - m_head) & m_mask) != 1; });
m_data[m_head++] = packet;
m_head &= m_mask;
lock.unlock();
m_cv.notify_one();
}
/* When packet could be retreived from the queue, then pull shall return immediatelly. Otherwise it will block until it can pull the packet. */
bool pull(T& packet)
{
std::unique_lock<std::mutex> lock(m_mutex);
m_cv.wait(lock, [this] {return (((m_head - m_tail) & m_mask) != 0) || m_done; });
if(((m_head - m_tail) & m_mask) != 0) [[likely]]
{
packet = m_data[m_tail++];
m_tail &= m_mask;
lock.unlock();
m_cv.notify_one();
return true;
}
return false;
}
/* done() indicates that the pushing thread stopped. The pulling thread can continue reading
the remainder of the queue and should then return */
void done()
{
{
std::lock_guard<std::mutex> lock(m_mutex);
m_done = true;
}
m_cv.notify_one();
}
private:
static_assert((N& (N - 1)) == 0, "N must be a power of 2");
static signed int const m_mask = N - 1;
using data_t = std::array<T, N>;
data_t m_data;
std::mutex m_mutex;
std::condition_variable m_cv;
int m_tail{ 0 };
int m_head{ 0 };
bool m_done{};
};
Semaphore implementation:
#include <array>
#include <semaphore>
#include <atomic>
/*
* locked_single_producer_single_consumer_queue2_T is responsible for locking packet communication
* between 2 threads. One thread pushes, the other thread pulls.
*/
template<class T, int N = 16> // N must be a power 2
class locked_single_producer_single_consumer_queue2_T
{
public:
/* When packet fits in the queue, then push shall return immediatelly. Otherwise it will block until it can push the packet. */
void push(T const& packet)
{
m_available_space.acquire();
int head = m_head.load(std::memory_order_acquire);
m_data[head++ & m_mask] = packet;
m_head.store(head, std::memory_order_release);
m_available_packages.release();
}
/* When packet could be retreived from the queue, then pull shall return immediatelly. Otherwise it will block until it can pull the packet. */
T pull()
{
m_available_packages.acquire();
int tail = m_tail.load(std::memory_order_acquire);
T packet = m_data[tail++ & m_mask];
m_tail.store(tail, std::memory_order_release);
m_available_space.release();
return packet;
}
private:
static_assert((N& (N - 1)) == 0, "N must be a power of 2");
static signed int const m_mask = N - 1;
using data_t = std::array<T, N>;
data_t m_data;
std::atomic_int m_tail{ 0 };
std::atomic_int m_head{ 0 };
std::counting_semaphore<N> m_available_space{ N };
std::counting_semaphore<N> m_available_packages{ 0 };
};
*** EDIT ***
Upon request, I've also included a complete test program. It already includes both implementations. (It needs C++20 with semaphores)
#include <array>
#include <mutex>
#include <condition_variable>
#include <semaphore>
#include <atomic>
#include <iostream>
#include <vector>
#include <algorithm>
#include <future>
/*
* locked_single_producer_single_consumer_queue_T is responsible for locked packet communication
* between 2 threads. One thread pushes, the other thread pulls.
*/
template<class T, int N = 16> // N must be a power 2
class locked_single_producer_single_consumer_queue_T
{
public:
/* When packet fits in the queue, then push shall return immediatelly. Otherwise it will block until it can push the packet. */
void push(T const& packet)
{
std::unique_lock<std::mutex> lock(m_mutex);
m_cv.wait(lock, [this] {return ((m_tail - m_head) & m_mask) != 1; });
m_data[m_head++] = packet;
m_head &= m_mask;
lock.unlock();
m_cv.notify_one();
}
/* When packet could be retreived from the queue, then pull shall return immediatelly. Otherwise it will block until it can pull the packet. */
bool pull(T& packet)
{
std::unique_lock<std::mutex> lock(m_mutex);
m_cv.wait(lock, [this] {return (((m_head - m_tail) & m_mask) != 0) || m_done; });
if (((m_head - m_tail) & m_mask) != 0) [[likely]]
{
packet = m_data[m_tail++];
m_tail &= m_mask;
lock.unlock();
m_cv.notify_one();
return true;
}
return false;
}
/* done() indicates that the pushing thread stopped. The pulling thread can continue reading
the remainder of the queue and should then return */
void done()
{
{
std::lock_guard<std::mutex> lock(m_mutex);
m_done = true;
}
m_cv.notify_one();
}
private:
static_assert((N& (N - 1)) == 0, "N must be a power of 2");
static signed int const m_mask = N - 1;
using data_t = std::array<T, N>;
data_t m_data;
std::mutex m_mutex;
std::condition_variable m_cv;
int m_tail{ 0 };
int m_head{ 0 };
bool m_done{};
};
/*
* locked_single_producer_single_consumer_queue2_T is responsible for locking packet communication
* between 2 threads. One thread pushes, the other thread pulls.
*/
template<class T, int N = 16> // N must be a power 2
class locked_single_producer_single_consumer_queue2_T
{
public:
/* When packet fits in the queue, then push shall return immediatelly. Otherwise it will block until it can push the packet. */
void push(T const& packet)
{
m_available_space.acquire();
int head = m_head.load(std::memory_order_acquire);
m_data[head++ & m_mask] = packet;
m_head.store(head, std::memory_order_release);
m_available_packages.release();
}
/* When packet could be retreived from the queue, then pull shall return immediatelly. Otherwise it will block until it can pull the packet. */
T pull()
{
m_available_packages.acquire();
int tail = m_tail.load(std::memory_order_acquire);
T packet = m_data[tail++ & m_mask];
m_tail.store(tail, std::memory_order_release);
m_available_space.release();
return packet;
}
private:
static_assert((N& (N - 1)) == 0, "N must be a power of 2");
static signed int const m_mask = N - 1;
using data_t = std::array<T, N>;
data_t m_data;
std::atomic_int m_tail{ 0 };
std::atomic_int m_head{ 0 };
std::counting_semaphore<N> m_available_space{ N };
std::counting_semaphore<N> m_available_packages{ 0 };
};
/******************************************************************************************************/
using implementation_t = bool;
implementation_t const condition_variable = false;
implementation_t const semaphore = true;
/*
* pusher() is a thread function that is responsible for pushing a defined
* sequence of integers in the lock_free queue
*/
std::atomic_int sum_ref{};
template<class queue_t>
void pusher(std::atomic_bool& do_continue_token, queue_t& queue)
{
int i = 0;
while (do_continue_token.load(std::memory_order_acquire))
{
queue.push(i);
sum_ref += i;
++i;
}
}
/*
* puller() is a thread function that is responsible for pulling
* integers from the lock_free queue, and compare it with the
* expected sequence
*/
std::atomic_int sum_check{};
template<implementation_t implementation, class queue_t>
int puller(queue_t& queue)
{
int i;
if constexpr (implementation == condition_variable)
{
while (queue.pull(i))
{
sum_check += i;
}
}
if constexpr (implementation == semaphore)
{
int j;
while ((j = queue.pull()) != -1)
{
sum_check += j;
i = j;
}
}
return i;
}
/*
* test() is responsible for kicking off two threads that push and pull from
* the queue for a duration of 10s. Test returns the last integer value that was
* pulled from the queue as an indication of speed.
*/
template<implementation_t implementation, class queue_t>
int test()
{
using namespace std::chrono_literals;
std::atomic_bool do_continue_token(true);
queue_t queue;
std::cout << '<' << std::flush;
std::future<void> fpusher = std::async(pusher<queue_t>, std::ref(do_continue_token), std::ref(queue));
std::future<int> fpuller = std::async(puller<implementation, queue_t>, std::ref(queue));
std::this_thread::sleep_for(10s);
do_continue_token.store(false, std::memory_order_release);
fpusher.wait();
if constexpr (implementation == condition_variable)
{
queue.done(); // to stop the waiting thread
}
if constexpr (implementation == semaphore)
{
queue.push(-1); // to stop the waiting thread
}
int i = fpuller.get();
if (sum_check != sum_ref)
{
throw;
}
std::cout << '>' << std::endl;
return i;
}
/*
* main() is responsible for performing multiple tests of different implementations.
* Results are collected, ordered and printed.
*/
int main()
{
struct result_t
{
std::string m_name;
int m_count;
};
using condition_variable_queue_t = locked_single_producer_single_consumer_queue_T<int, 1024>;
using semaphore_queue_t = locked_single_producer_single_consumer_queue2_T<int, 1024>;
std::vector<result_t> results // 6 runs
{
{ "condition_variable", test<condition_variable, condition_variable_queue_t>() },
{ "semaphore", test<semaphore, semaphore_queue_t>() },
{ "condition_variable", test<condition_variable, condition_variable_queue_t>() },
{ "semaphore", test<semaphore, semaphore_queue_t>() },
{ "condition_variable", test<condition_variable, condition_variable_queue_t>() },
{ "semaphore", test<semaphore, semaphore_queue_t>() },
};
std::sort(results.begin(), results.end(), [](result_t const& lhs, result_t const& rhs) { return lhs.m_count < rhs.m_count; });
std::cout << "The higher the count, the faster the solution" << std::endl;
for (result_t const& result : results)
{
std::cout << result.m_name << ": " << result.m_count << std::endl;
}
}
Output of a run:
<>
<>
<>
<>
<>
<>
The higher the count, the faster the solution
semaphore: 58304215
semaphore: 59302013
semaphore: 61896024
condition_variable: 84140445
condition_variable: 87045903
condition_variable: 90893057
My question kept bothering me, so I investigated Microsoft’s current implementation of semaphores. The counting semaphore has two atomics, and to implements the blocking wait with a wait on one of the atomics. Note that when the semaphore count does not reach zero, then also the wait for atomic is not called. The implementation also only notifies (the atomic) when it is sure that at least one thread is waiting for it. But still the semaphore implementation depends on the new C++20 wait/notify functions.
The new C++20 wait/notify functions are implemented with a pool of condition variables. I guess that is optimal, at least I wouldn’t know another faster way.
Bottom-line this implementation of semaphore is based on condition variables, and then I can imagine that above mentioned “condition variable implementation” is faster. Assuming that the mutex is most of the time not locked, then getting the mutex is cheap. Assuming that (due to the large queue size of 1024) we almost never have to wait for the condition variable predicate, also m_cv.wait() is cheap.
The “semaphore implementation” is in effect almost the same, only now two atomics (m_head & m_tail) need to be read and written. In the “condition variable implementation” the mutex implicitly protected these variables. Then my conclusion is that these two atomics in the “semaphore implementation” make the difference. And, unfortunately, you cannot do without them (in the “semaphore implementation”), so the “condition variable implementation” is faster.
To answer the question:
Q: Did I do something not clever so that my semaphore implementation is needlessly slow?
A: Not that I know (yet)
Q: Is possibly the Microsoft counting semaphore implementation too slow?
A: Does not look like it
Q: Or do requirements in the C++ standard make the semaphore slow in general?
A: Again, does not look like it.
Q: Am I just mistaken that a queue is proper application for semaphores?
A: Yes, that was probably in the early days
Q: When a queue is not a proper application, for what other application does the semaphore outperform the condition variable?
A: Don’t know yet. Possibly an application with simple waiting for limited resources.
I have been looking into boost::fibers as a method for dealing with some of my problems with data processing and IO. The shared_work scheduler in particular looks promising because it would let me spin up one data processing task for every data processing source and then let them distribute each other as needed across a few threads.
However this brings me to the source of my question: It looks like I can only have one shared_work 'pool' per process. What do I do if I want to have a set of 12 fibers in a processing data shared among 4 threads while, at the same time, a different set of 12 fibers are writing processed data to file shared among another 4 threads.
Something like:
#include<string>
#include<iostream>
#include<vector>
#include<mutex>
#include<thread>
#include<random>
#include<map>
#include<sstream>
#include<boost/bind.hpp>
#include<boost/fiber/all.hpp>
typedef boost::fibers::fiber FiberType;
typedef std::unique_lock<boost::fibers::mutex> LockType;
static const int fiberIterationCount = 5000;
static const int fiberCount = 12;
static const int threadCount = 4;
static const int distLowerLimit = 50;
static const int distUpperLimit = 500;
static boost::fibers::mutex firstMutex{};
static boost::fibers::mutex secondMutex{};
static boost::fibers::condition_variable firstCondition{};
static boost::fibers::condition_variable secondCondition{};
static boost::fibers::barrier synchronize{2*threadCount};
static int typeOneFibersFinished{0};
static int typeTwoFibersFinished{0};
static std::mt19937 typeOneGenerators[fiberCount];
static std::mt19937 typeTwoGenerators[fiberCount];
static std::mutex typeMapMutex;//lock for writing unnecessary for reads
static std::map<std::thread::id, std::string> threadTypeMap;
//simple function to give a heavy cpu load of variable duration
unsigned long long findPrimeNumber(int n)
{
int count=0;
unsigned long long a = 2;
while(count<n)
{
bool isPrime = true;
for(unsigned long long b = 2; (b * b) <= a; ++b)
{
if((a % b) == 0)
{
isPrime = false;
break;
}
}
if(isPrime)
{
count++;
}
++a;
}
return (a - 1);
}
void fiberTypeOne(int fiberNumber)
{
std::cout<<"Starting Type One Fiber #"<<fiberNumber;
std::uniform_int_distribution<int> dist(distLowerLimit, distUpperLimit);
for(int i=0; i<fiberIterationCount; ++i)
{
//generate a randomish load on this fiber so that it does not take a regular time slice
int tempPrime = dist(typeOneGenerators[fiberNumber]);
unsigned long long temp = findPrimeNumber(tempPrime);
std::cout << "T1 fiber #"<<fiberNumber<<" running on "<<threadTypeMap[std::this_thread::get_id()]
<<"\n Generated: "<<tempPrime<<", "<<temp;
boost::this_fiber::yield();
}
{
LockType lock(firstMutex);
++typeOneFibersFinished;
}
firstCondition.notify_all();
}
void threadTypeOne(int threadNumber)
{
//make a shared work scheduler that associates its fibers with "fiber pool 0"
boost::fibers::use_scheduling_algorithm< multi_pool_scheduler<0> >();
std::cout<<"Starting Type One Thread #"<<threadNumber<<" With Thread ID: "<<std::this_thread::get_id();
{
std::unique_lock<std::mutex> lock{typeMapMutex};
std::ostringstream gen;
gen<<"Thread Type 1 - Number: "<<threadNumber<<" with id: "<<std::this_thread::get_id();
threadTypeMap[std::this_thread::get_id()] = gen.str();
}
if(threadNumber == 0)
{ //if we are thread zero, create the fibers then join them to take ourselves off the "fiber list"
std::cout<<"Spawning Type One Fibers";
for(int fiberNumber=0; fiberNumber<fiberCount; ++fiberNumber)
{//create the fibers and instantly detach them
FiberType(boost::bind(&fiberTypeOne, fiberNumber)).detach();
}
}
synchronize.wait();
std::cout<<"T1 Thread preparing to wait";
//now let the fibers do their thing
LockType lock(firstMutex);
firstCondition.wait(lock, [](){return (typeOneFibersFinished == fiberCount);});
}
void fiberTypeTwo(int fiberNumber)
{
std::cout<<"Starting Type Two Fiber #"<<fiberNumber;
std::uniform_int_distribution<int> dist(distLowerLimit, distUpperLimit);
for(int i=0; i<fiberIterationCount; ++i)
{
//generate a randomish load on this fiber so that it does not take a regular time slice
int tempPrime = dist(typeTwoGenerators[fiberNumber]);
unsigned long long temp = findPrimeNumber(tempPrime);
std::cout << "T2 fiber #"<<fiberNumber<<" running on "<<threadTypeMap[std::this_thread::get_id()]
<<"\n Generated: "<<tempPrime<<", "<<temp;
boost::this_fiber::yield();
}
{
LockType lock(secondMutex);
++typeTwoFibersFinished;
}
secondCondition.notify_all();
}
void threadTypeTwo(int threadNumber)
{
//make a shared work scheduler that associates its fibers with "fiber pool 1"
boost::fibers::use_scheduling_algorithm< multi_pool_scheduler<1> >();
std::cout<<"Starting Type Two Thread #"<<threadNumber<<" With Thread ID: "<<std::this_thread::get_id();
{
std::unique_lock<std::mutex> lock{typeMapMutex};
std::ostringstream gen;
gen<<"Thread Type 2 - Number: "<<threadNumber<<" with id: "<<std::this_thread::get_id();
threadTypeMap[std::this_thread::get_id()] = gen.str();
}
if(threadNumber == 0)
{ //if we are thread zero, create the fibers then join them to take ourselves off the "fiber list"
std::cout<<"Spawning Type Two Fibers";
for(int fiberNumber=0; fiberNumber<fiberCount; ++fiberNumber)
{//create the fibers and instantly detach them
FiberType(boost::bind(&fiberTypeTwo, fiberNumber)).detach();
}
}
synchronize.wait();
std::cout<<"T2 Thread preparing to wait";
//now let the fibers do their thing
LockType lock(secondMutex);
secondCondition.wait(lock, [](){return (typeTwoFibersFinished == fiberCount);});
}
int main(int argc, char* argv[])
{
std::cout<<"Initializing Random Number Generators";
for(unsigned i=0; i<fiberCount; ++i)
{
typeOneGenerators->seed(i*500U - 1U);
typeTwoGenerators->seed(i*1500U - 1U);
}
std::cout<<"Commencing Main Thread Startup Startup";
std::vector<std::thread> typeOneThreads;
std::vector<std::thread> typeTwoThreads;
for(int i=0; i<threadCount; ++i)
{
typeOneThreads.emplace_back(std::thread(boost::bind(&threadTypeOne, i)));
typeTwoThreads.emplace_back(std::thread(boost::bind(&threadTypeTwo, i)));
}
//now let the threads do their thing and wait for them to finish with join
for(unsigned i=0; i<threadCount; ++i)
{
typeOneThreads[i].join();
}
for(unsigned i=0; i<threadCount; ++i)
{
typeTwoThreads[i].join();
}
std::cout<<"Shutting Down";
return 0;
}
Is this possible without writing your own fiber scheduler? If so, how?
I determined that I did require writing my own scheduler. However, the actual amount of work was minimal. The boost::fibers::shared_work scheduler manages the list of fibers that are shared between threads using a single static queue, guarded by a static mutex. There is another queue that governs the main fiber for each thread (since each thread has its own scheduler) but that is local to the class instance instead of shared between all instances of the class the way the static members are.
The solution then, to prevent the static queue and lock from being shared between separate sets of threads, is to put a, mostly useless, template parameter in front of the class. Then each thread passes a different parameter to this template. In this fashion, since you get a different object for every specialization of the template, you get different set of static variables for each instantiation with a different pool number.
Below is my implementation of this solution, (mostly a copy of boost::fiber::shared_work with a few variables and types more clearly named and the template parameter added).
#include <condition_variable>
#include <chrono>
#include <deque>
#include <mutex>
#include <boost/config.hpp>
#include <boost/fiber/algo/algorithm.hpp>
#include <boost/fiber/context.hpp>
#include <boost/fiber/detail/config.hpp>
#include <boost/fiber/scheduler.hpp>
#include <boost/assert.hpp>
#include "boost/fiber/type.hpp"
#ifdef BOOST_HAS_ABI_HEADERS
# include BOOST_ABI_PREFIX
#endif
#ifdef _MSC_VER
# pragma warning(push)
# pragma warning(disable:4251)
#endif
/*!
* #class SharedWorkPool
* #brief A scheduler for boost::fibers that operates in a manner similar to the
* shared work scheduler, except that it takes a template parameter determining
* which pool to draw fibers from. In this fashion, one group of threads can share
* a pool of fibers among themselves while another group of threads can work with
* a completely separate pool
* #tparam PoolNumber The index of the pool number for this thread
*/
template <int PoolNumber>
class SharedWorkPool : public boost::fibers::algo::algorithm
{
typedef std::deque<boost::fibers::context * > ReadyQueueType;
typedef boost::fibers::scheduler::ready_queue_type LocalQueueType;
typedef std::unique_lock<std::mutex> LockType;
public:
SharedWorkPool() = default;
~SharedWorkPool() override {}
SharedWorkPool( bool suspend) : suspendable{suspend}{}
SharedWorkPool( SharedWorkPool const&) = delete;
SharedWorkPool( SharedWorkPool &&) = delete;
SharedWorkPool& operator=(const SharedWorkPool&) = delete;
SharedWorkPool& operator=(SharedWorkPool&&) = delete;
void awakened(boost::fibers::context* ctx) noexcept override;
boost::fibers::context* pick_next() noexcept override;
bool has_ready_fibers() const noexcept override
{
LockType lock{readyQueueMutex};
return ((!readyQueue.empty()) || (!localQueue.empty()));
}
void suspend_until(const std::chrono::steady_clock::time_point& timePoint) noexcept override;
void notify() noexcept override;
private:
static ReadyQueueType readyQueue;
static std::mutex readyQueueMutex;
LocalQueueType localQueue{};
std::mutex instanceMutex{};
std::condition_variable suspendCondition{};
bool waitNotifyFlag{false};
bool suspendable{false};
};
template <int PoolNumber>
void SharedWorkPool<PoolNumber>::awakened(boost::fibers::context* ctx) noexcept
{
if(ctx->is_context(boost::fibers::type::pinned_context))
{ // we have been passed the thread's main fiber, never put those in the shared queue
localQueue.push_back(*ctx);
}
else
{//worker fiber, enqueue on shared queue
ctx->detach();
LockType lock{readyQueueMutex};
readyQueue.push_back(ctx);
}
}
template <int PoolNumber>
boost::fibers::context* SharedWorkPool<PoolNumber>::pick_next() noexcept
{
boost::fibers::context * ctx = nullptr;
LockType lock{readyQueueMutex};
if(!readyQueue.empty())
{ //pop an item from the ready queue
ctx = readyQueue.front();
readyQueue.pop_front();
lock.unlock();
BOOST_ASSERT( ctx != nullptr);
boost::fibers::context::active()->attach( ctx); //attach context to current scheduler via the active fiber of this thread
}
else
{
lock.unlock();
if(!localQueue.empty())
{ //nothing in the ready queue, return main or dispatcher fiber
ctx = & localQueue.front();
localQueue.pop_front();
}
}
return ctx;
}
template <int PoolNumber>
void SharedWorkPool<PoolNumber>::suspend_until(const std::chrono::steady_clock::time_point& timePoint) noexcept
{
if(suspendable)
{
if (std::chrono::steady_clock::time_point::max() == timePoint)
{
LockType lock{instanceMutex};
suspendCondition.wait(lock, [this](){return waitNotifyFlag;});
waitNotifyFlag = false;
}
else
{
LockType lock{instanceMutex};
suspendCondition.wait_until(lock, timePoint, [this](){return waitNotifyFlag;});
waitNotifyFlag = false;
}
}
}
template <int PoolNumber>
void SharedWorkPool<PoolNumber>::notify() noexcept
{
if(suspendable)
{
LockType lock{instanceMutex};
waitNotifyFlag = true;
lock.unlock();
suspendCondition.notify_all();
}
}
template <int PoolNumber>
std::deque<boost::fibers::context*> SharedWorkPool<PoolNumber>::readyQueue{};
template <int PoolNumber>
std::mutex SharedWorkPool<PoolNumber>::readyQueueMutex{};
Note, I am not entirely sure what will happen if you try to use the same pool number from declarations in different compilation units. However, under normal circumstances, i.e. you have only written boost::fibers::use_scheduling_algorithm< Threads::Fibers::SharedWorkPool<WorkPoolNumber> >(); in a single location for each WorkPoolNumber, it works perfectly. Fibers assigned to a given set of threads always run within the same set of threads, never being run by a different set of threads.
I was trying to make a code with multi producers and consumers. I have created multi-threads for producer and consumer and used semaphores for synchronization. The code was working fine with single producer and consumer.
The problem which I am facing is that after some time of program execution, only the consumer1 and producer1 are participating in the process. I am not able to understand what happened to the other producers and consumers.
I would also like to know as how to make multi producer-consumer problem efficient? Efficient in the sense that all producer and consumer gets equal opportunity to produce and consume respectively?
C++ code(it includes a lot of C):
#include <iostream>
#include <pthread.h>
#include <semaphore.h>
#include <unistd.h>
#include <queue>
using namespace std;
sem_t empty;
sem_t full;
int cnt = 0;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
queue<int> q;
void *producer(void *a)
{
int *num = (int *)a;
while(1) {
sem_wait(&empty);
pthread_mutex_lock(&mutex);
cnt = cnt+1;
q.push(cnt);
cout<<cnt<<" item produced by producer "<<(*num+1)<<endl;
pthread_mutex_unlock(&mutex);
sem_post(&full);
sleep(1);
}
}
void *consumer(void *a)
{
int *num = (int *)a;
while(1) {
sem_wait(&full);
pthread_mutex_lock(&mutex);
cout<<q.front()<<" item consumed by consumer "<<(*num+1)<<endl;
q.pop();
pthread_mutex_unlock(&mutex);
sem_post(&empty);
sleep(1);
}
}
int main()
{
pthread_t p[5];
pthread_t c[5];
sem_init(&empty,0,5);
sem_init(&full,0,0);
int i;
for(i = 0; i < 5; i++) {
pthread_create(&p[i],NULL,producer,(void *)(&i));
}
for(i = 0; i < 5; i++) {
pthread_create(&c[i],NULL,consumer,(void *)(&i));
}
for(i = 0; i < 5; i++) {
pthread_join(p[i],NULL);
pthread_join(c[i],NULL);
}
}
Updated code:
#include <iostream>
#include <pthread.h>
#include <semaphore.h>
#include <unistd.h>
#include <queue>
#include <map>
using namespace std;
sem_t empty;
sem_t full;
int cnt = 0;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
map<pthread_t,int> mc,mp;
queue<int> q;
void *producer(void *a)
{
while(1) {
sem_wait(&empty);
pthread_mutex_lock(&mutex);
cnt = cnt+1;
q.push(cnt);
cout<<cnt<<" item produced by producer "<<mp[pthread_self()]<<endl;
pthread_mutex_unlock(&mutex);
sem_post(&full);
sleep(1);
}
}
void *consumer(void *a)
{
while(1) {
sem_wait(&full);
pthread_mutex_lock(&mutex);
cout<<q.front()<<" item consumed by consumer "<<mc[pthread_self()]<<endl;
q.pop();
pthread_mutex_unlock(&mutex);
sem_post(&empty);
sleep(1);
}
}
int main()
{
pthread_t p[5];
pthread_t c[5];
sem_init(&empty,0,5);
sem_init(&full,0,0);
int i;
pthread_mutex_lock(&mutex);
for(i = 0; i < 5; i++) {
pthread_create(&p[i],NULL,producer,NULL);
pthread_create(&c[i],NULL,consumer,NULL);
mc[c[i]] = i+1;
mp[p[i]] = i+1;
}
pthread_mutex_unlock(&mutex);
for(i = 0; i < 5; i++) {
pthread_join(p[i],NULL);
pthread_join(c[i],NULL);
}
}
Short answer
The threads do in fact execute with equal opportunity, but they just printout an identifier which is not theirs.
Detailed explanation
You keep in each thread a pointer num to the thread number. It's the pointer to that value which is saved and not the value itself. So all the threads point to the same counter, thinking to find there their own identifier.
Everytime you access *num, you get access not to the value that i had when you launched the thread, but its current value.
Unfortunately, in every loop of main(), you reuse the variable i. So the last loop, you'll set i back to 0, and wait for the first threads to join. But all these threads loop forever, so the loop will hardly get a chance to go beyond this initial 0 value. So that every thread thinks it's the number *num+1 that is 1 at this moment.
Note by the way that you create a race condition as someone pointed out in the comments: all the consumer and producer threads dereference the pointer, accessing to the same variable in a mutex-protected region. This is ok. But while they are reading the variable, the main thread still happily can change the shared variable outside of any lock. This is definitively a risk of race.
Workaround
std::thread would allow you to pass i by walue, so that each thread has its own unaltered copy of is id.
With pthreads you have to pass a pointer to a value. Unfortunately, even if you'd do a local copy of the value pointed at, right at the start of the thread, you'd still be in a race condition.
A quick workaround to observe which thread is really doing the work would be to printout as well the result of pthread_self() (see here how to do it). Or to store the ids in an array of int, and pass to each thread the address to a unique element in that array.
Thanks to Michael's and Adam's comments I made some changes here.
Suppose I build a special linked queue with max length N.
class QueueItem
{
int id; //object identity
double data; //object priority
}
class MyQueue
{
private:
list<QueueItem> rec;
std::mutex mymutex;
public:
LinkArray(){};
~LinkArray(){};
int insert(int id, double d)
{
std::lock_guard<std::mutex> guard(mymutex);
list<QueueItem>::iterator iter=rec.begin();
QueueItem temp;
temp.id = id;
temp.data = d;
int count=0;
for (iter;iter!=rec.end();iter++)
{
if (iter->data<d)
{
rec.insert(iter,temp);
return count;
}
count++;
}
rec.push_back(temp);
return count;
}
void pop(void)
{
std::lock_guard<std::mutex> guard(mymutex);
rec.pop_back();
}
int getLength(void)
{
return rec.size();
}
};
The rule is: all the items in the queue always have the highest priority and they are sorted. For example for N=4, the priority in the queue is (100,99,98,97). When priority 101 enqueue, the queue becomes (101,100,99,98). If 1 enqueue, then the queue doesn't change.
Now, if I use <thread> in C++11 to compute the priority for different object in different threads and then enqueue them, how can I avoid race condition?
My code is like:
....
int N = 4; //queue length
int pnum=4;//thread number
MyQueue* queue = new MyQueue(8);
vector<thread> threads;
std::mutex mymux;
int totalItem = 16;
int itemIds[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
double itemPriority[] = {9,10,3,11,2,12,5,6,7,8,1,15,14,4,13,16};
int itemperthread = totalItem/pnum;
for (int p=0; p<pnum;p++)
{
threads.push_back(thread([&](int p, MyQueue* queue ){
for (int i=p*itemperthread; i<(p+1)*itemperthread; i++)
{
int id = itemIds[i];
int priority = itemPriority[i];
}
mymux.lock();
queue->insert(id,priority); //enqueue
if (queue->getLength()>N)
queue->pop();
mymux.unlock();
},p, queue));
}
for (auto& thread : threads){
thread.join();
}
I tried use a mutex to lock the queue (or even define a mutex in Queue class and use it in Queue::insert method). But everytime I run this, the result of queue is different. I am pretty sure the computation of priority is correct.
The expected queue after run this part of program will be [16,15,14,13]. But the result can be different every time I run it.
Any idea to avoid this issue? Thanks.
I have a program with a function which takes a pointer as arg, and a main. The main is creating n threads, each of them running the function on different memory areas depending on the passed arg. Threads are then joined, the main performs some data mixing between the area and creates n new threads which do the the same operation as the old ones.
To improve the program I would like to keep the threads alive, removing the long time necessary to create them. Threads should sleep when the main is working and notified when they have to come up again. At the same way the main should wait when threads are working as it did with join.
I cannot end up with a strong implementation of this, always falling in a deadlock.
Simple baseline code, any hints about how to modify this would be much appreciated
#include <thread>
#include <climits>
...
void myfunc(void * p) {
do_something(p);
}
int main(){
void * myp[n_threads] {a_location, another_location,...};
std::thread mythread[n_threads];
for (unsigned long int j=0; j < ULONG_MAX; j++) {
for (unsigned int i=0; i < n_threads; i++) {
mythread[i] = std::thread(myfunc, myp[i]);
}
for (unsigned int i=0; i < n_threads; i++) {
mythread[i].join();
}
mix_data(myp);
}
return 0;
}
Here is a possible approach using only classes from the C++11 Standard Library. Basically, each thread you create has an associated command queue (encapsulated in std::packaged_task<> objects) which it continuously check. If the queue is empty, the thread will just wait on a condition variable (std::condition_variable).
While data races are avoided through the use of std::mutex and std::unique_lock<> RAII wrappers, the main thread can wait for a particular job to be terminated by storing the std::future<> object associated to each submitted std::packaged_tast<> and call wait() on it.
Below is a simple program that follows this design. Comments should be sufficient to explain what it does:
#include <thread>
#include <iostream>
#include <sstream>
#include <future>
#include <queue>
#include <condition_variable>
#include <mutex>
// Convenience type definition
using job = std::packaged_task<void()>;
// Some data associated to each thread.
struct thread_data
{
int id; // Could use thread::id, but this is filled before the thread is started
std::thread t; // The thread object
std::queue<job> jobs; // The job queue
std::condition_variable cv; // The condition variable to wait for threads
std::mutex m; // Mutex used for avoiding data races
bool stop = false; // When set, this flag tells the thread that it should exit
};
// The thread function executed by each thread
void thread_func(thread_data* pData)
{
std::unique_lock<std::mutex> l(pData->m, std::defer_lock);
while (true)
{
l.lock();
// Wait until the queue won't be empty or stop is signaled
pData->cv.wait(l, [pData] () {
return (pData->stop || !pData->jobs.empty());
});
// Stop was signaled, let's exit the thread
if (pData->stop) { return; }
// Pop one task from the queue...
job j = std::move(pData->jobs.front());
pData->jobs.pop();
l.unlock();
// Execute the task!
j();
}
}
// Function that creates a simple task
job create_task(int id, int jobNumber)
{
job j([id, jobNumber] ()
{
std::stringstream s;
s << "Hello " << id << "." << jobNumber << std::endl;
std::cout << s.str();
});
return j;
}
int main()
{
const int numThreads = 4;
const int numJobsPerThread = 10;
std::vector<std::future<void>> futures;
// Create all the threads (will be waiting for jobs)
thread_data threads[numThreads];
int tdi = 0;
for (auto& td : threads)
{
td.id = tdi++;
td.t = std::thread(thread_func, &td);
}
//=================================================
// Start assigning jobs to each thread...
for (auto& td : threads)
{
for (int i = 0; i < numJobsPerThread; i++)
{
job j = create_task(td.id, i);
futures.push_back(j.get_future());
std::unique_lock<std::mutex> l(td.m);
td.jobs.push(std::move(j));
}
// Notify the thread that there is work do to...
td.cv.notify_one();
}
// Wait for all the tasks to be completed...
for (auto& f : futures) { f.wait(); }
futures.clear();
//=================================================
// Here the main thread does something...
std::cin.get();
// ...done!
//=================================================
//=================================================
// Posts some new tasks...
for (auto& td : threads)
{
for (int i = 0; i < numJobsPerThread; i++)
{
job j = create_task(td.id, i);
futures.push_back(j.get_future());
std::unique_lock<std::mutex> l(td.m);
td.jobs.push(std::move(j));
}
// Notify the thread that there is work do to...
td.cv.notify_one();
}
// Wait for all the tasks to be completed...
for (auto& f : futures) { f.wait(); }
futures.clear();
// Send stop signal to all threads and join them...
for (auto& td : threads)
{
std::unique_lock<std::mutex> l(td.m);
td.stop = true;
td.cv.notify_one();
}
// Join all the threads
for (auto& td : threads) { td.t.join(); }
}
The concept you want is the threadpool. This SO question deals with existing implementations.
The idea is to have a container for a number of thread instances. Each instance is associated with a function which polls a task queue, and when a task is available, pulls it and run it. Once the task is over (if it terminates, but that's another problem), the thread simply loop over to the task queue.
So you need a synchronized queue, a thread class which implements the loop on the queue, an interface for the task objects, and maybe a class to drive the whole thing (the pool class).
Alternatively, you could make a very specialized thread class for the task it has to perform (with only the memory area as a parameter for instance). This requires a notification mechanism for the threads to indicate that they are done with the current iteration.
The thread main function would be a loop on that specific task, and at the end of one iteration, the thread signals its end, and wait on condition variables to start the next loop. In essence, you would be inlining the task code within the thread, dropping the need of a queue altogether.
using namespace std;
// semaphore class based on C++11 features
class semaphore {
private:
mutex mMutex;
condition_variable v;
int mV;
public:
semaphore(int v): mV(v){}
void signal(int count=1){
unique_lock lock(mMutex);
mV+=count;
if (mV > 0) mCond.notify_all();
}
void wait(int count = 1){
unique_lock lock(mMutex);
mV-= count;
while (mV < 0)
mCond.wait(lock);
}
};
template <typename Task>
class TaskThread {
thread mThread;
Task *mTask;
semaphore *mSemStarting, *mSemFinished;
volatile bool mRunning;
public:
TaskThread(Task *task, semaphore *start, semaphore *finish):
mTask(task), mRunning(true),
mSemStart(start), mSemFinished(finish),
mThread(&TaskThread<Task>::psrun){}
~TaskThread(){ mThread.join(); }
void run(){
do {
(*mTask)();
mSemFinished->signal();
mSemStart->wait();
} while (mRunning);
}
void finish() { // end the thread after the current loop
mRunning = false;
}
private:
static void psrun(TaskThread<Task> *self){ self->run();}
};
classcMyTask {
public:
MyTask(){}
void operator()(){
// some code here
}
};
int main(){
MyTask task1;
MyTask task2;
semaphore start(2), finished(0);
TaskThread<MyTask> t1(&task1, &start, &finished);
TaskThread<MyTask> t2(&task2, &start, &finished);
for (int i = 0; i < 10; i++){
finished.wait(2);
start.signal(2);
}
t1.finish();
t2.finish();
}
The proposed (crude) implementation above relies on the Task type which must provide the operator() (ie. a functor like class). I said you could incorporate the task code directly in the thread function body earlier, but since I don't know it, I kept it as abstract as I could. There's one condition variable for the start of threads, and one for their end, both encapsulated in semaphore instances.
Seeing the other answer proposing the use of boost::barrier, I can only support this idea: make sure to replace my semaphore class with that class if possible, the reason being that it is better to rely on well tested and maintained external code rather than a self implemented solution for the same feature set.
All in all, both approaches are valid, but the former gives up a tiny bit of performance in favor of flexibility. If the task to be performed takes a sufficiently long time, the management and queue synchronization cost becomes negligible.
Update: code fixed and tested. Replaced a simple condition variable by a semaphore.
It can easily be achieved using a barrier (just a convenience wrapper over a conditional variable and a counter). It basically blocks until all N threads have reached the "barrier". It then "recycles" again. Boost provides an implementation.
void myfunc(void * p, boost::barrier& start_barrier, boost::barrier& end_barrier) {
while (!stop_condition) // You'll need to tell them to stop somehow
{
start_barrier.wait ();
do_something(p);
end_barrier.wait ();
}
}
int main(){
void * myp[n_threads] {a_location, another_location,...};
boost::barrier start_barrier (n_threads + 1); // child threads + main thread
boost::barrier end_barrier (n_threads + 1); // child threads + main thread
std::thread mythread[n_threads];
for (unsigned int i=0; i < n_threads; i++) {
mythread[i] = std::thread(myfunc, myp[i], start_barrier, end_barrier);
}
start_barrier.wait (); // first unblock the threads
for (unsigned long int j=0; j < ULONG_MAX; j++) {
end_barrier.wait (); // mix_data must not execute before the threads are done
mix_data(myp);
start_barrier.wait (); // threads must not start new iteration before mix_data is done
}
return 0;
}
The following is a simple compiling and working code performing some random stuffs. It implements aleguna's concept of barrier. The task length of each thread is different so it is really necessary to have a strong synchronization mechanism. I will try to do a pool on the same tasks and benchmark the result, and then maybe with futures as pointed out by Andy Prowl.
#include <iostream>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <chrono>
#include <complex>
#include <random>
const unsigned int n_threads=4; //varying this will not (almost) change the total amount of work
const unsigned int task_length=30000/n_threads;
const float task_length_variation=task_length/n_threads;
unsigned int rep=1000; //repetitions of tasks
class t_chronometer{
private:
std::chrono::steady_clock::time_point _t;
public:
t_chronometer(): _t(std::chrono::steady_clock::now()) {;}
void reset() {_t = std::chrono::steady_clock::now();}
double get_now() {return std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - _t).count();}
double get_now_ms() {return
std::chrono::duration_cast<std::chrono::duration<double,std::milli>>(std::chrono::steady_clock::now() - _t).count();}
};
class t_barrier {
private:
std::mutex m_mutex;
std::condition_variable m_cond;
unsigned int m_threshold;
unsigned int m_count;
unsigned int m_generation;
public:
t_barrier(unsigned int count):
m_threshold(count),
m_count(count),
m_generation(0) {
}
bool wait() {
std::unique_lock<std::mutex> lock(m_mutex);
unsigned int gen = m_generation;
if (--m_count == 0)
{
m_generation++;
m_count = m_threshold;
m_cond.notify_all();
return true;
}
while (gen == m_generation)
m_cond.wait(lock);
return false;
}
};
using namespace std;
void do_something(complex<double> * c, unsigned int max) {
complex<double> a(1.,0.);
complex<double> b(1.,0.);
for (unsigned int i = 0; i<max; i++) {
a *= polar(1.,2.*M_PI*i/max);
b *= polar(1.,4.*M_PI*i/max);
*(c)+=a+b;
}
}
bool done=false;
void task(complex<double> * c, unsigned int max, t_barrier* start_barrier, t_barrier* end_barrier) {
while (!done) {
start_barrier->wait ();
do_something(c,max);
end_barrier->wait ();
}
cout << "task finished" << endl;
}
int main() {
t_chronometer t;
std::default_random_engine gen;
std::normal_distribution<double> dis(.0,1000.0);
complex<double> cpx[n_threads];
for (unsigned int i=0; i < n_threads; i++) {
cpx[i] = complex<double>(dis(gen), dis(gen));
}
t_barrier start_barrier (n_threads + 1); // child threads + main thread
t_barrier end_barrier (n_threads + 1); // child threads + main thread
std::thread mythread[n_threads];
unsigned long int sum=0;
for (unsigned int i=0; i < n_threads; i++) {
unsigned int max = task_length + i * task_length_variation;
cout << i+1 << "th task length: " << max << endl;
mythread[i] = std::thread(task, &cpx[i], max, &start_barrier, &end_barrier);
sum+=max;
}
cout << "total task length " << sum << endl;
complex<double> c(0,0);
for (unsigned long int j=1; j < rep+1; j++) {
start_barrier.wait (); //give to the threads the missing call to start
if (j==rep) done=true;
end_barrier.wait (); //wait for the call from each tread
if (j%100==0) cout << "cycle: " << j << endl;
for (unsigned int i=0; i<n_threads; i++) {
c+=cpx[i];
}
}
for (unsigned int i=0; i < n_threads; i++) {
mythread[i].join();
}
cout << "result: " << c << " it took: " << t.get_now() << " s." << endl;
return 0;
}