I'm developing a class to queue a given number of elements where the reading thread pops and the writing thread pushes them. If the queue is empty the reader blocks and if the queue is full the writer blocks aswell. But after that the writing and reading threads should optionally block for a certain period of time. Can someone please explain if is there a perfect way to do this requirement?
template<typename T>
class Queue
{
private:
mutable std::mutex mut;
std::queue<T> queue;
std::condition_variable condNotEmpty;
std::condition_variable condNotFull;
int _count;
int max_size;
int idx;
public:
Queue(int size){
max_size = size;
}
void push(T new_value)
{
std::unique_lock<std::mutex> lk(mut);
/* initialize random seed: */
srand (time(NULL));
/* generate secret number between 1 and 3: */
idx = rand() % 3 + 1;
condNotFull.wait_for(lk,std :: chrono :: milliseconds ( 20 ),[this]{
if ((_count == max_size)) {
return false;
}else{
return true;
}
});
queue.push(new_value);
_count++;
condNotEmpty.notify_one();
}
T pop()
{
std::unique_lock<std::mutex> lk(mut);
/* initialize random seed: */
srand (time(NULL));
/* generate secret number between 1 and 3: */
idx = rand() % 3 + 1;
condNotEmpty.wait_for(lk, std :: chrono :: milliseconds ( 15 ),[this]{
if(queue.empty()){
return false;
}else{
return true;
}
});
auto value=queue.front();
queue.pop();
_count--;
condNotFull.notify_one();
return value;
}
int Count(){
std::lock_guard<std::mutex> lk(mut);
return queue.size();
}
int Size(){
return max_size;
}
};
You can use std::timed_mutex and it's methods try_lock_for and try_lock_until for try to lock for/until a certain period of time.
Also it would probably be meaningful to reflect this API to your Queue with methods like tryPush, tryPop or tryPushFor, PushUntil, tryPopFor, tryPopUntil, etc. and leave push/pop do the blocking operations as is.
In your code you should also check the return status of wait_for call: it can be std::cv_status::timeout or std::cv_status::no_timeout, depending if condition variable was notified or timeout expired.
I have a single producer queue and I'd like it to work with multiple threads.
Queue is just a simple ring buffer queue. I have a pop method already implemented and it works for single producer - multi consumers. I am not sure what is wrong with my push method.
I tried to use CAS loop for write position, but it just pauses my program and does nothing.
template<typename T, uint size = 1>
struct mpWriter
{
bool push(const T& element)
{
while (true) {
uint oldWritePosition = writePosition.load();
uint newWritePosition = getPositionAfter(oldWritePosition);
// if the write position is the old write position
// advance the write position (so, now it is the new write position
// and write data to the old write position
// if this makes sense
if (writePosition.compare_exchange_strong(oldWritePosition, newWritePosition)) {
ringBuffer[oldWritePosition].store(element);
return true;
}
}
}
static constexpr uint getPositionAfter(uint position) noexcept
{
return ++position == ringBufferSize ? 0 : position;
}
static constexpr uint ringBufferSize = size + 1;
std::atomic<T> ringBuffer[ringBufferSize];
std::atomic<uint> writePosition = 0;
};
int main()
{
mpWriter<int, 50> ints;
std::vector<std::thread> producers;
for(int i = 0; i < 50; ++i) {
producers.push_back(std::thread([&](){
ints.push(0);
}));
}
for (auto& producer : producers) {
producer.join();
}
return 0;
}
What is wrong with the code? How to write it properly? I do not want to use any external queues. I want to understand how it is done.
I am trying to implement an array-based ring buffer that is thread-safe for multiple producers and a single consumer. The main idea is to have atomic head and tail indices. When pushing an element to the queue, the head is increased atomically to reserve a slot in the buffer:
#include <atomic>
#include <chrono>
#include <iostream>
#include <stdexcept>
#include <thread>
#include <vector>
template <class T> class MPSC {
private:
int MAX_SIZE;
std::atomic<int> head{0}; ///< index of first free slot
std::atomic<int> tail{0}; ///< index of first occupied slot
std::unique_ptr<T[]> data;
std::unique_ptr<std::atomic<bool>[]> valid; ///< indicates whether data at an
///< index has been fully written
/// Compute next index modulo size.
inline int advance(int x) { return (x + 1) % MAX_SIZE; }
public:
explicit MPSC(int size) {
if (size <= 0)
throw std::invalid_argument("size must be greater than 0");
MAX_SIZE = size + 1;
data = std::make_unique<T[]>(MAX_SIZE);
valid = std::make_unique<std::atomic<bool>[]>(MAX_SIZE);
}
/// Add an element to the queue.
///
/// If the queue is full, this method blocks until a slot is available for
/// writing. This method is not starvation-free, i.e. it is possible that one
/// thread always fills up the queue and prevents others from pushing.
void push(const T &msg) {
int idx;
int next_idx;
int k = 100;
do {
idx = head;
next_idx = advance(idx);
while (next_idx == tail) { // queue is full
k = k >= 100000 ? k : k * 2; // exponential backoff
std::this_thread::sleep_for(std::chrono::nanoseconds(k));
} // spin
} while (!head.compare_exchange_weak(idx, next_idx));
if (valid[idx])
// this throws, suggesting that two threads are writing to the same index. I have no idea how this is possible.
throw std::runtime_error("message slot already written");
data[idx] = msg;
valid[idx] = true; // this was set to false by the reader,
// set it to true to indicate completed data write
}
/// Read an element from the queue.
///
/// If the queue is empty, this method blocks until a message is available.
/// This method is only safe to be called from one single reader thread.
T pop() {
int k = 100;
while (is_empty() || !valid[tail]) {
k = k >= 100000 ? k : k * 2;
std::this_thread::sleep_for(std::chrono::nanoseconds(k));
} // spin
T res = data[tail];
valid[tail] = false;
tail = advance(tail);
return res;
}
bool is_full() { return (head + 1) % MAX_SIZE == tail; }
bool is_empty() { return head == tail; }
};
When there is a lot of congestion, some messages get overwritten by other threads. Hence there must be something fundamentally wrong with what I'm doing here.
What seems to be happening is that two threads are acquiring the same index to write their data to. Why could that be?
Even if a producer were to pause just before writing it's data, the tail could not increase past this threads idx and hence no other thread should be able to overtake and claim that same idx.
EDIT
At the risk of posting too much code, here is a simple program that reproduces the problem. It sends some incrementing numbers from many threads and checks whether all numbers are received by the consumer:
#include "mpsc.hpp" // or whatever; the above queue
#include <thread>
#include <iostream>
int main() {
static constexpr int N_THREADS = 10; ///< number of threads
static constexpr int N_MSG = 1E+5; ///< number of messages per thread
struct msg {
int t_id;
int i;
};
MPSC<msg> q(N_THREADS / 2);
std::thread threads[N_THREADS];
// consumer
threads[0] = std::thread([&q] {
int expected[N_THREADS] {};
for (int i = 0; i < N_MSG * (N_THREADS - 1); ++i) {
msg m = q.pop();
std::cout << "Got message from T-" << m.t_id << ": " << m.i << std::endl;
if (expected[m.t_id] != m.i) {
std::cout << "T-" << m.t_id << " unexpected msg " << m.i << "; expected " << expected[m.t_id] << std::endl;
return -1;
}
expected[m.t_id] = m.i + 1;
}
});
// producers
for (int id = 1; id < N_THREADS; ++id) {
threads[id] = std::thread([id, &q] {
for (int i = 0; i < N_MSG; ++i) {
q.push(msg{id, i});
}
});
}
for (auto &t : threads)
t.join();
}
I am trying to implement an array-based ring buffer that is thread-safe for multiple producers and a single consumer.
I assume you are doing this as a learning exercise. Implementing a lock-free queue yourself is most probably the wrong thing to do if you want to solve a real problem.
What seems to be happening is that two threads are acquiring the same index to write their data to. Why could that be?
The combination of that producer spinlock with the outer CAS loop does not work in the intended way:
do {
idx = head;
next_idx = advance(idx);
while (next_idx == tail) { // queue is full
k = k >= 100000 ? k : k * 2; // exponential backoff
std::this_thread::sleep_for(std::chrono::nanoseconds(k));
} // spin
//
// ...
//
// All other threads (producers and consumers) can progress.
//
// ...
//
} while (!head.compare_exchange_weak(idx, next_idx));
The queue may be full when the CAS happens because those checks are performed independently. In addition, the CAS may succeed because the other threads may have advanced head to exactly match idx.
what I want to do is to push integers to my threadSafe queue implementation with multiple threads and concurrently with another series of threads pop away the inserted numbers. All of this operation has to be thread safe, but another option that I want to have is that the size of the queue must be fixed, just like a buffer. If the buffer is full all the push threads must wait the pop threads to free some slot.
This is my implementation of the queue/buffer, it seems to work but after few iterations it stops and remains blocked without any error.
#include <queue>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <iostream>
template <typename T>
class Queue
{
private:
std::queue<T> queue_;
std::mutex mutex_;
std::condition_variable cond_;
public:
T pop()
{
std::unique_lock<std::mutex> mlock(mutex_);
cond_.wait(mlock, [this]{return !queue_.empty();});
auto val = queue_.front();
queue_.pop();
return val;
}
void pop(T& item)
{
std::unique_lock<std::mutex> mlock(mutex_);
cond_.wait(mlock, [this]{return !queue_.empty();});
item = queue_.front();
queue_.pop();
}
void push(const T& item, int buffer)
{
std::unique_lock<std::mutex> mlock(mutex_);
while (queue_.size() >= buffer)
{
cond_.wait(mlock);
}
queue_.push(item);
mlock.unlock();
cond_.notify_one();
}
Queue()=default;
Queue(const Queue&) = delete; // disable copying
Queue& operator=(const Queue&) = delete; // disable assignment
};
The size of the buffer is defined in the push function with the variable buffer. This is an example of usage:
void prepare(Queue<int>& loaded, int buffer, int num_frames)
{
for (int i = 0; i < num_frames; i++)
{
cout<< "push "<<i<<endl;
loaded.push(i, buffer);
}
}
void load (vector<Frame>& movie, Queue<int>& loaded, int num_frames,
int num_points, int buffer, int height, int width)
{
for (int i = 0; i < num_frames; i++)
{
int num = loaded.pop();
cout<< "pop "<<num<<endl;
}
}
int main()
{
srand(time(NULL));
int num_threadsXstage = 4;
int width = 500;
int height = 500;
int num_points = width * height;
int num_frames = 100;
int frames_thread = num_frames/num_threadsXstage;
int preset = 3;
int buffer = 10;
//Vectors of threads
vector<thread> loader;
//Final vector
vector<Frame> movie;
movie.resize(num_frames);
//Working queues
Queue<int> loaded;
//Prepare loading queue task
thread preparator(prepare, ref(loaded), buffer, num_frames);
for (int i = 0; i < num_threadsXstage; i++)
{
//stage 1
loader.push_back(thread(&load, ref(movie), ref(loaded), frames_thread,
num_points, buffer, height, width));
}
// JOIN
preparator.join();
join_all(loader);
return 0;
}
Your pop functions could allow a thread waiting to push to make forward progress, but they don't call any notify function. You must call the appropriate notify function any time you may make it possible for a thread blocked on the condition variable to make forward progress.
Although it's quite complex to explain why, you should either call notify_all or call notify_one while still holding the lock. It is theoretically possible to "wake the wrong thread" otherwise because you are using the same condition variable for two predicates (the queue is not empty and the queue is not full).
To avoid very hard to understand failure modes, always do one of these three things:
Do not use the same condition variable to handle more than one predicate. For example, use one condition variable for "not empty" and another for "not full";
Always use notify_all, never notify_one; or
Always call notify functions while holding the mutex.
So long as you follow at least one of these three rules, you will avoid an obscure failure mode where you wake only a thread that chose to sleep after you released the mutex while leaving the only thread that could handle the condition still blocked.
I have a program with a function which takes a pointer as arg, and a main. The main is creating n threads, each of them running the function on different memory areas depending on the passed arg. Threads are then joined, the main performs some data mixing between the area and creates n new threads which do the the same operation as the old ones.
To improve the program I would like to keep the threads alive, removing the long time necessary to create them. Threads should sleep when the main is working and notified when they have to come up again. At the same way the main should wait when threads are working as it did with join.
I cannot end up with a strong implementation of this, always falling in a deadlock.
Simple baseline code, any hints about how to modify this would be much appreciated
#include <thread>
#include <climits>
...
void myfunc(void * p) {
do_something(p);
}
int main(){
void * myp[n_threads] {a_location, another_location,...};
std::thread mythread[n_threads];
for (unsigned long int j=0; j < ULONG_MAX; j++) {
for (unsigned int i=0; i < n_threads; i++) {
mythread[i] = std::thread(myfunc, myp[i]);
}
for (unsigned int i=0; i < n_threads; i++) {
mythread[i].join();
}
mix_data(myp);
}
return 0;
}
Here is a possible approach using only classes from the C++11 Standard Library. Basically, each thread you create has an associated command queue (encapsulated in std::packaged_task<> objects) which it continuously check. If the queue is empty, the thread will just wait on a condition variable (std::condition_variable).
While data races are avoided through the use of std::mutex and std::unique_lock<> RAII wrappers, the main thread can wait for a particular job to be terminated by storing the std::future<> object associated to each submitted std::packaged_tast<> and call wait() on it.
Below is a simple program that follows this design. Comments should be sufficient to explain what it does:
#include <thread>
#include <iostream>
#include <sstream>
#include <future>
#include <queue>
#include <condition_variable>
#include <mutex>
// Convenience type definition
using job = std::packaged_task<void()>;
// Some data associated to each thread.
struct thread_data
{
int id; // Could use thread::id, but this is filled before the thread is started
std::thread t; // The thread object
std::queue<job> jobs; // The job queue
std::condition_variable cv; // The condition variable to wait for threads
std::mutex m; // Mutex used for avoiding data races
bool stop = false; // When set, this flag tells the thread that it should exit
};
// The thread function executed by each thread
void thread_func(thread_data* pData)
{
std::unique_lock<std::mutex> l(pData->m, std::defer_lock);
while (true)
{
l.lock();
// Wait until the queue won't be empty or stop is signaled
pData->cv.wait(l, [pData] () {
return (pData->stop || !pData->jobs.empty());
});
// Stop was signaled, let's exit the thread
if (pData->stop) { return; }
// Pop one task from the queue...
job j = std::move(pData->jobs.front());
pData->jobs.pop();
l.unlock();
// Execute the task!
j();
}
}
// Function that creates a simple task
job create_task(int id, int jobNumber)
{
job j([id, jobNumber] ()
{
std::stringstream s;
s << "Hello " << id << "." << jobNumber << std::endl;
std::cout << s.str();
});
return j;
}
int main()
{
const int numThreads = 4;
const int numJobsPerThread = 10;
std::vector<std::future<void>> futures;
// Create all the threads (will be waiting for jobs)
thread_data threads[numThreads];
int tdi = 0;
for (auto& td : threads)
{
td.id = tdi++;
td.t = std::thread(thread_func, &td);
}
//=================================================
// Start assigning jobs to each thread...
for (auto& td : threads)
{
for (int i = 0; i < numJobsPerThread; i++)
{
job j = create_task(td.id, i);
futures.push_back(j.get_future());
std::unique_lock<std::mutex> l(td.m);
td.jobs.push(std::move(j));
}
// Notify the thread that there is work do to...
td.cv.notify_one();
}
// Wait for all the tasks to be completed...
for (auto& f : futures) { f.wait(); }
futures.clear();
//=================================================
// Here the main thread does something...
std::cin.get();
// ...done!
//=================================================
//=================================================
// Posts some new tasks...
for (auto& td : threads)
{
for (int i = 0; i < numJobsPerThread; i++)
{
job j = create_task(td.id, i);
futures.push_back(j.get_future());
std::unique_lock<std::mutex> l(td.m);
td.jobs.push(std::move(j));
}
// Notify the thread that there is work do to...
td.cv.notify_one();
}
// Wait for all the tasks to be completed...
for (auto& f : futures) { f.wait(); }
futures.clear();
// Send stop signal to all threads and join them...
for (auto& td : threads)
{
std::unique_lock<std::mutex> l(td.m);
td.stop = true;
td.cv.notify_one();
}
// Join all the threads
for (auto& td : threads) { td.t.join(); }
}
The concept you want is the threadpool. This SO question deals with existing implementations.
The idea is to have a container for a number of thread instances. Each instance is associated with a function which polls a task queue, and when a task is available, pulls it and run it. Once the task is over (if it terminates, but that's another problem), the thread simply loop over to the task queue.
So you need a synchronized queue, a thread class which implements the loop on the queue, an interface for the task objects, and maybe a class to drive the whole thing (the pool class).
Alternatively, you could make a very specialized thread class for the task it has to perform (with only the memory area as a parameter for instance). This requires a notification mechanism for the threads to indicate that they are done with the current iteration.
The thread main function would be a loop on that specific task, and at the end of one iteration, the thread signals its end, and wait on condition variables to start the next loop. In essence, you would be inlining the task code within the thread, dropping the need of a queue altogether.
using namespace std;
// semaphore class based on C++11 features
class semaphore {
private:
mutex mMutex;
condition_variable v;
int mV;
public:
semaphore(int v): mV(v){}
void signal(int count=1){
unique_lock lock(mMutex);
mV+=count;
if (mV > 0) mCond.notify_all();
}
void wait(int count = 1){
unique_lock lock(mMutex);
mV-= count;
while (mV < 0)
mCond.wait(lock);
}
};
template <typename Task>
class TaskThread {
thread mThread;
Task *mTask;
semaphore *mSemStarting, *mSemFinished;
volatile bool mRunning;
public:
TaskThread(Task *task, semaphore *start, semaphore *finish):
mTask(task), mRunning(true),
mSemStart(start), mSemFinished(finish),
mThread(&TaskThread<Task>::psrun){}
~TaskThread(){ mThread.join(); }
void run(){
do {
(*mTask)();
mSemFinished->signal();
mSemStart->wait();
} while (mRunning);
}
void finish() { // end the thread after the current loop
mRunning = false;
}
private:
static void psrun(TaskThread<Task> *self){ self->run();}
};
classcMyTask {
public:
MyTask(){}
void operator()(){
// some code here
}
};
int main(){
MyTask task1;
MyTask task2;
semaphore start(2), finished(0);
TaskThread<MyTask> t1(&task1, &start, &finished);
TaskThread<MyTask> t2(&task2, &start, &finished);
for (int i = 0; i < 10; i++){
finished.wait(2);
start.signal(2);
}
t1.finish();
t2.finish();
}
The proposed (crude) implementation above relies on the Task type which must provide the operator() (ie. a functor like class). I said you could incorporate the task code directly in the thread function body earlier, but since I don't know it, I kept it as abstract as I could. There's one condition variable for the start of threads, and one for their end, both encapsulated in semaphore instances.
Seeing the other answer proposing the use of boost::barrier, I can only support this idea: make sure to replace my semaphore class with that class if possible, the reason being that it is better to rely on well tested and maintained external code rather than a self implemented solution for the same feature set.
All in all, both approaches are valid, but the former gives up a tiny bit of performance in favor of flexibility. If the task to be performed takes a sufficiently long time, the management and queue synchronization cost becomes negligible.
Update: code fixed and tested. Replaced a simple condition variable by a semaphore.
It can easily be achieved using a barrier (just a convenience wrapper over a conditional variable and a counter). It basically blocks until all N threads have reached the "barrier". It then "recycles" again. Boost provides an implementation.
void myfunc(void * p, boost::barrier& start_barrier, boost::barrier& end_barrier) {
while (!stop_condition) // You'll need to tell them to stop somehow
{
start_barrier.wait ();
do_something(p);
end_barrier.wait ();
}
}
int main(){
void * myp[n_threads] {a_location, another_location,...};
boost::barrier start_barrier (n_threads + 1); // child threads + main thread
boost::barrier end_barrier (n_threads + 1); // child threads + main thread
std::thread mythread[n_threads];
for (unsigned int i=0; i < n_threads; i++) {
mythread[i] = std::thread(myfunc, myp[i], start_barrier, end_barrier);
}
start_barrier.wait (); // first unblock the threads
for (unsigned long int j=0; j < ULONG_MAX; j++) {
end_barrier.wait (); // mix_data must not execute before the threads are done
mix_data(myp);
start_barrier.wait (); // threads must not start new iteration before mix_data is done
}
return 0;
}
The following is a simple compiling and working code performing some random stuffs. It implements aleguna's concept of barrier. The task length of each thread is different so it is really necessary to have a strong synchronization mechanism. I will try to do a pool on the same tasks and benchmark the result, and then maybe with futures as pointed out by Andy Prowl.
#include <iostream>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <chrono>
#include <complex>
#include <random>
const unsigned int n_threads=4; //varying this will not (almost) change the total amount of work
const unsigned int task_length=30000/n_threads;
const float task_length_variation=task_length/n_threads;
unsigned int rep=1000; //repetitions of tasks
class t_chronometer{
private:
std::chrono::steady_clock::time_point _t;
public:
t_chronometer(): _t(std::chrono::steady_clock::now()) {;}
void reset() {_t = std::chrono::steady_clock::now();}
double get_now() {return std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - _t).count();}
double get_now_ms() {return
std::chrono::duration_cast<std::chrono::duration<double,std::milli>>(std::chrono::steady_clock::now() - _t).count();}
};
class t_barrier {
private:
std::mutex m_mutex;
std::condition_variable m_cond;
unsigned int m_threshold;
unsigned int m_count;
unsigned int m_generation;
public:
t_barrier(unsigned int count):
m_threshold(count),
m_count(count),
m_generation(0) {
}
bool wait() {
std::unique_lock<std::mutex> lock(m_mutex);
unsigned int gen = m_generation;
if (--m_count == 0)
{
m_generation++;
m_count = m_threshold;
m_cond.notify_all();
return true;
}
while (gen == m_generation)
m_cond.wait(lock);
return false;
}
};
using namespace std;
void do_something(complex<double> * c, unsigned int max) {
complex<double> a(1.,0.);
complex<double> b(1.,0.);
for (unsigned int i = 0; i<max; i++) {
a *= polar(1.,2.*M_PI*i/max);
b *= polar(1.,4.*M_PI*i/max);
*(c)+=a+b;
}
}
bool done=false;
void task(complex<double> * c, unsigned int max, t_barrier* start_barrier, t_barrier* end_barrier) {
while (!done) {
start_barrier->wait ();
do_something(c,max);
end_barrier->wait ();
}
cout << "task finished" << endl;
}
int main() {
t_chronometer t;
std::default_random_engine gen;
std::normal_distribution<double> dis(.0,1000.0);
complex<double> cpx[n_threads];
for (unsigned int i=0; i < n_threads; i++) {
cpx[i] = complex<double>(dis(gen), dis(gen));
}
t_barrier start_barrier (n_threads + 1); // child threads + main thread
t_barrier end_barrier (n_threads + 1); // child threads + main thread
std::thread mythread[n_threads];
unsigned long int sum=0;
for (unsigned int i=0; i < n_threads; i++) {
unsigned int max = task_length + i * task_length_variation;
cout << i+1 << "th task length: " << max << endl;
mythread[i] = std::thread(task, &cpx[i], max, &start_barrier, &end_barrier);
sum+=max;
}
cout << "total task length " << sum << endl;
complex<double> c(0,0);
for (unsigned long int j=1; j < rep+1; j++) {
start_barrier.wait (); //give to the threads the missing call to start
if (j==rep) done=true;
end_barrier.wait (); //wait for the call from each tread
if (j%100==0) cout << "cycle: " << j << endl;
for (unsigned int i=0; i<n_threads; i++) {
c+=cpx[i];
}
}
for (unsigned int i=0; i < n_threads; i++) {
mythread[i].join();
}
cout << "result: " << c << " it took: " << t.get_now() << " s." << endl;
return 0;
}