Concurrent queue gets stuck while dequeing - c++

I wrote the following code to implement a concurrent queue.
template <typename T>
class ConcurrentQueue
{
// Internal storage for a queue element
struct Element
{
T m_elem;
std::mutex m_mtx;
std::condition_variable m_cv;
bool m_hasElement = false;
};
public:
// The number of enqueued elements cannot go beyond p_capacity.
ConcurrentQueue(size_t p_capacity) :
m_elements(p_capacity),
m_approxCount(0),
m_actualCount(0),
m_front(0),
m_back(0)
{}
// Enqueues an element to the queue. Returns true on success and false
// if the enqueue failed due to the capacity being reached.
bool Enqueue(T p_element)
{
if (++m_approxCount > m_elements.size())
{
--m_approxCount;
return false;
}
++m_actualCount;
size_t slot = m_back.fetch_add(1) % m_elements.size();
auto& element = m_elements[slot];
std::lock_guard<std::mutex> lk(element.m_mtx);
element.m_elem = std::move(p_element);
element.m_hasElement = true;
element.m_cv.notify_one();
return true;
}
// Dequeues an element from the queue. Returns true on success and false
// if the dequeue failed due to the queue being empty.
bool Dequeue(T& p_element)
{
size_t count = m_actualCount.load();
if (count == 0)
{
return false;
}
while (!m_actualCount.compare_exchange_strong(count, count - 1))
{
if (count == 0)
{
return false;
}
}
size_t slot = m_front.fetch_add(1) % m_elements.size();
auto& element = m_elements[slot];
std::unique_lock<std::mutex> lk(element.m_mtx);
element.m_cv.wait(lk, [&element] { return element.m_hasElement; });
p_element = std::move(element.m_elem);
element.m_hasElement = false;
--m_approxCount;
return true;
}
private:
// Fixed size vector that stores the elements
std::vector<Element> m_elements;
// Approx count of number of elements in the queue.
std::atomic<size_t> m_approxCount;
// Actual count of the number of elements in the queue
std::atomic<size_t> m_actualCount;
// Index to the front of the queue
std::atomic<size_t> m_front;
// Index to the back of the queue
std::atomic<size_t> m_back;
};
and the following test to verify its functionality
int main()
{
int numElements = 1000;
int numThreads = 10;
ConcurrentQueue<int> q(numElements * numThreads / 2);
std::vector<std::thread> enqueueThreads;
for (int i = 0; i < numThreads; ++i)
{
enqueueThreads.emplace_back([&q, i, numElements]
{
for (int j = 0; j < numElements; ++j)
{
while (!q.Enqueue(i * numElements + j));
}
});
}
std::atomic<int> toDequeue = numElements * numThreads;
std::vector<std::thread> dequeueThreads;
for (int i = 0; i < numThreads; ++i)
{
dequeueThreads.emplace_back([&q, &toDequeue]
{
while (toDequeue > 0)
{
int element;
if (q.Dequeue(element))
{
--toDequeue;
}
}
});
}
for (auto& t : enqueueThreads)
{
t.join();
}
for (auto& t : dequeueThreads)
{
t.join();
}
}
In the debug build (VS2017), the test runs fine, but in the retail build, the main function doesn't return (the Dequeue threads don't complete) indicating a bug in the ConcurrentQueue implementation. What is the bug in the Enqueue or Dequeue method?

The Enqueue method needs to wait for slot to be free if the dequeuer hasn't freed it up.
The following code fixed the problem.
template <typename T>
bool ConcurrentQueue<T>::Enqueue(T p_element)
{
if (++m_approxCount > m_elements.size())
{
--m_approxCount;
return false;
}
size_t slot = m_back.fetch_add(1) % m_elements.size();
auto& element = m_elements[slot];
{
std::unique_lock<std::mutex> lk(element.m_mtx);
element.m_cv.wait(lk, [&element] { return !element.m_hasElement; });
element.m_elem = std::move(p_element);
element.m_hasElement = true;
element.m_cv.notify_all();
}
++m_actualCount;
return true;
}
template <typename T>
bool ConcurrentQueue<T>::Dequeue(T& p_element)
{
size_t count = UINT64_MAX;
while (!m_actualCount.compare_exchange_strong(count, count - 1))
{
if (count == 0)
{
return false;
}
}
size_t slot = m_front.fetch_add(1) % m_elements.size();
auto& element = m_elements[slot];
{
std::unique_lock<std::mutex> lk(element.m_mtx);
element.m_cv.wait(lk, [&element] { return element.m_hasElement; });
p_element = std::move(element.m_elem);
element.m_hasElement = false;
element.m_cv.notify_all();
}
--m_approxCount;
return true;
}

Related

Synchronize worker threads with a main thread

how to correctly synchronize worker threads with a main thread if a worker thread can generate another tasks? I've used std::queue to maintain tasks guarded by mutex and atomic variable to track busy threads. Unfortunately I'm facing deadlocks at the end of the execution.
I've extracted code from my project and created a following example (you can easily compile it with g++ or MSVC):
#include <iostream>
#include <thread>
#include <mutex>
#include <condition_variable>
#include <stdexcept>
#include <functional>
#include <stack>
#include <atomic>
#include <queue>
template <class T, class Compare>
class USort {
using Task = std::pair<T*, T*>;
private:
size_t m_ThreadsNum;
std::atomic<bool> m_Finished;
std::atomic<size_t> m_Busy;
std::thread* m_Threads;
std::queue<Task> m_Tasks;
size_t m_Size;
T* m_Data;
Compare m_Comparator;
std::condition_variable m_WaitFinished;
std::condition_variable m_WaitSorter;
std::mutex m_TaskQueueMutex;
private:
const size_t THREAD_THRESHOLD = 1024;
const size_t THREAD_POOL_THRESHOLD = 8192;
bool HasTask() {
std::unique_lock<std::mutex> lock(m_TaskQueueMutex);
return m_Tasks.size() > 0;
}
bool PopTask(T** L, T** R) {
std::unique_lock<std::mutex> lock(m_TaskQueueMutex);
if (m_Tasks.size() == 0) {
*L = *R = nullptr;
return false;
}
*L = m_Tasks.front().first;
*R = m_Tasks.front().second;
m_Tasks.pop();
return true;
}
void PushTask(T* L, T* R) {
std::unique_lock<std::mutex> lock(m_TaskQueueMutex);
m_Tasks.emplace(std::pair<T*, T*>(L, R));
m_WaitSorter.notify_one();
}
void SortThread(size_t Id) {
std::mutex sorter_mutex;
for (;;) {
std::unique_lock<std::mutex> lock(sorter_mutex);
///
/// ----------------------------------> some threads wait here
///
m_WaitSorter.wait(lock, [this]() { return m_Finished || HasTask(); });
if (m_Finished) break;
m_Busy++;
T *left, *right;
while (PopTask(&left, &right)) {
Sort(left, right);
}
if (--m_Busy == 0) {
m_WaitFinished.notify_one();
}
}
}
// just simulate work
void Sort(T* Left, T* Right) {
if (Right - Left > 10) {
PushTask(Left, Right-10);
}
}
void WaitForSortingIsFinished() {
std::mutex finished;
std::unique_lock<std::mutex> lock(finished);
m_WaitFinished.wait(lock, [this]() { return m_Busy == 0 && !HasTask(); });
}
void FinishThreads() {
m_Finished = true;
m_WaitSorter.notify_all();
}
void ReleaseThreads() {
if (m_Threads) {
for (size_t i = 0; i < m_ThreadsNum; i++) {
///
/// ----------------------------------> main thread stuck here
///
m_Threads[i].join();
}
delete[] m_Threads;
m_Threads = nullptr;
}
}
public:
USort(size_t NumberOfThreads = 0) : m_Comparator(Compare()) {
if (NumberOfThreads == 0) {
static const unsigned int max_concurrency = std::thread::hardware_concurrency();
NumberOfThreads = max_concurrency;
if (NumberOfThreads == 0) NumberOfThreads = 4;
}
m_Finished = false;
m_ThreadsNum = NumberOfThreads;
m_Threads = nullptr;
}
~USort() {
ReleaseThreads();
}
void Sort(T* Data, size_t Size) {
// build thread pool
m_Threads = new std::thread[m_ThreadsNum];
for (size_t i = 0; i < m_ThreadsNum; i++) {
m_Threads[i] = std::thread(&USort::SortThread, this, i);
}
// process data
PushTask(Data, Data + Size - 1);
WaitForSortingIsFinished();
FinishThreads();
}
};
template <class T, class Compare>
void usort(T* Data, size_t Size, size_t NumberOfThreads = 0) {
USort<T, Compare> mt_sorter(NumberOfThreads);
mt_sorter.Sort(Data, Size);
}
const size_t ARR_SIZE = 0x00010000;
struct comp {
bool operator()(const int& L, const int& R) const {
return L < R;
}
};
int main()
{
int* arr = new int[ARR_SIZE];
for (int i = 0; i < ARR_SIZE; i++) {
arr[i] = rand() % 3200000;
}
usort<int, comp>(arr, ARR_SIZE, 16);
delete[] arr;
return 0;
}
The thing is, that in my example threads aren't always finished. From time to time some thread pending in m_WaitSorter.wait() and therefore main thread pending in m_Threads[i].join();. Where is the flaw in the logic. Why the calling to FinishThreads() doesn't finish all threads?
EDIT:
Basically I'd like to implement multithread sorting algorithm.
The main thread creates thread pool, push first task(sort whole array) to a task queue and waits for sorting to be finished
The pool thread takes task, divide it to smaller tasks(1-3). One of this task is immediatelly processed by the current pool thread, others are push to the queue
The pool thread musn't finish until the whole data set is sorted(there are no task in the queue and all pool threads are pending)
When the sorting is finished the main thread should be woken
Main thread should finish pending threads
So for this, from my perspective, I need two conditional_variabes with predicate "all threads are pending && has no task in queue" in main thread and "has task in queue || finish thread" in pool thread.
OK, I've read the documentation through carefully and found a bug in my code. Calls to notify_one(), notify_all() and wait() have to be controlled via the same mutext. With that in mind I've update and little bit simplified my code:
bool WaitAndPopTask(T** L, T** R) {
std::unique_lock<std::mutex> lock(m_TaskQueueMutex);
m_WaitSorter.wait(lock, [this]() { return m_Finished || !m_Tasks.empty(); });
if (m_Finished) return false;
m_Busy++;
*L = m_Tasks.front().first;
*R = m_Tasks.front().second;
m_Tasks.pop();
return true;
}
void SortThread(size_t Id) {
for (;;) {
T *left, *right;
if (!WaitAndPopTask(&left, &right)) break;
Sort(left, right);
std::lock_guard<std::mutex> lk(m_TaskQueueMutex);
if (--m_Busy == 0 && m_Tasks.empty()) {
FinishThreads();
}
}
}
void Sort(T* Data, size_t Size) {
// build thread pool
m_Threads = new std::thread[m_ThreadsNum];
for (size_t i = 0; i < m_ThreadsNum; i++) {
m_Threads[i] = std::thread(&USort::SortThread, this, i);
}
// process data
PushTask(Data, Data + Size - 1);
ReleaseThreads();
}

C++ Multiple consumer threads stuck on condition variable

I'm making a single producer, multiple consumers program in C++. I begin by calling consumer threads and then I add elements to an array.
Everything works fine, but in the end the consumer threads are not joining, because they're stuck waiting on condition variable and the program freezes.
I think the problem is that threads are constantly called in the loop because currentSize is not protected and they just can't exit out of the condition variable, but I don't know how to fix it.
struct Item {
public:
string name;
int time;
double height;
};
struct Monitor {
private:
Item items[12];
int currentSize;
bool finished;
mutex lock;
condition_variable cv;
public:
Monitor() {
finished = false;
currentSize = 0;
}
void put(Item item) {
unique_lock<mutex> guard(lock);
cv.wait(guard, [&] { return (currentSize < 12); });
items[currentSize] = item;
currentSize++;
cv.notify_all();
}
Item get() {
unique_lock<mutex> guard(lock);
cv.wait(guard, [&] { return (currentSize > 0); });
Item item = items[currentSize - 1];
currentSize--;
return item;
}
bool get_finished() {
return finished;
}
void set_finished() {
finished = true;
}
int get_size() {
return currentSize;
}
};
int main() {
vector<Item> items = read_file(file);
Monitor monitor;
vector<thread> threads;
vector<Item> results;
for (int i = 0; i < 4; i++) {
threads.emplace_back([&] {
while (!monitor.get_finished()) {
if (monitor.get_size() > 0) {
Item item = monitor.get();
results.push_back(item);
}
}
});
}
for (int i = 0; i < items.size(); i++) {
monitor.put(items[i]);
}
monitor.set_finished();
for_each(threads.begin(), threads.end(), mem_fn(&thread::join));
return 0;
}
Why the consumer threads block?
I have tested your code, and it turns out to be the producer thread blocking on the put() method. Why?
Imagine the following scenario: there are 13 items in the vector items.
The main thread (producer) happily loads the first 12 items, and waits on cv for the currentSize to become lower than 12.
The consumer threads are notified, and happily consume the first 12 items, and then wait on cv for currentSize to become greater than 0.
But wait! Now everyone is waiting on something, with no one notifying. Thus, all threads would block. You need to notify the producer when currentSize becomes lower than 12.
I noticed a few issues. made the member variables atomic, notify_all in get api. However there was al logic error as well. Imagine that you have 4 threads currently running and 5 items were in queue. At this point lets say each of the thread is able to get one out of the queue and now there are 4 threads and only one item in the queue. One of the thread takes the last one out and now there is 0 items in there however other three threads still waiting on the condition variable. So a solution is if the last item is out everythread should be notified and if there is no other elemnet get back from the API.
#include <iostream>
#include <vector>
#include <condition_variable>
#include <thread>
#include <algorithm>
#include <atomic>
using namespace std;
using Item = int;
struct Monitor {
private:
Item items[12];
std::atomic<int> currentSize;
std::atomic<bool> finished;
mutex lock;
condition_variable cv;
public:
Monitor() {
finished = false;
currentSize = 0;
}
void put(Item item) {
unique_lock<mutex> guard(lock);
cv.wait(guard, [&] { return (currentSize < 12); });
items[currentSize] = item;
currentSize++;
cv.notify_all();
std::cerr << "+ " << currentSize << std::endl ;
}
Item get() {
unique_lock<mutex> guard(lock);
cv.wait(guard, [&] { return (currentSize >= 0 ); });
Item item;
if (currentSize > 0 ){
currentSize--;
item = items[currentSize];
cv.notify_all();
std::cerr << "- " << currentSize << std::endl ;
}
return item;
}
bool get_finished() {
return finished;
}
void set_finished() {
finished = true;
}
int get_size() {
return currentSize;
}
};
int main() {
vector<Item> items(200);
std::fill ( items.begin() , items.end(), 100);
Monitor monitor;
vector<thread> threads;
vector<Item> results;
for (int i = 0; i < 10; i++) {
threads.emplace_back([&] {
while ( !monitor.get_finished() ) {
if (monitor.get_size() > 0) {
Item item = monitor.get();
results.push_back(item);
}
}
});
}
for (int i = 0; i < items.size(); i++) {
monitor.put(items[i]);
}
monitor.set_finished();
for_each(threads.begin(), threads.end(), mem_fn(&thread::join));
return 0;
}

C++ Data Strutures Queue: Find the largest element in the queue using for loop

I need to find the largest element in an unsorted queue, remove it and store it in an auxiliary queue, then place it back as the first element in the main queue, WITHOUT using the standard queue functions from C++.
I tried to use a for loop to go through the queue and search for the largest element, but it didn't work. Any suggestion on how to do it properly?
//The queue was created as a class as follows:
class Queue{
private:
int arrayqueue[size];
int start, end, counter;
public:
Queue();
bool empty();
bool full();
bool insert(int item);
bool remove();
bool front(int &item);
int counter_size();
void copy(Queue &F);
bool equal(Queue &F);
void print();
int largest_value(Queue &F,int n);
};
Queue::Queue(){
counter = 0;
start = 0;
end = size -1;
}
This is the function I tried to build to find the largest element in the queue and store in the auxiliary queue:
//Create an auxiliary queue
//Find the largest element in the main queue, then remove it and insert it in the aux queue
int Queue::largest_value(Queue &F,int n){
//int *Queue;
// n is the size of the queue
Queue aux;
int largest = Queue[0];
for(int i = 1;i <n; i++) {
if(largest < Queue[i])
largest = Queue[i];
}
F.remove();
aux.insert(largest);
return largest;
}
Function to verify if it's empty:
bool Queue::empty(){
if(counter == 0)
return true;
else
return false;
}
The function to remove elements:
bool Queue::remove(){
if(empty()==true)
return false;
if(start==size-1)
start = 0;
else
start++;
counter--;
return true;
}
Try something more like this:
class Queue{
private:
int arrayqueue[size];
int ... counter;
public:
...
bool remove_largest_value(int &value);
};
...
bool Queue::remove_largest_value(int &value) {
if (counter == 0)
return false;
int largest = 0;
for(int i = 1; i < counter; ++i) {
if (arrayqueue[largest] < arrayqueue[i])
largest = i;
}
value = arrayqueue[largest];
for(int i = largest + 1; i < counter; ++i) {
arrayqueue[i-1] = arrayqueue[i];
}
--counter;
return true;
}
Then you can do this:
Queue main;
// populate main as needed...
Queue aux;
int item;
if (main.remove_largest_value(item))
{
aux.insert(item);
...
main.insert(item);
}

boost mutex in parallel quicksort

This is my first time using mutexes so I am not exactly sure about what I am doing but I think I am having an error with the thread safety of the push_back function using the vector container (I am having multiple threads write to it at the same time and getting this error):
* glibc detected * ./quicksort: double free or corruption (out): 0x00007f2638000980 *
To solve this I added a mutex but it didn't seem to do anything, the code is here:
void parallel_quicksort(vector<int>& input)
{
boost::mutex mutex;
queue<pr_pair> partitions, temp_partitions;
vector<pr_pair> jobs;
parallel_partition(input, partitions, 0, input.size());
pr_pair temp;
while(1)
{
boost::thread_group threadpool;
while(!partitions.empty())
{
temp = partitions.front();
partitions.pop();
jobs.push_back(temp);
if (jobs.size() == NUM_THREADS)
{
for (int i = 0; i < NUM_THREADS; i++)
{
temp = jobs.back();
jobs.pop_back();
threadpool.create_thread(boost::bind(&parallel_partition, boost::ref(input), boost::ref(temp_partitions), temp.p, temp.r));
}
threadpool.join_all();
}
}
while(!jobs.empty())
{
temp = jobs.back();
jobs.pop_back();
threadpool.create_thread(boost::bind(&parallel_partition, boost::ref(input), boost::ref(temp_partitions), temp.p, temp.r));
}
threadpool.join_all();
while(!temp_partitions.empty())
{
temp = temp_partitions.front();
partitions.push(temp);
temp_partitions.pop();
}
if(partitions.empty())
{
break;
}
}
return;
}
void parallel_partition(vector<int>& input, queue<pr_pair>& partitions, int p, int r)
{
int p_store = p;
int r_store = r;
int pivot = input[r];
while (p<r)
{
while (input[p] < pivot)
p++;
while (input[r] > pivot)
r--;
if (input[p] == input[r])
p++;
else if (p<r)
{
int tmp = input[p];
input[p] = input[r];
input[r] = tmp; }
}
pr_pair temp;
if (r-1 > p_store)
{
boost::mutex::scoped_lock scoped_lock(mutex);
temp.p = p_store;
temp.r = r-1;
partitions.push(temp);
}
if (r_store > r+1)
{
boost::mutex::scoped_lock scoped_lock(mutex);
temp.p = r+1;
temp.r = r_store;
partitions.push(temp);
}
return;
}
Quickly scanning the code it seems that you guarded access to the partitions data structure, but your input data structure is modified as well in the parallel_partition method. So that could cause problems.

How do I make a circular queue thread-safe?

So my Enqueue and Dequeue functions are below. How do I take what I have and make it thread safe? I thought about using a mutex from Windows.h, but I'd like to not limit my program to Windows-only, if possible.
void Queue::Enqueue(int num){
//increase recorded size
size++;
//stick in num
numbers[nextSpace] = num;
//find the next available space
nextSpace = (++nextSpace) % maxSize;
}
int Queue::Dequeue(){
int temp;
temp = items[curSpace];
curSpace = (++curSpace) % maxSize;
size--;
return temp;
}
You can refer this code (with pthreads):
#include<pthread.h>
#define DEFAULT_SIZE 100
class circularQueue{
private:
int *m_queue;
int p_head;
int p_tail;
int m_cap;
pthread_mutex_t mp = PTHREAD_MUTEX_INITIALIZER;
public:
circularQueue(int size)
{
/*in case invalid input*/
if(size<0)
size = DEFAULT_SIZE ;
m_queue = new int[size];
p_head = 0;
p_tail = -1;
m_cap = 0;
pthread_mutex_init(&mp,NULL);
}
bool enqueue(int x)
{
bool res= false;
p_thread_mutex_lock(&mp);
/*queue is full*/
if(m_cap == size)
{
res = false;
}
else
{
m_queue[(++p_tail)%size)] = x;
++m_cap;
res = true;
}
p_thread_mutex_unlock(&mp);
return res;
}
int dequeue()
{
int res=0;
pthread_mutex_lock(&mp);
/*empty queue*/
if(m_cap == 0)
{
throw("empty queue!");
pthread_mutex_unlock(&mp);
}
else{
res = m_queue[p_head];
p_head = (p_head+1)%size;
}
pthread_mutex_unlock(&mp);
return res;
}
~virtual circularQueue()
{
delete[] m_queue;
m_queue = NULL;
pthread_mutex_destroy(&mp);
}
}