I have an implementation of a lock-free queue, which I believe to be correct (or at least data-race-free):
#include <atomic>
#include <iostream>
#include <optional>
#include <thread>
struct Job {
int id;
int data;
};
class JobQueue {
using stdmo = std::memory_order;
struct Node {
std::atomic<Node *> next = QUEUE_END;
Job job;
};
static inline Node *const QUEUE_END = nullptr;
static inline Node *const STACK_END = QUEUE_END + 1;
struct GenNodePtr {
Node *node;
std::uintptr_t gen;
};
alignas(64) std::atomic<Node *> jobs_back;
alignas(64) std::atomic<GenNodePtr> jobs_front;
alignas(64) std::atomic<GenNodePtr> stack_top;
public:
JobQueue()
: jobs_back{new Node{}},
jobs_front{GenNodePtr{jobs_back.load(stdmo::relaxed), 1}},
stack_top{GenNodePtr{STACK_END, 1}} {}
~JobQueue() {
Node *cur_queue = jobs_front.load(stdmo::relaxed).node;
while (cur_queue != QUEUE_END) {
Node *next = cur_queue->next;
delete cur_queue;
cur_queue = next;
}
Node *cur_stack = stack_top.load(stdmo::relaxed).node;
while (cur_stack != STACK_END) {
Node *next = cur_stack->next;
delete cur_stack;
cur_stack = next;
}
}
Node *allocate_node() {
GenNodePtr cur_stack = stack_top.load(stdmo::acquire);
while (true) {
if (cur_stack.node == STACK_END) {
return new Node{};
}
Node *cur_stack_next = cur_stack.node->next.load(stdmo::relaxed);
GenNodePtr new_stack{cur_stack_next, cur_stack.gen + 1};
if (stack_top.compare_exchange_weak(cur_stack, new_stack,
stdmo::acq_rel)) {
return cur_stack.node;
}
}
}
void deallocate_node(Node *node) {
GenNodePtr cur_stack = stack_top.load(stdmo::acquire);
while (true) {
node->next.store(cur_stack.node, stdmo::relaxed);
GenNodePtr new_stack{node, cur_stack.gen + 1};
if (stack_top.compare_exchange_weak(cur_stack, new_stack,
stdmo::acq_rel)) {
break;
}
}
}
public:
void enqueue(Job job) {
Node *new_node = allocate_node();
new_node->next.store(QUEUE_END, stdmo::relaxed);
Node *old_dummy = jobs_back.exchange(new_node, stdmo::acq_rel);
old_dummy->job = job;
old_dummy->next.store(new_node, stdmo::release);
}
std::optional<Job> try_dequeue() {
GenNodePtr old_front = jobs_front.load(stdmo::relaxed);
while (true) {
Node *old_front_next = old_front.node->next.load(stdmo::acquire);
if (old_front_next == QUEUE_END) {
return std::nullopt;
}
GenNodePtr new_front{old_front_next, old_front.gen + 1};
if (jobs_front.compare_exchange_weak(old_front, new_front,
stdmo::relaxed)) {
break;
}
}
Job job = old_front.node->job;
deallocate_node(old_front.node);
return job;
}
};
int main() {
JobQueue queue;
std::atomic<int> i = 0;
std::thread consumer{[&queue, &i]() {
// producer enqueues 1
while (i.load(std::memory_order_relaxed) != 1) {}
std::atomic_thread_fence(std::memory_order_acq_rel);
std::cout << queue.try_dequeue().value_or(Job{-1, -1}).data
<< std::endl;
std::atomic_thread_fence(std::memory_order_acq_rel);
i.store(2, std::memory_order_relaxed);
// producer enqueues 2 and 3
}};
std::thread producer{[&queue, &i]() {
queue.enqueue(Job{1, 1});
std::atomic_thread_fence(std::memory_order_acq_rel);
i.store(1, std::memory_order_relaxed);
// consumer consumes here
while (i.load(std::memory_order_relaxed) != 2) {}
std::atomic_thread_fence(std::memory_order_acq_rel);
queue.enqueue(Job{2, 2});
queue.enqueue(Job{3, 3});
}};
producer.join();
consumer.join();
return 0;
}
This queue is implemented as a singly-linked, double-ended list. It uses a dummy node to decouple producers and consumers, and it uses a generation counter plus node recycling (through an internal free-list stack) to avoid the ABA problem and a use-after-free in try_dequeue.
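For context, the ABA scenario that the generation counter guards against on the internal free-list stack looks like this with a plain pointer CAS (my own illustration, not part of the program):
// Hypothetical interleaving on a free-list with a plain std::atomic<Node*> top
// and no generation counter:
//
//   T1: old  = top.load();                  // reads node A
//   T1: next = A->next;                     // reads node B
//   T2: pops A, pops B, pushes A back       // top is A again, but A->next != B
//   T1: top.compare_exchange_strong(A, B);  // succeeds, yet B is no longer on the stack
//
// Packing a generation counter next to the pointer (GenNodePtr) makes T1's CAS
// fail in this interleaving, because T2's operations bumped the counter.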
Running this under TSan compiled with Clang 13.0.1, Linux x64, I get the following race:
WARNING: ThreadSanitizer: data race (pid=223081)
Write of size 8 at 0x7b0400000008 by thread T2:
#0 JobQueue::enqueue(Job) .../bug4.cpp:85 (bug4.tsan+0xe3e53)
#1 operator() .../bug4.cpp:142 (bug4.tsan+0xe39ee)
...
Previous read of size 8 at 0x7b0400000008 by thread T1:
#0 JobQueue::try_dequeue() .../bug4.cpp:104 (bug4.tsan+0xe3c07)
#1 operator() .../bug4.cpp:121 (bug4.tsan+0xe381c)
...
Run on Godbolt (note, because of how Godbolt runs the program, TSan doesn't show line number information)
This race is between this previous read in try_dequeue called from the consumer thread:
Job job = old_front.node->job;
and this later write in enqueue, which is the third call to enqueue by the producer thread:
old_dummy->job = job;
I believe this race to be impossible, because the producer thread should synchronise with the consumer thread via the acquire-release compare-exchange to stack_top in allocate_node and deallocate_node.
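Spelled out against the code above, the ordering I expect for the two flagged accesses is (D being the original dummy node, which gets recycled through the free-list stack):
// consumer:  Job job = old_front.node->job;                 // read of D->job
//              ...sequenced-before...
// consumer:  deallocate_node(D): release CAS on stack_top   // pushes D
//              ...synchronizes-with...
// producer:  allocate_node(): acquire CAS on stack_top      // pops D (2nd enqueue)
//              ...sequenced-before...
// producer:  old_dummy->job = job;                          // write of D->job (3rd enqueue)
//
// so the read should happen-before the write, i.e. no data race.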
Now, the weird thing is that making GenNodePtr alignas(32) removes the race.
Run on Godbolt
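Whether the double-width atomic is actually lock-free under each alignment can be checked with a small standalone probe like the one below (my own snippet; GenNodePtr is redeclared here for illustration, and on x86-64 the 16-byte case may need -mcx16 and/or -latomic):
#include <atomic>
#include <cstdint>
#include <iostream>

// Standalone probe: a 16-byte pointer+counter pair, with and without the extra
// alignment, to see whether std::atomic implements it lock-free.
struct GenPtr16 { void *node; std::uintptr_t gen; };
struct alignas(32) GenPtr32 { void *node; std::uintptr_t gen; };

int main() {
    std::atomic<GenPtr16> a{};
    std::atomic<GenPtr32> b{};
    std::cout << a.is_lock_free() << ' ' << b.is_lock_free() << '\n';
}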
Questions:
Is this race actually possible?
Why does increasing the alignment of GenNodePtr make TSan no longer register a race?
Related
For study purposes, I'm comparing implementations of single-producer single-consumer queues, so I compared a condition-variable implementation with a C++20 counting-semaphore implementation. I would have guessed that the semaphore implementation would be faster, but that is not the case: on Windows with MSVC, on my computer, the semaphore implementation is about 25% slower. I've included both implementations below.
The condition variable implementation has a small functional advantage: aborting operations can be achieved with the done() API function, while the semaphore implementation requires a special ‘stop’ value to be queued to unlock and exit the pulling thread.
I had imagined that a single-producer single-consumer queue was a typical application for semaphores, but apparently not.
Now I wonder:
Did I do something unwise that makes my semaphore implementation needlessly slow?
Is the Microsoft counting semaphore implementation possibly too slow?
Or do requirements in the C++ standard make the semaphore slow in general?
Am I just mistaken that a queue is a proper application for semaphores?
When a queue is not a proper application, for what other application does the semaphore outperform the condition variable?
Condition variable implementation:
#include <array>
#include <mutex>
#include <condition_variable>
/*
* locked_single_producer_single_consumer_queue_T is responsible for locked packet communication
* between 2 threads. One thread pushes, the other thread pulls.
*/
template<class T, int N = 16> // N must be a power of 2
class locked_single_producer_single_consumer_queue_T
{
public:
/* When the packet fits in the queue, push shall return immediately. Otherwise it will block until it can push the packet. */
void push(T const& packet)
{
std::unique_lock<std::mutex> lock(m_mutex);
m_cv.wait(lock, [this] {return ((m_tail - m_head) & m_mask) != 1; });
m_data[m_head++] = packet;
m_head &= m_mask;
lock.unlock();
m_cv.notify_one();
}
/* When a packet can be retrieved from the queue, pull shall return immediately. Otherwise it will block until it can pull a packet. */
bool pull(T& packet)
{
std::unique_lock<std::mutex> lock(m_mutex);
m_cv.wait(lock, [this] {return (((m_head - m_tail) & m_mask) != 0) || m_done; });
if(((m_head - m_tail) & m_mask) != 0) [[likely]]
{
packet = m_data[m_tail++];
m_tail &= m_mask;
lock.unlock();
m_cv.notify_one();
return true;
}
return false;
}
/* done() indicates that the pushing thread stopped. The pulling thread can continue reading
the remainder of the queue and should then return */
void done()
{
{
std::lock_guard<std::mutex> lock(m_mutex);
m_done = true;
}
m_cv.notify_one();
}
private:
static_assert((N& (N - 1)) == 0, "N must be a power of 2");
static signed int const m_mask = N - 1;
using data_t = std::array<T, N>;
data_t m_data;
std::mutex m_mutex;
std::condition_variable m_cv;
int m_tail{ 0 };
int m_head{ 0 };
bool m_done{};
};
Semaphore implementation:
#include <array>
#include <semaphore>
#include <atomic>
/*
* locked_single_producer_single_consumer_queue2_T is responsible for locking packet communication
* between 2 threads. One thread pushes, the other thread pulls.
*/
template<class T, int N = 16> // N must be a power of 2
class locked_single_producer_single_consumer_queue2_T
{
public:
/* When the packet fits in the queue, push shall return immediately. Otherwise it will block until it can push the packet. */
void push(T const& packet)
{
m_available_space.acquire();
int head = m_head.load(std::memory_order_acquire);
m_data[head++ & m_mask] = packet;
m_head.store(head, std::memory_order_release);
m_available_packages.release();
}
/* When a packet can be retrieved from the queue, pull shall return immediately. Otherwise it will block until it can pull a packet. */
T pull()
{
m_available_packages.acquire();
int tail = m_tail.load(std::memory_order_acquire);
T packet = m_data[tail++ & m_mask];
m_tail.store(tail, std::memory_order_release);
m_available_space.release();
return packet;
}
private:
static_assert((N& (N - 1)) == 0, "N must be a power of 2");
static signed int const m_mask = N - 1;
using data_t = std::array<T, N>;
data_t m_data;
std::atomic_int m_tail{ 0 };
std::atomic_int m_head{ 0 };
std::counting_semaphore<N> m_available_space{ N };
std::counting_semaphore<N> m_available_packages{ 0 };
};
*** EDIT ***
Upon request, I've also included a complete test program. It already includes both implementations. (It needs C++20 with semaphores)
#include <array>
#include <mutex>
#include <condition_variable>
#include <semaphore>
#include <atomic>
#include <iostream>
#include <vector>
#include <algorithm>
#include <future>
/*
* locked_single_producer_single_consumer_queue_T is responsible for locked packet communication
* between 2 threads. One thread pushes, the other thread pulls.
*/
template<class T, int N = 16> // N must be a power of 2
class locked_single_producer_single_consumer_queue_T
{
public:
/* When the packet fits in the queue, push shall return immediately. Otherwise it will block until it can push the packet. */
void push(T const& packet)
{
std::unique_lock<std::mutex> lock(m_mutex);
m_cv.wait(lock, [this] {return ((m_tail - m_head) & m_mask) != 1; });
m_data[m_head++] = packet;
m_head &= m_mask;
lock.unlock();
m_cv.notify_one();
}
/* When a packet can be retrieved from the queue, pull shall return immediately. Otherwise it will block until it can pull a packet. */
bool pull(T& packet)
{
std::unique_lock<std::mutex> lock(m_mutex);
m_cv.wait(lock, [this] {return (((m_head - m_tail) & m_mask) != 0) || m_done; });
if (((m_head - m_tail) & m_mask) != 0) [[likely]]
{
packet = m_data[m_tail++];
m_tail &= m_mask;
lock.unlock();
m_cv.notify_one();
return true;
}
return false;
}
/* done() indicates that the pushing thread stopped. The pulling thread can continue reading
the remainder of the queue and should then return */
void done()
{
{
std::lock_guard<std::mutex> lock(m_mutex);
m_done = true;
}
m_cv.notify_one();
}
private:
static_assert((N& (N - 1)) == 0, "N must be a power of 2");
static signed int const m_mask = N - 1;
using data_t = std::array<T, N>;
data_t m_data;
std::mutex m_mutex;
std::condition_variable m_cv;
int m_tail{ 0 };
int m_head{ 0 };
bool m_done{};
};
/*
* locked_single_producer_single_consumer_queue2_T is responsible for locking packet communication
* between 2 threads. One thread pushes, the other thread pulls.
*/
template<class T, int N = 16> // N must be a power of 2
class locked_single_producer_single_consumer_queue2_T
{
public:
/* When the packet fits in the queue, push shall return immediately. Otherwise it will block until it can push the packet. */
void push(T const& packet)
{
m_available_space.acquire();
int head = m_head.load(std::memory_order_acquire);
m_data[head++ & m_mask] = packet;
m_head.store(head, std::memory_order_release);
m_available_packages.release();
}
/* When a packet can be retrieved from the queue, pull shall return immediately. Otherwise it will block until it can pull a packet. */
T pull()
{
m_available_packages.acquire();
int tail = m_tail.load(std::memory_order_acquire);
T packet = m_data[tail++ & m_mask];
m_tail.store(tail, std::memory_order_release);
m_available_space.release();
return packet;
}
private:
static_assert((N& (N - 1)) == 0, "N must be a power of 2");
static signed int const m_mask = N - 1;
using data_t = std::array<T, N>;
data_t m_data;
std::atomic_int m_tail{ 0 };
std::atomic_int m_head{ 0 };
std::counting_semaphore<N> m_available_space{ N };
std::counting_semaphore<N> m_available_packages{ 0 };
};
/******************************************************************************************************/
using implementation_t = bool;
implementation_t const condition_variable = false;
implementation_t const semaphore = true;
/*
* pusher() is a thread function that is responsible for pushing a defined
* sequence of integers in the lock_free queue
*/
std::atomic_int sum_ref{};
template<class queue_t>
void pusher(std::atomic_bool& do_continue_token, queue_t& queue)
{
int i = 0;
while (do_continue_token.load(std::memory_order_acquire))
{
queue.push(i);
sum_ref += i;
++i;
}
}
/*
* puller() is a thread function that is responsible for pulling
* integers from the lock_free queue, and compare it with the
* expected sequence
*/
std::atomic_int sum_check{};
template<implementation_t implementation, class queue_t>
int puller(queue_t& queue)
{
int i;
if constexpr (implementation == condition_variable)
{
while (queue.pull(i))
{
sum_check += i;
}
}
if constexpr (implementation == semaphore)
{
int j;
while ((j = queue.pull()) != -1)
{
sum_check += j;
i = j;
}
}
return i;
}
/*
* test() is responsible for kicking off two threads that push and pull from
* the queue for a duration of 10s. Test returns the last integer value that was
* pulled from the queue as an indication of speed.
*/
template<implementation_t implementation, class queue_t>
int test()
{
using namespace std::chrono_literals;
std::atomic_bool do_continue_token(true);
queue_t queue;
std::cout << '<' << std::flush;
std::future<void> fpusher = std::async(pusher<queue_t>, std::ref(do_continue_token), std::ref(queue));
std::future<int> fpuller = std::async(puller<implementation, queue_t>, std::ref(queue));
std::this_thread::sleep_for(10s);
do_continue_token.store(false, std::memory_order_release);
fpusher.wait();
if constexpr (implementation == condition_variable)
{
queue.done(); // to stop the waiting thread
}
if constexpr (implementation == semaphore)
{
queue.push(-1); // to stop the waiting thread
}
int i = fpuller.get();
if (sum_check != sum_ref)
{
throw;
}
std::cout << '>' << std::endl;
return i;
}
/*
* main() is responsible for performing multiple tests of different implementations.
* Results are collected, ordered and printed.
*/
int main()
{
struct result_t
{
std::string m_name;
int m_count;
};
using condition_variable_queue_t = locked_single_producer_single_consumer_queue_T<int, 1024>;
using semaphore_queue_t = locked_single_producer_single_consumer_queue2_T<int, 1024>;
std::vector<result_t> results // 6 runs
{
{ "condition_variable", test<condition_variable, condition_variable_queue_t>() },
{ "semaphore", test<semaphore, semaphore_queue_t>() },
{ "condition_variable", test<condition_variable, condition_variable_queue_t>() },
{ "semaphore", test<semaphore, semaphore_queue_t>() },
{ "condition_variable", test<condition_variable, condition_variable_queue_t>() },
{ "semaphore", test<semaphore, semaphore_queue_t>() },
};
std::sort(results.begin(), results.end(), [](result_t const& lhs, result_t const& rhs) { return lhs.m_count < rhs.m_count; });
std::cout << "The higher the count, the faster the solution" << std::endl;
for (result_t const& result : results)
{
std::cout << result.m_name << ": " << result.m_count << std::endl;
}
}
Output of a run:
<>
<>
<>
<>
<>
<>
The higher the count, the faster the solution
semaphore: 58304215
semaphore: 59302013
semaphore: 61896024
condition_variable: 84140445
condition_variable: 87045903
condition_variable: 90893057
My question kept bothering me, so I investigated Microsoft's current implementation of semaphores. The counting semaphore has two atomics and implements the blocking wait with a wait on one of those atomics. Note that as long as the semaphore count does not reach zero, that atomic wait is never called. The implementation also only notifies (the atomic) when it is sure that at least one thread is waiting on it. Still, the semaphore implementation depends on the new C++20 wait/notify functions.
The new C++20 wait/notify functions are implemented with a pool of condition variables. I guess that is close to optimal; at least, I wouldn't know a faster way.
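As a rough mental model (my own sketch, not Microsoft's actual code), a counting semaphore built on the C++20 atomic wait/notify machinery looks roughly like this; the fast path is a plain atomic read-modify-write, and the slow path ends up in the same condition-variable-backed wait pool (the real implementation additionally avoids notifying when no thread is waiting):
#include <atomic>

class sketch_semaphore {
public:
    explicit sketch_semaphore(int initial) : m_count(initial) {}

    void release() {
        m_count.fetch_add(1, std::memory_order_release);
        m_count.notify_one();            // wake a waiter parked in acquire()
    }

    void acquire() {
        int old = m_count.load(std::memory_order_relaxed);
        for (;;) {
            while (old == 0) {
                m_count.wait(0);         // blocks only while the count is zero
                old = m_count.load(std::memory_order_relaxed);
            }
            if (m_count.compare_exchange_weak(old, old - 1,
                                              std::memory_order_acquire))
                return;                  // successfully took one unit
        }
    }

private:
    std::atomic<int> m_count;
};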
Bottom line: this semaphore implementation is ultimately based on condition variables, and then I can imagine that the above-mentioned "condition variable implementation" is faster. Assuming the mutex is not locked most of the time, acquiring it is cheap. Assuming that (due to the large queue size of 1024) we almost never have to wait on the condition-variable predicate, m_cv.wait() is also cheap.
The "semaphore implementation" is in effect almost the same, only now two atomics (m_head and m_tail) need to be read and written. In the "condition variable implementation" the mutex implicitly protected these variables. So my conclusion is that these two atomics in the "semaphore implementation" make the difference, and unfortunately you cannot do without them (in the "semaphore implementation"), so the "condition variable implementation" is faster.
To answer the question:
Q: Did I do something unwise that makes my semaphore implementation needlessly slow?
A: Not that I know (yet)
Q: Is the Microsoft counting semaphore implementation possibly too slow?
A: Does not look like it
Q: Or do requirements in the C++ standard make the semaphore slow in general?
A: Again, does not look like it.
Q: Am I just mistaken that a queue is a proper application for semaphores?
A: Yes, that was probably true in the early days.
Q: When a queue is not a proper application, for what other application does the semaphore outperform the condition variable?
A: Don’t know yet. Possibly an application with simple waiting for limited resources.
I'm not used to posting questions on the internet, so please tell me if I'm doing something wrong.
In short
How do I correctly prevent false sharing on a 64-bit architecture with a CPU cache-line size of 64 bytes?
How can the use of the C++ 'alignas' keyword and a simple byte array (e.g. char[64]) affect multithreading efficiency?
Context
While working on a very efficient implementation of a single-producer single-consumer queue, I've encountered seemingly illogical behaviour from the GCC compiler while benchmarking my code.
Full story
I hope somebody will have the necessary knowledge to explain what is going on.
I'm currently using GCC 10.2.0 and its C++20 implementation on Arch Linux. My laptop is a Lenovo T470S with an i7-7500U processor.
Let me begin with the data structure:
class SPSCQueue
{
public:
...
private:
alignas(64) std::atomic<size_t> _tail { 0 }; // Tail accessed by both producer and consumer
Buffer _buffer {}; // Buffer cache for the producer, equivalent to _buffer2
std::size_t _headCache { 0 }; // Head cache for the producer
char _pad0[64 - sizeof(Buffer) - sizeof(std::size_t)]; // 64 bytes alignment padding
alignas(64) std::atomic<size_t> _head { 0 }; // Head accessed by both producer and consumer
Buffer _buffer2 {}; // Buffer cache for the consumer, equivalent to _buffer
std::size_t _tailCache { 0 }; // Tail cache for the consumer
char _pad1[64 - sizeof(Buffer) - sizeof(std::size_t)]; // 64 bytes alignment padding
};
This data structure achieves a fast and stable 20 ns per push / pop on my system.
However, merely changing the alignment to use the following members makes the benchmark unstable, giving between 20 and 30 ns.
alignas(64) std::atomic<size_t> _tail { 0 }; // Tail accessed by both producer and consumer
struct alignas(64) {
Buffer _buffer {}; // Buffer cache for the producer, equivalent to _buffer2
std::size_t _headCache { 0 }; // Head cache for the producer
};
alignas(64) std::atomic<size_t> _head { 0 }; // Head accessed by both producer and consumer
struct alignas(64) {
Buffer _buffer2 {}; // Buffer cache for the consumer, equivalent to _buffer
std::size_t _tailCache { 0 }; // Tail cache for the consumer
};
Lastly, I got even more lost when I tried this configuration, which gives results between 40 and 55 ns.
std::atomic<size_t> _tail { 0 }; // Tail accessed by both producer and consumer
char _pad0[64 - sizeof(std::atomic<size_t>)];
Buffer _buffer {}; // Buffer cache for the producer, equivalent to _buffer2
std::size_t _headCache { 0 }; // Head cache for the producer
char _pad1[64 - sizeof(Buffer) - sizeof(std::size_t)];
std::atomic<size_t> _head { 0 }; // Head accessed by both producer and consumer
char _pad2[64 - sizeof(std::atomic<size_t>)];
Buffer _buffer2 {}; // Buffer cache for the consumer, equivalent to _buffer
std::size_t _tailCache { 0 }; // Tail cache for the consumer
char _pad3[64 - sizeof(Buffer) - sizeof(std::size_t)];
This time both push and pop oscillate between 40 and 55 ns.
I'm quite lost at this point because I don't know where I should look for answers. Until now, C++ memory layout has been very intuitive to me, but I realize I am still missing important knowledge to get better at high-frequency multithreading.
Minimal code sample
If you wish to compile the whole code to test it yourself, here are the few files needed:
SPSCQueue.hpp:
#pragma once
#include <atomic>
#include <cstdlib>
#include <cinttypes>
#include <new>        // placement new in push()
#include <utility>    // std::forward
#include <stdexcept>  // std::runtime_error thrown by the constructor
#define KF_ALIGN_CACHELINE alignas(kF::Core::Utils::CacheLineSize)
namespace kF::Core
{
template<typename Type>
class SPSCQueue;
namespace Utils
{
/** #brief Helper used to perfect forward move / copy constructor */
template<typename Type, bool ForceCopy = false>
void ForwardConstruct(Type *dest, Type *source) {
if constexpr (!ForceCopy && std::is_move_assignable_v<Type>)
new (dest) Type(std::move(*source));
else
new (dest) Type(*source);
}
/** #brief Helper used to perfect forward move / copy assignment */
template<typename Type, bool ForceCopy = false>
void ForwardAssign(Type *dest, Type *source) {
if constexpr (!ForceCopy && std::is_move_assignable_v<Type>)
*dest = std::move(*source);
else
*dest = *source;
}
/** #brief Theoretical cache-line size */
constexpr std::size_t CacheLineSize = 64ul;
}
}
/**
* #brief The SPSC queue is a lock-free queue that only supports a Single Producer and a Single Consumer
* The queue is really fast compared to other, more flexible implementations because the fact that only two threads can simultaneously read / write
* means that less synchronization is needed for each operation.
* The queue supports ranged push / pop to insert multiple elements without performance impact
*
* #tparam Type to be inserted
*/
template<typename Type>
class kF::Core::SPSCQueue
{
public:
/** #brief Buffer structure containing all cells */
struct Buffer
{
Type *data { nullptr };
std::size_t capacity { 0 };
};
/** #brief Local thread cache */
struct Cache
{
Buffer buffer {};
std::size_t value { 0 };
};
/** #brief Default constructor initialize the queue */
SPSCQueue(const std::size_t capacity);
/** #brief Destruct and release all memory (unsafe) */
~SPSCQueue(void) { clear(); std::free(_buffer.data); }
/** #brief Push a single element into the queue
* #return true if the element has been inserted */
template<typename ...Args>
[[nodiscard]] inline bool push(Args &&...args);
/** #brief Pop a single element from the queue
* #return true if an element has been extracted */
[[nodiscard]] inline bool pop(Type &value);
/** #brief Clear all elements of the queue (unsafe) */
void clear(void);
private:
KF_ALIGN_CACHELINE std::atomic<size_t> _tail { 0 }; // Tail accessed by both producer and consumer
struct {
Buffer _buffer {}; // Buffer cache for the producer, equivalent to _buffer2
std::size_t _headCache { 0 }; // Head cache for the producer
char _pad0[Utils::CacheLineSize - sizeof(Buffer) - sizeof(std::size_t)];
};
KF_ALIGN_CACHELINE std::atomic<size_t> _head { 0 }; // Head accessed by both producer and consumer
struct{
Buffer _buffer2 {}; // Buffer cache for the consumer, equivalent to _buffer
std::size_t _tailCache { 0 }; // Tail cache for the consumer
char _pad1[Utils::CacheLineSize - sizeof(Buffer) - sizeof(std::size_t)];
};
/** #brief Copy and move constructors disabled */
SPSCQueue(const SPSCQueue &other) = delete;
SPSCQueue(SPSCQueue &&other) = delete;
};
static_assert(sizeof(kF::Core::SPSCQueue<int>) == 4 * kF::Core::Utils::CacheLineSize);
template<typename Type>
kF::Core::SPSCQueue<Type>::SPSCQueue(const std::size_t capacity)
{
_buffer.capacity = capacity;
if (_buffer.data = reinterpret_cast<Type *>(std::malloc(sizeof(Type) * capacity)); !_buffer.data)
throw std::runtime_error("Core::SPSCQueue: Malloc failed");
_buffer2 = _buffer;
}
template<typename Type>
template<typename ...Args>
bool kF::Core::SPSCQueue<Type>::push(Args &&...args)
{
static_assert(std::is_constructible<Type, Args...>::value, "Type must be constructible from Args...");
const auto tail = _tail.load(std::memory_order_relaxed);
auto next = tail + 1;
if (next == _buffer.capacity) [[unlikely]]
next = 0;
if (auto head = _headCache; next == head) [[unlikely]] {
head = _headCache = _head.load(std::memory_order_acquire);
if (next == head) [[unlikely]]
return false;
}
new (_buffer.data + tail) Type{ std::forward<Args>(args)... };
_tail.store(next, std::memory_order_release);
return true;
}
template<typename Type>
bool kF::Core::SPSCQueue<Type>::pop(Type &value)
{
const auto head = _head.load(std::memory_order_relaxed);
if (auto tail = _tailCache; head == tail) [[unlikely]] {
tail = _tailCache = _tail.load(std::memory_order_acquire);
if (head == tail) [[unlikely]]
return false;
}
auto *elem = reinterpret_cast<Type *>(_buffer2.data + head);
auto next = head + 1;
if (next == _buffer2.capacity) [[unlikely]]
next = 0;
value = std::move(*elem);
elem->~Type();
_head.store(next, std::memory_order_release);
return true;
}
template<typename Type>
void kF::Core::SPSCQueue<Type>::clear(void)
{
for (Type type; pop(type););
}
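For completeness, a minimal usage sketch of the queue above (my own, outside the benchmark):
#include <cassert>

int main() {
    kF::Core::SPSCQueue<int> queue(8);
    bool pushed = queue.push(42);   // producer side
    int out = 0;
    bool popped = queue.pop(out);   // consumer side
    assert(pushed && popped && out == 42);
}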
The benchmark, using google benchmark.
bench_SPSCQueue.cpp:
#include <thread>
#include <benchmark/benchmark.h>
#include "SPSCQueue.hpp"
using namespace kF;
using Queue = Core::SPSCQueue<std::size_t>;
constexpr std::size_t Capacity = 4096;
static void SPSCQueue_NoisyPush(benchmark::State &state)
{
Queue queue(Capacity);
std::atomic<bool> running = true;
std::size_t i = 0ul;
std::thread thd([&queue, &running] { for (std::size_t tmp; running; benchmark::DoNotOptimize(queue.pop(tmp))); });
for (auto _ : state) {
decltype(std::chrono::high_resolution_clock::now()) start;
do {
start = std::chrono::high_resolution_clock::now();
} while (!queue.push(42ul));
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
auto iterationTime = elapsed.count();
state.SetIterationTime(iterationTime);
}
running = false;
if (thd.joinable())
thd.join();
}
BENCHMARK(SPSCQueue_NoisyPush)->UseManualTime();
static void SPSCQueue_NoisyPop(benchmark::State &state)
{
Queue queue(Capacity);
std::atomic<bool> running = true;
std::size_t i = 0ul;
std::thread thd([&queue, &running] { while (running) benchmark::DoNotOptimize(queue.push(42ul)); });
for (auto _ : state) {
std::size_t tmp;
decltype(std::chrono::high_resolution_clock::now()) start;
do {
start = std::chrono::high_resolution_clock::now();
} while (!queue.pop(tmp));
auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::duration<double>>(end - start);
auto iterationTime = elapsed.count();
state.SetIterationTime(iterationTime);
}
running = false;
if (thd.joinable())
thd.join();
}
BENCHMARK(SPSCQueue_NoisyPop)->UseManualTime();
Thanks to your useful comments (and mostly, thanks to Peter Cordes), it seems that the issue was coming from the L2 data prefetcher.
Because of my SPSC queue design, each thread must access two consecutive cachelines to push / pop the queue.
If the structure itself is not aligned to 128 bytes, its address will not sit on a 128-byte boundary, so the two cache lines each thread uses are not guaranteed to land in the same aligned 128-byte pair, and the adjacent-line prefetcher can then drag in a line the other thread is writing.
Thus, the simple fix is:
template<typename Type>
class alignas(128) SPSCQueue { ... };
Here (section 2.5.5.4, Data Prefetching) is an interesting document from Intel explaining optimizations on their architectures and how prefetching is done at the various cache levels.
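For reference, a way to express the same intent without hard-coding 128 is the C++17 interference-size constant, when the toolchain provides it (kQueueAlign is my own name; on typical x86-64 the reported size is 64, so doubling it approximates the 128-byte prefetch pair discussed above):
#include <cstddef>
#include <new>   // std::hardware_destructive_interference_size (not available on every toolchain)

#ifdef __cpp_lib_hardware_interference_size
inline constexpr std::size_t kQueueAlign = 2 * std::hardware_destructive_interference_size;
#else
inline constexpr std::size_t kQueueAlign = 128;
#endif

template<typename Type>
class alignas(kQueueAlign) SPSCQueue { /* ... */ };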
I am learning the OpenMP parallel processing library in C++. I felt that I had grasped the basic concepts and tried to test my knowledge by implementing a linked-list queue. I wanted to consume the queue from multiple threads.
The challenge is not to consume the same node twice. So I was considering sharing the queue between threads but allowing only a single thread at a time to update it (advance to the next node in the queue). For this purpose I could use critical or a lock. However, even without them it somehow seems to work perfectly; no race condition has occurred.
#include <iostream>
#include <omp.h>
#include <unistd.h> // for sleep()
struct Node {
int data;
struct Node* next = NULL;
Node() {}
Node(int data) {
this->data = data;
}
Node(int data, Node* node) {
this->data = data;
this->next = node;
}
};
void processNode(Node *pNode);
struct Queue {
Node *head = NULL, *tail = NULL;
Queue& add(int data) {
add(new Node(data));
return *this;
}
void add(Node *node) {
if (head == NULL) {
head = node;
tail = node;
} else {
tail->next = node;
tail = node;
}
}
Node* remove() {
Node *node;
node = head;
if (head != NULL)
head = head->next;
return node;
}
};
int main() {
srand(12);
Queue queue;
for (int i = 0; i < 6; ++i) {
queue.add(i);
}
double timer_started = omp_get_wtime();
omp_set_num_threads(3);
#pragma omp parallel
{
Node *n;
while ((n = queue.remove()) != NULL) {
double started = omp_get_wtime();
processNode(n);
double elapsed = omp_get_wtime() - started;
printf("Thread id: %d data: %d, took: %f \n", omp_get_thread_num(), n->data, elapsed);
}
}
double elapsed = omp_get_wtime() - timer_started;
std::cout << "end. took " << elapsed << " in total " << std::endl;
return 0;
}
void processNode(Node *node) {
int r = rand() % 3 + 1; // between 1 and 3
sleep(r);
}
Output looks like this:
Thread id: 0 data: 0, took: 1.000136
Thread id: 2 data: 2, took: 1.000127
Thread id: 2 data: 4, took: 1.000208
Thread id: 1 data: 1, took: 3.001371
Thread id: 0 data: 3, took: 2.001041
Thread id: 2 data: 5, took: 2.004960
end. took 4.00583 in total
I've run this with different numbers of threads and many times, but I couldn't trigger a race condition or anything wrong. I thought it was possible for two different threads to invoke 'remove' and process a single node twice, but that did not happen. Why?
https://github.com/muatik/openmp-examples/blob/master/linkedlist/main.cpp
First and foremost, you can never prove multi-threaded code to be correct through testing. Your hunch that you need a lock / critical section is correct.
Your test is particularly easy on the queue. The following breaks your queue quickly:
for (int i = 0; i < 10000; ++i) {
queue.add(i);
}
double timer_started = omp_get_wtime();
#pragma omp parallel
{
size_t counter = 0;
Node *n;
while ((n = queue.remove()) != NULL) {
processNode(n);
counter++;
}
#pragma omp critical
std::cout << "Thread " << omp_get_thread_num() << " processed " << counter << " nodes." << std::endl;
}
void processNode(Node *node) {}
It shows, for example, the following interesting result:
Thread 1 processed 11133 nodes.
Thread 0 processed 9039 nodes.
But again: even if your queue ran this test code correctly a million times, that would not mean the queue is implemented correctly.
In particular, it is not sufficient to just protect remove; you must properly protect each and every read and write of the queue data. To get an idea of how difficult it is to get this right, watch this excellent talk by Herb Sutter.
Generally, I recommend using an existing parallel data structure, for example from Boost.Lockfree.
However, unfortunately OpenMP and C++11 lock / atomic primitives don't officially play well together. So strictly speaking, if you use OpenMP, you should stick to OpenMP synchronization primitives or libraries that use them.
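For illustration, a minimal sketch of what guarding the existing remove() with an OpenMP critical section could look like (the section name is mine; as said above, every other read and write of head and tail needs the same protection):
Node* remove() {
    Node *node = NULL;
    // Named critical section; add() and any other access to head/tail must use
    // the same critical section, otherwise the queue is still racy.
    #pragma omp critical(queue_access)
    {
        node = head;
        if (head != NULL)
            head = head->next;
    }
    return node;
}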
Thanks to Michael's and Adam's comments I made some changes here.
Suppose I build a special linked queue with max length N.
class QueueItem
{
public:
int id; //object identity
double data; //object priority
};
class MyQueue
{
private:
list<QueueItem> rec;
std::mutex mymutex;
public:
MyQueue(){};
~MyQueue(){};
int insert(int id, double d)
{
std::lock_guard<std::mutex> guard(mymutex);
list<QueueItem>::iterator iter=rec.begin();
QueueItem temp;
temp.id = id;
temp.data = d;
int count=0;
for (iter;iter!=rec.end();iter++)
{
if (iter->data<d)
{
rec.insert(iter,temp);
return count;
}
count++;
}
rec.push_back(temp);
return count;
}
void pop(void)
{
std::lock_guard<std::mutex> guard(mymutex);
rec.pop_back();
}
int getLength(void)
{
return rec.size();
}
};
The rule is: the queue always holds the highest-priority items seen so far, and they are kept sorted. For example, for N=4 the priorities in the queue are (100,99,98,97). When priority 101 is enqueued, the queue becomes (101,100,99,98). If 1 is enqueued, the queue doesn't change.
Now, if I use <thread> in C++11 to compute the priority for different objects in different threads and then enqueue them, how can I avoid a race condition?
My code is like:
....
int N = 4; //queue length
int pnum=4;//thread number
MyQueue* queue = new MyQueue();
vector<thread> threads;
std::mutex mymux;
int totalItem = 16;
int itemIds[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
double itemPriority[] = {9,10,3,11,2,12,5,6,7,8,1,15,14,4,13,16};
int itemperthread = totalItem/pnum;
for (int p=0; p<pnum;p++)
{
threads.push_back(thread([&](int p, MyQueue* queue){
for (int i=p*itemperthread; i<(p+1)*itemperthread; i++)
{
int id = itemIds[i];
double priority = itemPriority[i];
mymux.lock();
queue->insert(id, priority); //enqueue
if (queue->getLength()>N)
queue->pop();
mymux.unlock();
}
}, p, queue));
}
for (auto& thread : threads){
thread.join();
}
I tried using a mutex to lock the queue (and even defining a mutex in the Queue class and using it in the Queue::insert method). But every time I run this, the resulting queue is different. I am pretty sure the computation of the priorities is correct.
The expected queue after running this part of the program is [16,15,14,13], but the result can be different every time I run it.
Any idea to avoid this issue? Thanks.
Short version:
I'm trying to replace std::atomic from C++11 used in a lock-free, single producer, single consumer queue implementation from here. How do I replace this with boost::atomic?
Long version:
I'm trying to get better performance out of our app with worker threads. Each thread has its own task queue. We have to synchronize with a lock before dequeuing/enqueuing each task.
Then I found Herb Sutter's article on a lock-free queue. It seems like an ideal replacement, but the code uses std::atomic from C++11, which I couldn't introduce to the project at this time.
More googling led to some examples, such as this one for Linux (echelon's), and this one for Windows (TINESWARE's). Both use platform-specific constructs such as WinAPI's InterlockedExchangePointer and GCC's __sync_lock_test_and_set.
I only need to support Windows and Linux, so maybe I can get away with some #ifdefs. But I thought it might be nicer to use what boost::atomic provides. Boost.Atomic is not part of the official Boost library yet, so I downloaded the source from http://www.chaoticmind.net/~hcb/projects/boost.atomic/ and use the include files with my project.
This is what I get so far:
#pragma once
#include <boost/atomic.hpp>
template <typename T>
class LockFreeQueue
{
private:
struct Node
{
Node(T val) : value(val), next(NULL) { }
T value;
Node* next;
};
Node* first; // for producer only
boost::atomic<Node*> divider; // shared
boost::atomic<Node*> last; // shared
public:
LockFreeQueue()
{
first = new Node(T());
divider = first;
last= first;
}
~LockFreeQueue()
{
while(first != NULL) // release the list
{
Node* tmp = first;
first = tmp->next;
delete tmp;
}
}
void Produce(const T& t)
{
last.load()->next = new Node(t); // add the new item
last = last.load()->next;
while(first != divider) // trim unused nodes
{
Node* tmp = first;
first = first->next;
delete tmp;
}
}
bool Consume(T& result)
{
if(divider != last) // if queue is nonempty
{
result = divider.load()->next->value; // C: copy it back
divider = divider.load()->next;
return true; // and report success
}
return false; // else report empty
}
};
Some modifications to note:
boost::atomic<Node*> divider; // shared
boost::atomic<Node*> last; // shared
and
last.load()->next = new Node(t); // add the new item
last = last.load()->next;
and
result = divider.load()->next->value; // C: copy it back
divider = divider.load()->next;
Am I applying load() (and the implicit store()) from boost::atomic correctly here? Can we say this is equivalent to Sutter's original C++11 lock-free queue?
PS. I studied many of the threads on SO, but none seems to provide an example for boost::atomic & lock-free queue.
Have you tried Intel Threading Building Blocks' atomic<T>? It's cross-platform and free.
Also...
Single producer/single consumer makes your problem much easier because your linearization point can be a single operation. It becomes easier still if you are prepared to accept a bounded queue.
A bounded queue offers advantages for cache performance because you can reserve a cache-aligned memory block to maximize your hits, e.g.:
#include <vector>
#include "tbb/atomic.h"
#include "tbb/cache_aligned_allocator.h"
template< typename T >
class SingleProducerSingleConsumerBoundedQueue {
typedef std::vector<T, tbb::cache_aligned_allocator<T> > queue_type;
public:
SingleProducerSingleConsumerBoundedQueue(int capacity):
queue(queue_type()) {
head = 0;
tail = 0;
queue.resize(capacity); // construct the slots up front; operator[] below relies on them existing
}
size_t capacity() {
return queue.capacity();
}
bool try_pop(T& result) {
if(tail - head == 0)
return false;
else {
result = queue[head % queue.capacity()];
head.fetch_and_increment(); //linearization point
return(true);
}
}
bool try_push(const T& source) {
if(tail - head == queue.capacity())
return(false);
else {
queue[tail % queue.capacity()] = source;
tail.fetch_and_increment(); //linearization point
return(true);
}
}
~SingleProducerSingleConsumerBoundedQueue() {}
private:
queue_type queue;
tbb::atomic<int> head;
tbb::atomic<int> tail;
};
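Usage would then look something like this (a small sketch against the class above; capacity and values are arbitrary):
SingleProducerSingleConsumerBoundedQueue<int> q(1024);
int value;
if (q.try_push(42)) { /* enqueued on the producer thread */ }
if (q.try_pop(value)) { /* value == 42 on the consumer thread */ }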
Check out this boost.atomic ringbuffer example from the documentation:
#include <boost/atomic.hpp>
template <typename T, size_t Size>
class ringbuffer
{
public:
ringbuffer() : head_(0), tail_(0) {}
bool push(const T & value)
{
size_t head = head_.load(boost::memory_order_relaxed);
size_t next_head = next(head);
if (next_head == tail_.load(boost::memory_order_acquire))
return false;
ring_[head] = value;
head_.store(next_head, boost::memory_order_release);
return true;
}
bool pop(T & value)
{
size_t tail = tail_.load(boost::memory_order_relaxed);
if (tail == head_.load(boost::memory_order_acquire))
return false;
value = ring_[tail];
tail_.store(next(tail), boost::memory_order_release);
return true;
}
private:
size_t next(size_t current)
{
return (current + 1) % Size;
}
T ring_[Size];
boost::atomic<size_t> head_, tail_;
};
// How to use
int main()
{
ringbuffer<int, 32> r;
// try to insert an element
if (r.push(42)) { /* succeeded */ }
else { /* buffer full */ }
// try to retrieve an element
int value;
if (r.pop(value)) { /* succeeded */ }
else { /* buffer empty */ }
}
The code's only limitation is that the buffer length has to be known at compile time (or at construction time, if you replace the array by a std::vector<T>). To allow the buffer to grow and shrink is not trivial, as far as I understand.
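For illustration, here is a minimal sketch (my own, not from the Boost documentation) of the same ring buffer with the capacity chosen at construction time, replacing the array with a std::vector<T>; growing or shrinking it afterwards is still unsupported:
#include <vector>
#include <boost/atomic.hpp>

template <typename T>
class dynamic_ringbuffer
{
public:
    explicit dynamic_ringbuffer(size_t size) : ring_(size), head_(0), tail_(0) {}

    bool push(const T &value)
    {
        size_t head = head_.load(boost::memory_order_relaxed);
        size_t next_head = next(head);
        if (next_head == tail_.load(boost::memory_order_acquire))
            return false;                       // full
        ring_[head] = value;
        head_.store(next_head, boost::memory_order_release);
        return true;
    }

    bool pop(T &value)
    {
        size_t tail = tail_.load(boost::memory_order_relaxed);
        if (tail == head_.load(boost::memory_order_acquire))
            return false;                       // empty
        value = ring_[tail];
        tail_.store(next(tail), boost::memory_order_release);
        return true;
    }

private:
    size_t next(size_t current) const { return (current + 1) % ring_.size(); }

    std::vector<T> ring_;
    boost::atomic<size_t> head_, tail_;
};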