How to share pthread synchronisation primitives between C++ and Rust?

I have a C++ program and a Rust program, and I have successfully got them talking to each other over POSIX shared memory (C++ and Rust).
What I am now trying to do is synchronise them. I have already managed to create a working, but inefficient, primitive system using an atomic bool (creating the AtomicBool on the Rust side like this).
However, I would really like to use a mutex/condvar to synchronise between the threads, and this is where I am stuck.
I seem to be able to initialise the C++ side of it, following this example pretty much word for word.
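For reference, the C++ initialisation being followed is roughly the standard process-shared pattern (a sketch only; shmp is the pointer into the mapped shared memory, as in the test snippet further down):
pthread_mutexattr_t mattr;
pthread_condattr_t cattr;
pthread_mutexattr_init(&mattr);
pthread_condattr_init(&cattr);
pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
pthread_condattr_setpshared(&cattr, PTHREAD_PROCESS_SHARED);
pthread_mutex_init(&shmp->mutex, &mattr);      // the primitives live inside the shared block itself
pthread_cond_init(&shmp->condition, &cattr);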
I have attempted to translate it directly into Rust:
let raw_shm = shm.get_shm();
let mut mtx_attrs = MaybeUninit::<nix::libc::pthread_mutexattr_t>::uninit();
if unsafe { nix::libc::pthread_mutexattr_init(mtx_attrs.as_mut_ptr()) } != 0 {
panic!("failed to create mtx_attrs");
};
let mtx_attrs = unsafe { mtx_attrs.assume_init() };
let mut cond_attrs = MaybeUninit::<nix::libc::pthread_condattr_t>::uninit();
if unsafe { nix::libc::pthread_condattr_init(cond_attrs.as_mut_ptr()) } != 0 {
panic!("failed to create cond_attrs");
};
let cond_attrs = unsafe { cond_attrs.assume_init() };
if unsafe {
nix::libc::pthread_mutexattr_setpshared(
&mtx_attrs as *const _ as *mut _,
PTHREAD_PROCESS_SHARED,
)
} != 0
{
panic!("failed to set mtx as process shared");
};
if unsafe {
nix::libc::pthread_condattr_setpshared(
&cond_attrs as *const _ as *mut _,
PTHREAD_PROCESS_SHARED,
)
} != 0
{
panic!("failed to set cond as process shared");
};
// I know that these offsets are correct, having used `offsetof` on the C++ side
let mtx_start = unsafe { &raw_shm.as_slice()[3110416] };
let mtx = unsafe { &*(mtx_start as *const _ as *const pthread_mutex_t) };
let cond_start = unsafe { &raw_shm.as_slice()[3110440] };
let cond = unsafe { &*(cond_start as *const _ as *const pthread_cond_t) };
if unsafe {
nix::libc::pthread_mutex_init(&mtx as *const _ as *mut _, &mtx_attrs as *const _ as *mut _)
} != 0
{
panic!("failed to init mtx");
};
if unsafe {
nix::libc::pthread_cond_init(
&cond as *const _ as *mut _,
&cond_attrs as *const _ as *mut _,
)
} != 0
{
panic!("failed to init cond");
};
All of that passes with return values of 0... so far so good.
I can now test it in one of two ways:
I can set the trivial C++ program going and have it stop waiting at the condvar:
if (pthread_mutex_lock(&shmp->mutex) != 0)
throw("Error locking mutex");
if (pthread_cond_wait(&shmp->condition, &shmp->mutex) != 0)
throw("Error waiting for condition variable");
and in Rust:
let sig = unsafe { nix::libc::pthread_cond_signal(&cond as *const _ as *mut _) };
dbg!(sig);
Despite returning 0 (i.e. success), my C++ program is not released past the condvar; it remains waiting as if it never received a signal.
I can set off another trivial C++ program which endlessly signals the condition variable in a loop:
for (unsigned int count = 0;; count++) {
if (pthread_cond_signal(condition) != 0)
throw("Error")
// sleep for a bit
}
and then in Rust, something like:
loop {
if unsafe { nix::libc::pthread_mutex_lock(&mtx as *const _ as *mut _) } > 0 {
panic!("Failed to acquire lock")
};
if unsafe {
nix::libc::pthread_cond_wait(&cond as *const _ as *mut _, &mtx as *const _ as *mut _)
} > 0
{
panic!("Failed to acquire lock")
};
}
Doing it this way around, the call to lock the mutex is successful, but I get an EINVAL on pthread_cond_wait defined here, which I cannot seem to rectify...
I feel like I'm close... any thoughts on how to get this to work? (this is mostly just a proof of concept).

For posterity, I have managed to get this working.
To clarify how the program is structured: there are two binaries, one C++ and one Rust. The Rust program spawns the C++ program using std::process::Command.
Error handling and imports are elided for brevity.
The Rust program starts and creates a new shared memory block (removing an existing block if one exists, to ensure the program always starts in a fresh state). I use the shared_memory crate to handle the details for me; it also provides useful helpers such as access to a raw pointer to the start of the memory block.
The shared memory block is structured like the following:
#[repr(C)]
struct SharedMemoryLayout {
ready: std::sync::atomic::AtomicBool,
mutex: libc::pthread_mutex_t,
condition: libc::pthread_cond_t,
}
Shared memory blocks are initialised with zeros, so ready will always be false to begin with.
The Rust program spawns the C++ program with std::process::Command::spawn and then waits in a loop until ready is true.
let proc = Command::new("/path/to/c++/binary").spawn().unwrap();
let ptr: *mut u8 = // pointer to first byte of shared memory block;
let ready: &AtomicBool = unsafe { &*(ptr as *mut bool as *const AtomicBool) };
loop {
if ready.load(Ordering::SeqCst) {
break
} else {
thread::sleep(Duration::from_secs(1));
}
}
The C++ program opens the shared memory block and mmaps it into its local address space.
struct SharedMemoryLayout
{
std::atomic_bool ready;
pthread_mutex_t mutex;
pthread_cond_t condition;
};
int fd = shm_open("name_of_shared_memory_block", O_RDWR, S_IRUSR | S_IWUSR);
struct SharedMemoryLayout *sync = (SharedMemoryLayout *)mmap(NULL, sizeof(*sync), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
The C++ program carries on and proceeds to initialise the mutex and the condition, before marking the memory block as ready.
pthread_mutexattr_t mutex_attributes;
pthread_condattr_t condition_attributes;
pthread_mutexattr_init(&mutex_attributes);
pthread_condattr_init(&condition_attributes);
pthread_mutexattr_setpshared(&mutex_attributes, PTHREAD_PROCESS_SHARED);
pthread_condattr_setpshared(&condition_attributes, PTHREAD_PROCESS_SHARED);
pthread_mutex_init(&sync->mutex, &mutex_attributes);
pthread_cond_init(&sync->condition, &condition_attributes);
pthread_mutexattr_destroy(&mutex_attributes);
pthread_condattr_destroy(&condition_attributes);
std::atomic_bool *ready = &sync->ready;
ready->store(true);
And then enter a loop signalling on the condition:
for (unsigned int count = 0;; count++) {
// do something
sleep(1);
pthread_cond_signal(&sync->condition);
}
Now the Rust program will have been released from the wait loop in step 2). Materialise the mutex and condition that were initialised in step 4).
let mutex = unsafe {ptr.offset(4) as *mut pthread_mutex_t};
let condition = unsafe {ptr.offset(32) as *mut pthread_cond_t};
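Rather than hard-coding 4 and 32, one option (a sketch, not part of the original answer) is to have the C++ side print the offsets it actually uses with offsetof, exactly as the question suggests, and check the Rust constants against that output:
#include <cstdio>
#include <cstddef>
// Assumes the SharedMemoryLayout struct defined in the C++ program above (and its includes) is in scope.
int main()
{
    std::printf("mutex offset:     %zu\n", offsetof(SharedMemoryLayout, mutex));
    std::printf("condition offset: %zu\n", offsetof(SharedMemoryLayout, condition));
    return 0;
}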
And now we can wait on the condition, getting notified by the C++ program.
loop {
unsafe {
pthread_mutex_lock(mutex);
pthread_cond_wait(condition, mutex);
pthread_mutex_unlock(mutex);
// Do something
}
}

Related

Wait until a variable becomes zero

I'm writing a multithreaded program that can execute some tasks in separate threads.
Some operations must be waited for at the end of my program's execution. I've written a simple guard for such "important" operations:
class CPendingOperationGuard final
{
public:
CPendingOperationGuard()
{
InterlockedIncrementAcquire( &m_ullCounter );
}
~CPendingOperationGuard()
{
InterlockedDecrementAcquire( &m_ullCounter );
}
static bool WaitForAll( DWORD dwTimeOut )
{
// Here is a topic of my question
// Return false on timeout
// Return true if wait was successful
}
private:
static volatile ULONGLONG m_ullCounter;
};
Usage is simple:
void ImportantTask()
{
CPendingOperationGuard guard;
// Do work
}
// ...
void StopExecution()
{
if(!CPendingOperationGuard::WaitForAll( 30000 )) {
// Handle error
}
}
The question is: how to efficiently wait until m_ullCounter becomes zero or until the timeout expires.
I have two ideas:
The first is to launch this function in a separate thread and then call WaitForSingleObject( hThread, dwTimeout ):
DWORD WINAPI WaitWorker( LPVOID )
{
while(InterlockedCompareExchangeRelease( &m_ullCounter, 0, 0 ))
;
}
But it will "eat" almost 100% of CPU time - bad idea.
The second idea is to allow other threads to run:
DWORD WINAPI WaitWorker( LPVOID )
{
while(InterlockedCompareExchangeRelease( &m_ullCounter, 0, 0 ))
Sleep( 0 );
}
But it'll switch the execution context into kernel mode and back - too expensive for my task. Bad idea too.
The question is:
How can I perform almost-zero-overhead waiting until my variable becomes zero? Preferably without a separate thread... The main requirement is that the wait can be stopped by a timeout.
Maybe someone can suggest an entirely different approach for my task - waiting for all registered operations (like WinAPI's thread pools: that API has, for instance, WaitForThreadpoolWaitCallbacks to wait for ALL registered tasks).
PS: it is not possible to rewrite my code with the ThreadPool API :(
Have a look at the WaitOnAddress() and WakeByAddressSingle()/WakeByAddressAll() functions introduced in Windows 8.
For example:
class CPendingOperationGuard final
{
public:
CPendingOperationGuard()
{
InterlockedIncrementAcquire(&m_ullCounter);
WakeByAddressAll(&m_ullCounter);
}
~CPendingOperationGuard()
{
InterlockedDecrementAcquire(&m_ullCounter);
WakeByAddressAll(&m_ullCounter);
}
static bool WaitForAll( DWORD dwTimeOut )
{
ULONGLONG Captured, Now, Deadline = GetTickCount64() + dwTimeOut;
DWORD TimeRemaining;
do
{
Captured = InterlockedExchangeAdd64((LONG64 volatile *)&m_ullCounter, 0);
if (Captured == 0) return true;
Now = GetTickCount64();
if (Now >= Deadline) return false;
TimeRemaining = static_cast<DWORD>(Deadline - Now);
}
while (WaitOnAddress(&m_ullCounter, &Captured, sizeof(ULONGLONG), TimeRemaining));
return false;
}
private:
static volatile ULONGLONG m_ullCounter;
};
Raymond Chen wrote a series of blog articles about these functions:
WaitOnAddress lets you create a synchronization object out of any data variable, even a byte
Implementing a critical section in terms of WaitOnAddress
Spurious wakes, race conditions, and bogus FIFO claims: A peek behind the curtain of WaitOnAddress
Extending our critical section based on WaitOnAddress to support timeouts
Comparing WaitOnAddress with futexes (futexi? futexen?)
Creating a semaphore from WaitOnAddress
Creating a semaphore with a maximum count from WaitOnAddress
Creating a manual-reset event from WaitOnAddress
Creating an automatic-reset event from WaitOnAddress
A helper template function to wait for WaitOnAddress in a loop
You need something like Run-Down Protection for this task instead of CPendingOperationGuard.
Before beginning an operation, you call ExAcquireRundownProtection and begin the operation only if it returns TRUE. At the end you must call ExReleaseRundownProtection.
So the pattern looks like this:
if (ExAcquireRundownProtection(&RunRef)) {
do_operation();
ExReleaseRundownProtection(&RunRef);
}
When you want to stop this process and wait for all active do_operation() calls to finish, you call ExWaitForRundownProtectionRelease (instead of WaitWorker).
After ExWaitForRundownProtectionRelease is called, ExAcquireRundownProtection returns FALSE (so no new operations can start). ExWaitForRundownProtectionRelease does not return until every caller of ExAcquireRundownProtection has called ExReleaseRundownProtection to release its previously acquired run-down protection - in other words, until all currently running operations (if any) have completed. When all outstanding accesses are completed, ExWaitForRundownProtectionRelease returns.
Unfortunately this API is implemented by the system only in kernel mode, and there is no user-mode analogue. However, it is not hard to implement the idea yourself.
Here is my example:
enum RundownState {
v_complete = 0, v_init = 0x80000000
};
template<typename T>
class RundownProtection
{
LONG _Value;
public:
_NODISCARD BOOL IsRundownBegin()
{
return 0 <= _Value;
}
_NODISCARD BOOL AcquireRP()
{
LONG Value, NewValue;
if (0 > (Value = _Value))
{
do
{
NewValue = InterlockedCompareExchangeNoFence(&_Value, Value + 1, Value);
if (NewValue == Value) return TRUE;
} while (0 > (Value = NewValue));
}
return FALSE;
}
void ReleaseRP()
{
if (InterlockedDecrement(&_Value) == v_complete)
{
static_cast<T*>(this)->RundownCompleted();
}
}
void Rundown_l()
{
InterlockedBitTestAndResetNoFence(&_Value, 31);
}
void Rundown()
{
if (AcquireRP())
{
Rundown_l();
ReleaseRP();
}
}
RundownProtection(RundownState Value = v_init) : _Value(Value)
{
}
void Init()
{
_Value = v_init;
}
};
///////////////////////////////////////////////////////////////
class OperationGuard : public RundownProtection<OperationGuard>
{
friend RundownProtection<OperationGuard>;
HANDLE _hEvent;
void RundownCompleted()
{
SetEvent(_hEvent);
}
public:
OperationGuard() : _hEvent(0) {}
~OperationGuard()
{
if (_hEvent)
{
CloseHandle(_hEvent);
}
}
ULONG WaitComplete(ULONG dwMilliseconds = INFINITE)
{
return WaitForSingleObject(_hEvent, dwMilliseconds);
}
ULONG Init()
{
return (_hEvent = CreateEvent(0, 0, 0, 0)) ? NOERROR : GetLastError();
}
} g_guard;
//////////////////////////////////////////////
ULONG CALLBACK PendingOperationThread(void*)
{
while (g_guard.AcquireRP())
{
Sleep(1000);// do operation
g_guard.ReleaseRP();
}
return 0;
}
void demo()
{
if (g_guard.Init() == NOERROR)
{
if (HANDLE hThread = CreateThread(0, 0, PendingOperationThread, 0, 0, 0))
{
CloseHandle(hThread);
}
MessageBoxW(0, 0, L"UI Thread", MB_ICONINFORMATION|MB_OK);
g_guard.Rundown();
g_guard.WaitComplete();
}
}
Why is simply waiting until m_ullCounter becomes zero not enough?
If we read 0 from m_ullCounter, that only means there is no active operation at this moment; a pending operation can still begin right after we have checked that m_ullCounter == 0. We could use a special flag (say bool g_bQuit) and set it, so that an operation checks this flag before it begins and does not start if it is true - but even that is not enough.
Naive code:
//worker thread
if (!g_bQuit) // (1)
{
// MessageBoxW(0, 0, L"simulate delay", MB_ICONWARNING);
InterlockedIncrement(&g_ullCounter); // (4)
// do operation
InterlockedDecrement(&g_ullCounter); // (5)
}
// here we wait for all operation done
g_bQuit = true; // (2)
// wait on g_ullCounter == 0, how - not important
while (g_ullCounter) continue; // (3)
A pending operation checks the g_bQuit flag (1) - it is still false, so the operation begins.
The worker thread is then swapped out (the MessageBox simulates this delay).
We set g_bQuit = true (2).
We check/wait for g_ullCounter == 0; it is 0, so we exit (3).
The worker thread wakes up (returns from MessageBox) and increments g_ullCounter (4).
The problem is that the operation may use resources which we have already started destroying after seeing g_ullCounter == 0.
This happens because checking the quit flag (g_bQuit) and incrementing the counter afterwards is not atomic - there can be a gap between them.
For a correct solution we need atomic access to the flag and the counter together, and that is exactly what run-down protection does. A single 32-bit LONG variable is used for flag+counter because we can access it atomically: 31 bits hold the counter and 1 bit holds the quit flag. The Windows solution uses bit 0 for the flag (1 means quit) and bits [1..31] for the counter; I use bits [0..30] for the counter and bit 31 for the flag (0 means quit).
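As a minimal illustration of that packing (hypothetical names, simplified from the RundownProtection class above): the flag check and the counter increment are committed in a single interlocked compare-exchange, so there is no gap for an operation to slip through.
#include <windows.h>
// Bit 31 = "still accepting operations" flag, bits 0..30 = active-operation count.
static const LONG RUNDOWN_ACTIVE = (LONG)0x80000000;
static volatile LONG g_state = RUNDOWN_ACTIVE;    // flag set, count 0
BOOL TryBeginOperation()
{
    for (;;)
    {
        LONG oldValue = g_state;
        if (!(oldValue & RUNDOWN_ACTIVE))
            return FALSE;             // rundown has started - refuse the operation
        // Flag test and counter increment happen atomically in one step.
        if (InterlockedCompareExchange(&g_state, oldValue + 1, oldValue) == oldValue)
            return TRUE;
    }
}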

Implementing/replacement of volatile std::string

As already noted here, using volatile std::string isn't a good idea.
I'm developing an application on FreeRTOS and I need a string that is moved between tasks. There is one task which receives commands through UART and can be asked by other tasks for the response to a specified command. I want to use std::move on the string to keep the application efficient.
Is there a neat and fast replacement for volatile std::string, or do I have to implement a class with volatile fields on my own? Maybe this approach is bad altogether and I should use another structure to hand the command responses around?
EDIT: Here's some code.
I get the single bytes of commands through interrupt. The commands are human readable commands terminated with \r.
void rx_interrupt(char c)
{
if(c == '\r')
{
c = '\0';
BaseType_t higher_prior_task_woken = pdFALSE;
vTaskNotifyGiveFromISR(rx_task_handle, &higher_prior_task_woken);
portYIELD_FROM_ISR(higher_prior_task_woken);
}
rx_buf.push_byte(c);
}
rx_buf is a circular buffer which allows popping whole commands as std::string.
Then the rx_task:
for (;;)
{
auto notif_num = ulTaskNotifyTake(pdTRUE, portMAX_DELAY);
while (notif_num--)
{
auto comm = rx_buf.pop_command();
if (comm.length() == 0)
continue;
if (is_unsolicited_command(comm))
handle_unsolicited_command(std::move(comm));
if (is_awaited_command(comm))
handle_awaited_command(std::move(comm));
}
}
The rx_task is needed because I must first check whether an asynchronous event has occurred, which is indicated by an unsolicited command.
The received (awaited) commands may be long, so I want to move them.
void handle_awaited_command(std::string &&cmd)
{
os_lockguard guard(var_mux);
if (!awaiting)
return;
awaited_command = std::move(cmd); // Problematic line
xSemaphoreGive(cmd_received_sem);
awaited_cmd_handled = true;
}
Finally any of the other task may await a command:
std::string get_command()
{
os_lockguard guard_glob(mux);
{
os_lockguard guard(var_mux);
awaiting = true;
}
xSemaphoreTake(cmd_received_sem, timeout);
{
os_lockguard guard(var_mux);
if(awaited_cmd_handled)
return std::move(awaited_command); // Problematic line
else
return std::string("");
}
}
The thing is that the definitions look like this:
volatile bool awaiting;
volatile bool awaited_cmd_handled;
volatile std::string awaited_command;
So I have here a volatile std::string.
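For what it's worth, the usual direction here is to drop volatile entirely and let the lock provide both atomicity and visibility. A minimal sketch, assuming the os_lockguard/var_mux in the code above give real mutual exclusion (std::mutex stands in below so the snippet is self-contained; on FreeRTOS it would be the existing RTOS mutex wrapper):
#include <mutex>
#include <string>
static std::mutex var_mux;                 // stand-in for the question's var_mux
static bool awaiting = false;              // plain, non-volatile shared state
static bool awaited_cmd_handled = false;
static std::string awaited_command;
void handle_awaited_command(std::string &&cmd)
{
    std::lock_guard<std::mutex> guard(var_mux);   // lock/unlock also orders memory
    if (!awaiting)
        return;
    awaited_command = std::move(cmd);             // a plain std::string moves fine under the lock
    awaited_cmd_handled = true;
    // give cmd_received_sem here, as in the original code
}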

Does the use of an anonymous pipe introduce a memory barrier for interthread communication?

For example, say I allocate a struct with new and write the pointer into the write end of an anonymous pipe.
If I read the pointer from the corresponding read end, am I guaranteed to see the 'correct' contents on the struct?
Also of interest is whether the results of socketpair() on Unix and self-connecting over TCP loopback on Windows have the same guarantees.
The context is a server design which centralizes event dispatch with select/epoll
For example, say I allocate a struct with new and write the pointer into the write end of an anonymous pipe.
If I read the pointer from the corresponding read end, am I guaranteed to see the 'correct' contents on the struct?
No. There is no guarantee that the writing CPU will have flushed the write out of its cache and made it visible to the other CPU that might do the read.
Also of interest is whether the results of socketpair() on Unix and self-connecting over TCP loopback on Windows have the same guarantees.
No.
In practice, calling write(), which is a system call, will end up locking one or more data structures in the kernel, which should take care of the reordering issue. For example, POSIX requires subsequent reads to see data written before their call, which implies a lock (or some kind of acquire/release) by itself.
As for whether that's part of the formal spec of the calls, probably it's not.
A pointer is just a memory address, so provided you are in the same process the pointer will be valid in the receiving thread and will point to the same struct. If you are in different processes, at best you will immediately get a memory error; at worst you will read (or write) random memory, which is essentially undefined behaviour.
Will you read the correct content? Neither better nor worse than if your pointer was in a static variable shared by both threads: you still have to do some synchronization if you want consistency.
Does the kind of transfer channel matter - static memory (shared by threads), anonymous pipes, socket pairs, TCP loopback, etc.? No: all those channels transfer bytes, so if you pass a memory address, you will get your memory address back. What is left to you then is synchronization, because here you are only sharing a memory address.
If you do not use any other synchronization, anything can happen (did I already speak of undefined behaviour?):
the reading thread can access the memory before it has been written by the writing one, giving stale data
if you forgot to declare the struct members as volatile, the reading thread can keep using cached values, again giving stale data
the reading thread can read partially written data, meaning incoherent data
Interesting question with, so far, only one correct answer from Cornstalks.
Within the same (multi-threaded) process there are no guarantees since pointer and data follow different paths to reach their destination.
Implicit acquire/release guarantees do not apply since the struct data cannot piggyback on the pointer through the cache and formally you are dealing with a data race.
However, looking at how the pointer and the struct data itself reach the second thread (through the pipe and memory cache respectively), there is a real chance that this mechanism is not going to cause any harm.
Sending the pointer to a peer thread takes 3 system calls (write() in the sending thread, select() and read() in the receiving thread) which is (relatively) expensive and by the time the pointer value is available
in the receiving thread, the struct data probably has arrived long before.
Note that this is just an observation, the mechanism is still incorrect.
I believe your case can be reduced to this two-thread model:
int data = 0;
std::atomic<int*> atomicPtr{nullptr};
//...
void thread1()
{
data = 42;
atomicPtr.store(&data, std::memory_order_release);
}
void thread2()
{
int* ptr = nullptr;
while(!ptr)
ptr = atomicPtr.load(std::memory_order_consume);
assert(*ptr == 42);
}
Since you have two processes you can't use one atomic variable across them, but since you listed Windows you can omit the memory_order_consume barrier on the consuming side because, AFAIK, all the architectures Windows runs on guarantee this load to be correct without any barrier on the loading side. In fact, I think there are not many architectures out there where that load would not be a no-op (I have only heard of DEC Alpha).
I agree with Serge Ballesta's answer. Within the same process, it's feasible to send and receive an object's address via an anonymous pipe.
Because the write system call is guaranteed to be atomic when the message size is below PIPE_BUF (normally 4096 bytes), multiple producer threads will not mess up each other's object addresses (8 bytes for 64-bit applications).
Talk is cheap, here is the demo code for Linux (defensive code and error handlers are omitted for simplicity). Just copy & paste to pipe_ipc_demo.cc then compile & run the test.
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <stdint.h>
#include <pthread.h>
#include <sys/select.h>
#include <string>
#include <list>
template<class T> class MPSCQ { // pipe based Multi Producer Single Consumer Queue
public:
MPSCQ();
~MPSCQ();
int producerPush(const T* t);
T* consumerPoll(double timeout = 1.0);
private:
void _consumeFd();
int _selectFdConsumer(double timeout);
T* _popFront();
private:
int _fdProducer;
int _fdConsumer;
char* _consumerBuf;
std::string* _partial;
std::list<T*>* _list;
static const int _PTR_SIZE;
static const int _CONSUMER_BUF_SIZE;
};
template<class T> const int MPSCQ<T>::_PTR_SIZE = sizeof(void*);
template<class T> const int MPSCQ<T>::_CONSUMER_BUF_SIZE = 1024;
template<class T> MPSCQ<T>::MPSCQ() :
_fdProducer(-1),
_fdConsumer(-1) {
_consumerBuf = new char[_CONSUMER_BUF_SIZE];
_partial = new std::string; // for holding partial pointer address
_list = new std::list<T*>; // unconsumed T* cache
int fd_[2];
int r = pipe(fd_);
_fdConsumer = fd_[0];
_fdProducer = fd_[1];
}
template<class T> MPSCQ<T>::~MPSCQ() { /* omitted */ }
template<class T> int MPSCQ<T>::producerPush(const T* t) {
return t == NULL ? 0 : write(_fdProducer, &t, _PTR_SIZE);
}
template<class T> T* MPSCQ<T>::consumerPoll(double timeout) {
T* t = _popFront();
if (t != NULL) {
return t;
}
if (_selectFdConsumer(timeout) <= 0) { // timeout or error
return NULL;
}
_consumeFd();
return _popFront();
}
template<class T> void MPSCQ<T>::_consumeFd() {
memcpy(_consumerBuf, _partial->data(), _partial->length());
ssize_t r = read(_fdConsumer, _consumerBuf, _CONSUMER_BUF_SIZE - _partial->length());
if (r <= 0) { // EOF or error, error handler omitted
return;
}
const char* p = _consumerBuf;
int remaining_len_ = _partial->length() + r;
T* t;
while (remaining_len_ >= _PTR_SIZE) {
memcpy(&t, p, _PTR_SIZE);
_list->push_back(t);
remaining_len_ -= _PTR_SIZE;
p += _PTR_SIZE;
}
*_partial = std::string(p, remaining_len_);
}
template<class T> int MPSCQ<T>::_selectFdConsumer(double timeout) {
int r;
int nfds_ = _fdConsumer + 1;
fd_set readfds_;
struct timeval timeout_;
int64_t usec_ = timeout * 1000000.0;
while (true) {
timeout_.tv_sec = usec_ / 1000000;
timeout_.tv_usec = usec_ % 1000000;
FD_ZERO(&readfds_);
FD_SET(_fdConsumer, &readfds_);
r = select(nfds_, &readfds_, NULL, NULL, &timeout_);
if (r < 0 && errno == EINTR) {
continue;
}
return r;
}
}
template<class T> T* MPSCQ<T>::_popFront() {
if (!_list->empty()) {
T* t = _list->front();
_list->pop_front();
return t;
} else {
return NULL;
}
}
// = = = = = test code below = = = = =
#define _LOOP_CNT 5000000
#define _ONE_MILLION 1000000
#define _PRODUCER_THREAD_NUM 2
struct TestMsg { // all public
int _threadId;
int _msgId;
int64_t _val;
TestMsg(int thread_id, int msg_id, int64_t val) :
_threadId(thread_id),
_msgId(msg_id),
_val(val) { };
};
static MPSCQ<TestMsg> _QUEUE;
static int64_t _SUM = 0;
void* functor_producer(void* arg) {
int my_thr_id_ = pthread_self();
TestMsg* msg_;
for (int i = 0; i <= _LOOP_CNT; ++ i) {
if (i == _LOOP_CNT) {
msg_ = new TestMsg(my_thr_id_, i, -1);
} else {
msg_ = new TestMsg(my_thr_id_, i, i + 1);
}
_QUEUE.producerPush(msg_);
}
return NULL;
}
void* functor_consumer(void* arg) {
int msg_cnt_ = 0;
int stop_cnt_ = 0;
TestMsg* msg_;
while (true) {
if ((msg_ = _QUEUE.consumerPoll()) == NULL) {
continue;
}
int64_t val_ = msg_->_val;
delete msg_;
if (val_ <= 0) {
if ((++ stop_cnt_) >= _PRODUCER_THREAD_NUM) {
printf("All done, _SUM=%ld\n", _SUM);
break;
}
} else {
_SUM += val_;
if ((++ msg_cnt_) % _ONE_MILLION == 0) {
printf("msg_cnt_=%d, _SUM=%ld\n", msg_cnt_, _SUM);
}
}
}
return NULL;
}
int main(int argc, char* const* argv) {
pthread_t consumer_;
pthread_create(&consumer_, NULL, functor_consumer, NULL);
pthread_t producers_[_PRODUCER_THREAD_NUM];
for (int i = 0; i < _PRODUCER_THREAD_NUM; ++ i) {
pthread_create(&producers_[i], NULL, functor_producer, NULL);
}
for (int i = 0; i < _PRODUCER_THREAD_NUM; ++ i) {
pthread_join(producers_[i], NULL);
}
pthread_join(consumer_, NULL);
return 0;
}
And here is the test result ( 2 * sum(1..5000000) == (1 + 5000000) * 5000000 == 25000005000000 ):
$ g++ -o pipe_ipc_demo pipe_ipc_demo.cc -lpthread
$ ./pipe_ipc_demo ## output may vary except for the final _SUM
msg_cnt_=1000000, _SUM=251244261289
msg_cnt_=2000000, _SUM=1000708879236
msg_cnt_=3000000, _SUM=2250159002500
msg_cnt_=4000000, _SUM=4000785160225
msg_cnt_=5000000, _SUM=6251640644676
msg_cnt_=6000000, _SUM=9003167062500
msg_cnt_=7000000, _SUM=12252615629881
msg_cnt_=8000000, _SUM=16002380952516
msg_cnt_=9000000, _SUM=20252025092401
msg_cnt_=10000000, _SUM=25000005000000
All done, _SUM=25000005000000
The technique shown here is used in our production applications. One typical usage is that the consumer thread acts as a log writer, while worker threads can write log messages almost asynchronously. Yes, almost: sometimes a writer thread may block in write() when the pipe is full, and that is a reliable congestion-control feature provided by the OS.
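For example, a tiny log channel built on the MPSCQ above might look like the following (illustration only; LogMsg and the function names are made up, and it assumes the MPSCQ template and <stdio.h> from the demo are in scope):
struct LogMsg {
    std::string text;
    explicit LogMsg(const std::string& t) : text(t) {}
};
static MPSCQ<LogMsg> g_logQueue;
// Called from any worker thread: near-asynchronous, blocks only if the pipe is full.
void async_log(const std::string& line) {
    g_logQueue.producerPush(new LogMsg(line + "\n"));
}
// The single consumer thread that actually writes the log out.
void* log_writer_thread(void*) {
    for (;;) {
        if (LogMsg* msg = g_logQueue.consumerPoll()) {
            fwrite(msg->text.data(), 1, msg->text.size(), stderr);
            delete msg;
        }
    }
    return NULL;
}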

memory leak with sockets and map

I have a socket server; every time a new connection is made, an XClient class is instantiated and inserted into a map. I am watching the memory usage through Task Manager. Every time a new connection is made, let's assume the memory usage of my program increases by 800kb, for example. Inside that class there is a connected variable, which tells me whether this client is active or not. I created a thread that runs endlessly, iterates through all the elements of my map and checks whether the connected variable is true or false; if it is false, I am (at least I think I am...) releasing the memory used by the previously instantiated XClient class. BUT the memory usage decreases by only half of the 800kb (for example - no precise values). So, when a client connects: +800kb; when a client disconnects: -400kb. I think I have a memory leak? If I had 100 clients connected, the 400kb per client that is not released would turn into 40,000kb of unused(?) memory, and that would be a problem.
So, here is my code.
The thread to iterate through all elements:
DWORD Update(XSockets *sockets)
{
while(true)
{
for(sockets->it = sockets->clients.begin(); sockets->it != sockets->clients.end(); sockets->it++)
{
int key = (*sockets->it).first;
if(sockets->clients[key]->connected == false) // remove the client, releasing memory
{
delete sockets->clients[key];
}
}
Sleep(100);
}
return true;
}
The code that is adding new XClients instances to my map:
bool XSockets::AcceptConnections()
{
struct sockaddr_in from;
while(true)
{
try
{
int fromLen = sizeof(from);
SOCKET client = accept(this->loginSocket,(struct sockaddr*)&from,&fromLen);
if(client != INVALID_SOCKET)
{
srand(time(NULL));
int clientKey = rand();
XClient* clientClass = new XClient(inet_ntoa(from.sin_addr),clientKey,client);
this->clients.insert(make_pair(clientKey,clientClass));
}
Sleep(100);
}
catch(...)
{
printf("error accepting incoming connection!\r\n");
break;
}
}
closesocket(this->loginSocket);
WSACleanup();
return true;
}
And the declarations:
map<int,XClient*> clients;
map<int,XClient*>::iterator it;
You've got several problems, but the chief one is that you appear to be sharing a map between threads without any synchronization at all. That can lead to all kinds of trouble.
Are you using C++11 or Boost? To avoid memory-leak nightmares like this, you could create a map of shared pointers. This way, you can let the structure clean itself up.
This is how I would do it:
#include <memory>
#include <map>
#include <algorithm>
#include <functional>
#include <mutex>
typedef std::shared_ptr<XClient> XClientPtr;
std::map<int, XClientPtr> client;
std::mutex the_lock;
bool XSockets::AcceptConnections()
{
/* snip */
auto clientClass = std::make_shared<XClient>(/*... params ...*/);
the_lock.lock();
clients[clientKey] = clientClass;
the_lock.unlock();
/* snip */
}
bool client_is_connected(const std::pair<int, XClientPtr> &p) {
return p.second->connected;
}
DWORD Update(XSockets *sockets) {
while(true) { /* You should probably have some kind of
exit condition here. Like a global "running" bool
so that the thread will eventually stop. */
the_lock.lock();
auto it = sockets->clients.begin(), end = sockets->clients.end();
for(; it != end; ) {
if (!it->second->connected)
//Clients will be destructed here if their refcount goes to 0
sockets->clients.erase(it++);
else
++it;
}
the_lock.unlock();
Sleep(100);
}
return 1;
}
Note: Above code is untested. I haven't even tried to compile it.
See What happens to an STL iterator after erasing it in VS, UNIX/Linux?. In your case, you are not deleting everything, so you will not want to use a plain for loop.
sockets->it = sockets->clients.begin();
while (sockets->it != sockets->clients.end())
{
int key = (*sockets->it).first;
if(sockets->clients[key]->connected == false) // remove the client, releasing memory
{
delete sockets->clients[key];
sockets->clients.erase(sockets->it++);
}
else
{
sockets->it++;
}
}

Win32 Read/Write Lock Using Only Critical Sections

I have to implement a read/write lock in C++ using the Win32 api as part of a project at work. All of the existing solutions use kernel objects (semaphores and mutexes) that require a context switch during execution. This is far too slow for my application.
I would like implement one using only critical sections, if possible. The lock does not have to be process safe, only threadsafe. Any ideas on how to go about this?
If you can target Vista or greater, you should use the built-in SRWLocks. They are lightweight like critical sections, and entirely user-mode when there is no contention.
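For reference, a minimal sketch of SRWLock usage (Vista and later; the API lives in <windows.h>, and the variable names here are made up):
#include <windows.h>
SRWLOCK g_lock = SRWLOCK_INIT;          // static initialisation; InitializeSRWLock() also works
int g_sharedValue = 0;
int reader()
{
    AcquireSRWLockShared(&g_lock);      // any number of readers may hold the lock together
    int value = g_sharedValue;
    ReleaseSRWLockShared(&g_lock);
    return value;
}
void writer(int value)
{
    AcquireSRWLockExclusive(&g_lock);   // writers get exclusive access
    g_sharedValue = value;
    ReleaseSRWLockExclusive(&g_lock);
}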
Joe Duffy's blog has some recent entries on implementing different types of non-blocking reader/writer locks. These locks do spin, so they would not be appropriate if you intend to do a lot of work while holding the lock. The code is C#, but should be straightforward to port to native.
You can implement a reader/writer lock using critical sections and events - you just need to keep enough state to only signal the event when necessary to avoid an unnecessary kernel mode call.
I don't think this can be done without using at least one kernel-level object (Mutex or Semaphore), because you need the help of the kernel to make the calling process block until the lock is available.
Critical sections do provide blocking, but the API is too limited. e.g. you cannot grab a CS, discover that a read lock is available but not a write lock, and wait for the other process to finish reading (because if the other process has the critical section it will block other readers which is wrong, and if it doesn't then your process will not block but spin, burning CPU cycles.)
However what you can do is use a spin lock and fall back to a mutex whenever there is contention. The critical section is itself implemented this way. I would take an existing critical section implementation and replace the PID field with separate reader & writer counts.
Old question, but this is something that should work. It doesn't spin on contention. Readers incur limited extra cost if they have little or no contention, because SetEvent is called lazily (look at the edit history for a more heavyweight version that doesn't have this optimization).
#include <windows.h>
typedef struct _RW_LOCK {
CRITICAL_SECTION countsLock;
CRITICAL_SECTION writerLock;
HANDLE noReaders;
int readerCount;
BOOL waitingWriter;
} RW_LOCK, *PRW_LOCK;
void rwlock_init(PRW_LOCK rwlock)
{
InitializeCriticalSection(&rwlock->writerLock);
InitializeCriticalSection(&rwlock->countsLock);
/*
* Could use a semaphore as well. There can only be one waiter ever,
* so I'm showing an auto-reset event here.
*/
rwlock->noReaders = CreateEvent (NULL, FALSE, FALSE, NULL);
}
void rwlock_rdlock(PRW_LOCK rwlock)
{
/*
* We need to lock the writerLock too, otherwise a writer could
* do the whole of rwlock_wrlock after the readerCount changed
* from 0 to 1, but before the event was reset.
*/
EnterCriticalSection(&rwlock->writerLock);
EnterCriticalSection(&rwlock->countsLock);
++rwlock->readerCount;
LeaveCriticalSection(&rwlock->countsLock);
LeaveCriticalSection(&rwlock->writerLock);
}
void rwlock_wrlock(PRW_LOCK rwlock)
{
EnterCriticalSection(&rwlock->writerLock);
/*
* readerCount cannot become non-zero within the writerLock CS,
* but it can become zero...
*/
if (rwlock->readerCount > 0) {
EnterCriticalSection(&rwlock->countsLock);
/* ... so test it again. */
if (rwlock->readerCount > 0) {
rwlock->waitingWriter = TRUE;
LeaveCriticalSection(&rwlock->countsLock);
WaitForSingleObject(rwlock->noReaders, INFINITE);
} else {
/* How lucky, no need to wait. */
LeaveCriticalSection(&rwlock->countsLock);
}
}
/* writerLock remains locked. */
}
void rwlock_rdunlock(PRW_LOCK rwlock)
{
EnterCriticalSection(&rwlock->countsLock);
assert (rwlock->readerCount > 0);
if (--rwlock->readerCount == 0) {
if (rwlock->waitingWriter) {
/*
* Clear waitingWriter here to avoid taking countsLock
* again in wrlock.
*/
rwlock->waitingWriter = FALSE;
SetEvent(rwlock->noReaders);
}
}
LeaveCriticalSection(&rwlock->countsLock);
}
void rwlock_wrunlock(PRW_LOCK rwlock)
{
LeaveCriticalSection(&rwlock->writerLock);
}
You could decrease the cost for readers by using a single CRITICAL_SECTION:
countsLock is replaced with writerLock in rdlock and rdunlock
rwlock->waitingWriter = FALSE is removed in wrunlock
wrlock's body is changed to
EnterCriticalSection(&rwlock->writerLock);
rwlock->waitingWriter = TRUE;
while (rwlock->readerCount > 0) {
LeaveCriticalSection(&rwlock->writerLock);
WaitForSingleObject(rwlock->noReaders, INFINITE);
EnterCriticalSection(&rwlock->writerLock);
}
rwlock->waitingWriter = FALSE;
/* writerLock remains locked. */
However this loses in fairness, so I prefer the above solution.
Take a look at the book "Concurrent Programming on Windows" which has lots of different reference examples for reader/writer locks.
Check out the spin_rw_mutex from Intel's Threading Building Blocks: spin_rw_mutex is strictly in user-land and employs spin-wait for blocking.
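A rough usage sketch (assuming the classic tbb/spin_rw_mutex.h header; the nested scoped_lock constructor takes a boolean writer flag):
#include "tbb/spin_rw_mutex.h"
tbb::spin_rw_mutex g_rwMutex;
int g_data = 0;
int read_data()
{
    // false = shared (reader) lock; waiting spins in user space instead of calling into the kernel
    tbb::spin_rw_mutex::scoped_lock lock(g_rwMutex, /*write=*/false);
    return g_data;
}
void write_data(int value)
{
    // true = exclusive (writer) lock
    tbb::spin_rw_mutex::scoped_lock lock(g_rwMutex, /*write=*/true);
    g_data = value;
}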
This is an old question, but perhaps someone will find this useful. We developed a high-performance, open-source RWLock for Windows that automatically uses the Vista+ SRWLock Michael mentioned if it is available, and otherwise falls back to a userspace implementation.
As an added bonus, there are four different "flavors" of it (though you can stick to the basic one, which is also the fastest), each providing more synchronization options. They range from the basic RWLock(), which is non-reentrant, limited to single-process synchronization, and without swapping of read/write locks, up to a full-fledged cross-process IPC RWLock with re-entrance support and read/write de-elevation.
As mentioned, they dynamically swap out to the Vista+ slim read/write locks for best performance when possible, but you don't have to worry about that at all, as they fall back to a fully compatible implementation on Windows XP and its ilk.
If you already know of a solution that only uses mutexes, you should be able to modify it to use critical sections instead.
We rolled our own using two critical sections and some counters. It suits our needs - we have a very low writer count, writers get precedence over readers, etc. I'm not at liberty to publish ours but can say that it is possible without mutexes and semaphores.
Here is the smallest solution that I could come up with:
http://www.baboonz.org/rwlock.php
And pasted verbatim:
/** A simple Reader/Writer Lock.
This RWL has no events - we rely solely on spinlocks and sleep() to yield control to other threads.
I don't know what the exact penalty is for using sleep vs events, but at least when there is no contention, we are basically
as fast as a critical section. This code is written for Windows, but it should be trivial to find the appropriate
equivalents on another OS.
**/
class TinyReaderWriterLock
{
public:
volatile uint32 Main;
static const uint32 WriteDesireBit = 0x80000000;
void Noop( uint32 tick )
{
if ( ((tick + 1) & 0xfff) == 0 ) // Sleep after 4k cycles. Crude, but usually better than spinning indefinitely.
Sleep(0);
}
TinyReaderWriterLock() { Main = 0; }
~TinyReaderWriterLock() { ASSERT( Main == 0 ); }
void EnterRead()
{
for ( uint32 tick = 0 ;; tick++ )
{
uint32 oldVal = Main;
if ( (oldVal & WriteDesireBit) == 0 )
{
if ( InterlockedCompareExchange( (LONG*) &Main, oldVal + 1, oldVal ) == oldVal )
break;
}
Noop(tick);
}
}
void EnterWrite()
{
for ( uint32 tick = 0 ;; tick++ )
{
if ( (tick & 0xfff) == 0 ) // Set the write-desire bit every 4k cycles (including cycle 0).
_InterlockedOr( (LONG*) &Main, WriteDesireBit );
uint32 oldVal = Main;
if ( oldVal == WriteDesireBit )
{
if ( InterlockedCompareExchange( (LONG*) &Main, -1, WriteDesireBit ) == WriteDesireBit )
break;
}
Noop(tick);
}
}
void LeaveRead()
{
ASSERT( Main != -1 );
InterlockedDecrement( (LONG*) &Main );
}
void LeaveWrite()
{
ASSERT( Main == -1 );
InterlockedIncrement( (LONG*) &Main );
}
};
I wrote the following code using only critical sections.
class ReadWriteLock {
volatile LONG writelockcount;
volatile LONG readlockcount;
CRITICAL_SECTION cs;
public:
ReadWriteLock() {
InitializeCriticalSection(&cs);
writelockcount = 0;
readlockcount = 0;
}
~ReadWriteLock() {
DeleteCriticalSection(&cs);
}
void AcquireReaderLock() {
retry:
while (writelockcount) {
Sleep(0);
}
EnterCriticalSection(&cs);
if (!writelockcount) {
readlockcount++;
}
else {
LeaveCriticalSection(&cs);
goto retry;
}
LeaveCriticalSection(&cs);
}
void ReleaseReaderLock() {
EnterCriticalSection(&cs);
readlockcount--;
LeaveCriticalSection(&cs);
}
void AcquireWriterLock() {
retry:
while (writelockcount||readlockcount) {
Sleep(0);
}
EnterCriticalSection(&cs);
if (!writelockcount&&!readlockcount) {
writelockcount++;
}
else {
LeaveCriticalSection(&cs);
goto retry;
}
LeaveCriticalSection(&cs);
}
void ReleaseWriterLock() {
EnterCriticalSection(&cs);
writelockcount--;
LeaveCriticalSection(&cs);
}
};
To perform a spin-wait, comment the lines with Sleep(0).
Look at my implementation here:
https://github.com/coolsoftware/LockLib
VRWLock is a C++ class that implements single-writer, multiple-readers logic.
Also look at the test project TestLock.sln.
UPD. Below is the simple code for a reader and a writer:
LONG gCounter = 0;
// reader
for (;;) //loop
{
LONG n = InterlockedIncrement(&gCounter);
// n = value of gCounter after increment
if (n <= MAX_READERS) break; // writer does not write anything - we can read
InterlockedDecrement(&gCounter);
}
// read data here
InterlockedDecrement(&gCounter); // release reader
// writer
for (;;) //loop
{
LONG n = InterlockedCompareExchange(&gCounter, (MAX_READERS+1), 0);
// n = value of gCounter before attempt to replace it by MAX_READERS+1 in InterlockedCompareExchange
// if gCounter was 0 - no readers/writers and in gCounter will be MAX_READERS+1
// if gCounter was not 0 - gCounter stays unchanged
if (n == 0) break;
}
// write data here
InterlockedExchangeAdd(&gCounter, -(MAX_READERS+1)); // release writer
The VRWLock class supports a spin count and a thread-specific reference count that allows releasing the locks of terminated threads.