Wait for all threads to finish one job before starting another - C++

Suppose there are NUM_THREADS threads and they all have to complete job1() before any of them starts job2(). How can I guarantee that, along these lines:
void *thread_func(void *arg)
{
    while(1) {
        job1();
        some_kind_of_waiting();
        job2();
    }
    return NULL;
}
Will a semaphore like the following work, or is there another/better solution?
{
    static int done;
    static sem_t semaphore;
    if (__sync_fetch_and_add(&done, 1) == NUM_THREADS - 1) {
        done = 0;
        for (int i = 0; i < NUM_THREADS - 1; i++)
            sem_post(&semaphore);
    } else {
        sem_wait(&semaphore);
    }
}
Thanks.

This is precisely the problem that pthread barriers are intended to solve. Initialise the barrier with a count of NUM_THREADS (in your main function, prior to spawning the threads):
pthread_barrier_t barrier;
pthread_barrier_init(&barrier, NULL, NUM_THREADS);
and use pthread_barrier_wait() to synchronise:
void *thread_func(void *arg)
{
    while(1) {
        job1();
        pthread_barrier_wait(&barrier);
        job2();
    }
    return NULL;
}
If you also need the threads to wait until every other thread has completed job2() before any of them starts job1() again, you can add a second wait on the barrier:
void *thread_func(void *arg)
{
    while(1) {
        job1();
        pthread_barrier_wait(&barrier);
        job2();
        pthread_barrier_wait(&barrier);
    }
    return NULL;
}

You are describing a high-level concurrency control structure commonly called a barrier. The C++ standard library has no barrier implementation yet, although one has been proposed and will hopefully become part of the standard library in the future.
Until the standard library provides a solution, you can of course implement the barrier yourself (for example on top of a condition variable), use a platform-specific threading API, or use a wrapper library.
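For example, a barrier can be sketched on top of std::mutex and std::condition_variable along these lines (a minimal illustration assuming C++11; the class name CondBarrier is made up here):

#include <mutex>
#include <condition_variable>

// Minimal reusable barrier sketch built on a condition variable.
// "CondBarrier" is an illustrative name, not a standard class.
class CondBarrier {
public:
    explicit CondBarrier(unsigned count) : threshold(count), remaining(count), generation(0) {}

    void wait() {
        std::unique_lock<std::mutex> lock(mx);
        unsigned gen = generation;
        if (--remaining == 0) {
            // Last thread to arrive: start a new generation and wake everyone.
            ++generation;
            remaining = threshold;
            cv.notify_all();
        } else {
            // Wait until the generation changes, i.e. all threads have arrived.
            cv.wait(lock, [this, gen] { return gen != generation; });
        }
    }

private:
    std::mutex mx;
    std::condition_variable cv;
    const unsigned threshold;
    unsigned remaining;
    unsigned generation;
};

All threads share one CondBarrier instance and call wait(); the last thread to arrive bumps the generation counter and wakes the rest, which makes the barrier reusable across loop iterations.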

Win32 alternative to pthread

Is it possible to write this using standard Win32 CreateMutex-style code? I am just wondering whether I want to introduce a new library to our application or whether I can find a way to write this myself. I just can't figure out how to do the wait inside a CriticalSection. This is my current working code with the pthread library.
T remove() {
    pthread_mutex_lock(&m_mutex);
    while (m_queue.size() == 0) {
        pthread_cond_wait(&m_condv, &m_mutex);
    }
    T item = m_queue.front();
    m_queue.pop_front();
    pthread_mutex_unlock(&m_mutex);
    return item;
}
For pre-VC-2012 support, the best alternative is Boost.Thread, which supports condition variables.
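For illustration, the blocking remove() above might translate to Boost.Thread roughly like this (a sketch only; it assumes boost::mutex m_mutex and boost::condition_variable m_condv members next to the same queue, with the <boost/thread/mutex.hpp> and <boost/thread/condition_variable.hpp> headers included):

T remove()
{
    boost::unique_lock<boost::mutex> lock(m_mutex);  // unlocked automatically in the destructor
    while (m_queue.size() == 0)
        m_condv.wait(lock);                          // atomically releases the lock while waiting
    T item = m_queue.front();
    m_queue.pop_front();
    return item;
}

void add(T item)
{
    {
        boost::unique_lock<boost::mutex> lock(m_mutex);
        m_queue.push_back(item);
    }
    m_condv.notify_one();                            // wake one waiting consumer
}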
Here's my attempt. This is not the best implementation of a condition-style wait lock in Win32, but I think it works. It could use careful code-review scrutiny.
One caveat - it doesn't necessarily guarantee ordered fairness, since all the waiting threads may initially be blocked waiting for the event. The scheduler will resume all of those threads to continue running (up to the subsequent blocking EnterCriticalSection call), but not necessarily in the order in which they arrived at the remove() call to begin with. This likely isn't a big deal for most apps with only a handful of threads, but it's something most threading frameworks guarantee.
Other caveat - for brevity, I'm leaving out the important step of checking the return values of all of these Win32 APIs.
CRITICAL_SECTION m_cs;
HANDLE m_event;

void Init()
{
    InitializeCriticalSection(&m_cs);
    m_event = CreateEvent(NULL, TRUE, FALSE, NULL); // manual-reset event
}

void UnInit()
{
    DeleteCriticalSection(&m_cs);
    CloseHandle(m_event);
    m_event = NULL;
}

T remove()
{
    T item;
    bool fGotItem = false;
    while (fGotItem == false)
    {
        // wait for the event to be signaled
        WaitForSingleObject(m_event, INFINITE);
        // wait for the critical section to become available
        EnterCriticalSection(&m_cs);
        // inside critical section
        {
            // try to dequeue something - it’s possible that the queue is empty because another
            // thread pre-empted us and got the last item in the queue before us
            size_t queue_size = m_queue.size();
            if (queue_size == 1)
            {
                // the queue is about to go empty
                ResetEvent(m_event);
            }
            if (queue_size > 0)
            {
                fGotItem = true;
                item = m_queue.front();
                m_queue.pop_front();
            }
        }
        LeaveCriticalSection(&m_cs);
    }
    return item;
}

void Add(T& item)
{
    // wait for the critical section to become available
    EnterCriticalSection(&m_cs);
    // inside critical section
    {
        m_queue.push_back(item);
        SetEvent(m_event); // signal other threads that something is available
    }
    LeaveCriticalSection(&m_cs);
}
Windows Vista introduced new native Win32 Condition Variable and Slim Reader/Writer Lock primitives for exactly this type of scenario, for example:
Using a critical section:
CRITICAL_SECTION m_cs;
CONDITION_VARIABLE m_condv;

InitializeCriticalSection(&m_cs);
InitializeConditionVariable(&m_condv);
...

void add(T item)
{
    EnterCriticalSection(&m_cs);
    m_queue.push_back(item);
    LeaveCriticalSection(&m_cs);
    WakeConditionVariable(&m_condv);
}

T remove()
{
    EnterCriticalSection(&m_cs);
    while (m_queue.size() == 0)
        SleepConditionVariableCS(&m_condv, &m_cs, INFINITE);
    T item = m_queue.front();
    m_queue.pop_front();
    LeaveCriticalSection(&m_cs);
    return item;
}
Using a SRW lock:
SRWLOCK m_lock;
CONDITION_VARIABLE m_condv;

InitializeSRWLock(&m_lock);
InitializeConditionVariable(&m_condv);
...

void add(T item)
{
    AcquireSRWLockExclusive(&m_lock);
    m_queue.push_back(item);
    ReleaseSRWLockExclusive(&m_lock);
    WakeConditionVariable(&m_condv);
}

T remove()
{
    AcquireSRWLockExclusive(&m_lock);
    while (m_queue.size() == 0)
        SleepConditionVariableSRW(&m_condv, &m_lock, INFINITE, 0);
    T item = m_queue.front();
    m_queue.pop_front();
    ReleaseSRWLockExclusive(&m_lock);
    return item;
}

Detached pthreads and memory leak

Can somebody please explain to me why this simple code leaks memory?
I believe that since the pthreads are created in the detached state, their resources should be released immediately upon termination, but that's not the case.
My environment is Qt5.2.
#include <QCoreApplication>
#include <windows.h>
#include <pthread.h>
#include <stdio.h>

void *threadFunc( void *arg )
{
    printf("#");
    pthread_exit(NULL);
}

int main()
{
    pthread_t thread;
    pthread_attr_t attr;
    while(1)
    {
        printf("\nStarting threads...\n");
        for(int idx = 0; idx < 100; idx++)
        {
            pthread_attr_init(&attr);
            pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
            pthread_create( &thread, &attr, &threadFunc, NULL);
            pthread_attr_destroy ( &attr );
        }
        printf("\nSleeping 10 seconds...\n");
        Sleep(10000);
    }
}
UPDATE:
I discovered that if I add a slight delay of 5 milliseconds inside the for loop the leak is WAY slower:
for(int idx = 0; idx < 100; idx++)
{
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
    pthread_create( &thread, &attr, &threadFunc, NULL);
    pthread_attr_destroy ( &attr );
    Sleep(5); /// <--- 5 MILLISECONDS DELAY ///
}
This is freaking me out. Could somebody please tell me what is happening? How can this slight delay produce such a significant change (or alter the behavior at all)?
Any advice would be greatly appreciated.
Thanks.
UPDATE2:
This leak was observed on Windows platforms (W7 and XP); no leak was observed on Linux platforms (thank you @MichaelGoren).
I checked the program with slight modifications on Windows using Cygwin, and memory consumption was steady. So it must be a Qt issue; the pthread library on Cygwin works fine without leaking.
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

void *threadFunc( void *arg )
{
    printf("#");
    pthread_exit(NULL);
}

int main()
{
    pthread_t thread;
    pthread_attr_t attr;
    int idx;
    while(1)
    {
        printf("\nStarting threads...\n");
        for(idx = 0; idx < 100; idx++)
        {
            pthread_attr_init(&attr);
            pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
            pthread_create( &thread, &attr, &threadFunc, NULL);
            pthread_attr_destroy ( &attr );
        }
        printf("\nSleeping 10 seconds...\n");
        //Sleep(10000);
        sleep(10);
    }
}
Compiler optimizations or the OS itself can decide to do loop unrolling, since your for loop has a constant bound (100 here). Because there is no explicit synchronization to prevent it, a newly created, detached thread can die and have its thread ID reassigned to another new thread before its creator returns from pthread_create(); due to this unrolling, the next iteration has already started before the previous thread was actually destroyed.
This also explains why your added slight delay causes fewer issues: each iteration takes longer, so the thread functions can actually finish in more cases, and the threads really are terminated most of the time.
A possible fix would be to disable compiler optimizations or to add synchronization; that is, at the end of the code you check whether the thread still exists, and if it does, you wait for its function to finish.
A trickier way would be to use mutexes: you let the thread claim a resource at creation, and by the definition of PTHREAD_CREATE_DETACHED that resource is automatically released when the thread exits, so you can use try_lock to test whether the thread has actually finished. Note that I haven't tested this approach, so I'm not actually sure whether PTHREAD_CREATE_DETACHED really works according to its definition...
Concept:
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

void *threadFunc( void *arg )
{
    printf("#");
    pthread_mutex_lock(&mutex);
    pthread_exit(NULL);
}

for(int idx = 0; idx < 100; idx++)
{
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
    pthread_create( &thread, &attr, &threadFunc, NULL);
    pthread_attr_destroy ( &attr );
    pthread_mutex_lock(&mutex); // will block until the exiting thread has released the mutex
    pthread_mutex_unlock(&mutex);
}
The delay can induce a large change in behavior because it gives the thread time to exit! Of course how your pthread library is implemented is also a factor here. I suspect it is using a 'free list' optimization.
If you create 1000 threads all at once, then the library allocates memory for them all before any significant number of those threads can exit.
If, as in your second code sample, you let the previous thread run and probably exit before you start a new thread, then your thread library can reuse that thread's allocated memory and data structures, which it now knows are no longer needed and is probably holding in a free list so that it can efficiently recycle them the next time someone creates a thread.
It has nothing to do with compiler optimisations. The code is fine. The problem could be:
a) Windows itself.
b) Qt implementation of pthread_create() with detached attributes
Checking for (a): Try to create many short-lived detached threads using the Windows _beginthreadex API directly and see if you get the same picture. Note: call CloseHandle(thread_handle) as soon as _beginthreadex returns to make the thread detached.
Checking for (b): Trace which function Qt uses to create threads. If it is _beginthread, then there is your answer. If it is _beginthreadex, then Qt is doing the right thing and you need to check whether Qt closes the thread handle immediately. If it does not, then that is the cause.
cheers
UPDATE 2
Qt 5.2.0 does not provide a pthreads API and is unlikely to be responsible for the observed leak.
I wrapped the native Windows API to see how the code runs without the pthread library. You can include this fragment right after the includes:
#include <process.h>
#include <errno.h>
#include <stdlib.h>

#define PTHREAD_CREATE_JOINABLE 0
#define PTHREAD_CREATE_DETACHED 1

typedef struct { int detachstate; } pthread_attr_t;
typedef HANDLE pthread_t;

__declspec(noreturn) void pthread_exit(void *retval)
{
    static_assert(sizeof(unsigned) == sizeof(void*), "Modify code");
    _endthreadex((unsigned)retval);
}

int pthread_attr_setdetachstate(pthread_attr_t *attr, int detachstate)
{
    attr->detachstate = detachstate;
    return 0;
}

int pthread_attr_init(pthread_attr_t *attr)
{
    attr->detachstate = PTHREAD_CREATE_JOINABLE;
    return 0;
}

int pthread_attr_destroy(pthread_attr_t *attr)
{
    (void)attr;
    return 0;
}

typedef struct {
    void *(*start_routine)(void *arg);
    void *arg;
} winapi_caller_args;

unsigned __stdcall winapi_caller(void *arglist)
{
    winapi_caller_args *list = (winapi_caller_args *)arglist;
    void *(*start_routine)(void *arg) = list->start_routine;
    void *arg = list->arg;
    free(list);
    static_assert(sizeof(unsigned) == sizeof(void*), "Modify code");
    return (unsigned)start_routine(arg);
}

int pthread_create( pthread_t *thread, pthread_attr_t *attr,
                    void *(*start_routine)(void *), void *arg)
{
    winapi_caller_args *list;
    list = (winapi_caller_args *)malloc(sizeof *list);
    if (list == NULL)
        return EAGAIN;
    list->start_routine = start_routine;
    list->arg = arg;
    *thread = (HANDLE)_beginthreadex(NULL, 0, winapi_caller, list, 0, NULL);
    if (*thread == 0) {
        free(list);
        return errno;
    }
    if (attr->detachstate == PTHREAD_CREATE_DETACHED)
        CloseHandle(*thread);
    return 0;
}
With the Sleep() line commented out it works OK without leaks. Run time = approx. 1 hour.
If the code with the Sleep line commented out instead calls the Pthreads-win32 2.9.1 library (prebuilt for MSVC), the program stops spawning new threads and stops responding after 5-10 minutes.
Test environment: XP Home, MSVC 2010 Express, Qt 5.2.0 qmake etc.
You forgot to join your threads (even if they have already finished).
The correct code should be:
pthread_t arr[100];
for(int idx = 0; idx < 100; idx++)
{
    pthread_attr_init(&attr);
    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
    pthread_create( &arr[idx], &attr, &threadFunc, NULL);
    pthread_attr_destroy ( &attr );
}
Sleep(2000);
for(int idx = 0; idx < 100; idx++)
{
    pthread_join(arr[idx], NULL);
}
Note from man page:
Failure to join with a thread that is joinable (i.e., one that is not detached), produces a "zombie thread". Avoid doing this, since each zombie thread consumes some system resources, and when enough zombie threads have accumulated, it will no longer be possible to create new threads (or processes).

How to share a variable between threads in a smart way using pthreads?

I want to synchronize threads in C++ using pthreads in a smart way.
I have one global variable:
int Resources = 0;
I have two thread functions:
void *incResources(void *arg)
{
    while(1)
    {
        pthread_mutex_lock (&resourcesMutex);
        Resources += 2;
        pthread_mutex_unlock (&resourcesMutex);
    }
    pthread_exit((void*) 0);
}

void *consumeResources(void *arg)
{
    while(1)
    {
        pthread_mutex_lock (&resourcesMutex);
        Resources--;
        pthread_mutex_unlock (&resourcesMutex);
    }
    pthread_exit((void*) 0);
}
And in the main function I initialize two consuming threads and one incrementing thread:
pthread_mutex_init(&resourcesMutex, NULL);
pthread_create(&callThd[0], &attr, incResources, (void *)i);
pthread_create(&callThd[1], &attr, consumeResources, (void *)i);
pthread_create(&callThd[2], &attr, consumeResources, (void *)i);
I feel this is inefficient and can be done better. Can you give me some ideas? I've tried to use wait but I don't get it :/
Thanks!
If you are looking for good, C++-style approaches, I strongly suggest reading C++ Concurrency in Action by Anthony Williams and leaving pthreads behind in favour of futures and similar high-level things where you can. And if you must go with manual thread fiddling, you can find good examples for that there too.
Your problem statement is too vague for sensible advice -- the basic idea of good threading is to NOT have shared state at all, and for a handshake situation like yours seems to be, use some synchronized queue made for that very purpose.
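As a small illustration of the futures suggestion, work can often be handed to std::async and collected through the returned futures instead of sharing a counter (a minimal C++11 sketch; produce() is a made-up stand-in for the real work):

#include <future>
#include <vector>

int produce() { return 2; }   // stands in for whatever work creates a resource

int main()
{
    std::vector<std::future<int>> results;
    for (int i = 0; i < 10; ++i)
        results.push_back(std::async(std::launch::async, produce));

    int total = 0;
    for (auto& f : results)
        total += f.get();     // get() blocks until that particular task has finished
    return total == 20 ? 0 : 1;
}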
A smarter way to do this would use std::mutex and std::thread (or the Boost equivalents) so you don't need to unlock mutexes manually.
A condition variable will allow the consumers to block (without wasting CPU cycles) until there is work available for them:
struct Resource
{
    int value;
    std::mutex mx;
    std::condition_variable cv;
};

void incResources(Resource& res)
{
    while(1)
    {
        {
            std::lock_guard<std::mutex> l(res.mx);
            res.value += 2;
        }
        res.cv.notify_all();
    }
}

void consumeResources(Resource& res)
{
    while(1)
    {
        std::unique_lock<std::mutex> l(res.mx);
        while (res.value == 0)
            res.cv.wait(l);
        res.value--;
    }
}
and in the main thread:
Resource res;
res.value = 0;
std::thread t1(incResources, std::ref(res));
std::thread t2(consumeResources, std::ref(res));
std::thread t3(consumeResources, std::ref(res));
// ...
t1.join();
t2.join();
t3.join();
I think if you're using C++ there's no reason to prefer native use of pthreads over the C++11 std::thread and STL synchronization classes.
If you can't use the C++11 standard, you should wrap the native pthreads interface in reasonable C++ class representations (see e.g. boost::thread or the STTCL Posix Thread implementation).
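For illustration, a minimal sketch of such a wrapper might be just a mutex class plus a scoped lock built on pthread_mutex_t (the class names here are invented, not part of any particular library):

#include <pthread.h>

// Tiny RAII wrappers around the pthreads mutex API, illustrative only.
class PthreadMutex {
public:
    PthreadMutex()  { pthread_mutex_init(&m_, NULL); }
    ~PthreadMutex() { pthread_mutex_destroy(&m_); }
    void lock()     { pthread_mutex_lock(&m_); }
    void unlock()   { pthread_mutex_unlock(&m_); }
private:
    pthread_mutex_t m_;
    PthreadMutex(const PthreadMutex&);            // non-copyable (pre-C++11 style)
    PthreadMutex& operator=(const PthreadMutex&);
};

class ScopedLock {
public:
    explicit ScopedLock(PthreadMutex& m) : m_(m) { m_.lock(); }
    ~ScopedLock() { m_.unlock(); }
private:
    PthreadMutex& m_;
};

With something like this, the locking in the increment/consume functions collapses to a single ScopedLock guard(mutexInstance); at the top of the critical region, and the unlock can never be forgotten.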
It looks like you are attempting to implement a producer and consumer, with the += thread creating work (numbers to be reduced) and the consumer taking them away.
Rather than having the consumer in a trivial spin loop like that, take a look at condition variables.
std::queue<Job*> queue;
pthread_mutex_t mutex;
pthread_cond_t cond;

void AddJob(Job* job) {
    pthread_mutex_lock(&mutex);
    queue.push(job);
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&mutex);
}

void* QueueWorker(void* /*threadInfo*/) {
    Job* job = NULL;
    for (;;) {
        pthread_mutex_lock(&mutex);
        while ( queue.empty() ) {
            // unlock the mutex until the cond is signal()d or broadcast() to.
            // if this call succeeds, we will have the mutex locked again on the other side.
            pthread_cond_wait(&cond, &mutex);
        }
        // take the first task and then release the lock.
        job = queue.front();
        queue.pop();
        pthread_mutex_unlock(&mutex);
        if ( job != NULL )
            job->Execute();
    }
    return NULL;
}
This scales to multiple consumers.
As an aside, while it can be useful to familiarize yourself with the pthreads implementation, you should probably look at one of the threading wrappers available. C++11 introduced std::thread and std::mutex, and many people swear by Boost, but personally I've found the OpenSceneGraph team's "OpenThreads" library one of the easiest and most elegant to work with.
Edit: here's a complete working implementation, albeit with a somewhat artificial mechanism for ending the run.
#include <queue>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

static int jobNo = 0;

class Job {
public:
    Job() : m_i(++jobNo) { printf("Created job %d.\n", m_i); }
    int m_i;
    void Execute() { printf("Job %d executing.\n", m_i); usleep(500 * 1000); }
};

std::queue<Job*> queue;
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

void AddJob(Job* job) {
    pthread_mutex_lock(&mutex);
    queue.push(job);
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&mutex);
}

void* QueueWorker(void* /*threadInfo*/) {
    Job* job = NULL;
    for (;;) {
        pthread_mutex_lock(&mutex);
        while ( queue.empty() ) {
            // unlock the mutex until the cond is signal()d or broadcast() to.
            // if this call succeeds, we will have the mutex locked again on the other side.
            pthread_cond_wait(&cond, &mutex);
        }
        // take the first task and then release the lock.
        job = queue.front();
        queue.pop();
        pthread_mutex_unlock(&mutex);
        if ( job == NULL ) {
            // in this demonstration, NULL ends the run, so forward it to any other threads.
            AddJob(NULL);
            break;
        }
        job->Execute();
        delete job;
    }
    return NULL;
}

int main(int argc, const char* argv[]) {
    pthread_t worker1, worker2;
    pthread_create(&worker1, NULL, &QueueWorker, NULL);
    pthread_create(&worker2, NULL, &QueueWorker, NULL);

    srand(time(NULL));
    // queue 5 jobs with delays.
    for ( size_t i = 0; i < 5; ++i ) {
        long delay = (rand() % 800) * 1000;
        printf("Producer sleeping %fs\n", (float)delay / (1000*1000));
        usleep(delay);
        Job* job = new Job();
        AddJob(job);
    }
    // 5 more without delays.
    for ( size_t i = 0; i < 5; ++i ) {
        AddJob(new Job);
    }
    // null to end the run.
    AddJob(NULL);
    printf("Done with jobs.\n");

    pthread_join(worker1, NULL);
    pthread_join(worker2, NULL);
    return 0;
}

Thread - synchronizing and sleeping thread refuses to wake up (LINUX)

I'm developing an application for OpenSUSE 12.1.
This application has a main thread and two other threads running instances of the same functions. I'm trying to use pthread_barrier to synchronize all the threads, but I'm having some problems:
When I put the spawned threads to sleep, they never wake up for some reason.
(This is the case when I remove the sleep from the other threads, which sends CPU usage through the roof.) At some point all the threads reach pthread_barrier_wait(), but none of them continues execution after that.
Here's some pseudo code trying to illustrate what I'm doing.
pthread_barrier_t barrier;

int main(void)
{
    pthread_barrier_init(&barrier, NULL, 3);
    pthread_create(&thread_id1, NULL, &thread_func, (void*) &params1);
    pthread_create(&thread_id2, NULL, &thread_func, (void*) &params2);
    while(1)
    {
        doSomeWork();
        nanosleep(&t1, &t2);
        pthread_barrier_wait(&barrier);
        doSomeMoreWork();
    }
}

void *thread_func(void *params)
{
    init_thread(params);
    while(1)
    {
        nanosleep(&t1, &t2);
        doAnotherWork();
        pthread_barrier_wait(&barrier);
    }
}
I don't think it has to do with the barrier as you've presented it in the pseudocode. I'm assuming your glibc is approximately the same as the one on my machine. I compiled roughly your pseudocode and it runs as I expect: the threads do some work, the main thread does some work, they all reach the barrier and then loop.
Can you say more about any other synchronization methods you use, or what the work functions do?
This is the example program I'm using:
#include <pthread.h>
#include <stdio.h>
#include <time.h>

struct timespec req = {1,0}; //{.tv_sec = 1, .tv_nsec = 0};
struct timespec rem = {0,0}; //{.tv_sec = 0, .tv_nsec = 0};
pthread_barrier_t barrier;

void *thread_func(void *params) {
    long int name;
    name = (long int)params;
    while(1) {
        printf("This is thread %ld\n", name);
        nanosleep(&req, &rem);
        pthread_barrier_wait(&barrier);
        printf("More work from %ld\n", name);
    }
}

int main(void)
{
    pthread_t th1, th2;
    pthread_barrier_init(&barrier, NULL , 3);
    pthread_create(&th1, NULL, &thread_func, (void*)1);
    pthread_create(&th2, NULL, &thread_func, (void*)2);
    while(1) {
        nanosleep(&req, &rem);
        printf("This is the parent\n\n");
        pthread_barrier_wait(&barrier);
    }
    return 0;
}
I would suggest using condition variables to synchronize the threads.
Here is a website about how to do it; I hope it helps.
http://www.yolinux.com/TUTORIALS/LinuxTutorialPosixThreads.html
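For reference, the basic shape of a pthread condition-variable handshake is something like the following (a generic sketch, not tied to the poster's code; the ready flag and function names are illustrative):

#include <pthread.h>

pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
int ready = 0;

/* waiting side */
void wait_until_ready(void)
{
    pthread_mutex_lock(&lock);
    while (!ready)                      /* loop guards against spurious wakeups */
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);
}

/* signalling side */
void announce_ready(void)
{
    pthread_mutex_lock(&lock);
    ready = 1;
    pthread_mutex_unlock(&lock);
    pthread_cond_signal(&cond);
}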

Win32 Read/Write Lock Using Only Critical Sections

I have to implement a read/write lock in C++ using the Win32 API as part of a project at work. All of the existing solutions use kernel objects (semaphores and mutexes) that require a context switch during execution. This is far too slow for my application.
I would like to implement one using only critical sections, if possible. The lock does not have to be process-safe, only thread-safe. Any ideas on how to go about this?
If you can target Vista or greater, you should use the built-in SRWLocks. They are lightweight like critical sections and entirely user-mode when there is no contention.
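For reference, shared (reader) and exclusive (writer) acquisition with an SRW lock looks like this (a minimal sketch guarding a hypothetical g_data value):

#include <windows.h>

SRWLOCK g_lock = SRWLOCK_INIT;
int g_data = 0;

int ReadData()
{
    AcquireSRWLockShared(&g_lock);      // many readers may hold this at once
    int value = g_data;
    ReleaseSRWLockShared(&g_lock);
    return value;
}

void WriteData(int value)
{
    AcquireSRWLockExclusive(&g_lock);   // writers get exclusive access
    g_data = value;
    ReleaseSRWLockExclusive(&g_lock);
}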
Joe Duffy's blog has some recent entries on implementing different types of non-blocking reader/writer locks. These locks do spin, so they would not be appropriate if you intend to do a lot of work while holding the lock. The code is C#, but should be straightforward to port to native.
You can implement a reader/writer lock using critical sections and events - you just need to keep enough state to only signal the event when necessary to avoid an unnecessary kernel mode call.
I don't think this can be done without using at least one kernel-level object (Mutex or Semaphore), because you need the help of the kernel to make the calling process block until the lock is available.
Critical sections do provide blocking, but the API is too limited. e.g. you cannot grab a CS, discover that a read lock is available but not a write lock, and wait for the other process to finish reading (because if the other process has the critical section it will block other readers which is wrong, and if it doesn't then your process will not block but spin, burning CPU cycles.)
However what you can do is use a spin lock and fall back to a mutex whenever there is contention. The critical section is itself implemented this way. I would take an existing critical section implementation and replace the PID field with separate reader & writer counts.
Old question, but this is something that should work. It doesn't spin on contention. Readers incur only limited extra cost when there is little or no contention, because SetEvent is called lazily (look at the edit history for a more heavyweight version that doesn't have this optimization).
#include <windows.h>
#include <assert.h>

typedef struct _RW_LOCK {
    CRITICAL_SECTION countsLock;
    CRITICAL_SECTION writerLock;
    HANDLE noReaders;
    int readerCount;
    BOOL waitingWriter;
} RW_LOCK, *PRW_LOCK;

void rwlock_init(PRW_LOCK rwlock)
{
    InitializeCriticalSection(&rwlock->writerLock);
    InitializeCriticalSection(&rwlock->countsLock);

    /*
     * Could use a semaphore as well. There can only be one waiter ever,
     * so I'm showing an auto-reset event here.
     */
    rwlock->noReaders = CreateEvent(NULL, FALSE, FALSE, NULL);
}

void rwlock_rdlock(PRW_LOCK rwlock)
{
    /*
     * We need to lock the writerLock too, otherwise a writer could
     * do the whole of rwlock_wrlock after the readerCount changed
     * from 0 to 1, but before the event was reset.
     */
    EnterCriticalSection(&rwlock->writerLock);
    EnterCriticalSection(&rwlock->countsLock);
    ++rwlock->readerCount;
    LeaveCriticalSection(&rwlock->countsLock);
    LeaveCriticalSection(&rwlock->writerLock);
}

void rwlock_wrlock(PRW_LOCK rwlock)
{
    EnterCriticalSection(&rwlock->writerLock);
    /*
     * readerCount cannot become non-zero within the writerLock CS,
     * but it can become zero...
     */
    if (rwlock->readerCount > 0) {
        EnterCriticalSection(&rwlock->countsLock);

        /* ... so test it again. */
        if (rwlock->readerCount > 0) {
            rwlock->waitingWriter = TRUE;
            LeaveCriticalSection(&rwlock->countsLock);

            WaitForSingleObject(rwlock->noReaders, INFINITE);
        } else {
            /* How lucky, no need to wait. */
            LeaveCriticalSection(&rwlock->countsLock);
        }
    }

    /* writerLock remains locked. */
}

void rwlock_rdunlock(PRW_LOCK rwlock)
{
    EnterCriticalSection(&rwlock->countsLock);
    assert (rwlock->readerCount > 0);
    if (--rwlock->readerCount == 0) {
        if (rwlock->waitingWriter) {
            /*
             * Clear waitingWriter here to avoid taking countsLock
             * again in wrlock.
             */
            rwlock->waitingWriter = FALSE;

            SetEvent(rwlock->noReaders);
        }
    }
    LeaveCriticalSection(&rwlock->countsLock);
}

void rwlock_wrunlock(PRW_LOCK rwlock)
{
    LeaveCriticalSection(&rwlock->writerLock);
}
You could decrease the cost for readers by using a single CRITICAL_SECTION:
countsLock is replaced with writerLock in rdlock and rdunlock
rwlock->waitingWriter = FALSE is removed in wrunlock
wrlock's body is changed to
EnterCriticalSection(&rwlock->writerLock);
rwlock->waitingWriter = TRUE;
while (rwlock->readerCount > 0) {
    LeaveCriticalSection(&rwlock->writerLock);
    WaitForSingleObject(rwlock->noReaders, INFINITE);
    EnterCriticalSection(&rwlock->writerLock);
}
rwlock->waitingWriter = FALSE;
/* writerLock remains locked. */
However this loses in fairness, so I prefer the above solution.
Take a look at the book "Concurrent Programming on Windows" which has lots of different reference examples for reader/writer locks.
Check out the spin_rw_mutex from Intel's Threading Building Blocks:
spin_rw_mutex is strictly in user-land and employs spin-wait for blocking.
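A minimal usage sketch (assuming the classic tbb/spin_rw_mutex.h header; g_shared is just an illustrative piece of shared data):

#include <tbb/spin_rw_mutex.h>

tbb::spin_rw_mutex g_mutex;
int g_shared = 0;

int reader()
{
    // false = acquire as a reader; multiple readers may enter concurrently
    tbb::spin_rw_mutex::scoped_lock lock(g_mutex, /*write=*/false);
    return g_shared;
}

void writer(int value)
{
    // true = acquire as a writer (exclusive)
    tbb::spin_rw_mutex::scoped_lock lock(g_mutex, /*write=*/true);
    g_shared = value;
}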
This is an old question, but perhaps someone will find this useful. We developed a high-performance, open-source RWLock for Windows that automatically uses the Vista+ SRWLock Michael mentioned if available, or otherwise falls back to a userspace implementation.
As an added bonus, there are four different "flavors" of it (though you can stick to the basic one, which is also the fastest), each providing more synchronization options. It starts with the basic RWLock(), which is non-reentrant, limited to single-process synchronization, and without swapping of read/write locks, and goes up to a full-fledged cross-process IPC RWLock with re-entrance support and read/write de-elevation.
As mentioned, they dynamically swap out to the Vista+ slim read/write locks for best performance when possible, but you don't have to worry about that at all, as it falls back to a fully compatible implementation on Windows XP and its ilk.
If you already know of a solution that only uses mutexes, you should be able to modify it to use critical sections instead.
We rolled our own using two critical sections and some counters. It suits our needs - we have a very low writer count, writers get precedence over readers, etc. I'm not at liberty to publish ours but can say that it is possible without mutexes and semaphores.
Here is the smallest solution that I could come up with:
http://www.baboonz.org/rwlock.php
And pasted verbatim:
/** A simple Reader/Writer Lock.

    This RWL has no events - we rely solely on spinlocks and sleep() to yield control to other threads.
    I don't know what the exact penalty is for using sleep vs events, but at least when there is no contention,
    we are basically as fast as a critical section. This code is written for Windows, but it should be trivial
    to find the appropriate equivalents on another OS.
**/
class TinyReaderWriterLock
{
public:
    volatile uint32 Main;
    static const uint32 WriteDesireBit = 0x80000000;

    void Noop( uint32 tick )
    {
        if ( ((tick + 1) & 0xfff) == 0 )   // Sleep after 4k cycles. Crude, but usually better than spinning indefinitely.
            Sleep(0);
    }

    TinyReaderWriterLock()  { Main = 0; }
    ~TinyReaderWriterLock() { ASSERT( Main == 0 ); }

    void EnterRead()
    {
        for ( uint32 tick = 0 ;; tick++ )
        {
            uint32 oldVal = Main;
            if ( (oldVal & WriteDesireBit) == 0 )
            {
                if ( InterlockedCompareExchange( (LONG*) &Main, oldVal + 1, oldVal ) == oldVal )
                    break;
            }
            Noop(tick);
        }
    }

    void EnterWrite()
    {
        for ( uint32 tick = 0 ;; tick++ )
        {
            if ( (tick & 0xfff) == 0 )     // Set the write-desire bit every 4k cycles (including cycle 0).
                _InterlockedOr( (LONG*) &Main, WriteDesireBit );

            uint32 oldVal = Main;
            if ( oldVal == WriteDesireBit )
            {
                if ( InterlockedCompareExchange( (LONG*) &Main, -1, WriteDesireBit ) == WriteDesireBit )
                    break;
            }
            Noop(tick);
        }
    }

    void LeaveRead()
    {
        ASSERT( Main != -1 );
        InterlockedDecrement( (LONG*) &Main );
    }

    void LeaveWrite()
    {
        ASSERT( Main == -1 );
        InterlockedIncrement( (LONG*) &Main );
    }
};
I wrote the following code using only critical sections.
class ReadWriteLock {
    volatile LONG writelockcount;
    volatile LONG readlockcount;
    CRITICAL_SECTION cs;
public:
    ReadWriteLock() {
        InitializeCriticalSection(&cs);
        writelockcount = 0;
        readlockcount = 0;
    }
    ~ReadWriteLock() {
        DeleteCriticalSection(&cs);
    }
    void AcquireReaderLock() {
    retry:
        while (writelockcount) {
            Sleep(0);
        }
        EnterCriticalSection(&cs);
        if (!writelockcount) {
            readlockcount++;
        }
        else {
            LeaveCriticalSection(&cs);
            goto retry;
        }
        LeaveCriticalSection(&cs);
    }
    void ReleaseReaderLock() {
        EnterCriticalSection(&cs);
        readlockcount--;
        LeaveCriticalSection(&cs);
    }
    void AcquireWriterLock() {
    retry:
        while (writelockcount || readlockcount) {
            Sleep(0);
        }
        EnterCriticalSection(&cs);
        if (!writelockcount && !readlockcount) {
            writelockcount++;
        }
        else {
            LeaveCriticalSection(&cs);
            goto retry;
        }
        LeaveCriticalSection(&cs);
    }
    void ReleaseWriterLock() {
        EnterCriticalSection(&cs);
        writelockcount--;
        LeaveCriticalSection(&cs);
    }
};
To make it a pure spin-wait, comment out the lines with Sleep(0).
Look at my implementation here:
https://github.com/coolsoftware/LockLib
VRWLock is a C++ class that implements single-writer/multiple-readers logic.
Also look at the test project TestLock.sln.
UPD. Below is the simple code for reader and writer:
LONG gCounter = 0;

// reader
for (;;) // loop
{
    LONG n = InterlockedIncrement(&gCounter);
    // n = value of gCounter after increment
    if (n <= MAX_READERS) break; // writer does not write anything - we can read
    InterlockedDecrement(&gCounter);
}
// read data here
InterlockedDecrement(&gCounter); // release reader

// writer
for (;;) // loop
{
    LONG n = InterlockedCompareExchange(&gCounter, (MAX_READERS+1), 0);
    // n = value of gCounter before the attempt to replace it by MAX_READERS+1 in InterlockedCompareExchange
    // if gCounter was 0 - no readers/writers, and gCounter becomes MAX_READERS+1
    // if gCounter was not 0 - gCounter stays unchanged
    if (n == 0) break;
}
// write data here
InterlockedExchangeAdd(&gCounter, -(MAX_READERS+1)); // release writer
The VRWLock class supports a spin count and thread-specific reference counting, which allows it to release locks held by terminated threads.