Windows creates events on thread shutdown - c++

I am attempting to add handle leak detection to the unit test framework on my code. (Windows 7, x64 VS2010)
I basically call GetProcessHandleCount() before and after each unit test.
This works fine except when threads are created/destroyed as part of the test.
It seems that windows is occasionally creating an 1-3 events on thread shutdown. Running the same test in a loop does not increase the event creation count. (eg running the test 5000 times in a loop only results in 1-3 extra events being created)
I do not create events manually in my own code.
It seems that this is similar to this problem:
boost::thread causing small event handle leak?
but I am doing manual thread creation/shutdown.
I followed this code:
http://blogs.technet.com/b/yongrhee/archive/2011/12/19/how-to-troubleshoot-a-handle-leak.aspx
And got this callstack from WinDbg:
Outstanding handles opened since the previous snapshot:
--------------------------------------
Handle = 0x0000000000000108 - OPEN
Thread ID = 0x00000000000030dc, Process ID = 0x0000000000000c90
0x000000007715173a: ntdll!NtCreateEvent+0x000000000000000a
0x0000000077133f26: ntdll!RtlpCreateCriticalSectionSem+0x0000000000000026
0x0000000077133ee3: ntdll!RtlpWaitOnCriticalSection+0x000000000000014e
0x000000007714e40b: ntdll!RtlEnterCriticalSection+0x00000000000000d1
0x0000000077146ad2: ntdll!LdrShutdownThread+0x0000000000000072
0x0000000077146978: ntdll!RtlExitUserThread+0x0000000000000038
0x0000000076ef59f5: kernel32!BaseThreadInitThunk+0x0000000000000015
0x000000007712c541: ntdll!RtlUserThreadStart+0x000000000000001d
--------------------------------------
As you can see, this is an event created on the thread shutdown.
Is there a better way of doing this handle leak detection in unit tests? My only current options are:
Forget trying to do this handle leak detection
Spin up some dummy tasks to attempt to create these spurious events.
Allow some small tolerance value in leaks and run each test 100's of times (so actual leaks will be a large number)
Get the handle count excluding events (difficult amount of code)
I have also tried switching to using std::thread in VS2013, but it seems that it creates a lot of background threads and handles when used. (makes the count difference much worse)
Here is a self contained example where 99+% of the time (on my computer) an event is created behind the scenes. (handle count is different). Putting the startup/shutdown code in a loop indicates it does not directly leak, but accumulates the occasional events:
#include "stdio.h"
#include <Windows.h>
#include <process.h>
#define THREADCOUNT 3
static HANDLE s_semCommand, s_semRender;
static unsigned __stdcall ExecutiveThread(void *)
{
WaitForSingleObject(s_semCommand, INFINITE);
ReleaseSemaphore(s_semRender, THREADCOUNT - 1, NULL);
return 0;
}
static unsigned __stdcall WorkerThread(void *)
{
WaitForSingleObject(s_semRender, INFINITE);
return 0;
}
int main(int argc, char* argv[])
{
DWORD oldHandleCount = 0;
GetProcessHandleCount(GetCurrentProcess(), &oldHandleCount);
s_semCommand = CreateSemaphoreA(NULL, 0, 0xFFFF, NULL);
s_semRender = CreateSemaphoreA(NULL, 0, 0xFFFF, NULL);
// Spool threads up
HANDLE threads[THREADCOUNT];
for (int i = 0; i < THREADCOUNT; i++)
{
threads[i] = (HANDLE)_beginthreadex(NULL, 4096, (i==0) ? ExecutiveThread : WorkerThread, NULL, 0, NULL);
}
// Signal shutdown - Wait for threads and close semaphores
ReleaseSemaphore(s_semCommand, 1, NULL);
for (int i = 0; i < THREADCOUNT; i++)
{
WaitForSingleObject(threads[i], INFINITE);
CloseHandle(threads[i]);
}
CloseHandle(s_semCommand);
CloseHandle(s_semRender);
DWORD newHandleCount = 0;
GetProcessHandleCount(GetCurrentProcess(), &newHandleCount);
printf("Handle %d -> %d", oldHandleCount, newHandleCount);
return 0;
}

Related

Problem with multi-threading and waiting on events

I have a problem with my code:
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <windows.h>
#include <string.h>
#include <math.h>
HANDLE event;
HANDLE mutex;
int runner = 0;
DWORD WINAPI thread_fun(LPVOID lpParam) {
int* data = (int*)lpParam;
for (int j = 0; j < 4; j++) { //this loop necessary in order to reproduce the issue
if ((data[2] + 1) == data[0]) { // if it is last thread
while (1) {
WaitForSingleObject(mutex, INFINITE);
if (runner == data[0] - 1) { // if all other thread reach event break
ReleaseMutex(mutex);
break;
}
printf("Run:%d\n", runner);
ReleaseMutex(mutex);
Sleep(10);
}
printf("Check Done:<<%d>>\n", data[2]);
runner = 0;
PulseEvent(event); // let all other threads continue
}
else { // if it is not last thread
WaitForSingleObject(mutex, INFINITE);
runner++;
ReleaseMutex(mutex);
printf("Wait:<<%d>>\n", data[2]);
WaitForSingleObject(event, INFINITE); // wait till all other threads reach this stage
printf("Exit:<<%d>>\n", data[2]);
}
}
return 0;
}
int main()
{
event = CreateEvent(NULL, TRUE, FALSE, NULL);
mutex = CreateMutex(NULL, FALSE, NULL);
SetEvent(event);
int data[3] = {2,8}; //0 amount of threads //1 amount of numbers
HANDLE t[10000];
int ThreadData[1000][3];
for (int i = 0; i < data[0]; i++) {
memcpy(ThreadData[i], data, sizeof(int) * 2); // copy amount of threads and amount of numbers to the threads data
ThreadData[i][2] = i; // creat threads id
LPVOID ThreadsData = (LPVOID)(&ThreadData[i]);
t[i] = CreateThread(0, 0, thread_fun, ThreadsData, 0, NULL);
if (t[i] == NULL)return 0;
}
while (1) {
DWORD res = WaitForMultipleObjects(data[0], t, true, 1000);
if (res != WAIT_TIMEOUT) break;
}
for (int i = 0; i < data[0]; i++)CloseHandle(t[i]); // close all threads
CloseHandle(event); // close event
CloseHandle(mutex); //close mutex
printf("Done");
}
The main idea is to wait until all threads except one reach the event and wait there, meanwhile the last thread must release them from waiting.
But the code doesn't work reliably. 1 in 10 times, it ends correctly, and 9 times just gets stuck in while(1). In different tries, printf in while (printf("Run:%d\n", runner);) prints different numbers of runners (0 and 3).
What can be the problem?
As we found out in the comments section, the problem was that although the event was created in the initial state of being non-signalled
event = CreateEvent(NULL, TRUE, FALSE, NULL);
it was being set to the signalled state immediately afterwards:
SetEvent(event);
Due to this, at least on the first iteration of the loop, when j == 0, the first worker thread wouldn't wait for the second worker thread, which caused a race condition.
Also, the following issues with your code are worth mentioning (although these issues were not the reason for your problem):
According to the Microsoft documentation on PulseEvent, that function should not be used, as it can be unreliable and is mainly provided for backward-compatibility. According to the documentation, you should use condition variables instead.
In your function thread_fun, the last thread is locking and releasing the mutex in a loop. This can be bad, because mutexes are not guaranteed to be fair and it is possible that this will cause other threads to never be able to acquire the mutex. Although this possibility is mitigated by you calling Sleep(10); once in every loop iteration, it is still not the ideal solution. A better solution would be to use a condition variable, so that the thread only checks for changes of the variable runner when another thread actually signals a possible change. Such a solution would also be better for performance reasons.

Strange behaviour of GetQueuedCompletionStatus when used from thread pool worker threads

I've been testing to combine the IO Completion Ports with the worker threads from the Thread Pool and stumbled on a behaviour I can't explain. In particular, while the following code:
int data;
for (int i = 0; i < NUM; ++i)
PostQueuedCompletionStatus(cp, 1, NULL, reinterpret_cast<LPOVERLAPPED>(&data));
{
std::thread t([&] ()
{
LPOVERLAPPED aux;
DWORD cmd;
ULONG_PTR key;
for (int i = 0; i < NUM; ++i)
{
if (!GetQueuedCompletionStatus(cp, &cmd, &key, &aux, 0))
break;
++count;
}
});
t.join();
}
works perfectly fine and receives NUM status notifications (with NUM being large number, 100000 or more), the similar code that uses the thread pool work object that reads one status notification per work item and repost the work item after reading it, fails after reading couple of hundred status notifications. Having the following global variables (please don't mind the names):
HANDLE cport;
PTP_POOL pool;
TP_CALLBACK_ENVIRON env;
PTP_WORK work;
std::size_t num_calls;
std::mutex mutex;
std::condition_variable cv;
bool job_done;
and the callback function:
static VOID CALLBACK callback(PTP_CALLBACK_INSTANCE instance_, PVOID pv_, PTP_WORK work_)
{
LPOVERLAPPED aux;
DWORD cmd;
ULONG_PTR key;
if (GetQueuedCompletionStatus(cport, &cmd, &key, &aux, 0))
{
++num_calls;
SubmitThreadpoolWork(work);
}
else
{
std::unique_lock<std::mutex> l(mutex);
std::cout << "No work after " << num_calls << " calls.\n";
job_done = true;
cv.notify_one();
}
}
the following code:
{
job_done = false;
std::unique_lock<std::mutex> l(mutex);
num_calls = 0;
cport = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 1);
pool = CreateThreadpool(nullptr);
InitializeThreadpoolEnvironment(&env);
SetThreadpoolCallbackPool(&env, pool);
work = CreateThreadpoolWork(callback, nullptr, &env);
for (int i = 0; i < NUM; ++i)
PostQueuedCompletionStatus(cport, 1, NULL, reinterpret_cast<LPOVERLAPPED>(&data));
SubmitThreadpoolWork(work);
cv.wait_for(l, std::chrono::milliseconds(10000), [] { return job_done; } );
}
would report "No more work after ..." after 250 or so calls to GetQueuedCompletionStatus although the NUM was set to 1000000. Even more curious is that setting the wait from 0 to, way, 10 milliseconds would increase the number of successful calls to couple of hundred thousand and would occasionally read all 1000000 notifications. Which I don't really understand since all status notifications were posted before submitting the work object for the first time.
Is it possible that there really is a problem with combining completion ports and a thread pool or is there something wrong in my code? Please don't go into why would I want to do this - I was investigating the possibilities and stumbled on this. In my view it should work and can't figure put what's wrong. Thank you.
I've tried running this code, the issue seems to be the NumberOfConcurrentThreads parameters supplied to CreateIoCompletionPort. Passing 1 means that the first pool thread that executes callback becomes associated with io completion port but since thread pool may execute callback using different thread GetQueuedCompletionStatus will fail when this happens. From documentation:
The most important property of an I/O completion port to consider carefully is the concurrency value. The concurrency value of a completion port is specified when it is created with CreateIoCompletionPort via the NumberOfConcurrentThreads parameter. This value limits the number of runnable threads associated with the completion port. When the total number of runnable threads associated with the completion port reaches the concurrency value, the system blocks the execution of any subsequent threads associated with that completion port until the number of runnable threads drops below the concurrency value.
Although any number of threads can call GetQueuedCompletionStatus for a specified I/O completion port, when a specified thread calls GetQueuedCompletionStatus the first time, it becomes associated with the specified I/O completion port until one of three things occurs: The thread exits, specifies a different I/O completion port, or closes the I/O completion port. In other words, a single thread can be associated with, at most, one I/O completion port.
So to use io completion with thread pool you need to set the number of concurrent threads to the size of the thread pool (that you can set using SetThreadpoolThreadMaximum).
::DWORD const threads_count{1};
cport = ::CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, threads_count);
...
pool = ::CreateThreadpool(nullptr);
::SetThreadpoolThreadMaximum(pool, threads_count);

How do I interrupt xcb_wait_for_event?

In a separate thread (std::thread), I have an event loop that waits on xcb_wait_for_event. When the program exits, I'd like to shut things down nicely by interrupting (I have a solution that sets a thread-local variable, and checkpoints in the loop throw an exception), and then joining my event thread into the main thread. The issue is xcb_wait_for_event; I need a way to return from it early, or I need an alternative to the function.
Can anyone suggest a solution? Thanks for your help!
I believe I've come up with a suitable solution. I've replaced xcb_wait_for_event with the following function:
xcb_generic_event_t *WaitForEvent(xcb_connection_t *XConnection)
{
xcb_generic_event_t *Event = nullptr;
int XCBFileDescriptor = xcb_get_file_descriptor(XConnection);
fd_set FileDescriptors;
struct timespec Timeout = { 0, 250000000 }; // Check for interruptions every 0.25 seconds
while (true)
{
interruptible<std::thread>::check();
FD_ZERO(&FileDescriptors);
FD_SET(XCBFileDescriptor, &FileDescriptors);
if (pselect(XCBFileDescriptor + 1, &FileDescriptors, nullptr, nullptr, &Timeout, nullptr) > 0)
{
if ((Event = xcb_poll_for_event(XConnection)))
break;
}
}
interruptible<std::thread>::check();
return Event;
}
Making use of xcb_get_file_descriptor, I can use pselect to wait until there are new events, or until a specified timeout has occurred. This method incurs negligible additional CPU costs, resting at a flat 0.0% (on this i7). The only "downside" is having to wait a maximum of 0.25 seconds to check for interruptions, and I'm sure that limit could be safely lowered.
A neater way would be to do something like this (the code snippet is extracted from some code I am currently working on):
void QXcbEventQueue::sendCloseConnectionEvent() const {
// A hack to close XCB connection. Apparently XCB does not have any APIs for this?
xcb_client_message_event_t event;
memset(&event, 0, sizeof(event));
event.response_type = XCB_CLIENT_MESSAGE;
event.format = 32;
event.sequence = 0;
event.window = m_connection->clientLeader();
event.type = m_connection->atom(QXcbAtom::_QT_CLOSE_CONNECTION);
event.data.data32[0] = 0;
xcb_connection_t *c = m_connection->xcb_connection();
xcb_send_event(c, false, m_connection->clientLeader(),
XCB_EVENT_MASK_NO_EVENT, reinterpret_cast<const char *>(&event));
xcb_flush(c); }
For _QT_CLOSE_CONNECTION use your own atom to signal an exit and in my case clientLeader() is some invisible window that is always present on my X11 connection. If you don't have any invisible windows that could be reused for this purpose, create one :)
With this you can terminate the thread with xcb_wait_for_event when you see this special event arriving.

How to execute c++ function in a parallel thread under WinRT?

I have a C++ code which uses _beginthreadex() Windows method to execute a function in a thread. Now I want to port it to WinRT component to include it in windows phone app. But windows phone does't support _beginthreadex(). How do I do it?
my function is:
bool doesWordAppearInDictionarry(char* word);
I have 4 cores on my computer so I want to execute 4 copies of this function in parallel (searching for 4 different words in the dictionary simultaneously).
I read (here) and (here) about Windows::System::Threading::WorkItemHandler, ThreadPool's and IAsyncAction
But the supplied examples activate managed code and do not call a native function.
What I am looking for is a clean solution (minimal amount of lines of code) which will replace my current Windows Desktop code:
for (int i=0; i<4; i++){
tInfo[i].id = (HANDLE)_beginthreadex( NULL, stack_size, doesWordAppearInDictionarry,tInfo, 0, word[i]);
}
for (int i=0; i<4; i++){
WaitForSingleObject( tInfo[i].id, INFINITE );
CloseHandle(tInfo[i].id);
}
Here is a short solution: Few lines of code that emulate _beginthreadex() using WinRT api.
using namespace Platform;
using namespace Windows::System::Threading;
_Use_decl_annotations_ HANDLE WINAPI _beginthreadex(LPSECURITY_ATTRIBUTES unusedThreadAttributes, SIZE_T unusedStackSize, LPTHREAD_START_ROUTINE lpStartAddress, LPVOID lpParameter, DWORD dwCreationFlags, LPDWORD unusedThreadId){
// Create a handle for the new thread.
HANDLE threadHandle = CreateEventEx(nullptr, nullptr, CREATE_EVENT_MANUAL_RESET, EVENT_ALL_ACCESS);
if (!threadHandle)
return nullptr;
try{
auto workItemHandler = ref new WorkItemHandler([=](IAsyncAction^){
lpStartAddress(lpParameter);
SetEvent(threadHandle); // Signal that the thread has completed (assuming this handle was not destroyed by the caller due to time-out).
}, CallbackContext::Any);
ThreadPool::RunAsync(workItemHandler, WorkItemPriority::High, WorkItemOptions::TimeSliced);
return threadHandle; // Return the handle to the caller (he can wait for this handle until thread terminates).
}
catch (...){
CloseHandle(threadHandle); // Clean up if thread creation fails.
return nullptr;
}
}
This solution is based on stack overflow answer (here) which discusses (this) blog. The blog includes a full emulation of threads including the CreateThread() Win32 api, accessing threads while they are running and sharing memory between threads. My solution is a simplified case of the full emulator.
P.s. The caller method must waid for the threads using WaitForSingleObjectEx() method and not WaitForSingleObject()

using libev with multiple threads

I want to use libev with multiple threads for the handling of tcp connections. What I want to is:
The main thread listen on incoming connections, accept the
connections and forward the connection to a workerthread.
I have a pool of workerthreads. The number of threads depends on the
number of cpu's. Each worker-thread has an event loop. The worker-thread listen if I can write on the tcp socket or if
somethings available for reading.
I looked into the documentation of libev and I known this can be done with libev, but I can't find any example how I have to do that.
Does someone has an example?
I think that I have to use the ev_loop_new() api, for the worker-threads and for the main thread I have to use the ev_default_loop() ?
Regards
The following code can be extended to multiple threads
//This program is demo for using pthreads with libev.
//Try using Timeout values as large as 1.0 and as small as 0.000001
//and notice the difference in the output
//(c) 2009 debuguo
//(c) 2013 enthusiasticgeek for stack overflow
//Free to distribute and improve the code. Leave credits intact
#include <ev.h>
#include <stdio.h> // for puts
#include <stdlib.h>
#include <pthread.h>
pthread_mutex_t lock;
double timeout = 0.00001;
ev_timer timeout_watcher;
int timeout_count = 0;
ev_async async_watcher;
int async_count = 0;
struct ev_loop* loop2;
void* loop2thread(void* args)
{
printf("Inside loop 2"); // Here one could initiate another timeout watcher
ev_loop(loop2, 0); // similar to the main loop - call it say timeout_cb1
return NULL;
}
static void async_cb (EV_P_ ev_async *w, int revents)
{
//puts ("async ready");
pthread_mutex_lock(&lock); //Don't forget locking
++async_count;
printf("async = %d, timeout = %d \n", async_count, timeout_count);
pthread_mutex_unlock(&lock); //Don't forget unlocking
}
static void timeout_cb (EV_P_ ev_timer *w, int revents) // Timer callback function
{
//puts ("timeout");
if (ev_async_pending(&async_watcher)==false) { //the event has not yet been processed (or even noted) by the event loop? (i.e. Is it serviced? If yes then proceed to)
ev_async_send(loop2, &async_watcher); //Sends/signals/activates the given ev_async watcher, that is, feeds an EV_ASYNC event on the watcher into the event loop.
}
pthread_mutex_lock(&lock); //Don't forget locking
++timeout_count;
pthread_mutex_unlock(&lock); //Don't forget unlocking
w->repeat = timeout;
ev_timer_again(loop, &timeout_watcher); //Start the timer again.
}
int main (int argc, char** argv)
{
if (argc < 2) {
puts("Timeout value missing.\n./demo <timeout>");
return -1;
}
timeout = atof(argv[1]);
struct ev_loop *loop = EV_DEFAULT; //or ev_default_loop (0);
//Initialize pthread
pthread_mutex_init(&lock, NULL);
pthread_t thread;
// This loop sits in the pthread
loop2 = ev_loop_new(0);
//This block is specifically used pre-empting thread (i.e. temporary interruption and suspension of a task, without asking for its cooperation, with the intention to resume that task later.)
//This takes into account thread safety
ev_async_init(&async_watcher, async_cb);
ev_async_start(loop2, &async_watcher);
pthread_create(&thread, NULL, loop2thread, NULL);
ev_timer_init (&timeout_watcher, timeout_cb, timeout, 0.); // Non repeating timer. The timer starts repeating in the timeout callback function
ev_timer_start (loop, &timeout_watcher);
// now wait for events to arrive
ev_loop(loop, 0);
//Wait on threads for execution
pthread_join(thread, NULL);
pthread_mutex_destroy(&lock);
return 0;
}
Using libev within different threads at the same time is fine as long as each of them runs its own loop[1].
The c++ wrapper in libev (ev++.h) always uses the default loop instead of letting you specify which one you want to use. You should use the C header instead (ev.h) which allows you to specify which loop to use (e.g. ev_io_start takes a pointer to an ev_loop but the ev::io::start doesn't).
You can signal another thread's ev_loop safely through ev_async.
[1]http://doc.dvgu.ru/devel/ev.html#threads_and_coroutines