I'm trying to implement a single-thread-writing, multiple-thread-reading mechanism for shared resource management using the Interlocked functions in C++ on Windows.
Q1. The resulting code seems to work as I intend, but I'd like to ask for your wisdom in case I am missing something.
Q2. If there is a real-life or actively maintained open-source example I can refer to, it would be really appreciated.
The following are the objectives I've taken into account.
Writing can only be executed by a single thread, and reading must be blocked while writing, to avoid breaking invariants.
Reading can be executed by multiple threads.
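For comparison, here is a minimal sketch of the same semantics built on the stock slim reader/writer lock (SRWLOCK) that Windows already provides; this is effectively what I am reimplementing:
#include <Windows.h>

SRWLOCK g_srw = SRWLOCK_INIT; // statically initialized, no cleanup needed
char g_shared = 0;

void ReadShared(char* pOut)
{
    AcquireSRWLockShared(&g_srw);    // many readers may hold this at once
    *pOut = g_shared;
    ReleaseSRWLockShared(&g_srw);
}

void WriteShared(char c)
{
    AcquireSRWLockExclusive(&g_srw); // blocks until all readers are out
    g_shared = c;
    ReleaseSRWLockExclusive(&g_srw);
}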
#include <iostream>
#include <Windows.h>
char g_c = 0i8;
char g_pReadChar[3]{};
void* g_pThreads[4]{};
unsigned long g_pThreadIDs[4]{};
long long g_llLock = 0LL; // 0 : Not locked / 1 : Locked (Writing) / 2 : Locked (Reading)
long long g_llEntryCount = 0LL; // Thread entry count
__forceinline void Read()
{
// <- if a thread execution is here (case 0)
InterlockedIncrement64(&g_llEntryCount);
// <- if a thread execution is here (case 1)
for (unsigned long long i = 0ULL; i < 100000ULL; ++i)
{
if (InterlockedCompareExchange64(&g_llLock, 2LL, 0LL) == 1LL)
{
continue;
}
// <- if a thread execution is here (case 2)
// --------------------------------------------------
// Read data
std::cout << g_c;
// --------------------------------------------------
InterlockedExchange64(&g_llLock, 1LL); // Lock is needed in order to block case 0
if (InterlockedDecrement64(&g_llEntryCount) == 0LL)
{
InterlockedExchange64(&g_llLock, 0LL);
}
else
{
InterlockedExchange64(&g_llLock, 2LL);
}
return;
}
InterlockedDecrement64(&g_llEntryCount);
}
__forceinline unsigned long __stdcall ReadLoop(void* _pParam)
{
while (true)
{
Read();
Sleep(1);
}
}
__forceinline void Write(const unsigned long long _ullKey)
{
for (unsigned long long i = 0ULL; i < 100000ULL; ++i)
{
if (InterlockedCompareExchange64(&g_llLock, 1LL, 0LL) != 0LL)
{
continue;
}
// --------------------------------------------------
// Write data
if (_ullKey == 0ULL)
{
g_c = 'A';
}
else if (_ullKey == 1ULL)
{
g_c = 'B';
}
else
{
g_c = 'C';
}
// --------------------------------------------------
InterlockedExchange64(&g_llLock, 0LL);
return;
}
}
__forceinline unsigned long __stdcall WriteLoop(void* _pParam)
{
unsigned long long ullCount = 0ULL;
unsigned long long ullKey = 0ULL;
while (true)
{
if (ullCount > 10000ULL)
{
++ullKey;
if (ullKey >= 3ULL)
{
ullKey = 0ULL;
}
ullCount = 0ULL;
}
Write(ullKey);
++ullCount;
}
}
int main()
{
g_pThreads[0] = CreateThread(nullptr, 0ULL, ReadLoop, nullptr, 0UL, &g_pThreadIDs[0]);
g_pThreads[1] = CreateThread(nullptr, 0ULL, ReadLoop, nullptr, 0UL, &g_pThreadIDs[1]);
g_pThreads[2] = CreateThread(nullptr, 0ULL, ReadLoop, nullptr, 0UL, &g_pThreadIDs[2]);
g_pThreads[3] = CreateThread(nullptr, 0ULL, WriteLoop, nullptr, 0UL, &g_pThreadIDs[3]);
Sleep(100000);
return 0;
}
I have a task to compute Pi with the following formula (reconstructed from the code below; i ranges from 0 to N, N = 10^8):
pi ≈ (1/N) · Σ 4 / (1 + x_i^2), where x_i = (i + 0.5) / N
The computation should be completed in multiple threads with the following requirement: each thread receives only a small, fixed amount of computation at a time (in my case, 40 sum members), and there should be a "task pool" which gives a new set of computations to a thread when it reports completion of the previous set. Before a thread receives a new task, it should wait. All of this should be done with WinAPI.
My solution is this class:
#include "ThreadManager.h"
#include <string>
HANDLE ThreadManager::mutex = (CreateMutexA(nullptr, true, "m"));
ThreadManager::ThreadManager(size_t threadCount)
{
threads.reserve(threadCount);
for (int i = 0; i < threadCount; i++)
{
threadInfo.push_back(new ThreadStruct(i * OP_COUNT));
HANDLE event = CreateEventA(nullptr, false, true, std::to_string(i).c_str());
if (event)
{
threadEvents.push_back(event);
DuplicateHandle(GetCurrentProcess(), event, GetCurrentProcess(),
&(threadInfo[i]->threadEvent), 0, false, DUPLICATE_SAME_ACCESS);
}
else std::cout << "Unknown error: " << GetLastError() << std::endl;
HANDLE thread = CreateThread(nullptr, 0,
reinterpret_cast<LPTHREAD_START_ROUTINE>(&ThreadManager::threadFunc),
threadInfo[i],
CREATE_SUSPENDED, nullptr);
if (thread) threads.push_back(thread);
else std::cout << "Unknown error: " << GetLastError() << std::endl;
}
}
double ThreadManager::run()
{
size_t operations_done = threads.size() * OP_COUNT;
for (HANDLE t : threads) ResumeThread(t);
DWORD index;
Sleep(10);
while (operations_done < ThreadManager::N)
{
ReleaseMutex(ThreadManager::mutex);
index = WaitForMultipleObjects(this->threadEvents.size(), this->threadEvents.data(), false, 10000);
WaitForSingleObject(ThreadManager::mutex, 1000);
threadInfo[index] -> operationIndex = operations_done + OP_COUNT;
SetEvent(threadEvents[index]);
//std::cout << "Operations completed: " << operations_done << "/1000" << std::endl;
operations_done += OP_COUNT;
}
long double res_pi = 0;
for (auto&& ts: this->threadInfo)
{
res_pi += ts->pi;
ts->operationIndex = N;
}
res_pi /= N;
WaitForMultipleObjects(this->threads.size(), this->threads.data(), true, 10000);
std::cout.precision(10);
std::cout << "Pi value for " << threads.size() << " threads: " << res_pi;
threads.clear();
return 0;
}
ThreadManager::~ThreadManager()
{
if (!threads.empty())
for (HANDLE t: threads)
{
TerminateThread(t, -1);
CloseHandle(t);
}
std::destroy(threadInfo.begin(), threadInfo.end());
}
long double ThreadManager::calc(size_t startIndex)
{
long double xi = 0;
long double pi = 0;
for (size_t i = startIndex; i < startIndex + OP_COUNT; i++)
{
const long double ld_i = i;
const long double half = 0.5f;
xi = (ld_i + half) * (1.0 / N);
pi += ((4.0 / (1.0 + xi * xi)));
}
return pi;
}
DWORD WINAPI ThreadManager::threadFunc(ThreadStruct *ts)
{
while (ts->operationIndex < N)
{
WaitForSingleObject(ts->threadEvent, 1000);
ts->pi += calc(ts->operationIndex);
WaitForSingleObject(ThreadManager::mutex, 1000);
SetEvent(ts->threadEvent);
ReleaseMutex(ThreadManager::mutex);
}
return 0;
}
ThreadStruct::ThreadStruct(size_t opIndex)
{
this -> pi = 0;
this -> operationIndex = opIndex;
}
My idea was that there would be an auto-reset event for each thread, set to signaled when the thread finishes its computation. The main thread waits for one of the thread events to become signaled, and after modifying some values in the shared ThreadStruct (to let the thread start another portion of computations) it sets that same event to signaled again, which is received by the exact same thread, and the process repeats. But this doesn't work for even one thread: as a result I see values which are pretty random and not close to Pi (like 0.0001776328265).
Though my GDB debugger was working poorly (not displaying some variables and sometimes even crashing), I noticed that there were too many computations happening (I scaled N down to 1000; therefore, I should have seen threads printing "computing" 1000/40 = 25 times, but it actually happened hundreds of times).
Then I tried adding a mutex so threads wait until the main thread is not busy before signaling the event. That made the computation much slower, and still inaccurate and random (example: 50.26492171 with 16 threads).
What can be the problem? Or, if this approach is completely wrong, how do I organize the multithreaded calculation? Was creating a class a bad idea?
If you want to reproduce the problem, here is the header file content (I am using C++20, MinGW 6.0):
#ifndef MULTITHREADPI_THREADMANAGER_H
#define MULTITHREADPI_THREADMANAGER_H
#include <iostream>
#include <vector>
#include <list>
#include <windows.h>
#include <memory>
struct ThreadStruct
{
size_t operationIndex;
long double pi;
HANDLE threadEvent = nullptr;
explicit ThreadStruct(size_t opIndex);
};
class ThreadManager
{
public:
explicit ThreadManager(size_t threadCount);
double run();
~ThreadManager();
private:
std::vector<ThreadStruct*> threadInfo;
std::vector<HANDLE> threads;
std::vector<HANDLE> threadEvents;
static HANDLE mutex;
static long double calc(size_t startIndex);
static const int OP_COUNT = 40;
static const int N = 100000000;
static DWORD WINAPI threadFunc(ThreadStruct* ts);
};
#endif //MULTITHREADPI_THREADMANAGER_H
To execute the code, just construct a ThreadManager with the desired number of threads as the argument and call run() on it.
Even with everything below changed, it doesn't give consistent values close to Pi, so there must be more to fix. I think it has to do with the events. If I understand it correctly, there are two different things the mutex protects, and the event is also used for two different things, so both change their meaning during execution. This makes it very hard to think through.
1. Timeouts
WaitForMultipleObjects may run into a timeout. In that case it returns WAIT_TIMEOUT, which is defined as 0x102, i.e. 258, and you then access the threadInfo vector with that value without bounds checking. You can use at(n) for a bounds-checked version of [n].
You can easily run into the 10-second timeout when debugging or when setting OP_COUNT to a high number, so you may want to use INFINITE instead (see the sketch after the list below).
This leads to all sorts of misbehavior:
the thread's information (operationIndex) is updated while the thread might still be working on it,
operations_done is updated although those operations may not be done,
the mutex is probably over-released.
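Here is a sketch of a defensive wait (untested against the rest of the class, names taken from the question):
// Wait without a timeout and validate the result before using it as an
// index. WAIT_OBJECT_0 is 0, so any result below size() is a valid index.
DWORD wait = WaitForMultipleObjects(
    static_cast<DWORD>(threadEvents.size()), threadEvents.data(),
    FALSE, INFINITE);
if (wait < WAIT_OBJECT_0 + threadEvents.size())
{
    size_t index = wait - WAIT_OBJECT_0;
    threadInfo.at(index)->operationIndex = operations_done; // bounds-checked
}
else
{
    std::cout << "Wait failed: " << GetLastError() << std::endl;
}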
2. Limit the number of threads
The thread manager should also check the number of threads, since you can't pass a count higher than MAXIMUM_WAIT_OBJECTS (64); otherwise WaitForMultipleObjects() won't work reliably.
3. Off by 1 error
Should be
size_t operations_done = (threads.size()-1) * OP_COUNT;
or
threadInfo[index] -> operationIndex = operations_done; // was + OP_COUNT
otherwise it'll skip one batch
4. Ending the threads
Ending the threads relies on the timeouts.
When you replace all timeouts by INFINITE, you'll notice that your threads never end. You need another ReleaseMutex(mutex); before
res_pi /= N;
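The following is a different approach that avoids per-thread events entirely: start one thread per processor, let each thread sum its own contiguous range, and merge the partial sums with a single interlocked compare-exchange at the end.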
struct CommonData;
struct ThreadData
{
CommonData* pData;
ULONG i, k;
ThreadData(CommonData* pData, ULONG i, ULONG k) : pData(pData), i(i), k(k) {}
static ULONG CALLBACK Work(void* p);
};
struct CommonData
{
HANDLE hEvent = 0;
LONG dwActiveThreadCount = 1;
ULONG N;
union {
double res = 0;
__int64 i64;
};
CommonData(ULONG N) : N(N) {}
~CommonData()
{
if (HANDLE h = hEvent)
{
CloseHandle(h);
}
}
void DecThread()
{
if (!InterlockedDecrement(&dwActiveThreadCount))
{
if (!SetEvent(hEvent)) __debugbreak();
}
}
BOOL AddThread(ULONG i, ULONG k)
{
InterlockedIncrementNoFence(&dwActiveThreadCount);
if (ThreadData* ptd = new ThreadData(this, i, k))
{
if (HANDLE hThread = CreateThread(0, 0, ThreadData::Work, ptd, 0, 0))
{
CloseHandle(hThread);
return TRUE;
}
delete ptd;
}
DecThread();
return FALSE;
}
BOOL Init()
{
return 0 != (hEvent = CreateEvent(0, 0, 0, 0));
}
void Wait()
{
DecThread();
if (WaitForSingleObject(hEvent, INFINITE) != WAIT_OBJECT_0) __debugbreak();
}
};
ULONG CALLBACK ThreadData::Work(void* p)
{
CommonData* pData = reinterpret_cast<ThreadData*>(p)->pData;
ULONG i = reinterpret_cast<ThreadData*>(p)->i;
ULONG k = reinterpret_cast<ThreadData*>(p)->k;
delete p;
ULONG N = pData->N;
double pi = 0;
do
{
double xi = (i++ + 0.5) / N;
pi += 4 / (1 + xi * xi);
} while (--k);
// Publish this thread's partial sum: the shared accumulator is a double,
// so its bits are reinterpreted as __int64 and updated with a
// compare-exchange loop.
union {
    double d;
    __int64 i64;
};
i64 = pData->i64; // snapshot the accumulator's current bits
for (;;)
{
    union {
        double d_compare;
        __int64 i64_compare;
    };
    i64_compare = i64; // the value we expect the accumulator to still hold
    d += pi;           // proposed new value: observed sum + our partial sum
    if (i64_compare == (i64 = InterlockedCompareExchange64(
        &pData->i64, i64, i64_compare)))
    {
        break; // no other thread raced us; the addition is published
    }
    // otherwise i64 now holds the freshly observed bits; retry
}
pData->DecThread();
return 0;
}
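As an aside, since C++20 std::atomic<double> supports fetch_add, which expresses the same accumulation without the union tricks; a minimal sketch:
#include <atomic>

std::atomic<double> g_res{0.0};

// Same effect as the compare-exchange loop above: atomically add a
// thread's partial sum to the shared accumulator.
void Accumulate(double pi)
{
    g_res.fetch_add(pi, std::memory_order_relaxed);
}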
double calc_pi(ULONG N)
{
SYSTEM_INFO si;
GetSystemInfo(&si);
if (si.dwNumberOfProcessors)
{
CommonData cd(N);
if (cd.Init())
{
ULONG k = (N + si.dwNumberOfProcessors - 1) / si.dwNumberOfProcessors, i = 0;
do
{
if (!cd.AddThread(i, k))
{
break;
}
} while (i += k, --si.dwNumberOfProcessors);
cd.Wait();
if (!si.dwNumberOfProcessors)
{
return cd.res/ N;
}
}
}
return 0;
}
When I call calc_pi(100000000) on an 8-core machine, I get 3.1415926535898153.
Hi, I am having trouble with expected false sharing not occurring in my test code.
I am trying to create a process-unique thread manager which manages multiple threads homogeneously.
The thread manager class is NOT a thread pool: it operates by assigning task functions to a designated thread and can retrieve the return value of each task function, rather than just pushing tasks onto a queue without consideration. Also, the thread manager does not care about the size (computation amount) of a task.
The thread manager will be used by the main thread for handling the computation parts, and it will be used quite frequently. The reason is that my process will follow a game-loop design pattern, and I want the game loop to run at over 120 FPS, which means one game loop iteration must finish in less than 8.3 milliseconds. The main thread may perform this task assignment a number of times within one game loop, so reducing/eliminating context-switching cost was my primary concern. My conclusion was to have the thread manager's threads spin-lock.
In short, the game loop will iterate the following two steps a number of times.
The main loop assigns tasks to the thread manager.
It waits for the results of those tasks from the thread manager.
Below is my test code.
ThreadManager.h
#include <Windows.h> // CreateThread, Interlocked*, Sleep, GetExitCodeThread (assumed missing from the excerpt)
namespace YSLibrary
{
class CThreadManager final
{
private:
static long long s_llLock;
static unsigned long long s_ullThreadCount;
static void** s_ppThreads;
static unsigned long* s_pThreadIDs;
static long long* s_pThreadQuits;
static long long* s_pTaskLocks;
static unsigned long long (**s_ppTasks)();
static unsigned long long* s_pTaskResults;
CThreadManager(){}
~CThreadManager(){}
__forceinline static void Lock()
{
while (true)
{
if (InterlockedCompareExchange64(&s_llLock, 1LL, 0LL) == 0LL)
{
return;
}
Sleep(0UL);
}
}
__forceinline static void Unlock()
{
InterlockedExchange64(&s_llLock, 0LL);
}
static unsigned long __stdcall Thread(void* const _pParameter)
{
const unsigned long long ullThreadIndex = reinterpret_cast<const unsigned long long>(_pParameter);
while (true)
{
if (InterlockedCompareExchange64(&s_pThreadQuits[ullThreadIndex], 0LL, 1LL) == 1LL)
{
return 1UL;
}
if (InterlockedCompareExchange64(&s_pTaskLocks[ullThreadIndex], 1LL, 0LL) == 0LL)
{
if (s_ppTasks[ullThreadIndex] != nullptr)
{
s_pTaskResults[ullThreadIndex] = s_ppTasks[ullThreadIndex]();
s_ppTasks[ullThreadIndex] = nullptr;
}
InterlockedExchange64(&s_pTaskLocks[ullThreadIndex], 0LL);
}
}
}
public:
enum class EResult : unsigned long long
{
None = 0ULL,
Success = 1ULL,
Fail_ArgumentNull = 2ULL,
Fail_ArgumentInvalid = 3ULL,
Fail_Locked = 4ULL,
Fail_ThreadCountNotZero = 5ULL,
Fail_ThreadCountZero = 6ULL,
Fail_ThreadsNotNull = 7ULL,
Fail_ThreadsNull = 8ULL,
Fail_ThreadIDsNotNull = 9ULL,
Fail_ThreadIDsNull = 10ULL,
Fail_ThreadQuitsNotNull = 11ULL,
Fail_ThreadQuitsNull = 12ULL,
Fail_TaskLocksNotNull = 13ULL,
Fail_TaskLocksNull = 14ULL,
Fail_TasksNotNull = 15ULL,
Fail_TasksNull = 16ULL,
Fail_TaskResultsNotNull = 17ULL,
Fail_TaskResultsNull = 18ULL,
Fail_CreateThread = 19ULL
};
__forceinline static EResult Initialize(const unsigned long long _ullThreadCount)
{
if (_ullThreadCount == 0ULL)
{
return EResult::Fail_ArgumentNull;
}
Lock();
if (s_ullThreadCount != 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountNotZero;
}
if (s_ppThreads != nullptr)
{
Unlock();
return EResult::Fail_ThreadsNotNull;
}
if (s_pThreadIDs != nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNotNull;
}
if (s_pThreadQuits != nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNotNull;
}
if (s_pTaskLocks != nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNotNull;
}
if (s_ppTasks != nullptr)
{
Unlock();
return EResult::Fail_TasksNotNull;
}
if (s_pTaskResults != nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNotNull;
}
s_ullThreadCount = _ullThreadCount;
s_ppThreads = new void*[s_ullThreadCount]{};
s_pThreadIDs = new unsigned long[s_ullThreadCount]{};
s_pThreadQuits = new long long[s_ullThreadCount]{};
s_pTaskLocks = new long long[s_ullThreadCount]{};
s_ppTasks = new (unsigned long long (*[s_ullThreadCount])()){};
s_pTaskResults = new unsigned long long[s_ullThreadCount]{};
for (unsigned long long i = 0ULL; i < s_ullThreadCount; ++i)
{
s_ppThreads[i] = CreateThread(nullptr, 0ULL, &Thread, reinterpret_cast<void*>(i), 0UL, &s_pThreadIDs[i]);
if (s_ppThreads[i] == nullptr)
{
// Rollback
for (unsigned long long j = 0ULL; j < i; ++j)
{
InterlockedExchange64(&s_pThreadQuits[j], 1LL); // index j, not i: signal each already-created thread
}
unsigned long ulExitCode = 0UL;
for (unsigned long long j = 0ULL; j < i; ++j)
{
while (true)
{
GetExitCodeThread(s_ppThreads[j], &ulExitCode);
if (ulExitCode != static_cast<unsigned long>(STILL_ACTIVE))
{
CloseHandle(s_ppThreads[j]);
s_ppThreads[j] = nullptr;
break;
}
Sleep(0UL);
}
}
delete[] s_pTaskResults;
s_pTaskResults = nullptr;
delete[] s_ppTasks;
s_ppTasks = nullptr;
delete[] s_pTaskLocks;
s_pTaskLocks = nullptr;
delete[] s_pThreadQuits;
s_pThreadQuits = nullptr;
delete[] s_pThreadIDs;
s_pThreadIDs = nullptr;
delete[] s_ppThreads;
s_ppThreads = nullptr;
s_ullThreadCount = 0ULL;
Unlock();
return EResult::Fail_CreateThread;
}
}
Unlock();
return EResult::Success;
}
__forceinline static EResult Terminate()
{
Lock();
if (s_ullThreadCount == 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountZero;
}
if (s_ppThreads == nullptr)
{
Unlock();
return EResult::Fail_ThreadsNull;
}
if (s_pThreadIDs == nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNull;
}
if (s_pThreadQuits == nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNull;
}
if (s_pTaskLocks == nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNull;
}
if (s_ppTasks == nullptr)
{
Unlock();
return EResult::Fail_TasksNull;
}
if (s_pTaskResults == nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNull;
}
for (unsigned long long i = 0ULL; i < s_ullThreadCount; ++i)
{
InterlockedExchange64(&s_pThreadQuits[i], 1LL);
}
unsigned long ulExitCode = 0UL;
for (unsigned long long i = 0ULL; i < s_ullThreadCount; ++i)
{
while (true)
{
GetExitCodeThread(s_ppThreads[i], &ulExitCode);
if (ulExitCode != static_cast<unsigned long>(STILL_ACTIVE))
{
CloseHandle(s_ppThreads[i]);
s_ppThreads[i] = nullptr;
break;
}
Sleep(0UL);
}
}
delete[] s_pTaskResults;
s_pTaskResults = nullptr;
delete[] s_ppTasks;
s_ppTasks = nullptr;
delete[] s_pTaskLocks;
s_pTaskLocks = nullptr;
delete[] s_pThreadQuits;
s_pThreadQuits = nullptr;
delete[] s_pThreadIDs;
s_pThreadIDs = nullptr;
delete[] s_ppThreads;
s_ppThreads = nullptr;
s_ullThreadCount = 0ULL;
Unlock();
return EResult::Success;
}
__forceinline static EResult Execute(const unsigned long long _ullThreadIndex, unsigned long long (*_pFunction)())
{
if (_pFunction == nullptr)
{
return EResult::Fail_ArgumentNull;
}
Lock();
if (s_ullThreadCount == 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountZero;
}
if (s_ppThreads == nullptr)
{
Unlock();
return EResult::Fail_ThreadsNull;
}
if (s_pThreadIDs == nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNull;
}
if (s_pThreadQuits == nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNull;
}
if (s_pTaskLocks == nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNull;
}
if (s_ppTasks == nullptr)
{
Unlock();
return EResult::Fail_TasksNull;
}
if (s_pTaskResults == nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNull;
}
if (_ullThreadIndex >= s_ullThreadCount)
{
Unlock();
return EResult::Fail_ArgumentInvalid;
}
while (true)
{
if (InterlockedCompareExchange64(&s_pTaskLocks[_ullThreadIndex], 1LL, 0LL) == 0LL)
{
s_ppTasks[_ullThreadIndex] = _pFunction;
InterlockedExchange64(&s_pTaskLocks[_ullThreadIndex], 0LL);
Unlock();
return EResult::Success;
}
Sleep(0UL);
}
}
__forceinline static EResult WaitForResult(const unsigned long long _ullThreadIndex, unsigned long long* const _pFunctionResult)
{
if (_pFunctionResult == nullptr)
{
return EResult::Fail_ArgumentNull;
}
Lock();
if (s_ullThreadCount == 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountZero;
}
if (s_ppThreads == nullptr)
{
Unlock();
return EResult::Fail_ThreadsNull;
}
if (s_pThreadIDs == nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNull;
}
if (s_pThreadQuits == nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNull;
}
if (s_pTaskLocks == nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNull;
}
if (s_ppTasks == nullptr)
{
Unlock();
return EResult::Fail_TasksNull;
}
if (s_pTaskResults == nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNull;
}
if (_ullThreadIndex >= s_ullThreadCount)
{
Unlock();
return EResult::Fail_ArgumentInvalid;
}
while (true)
{
if (InterlockedCompareExchange64(&s_pTaskLocks[_ullThreadIndex], 1LL, 0LL) == 0LL)
{
if (s_ppTasks[_ullThreadIndex] == nullptr)
{
(*_pFunctionResult) = s_pTaskResults[_ullThreadIndex];
InterlockedExchange64(&s_pTaskLocks[_ullThreadIndex], 0LL);
Unlock();
return EResult::Success;
}
InterlockedExchange64(&s_pTaskLocks[_ullThreadIndex], 0LL);
}
Sleep(0UL);
}
}
};
}
main.cpp
#include <iostream>
#include <ctime> // time() for srand()
#include <Windows.h>
#include "ThreadManager.h"
long long YSLibrary::CThreadManager::s_llLock = 0LL;
unsigned long long YSLibrary::CThreadManager::s_ullThreadCount = 0ULL;
void** YSLibrary::CThreadManager::s_ppThreads = nullptr;
unsigned long* YSLibrary::CThreadManager::s_pThreadIDs = nullptr;
long long* YSLibrary::CThreadManager::s_pThreadQuits = nullptr;
long long* YSLibrary::CThreadManager::s_pTaskLocks = nullptr;
unsigned long long (**YSLibrary::CThreadManager::s_ppTasks)() = nullptr;
unsigned long long* YSLibrary::CThreadManager::s_pTaskResults = nullptr;
unsigned long long g_pResults[10]{};
struct SData
{
unsigned long long ullData[8];
};
SData g_stData{};
SData g_stData0{};
SData g_stData1{};
SData g_stData2{};
SData g_stData3{};
SData g_stData4{};
SData g_stData5{};
SData g_stData6{};
unsigned long long Function()
{
for (unsigned long long i = 0ULL; i < 70000000ULL; ++i)
{
g_stData.ullData[0] = static_cast<unsigned long long>(rand());
}
return 1ULL;
}
unsigned long long Function0()
{
for (unsigned long long i = 0ULL; i < 10000000ULL; ++i)
{
g_stData0.ullData[0] = static_cast<unsigned long long>(rand());
}
return 1ULL;
}
unsigned long long Function1()
{
for (unsigned long long i = 0ULL; i < 10000000ULL; ++i)
{
g_stData1.ullData[0] = static_cast<unsigned long long>(rand());
}
return 1ULL;
}
unsigned long long Function2()
{
for (unsigned long long i = 0ULL; i < 10000000ULL; ++i)
{
g_stData2.ullData[0] = static_cast<unsigned long long>(rand());
}
return 1ULL;
}
unsigned long long Function3()
{
for (unsigned long long i = 0ULL; i < 10000000ULL; ++i)
{
g_stData3.ullData[0] = static_cast<unsigned long long>(rand());
}
return 1ULL;
}
unsigned long long Function4()
{
for (unsigned long long i = 0ULL; i < 10000000ULL; ++i)
{
g_stData4.ullData[0] = static_cast<unsigned long long>(rand());
}
return 1ULL;
}
unsigned long long Function5()
{
for (unsigned long long i = 0ULL; i < 10000000ULL; ++i)
{
g_stData5.ullData[0] = static_cast<unsigned long long>(rand());
}
return 1ULL;
}
unsigned long long Function6()
{
for (unsigned long long i = 0ULL; i < 10000000ULL; ++i)
{
g_stData6.ullData[0] = static_cast<unsigned long long>(rand());
}
return 1ULL;
}
int main()
{
unsigned long long ullStartTick = 0ULL;
unsigned long long ullEndTick = 0ULL;
srand((unsigned int)time(nullptr));
ullStartTick = GetTickCount64();
Function();
ullEndTick = GetTickCount64();
std::wcout << L"[Main]" << std::endl;
std::wcout << ullEndTick - ullStartTick << std::endl;
YSLibrary::CThreadManager::EResult eResult = YSLibrary::CThreadManager::EResult::None;
eResult = YSLibrary::CThreadManager::Initialize(7ULL);
ullStartTick = GetTickCount64();
eResult = YSLibrary::CThreadManager::Execute(0ULL, &Function0);
eResult = YSLibrary::CThreadManager::Execute(1ULL, &Function1);
eResult = YSLibrary::CThreadManager::Execute(2ULL, &Function2);
eResult = YSLibrary::CThreadManager::Execute(3ULL, &Function3);
eResult = YSLibrary::CThreadManager::Execute(4ULL, &Function4);
eResult = YSLibrary::CThreadManager::Execute(5ULL, &Function5);
eResult = YSLibrary::CThreadManager::Execute(6ULL, &Function6);
eResult = YSLibrary::CThreadManager::WaitForResult(0ULL, &g_pResults[0]);
eResult = YSLibrary::CThreadManager::WaitForResult(1ULL, &g_pResults[1]);
eResult = YSLibrary::CThreadManager::WaitForResult(2ULL, &g_pResults[2]);
eResult = YSLibrary::CThreadManager::WaitForResult(3ULL, &g_pResults[3]);
eResult = YSLibrary::CThreadManager::WaitForResult(4ULL, &g_pResults[4]);
eResult = YSLibrary::CThreadManager::WaitForResult(5ULL, &g_pResults[5]);
eResult = YSLibrary::CThreadManager::WaitForResult(6ULL, &g_pResults[6]);
ullEndTick = GetTickCount64();
std::wcout << L"[Thread Manager]" << std::endl;
std::wcout << ullEndTick - ullStartTick << std::endl;
YSLibrary::CThreadManager::Terminate();
system("pause");
return 0;
}
I apologize in advance for the Interlocked family of functions, __forceinline, the dirty declaration of the static variables, etc.
As for why I used "long long" for the lock variable: there is no "bool" overload. I also tried "short", but it made no significant difference when I measured the two; if anything, "short" was slightly slower, which I guess is due to the use of 16-bit registers in a 64-bit environment. A bool or short might also lead to memory-alignment problems, so I used "long long".
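For reference, the plain 32-bit Interlocked functions operate on LONG, so the same spinlock also works with a 4-byte lock word; a sketch:
// The 32-bit variant of the spinlock; InterlockedCompareExchange takes
// (destination, exchange value, comparand) on a volatile LONG.
volatile LONG s_lLock = 0L;

void Lock32()
{
    while (InterlockedCompareExchange(&s_lLock, 1L, 0L) != 0L)
    {
        Sleep(0UL); // yield the rest of the time slice while contended
    }
}

void Unlock32()
{
    InterlockedExchange(&s_lLock, 0L);
}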
The reason why CThreadManager has private constructor is to explicitly prohibit "new CThreadManager()".
The use of "reinterpret_cast" is minimized. I thought its cost was compile-time only, but I saw a Stack Overflow question claiming it can have a runtime cost. I'm not sure about that yet, so I only use it once, when the thread function begins.
So far, I have confirmed the false-sharing phenomenon by changing
SData::ullData[8] -> SData::ullData[1]
(so the structs no longer each fill a cache line). Also, using Sleep(0) significantly reduced the wasted thread time slices in WaitForResult() and reduced the total execution time within the threads.
The result of this code showed
[Main]
1828
[Thread Manager]
344
in my environment.
However, I just realized that there are places other than SData::ullData where false sharing should occur: s_pThreadQuits, s_pTaskLocks, s_ppTasks and s_pTaskResults.
Why is false sharing not occurring with these variables?
[EDIT]
What I mean by "false sharing" is memory addresses that are accessed by different threads but share the same cache line:
SData g_stDataN (in each FunctionN())
s_pThreadQuits, s_pTaskLocks, s_pTaskResults, s_ppTasks (in Thread())
I expected the variables in 2. to be loaded into cache lines (64 bytes in my environment) just like g_stDataN. I set the size of SData to 64 bytes precisely to get the effect of the "padding" method for avoiding false sharing.
However, since s_pThreadQuits is neither sized to 64 bytes nor padded, it should also exhibit false sharing.
Like the illustration in the article the image came from:
https://www.codeproject.com/Articles/85356/Avoiding-and-Identifying-False-Sharing-Among-Threa
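For reference, here is a sketch of how the per-thread flags could be padded so each one owns a full cache line (this assumes C++17, where array new respects over-alignment):
// Each flag gets its own 64-byte cache line, so one thread's writes do
// not invalidate the line holding another thread's flag.
struct alignas(64) SPaddedFlag
{
    long long llValue; // the remaining 56 bytes of the line are padding
};

// s_pThreadQuits / s_pTaskLocks would then become, e.g.:
SPaddedFlag* s_pPaddedQuits = new SPaddedFlag[7]{}; // one line per thread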
I am trying to code a task manager and I am stuck on computing %CPU for each process by PID.
I wrote something like this:
static float CalculateCPULoad(unsigned long long idleTicks, unsigned long long totalTicks)
{
static unsigned long long _previousTotalTicks = 0;
static unsigned long long _previousIdleTicks = 0;
unsigned long long totalTicksSinceLastTime = totalTicks - _previousTotalTicks;
unsigned long long idleTicksSinceLastTime = idleTicks - _previousIdleTicks;
float ret = 1.0f - ((totalTicksSinceLastTime > 0) ? ((float)idleTicksSinceLastTime) / totalTicksSinceLastTime : 0);
_previousTotalTicks = totalTicks;
_previousIdleTicks = idleTicks;
return ret;
}
static unsigned long long FileTimeToInt64(const FILETIME& ft) { return (((unsigned long long)(ft.dwHighDateTime)) << 32) | ((unsigned long long)ft.dwLowDateTime); }
And I was using it like this:
hProcessSnap = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0);
if (hProcessSnap == INVALID_HANDLE_VALUE)
{
printError("Failed to create Process Snap");
return FALSE;
}
pe.dwSize = sizeof(PROCESSENTRY32);
if (!Process32First(hProcessSnap, &pe))
{
printError("Failed to move along process snap");
CloseHandle(hProcessSnap);
return FALSE;
}
do
{
printf("\n\n=====================================================");
_tprintf(TEXT("\n PROCESS NAME: %s"), pe.szExeFile);
printf("\n-----------------------------------------------------");
dwPriorityClass = 0;
hProcess = OpenProcess(PROCESS_ALL_ACCESS, FALSE, pe.th32ProcessID);
if (hProcess == NULL)
{
printError("Failed to open process");
}
else
{
for (int i = 0; i < 2; i++)
{
GetProcessTimes(hProcess, &exist, &exit, &lastKernel, &lastUser);
GetSystemTimes(&lastIdle, 0, 0);
// assuming GetCPULoad was meant to wrap CalculateCPULoad like this:
CalculateCPULoad(FileTimeToInt64(lastIdle), FileTimeToInt64(lastKernel) + FileTimeToInt64(lastUser));
Sleep(2500);
}
std::cout << CalculateCPULoad(FileTimeToInt64(lastIdle), FileTimeToInt64(lastKernel) + FileTimeToInt64(lastUser)) << "\n";
CloseHandle(hProcess);
}
} while (Process32Next(hProcessSnap, &pe));
CloseHandle(hProcessSnap);
return (TRUE);
}
I know that using Sleep() here isn't a good idea, but I haven't thought up anything better for now.
Please help me with some code examples, if you can.
Also, I want to know whether I am right that:
CPU% for process= (1- (IdleSystemTimeDelta/TotalProcessTimeDelta))*100%
This is how I get CPU usage in percent.
I use a hash map as a static PID-to-time mapping so it can be updated between calls.
The first call to get_cpu_usage(int pid) always returns zero, but each subsequent call becomes more and more accurate (I call it with a 0.5 s period).
static int get_processor_number()
{
SYSTEM_INFO info;
GetSystemInfo(&info);
return (int)info.dwNumberOfProcessors;
}
static __int64 file_time_2_utc(const FILETIME* ftime)
{
LARGE_INTEGER li;
li.LowPart = ftime->dwLowDateTime;
li.HighPart = ftime->dwHighDateTime;
return li.QuadPart;
}
static int get_cpu_usage(int pid)
{
static int processor_count_ = -1;
static std::unordered_map<int, __int64> last_time_;
static std::unordered_map<int, __int64> last_system_time_;
FILETIME now;
FILETIME creation_time;
FILETIME exit_time;
FILETIME kernel_time;
FILETIME user_time;
__int64 system_time;
__int64 time;
__int64 system_time_delta;
__int64 time_delta;
int cpu = -1;
if (processor_count_ == -1)
{
processor_count_ = get_processor_number();
}
GetSystemTimeAsFileTime(&now);
HANDLE hProcess = OpenProcess(PROCESS_ALL_ACCESS, false, pid);
BOOL ok = GetProcessTimes(hProcess, &creation_time, &exit_time, &kernel_time, &user_time);
CloseHandle(hProcess); // close the handle on every path, or it leaks
if (!ok)
{
std::cout << "Unable to getProcessTime\n";
return -1;
}
system_time = (file_time_2_utc(&kernel_time) + file_time_2_utc(&user_time)) / processor_count_;
time = file_time_2_utc(&now);
if ((last_system_time_[pid] == 0) || (last_time_[pid] == 0))
{
last_system_time_[pid] = system_time;
last_time_[pid] = time;
return 0;
}
system_time_delta = system_time - last_system_time_[pid];
time_delta = time - last_time_[pid];
if (time_delta == 0)
{
std::cout << "timedelta=0";
return -1;
}
cpu = int((system_time_delta * 100 + time_delta / 2) / time_delta);
last_system_time_[pid] = system_time;
last_time_[pid] = time;
return cpu;
}
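A hedged usage sketch (the PID is hypothetical): poll the same PID periodically; the first call returns 0 because there is no previous sample to diff against.
int main()
{
    const int pid = 4242; // replace with a real PID
    for (;;)
    {
        int cpu = get_cpu_usage(pid);
        if (cpu >= 0)
            std::cout << "PID " << pid << ": " << cpu << "%\n";
        Sleep(500); // 0.5 s sampling period
    }
}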
I use ftd3xx.dll to communicate with the device.
The data-read part and the data-write part are split into separate threads.
#include <thread>
#include <queue>
#include <vector>
#include <array>
#include <windows.h>
#include <process.h> // _beginthreadex / _endthreadex
using namespace std;
// ftHandle, ftStatus and CstReadPipeNo come from the ftd3xx device-setup code (omitted here)
bool Opened = true; // device-open flag, set by the omitted setup code
bool dataRead = false;
CRITICAL_SECTION sec;
queue< vector<unsigned short>> BufferQueue;
unsigned WINAPI Write(void* arg) {
int Width = 1000;
vector<unsigned short> data;
data.reserve(Width);
while (Opened)
{
while (dataRead)
{
if (BufferQueue.size() > 0) {
EnterCriticalSection(&sec);
data = BufferQueue.front();
BufferQueue.pop();
LeaveCriticalSection(&sec);
}
else
{
this_thread::sleep_for(2ms);
continue;
}
//wrtie something
}
if (!dataRead)
break;
}
_endthreadex(0);
return 0;
}
unsigned WINAPI Read(void* arg) {
int Width = 1000;
vector<unsigned short> data(Width);
BYTE* acReadBuf = new BYTE[Width];
ULONG ulBytesRead = 0;
int idx = 0;
Sleep(100);
while (dataRead)
{
ftStatus = FT_ReadPipe(ftHandle, CstReadPipeNo, acReadBuf, Width, &ulBytesRead, NULL);
if (FT_SUCCESS(ftStatus))
{
idx = 0;
for (int i = 0; i < Width; i++) {
data[i] = ((unsigned short)((unsigned short)acReadBuf[idx] | ((unsigned short)acReadBuf[idx + 1] << 8)));
idx += 2;
}
EnterCriticalSection(&sec);
if (BufferQueue.size() > 10000) {
queue< vector<unsigned short>> empty;
swap(BufferQueue, empty);
}
BufferQueue.push(data);
LeaveCriticalSection(&sec);
}
else
{
}
}
_endthreadex(0);
return 0;
}
int main() {
//start
InitializeCriticalSection(&sec);
dataRead = true;
HANDLE r_hThread = NULL;
unsigned r_threadID;
r_hThread = (HANDLE)_beginthreadex(NULL, 0, Read, NULL, 0, &r_threadID);
HANDLE w_hThread = NULL;
unsigned w_threadID;
w_hThread = (HANDLE)_beginthreadex(NULL, 0, Write, NULL, 0, &w_threadID);
//....///
//stop
dataRead = false;
WaitForSingleObject(r_hThread, INFINITE);
WaitForSingleObject(w_hThread, INFINITE);
DeleteCriticalSection(&sec);
}
I want to queue the array directly, but for now I am using a vector.
Importantly, data loss occurs when other programs are running, even just the Calculator.
The same is true whether the device delivers the data late or fast.
I would be grateful if someone could help me.
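As a note on the polling: here is a sketch (untested, names hypothetical) of the usual Win32 pattern, where the consumer blocks on a condition variable instead of sleeping, so nothing is dropped while it waits:
CRITICAL_SECTION g_sec;   // InitializeCriticalSection() at startup
CONDITION_VARIABLE g_cv;  // InitializeConditionVariable() at startup
queue<vector<unsigned short>> g_queue;
bool g_running = true;    // set false (under g_sec) + WakeAllConditionVariable(&g_cv) to stop

void Push(vector<unsigned short>&& data) // producer (the Read thread)
{
    EnterCriticalSection(&g_sec);
    g_queue.push(move(data));
    LeaveCriticalSection(&g_sec);
    WakeConditionVariable(&g_cv); // wake one waiting consumer
}

bool Pop(vector<unsigned short>& out)    // consumer (the Write thread)
{
    EnterCriticalSection(&g_sec);
    while (g_queue.empty() && g_running)
        SleepConditionVariableCS(&g_cv, &g_sec, INFINITE); // releases g_sec while blocked
    bool ok = !g_queue.empty();
    if (ok)
    {
        out = move(g_queue.front());
        g_queue.pop();
    }
    LeaveCriticalSection(&g_sec);
    return ok;
}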
I am trying to publish some random things over shared memory, and for some weird reason the reader doesn't pick up what the sender has written.
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/types.h>
#include <cstdio>
class SHM {
volatile char* _ptr;
public:
SHM() {
const auto handle = shm_open("myTest", O_RDWR|O_CREAT, 0666);
const auto size = 4 * 1024 * 1024;
if (-1 == ftruncate(handle, size)) {
throw;
}
_ptr = (volatile char*)mmap(0,size , PROT_READ | PROT_WRITE, MAP_SHARED, handle, 0);
if(_ptr == MAP_FAILED){
throw;
}
int rc = fchmod(handle, 0666);
if (rc == -1) {
throw;
}
}
bool read(uint64_t& magic, uint64_t& time) {
const uint64_t newVal = *(uint64_t*)_ptr;
if (newVal != magic) {
magic = newVal;
printf("value changed!!!\n");
time = *(uint64_t*)(_ptr + sizeof(magic));
return true;
}
//printf("old value: %lu\n", newVal);
return false;
}
void publish(const uint64_t time) {
__sync_fetch_and_add((uint64_t*)_ptr, time);
__sync_synchronize();
*(uint64_t*)(_ptr + sizeof(uint64_t)) = time;
}
};
Here is the sender:
#include <ctime>
#include <unistd.h>
#include <cstdlib>
#include <cstdint>
#include "shm.h"
int main() {
SHM shm;
timespec t;
for (auto i = 0; i < 10000; i++) {
if (0 == clock_gettime(CLOCK_REALTIME, &t)) {
const uint64_t v = t.tv_sec * 1000 * 1000 * 1000 + t.tv_nsec;
shm.publish(v);
printf("published %lu\n", v);
usleep(100);
}
}
}
Here is the reader:
#include <iostream>
#include "shm.h"
int main() {
SHM shm;
uint64_t magic = 0;
uint64_t t = 0;
while (true) {
if (shm.read(magic, t)) {
printf("%lu, %lu\n", magic, t);
}
}
}
If I restart the reader, it is indeed able to read the last value that the sender wrote.
However, if I start the reader first and then the sender, none of the values the sender writes are picked up by the reader.
To make this even weirder, if I uncomment the printf statement in SHM::read(), then the reader sometimes does pick up the values.
Any ideas?
GCC version:
g++ (GCC) 7.2.1 20170829 (Red Hat 7.2.1-1)
I spotted a couple of issues; however, I am unsure whether they will fix your problem.
The name passed to shm_open should start with / for portable use.
In read and publish, the casts must not discard volatile, e.g.: const uint64_t newVal = *(uint64_t volatile*)_ptr;. Even better, drop volatile and use std::atomic.
Although different processes are involved, this is still a case of the same objects being accessed by more than one thread of execution while at least one of those threads modifies them, which is the definition of a data race.
I made the above changes. Using std::atomic fixed it:
class SHM {
void* _ptr;
public:
SHM() {
const auto handle = shm_open("/myTest", O_RDWR|O_CREAT, 0666);
const auto size = 4 * 1024 * 1024;
if (-1 == ftruncate(handle, size))
throw;
_ptr = mmap(0,size , PROT_READ | PROT_WRITE, MAP_SHARED, handle, 0);
if(_ptr == MAP_FAILED)
throw;
}
bool read(uint64_t& magic, uint64_t& time) {
auto p = static_cast<std::atomic<uint64_t>*>(_ptr);
const uint64_t newVal = p[0];
if (newVal != magic) {
magic = newVal;
printf("value changed!!!\n");
time = p[1];
return true;
}
return false;
}
void publish(const uint64_t time) {
auto p = static_cast<std::atomic<uint64_t>*>(_ptr);
p[0] += time;
p[1] = time;
}
};
void sender() {
SHM shm;
timespec t;
for (auto i = 0; i < 10000; i++) {
if (0 == clock_gettime(CLOCK_REALTIME, &t)) {
const uint64_t v = t.tv_sec * 1000 * 1000 * 1000 + t.tv_nsec;
shm.publish(v);
printf("published %lu\n", v);
usleep(100);
}
}
}
void reader() {
SHM shm;
uint64_t magic = 0;
uint64_t t = 0;
while (true) {
if (shm.read(magic, t)) {
printf("%lu, %lu\n", magic, t);
}
}
}
int main(int ac, char**) {
if(ac > 1)
reader();
else
sender();
}
With std::atomic you can also take finer control of the memory ordering. E.g.:
struct Data {
std::atomic<uint64_t> time;
std::atomic<uint64_t> generation;
};
// ...
bool read(uint64_t& generation, uint64_t& time) {
auto data = static_cast<Data*>(_ptr);
auto new_generation = data->generation.load(std::memory_order_acquire); // 1. Syncronizes with (2).
if(generation == new_generation)
return false;
generation = new_generation;
time = data->time.load(std::memory_order_relaxed);
printf("value changed!!!\n");
return true;
}
void publish(const uint64_t time) {
auto data = static_cast<Data*>(_ptr);
data->time.store(time, std::memory_order_relaxed);
data->generation.fetch_add(time, std::memory_order_release); // 2. (1) Synchronises with this store.
}