Multithreaded false sharing on Windows - C++

Hi I am having trouble with expected false sharing not occurring from my test code.
I am trying to create a process unique thread manager which manages multiple threads homogeneously.
The unique thread manager class is NOT a thread pool. It operates by assigning task functions to designated threads and can retrieve the return value of each task function, rather than just pushing tasks onto a queue without regard to which thread runs them. Also, the thread manager does not care about the size (amount of computation) of a task.
The thread manager will be used by one thread (the main thread) to hand off computation, and it will be used quite frequently. The reason is that my process will follow the game loop design pattern and I want the game loop to run at over 120 FPS, which means one iteration of the game loop must finish in less than 8.3 milliseconds. The main thread might perform this task assignment a number of times within one iteration, so reducing/eliminating context-switching cost was my primary concern. My conclusion was to have the thread manager's threads spin (spinlock).
In short, the game loop will be iterating following two steps for a number of times.
Main loop assigns tasks to the thread manager.
Wait for results of tasks by the thread manager.
Below is my test code.
ThreadManager.h
namespace YSLibrary
{
class CThreadManager final
{
private:
static long long s_llLock;
static unsigned long long s_ullThreadCount;
static void** s_ppThreads;
static unsigned long* s_pThreadIDs;
static long long* s_pThreadQuits;
static long long* s_pTaskLocks;
static unsigned long long (**s_ppTasks)();
static unsigned long long* s_pTaskResults;
CThreadManager(){}
~CThreadManager(){}
__forceinline static void Lock()
{
while (true)
{
if (InterlockedCompareExchange64(&s_llLock, 1LL, 0LL) == 0LL)
{
return;
}
Sleep(0UL);
}
}
__forceinline static void Unlock()
{
InterlockedExchange64(&s_llLock, 0LL);
}
static unsigned long __stdcall Thread(void* const _pParameter)
{
const unsigned long long ullThreadIndex = reinterpret_cast<const unsigned long long>(_pParameter);
while (true)
{
if (InterlockedCompareExchange64(&s_pThreadQuits[ullThreadIndex], 0LL, 1LL) == 1LL)
{
return 1UL;
}
if (InterlockedCompareExchange64(&s_pTaskLocks[ullThreadIndex], 1LL, 0LL) == 0LL)
{
if (s_ppTasks[ullThreadIndex] != nullptr)
{
s_pTaskResults[ullThreadIndex] = s_ppTasks[ullThreadIndex]();
s_ppTasks[ullThreadIndex] = nullptr;
}
InterlockedExchange64(&s_pTaskLocks[ullThreadIndex], 0LL);
}
}
}
public:
enum class EResult : unsigned long long
{
None = 0ULL,
Success = 1ULL,
Fail_ArgumentNull = 2ULL,
Fail_ArgumentInvalid = 3ULL,
Fail_Locked = 4ULL,
Fail_ThreadCountNotZero = 5ULL,
Fail_ThreadCountZero = 6ULL,
Fail_ThreadsNotNull = 7ULL,
Fail_ThreadsNull = 8ULL,
Fail_ThreadIDsNotNull = 9ULL,
Fail_ThreadIDsNull = 10ULL,
Fail_ThreadQuitsNotNull = 11ULL,
Fail_ThreadQuitsNull = 12ULL,
Fail_TaskLocksNotNull = 13ULL,
Fail_TaskLocksNull = 14ULL,
Fail_TasksNotNull = 15ULL,
Fail_TasksNull = 16ULL,
Fail_TaskResultsNotNull = 17ULL,
Fail_TaskResultsNull = 18ULL,
Fail_CreateThread = 19ULL
};
__forceinline static EResult Initialize(const unsigned long long _ullThreadCount)
{
if (_ullThreadCount == 0ULL)
{
return EResult::Fail_ArgumentNull;
}
Lock();
if (s_ullThreadCount != 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountNotZero;
}
if (s_ppThreads != nullptr)
{
Unlock();
return EResult::Fail_ThreadsNotNull;
}
if (s_pThreadIDs != nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNotNull;
}
if (s_pThreadQuits != nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNotNull;
}
if (s_pTaskLocks != nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNotNull;
}
if (s_ppTasks != nullptr)
{
Unlock();
return EResult::Fail_TasksNotNull;
}
if (s_pTaskResults != nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNotNull;
}
s_ullThreadCount = _ullThreadCount;
s_ppThreads = new void*[s_ullThreadCount]{};
s_pThreadIDs = new unsigned long[s_ullThreadCount]{};
s_pThreadQuits = new long long[s_ullThreadCount]{};
s_pTaskLocks = new long long[s_ullThreadCount]{};
s_ppTasks = new (unsigned long long (*[s_ullThreadCount])()){};
s_pTaskResults = new unsigned long long[s_ullThreadCount]{};
for (unsigned long long i = 0ULL; i < s_ullThreadCount; ++i)
{
s_ppThreads[i] = CreateThread(nullptr, 0ULL, &Thread, reinterpret_cast<void*>(i), 0UL, &s_pThreadIDs[i]);
if (s_ppThreads[i] == nullptr)
{
// Rollback
for (unsigned long long j = 0ULL; j < i; ++j)
{
InterlockedExchange64(&s_pThreadQuits[i], 1LL);
}
unsigned long ulExitCode = 0UL;
for (unsigned long long j = 0ULL; j < i; ++j)
{
while (true)
{
GetExitCodeThread(s_ppThreads[j], &ulExitCode);
if (ulExitCode != static_cast<unsigned long>(STILL_ACTIVE))
{
CloseHandle(s_ppThreads[j]);
s_ppThreads[j] = nullptr;
break;
}
Sleep(0UL);
}
}
delete[] s_pTaskResults;
s_pTaskResults = nullptr;
delete[] s_ppTasks;
s_ppTasks = nullptr;
delete[] s_pTaskLocks;
s_pTaskLocks = nullptr;
delete[] s_pThreadQuits;
s_pThreadQuits = nullptr;
delete[] s_pThreadIDs;
s_pThreadIDs = nullptr;
delete[] s_ppThreads;
s_ppThreads = nullptr;
s_ullThreadCount = 0ULL;
Unlock();
return EResult::Fail_CreateThread;
}
}
Unlock();
return EResult::Success;
}
__forceinline static EResult Terminate()
{
Lock();
if (s_ullThreadCount == 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountZero;
}
if (s_ppThreads == nullptr)
{
Unlock();
return EResult::Fail_ThreadsNull;
}
if (s_pThreadIDs == nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNull;
}
if (s_pThreadQuits == nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNull;
}
if (s_pTaskLocks == nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNull;
}
if (s_ppTasks == nullptr)
{
Unlock();
return EResult::Fail_TasksNull;
}
if (s_pTaskResults == nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNull;
}
for (unsigned long long i = 0ULL; i < s_ullThreadCount; ++i)
{
InterlockedExchange64(&s_pThreadQuits[i], 1LL);
}
unsigned long ulExitCode = 0UL;
for (unsigned long long i = 0ULL; i < s_ullThreadCount; ++i)
{
while (true)
{
GetExitCodeThread(s_ppThreads[i], &ulExitCode);
if (ulExitCode != static_cast<unsigned long>(STILL_ACTIVE))
{
CloseHandle(s_ppThreads[i]);
s_ppThreads[i] = nullptr;
break;
}
Sleep(0UL);
}
}
delete[] s_pTaskResults;
s_pTaskResults = nullptr;
delete[] s_ppTasks;
s_ppTasks = nullptr;
delete[] s_pTaskLocks;
s_pTaskLocks = nullptr;
delete[] s_pThreadQuits;
s_pThreadQuits = nullptr;
delete[] s_pThreadIDs;
s_pThreadIDs = nullptr;
delete[] s_ppThreads;
s_ppThreads = nullptr;
s_ullThreadCount = 0ULL;
Unlock();
return EResult::Success;
}
__forceinline static EResult Execute(const unsigned long long _ullThreadIndex, unsigned long long (*_pFunction)())
{
if (_pFunction == nullptr)
{
return EResult::Fail_ArgumentNull;
}
Lock();
if (s_ullThreadCount == 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountZero;
}
if (s_ppThreads == nullptr)
{
Unlock();
return EResult::Fail_ThreadsNull;
}
if (s_pThreadIDs == nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNull;
}
if (s_pThreadQuits == nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNull;
}
if (s_pTaskLocks == nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNull;
}
if (s_ppTasks == nullptr)
{
Unlock();
return EResult::Fail_TasksNull;
}
if (s_pTaskResults == nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNull;
}
if (_ullThreadIndex >= s_ullThreadCount)
{
Unlock();
return EResult::Fail_ArgumentInvalid;
}
while (true)
{
if (InterlockedCompareExchange64(&s_pTaskLocks[_ullThreadIndex], 1LL, 0LL) == 0LL)
{
s_ppTasks[_ullThreadIndex] = _pFunction;
InterlockedExchange64(&s_pTaskLocks[_ullThreadIndex], 0LL);
Unlock();
return EResult::Success;
}
Sleep(0UL);
}
}
__forceinline static EResult WaitForResult(const unsigned long long _ullThreadIndex, unsigned long long* const _pFunctionResult)
{
if (_pFunctionResult == nullptr)
{
return EResult::Fail_ArgumentNull;
}
Lock();
if (s_ullThreadCount == 0ULL)
{
Unlock();
return EResult::Fail_ThreadCountZero;
}
if (s_ppThreads == nullptr)
{
Unlock();
return EResult::Fail_ThreadsNull;
}
if (s_pThreadIDs == nullptr)
{
Unlock();
return EResult::Fail_ThreadIDsNull;
}
if (s_pThreadQuits == nullptr)
{
Unlock();
return EResult::Fail_ThreadQuitsNull;
}
if (s_pTaskLocks == nullptr)
{
Unlock();
return EResult::Fail_TaskLocksNull;
}
if (s_ppTasks == nullptr)
{
Unlock();
return EResult::Fail_TasksNull;
}
if (s_pTaskResults == nullptr)
{
Unlock();
return EResult::Fail_TaskResultsNull;
}
if (_ullThreadIndex >= s_ullThreadCount)
{
Unlock();
return EResult::Fail_ArgumentInvalid;
}
while (true)
{
if (InterlockedCompareExchange64(&s_pTaskLocks[_ullThreadIndex], 1LL, 0LL) == 0LL)
{
if (s_ppTasks[_ullThreadIndex] == nullptr)
{
(*_pFunctionResult) = s_pTaskResults[_ullThreadIndex];
InterlockedExchange64(&s_pTaskLocks[_ullThreadIndex], 0LL);
Unlock();
return EResult::Success;
}
InterlockedExchange64(&s_pTaskLocks[_ullThreadIndex], 0LL);
}
Sleep(0UL);
}
}
};
}
main.cpp
#include <iostream>
#include <Windows.h>
#include "ThreadManager.h"
// Out-of-class definitions of CThreadManager's static data members.
// Everything starts as zero / nullptr.
long long YSLibrary::CThreadManager::s_llLock = 0LL;
unsigned long long YSLibrary::CThreadManager::s_ullThreadCount = 0ULL;
void** YSLibrary::CThreadManager::s_ppThreads = nullptr;
unsigned long* YSLibrary::CThreadManager::s_pThreadIDs = nullptr;
long long* YSLibrary::CThreadManager::s_pThreadQuits = nullptr;
long long* YSLibrary::CThreadManager::s_pTaskLocks = nullptr;
unsigned long long (**YSLibrary::CThreadManager::s_ppTasks)() = nullptr;
unsigned long long* YSLibrary::CThreadManager::s_pTaskResults = nullptr;
// Receives the task results collected by main() via WaitForResult (indices 0..6 are used).
unsigned long long g_pResults[10]{};
// Benchmark payload: eight unsigned long long values per instance.
// NOTE(review): presumably sized so that each instance fills one 64-byte cache line;
// nothing here forces the instances onto cache-line boundaries — confirm alignment.
struct SData
{
unsigned long long ullData[8];
};
// g_stData is written by Function(); g_stDataN is written by FunctionN().
SData g_stData{};
SData g_stData0{};
SData g_stData1{};
SData g_stData2{};
SData g_stData3{};
SData g_stData4{};
SData g_stData5{};
SData g_stData6{};
/// Single-threaded baseline: stores 70,000,000 rand() values into g_stData.ullData[0].
/// Always returns 1.
unsigned long long Function()
{
    unsigned long long ullRemaining = 70000000ULL;
    while (ullRemaining-- != 0ULL)
    {
        g_stData.ullData[0] = static_cast<unsigned long long>(rand());
    }
    return 1ULL;
}
/// Worker task 0: stores 10,000,000 rand() values into g_stData0.ullData[0].
/// Always returns 1.
unsigned long long Function0()
{
    unsigned long long ullRemaining = 10000000ULL;
    while (ullRemaining-- != 0ULL)
    {
        g_stData0.ullData[0] = static_cast<unsigned long long>(rand());
    }
    return 1ULL;
}
/// Worker task 1: stores 10,000,000 rand() values into g_stData1.ullData[0].
/// Always returns 1.
unsigned long long Function1()
{
    unsigned long long ullRemaining = 10000000ULL;
    while (ullRemaining-- != 0ULL)
    {
        g_stData1.ullData[0] = static_cast<unsigned long long>(rand());
    }
    return 1ULL;
}
/// Worker task 2: stores 10,000,000 rand() values into g_stData2.ullData[0].
/// Always returns 1.
unsigned long long Function2()
{
    unsigned long long ullRemaining = 10000000ULL;
    while (ullRemaining-- != 0ULL)
    {
        g_stData2.ullData[0] = static_cast<unsigned long long>(rand());
    }
    return 1ULL;
}
/// Worker task 3: stores 10,000,000 rand() values into g_stData3.ullData[0].
/// Always returns 1.
unsigned long long Function3()
{
    unsigned long long ullRemaining = 10000000ULL;
    while (ullRemaining-- != 0ULL)
    {
        g_stData3.ullData[0] = static_cast<unsigned long long>(rand());
    }
    return 1ULL;
}
/// Worker task 4: stores 10,000,000 rand() values into g_stData4.ullData[0].
/// Always returns 1.
unsigned long long Function4()
{
    unsigned long long ullRemaining = 10000000ULL;
    while (ullRemaining-- != 0ULL)
    {
        g_stData4.ullData[0] = static_cast<unsigned long long>(rand());
    }
    return 1ULL;
}
/// Worker task 5: stores 10,000,000 rand() values into g_stData5.ullData[0].
/// Always returns 1.
unsigned long long Function5()
{
    unsigned long long ullRemaining = 10000000ULL;
    while (ullRemaining-- != 0ULL)
    {
        g_stData5.ullData[0] = static_cast<unsigned long long>(rand());
    }
    return 1ULL;
}
/// Worker task 6: stores 10,000,000 rand() values into g_stData6.ullData[0].
/// Always returns 1.
unsigned long long Function6()
{
    unsigned long long ullRemaining = 10000000ULL;
    while (ullRemaining-- != 0ULL)
    {
        g_stData6.ullData[0] = static_cast<unsigned long long>(rand());
    }
    return 1ULL;
}
/// Benchmark driver: times the 70M-iteration baseline on the calling thread, then
/// times seven 10M-iteration tasks spread over seven manager threads, printing both
/// durations in GetTickCount64 ticks.
int main()
{
    // One task per worker, in slot order.
    unsigned long long (*const pTasks[7])() =
    {
        &Function0, &Function1, &Function2, &Function3, &Function4, &Function5, &Function6
    };

    unsigned long long ullStartTick = 0ULL;
    unsigned long long ullEndTick = 0ULL;
    srand((unsigned int)time(nullptr));

    // Baseline on the main thread.
    ullStartTick = GetTickCount64();
    Function();
    ullEndTick = GetTickCount64();
    std::wcout << L"[Main]" << std::endl;
    std::wcout << ullEndTick - ullStartTick << std::endl;

    // Same total amount of work, split across the manager's workers.
    YSLibrary::CThreadManager::EResult eResult = YSLibrary::CThreadManager::EResult::None;
    eResult = YSLibrary::CThreadManager::Initialize(7ULL);
    ullStartTick = GetTickCount64();
    for (unsigned long long ullSlot = 0ULL; ullSlot < 7ULL; ++ullSlot)
    {
        eResult = YSLibrary::CThreadManager::Execute(ullSlot, pTasks[ullSlot]);
    }
    for (unsigned long long ullSlot = 0ULL; ullSlot < 7ULL; ++ullSlot)
    {
        eResult = YSLibrary::CThreadManager::WaitForResult(ullSlot, &g_pResults[ullSlot]);
    }
    ullEndTick = GetTickCount64();
    std::wcout << L"[Thread Manager]" << std::endl;
    std::wcout << ullEndTick - ullStartTick << std::endl;

    YSLibrary::CThreadManager::Terminate();
    system("pause");
    return 0;
}
I am really sorry about Interlocked family of functions, __forceinline, dirty declaration of static variables, etc.
On the other hand, the reason why I used "long long" for lock variable is there was no "bool" type. I'd rather tried "short" but it had no significant difference when I measured time between "short" and "long long". Rather, "short" was slightly slower and I guess the reason is the use of 16 bit registers in 64 bit environment. Also, bool or short type might lead to a problem of memory alignment. So I used "long long" type.
The reason why CThreadManager has private constructor is to explicitly prohibit "new CThreadManager()".
The use of "reinterpret_cast" is minimized. I thought its cost was paid only at compile time, but I saw a question on Stack Overflow suggesting that it has a runtime cost. I'm not sure about that yet, so I use it just once, when the thread function begins.
So far, I have checked false sharing phenomenon by changing
SData::ullData[8] -> SData::ullData1
Also, use of Sleep(0) significantly reduced waste of thread time slice in WaitForResult() and reduction of total execution time within threads.
The result of this code showed
[Main]
1828
[Thread Manager]
344
in my environment.
However, I just realized that there was another place other than SData::ullData where false sharing must occur, which are s_pThreadQuits, s_pTaskLocks, s_ppTasks, s_pTaskResults.
Why false sharing is not occurring with these variables?
[EDIT]
What I mean by "false sharing" is "memory address accessed by different threads but share the same cache-line" are
SData g_stDataN (in each FunctionN())
s_pThreadQuits, s_pTaskLocks, s_pTaskResults, s_ppTasks (in Thread())
I thought the variables in item 2 would also be loaded into the cache just like g_stDataN (the cache line is 64 bytes in my environment). I set the size of SData to 64 bytes in order to get the effect of the "padding" method for avoiding false sharing.
However, since s_pThreadQuits is neither sized to 64 bytes nor padded, it should also exhibit false sharing.
Like this image below.
Source of image is from
https://www.codeproject.com/Articles/85356/Avoiding-and-Identifying-False-Sharing-Among-Threa

Related

Shared resource single thread writing, multiple thread reading using interlock

I'm trying to implement single thread writing, multiple thread reading mechanism for shared resource management using interlock in C++, windows environment.
Q1. The result code seems to work as what I intend, but I'd like to ask for your wisdom if I am missing something.
Q2. If there is a real life or good active open source code example I can refer to, it will be really appreciated.
Following are the objectives I've taken into account.
Writing can only be executed by single thread and reading must be blocked when writing in order to avoid "invariant" break.
Reading can be executed by multiple threads.
#include <iostream>
#include <Windows.h>
// Shared character: written by Write(), printed by Read().
// NOTE(review): the i8 literal suffix is a Microsoft extension.
char g_c = 0i8;
// NOTE(review): not referenced by the code shown here.
char g_pReadChar[3]{};
// Thread handles and IDs filled in by main(): slots 0..2 readers, slot 3 writer.
void* g_pThreads[4]{};
unsigned long g_pThreadIDs[4]{};
long long g_llLock = 0ULL; // 0 : Not locked / 1 : Locked (Writing) / 2 : Locked (Reading)
long long g_llEntryCount = 0ULL; // Thread entry count
// Reader side of the lock: registers itself in g_llEntryCount, then makes up to
// 100000 attempts to enter while g_llLock is not 1. On success it prints g_c once
// and updates g_llLock depending on how many readers are still registered.
// If every attempt fails it only unregisters itself and returns without reading.
__forceinline void Read()
{
// <- if a thread execution is here (case 0)
InterlockedIncrement64(&g_llEntryCount);
// <- if a thread execution is here (case 1)
for (unsigned long long i = 0ULL; i < 100000ULL; ++i)
{
// Swaps 0 -> 2; a previous value of 1 means "retry", a previous value of 0 or 2 lets
// this reader in.
if (InterlockedCompareExchange64(&g_llLock, 2LL, 0LL) == 1LL)
{
continue;
}
// <- if a thread execution is here (case 2)
// --------------------------------------------------
// Read data
std::cout << g_c;
// --------------------------------------------------
InterlockedExchange64(&g_llLock, 1LL); // Lock is needed in order to block case 0
// NOTE(review): the decrement and the store that follows are two separate atomic
// operations; the value stored (0 or 2) is chosen from a count that other readers
// may have changed in between — verify that a stale 2 cannot be written while a
// writer holds the lock.
if (InterlockedDecrement64(&g_llEntryCount) == 0LL)
{
InterlockedExchange64(&g_llLock, 0LL);
}
else
{
InterlockedExchange64(&g_llLock, 2LL);
}
return;
}
// All attempts failed: undo the registration made on entry.
InterlockedDecrement64(&g_llEntryCount);
}
/// Reader thread entry point: calls Read() and sleeps 1 ms, forever.
/// The parameter is unused.
__forceinline unsigned long __stdcall ReadLoop(void* _pParam)
{
    for (;;)
    {
        Read();
        Sleep(1);
    }
}
/// Writer side of the lock: makes up to 100000 attempts to swap g_llLock 0 -> 1,
/// and on success stores 'A' (key 0), 'B' (key 1) or 'C' (any other key) into g_c
/// and resets g_llLock to 0. Returns without writing if every attempt fails.
__forceinline void Write(const unsigned long long _ullKey)
{
    // The value to store depends only on the key, so pick it up front.
    const char cValue = (_ullKey == 0ULL) ? 'A' : ((_ullKey == 1ULL) ? 'B' : 'C');

    unsigned long long ullAttempt = 0ULL;
    while (ullAttempt < 100000ULL)
    {
        if (InterlockedCompareExchange64(&g_llLock, 1LL, 0LL) == 0LL)
        {
            // --------------------------------------------------
            // Write data
            g_c = cValue;
            // --------------------------------------------------
            InterlockedExchange64(&g_llLock, 0LL);
            return;
        }
        ++ullAttempt;
    }
}
/// Writer thread entry point: calls Write() forever, moving to the next key
/// (0 -> 1 -> 2 -> 0) after more than 10000 calls with the current one.
/// The parameter is unused.
__forceinline unsigned long __stdcall WriteLoop(void* _pParam)
{
    unsigned long long ullWrites = 0ULL;
    unsigned long long ullCurrentKey = 0ULL;
    for (;;)
    {
        if (ullWrites > 10000ULL)
        {
            ullCurrentKey = (ullCurrentKey + 1ULL) % 3ULL;
            ullWrites = 0ULL;
        }
        Write(ullCurrentKey);
        ++ullWrites;
    }
}
/// Starts three reader threads (slots 0..2) and one writer thread (slot 3),
/// lets them run for 100 seconds and exits.
int main()
{
    for (int iReader = 0; iReader < 3; ++iReader)
    {
        g_pThreads[iReader] = CreateThread(nullptr, 0ULL, ReadLoop, nullptr, 0UL, &g_pThreadIDs[iReader]);
    }
    g_pThreads[3] = CreateThread(nullptr, 0ULL, WriteLoop, nullptr, 0UL, &g_pThreadIDs[3]);
    Sleep(100000);
    return 0;
}

How can i count percentage CPU for each process by PID

I am trying to write a task manager and I am stuck on computing %CPU for each process by PID.
I wrote something like, that:
/// Returns the busy fraction (1 - idle/total) over the interval since the previous
/// call, based on cumulative idle and total tick counters.
/// The previous counters are kept in function-local statics, so the first call
/// measures from zero and the function tracks a single counter pair only.
/// When the total counter has not advanced the idle fraction is taken as 0.
static float CalculateCPULoad(unsigned long long idleTicks, unsigned long long totalTicks)
{
    static unsigned long long _previousTotalTicks = 0;
    static unsigned long long _previousIdleTicks = 0;

    const unsigned long long totalDelta = totalTicks - _previousTotalTicks;
    const unsigned long long idleDelta = idleTicks - _previousIdleTicks;

    // Remember the current counters for the next call.
    _previousTotalTicks = totalTicks;
    _previousIdleTicks = idleTicks;

    float idleFraction = 0;
    if (totalDelta > 0)
    {
        idleFraction = ((float)idleDelta) / totalDelta;
    }
    return 1.0f - idleFraction;
}
/// Combines the two 32-bit halves of a FILETIME into one 64-bit value.
static unsigned long long FileTimeToInt64(const FILETIME& ft)
{
    const unsigned long long high = (unsigned long long)(ft.dwHighDateTime);
    const unsigned long long low = (unsigned long long)ft.dwLowDateTime;
    return (high << 32) | low;
}
And was using it like:
hProcessSnap = CreateToolhelp32Snapshot(TH32CS_SNAPPROCESS, 0);
if (hProcessSnap == INVALID_HANDLE_VALUE)
{
printError("Failed to create Process Snap");
return FALSE;
}
pe.dwSize = sizeof(PROCESSENTRY32);
if (!Process32First(hProcessSnap, &pe))
{
printError("Failed to move along process snap");
CloseHandle(hProcessSnap);
return FALSE;
}
do
{
printf("\n\n=====================================================");
_tprintf(TEXT("\n PROCESS NAME: %s"), pe.szExeFile);
printf("\n-----------------------------------------------------");
dwPriorityClass = 0;
hProcess = OpenProcess(PROCESS_ALL_ACCESS, FALSE, pe.th32ProcessID);
if (hProcess == NULL)
{
printError("Failed to open process");
}
else
{
for (int i = 0; i < 2; i++)
{
GetProcessTimes(hProcess, &exist, &exit, &lastKernel, &lastUser);
GetSystemTimes(&lastIdle, 0, 0);
GetCPULoad(lastIdle, lastKernel, lastUser);
Sleep(2500);
}
std::cout << GetCPULoad(lastIdle, lastKernel, lastUser) << "\n";
CloseHandle(hProcess);
}
} while (Process32Next(hProcessSnap, &pe));
CloseHandle(hProcessSnap);
return (TRUE);
}
I know that using Sleep() here isn't a good idea, but I haven't thought of anything better for now.
Please help me with some code examples, if you can.
Also i want to know am i right that:
CPU% for process= (1- (IdleSystemTimeDelta/TotalProcessTimeDelta))*100%
This is how i get_CPU in percent.
I use hash_map in order to have PID-time connection static to update it.
First time usage get_cpu_usage(int pid) returns zero every time,but with each next usage it will be more and more accurate(I use it with 0.5 sec period).
static int get_processor_number()
{
SYSTEM_INFO info;
GetSystemInfo(&info);
return (int)info.dwNumberOfProcessors;
}
/// Returns the 64-bit value stored in *ftime (high part in the upper 32 bits,
/// low part in the lower 32 bits).
static __int64 file_time_2_utc(const FILETIME* ftime)
{
    const unsigned __int64 high = static_cast<unsigned __int64>(ftime->dwHighDateTime);
    const unsigned __int64 low = static_cast<unsigned __int64>(ftime->dwLowDateTime);
    return static_cast<__int64>((high << 32) | low);
}
static int get_cpu_usage(int pid)
{
static int processor_count_ = -1;
static std::unordered_map<int, __int64> last_time_;
static std::unordered_map<int, __int64> last_system_time_;
FILETIME now;
FILETIME creation_time;
FILETIME exit_time;
FILETIME kernel_time;
FILETIME user_time;
__int64 system_time;
__int64 time;
__int64 system_time_delta;
__int64 time_delta;
int cpu = -1;
if (processor_count_ == -1)
{
processor_count_ = get_processor_number();
}
GetSystemTimeAsFileTime(&now);
HANDLE hProcess = OpenProcess(PROCESS_ALL_ACCESS, false, pid);
if (!GetProcessTimes(hProcess, &creation_time, &exit_time, &kernel_time, &user_time))
{
std::cout << "Unable to getProcessTime\n";
return -1;
}
system_time = (file_time_2_utc(&kernel_time) + file_time_2_utc(&user_time)) / processor_count_;
time = file_time_2_utc(&now);
if ((last_system_time_[pid] == 0) || (last_time_[pid] == 0))
{
last_system_time_[pid] = system_time;
last_time_[pid] = time;
return 0;
}
system_time_delta = system_time - last_system_time_[pid];
time_delta = time - last_time_[pid];
if (time_delta == 0)
{
std::cout << "timedelta=0";
return -1;
}
cpu = int((system_time_delta * 100 + time_delta / 2) / time_delta);
last_system_time_[pid] = system_time;
last_time_[pid] = time;
return cpu;
}

C++ USB communication delay

I use ftd3xx.dll to communicate with the device
The data read part and the data write part are divided into threads and used.
#include <array>
#include <atomic>
#include <queue>
#include <thread>
#include <vector>
#include <windows.h>
using namespace std;
// Run flag: set by main(), polled by the Read() and Write() threads.
// Atomic because it is written by one thread while others read it; a plain
// bool here is a data race.
atomic<bool> dataRead{false};
// Guards every access to BufferQueue.
CRITICAL_SECTION sec;
// Decoded sample blocks handed from Read() to Write().
queue< vector<unsigned short>> BufferQueue;
/// Consumer thread: while dataRead is set, takes the oldest block from BufferQueue
/// and processes it; sleeps 2 ms whenever the queue is empty.
/// The parameter is unused. Always returns 0.
unsigned WINAPI Write(void* arg) {
    (void)arg;
    int Width = 1000;
    vector<unsigned short> data;
    data.reserve(Width);
    while (Opened)
    {
        while (dataRead)
        {
            // The queue is also modified by Read(), so even the emptiness check
            // has to happen inside the critical section.
            bool haveData = false;
            EnterCriticalSection(&sec);
            if (!BufferQueue.empty()) {
                // Swap instead of copy: the front element is popped right away.
                data.swap(BufferQueue.front());
                BufferQueue.pop();
                haveData = true;
            }
            LeaveCriticalSection(&sec);
            if (!haveData)
            {
                this_thread::sleep_for(2ms);
                continue;
            }
            //wrtie something
        }
        if (!dataRead)
            break;
    }
    // Plain return: returning from a _beginthreadex routine ends the thread, and
    // unlike an explicit _endthreadex call it lets `data` be destroyed first.
    return 0;
}
/// Producer thread: while dataRead is set, reads one transfer from the device,
/// decodes it into Width 16-bit samples and appends the block to BufferQueue.
/// The parameter is unused. Always returns 0.
unsigned WINAPI Read(void* arg) {
    (void)arg;
    const int Width = 1000;                                   // samples per block
    // Each sample is decoded from two bytes, so the raw buffer must hold 2 * Width
    // bytes; with only Width bytes the decode loop reads past the end of the buffer.
    // NOTE(review): the transfer size requested from the device changes from Width
    // to 2 * Width bytes accordingly — confirm this matches what the device sends.
    const ULONG ulBufferBytes = static_cast<ULONG>(Width) * 2;
    vector<unsigned short> data(Width);
    vector<BYTE> acReadBuf(ulBufferBytes);                    // freed automatically on return
    ULONG ulBytesRead = 0;
    Sleep(100);
    while (dataRead)
    {
        ulBytesRead = 0;
        ftStatus = FT_ReadPipe(ftHandle, CstReadPipeNo, acReadBuf.data(), ulBufferBytes, &ulBytesRead, NULL);
        if (FT_SUCCESS(ftStatus))
        {
            // Decode only the complete samples that were actually received.
            int samples = static_cast<int>(ulBytesRead / 2);
            if (samples > Width) {
                samples = Width;
            }
            for (int i = 0; i < samples; i++) {
                const int idx = i * 2;
                // Low byte first, high byte second.
                data[i] = ((unsigned short)((unsigned short)acReadBuf[idx] | ((unsigned short)acReadBuf[idx + 1] << 8)));
            }
            // Do not republish samples left over from an earlier, longer transfer.
            for (int i = samples; i < Width; i++) {
                data[i] = 0;
            }
            EnterCriticalSection(&sec);
            // When the consumer falls more than 10000 blocks behind, the whole
            // backlog is discarded.
            if (BufferQueue.size() > 10000) {
                queue< vector<unsigned short>> empty;
                swap(BufferQueue, empty);
            }
            BufferQueue.push(data);
            LeaveCriticalSection(&sec);
        }
    }
    // Plain return instead of an explicit _endthreadex so the vectors are destroyed.
    return 0;
}
void main() {
//start
InitializeCriticalSection(&sec);
dataRead = true;
HANDLE r_hThread = NULL;
unsigned r_threadID;
r_hThread = (HANDLE)_beginthreadex(NULL, 0, Read, NULL, 0, &r_threadID);
HANDLE w_hThread = NULL;
unsigned w_threadID;
w_hThread = (HANDLE)_beginthreadex(NULL, 0, Write, NULL, 0, &w_threadID);
//....///
//stop
dataRead = false;;
WaitForSingleObject(r_hThread, INFINITE);
WaitForSingleObject(w_hThread, INFINITE);
DeleteCriticalSection(&sec);
}
I want to queue the array directly, but first I am using it as a vector.
Importantly, data loss occurs when other programs are running, even something as small as the calculator.
The same is true even if the device gives the data late or fast.
I would be grateful if someone could help me.

Invalid output for a custom print function when incorporating newline characters within a string for a kernel project

I'm in the process of working on a kernel program to handle printing capabilities of input for a custom OS. I'm following Poncho's 2nd YouTube Video series found here, I'm currently on Video 4 in the series where he starts to add numerical types as inputs to the renderer's print function. Now, my code isn't exactly like his as I made some modifications.
-Note- This won't compile directly as there is no main function. _start is being called or invoked by a bootloader that isn't shown here, I will however, add it to the bottom of this question.
When I use the class's print function like this within my kernel:
#include "BasicRenderer.h"
// Kernel entry point, called by the bootloader with the framebuffer and font list.
// Prints a line of text followed by an unsigned and a signed number on separate lines.
extern "C" void _start(Framebuffer* framebuffer, PSF1_FONT** fonts) {
    // Declares a renderer object ("BasicRenderer = renderer(...)" is not a declaration).
    BasicRenderer renderer(framebuffer, fonts);
    renderer.Print("This is some text");
    renderer.Print('\n');
    renderer.Print(uint64_t(123456789));
    renderer.Print('\n');
    renderer.Print(int64_t(-123456789));
    return;
}
And I run the kernel in emu. I'm getting the following output displayed:
This is some text
123456789
-123456789
The above is correct. However, when I try to add the ability to parse a newline character sequence (either \n or \0) inside a const char* that acts as a string, as seen in the following example:
#include "BasicRenderer.h"
// Kernel entry point, called by the bootloader with the framebuffer and font list.
// Same output as the first variant, but the first newline is embedded in the string.
extern "C" void _start(Framebuffer* framebuffer, PSF1_FONT** fonts) {
    // Declares a renderer object ("BasicRenderer = renderer(...)" is not a declaration).
    BasicRenderer renderer(framebuffer, fonts);
    renderer.Print("This is some text\n");
    renderer.Print(uint64_t(123456789));
    renderer.Print('\n');
    renderer.Print(int64_t(-123456789));
    return;
}
And now the displayed output is:
This is some text
123456789
-123456789
Here, the output in the second line has a space preceding the numerical value to be displayed after the call to Print() that has a \n within its string. I'm not sure what is causing this in my code. Does it have to do with the while condition or how I'm incrementing and indexing into the character string within BasicRenderer::Print(const char* str)? Or is it coming from BasicRender::PutChar(char c)? Or is it within one of the to_string() functions?
Here is the relevant implementation code...
BasicRenderer.cpp
#include "BasicRenderer.h"
// Prints a NUL-terminated string at the cursor, advancing 8 pixels per glyph and
// wrapping to the next 16-pixel row at the right edge of the framebuffer.
// A '\n' character moves the cursor to the start of the next row.
void BasicRenderer::Print(const char* str) {
    if (str == nullptr) {
        return;
    }
    for (const char* chr = str; *chr != 0; chr++) {
        if (*chr == '\n') {
            // "\n" in a string literal is already a single newline character; there is
            // no backslash to look for. PutChar does the whole line break, so the
            // cursor must NOT be advanced afterwards — advancing it is what produced
            // the blank cell at the start of the next line.
            PutChar('\n');
            continue;
        }
        PutChar(*chr);
        cursor_position_.x += 8;
        if (cursor_position_.x + 8 > framebuffer_->Width) {
            cursor_position_.x = 0;
            cursor_position_.y += 16;
        }
    }
}
// Prints an unsigned 64-bit value in decimal.
void BasicRenderer::Print(uint64_t val) {
    Print(to_string(val));
}
// Prints a signed 64-bit value in decimal.
void BasicRenderer::Print(int64_t val) {
    Print(to_string(val));
}
// Draws one glyph (8 pixels wide, 16 rows) at the cursor in font_color_, or, for
// '\n' and '\0', moves the cursor to the start of the next row instead.
// Drawing a glyph does not move the cursor; only set bits of the glyph are painted.
void BasicRenderer::PutChar(char c) {
    if (c == '\n' || c == '\0') {
        cursor_position_.x = 0;
        cursor_position_.y += 16;
        return;
    }

    unsigned int* pixPtr = (unsigned int*)framebuffer_->BaseAddress;
    // Index the glyph table with the unsigned character code: where plain char is
    // signed, codes above 127 are negative and would address memory before the buffer.
    const unsigned int glyphIndex = static_cast<unsigned char>(c);
    char* fontPtr = (char*)selected_font_->glyphBuffer + (glyphIndex * selected_font_->psf1_Header->charsize);
    for (unsigned long y = cursor_position_.y; y < cursor_position_.y + 16; y++) {
        for (unsigned long x = cursor_position_.x; x < cursor_position_.x + 8; x++) {
            // One byte per glyph row, most significant bit is the leftmost pixel.
            if ((*fontPtr & (0b10000000 >> (x - cursor_position_.x))) > 0) {
                *(unsigned int*)(pixPtr + x + (y * framebuffer_->PixelsPerScanLine)) = font_color_;
            }
        }
        fontPtr++;
    }
}
cstr.cpp
#include "cstr.h"
// Formats an unsigned 64-bit value as decimal text.
// Returns a pointer to a static buffer that is overwritten by the next call.
const char* to_string(uint64_t value) {
    static char output_uint_buffer[128];

    // Count the decimal digits (at least one, so 0 prints as "0").
    uint8_t digits = 1;
    for (uint64_t rest = value; rest >= 10; rest /= 10) {
        digits++;
    }

    // Terminate, then fill in the digits from the least significant end.
    output_uint_buffer[digits] = 0;
    for (uint8_t pos = digits; pos > 0; pos--) {
        output_uint_buffer[pos - 1] = static_cast<char>('0' + value % 10);
        value /= 10;
    }
    return output_uint_buffer;
}
// Formats a signed 64-bit value as decimal text, with a leading '-' for negatives.
// Returns a pointer to a static buffer that is overwritten by the next call.
const char* to_string(int64_t value) {
    static char output_int_buffer[128];

    const bool negative = value < 0;
    // Work on the unsigned magnitude: negating INT64_MIN as a signed value overflows,
    // while the unsigned subtraction is well defined for every input.
    uint64_t magnitude = negative ? (uint64_t(0) - uint64_t(value)) : uint64_t(value);

    // Count the decimal digits (at least one, so 0 prints as "0").
    uint8_t digits = 1;
    for (uint64_t rest = magnitude; rest >= 10; rest /= 10) {
        digits++;
    }

    uint8_t pos = digits;
    if (negative) {
        output_int_buffer[0] = '-';
        pos++;
    }

    // Terminate, then fill in the digits from the least significant end.
    output_int_buffer[pos] = 0;
    for (; digits > 0; digits--) {
        output_int_buffer[--pos] = static_cast<char>('0' + magnitude % 10);
        magnitude /= 10;
    }
    return output_int_buffer;
}
And here is the rest of the declarations...
BasicRender.h
#pragma once
#include "cstr.h"
#include "math.h"
#include "framebuffer.h"
#include "SimpleFonts.h"
// Minimal text renderer that draws PSF1 glyphs into a framebuffer.
// Starts with the cursor at (0, 0), the first font of the list selected and
// 0xFFFFFFFF as the text colour.
class BasicRenderer {
public:
    // Members are listed in declaration order, which is the order they are initialized in.
    BasicRenderer(Framebuffer* framebuffer, PSF1_FONT** fonts)
        : framebuffer_{framebuffer},
          cursor_position_{0, 0},
          fonts_{fonts},
          selected_font_{fonts_[0]},
          font_color_{0xFFFFFFFF} {}

    // Prints a NUL-terminated string.
    void Print(const char* str);

    // Prints a single character via PutChar.
    // NOTE(review): PutChar only moves the cursor for '\n' / '\0', so printing an
    // ordinary character through this overload leaves the cursor where it was.
    void Print(char c) {
        PutChar(c);
    }

    // Print an unsigned / signed 64-bit value in decimal.
    void Print(uint64_t val);
    void Print(int64_t val);

private:
    // Draws one glyph at the cursor, or performs a line break for '\n' / '\0'.
    void PutChar(char c);

    Framebuffer* framebuffer_;   // target framebuffer
    Point cursor_position_;      // pixel position of the next glyph
    PSF1_FONT** fonts_;          // font list passed to the constructor
    PSF1_FONT* selected_font_;   // font used for drawing
    unsigned int font_color_;    // pixel value written for set glyph bits
};
cstr.h
#pragma once
#include <stdint.h>
// Decimal formatting helpers; each overload is selected by the argument's exact type.
// Formats an unsigned 64-bit value as decimal text.
const char* to_string(uint64_t value);
// Formats a signed 64-bit value as decimal text.
const char* to_string(int64_t value);
math.h
#pragma once
// 2D position with unsigned coordinates.
struct Point {
unsigned int x;
unsigned int y;
};
Framebuffer.h
#pragma once
#include <stddef.h>
// Description of the framebuffer handed to the kernel.
// NOTE(review): field-for-field copy of the Framebuffer typedef in the bootloader's
// main.c; the two layouts presumably have to stay identical — confirm.
struct Framebuffer {
void* BaseAddress;              // start of the pixel memory
size_t BufferSize;              // size of the pixel memory
unsigned int Width;             // horizontal resolution
unsigned int Height;            // vertical resolution
unsigned int PixelsPerScanLine; // row pitch in pixels, used to step between rows
};
SimpleFonts.h
#pragma once
// Header of a PSF1 font file.
struct PSF1_HEADER {
unsigned char magic[2]; // file signature
unsigned char mode;     // font mode flags
unsigned char charsize; // bytes per glyph
};
// A loaded PSF1 font: its header plus the raw glyph bitmap data.
struct PSF1_FONT {
PSF1_HEADER* psf1_Header;
void* glyphBuffer;
};
Here is the bootloader application that invokes the above kernel.
main.c
#include <efi.h>
#include <efilib.h>
#include <elf.h>
/* The two signature bytes LoadPSF1Font expects at the start of a PSF1 font file. */
#define PSF1_MAGIC0 0x36
#define PSF1_MAGIC1 0x04
/* Local size_t definition for this bootloader build. */
typedef unsigned long long size_t;
/* Header of a PSF1 font file. */
typedef struct {
unsigned char magic[2];
unsigned char mode;
unsigned char charsize;
} PSF1_HEADER;
/* A loaded PSF1 font: its header plus the raw glyph data. */
typedef struct {
PSF1_HEADER* psf1_Header;
void* glyphBuffer;
} PSF1_FONT;
/* Framebuffer description filled in by InitializeGOP. */
/* The single global instance `framebuffer` is defined on the closing line of the typedef. */
typedef struct {
void* BaseAddress;
size_t BufferSize;
unsigned int Width;
unsigned int Height;
unsigned int PixelsPerScanLine;
} Framebuffer; Framebuffer framebuffer;
/* Locates the Graphics Output Protocol and copies its current mode's framebuffer
 * address, size, resolution and scan-line pitch into the global `framebuffer`.
 * Returns a pointer to that global, or NULL when the protocol cannot be located. */
Framebuffer* InitializeGOP() {
    EFI_GUID gopGuid = EFI_GRAPHICS_OUTPUT_PROTOCOL_GUID;
    EFI_GRAPHICS_OUTPUT_PROTOCOL* gop;
    EFI_STATUS status = uefi_call_wrapper(BS->LocateProtocol, 3, &gopGuid, NULL, (void**)&gop);

    if (EFI_ERROR(status)) {
        Print(L"Unable to locate GOP\n\r");
        return NULL;
    }
    Print(L"GOP located\n\r");

    framebuffer.BaseAddress = (void*)gop->Mode->FrameBufferBase;
    framebuffer.BufferSize = gop->Mode->FrameBufferSize;
    framebuffer.Width = gop->Mode->Info->HorizontalResolution;
    framebuffer.Height = gop->Mode->Info->VerticalResolution;
    framebuffer.PixelsPerScanLine = gop->Mode->Info->PixelsPerScanLine;
    return &framebuffer;
}
/* Opens `Path` read-only relative to `Directory`. When `Directory` is NULL the root
 * of the volume the bootloader image was loaded from is used.
 * Returns the opened file, or NULL when any step fails. */
EFI_FILE* LoadFile(EFI_FILE* Directory, CHAR16* Path, EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE* SystemTable) {
    EFI_FILE* LoadedFile = NULL;
    EFI_LOADED_IMAGE_PROTOCOL* LoadedImage = NULL;
    EFI_SIMPLE_FILE_SYSTEM_PROTOCOL* FileSystem = NULL;
    EFI_STATUS s;

    /* Each lookup feeds the next one, so stop at the first failure instead of
     * dereferencing a pointer that was never filled in. */
    s = SystemTable->BootServices->HandleProtocol(ImageHandle, &gEfiLoadedImageProtocolGuid, (void**)&LoadedImage);
    if (EFI_ERROR(s) || LoadedImage == NULL) {
        return NULL;
    }

    s = SystemTable->BootServices->HandleProtocol(LoadedImage->DeviceHandle, &gEfiSimpleFileSystemProtocolGuid, (void**)&FileSystem);
    if (EFI_ERROR(s) || FileSystem == NULL) {
        return NULL;
    }

    if (Directory == NULL) {
        s = FileSystem->OpenVolume(FileSystem, &Directory);
        if (EFI_ERROR(s) || Directory == NULL) {
            return NULL;
        }
    }

    s = Directory->Open(Directory, &LoadedFile, Path, EFI_FILE_MODE_READ, EFI_FILE_READ_ONLY);
    if (s != EFI_SUCCESS) {
        return NULL;
    }
    return LoadedFile;
}
/* Loads a PSF1 font file: reads and validates the header, then reads the glyph
 * table (charsize * 256 bytes, doubled when mode == 1).
 * Returns a pool-allocated PSF1_FONT, or NULL when the file cannot be opened, an
 * allocation or read fails, or the magic bytes do not match. Nothing is leaked on
 * failure, and the file handle is closed on every path. */
PSF1_FONT* LoadPSF1Font(EFI_FILE* Directory, CHAR16* Path, EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE* SystemTable) {
    PSF1_HEADER* fontHeader = NULL;
    void* glyphBuffer = NULL;
    PSF1_FONT* finishedFont = NULL;
    UINTN size = sizeof(PSF1_HEADER);
    UINTN glyphBufferSize = 0;
    EFI_STATUS status;

    EFI_FILE* font = LoadFile(Directory, Path, ImageHandle, SystemTable);
    if (font == NULL) return NULL;

    status = SystemTable->BootServices->AllocatePool(EfiLoaderData, sizeof(PSF1_HEADER), (void**)&fontHeader);
    if (EFI_ERROR(status) || fontHeader == NULL) {
        fontHeader = NULL;
        goto fail;
    }

    /* Read updates `size` to the number of bytes actually read. */
    status = font->Read(font, &size, fontHeader);
    if (EFI_ERROR(status) || size != sizeof(PSF1_HEADER)) goto fail;
    if (fontHeader->magic[0] != PSF1_MAGIC0 || fontHeader->magic[1] != PSF1_MAGIC1) goto fail;

    glyphBufferSize = fontHeader->charsize * 256;
    if (fontHeader->mode == 1) { // 512 glyph mode
        glyphBufferSize *= 2;
    }

    /* The glyph table starts right after the header. */
    font->SetPosition(font, sizeof(PSF1_HEADER));
    status = SystemTable->BootServices->AllocatePool(EfiLoaderData, glyphBufferSize, (void**)&glyphBuffer);
    if (EFI_ERROR(status) || glyphBuffer == NULL) {
        glyphBuffer = NULL;
        goto fail;
    }
    status = font->Read(font, &glyphBufferSize, glyphBuffer);
    if (EFI_ERROR(status)) goto fail;

    status = SystemTable->BootServices->AllocatePool(EfiLoaderData, sizeof(PSF1_FONT), (void**)&finishedFont);
    if (EFI_ERROR(status) || finishedFont == NULL) goto fail;

    finishedFont->psf1_Header = fontHeader;
    finishedFont->glyphBuffer = glyphBuffer;
    font->Close(font);
    return finishedFont;

fail:
    if (glyphBuffer != NULL) SystemTable->BootServices->FreePool(glyphBuffer);
    if (fontHeader != NULL) SystemTable->BootServices->FreePool(fontHeader);
    font->Close(font);
    return NULL;
}
/*
 * Byte-wise comparison of the first `n` bytes of two buffers.
 *
 * Bytes are compared as unsigned values. Returns -1 if the first differing
 * byte in `aptr` is smaller, 1 if it is larger, and 0 if all `n` bytes match.
 */
int memcmp(const void* aptr, const void* bptr, size_t n) {
    const unsigned char* lhs = (const unsigned char*)aptr;
    const unsigned char* rhs = (const unsigned char*)bptr;

    for (; n != 0; --n, ++lhs, ++rhs) {
        if (*lhs != *rhs) {
            return (*lhs < *rhs) ? -1 : 1;
        }
    }
    return 0;
}
/*
 * UEFI application entry point.
 *
 * Loads "kernel.elf" from the boot volume, validates its ELF64 header, copies
 * every PT_LOAD segment to its physical address, loads the two PSF1 fonts,
 * initialises the framebuffer via InitializeGOP() and calls the kernel entry
 * point with the framebuffer and the font array.
 *
 * Returns EFI_NOT_FOUND when the kernel file cannot be opened, EFI_LOAD_ERROR
 * when the kernel image cannot be read or is not a valid x86-64 ELF
 * executable, and EFI_SUCCESS if the kernel entry point returns.
 */
EFI_STATUS efi_main (EFI_HANDLE ImageHandle, EFI_SYSTEM_TABLE *SystemTable) {
    InitializeLib(ImageHandle, SystemTable);
    Print(L"Hello World!\n\r");

    EFI_FILE* Kernel = LoadFile(NULL, L"kernel.elf", ImageHandle, SystemTable);
    if (Kernel == NULL) {
        /* Nothing below can work without the kernel image. */
        Print(L"Could not load kernel \n\r");
        return EFI_NOT_FOUND;
    }
    Print(L"Kernel Loaded Successfully \n\r");

    Elf64_Ehdr header;
    {
        UINTN size = sizeof(header);
        if (Kernel->Read(Kernel, &size, &header) != EFI_SUCCESS || size != sizeof(header)) {
            Print(L"kernel format is bad\r\n");
            Kernel->Close(Kernel);
            return EFI_LOAD_ERROR;
        }
    }

    if (
        memcmp(&header.e_ident[EI_MAG0], ELFMAG, SELFMAG) != 0 ||
        header.e_ident[EI_CLASS] != ELFCLASS64 ||
        header.e_ident[EI_DATA] != ELFDATA2LSB ||
        header.e_type != ET_EXEC ||
        header.e_machine != EM_X86_64 ||
        header.e_version != EV_CURRENT ||
        /* Each table entry is read as an Elf64_Phdr, so it must be at least that big. */
        (header.e_phnum != 0 && header.e_phentsize < sizeof(Elf64_Phdr))
    ) {
        Print(L"kernel format is bad\r\n");
        Kernel->Close(Kernel);
        return EFI_LOAD_ERROR;
    }
    Print(L"kernel header successfully verified\r\n");

    Elf64_Phdr* phdrs = NULL;
    const UINTN phdrsSize = (UINTN)header.e_phnum * header.e_phentsize;
    {
        UINTN size = phdrsSize;
        if (Kernel->SetPosition(Kernel, header.e_phoff) != EFI_SUCCESS ||
            SystemTable->BootServices->AllocatePool(EfiLoaderData, phdrsSize, (void**)&phdrs) != EFI_SUCCESS ||
            Kernel->Read(Kernel, &size, phdrs) != EFI_SUCCESS ||
            size != phdrsSize) {
            Print(L"Could not read kernel program headers\r\n");
            if (phdrs != NULL) SystemTable->BootServices->FreePool(phdrs);
            Kernel->Close(Kernel);
            return EFI_LOAD_ERROR;
        }
    }

    for (
        Elf64_Phdr* phdr = phdrs;
        (char*)phdr < (char*)phdrs + phdrsSize;
        phdr = (Elf64_Phdr*)((char*)phdr + header.e_phentsize)
    ) {
        if (phdr->p_type != PT_LOAD) continue;

        UINTN pages = (phdr->p_memsz + 0x1000 - 1) / 0x1000;
        Elf64_Addr segment = phdr->p_paddr;
        if (SystemTable->BootServices->AllocatePages(AllocateAddress, EfiLoaderData, pages, &segment) != EFI_SUCCESS) {
            /*
             * Not fatal: a segment that starts inside a page already claimed
             * for the previous segment fails here although the memory is ours.
             */
            Print(L"Warning: could not allocate pages for segment at 0x%lx\r\n", phdr->p_paddr);
            segment = phdr->p_paddr;
        }

        UINTN size = phdr->p_filesz;
        if (Kernel->SetPosition(Kernel, phdr->p_offset) != EFI_SUCCESS ||
            Kernel->Read(Kernel, &size, (void*)segment) != EFI_SUCCESS ||
            size != phdr->p_filesz) {
            Print(L"Could not read kernel segment\r\n");
            SystemTable->BootServices->FreePool(phdrs);
            Kernel->Close(Kernel);
            return EFI_LOAD_ERROR;
        }

        /* Memory beyond the file-backed part (.bss) must start out zeroed. */
        if (phdr->p_memsz > phdr->p_filesz) {
            SystemTable->BootServices->SetMem((void*)(segment + phdr->p_filesz), phdr->p_memsz - phdr->p_filesz, 0);
        }
    }
    SystemTable->BootServices->FreePool(phdrs);
    Kernel->Close(Kernel);
    Print(L"Kernel Loaded\n\r");

    void (*KernelStart)(Framebuffer*, PSF1_FONT**) = ((__attribute__((sysv_abi)) void(*)(Framebuffer*, PSF1_FONT**) ) header.e_entry);

    /* A missing font is reported but not fatal; the kernel receives NULL for it. */
    PSF1_FONT* newFont = LoadPSF1Font(NULL, L"zap-light16.psf", ImageHandle, SystemTable);
    if (newFont == NULL) {
        Print(L"Font is not valid or is not found\n\r");
    } else {
        Print(L"Font found, char size = %d\n\r", newFont->psf1_Header->charsize);
    }

    PSF1_FONT* newFontExt = LoadPSF1Font(NULL, L"zap-ext-light16.psf", ImageHandle, SystemTable);
    if (newFontExt == NULL) {
        Print(L"Font is not valid or is not found\n\r");
    } else {
        Print(L"Font found, char size = %d\n\r", newFontExt->psf1_Header->charsize);
    }

    PSF1_FONT* fonts[] = {newFont, newFontExt};
    Framebuffer* newBuffer = InitializeGOP();
    Print(L"Base: 0x%x\n\rSize: 0x%x\n\rWidth: %d\n\rHeight: %d\n\rPixelsPerScanline: %d\n\r",
        newBuffer->BaseAddress,
        newBuffer->BufferSize,
        newBuffer->Width,
        newBuffer->Height,
        newBuffer->PixelsPerScanLine);

    KernelStart(newBuffer, fonts);
    return EFI_SUCCESS; // Exit the UEFI application
}
The problem is here:
if ( (*chr == '\\') && ((*chr+1 == 'n') || (*chr+1 == '0')) ) {
PutChar('\n');
chr++;
chr++;
}
...
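You should not parse a backslash followed by `n` yourself: the compiler has already translated the `\n` escape in the string literal into a single linefeed character, so the string never contains a literal backslash. (Note also that `*chr+1` adds 1 to the character value; it does not read the next character, which would be `*(chr+1)`.) What you want instead is: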
if (*chr == '\n') {
PutChar('\n');
chr++;
}
...

Fail to Read Through Shared Memory

I am trying to publish some values over shared memory, but for some reason the reader doesn't pick up what the sender has written.
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <atomic>
#include <cstdint>
#include <cstdio>
#include <stdexcept>
// Two 64-bit words in a POSIX shared-memory object, used as a one-way mailbox
// between processes:
//   word 0 - "magic": a running sum that changes on every publish()
//   word 1 - the most recently published timestamp
// The mapping is never unmapped; it lives until the process exits.
class SHM {
    uint64_t* _words;
public:
    // Creates (or opens) the shared-memory object, sizes it and maps it.
    // Throws std::runtime_error if any of the system calls fails.
    SHM() : _words(nullptr) {
        const auto size = 4 * 1024 * 1024;
        // A leading '/' is required for a portable shm_open name.
        const int handle = shm_open("/myTest", O_RDWR | O_CREAT, 0666);
        if (handle == -1) {
            throw std::runtime_error("SHM: shm_open failed");
        }
        if (-1 == ftruncate(handle, size)) {
            close(handle);
            throw std::runtime_error("SHM: ftruncate failed");
        }
        void* const mapping = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, handle, 0);
        if (mapping == MAP_FAILED) {
            close(handle);
            throw std::runtime_error("SHM: mmap failed");
        }
        const int rc = fchmod(handle, 0666);
        // The mapping stays valid after the descriptor is closed.
        close(handle);
        if (rc == -1) {
            throw std::runtime_error("SHM: fchmod failed");
        }
        _words = static_cast<uint64_t*>(mapping);
    }

    // Checks whether the shared magic differs from `magic`.
    // If so, stores the new magic and the published timestamp into the
    // arguments and returns true; otherwise leaves them untouched and
    // returns false.
    bool read(uint64_t& magic, uint64_t& time) {
        // Atomic acquire load: forces a real memory read on every call and
        // pairs with the release in publish(), so the timestamp written
        // before the magic was bumped is visible here.
        const uint64_t newVal = __atomic_load_n(&_words[0], __ATOMIC_ACQUIRE);
        if (newVal != magic) {
            magic = newVal;
            printf("value changed!!!\n");
            time = __atomic_load_n(&_words[1], __ATOMIC_RELAXED);
            return true;
        }
        //printf("old value: %lu\n", newVal);
        return false;
    }

    // Publishes `time`: stores the timestamp first, then bumps the magic so a
    // reader that observes the new magic also observes this timestamp.
    void publish(const uint64_t time) {
        __atomic_store_n(&_words[1], time, __ATOMIC_RELAXED);
        __atomic_fetch_add(&_words[0], time, __ATOMIC_RELEASE);
    }
};
Here is the sender:
#include <ctime>
#include <unistd.h>
#include <cstdlib>
#include <cstdint>
#include "shm.h"
// Sender: publishes the current wall-clock time (in nanoseconds) through the
// shared-memory mailbox 10000 times, pausing 100 microseconds between
// successful publishes.
int main() {
    SHM shm;
    timespec t;
    for (auto i = 0; i < 10000; i++) {
        if (0 == clock_gettime(CLOCK_REALTIME, &t)) {
            // Widen before multiplying so the arithmetic is done in 64 bits
            // even where time_t is narrower.
            const uint64_t v = static_cast<uint64_t>(t.tv_sec) * 1000000000ULL
                             + static_cast<uint64_t>(t.tv_nsec);
            shm.publish(v);
            // uint64_t is not `unsigned long` on every platform; print via
            // an explicit unsigned long long.
            printf("published %llu\n", static_cast<unsigned long long>(v));
            usleep(100);
        }
    }
}
Here is the reader:
#include <iostream>
#include "shm.h"
// Reader: polls the shared-memory mailbox forever and prints every change of
// the magic value together with the timestamp read alongside it.
int main() {
    SHM shm;
    uint64_t magic = 0;
    uint64_t t = 0;
    while (true) {
        if (shm.read(magic, t)) {
            // uint64_t is not `unsigned long` on every platform; print via
            // an explicit unsigned long long.
            printf("%llu, %llu\n",
                   static_cast<unsigned long long>(magic),
                   static_cast<unsigned long long>(t));
        }
    }
}
If I restart the reader, the reader is indeed able to read the last value that the sender has written.
However, if I start the reader first, and then the sender, all the values the sender writes aren't picked up by the reader.
To make this even weirder, if I uncomment the printf statement in SHM::read(), the reader does sometimes pick up the new values.
Any idea?
GCC version:
g++ (GCC) 7.2.1 20170829 (Red Hat 7.2.1-1)
I spotted a couple of issues, however, I am unsure if they would fix your problem.
The name passed to shm_open should start with a / for portability.
In read and publish the casts must not discard volatile. E.g.: const uint64_t newVal = *(uint64_t volatile*)_ptr;. Even better, drop volatile and use std::atomic.
Although there are different processes involved, this is still the case of same objects being accessed by more than one thread of execution and at least one of these threads modifies the shared objects.
I made the above changes. Using std::atomic fixed it:
// Two std::atomic<uint64_t> slots at the start of a POSIX shared-memory
// object: slot 0 is a running "magic" sum that changes on every publish(),
// slot 1 holds the most recently published timestamp.
class SHM {
    void* _ptr;
public:
    // Creates (or opens) the shared-memory object, sizes it and maps it.
    // Throws std::runtime_error if any of the system calls fails.
    SHM() {
        const auto size = 4 * 1024 * 1024;
        const auto handle = shm_open("/myTest", O_RDWR|O_CREAT, 0666);
        if (handle == -1)
            throw std::runtime_error("SHM: shm_open failed");
        if (-1 == ftruncate(handle, size)) {
            close(handle);
            throw std::runtime_error("SHM: ftruncate failed");
        }
        _ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, handle, 0);
        // The mapping stays valid after the descriptor is closed.
        close(handle);
        if (_ptr == MAP_FAILED)
            throw std::runtime_error("SHM: mmap failed");
    }

    // If the shared magic differs from `magic`, stores the new magic and the
    // published timestamp into the arguments and returns true; otherwise
    // returns false and leaves both untouched.
    bool read(uint64_t& magic, uint64_t& time) {
        auto p = static_cast<std::atomic<uint64_t>*>(_ptr);
        const uint64_t newVal = p[0];
        if (newVal != magic) {
            magic = newVal;
            printf("value changed!!!\n");
            time = p[1];
            return true;
        }
        return false;
    }

    // Stores the timestamp before bumping the magic, so a reader that sees
    // the new magic also sees this timestamp rather than the previous one.
    void publish(const uint64_t time) {
        auto p = static_cast<std::atomic<uint64_t>*>(_ptr);
        p[1] = time;
        p[0] += time;
    }
};
// Publishes the current wall-clock time (in nanoseconds) through the
// shared-memory mailbox 10000 times, pausing 100 microseconds between
// successful publishes.
void sender() {
    SHM shm;
    timespec t;
    for (auto i = 0; i < 10000; i++) {
        if (0 == clock_gettime(CLOCK_REALTIME, &t)) {
            // Widen before multiplying so the arithmetic is done in 64 bits
            // even where time_t is narrower.
            const uint64_t v = static_cast<uint64_t>(t.tv_sec) * 1000000000ULL
                             + static_cast<uint64_t>(t.tv_nsec);
            shm.publish(v);
            // uint64_t is not `unsigned long` on every platform; print via
            // an explicit unsigned long long.
            printf("published %llu\n", static_cast<unsigned long long>(v));
            usleep(100);
        }
    }
}
// Polls the shared-memory mailbox forever and prints every change of the
// magic value together with the timestamp read alongside it.
void reader() {
    SHM shm;
    uint64_t magic = 0;
    uint64_t t = 0;
    while (true) {
        if (shm.read(magic, t)) {
            // uint64_t is not `unsigned long` on every platform; print via
            // an explicit unsigned long long.
            printf("%llu, %llu\n",
                   static_cast<unsigned long long>(magic),
                   static_cast<unsigned long long>(t));
        }
    }
}
// Runs as the reader when at least one command-line argument is given,
// otherwise as the sender.
int main(int ac, char**) {
    const bool reader_mode = (ac > 1);
    if (!reader_mode) {
        sender();
    } else {
        reader();
    }
}
With std::atomic you can have more control. E.g.:
// Layout of the record that read()/publish() access at the start of the
// mapping. Member order is part of the layout every process maps, so it
// must not be rearranged.
struct Data {
// Last published timestamp; written by publish() before `generation` is bumped.
std::atomic<uint64_t> time;
// Change marker: publish() adds to it with release ordering, read() loads it
// with acquire ordering to detect a new value.
std::atomic<uint64_t> generation;
};
// ...
bool read(uint64_t& generation, uint64_t& time) {
auto data = static_cast<Data*>(_ptr);
auto new_generation = data->generation.load(std::memory_order_acquire); // 1. Syncronizes with (2).
if(generation == new_generation)
return false;
generation = new_generation;
time = data->time.load(std::memory_order_relaxed);
printf("value changed!!!\n");
return true;
}
void publish(const uint64_t time) {
auto data = static_cast<Data*>(_ptr);
data->time.store(time, std::memory_order_relaxed);
data->generation.fetch_add(time, std::memory_order_release); // 2. (1) Synchronises with this store.
}