Sleep on Windows 8.1 x64 always lasts 1 millisecond longer than requested. For instance, Sleep(1) lasts approximately 2 milliseconds, Sleep(2) about 3, and so on. timeBeginPeriod is set to 1. On Windows 7 it works as expected (without the excess millisecond). Is this behaviour normal, and is it possible to fix?
#include <Windows.h>
#include <stdio.h>
#include <stdlib.h> // EXIT_SUCCESS

#pragma comment(lib, "winmm.lib")

LARGE_INTEGER Frequency;

long long int GetCurrent()
{
    LARGE_INTEGER counter;
    QueryPerformanceCounter(&counter);
    return (1000000 * counter.QuadPart / Frequency.QuadPart); // microseconds
}

int CALLBACK WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine, int nCmdShow)
{
    timeBeginPeriod(1);
    QueryPerformanceFrequency(&Frequency);
    const unsigned int count = 1000;
    long long int buffer[count];
    long long int lastTime = GetCurrent(), currentTime;
    for (unsigned int i = 0; i < count; i++)
    {
        currentTime = GetCurrent();
        buffer[i] = currentTime - lastTime;
        lastTime = currentTime;
        Sleep(1);
    }
    timeEndPeriod(1);
    FILE *file = fopen("log.txt", "w");
    for (unsigned int i = 0; i < count; i++)
        fprintf(file, "%lld\n", buffer[i]); // %lld for long long
    fclose(file);
    return EXIT_SUCCESS;
}
The NtDelayExecution workaround, thanks to Mehrdad:
static NTSTATUS (__stdcall *NtDelayExecution)(BOOL Alertable, PLARGE_INTEGER DelayInterval) =
    (NTSTATUS (__stdcall*)(BOOL, PLARGE_INTEGER))
    GetProcAddress(GetModuleHandle(L"ntdll.dll"), "NtDelayExecution");

LARGE_INTEGER delay;
unsigned int milliseconds = 1;
// Negative values are relative delays in 100 ns units; request one millisecond
// less to compensate for the extra millisecond added by the scheduler.
delay.QuadPart = (milliseconds > 1) ? -10000LL * (milliseconds - 1) : -1LL;
NtDelayExecution(false, &delay);
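For convenience, here is a minimal self-contained sketch that packages the call as a Sleep-style helper. The SleepShort wrapper and its name are my own illustration, not part of the original workaround:

#include <Windows.h>
#include <winternl.h> // NTSTATUS

typedef NTSTATUS (__stdcall *NtDelayExecutionPtr)(BOOL Alertable, PLARGE_INTEGER DelayInterval);

// Hypothetical helper (not from the original post): sleeps for the given number
// of milliseconds, compensating for the extra millisecond observed on Windows 8.1.
static void SleepShort(unsigned int milliseconds)
{
    static NtDelayExecutionPtr pNtDelayExecution = NULL;
    LARGE_INTEGER delay;

    if (!pNtDelayExecution)
        pNtDelayExecution = (NtDelayExecutionPtr)GetProcAddress(
            GetModuleHandleW(L"ntdll.dll"), "NtDelayExecution");

    // Negative intervals are relative delays in 100 ns units.
    delay.QuadPart = (milliseconds > 1) ? -10000LL * (milliseconds - 1) : -1LL;
    pNtDelayExecution(FALSE, &delay);
}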
I am trying to calculate CPU usage per core using C++ and WinAPI, but I have not been able to do it properly. So far I have managed the following:
#include <stdio.h>
#include <stdlib.h>
#include <tchar.h>
#include <windows.h>

#define SystemProcessorPerformanceInformation 0x8
#define SystemBasicInformation 0x0

int _tmain(int argc, _TCHAR* argv[])
{
    typedef struct _SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION
    {
        LARGE_INTEGER IdleTime;
        LARGE_INTEGER KernelTime;
        LARGE_INTEGER UserTime;
        LARGE_INTEGER Reserved1[2];
        ULONG Reserved2;
    } SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION;

    typedef struct _SYSTEM_BASIC_INFORMATION {
        ULONG Reserved;
        ULONG TimerResolution;
        ULONG PageSize;
        ULONG NumberOfPhysicalPages;
        ULONG LowestPhysicalPageNumber;
        ULONG HighestPhysicalPageNumber;
        ULONG AllocationGranularity;
        ULONG_PTR MinimumUserModeAddress;
        ULONG_PTR MaximumUserModeAddress;
        KAFFINITY ActiveProcessorsAffinityMask;
        CCHAR NumberOfProcessors;
    } SYSTEM_BASIC_INFORMATION, *PSYSTEM_BASIC_INFORMATION;

    if (argc < 2)
    {
        printf("Please specify waiting time in seconds\n");
        return -1;
    }
    int nWaitSec = _wtoi(argv[1]);
    if (nWaitSec <= 0)
    {
        printf("Waiting interval in seconds should be a positive integer\n");
        return -1;
    }

    typedef DWORD (WINAPI *PNTQUERYSYSTEMINFORMATION)(DWORD info_class, void *out, DWORD size, DWORD *out_size);
    PNTQUERYSYSTEMINFORMATION pNtQuerySystemInformation =
        (PNTQUERYSYSTEMINFORMATION)GetProcAddress(GetModuleHandle(L"NTDLL.DLL"), "NtQuerySystemInformation");

    SYSTEM_BASIC_INFORMATION sbi;
    SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION *spi;
    DWORD returnlength;
    DWORD status = pNtQuerySystemInformation(SystemBasicInformation, &sbi,
        sizeof(SYSTEM_BASIC_INFORMATION), &returnlength);

    spi = new SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION[sbi.NumberOfProcessors];
    memset(spi, 0, sizeof(SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION) * sbi.NumberOfProcessors);
    status = pNtQuerySystemInformation(SystemProcessorPerformanceInformation, spi,
        sizeof(SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION) * sbi.NumberOfProcessors, &returnlength);
    int numberOfCores = returnlength / sizeof(SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION);
    printf("Number of cores: %d\n", numberOfCores);

    static ULARGE_INTEGER ul_sys_idle_old[32];
    static ULARGE_INTEGER ul_sys_kernel_old[32];
    static ULARGE_INTEGER ul_sys_user_old[32];
    float usage = 0;
    float usageAccum = 0;

    printf("\n\nWait for %d seconds\n", nWaitSec);
    Sleep(nWaitSec * 1000);

    status = pNtQuerySystemInformation(SystemProcessorPerformanceInformation, spi,
        sizeof(SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION) * numberOfCores, &returnlength);
    usageAccum = 0;
    for (int ii = 0; ii < numberOfCores; ii++)
    {
        ULARGE_INTEGER ul_sys_idle;
        ULARGE_INTEGER ul_sys_kernel;
        ULARGE_INTEGER ul_sys_user;
        ul_sys_idle.QuadPart = spi[ii].IdleTime.QuadPart;
        ul_sys_kernel.QuadPart = spi[ii].KernelTime.QuadPart;
        ul_sys_user.QuadPart = spi[ii].UserTime.QuadPart;
        ULONGLONG kernelTime = (ul_sys_kernel.QuadPart - ul_sys_kernel_old[ii].QuadPart);
        ULONGLONG usertime = (ul_sys_user.QuadPart - ul_sys_user_old[ii].QuadPart);
        ULONGLONG idletime = (ul_sys_idle.QuadPart - ul_sys_idle_old[ii].QuadPart);
        ULONGLONG proctime = kernelTime + usertime - idletime;
        ULONGLONG totaltime = kernelTime + usertime;
        usage = (float)(proctime * 100) / totaltime;
        usageAccum += usage;
        printf("Core : %u: Usage : %f%%\n", ii + 1, usage);
    }
    usageAccum /= numberOfCores;
    printf("----------------\nAverage for the last %d seconds: %f", nWaitSec, usageAccum);
    delete[] spi;
    return 0;
}
Despite that, the usage seems to be calculated the wrong way, because the output data hardly changes at all. How could I find CPU usage per core appropriately?
These three arrays are never updated in your source code, so their values are always 0 in the loop.
static ULARGE_INTEGER ul_sys_idle_old[32];
static ULARGE_INTEGER ul_sys_kernel_old[32];
static ULARGE_INTEGER ul_sys_user_old[32];
What you get is therefore the CPU time accumulated over the machine's entire uptime; that base is very large, so the computed usage is basically unchanged from sample to sample.
You can refer to Correct way to get Windows CPU utilization for multiprocessor and update your arrays between samples to get the correct result.
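For instance, a sketch of the two-sample approach (illustrative only, reusing the structures and variables from the question):

// Illustrative: sample the per-core times twice and compute usage from the deltas.
SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION *before = new SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION[numberOfCores];
SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION *after = new SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION[numberOfCores];
pNtQuerySystemInformation(SystemProcessorPerformanceInformation, before,
    sizeof(SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION) * numberOfCores, &returnlength);
Sleep(nWaitSec * 1000);
pNtQuerySystemInformation(SystemProcessorPerformanceInformation, after,
    sizeof(SYSTEM_PROCESSOR_PERFORMANCE_INFORMATION) * numberOfCores, &returnlength);
for (int i = 0; i < numberOfCores; i++)
{
    ULONGLONG kernel = after[i].KernelTime.QuadPart - before[i].KernelTime.QuadPart;
    ULONGLONG user = after[i].UserTime.QuadPart - before[i].UserTime.QuadPart;
    ULONGLONG idle = after[i].IdleTime.QuadPart - before[i].IdleTime.QuadPart;
    ULONGLONG total = kernel + user; // KernelTime already includes IdleTime
    double usage = total ? (double)(total - idle) * 100.0 / total : 0.0;
    printf("Core %d: Usage %f%%\n", i + 1, usage);
}
delete[] before;
delete[] after;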
BOOL QueryPerformanceCounter(
__out LARGE_INTEGER *lpPerformanceCount
);
LARGE_INTEGER startTimer()
{
    LARGE_INTEGER start;
    DWORD_PTR oldmask = SetThreadAffinityMask(GetCurrentThread(), 0);
    QueryPerformanceCounter(&start);
    SetThreadAffinityMask(GetCurrentThread(), oldmask);
    return start;
}
LARGE_INTEGER endTimer()
{
    LARGE_INTEGER stop;
    DWORD_PTR oldmask = SetThreadAffinityMask(GetCurrentThread(), 0);
    QueryPerformanceCounter(&stop);
    SetThreadAffinityMask(GetCurrentThread(), oldmask);
    return stop;
}
I'm using these functions, but I'm not sure what type they return their values in.
endTimer - startTimer = ? How do I convert this result to get seconds?
You need to get the counter frequency and divide the difference between your two counter values by it.
LARGE_INTEGER fr, t1, t2;
QueryPerformanceCounter(&t1);
// some lengthy code ...
QueryPerformanceCounter(&t2);
QueryPerformanceFrequency(&fr); // counts per second
double diff_sec = (t2.QuadPart - t1.QuadPart) / (double)fr.QuadPart;
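If this comes up often, it can be wrapped in a small helper; a sketch (the name ElapsedSeconds is mine, not from the answer):

// Small reusable helper (illustrative): elapsed seconds between two counter samples.
double ElapsedSeconds(LARGE_INTEGER start, LARGE_INTEGER stop)
{
    LARGE_INTEGER fr;
    QueryPerformanceFrequency(&fr); // counts per second, constant at runtime
    return (double)(stop.QuadPart - start.QuadPart) / (double)fr.QuadPart;
}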
This is a follow-up to an earlier question.
I guess I don't understand the Interlocked Acquire/Release APIs. I put together the small program below. As I understand it, g_val_1, g_val_2, and g_val_3 should always be updated in the same order and should all end up with the same value. But they do not (for more than one thread).
What am I missing? Thanks.
#include "windows.h"
#include "stdio.h"
#define _THREADS_ 100
#define _TICKS_ 1000
int volatile g_threads = 0;
DWORD volatile g_val_1 = 0;
DWORD volatile g_val_2 = 0;
DWORD volatile g_val_3 = 0;
BOOL g_running = TRUE;
DWORD TestThread(PVOID ignore)
{
while (g_running)
{
InterlockedIncrementAcquire(&g_val_1);
g_val_2++;
InterlockedIncrementRelease(&g_val_3);
}
InterlockedDecrement(&g_threads);
return(0);
}
int __cdecl main(int argc, char* argv[])
{
int th, duration;
int success;
DWORD ticks;
duration = _TICKS_;
g_threads = _THREADS_;
printf("Max=%d Threads=%d Entries=%d\n", duration, g_threads);
printf("Creating Threads\n");
th = g_threads;
while (th-- > 0)
{
CreateThread(NULL,
0,
TestThread,
NULL,
NORMAL_PRIORITY_CLASS,
NULL);
}
printf("Starting Threads\n");
ticks = GetTickCount();
while ((GetTickCount() - ticks) < duration);
g_running = FALSE;
while (g_threads > 0);
ticks = GetTickCount() - ticks;
success = ((g_val_1 == g_val_2) && (g_val_1 == g_val_2));
printf("Duration=%d g_val_1=%d g_val_2=%d g_val_3=%d OK=%d\n", ticks, g_val_1, g_val_2, g_val_3, success);
}
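For what it's worth, here is a sketch (my own illustration, not from the original thread) of the usual resolution: acquire/release only constrains ordering and visibility; it does not make the plain g_val_2++ atomic, so concurrent threads can lose updates to it. Making the middle increment interlocked as well keeps all three counters in step:

// Illustrative fix: all three updates are atomic read-modify-writes,
// so no increments are lost and the final values match.
DWORD WINAPI TestThreadFixed(PVOID ignore)
{
    while (g_running)
    {
        InterlockedIncrementAcquire(&g_val_1);
        InterlockedIncrement(&g_val_2); // was a plain, non-atomic g_val_2++
        InterlockedIncrementRelease(&g_val_3);
    }
    InterlockedDecrement(&g_threads);
    return 0;
}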
I've tried this code in C++ on a Win7 x64 platform with MSVC++, and I got a CPU frequency of about 2,900,000 ticks per second.
When I run this program, my stopwatch returns about 10,000,000 ticks, which would mean it takes about 4 seconds to run my program, but the results are ready in 1 second (or less) O_o.
Could you please tell me what is wrong with my code?
#include <iostream>
#include "header.h"
#include <fstream>
#include <string>
#include <sstream>
#include <strsafe.h>
#include <direct.h>
#include <string.h>
using namespace std;

#define CV_TO_NANO 1000000000
#define CV_TO_MICRO 1000000
#define CV_TO_MILLI 1000

// x86 only: returns the TSC in edx:eax (MSVC inline assembly).
unsigned __int64 inline GetRDTSC()
{
    __asm
    {
        ; Flush the pipeline
        XOR eax, eax
        CPUID
        ; Get RDTSC counter in edx:eax
        RDTSC
    }
}

unsigned __int64 RunTest(TCHAR *AppName, TCHAR *CmdLine);

int main()
{
    unsigned __int64 start = 0;
    unsigned __int64 stop = 0;
    unsigned __int64 freq = 0;
    float rps;
    ofstream dataFile;

    // get the performance counter frequency (counts per second)
    QueryPerformanceFrequency((LARGE_INTEGER *)&freq);
    cout << "freq (count per second): " << freq << endl;

    // period of one count in seconds
    rps = (float)(1.0 / freq);
    cout << "rps (1/freq): " << rps << endl;

    dataFile.open("d:/dataC.txt", ios::out);
    for (int i = 0; i < 200; i++)
    {
        SetProcessAffinityMask(GetCurrentProcess(), 0x0001);
        SetThreadAffinityMask(GetCurrentThread(), 0x0001);
        cout << RunTest(L"D:\\Child\\Child.exe", NULL);
    }
    getchar();
    return 0;
}

unsigned __int64 RunTest(TCHAR *AppName, TCHAR *CmdLine)
{
    unsigned __int64 start = 0;
    unsigned __int64 stop = 0;
    PROCESS_INFORMATION processInformation;
    STARTUPINFO startupInfo;
    memset(&processInformation, 0, sizeof(processInformation));
    memset(&startupInfo, 0, sizeof(startupInfo));
    startupInfo.cb = sizeof(startupInfo);
    BOOL result;

    start = GetRDTSC();
    result = ::CreateProcess(AppName, CmdLine, NULL, NULL, FALSE, REALTIME_PRIORITY_CLASS, NULL, NULL, &startupInfo, &processInformation);
    stop = GetRDTSC();
    getchar();

    if (result == 0)
    {
        wprintf(L"ERROR: CreateProcess failed!");
    }
    else
    {
        WaitForSingleObject(processInformation.hProcess, 0);
        CloseHandle(processInformation.hProcess);
        CloseHandle(processInformation.hThread);
    }
    return stop - start;
}
I think you have a misconception here that QueryPerformanceFrequency is telling you something about the speed of your processor - it isn't. QueryPerformanceFrequency retrieves the frequency of the high-resolution performance counter, which is not guaranteed to have any predictable relationship to your CPU clock speed. This value needs to be used in conjunction with QueryPerformanceCounter in order to get quality timing values, not with assembly that directly queries the RDTSC.
Here is an example of how to use the high-frequency timer to time a block of code:
#include <Windows.h>
#include <cstdlib> // rand
#include <iostream>
using namespace std;

int main()
{
    LARGE_INTEGER li = {};
    __int64 freq, start, stop;

    QueryPerformanceFrequency(&li);
    freq = li.QuadPart;
    cout << "Counter Frequency: " << freq << "\n";

    QueryPerformanceCounter(&li);
    start = li.QuadPart;
    for (int i = 0; i < 1000000; ++i)
    {
        int n = i * rand(); // rand() has side effects, so the loop is not optimized away
    }
    QueryPerformanceCounter(&li);
    stop = li.QuadPart;

    double elapsed_seconds = static_cast<double>(stop - start) / static_cast<double>(freq);
    cout << "Elapsed Time: " << elapsed_seconds << " seconds\n";
}
I have the following code running on QNX Momentics.
#define BILLION 1000000000L

struct timespec start_time;
struct timespec stop_time;

void MyTestFunc() {
    // Initialize the test start time
    clock_gettime(CLOCK_REALTIME, &start_time);
    // ... additional code.
    cout << "The execution time of func " << calculateExecutionTime();
}

double calculateExecutionTime()
{
    clock_gettime(CLOCK_REALTIME, &stop_time);
    double dSeconds = (stop_time.tv_sec - start_time.tv_sec);
    double dNanoSeconds = (double)(stop_time.tv_nsec - start_time.tv_nsec) / BILLION;
    return dSeconds + dNanoSeconds;
}
Now I want to port the above code to Windows. Can anyone provide sample code?
Thanks!
You can implement a clock_gettime() replacement for Windows as follows:
LARGE_INTEGER
getFILETIMEoffset()
{
    SYSTEMTIME s;
    FILETIME f;
    LARGE_INTEGER t;

    s.wYear = 1970;
    s.wMonth = 1;
    s.wDay = 1;
    s.wHour = 0;
    s.wMinute = 0;
    s.wSecond = 0;
    s.wMilliseconds = 0;
    SystemTimeToFileTime(&s, &f);
    t.QuadPart = f.dwHighDateTime;
    t.QuadPart <<= 32;
    t.QuadPart |= f.dwLowDateTime;
    return (t);
}

int
clock_gettime(int X, struct timeval *tv)
{
    LARGE_INTEGER t;
    FILETIME f;
    double microseconds;
    static LARGE_INTEGER offset;
    static double frequencyToMicroseconds;
    static int initialized = 0;
    static BOOL usePerformanceCounter = 0;

    if (!initialized) {
        LARGE_INTEGER performanceFrequency;
        initialized = 1;
        usePerformanceCounter = QueryPerformanceFrequency(&performanceFrequency);
        if (usePerformanceCounter) {
            QueryPerformanceCounter(&offset);
            frequencyToMicroseconds = (double)performanceFrequency.QuadPart / 1000000.;
        } else {
            offset = getFILETIMEoffset();
            frequencyToMicroseconds = 10.; // FILETIME is in 100 ns units
        }
    }
    if (usePerformanceCounter) QueryPerformanceCounter(&t);
    else {
        GetSystemTimeAsFileTime(&f);
        t.QuadPart = f.dwHighDateTime;
        t.QuadPart <<= 32;
        t.QuadPart |= f.dwLowDateTime;
    }

    t.QuadPart -= offset.QuadPart;
    microseconds = (double)t.QuadPart / frequencyToMicroseconds;
    t.QuadPart = microseconds;
    tv->tv_sec = t.QuadPart / 1000000;
    tv->tv_usec = t.QuadPart % 1000000;
    return (0);
}
Avoiding the PerformanceCounter mess, simple code:
struct timespec { long tv_sec; long tv_nsec; }; // header part

int clock_gettime(int, struct timespec *spec)   // C-file part
{
    __int64 wintime;
    GetSystemTimeAsFileTime((FILETIME*)&wintime);
    wintime -= 116444736000000000i64;           // 1 Jan 1601 to 1 Jan 1970
    spec->tv_sec = wintime / 10000000i64;       // seconds
    spec->tv_nsec = wintime % 10000000i64 *100; // nanoseconds
    return 0;
}
...is a fast, reliable, and correct porting solution, with an impressive 100 ns precision (1 ms / 10000).
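For illustration, a possible usage (my own snippet, assuming the function above is in scope):

// Hypothetical usage: print the current Unix time with 100 ns granularity.
struct timespec ts;
clock_gettime(0, &ts); // the clock id is ignored by this port
printf("%ld.%09ld\n", ts.tv_sec, ts.tv_nsec);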
And a QPC-based solution, whose precision will possibly (on some hardware) be even better:
struct timespec { long tv_sec; long tv_nsec; };  // header part

#define exp7 10000000i64                         // 1E+7  // C-file part
#define exp9 1000000000i64                       // 1E+9
#define w2ux 116444736000000000i64               // 1 Jan 1601 to 1 Jan 1970

void unix_time(struct timespec *spec)
{
    __int64 wintime;
    GetSystemTimeAsFileTime((FILETIME*)&wintime);
    wintime -= w2ux;
    spec->tv_sec = wintime / exp7;
    spec->tv_nsec = wintime % exp7 * 100;
}

int clock_gettime(int, timespec *spec)
{
    static struct timespec startspec;
    static double ticks2nano;
    static __int64 startticks, tps = 0;
    __int64 tmp, curticks;

    QueryPerformanceFrequency((LARGE_INTEGER*)&tmp); // some strange systems can
    if (tps != tmp) {                                // possibly change the frequency ?
        tps = tmp;                                   // init ~~ONCE
        QueryPerformanceCounter((LARGE_INTEGER*)&startticks);
        unix_time(&startspec);
        ticks2nano = (double)exp9 / tps;
    }
    QueryPerformanceCounter((LARGE_INTEGER*)&curticks);
    curticks -= startticks;
    spec->tv_sec = startspec.tv_sec + (curticks / tps);
    spec->tv_nsec = startspec.tv_nsec + (double)(curticks % tps) * ticks2nano;
    if (!(spec->tv_nsec < exp9)) { spec->tv_sec++; spec->tv_nsec -= exp9; }
    return 0;
}
My improved version of clock_gettime() using QueryPerformanceCounter().
#define BILLION (1E9)

static BOOL g_first_time = 1;
static LARGE_INTEGER g_counts_per_sec;

int clock_gettime(int dummy, struct timespec *ct)
{
    LARGE_INTEGER count;

    if (g_first_time)
    {
        g_first_time = 0;
        if (0 == QueryPerformanceFrequency(&g_counts_per_sec))
        {
            g_counts_per_sec.QuadPart = 0;
        }
    }

    if ((NULL == ct) || (g_counts_per_sec.QuadPart <= 0) ||
        (0 == QueryPerformanceCounter(&count)))
    {
        return -1;
    }

    ct->tv_sec = count.QuadPart / g_counts_per_sec.QuadPart;
    ct->tv_nsec = ((count.QuadPart % g_counts_per_sec.QuadPart) * BILLION) / g_counts_per_sec.QuadPart;
    return 0;
}
I think my version is an improvement over the currently accepted answer using QueryPerformanceCounter(), because -
More robust - checks the return values of functions, as well as the value returned in the pass-by-reference variable.
More robust - checks the validity of the input parameter.
More streamlined - uses only as many variables as necessary (3 vs. 7).
More streamlined - avoids the code path involving GetSystemTimeAsFileTime(), since QueryPerformanceFrequency() and QueryPerformanceCounter() are guaranteed to work on systems that run Windows XP or later.
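For illustration, a sketch of timing a block with the function above (my own snippet, not part of the answer):

// Hypothetical usage: measure an interval with the clock_gettime() above.
struct timespec t1, t2;
clock_gettime(0, &t1); // the clock id is unused here
// ... code being measured ...
clock_gettime(0, &t2);
double seconds = (double)(t2.tv_sec - t1.tv_sec)
               + (double)(t2.tv_nsec - t1.tv_nsec) / BILLION;
printf("elapsed: %f s\n", seconds);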
A full-featured and fully-tested implementation of clock_gettime() has been in mingw-w64 for many years now. You'll have to use a toolchain with mingw64/msys2 to use this, with the header #include <time.h> (on Windows). If you're writing a codebase that's portable between Linux and Windows, and you can't find clock_gettime() in <time.h> for your Linux builds, I'd suggest you try #include <pthread_time.h>, compiling with -pthread, or linking with -lrt.
See also question 60020968 for Windows builds; and 33846055, 538609 for your Linux builds.
I needed both monotonic and realtime clocks.
For the monotonic clock, I just take the performance counter, since a wall-clock baseline is meaningless.
#define MS_PER_SEC 1000ULL // MS = milliseconds
#define US_PER_MS 1000ULL // US = microseconds
#define HNS_PER_US 10ULL // HNS = hundred-nanoseconds (e.g., 1 hns = 100 ns)
#define NS_PER_US 1000ULL
#define HNS_PER_SEC (MS_PER_SEC * US_PER_MS * HNS_PER_US)
#define NS_PER_HNS (100ULL) // NS = nanoseconds
#define NS_PER_SEC (MS_PER_SEC * US_PER_MS * NS_PER_US)
int clock_gettime_monotonic(struct timespec *tv)
{
    static LARGE_INTEGER ticksPerSec;
    LARGE_INTEGER ticks;

    if (!ticksPerSec.QuadPart) {
        QueryPerformanceFrequency(&ticksPerSec);
        if (!ticksPerSec.QuadPart) {
            errno = ENOTSUP;
            return -1;
        }
    }

    QueryPerformanceCounter(&ticks);
    tv->tv_sec = (long)(ticks.QuadPart / ticksPerSec.QuadPart);
    tv->tv_nsec = (long)(((ticks.QuadPart % ticksPerSec.QuadPart) * NS_PER_SEC) / ticksPerSec.QuadPart);
    return 0;
}
And the wall clock, based on GMT (unlike the tempting and similar _ftime() function).
int clock_gettime_realtime(struct timespec *tv)
{
    FILETIME ft;
    ULARGE_INTEGER hnsTime;

    GetSystemTimePreciseAsFileTime(&ft);
    hnsTime.LowPart = ft.dwLowDateTime;
    hnsTime.HighPart = ft.dwHighDateTime;

    // To get POSIX Epoch as baseline, subtract the number of hns intervals from Jan 1, 1601 to Jan 1, 1970.
    hnsTime.QuadPart -= (11644473600ULL * HNS_PER_SEC);

    // modulus by hns intervals per second first, then convert to ns, so as not to lose resolution
    tv->tv_nsec = (long)((hnsTime.QuadPart % HNS_PER_SEC) * NS_PER_HNS);
    tv->tv_sec = (long)(hnsTime.QuadPart / HNS_PER_SEC);
    return 0;
}
And then the POSIX-compatible function... see the POSIX header for the typedef and macros.
int clock_gettime(clockid_t type, struct timespec *tp)
{
    if (type == CLOCK_MONOTONIC)
    {
        return clock_gettime_monotonic(tp);
    }
    else if (type == CLOCK_REALTIME)
    {
        return clock_gettime_realtime(tp);
    }
    errno = ENOTSUP;
    return -1;
}
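A possible usage sketch (mine, assuming <windows.h> and the functions above are in scope; Sleep() stands in for the work being measured):

// Hypothetical usage: interval timing with the monotonic clock above.
struct timespec a, b;
clock_gettime(CLOCK_MONOTONIC, &a);
Sleep(100); // stand-in for the work being measured
clock_gettime(CLOCK_MONOTONIC, &b);
double ms = (b.tv_sec - a.tv_sec) * 1e3 + (b.tv_nsec - a.tv_nsec) / 1e6;
printf("elapsed: %.3f ms\n", ms);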
You can use timespec_get to implement a simple clock_gettime.
(The timespec_get function is available since C11.)
int clock_gettime(int, struct timespec *tv)
{
    return timespec_get(tv, TIME_UTC);
}
... but the resulting timespec has about 10 millisecond resolution on my Windows 7 64-bit machine. :(
Here is my version of clock_gettime.
int clock_gettime(int, struct timespec *tv)
{
    static int initialized = 0;
    static LARGE_INTEGER freq, startCount;
    static struct timespec tv_start;
    LARGE_INTEGER curCount;
    time_t sec_part;
    long nsec_part;

    if (!initialized) {
        QueryPerformanceFrequency(&freq);
        QueryPerformanceCounter(&startCount);
        timespec_get(&tv_start, TIME_UTC);
        initialized = 1;
    }

    QueryPerformanceCounter(&curCount);
    curCount.QuadPart -= startCount.QuadPart;
    sec_part = curCount.QuadPart / freq.QuadPart;
    nsec_part = (long)((curCount.QuadPart - (sec_part * freq.QuadPart))
                       * 1000000000UL / freq.QuadPart);

    tv->tv_sec = tv_start.tv_sec + sec_part;
    tv->tv_nsec = tv_start.tv_nsec + nsec_part;
    if (tv->tv_nsec >= 1000000000UL) {
        tv->tv_sec += 1;
        tv->tv_nsec -= 1000000000UL;
    }
    return 0;
}