The page size on Linux is 4 KB, but when I access a new page the latency is roughly twice as large as an access within an already-touched page. How can I reduce this latency?
Here is my test code. I use clock_gettime to measure the time cost in the main function.
#include <cstdio>
#include <ctime>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#define MEM_SIZE (4096 * 12)

long long GetRT() {
    struct timespec tp;
    clock_gettime(CLOCK_REALTIME, &tp);
    return (long long) tp.tv_sec * 1000000000 + tp.tv_nsec;
}

void *InitSharedMemory() {
    int fd = shm_open("/test-steve", O_CREAT | O_RDWR, ACCESSPERMS);
    if (fd < 0) {
        perror("shm_open");
    }
    if (ftruncate(fd, MEM_SIZE) < 0) {
        perror("ftruncate");
    }
    void *mmap_ptr = mmap(NULL, MEM_SIZE, PROT_READ | PROT_WRITE,
                          MAP_SHARED | MAP_POPULATE, fd, 0);
    if (mmap_ptr == MAP_FAILED) {
        perror("mmap");
    }
    return mmap_ptr;
}

int main()
{
    constexpr auto STEP = 512;
    char arr[STEP];
    for (int i = 0; i < STEP; ++i) {
        arr[i] = i;
    }
    int ret = 0;
    void *buffer = InitSharedMemory();
    for (int i = 0; i < 40; ++i) {
        if ((i % 0x8) == 0) {
            // *(char *) buffer = 'a';
        }
        auto t1 = GetRT();
        *(char *) buffer = 'a';
        auto t2 = GetRT();
        ret += t2 - t1;
        printf("cost: %lld ns\n", t2 - t1);
        buffer = (char *) buffer + STEP;
    }
    return ret;
}
The result looks like the following. Is the extra latency caused by TLB misses, and how can I avoid it?
cost: 272 ns
cost: 73 ns
cost: 76 ns
cost: 74 ns
cost: 74 ns
cost: 75 ns
cost: 73 ns
cost: 76 ns
cost: 281 ns
cost: 74 ns
cost: 73 ns
cost: 76 ns
cost: 76 ns
cost: 74 ns
cost: 73 ns
cost: 76 ns
cost: 334 ns
cost: 76 ns
cost: 76 ns
cost: 76 ns
cost: 76 ns
cost: 73 ns
cost: 76 ns
cost: 76 ns
cost: 267 ns
cost: 74 ns
cost: 74 ns
cost: 75 ns
cost: 73 ns
cost: 76 ns
cost: 73 ns
cost: 76 ns
cost: 260 ns
I am afraid that you cannot avoid the cost of the first hit to a page (as illustrated by your example).
Maybe this cost could be amortised by using huge pages (2 MB) with MAP_HUGETLB in the flags of mmap().
Depending on the application context, touching the first byte (or any byte) of each page once and for all, right after the allocation, will force the pages to be committed (this may also be useful on NUMA hardware).
This will have a large initial cost but could lead to more predictable timings for the subsequent accesses.
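To make the second idea concrete, here is a minimal sketch (not part of the original answer) that pre-touches one byte per page right after the mapping is created, plus a hypothetical huge-page allocation; MAP_HUGETLB requires huge pages to be reserved on the system beforehand (e.g. via /proc/sys/vm/nr_hugepages):

```cpp
#include <sys/mman.h>
#include <cstddef>

// Touch one byte per 4 KB page so the first-touch cost (page fault, page-table
// walk) is paid up front instead of inside the timed accesses.
void PreTouch(const void *buf, size_t size, size_t page_size = 4096) {
    const volatile char *p = static_cast<const volatile char *>(buf);
    for (size_t off = 0; off < size; off += page_size) {
        char sink = p[off];   // one read per page is enough to fault it in
        (void)sink;
    }
}

// Hypothetical anonymous allocation backed by 2 MB huge pages; mmap() fails
// with ENOMEM if no huge pages have been reserved by the administrator.
void *AllocHuge(size_t size) {
    return mmap(nullptr, size, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
}
```

Note that the question's mmap() already passes MAP_POPULATE, which should pre-fault the pages, so if the spikes persist they are more likely TLB misses than page faults; something like perf stat -e dTLB-load-misses,dTLB-store-misses around the run can help tell the two apart.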
Related
I am writing a grid-stride loop for high-performance calculations with a very large N, for example long long N = 1 << 36, or even more. From the total grid I only need some indices, namely those that satisfy a defined condition.
__global__ void Indexes(int *array, int N) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    while (index < N)
    {
        if (condition)
        {
            // ... do something to save index in array
        }
        index += blockDim.x * gridDim.x;
    }
}
Of course, it is possible to use Thrust, which provides both host and device arrays. But in this case the calculation would obviously be extremely inefficient, because it would first have to create a lot of unneeded elements and then delete them.
What is the most efficient way to save the indices directly into a device array so they can be passed to the CPU?
If your output is relatively dense (i.e. a lot of indices and relatively few zeros), then the stream compaction approach suggested in comments is a good solution. There are a lot of ready-to-go stream compaction implementations which you can probably adapt to your purposes.
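For the dense case, a minimal sketch (my illustration, not part of the original answer) of compacting indices with thrust::copy_if over a counting iterator; the predicate is a placeholder for the real condition:

```cpp
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>

// Placeholder predicate: keep indices whose element matches the condition.
struct MatchesCondition
{
    const int* data;
    __host__ __device__ bool operator()(int idx) const
    {
        return data[idx] % 10000 == 0;   // stand-in for the real condition
    }
};

thrust::device_vector<int> CompactIndices(const thrust::device_vector<int>& input)
{
    thrust::device_vector<int> out(input.size());
    auto end = thrust::copy_if(thrust::counting_iterator<int>(0),
                               thrust::counting_iterator<int>((int)input.size()),
                               out.begin(),
                               MatchesCondition{thrust::raw_pointer_cast(input.data())});
    out.resize(end - out.begin());
    return out;
}
```

CompactIndices(inputs) then returns only the indices whose elements satisfied the condition, with the vector's size giving their count.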
If your output is sparse, so you need to save relatively few indices for a lot of inputs, then stream compaction isn't such a great solution because it will waste a lot of GPU memory. In that case (assuming you can roughly estimate an upper bound on the number of output indices), something like this:
template <typename T>
struct Array
{
    T* p;
    int Nmax;
    int* next;

    Array() = default;

    __host__ __device__
    Array(T* _p, int _Nmax, int* _next) : p(_p), Nmax(_Nmax), next(_next) {};

    __device__
    int append(T& val)
    {
        int pos = atomicAdd(next, 1);
        if (pos > Nmax) {
            atomicExch(next, Nmax);
            return -1;
        } else {
            p[pos] = val;
            return pos;
        }
    };
};
is probably more appropriate. Here, the idea is to use an atomically incremented position in the output array to keep track of where a thread should store its index. The code will signal if you fill the index array, and there will be information from which you can work out a restart strategy to stop the current kernel and then start from the last known index which you were able to store.
A complete example:
$ cat append.cu
#include <iostream>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/copy.h>

namespace AppendArray
{
template <typename T>
struct Array
{
    T* p;
    int Nmax;
    int* next;

    Array() = default;

    __host__ __device__
    Array(T* _p, int _Nmax, int* _next) : p(_p), Nmax(_Nmax), next(_next) {};

    __device__
    int append(T& val)
    {
        int pos = atomicAdd(next, 1);
        if (pos > Nmax) {
            atomicExch(next, Nmax);
            return -1;
        } else {
            p[pos] = val;
            return pos;
        }
    };
};
}

__global__
void kernelfind(int* input, int N, AppendArray::Array<int> indices)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    for (; idx < N; idx += gridDim.x * blockDim.x) {
        if (input[idx] % 10000 == 0) {
            if (indices.append(idx) < 0) return;
        }
    }
}

int main()
{
    const int Ninputs = 1 << 20;
    thrust::device_vector<int> inputs(Ninputs);
    thrust::counting_iterator<int> vals(1);
    thrust::copy(vals, vals + Ninputs, inputs.begin());
    int* d_input = thrust::raw_pointer_cast(inputs.data());

    int Nindices = Ninputs >> 12;
    thrust::device_vector<int> indices(Nindices);
    int* d_indices = thrust::raw_pointer_cast(indices.data());

    int* pos; cudaMallocManaged(&pos, sizeof(int)); *pos = 0;
    AppendArray::Array<int> index(d_indices, Nindices-1, pos);

    int gridsize, blocksize;
    cudaOccupancyMaxPotentialBlockSize(&gridsize, &blocksize, kernelfind, 0, 0);
    kernelfind<<<gridsize, blocksize>>>(d_input, Ninputs, index);
    cudaDeviceSynchronize();

    for (int i = 0; i < *pos; ++i) {
        int idx = indices[i];
        std::cout << i << " " << idx << " " << inputs[idx] << std::endl;
    }
    return 0;
}
$ nvcc -std=c++11 -arch=sm_52 -o append append.cu
$ ./append
0 9999 10000
1 19999 20000
2 29999 30000
3 39999 40000
4 49999 50000
5 69999 70000
6 79999 80000
7 59999 60000
8 89999 90000
9 109999 110000
10 99999 100000
11 119999 120000
12 139999 140000
13 129999 130000
14 149999 150000
15 159999 160000
16 169999 170000
17 189999 190000
18 179999 180000
19 199999 200000
20 209999 210000
21 219999 220000
22 239999 240000
23 249999 250000
24 229999 230000
25 279999 280000
26 269999 270000
27 259999 260000
28 319999 320000
29 329999 330000
30 289999 290000
31 299999 300000
32 339999 340000
33 349999 350000
34 309999 310000
35 359999 360000
36 379999 380000
37 399999 400000
38 409999 410000
39 369999 370000
40 429999 430000
41 419999 420000
42 389999 390000
43 439999 440000
44 459999 460000
45 489999 490000
46 479999 480000
47 449999 450000
48 509999 510000
49 539999 540000
50 469999 470000
51 499999 500000
52 569999 570000
53 549999 550000
54 519999 520000
55 589999 590000
56 529999 530000
57 559999 560000
58 619999 620000
59 579999 580000
60 629999 630000
61 669999 670000
62 599999 600000
63 609999 610000
64 699999 700000
65 639999 640000
66 649999 650000
67 719999 720000
68 659999 660000
69 679999 680000
70 749999 750000
71 709999 710000
72 689999 690000
73 729999 730000
74 779999 780000
75 799999 800000
76 809999 810000
77 739999 740000
78 849999 850000
79 759999 760000
80 829999 830000
81 789999 790000
82 769999 770000
83 859999 860000
84 889999 890000
85 879999 880000
86 819999 820000
87 929999 930000
88 869999 870000
89 839999 840000
90 909999 910000
91 939999 940000
92 969999 970000
93 899999 900000
94 979999 980000
95 959999 960000
96 949999 950000
97 1019999 1020000
98 1009999 1010000
99 989999 990000
100 1029999 1030000
101 919999 920000
102 1039999 1040000
103 999999 1000000
I have a simple benchmark that demonstrates the performance of busy-wait threads. It runs in two modes: the first simply gets two timepoints sequentially; the second iterates through a vector and measures the duration of one iteration.
I see that two sequential calls of clock::now() take about 50 nanoseconds on average and one average iteration through the vector takes about 100 nanoseconds. But sometimes these operations are executed with a huge delay: about 50 microseconds in the first case and 10 milliseconds (!) in the second case.
The test runs on a single isolated core, so context switches do not occur. I also call mlockall at the beginning of the program, so I assume that page faults do not affect performance.
The following additional optimizations were also applied:
kernel boot parameters: intel_idle.max_cstate=0 idle=halt
irqaffinity=0,14 isolcpus=4-13,16-27 pti=off spectre_v2=off audit=0
selinux=0 nmi_watchdog=0 nosoftlockup=0 rcu_nocb_poll rcu_nocbs=19-20
nohz_full=19-20;
rcu[^c] kernel threads moved to a housekeeping CPU core 0;
network card RxTx queues moved to a housekeeping CPU core 0;
writeback kernel workqueue moved to a housekeeping CPU core 0;
transparent_hugepage disabled;
Intel CPU HyperThreading disabled;
swap file/partition is not used.
Environment:
System details:
Default Archlinux kernel:
5.1.9-arch1-1-ARCH #1 SMP PREEMPT Tue Jun 11 16:18:09 UTC 2019 x86_64 GNU/Linux
that has following PREEMPT and HZ settings:
CONFIG_HZ_300=y
CONFIG_HZ=300
CONFIG_PREEMPT=y
Hardware details:
RAM: 256GB
CPU(s): 28
On-line CPU(s) list: 0-27
Thread(s) per core: 1
Core(s) per socket: 14
Socket(s): 2
NUMA node(s): 2
Vendor ID: GenuineIntel
CPU family: 6
Model: 79
Model name: Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
Stepping: 1
CPU MHz: 3200.011
CPU max MHz: 3500.0000
CPU min MHz: 1200.0000
BogoMIPS: 5202.68
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 35840K
NUMA node0 CPU(s): 0-13
NUMA node1 CPU(s): 14-27
Example code:
struct TData
{
std::vector<char> Data;
TData() = default;
TData(size_t aSize)
{
for (size_t i = 0; i < aSize; ++i)
{
Data.push_back(i);
}
}
};
using TBuffer = std::vector<TData>;
TData DoMemoryOperation(bool aPerform, const TBuffer& aBuffer, size_t& outBufferIndex)
{
if (!aPerform)
{
return TData {};
}
const TData& result = aBuffer[outBufferIndex];
if (++outBufferIndex == aBuffer.size())
{
outBufferIndex = 0;
}
return result;
}
void WarmUp(size_t aCyclesCount, bool aPerform, const TBuffer& aBuffer)
{
size_t bufferIndex = 0;
for (size_t i = 0; i < aCyclesCount; ++i)
{
auto data = DoMemoryOperation(aPerform, aBuffer, bufferIndex);
}
}
void TestCycle(size_t aCyclesCount, bool aPerform, const TBuffer& aBuffer, Measurings& outStatistics)
{
size_t bufferIndex = 0;
for (size_t i = 0; i < aCyclesCount; ++i)
{
auto t1 = std::chrono::steady_clock::now();
{
auto data = DoMemoryOperation(aPerform, aBuffer, bufferIndex);
}
auto t2 = std::chrono::steady_clock::now();
auto diff = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
outStatistics.AddMeasuring(diff, t2);
}
}
int Run(int aCpu, size_t aDataSize, size_t aBufferSize, size_t aCyclesCount, bool aAllocate, bool aPerform)
{
if (mlockall(MCL_CURRENT | MCL_FUTURE))
{
throw std::runtime_error("mlockall failed");
}
std::cout << "Test parameters"
<< ":\ndata size=" << aDataSize
<< ",\nnumber of elements=" << aBufferSize
<< ",\nbuffer size=" << aBufferSize * aDataSize
<< ",\nnumber of cycles=" << aCyclesCount
<< ",\nallocate=" << aAllocate
<< ",\nperform=" << aPerform
<< ",\nthread ";
SetCpuAffinity(aCpu);
TBuffer buffer;
if (aPerform)
{
buffer.resize(aBufferSize);
std::fill(buffer.begin(), buffer.end(), TData { aDataSize });
}
WaitForKey();
std::cout << "Running..."<< std::endl;
WarmUp(aBufferSize * 2, aPerform, buffer);
Measurings statistics;
TestCycle(aCyclesCount, aPerform, buffer, statistics);
statistics.Print(aCyclesCount);
WaitForKey();
if (munlockall())
{
throw std::runtime_error("munlockall failed");
}
return 0;
}
And the following results are received:
First:
StandaloneTests --run_test=MemoryAccessDelay --cpu=19 --data-size=280 --size=67108864 --count=1000000000 --allocate=1 --perform=0
Test parameters:
data size=280,
number of elements=67108864,
buffer size=18790481920,
number of cycles=1000000000,
allocate=1,
perform=0,
thread 14056 on cpu 19
Statistics: min: 16: max: 18985: avg: 18
0 - 10 : 0 (0 %): -
10 - 100 : 999993494 (99 %): min: 40: max: 117130: avg: 40
100 - 1000 : 946 (0 %): min: 380: max: 506236837: avg: 43056598
1000 - 10000 : 5549 (0 %): min: 56876: max: 70001739: avg: 7341862
10000 - 18985 : 11 (0 %): min: 1973150818: max: 14060001546: avg: 3644216650
Second:
StandaloneTests --run_test=MemoryAccessDelay --cpu=19 --data-size=280 --size=67108864 --count=1000000000 --allocate=1 --perform=1
Test parameters:
data size=280,
number of elements=67108864,
buffer size=18790481920,
number of cycles=1000000000,
allocate=1,
perform=1,
thread 3264 on cpu 19
Statistics: min: 36: max: 4967479: avg: 48
0 - 10 : 0 (0 %): -
10 - 100 : 964323921 (96 %): min: 60: max: 4968567: avg: 74
100 - 1000 : 35661548 (3 %): min: 122: max: 4972632: avg: 2023
1000 - 10000 : 14320 (0 %): min: 1721: max: 33335158: avg: 5039338
10000 - 100000 : 130 (0 %): min: 10010533: max: 1793333832: avg: 541179510
100000 - 1000000 : 0 (0 %): -
1000000 - 4967479 : 81 (0 %): min: 508197829: max: 2456672083: avg: 878824867
Any ideas what the reason for such huge delays might be and how it can be investigated?
In:
TData DoMemoryOperation(bool aPerform, const TBuffer& aBuffer, size_t& outBufferIndex);
It returns a std::vector<char> by value. That involves a memory allocation and data copying. The memory allocation can make a syscall (brk or mmap), and memory-mapping-related syscalls are notorious for being slow.
When the timed region includes syscalls, one cannot expect low variance.
You may like to run your application with /usr/bin/time --verbose <app> or perf stat -ddd <app> to see the number of page faults and context switches.
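As a sketch of one way to take the allocation out of the timed path (my suggestion, reusing the TData/TBuffer types from the question; not part of the original answer), DoMemoryOperation could hand back a pointer instead of a copy:

```cpp
// Returning a pointer (nullptr when aPerform is false) avoids constructing a
// new TData / std::vector<char> on every call, so no allocation or copy
// happens inside the timed loop.
const TData* DoMemoryOperationNoCopy(bool aPerform, const TBuffer& aBuffer,
                                     size_t& outBufferIndex)
{
    if (!aPerform)
    {
        return nullptr;
    }
    const TData* result = &aBuffer[outBufferIndex];
    if (++outBufferIndex == aBuffer.size())
    {
        outBufferIndex = 0;
    }
    return result;
}
```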
I have the following code:
#pragma pack(4)
struct RECORD_HEADER {
uint64_t msgType;
uint64_t rdtsc;
};
struct BODY {
char content[488];
};
#pragma pack()
class SerializedRDTSC {
public:
typedef unsigned long long timeunit_t;
static timeunit_t start(void) {
unsigned cycles_high, cycles_low;
__asm__ __volatile__ ( "CPUID\n\t"
"RDTSC\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t": "=r" (cycles_high), "=r" (cycles_low)::
"%rax", "%rbx", "%rcx", "%rdx");
return ( (unsigned long long)cycles_low)|( ((unsigned long long)cycles_high)<<32 );
}
static timeunit_t end(void) {
unsigned cycles_high, cycles_low;
__asm__ __volatile__( "RDTSCP\n\t"
"mov %%edx, %0\n\t"
"mov %%eax, %1\n\t"
"CPUID\n\t": "=r" (cycles_high), "=r" (cycles_low):: "%rax",
"%rbx", "%rcx", "%rdx");
return ( (unsigned long long)cycles_low)|( ((unsigned long long)cycles_high)<<32 );
}
};
char* createSHM() noexcept {
const auto sharedMemHandle = shm_open("testing", O_RDWR | O_CREAT, 0666);
if (-1 == sharedMemHandle) {
std::cout << "failed to open named shared memory: " << std::endl;
return nullptr;
}
constexpr int32_t size = (1 << 26);
ftruncate(sharedMemHandle, size);
char* ptr = (char*) mmap(nullptr, size, PROT_READ | PROT_WRITE,
MAP_SHARED, sharedMemHandle, 0);
if (MAP_FAILED == ptr) {
std::cout << errno << std::endl;
return nullptr;
}
const auto rc = fchmod(sharedMemHandle, 0666);
if (rc == -1) {
fprintf(stderr,
"Can't change permissions to 0666 on shared mem segment: %m\n");
fflush(stderr);
}
return ptr;
}
int main() {
BODY update;
srand(time(nullptr));
char* ptr = createSHM();
constexpr uint64_t n = 700;
constexpr uint64_t n2 = 10;
uint64_t m_data[n * n2];
memset(m_data, 0, sizeof(m_data));
uint64_t r = 0;
for (uint64_t i = 0; i < n; i++) {
for (uint64_t k = 0; k < n2; k++) {
// populate the header
const uint64_t msgType = rand();
const uint64_t rdtsc = rand();
// populate the struct randomly
uint32_t* tmp = reinterpret_cast<uint32_t*>(&update);
for (uint32_t j = 0; j < sizeof(BODY) / sizeof(uint32_t); j++) {
const uint32_t v = rand() % 32767;
tmp[j] = v;
}
// write the struct
const auto s = SerializedRDTSC::start();
memcpy(ptr, (char*)&msgType, sizeof(uint64_t));
ptr+= sizeof(uint64_t);
memcpy(ptr, (char*)&rdtsc, sizeof(uint64_t));
ptr+= sizeof(uint64_t);
memcpy(ptr, &update, sizeof(BODY));
ptr+= sizeof(BODY);
const auto e = SerializedRDTSC::end();
m_data[r++] = e - s;
}
usleep(249998);
}
for (uint32_t i = 0; i < r; i++) {
std::cout << i << "," << m_data[i] << std::endl;
}
}
And for some reason, there are periodic latency spikes, according to the output:
0 9408
1 210
2 162
3 176
4 172
5 164
6 172
7 8338
8 174
9 186
10 612
11 380
12 380
13 374
14 358
15 13610
16 190
17 186
18 164
19 168
20 246
21 196
22 170
23 5066
24 176
25 176
26 168
27 174
28 166
29 440
30 232
31 214
32 5128
33 180
34 178
35 172
36 174
37 184
38 170
39 162
40 5964
41 182
42 174
43 164
44 180
45 180
46 162
47 172
I already isolated the core and double-checked with htop to make sure no other processes were using the core.
My machine has an i7 CPU (nothing fancy).
Then I tried a Xeon CPU. The pattern was about the same: every 7-11 writes, there was a spike.
With the i7 CPU, I compiled with GCC 7.2 using C++17 and ran it on CentOS 7.3.
With the Xeon CPU, I compiled with GCC 4.6 using C++0x and ran it on CentOS 6.5.
My questions are:
1. Why were there periodic latency spikes? (I checked with strace and I don't see any weird system calls involved.)
2. Any suggestions on how to investigate/understand the spikes? More for my own learning.
Thanks in advance!
P.S. Yes, some people object to using rdtsc to measure latency because temperature affects the TSC. Though I don't see any better option, as I don't have PTP, and clock_gettime() sometimes has latency spikes too. If you have any suggestions, they are more than welcome :)
A memory page is 4K bytes. Every time you start writing to a new page, that page needs to be mapped into the process address space. Since the data you write each iteration is 8 + 8 + 488 = 504 bytes, you'll get a spike every 8 or 9 times through the loop.
Since the CPU can speculatively prefetch data from memory, the page fault for the 2nd page (which should occur on the 8th loop) occurs one loop earlier than expected, when the hardware prefetcher tries to access the page.
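One way to test that explanation (my suggestion, not part of the original answer) is to pre-fault the mapping so the per-page cost is paid before the timed loop, for example by adding MAP_POPULATE to the mmap() call in createSHM():

```cpp
// Assumption: MAP_POPULATE asks the kernel to fault in every page of the
// mapping up front, moving the per-page cost out of the rdtsc-timed region.
char* ptr = (char*) mmap(nullptr, size, PROT_READ | PROT_WRITE,
                         MAP_SHARED | MAP_POPULATE, sharedMemHandle, 0);
```

If the spikes disappear after pre-faulting, they were page faults; if they remain, look at TLB misses or other effects instead.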
I have a char buffer[1024] and a struct
typedef struct {
int record;
int key;
} leaf_entry;
What I am trying to implement is the buffer array acting like a node of a tree that holds many entries of leaf_entry.
If I want to iterate down the buffer to compare an entry in the buffer to another entry
for (i = 0; i < max_ents * entry_size; i += entry_size)
{
leaf_entry curr;
memcpy (curr, buffer[i], entry_size)
if (curr == entry_to_compare)
etc...
}
Is this correct? Is there an easier/more efficient way to accomplish this?
You should be able to do this without copying given that your struct is POD.
I believe something like this should work where offset is the correct byte offset into the char buffer:
leaf_entry & x = *reinterpret_cast<leaf_entry *>(buffer + offset);
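Applied to the loop in the question, a sketch under the assumption that entry_size == sizeof(leaf_entry); the members are compared individually because structs cannot be compared with ==:

```cpp
// Walk the buffer without copying: reinterpret the bytes at each offset
// as a leaf_entry and compare its members directly.
for (size_t off = 0; off + sizeof(leaf_entry) <= sizeof(buffer); off += sizeof(leaf_entry))
{
    const leaf_entry & curr = *reinterpret_cast<const leaf_entry *>(buffer + off);
    if (curr.record == entry_to_compare.record && curr.key == entry_to_compare.key)
    {
        // found a matching entry ...
    }
}
```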
I would consider using the approach that standard sorting implementations use - namely, use a user-defined compare function that receives 2 pointers. You then use these pointers to locate the element and from there, access and compare the members you're interested in. This also avoids the unnecessary memory copying.
Consider the following func used to compare ints:
int compareIntAsc(const void *int1, const void *int2)
{
int *num1 = (int*)int1;
int *num2 = (int*)int2;
return *num1 - *num2;
}
and now consider one which will compare the count member of some structs
int compareNodeCountAsc(const void *node1, const void *node2)
{
    return ((pHuffmanNode)node1)->charCount - ((pHuffmanNode)node2)->charCount;
}
If you pass the address of two elements in your array to such a function, you can compare either the record or the key.
e.g. compare element 0 to every other element
code
typedef struct {
    int record;
    int key;
} leaf_entry, *pLeafEntry;

int compLeafKeyAsc(void *leaf1, void *leaf2)
{
    leaf_entry *p1, *p2;
    p1 = (leaf_entry *)leaf1;
    p2 = (leaf_entry *)leaf2;
    return p1->key - p2->key;
}

void printLeaf(pLeafEntry leaf)
{
    printf("----- leaf -------\n");
    printf("record: %d\n", leaf->record);
    printf("key: %d\n", leaf->key);
}

void demo()
{
    const int nElems = 16;
    leaf_entry leafArray[nElems];
    pLeafEntry firstElement;
    int i;

    for (i = 0; i < nElems; i++)
    {
        leafArray[i].record = (rand()%51) + 100; // record is [100..150]
        leafArray[i].key = (rand()%128);         // key is [0..127]
        printLeaf(&leafArray[i]);
    }

    // e.g. compare element 0 to every other element
    firstElement = &leafArray[0];
    for (i = 1; i < nElems; i++)
    {
        printf("%d", firstElement->key);
        int result = compLeafKeyAsc(firstElement, &leafArray[i]);
        if (result < 0)
            printf(" is less than ");
        else if (result > 0)
            printf(" is greater than ");
        else
            printf(" is equal to ");
        printf("%d\n", leafArray[i].key);
    }
}
output
----- leaf -------
record: 141
key: 35
----- leaf -------
record: 110
key: 4
----- leaf -------
record: 144
key: 108
----- leaf -------
record: 103
key: 46
----- leaf -------
record: 134
key: 16
----- leaf -------
record: 144
key: 113
----- leaf -------
record: 125
key: 59
----- leaf -------
record: 116
key: 107
----- leaf -------
record: 137
key: 38
----- leaf -------
record: 133
key: 60
----- leaf -------
record: 106
key: 12
----- leaf -------
record: 126
key: 25
----- leaf -------
record: 137
key: 94
----- leaf -------
record: 130
key: 28
----- leaf -------
record: 132
key: 55
----- leaf -------
record: 141
key: 94
35 is greater than 4
35 is less than 108
35 is less than 46
35 is greater than 16
35 is less than 113
35 is less than 59
35 is less than 107
35 is less than 38
35 is less than 60
35 is greater than 12
35 is greater than 25
35 is less than 94
35 is greater than 28
35 is less than 55
35 is less than 94
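Since this comparator has the same two-pointer shape that qsort() expects, the entries in the node can also be sorted in place with it; a small sketch (qsort() requires const void * parameters, so the signature is adjusted accordingly):

```c
#include <stdlib.h>

// qsort-compatible comparator: sort leaf_entry elements by key, ascending.
int compLeafKeyAscQsort(const void *a, const void *b)
{
    const leaf_entry *p1 = (const leaf_entry *)a;
    const leaf_entry *p2 = (const leaf_entry *)b;
    return p1->key - p2->key;
}

// Usage: qsort(leafArray, nElems, sizeof(leaf_entry), compLeafKeyAscQsort);
```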
I am attempting to use clock_gettime() to monitor elapsed time. However, it returns bad results.
I tested it with the following:
#include <time.h>
#include <iostream>
#include <math.h>
using namespace std;
int main()
{
    // Time vars for calculation.
    int ns;
    // Initial struct.
    timespec tt;
    // Get starting time.
    clock_gettime(CLOCK_MONOTONIC,&tt);
    int ns_start = tt.tv_nsec;
    int s_start = tt.tv_sec;
    // Base for second wrap around.
    int ns_base = 1000e6 - ns_start;
    while(true)
    {
        cin.ignore();
        // Get time.
        clock_gettime(CLOCK_MONOTONIC,&tt);
        // Implement/calculate wrap around.
        if(tt.tv_nsec >= ns_start) ns = tt.tv_nsec - ns_start;
        else ns = tt.tv_nsec + ns_base;
        // Display result.
        cout << "Time Passed:\ts: " << tt.tv_sec-s_start << " ms: " << round(ns/1e6) << endl;
    }
    return 0;
}
When I hold any key for a while I get a similar result:
Time Passed: s: 1 ms: 833
Time Passed: s: 2 ms: 308
Time Passed: s: 2 ms: 354
Time Passed: s: 2 ms: 415
Time Passed: s: 2 ms: 459
Time Passed: s: 2 ms: 511
Time Passed: s: 2 ms: 566
Time Passed: s: 2 ms: 613
Time Passed: s: 2 ms: 661
Time Passed: s: 2 ms: 712
Time Passed: s: 2 ms: 762
Time Passed: s: 2 ms: 813
Time Passed: s: 2 ms: 861
Time Passed: s: 2 ms: 920 // crap starts here
Time Passed: s: 3 ms: 970
Time Passed: s: 3 ms: 20
Time Passed: s: 3 ms: 69
Time Passed: s: 3 ms: 124
Time Passed: s: 3 ms: 171
Time Passed: s: 3 ms: 226
Time Passed: s: 3 ms: 272
Time Passed: s: 3 ms: 329
Time Passed: s: 3 ms: 372
Time Passed: s: 3 ms: 429
Time Passed: s: 3 ms: 474
Time Passed: s: 3 ms: 528
Time Passed: s: 3 ms: 576
Time Passed: s: 3 ms: 632
Time Passed: s: 3 ms: 679
Time Passed: s: 3 ms: 736
Time Passed: s: 3 ms: 782
Time Passed: s: 3 ms: 835
Time Passed: s: 3 ms: 880
Time Passed: s: 4 ms: 939
Time Passed: s: 4 ms: 982
Time Passed: s: 4 ms: 38
Time Passed: s: 4 ms: 84
Time Passed: s: 4 ms: 143
Time Passed: s: 4 ms: 188
Time Passed: s: 4 ms: 244
Time Passed: s: 4 ms: 291
Time Passed: s: 4 ms: 348
Time Passed: s: 4 ms: 391
Time Passed: s: 4 ms: 448
Time Passed: s: 4 ms: 493
Time Passed: s: 4 ms: 549
Time Passed: s: 4 ms: 594
Time Passed: s: 4 ms: 650
Time Passed: s: 4 ms: 696
Time Passed: s: 6 ms: 259
Time Passed: s: 7 ms: 989
It should be obvious from the numbers that the results get messed up at the point marked by the comment.
Does anyone have any ideas as to why this happens and how to fix it?
Imagine the timer starts at 1.999 seconds. At 2.001 seconds, your code would say that 1 second and 2 ms have elapsed, when really it should be zero seconds and 2 ms. This is because you're subtracting the starting second from the current second, even if the nanosecond part hasn't passed its starting value.
You had the right idea with the nanosecond wraparound. Let's extend that to keep the seconds from getting ahead of the correct value. Here's one way to do it:
#include <time.h>
#include <iostream>
#include <math.h>
using namespace std;
int main()
{
    // Time vars for calculation.
    int ns;
    int s;
    // Initial struct.
    timespec tt;
    // Get starting time.
    clock_gettime(CLOCK_MONOTONIC,&tt);
    int ns_start = tt.tv_nsec;
    int s_start = tt.tv_sec;
    // Base for second wrap around.
    int ns_base = 1000e6 - ns_start;
    while(true)
    {
        cin.ignore();
        // Get time.
        clock_gettime(CLOCK_MONOTONIC,&tt);
        // Implement/calculate wrap around.
        if(tt.tv_nsec >= ns_start)
        {
            ns = tt.tv_nsec - ns_start;
            s = tt.tv_sec - s_start;
        }
        else
        {
            ns = tt.tv_nsec + ns_base;
            s = tt.tv_sec - s_start - 1;
        }
        // Display result.
        cout << "Time Passed:\ts: " << s << " ms: " << round(ns/1e6) << endl;
    }
    return 0;
}
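An alternative (my suggestion, not part of the answer above) is to skip the manual wraparound entirely and do the subtraction in 64-bit nanoseconds, then split the result back into seconds and milliseconds for display:

```cpp
// Elapsed nanoseconds between two timespecs; 64-bit arithmetic makes the
// manual wraparound handling unnecessary.
long long DiffNs(const timespec& start, const timespec& end)
{
    return (long long)(end.tv_sec - start.tv_sec) * 1000000000LL
         + (end.tv_nsec - start.tv_nsec);
}

// Usage: long long ns = DiffNs(start_tt, tt);
//        cout << "s: " << ns / 1000000000 << " ms: " << (ns % 1000000000) / 1000000 << endl;
```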