Get average of time spent using std::chrono - c++

I have a function running more than a million times. I want to print out the duration of how long the function takes to run by printing the sum of durations of 10,000 calls to the function.
At the start of each function I have something like this:
// Question snippet (pseudo-code): time each call and print the average of every 10,000 calls.
int counter = 0;
auto duration_total = 0; // deduced as int from the literal 0; not sure about the type
// NOTE: duration is declared as a time_point here, but a tick count is assigned to it below.
std::chrono::high_resolution_clock::time_point t1, t2, duration;
t1 = std::chrono::high_resolution_clock::now();
Function f(){
counter++;
}
t2 = std::chrono::high_resolution_clock::now();
// count() yields the number of nanosecond ticks between t1 and t2.
duration= std::chrono::duration_cast<std::chrono::nanoseconds>( t2 - t1 ).count();
duration_total += duration;
// Every 10,000th call: print the mean of the accumulated durations and reset the sum.
if(counter %10000 == 0){
long int average_duration = duration_total/10000;
duration_total = 0;
cout << average_duration << "\n";
}
I can't find a way to add durations and then get their average.

If you look at std::chrono::duration<Rep,Period>::count, you can see that you can use
int duration = std::chrono::duration_cast<std::chrono::nanoseconds>( t2 - t1 ).count();
(or something else, e.g., unsigned long), as the return value is
The number of ticks for this duration.
in full:
#include <iostream>
#include <chrono>
// Minimal demonstration: accumulate per-call durations as nanosecond tick
// counts and print their average once every kCallsPerReport calls.
int main()
{
    // Tick type of std::chrono::nanoseconds (a signed integer of at least 64 bits).
    // A 32-bit int total would overflow after roughly 2.1 seconds of accumulated
    // nanoseconds.
    using ticks = std::chrono::nanoseconds::rep;
    constexpr int kCallsPerReport = 10000;

    int counter = 0;
    ticks duration_total = 0;
    for (int call = 0; call < kCallsPerReport; ++call) {
        const auto t1 = std::chrono::high_resolution_clock::now();
        // ... the function being timed goes here ...
        const auto t2 = std::chrono::high_resolution_clock::now();
        const ticks duration =
            std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();
        duration_total += duration;

        // Count the call before testing, so that a report always covers a full batch.
        ++counter;
        if (counter % kCallsPerReport == 0) {
            const ticks average_duration = duration_total / kCallsPerReport;
            duration_total = 0;
            std::cout << average_duration << "\n";
        }
    }
}
See it in Coliru.

You take a time point from the clock when you start and another one when you stop.
Subtracting one time point from the other gives you a duration. Divide that duration by the number of iterations.
Example:
#include <chrono>
#include <functional>
#include <iostream>
/// Runs `func_to_test` `iterations` times and returns the mean wall-clock time
/// of one call, expressed in the duration type `T`.
///
/// @tparam T            std::chrono::duration type of the result
///                      (e.g. std::chrono::microseconds).
/// @param iterations    Number of calls to make; 0 runs nothing and yields T::zero().
/// @param func_to_test  Callable to time.
/// @return              The total elapsed time converted to T, divided by `iterations`.
template<typename T>
auto timeit(size_t iterations, std::function<void()> func_to_test) {
    // Nothing to measure, and dividing by zero below would be undefined.
    if(iterations == 0) return T::zero();

    // steady_clock is monotonic, so the interval cannot be distorted by
    // adjustments to the system time.
    const auto start = std::chrono::steady_clock::now();
    for(size_t i = 0; i < iterations; ++i)
        func_to_test();
    const auto end = std::chrono::steady_clock::now();

    // Divide by a value of T's own tick type so that the quotient is exactly T
    // instead of a duration whose tick type was widened to an unsigned one.
    const auto elapsed = std::chrono::duration_cast<T>(end - start);
    return elapsed / static_cast<typename T::rep>(iterations);
}
// Example: print the mean time of 10,000 calls to system("echo Hello World").
int main() {
    const auto dur =
        timeit<std::chrono::microseconds>(10000, [] { system("echo Hello World"); });
    // The mean per call, in microseconds.
    std::cout << dur.count() << " µs\n";
}
If you need to sum up individual runs, keep a duration variable that you add to. I'm reusing the same timeit function, but you can remove the iteration stuff in it if you only want to run it once.
// Example: time ten individual runs, accumulate the total, then print the mean.
int main() {
    std::chrono::microseconds tot{0};
    size_t iterations = 0;
    for(size_t i = 0; i < 10; ++i) {
        // sum up the total time spent
        tot += timeit<decltype(tot)>(1, [] { system("echo Hello World"); });
        ++iterations;
    }
    // divide by the number of iterations
    std::cout << (tot / iterations).count() << " µs\n";
}

First, the type here is int:
auto duration_total = 0;
You should do something like this instead:
auto t1 = std::chrono::steady_clock::now();
//do some work
auto t2 = std::chrono::steady_clock::now();
double duration_in_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(t2 - t1).count();
Note that I'm casting the duration to double. Then you can use the duration value more freely.
If you prefer nanoseconds:
double duration_in_nanoseconds = std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count();

Related

Thread not improving the code performance

I am trying to convert a basic long loop into thread to improve the loop performance.
Here is the threaded version:
#include <iostream>
#include <thread>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Adds every integer in the inclusive range [start, end] to *sum,
// one element at a time. An empty range (start > end) leaves *sum untouched.
void funcSum(long long int start, long long int end, long long int *sum)
{
    long long int current = start;
    while (current <= end)
    {
        *sum += current;
        ++current;
    }
}
// Splits the range [start, end] into two halves, sums each half on its own
// thread, and prints the total together with the elapsed whole seconds.
int main()
{
long long int start = 10, end = 1900000000;
long long int sum = 0;
auto startTime = high_resolution_clock::now();
// NOTE: both threads receive the same &sum, and funcSum updates it with an
// unsynchronized `*sum += i`, so the two threads write to one variable concurrently.
thread t1(funcSum, start, end / 2, &sum);
thread t2(funcSum, end / 2 + 1 , end, &sum);
// Wait for both halves to finish before stopping the clock.
t1.join();
t2.join();
auto stopTime = high_resolution_clock::now();
// duration_cast<seconds> truncates the elapsed time to whole seconds.
auto duration = duration_cast<seconds>(stopTime - startTime);
cout << "Sum: " << sum << endl;
cout << duration.count() << " Seconds";
return 0;
}
And here is the normal code (Without threads):
#include <iostream>
#include <thread>
#include <chrono>
using namespace std;
using namespace std::chrono;
// Walks the inclusive range [start, end] and adds each value to *sum.
// When start > end nothing is added.
void funcSum(long long int start, long long int end, long long int *sum)
{
    long long int value = start;
    while (!(value > end))
    {
        *sum += value;
        ++value;
    }
}
// Single-threaded baseline: sums the whole range with one funcSum call and
// prints the total together with the elapsed whole seconds.
int main()
{
    const long long int first = 10;
    const long long int last = 1900000000;
    long long int total = 0;

    const auto began = high_resolution_clock::now();
    funcSum(first, last, &total);
    const auto finished = high_resolution_clock::now();

    const auto elapsed = duration_cast<seconds>(finished - began);
    cout << "Sum: " << total << endl;
    cout << elapsed.count() << " Seconds";
    return 0;
}
Sum: 1805000000949999955
5 Seconds
Process finished with exit code 0
In both the cases, time spent is 5 seconds.
Why doesn't the threaded version improve the performance? How can I use threads to decrease the time taken to sum this range?
Fixed version of threaded code:
// Holds an inclusive integer range [start, end] together with the sum of its
// elements, which funcSum() computes on demand.
class Summer {
public:
    long long int start;
    long long int end;
    long long int sum = 0;

    Summer(long long int aStart, long long int aEnd)
        : start(aStart), end(aEnd)
    {}

    // Recomputes `sum` from scratch: start + (start + 1) + ... + end.
    // An empty range (start > end) leaves `sum` at 0.
    void funcSum()
    {
        sum = 0;
        long long int value = start;
        while (value <= end)
        {
            sum += value;
            ++value;
        }
    }
};
// Callable wrapper that refers to a Summer instead of owning one, so copies
// of the functor all operate on the same caller-owned Summer.
class SummerFunctor {
public:
    SummerFunctor(Summer& aSummer) : mSummer(aSummer) {}

    // Runs the summation on the referenced Summer.
    void operator()() { mSummer.funcSum(); }

private:
    Summer& mSummer;
};
// Version with n thread objects reports
// 1 threads, sum = 1805000000949999955, 1587 ms
// 2 threads, sum = 1805000000949999955, 2547 ms
// 4 threads, sum = 1805000000949999955, 1251 ms
// 6 threads, sum = 1805000000949999955, 916 ms
// Sums the integers in [start, end] with threadCount worker threads. Each
// worker sums its own contiguous partition into its own Summer; the partial
// sums are added up after joining. Prints the total and the elapsed time.
int main()
{
    const long long int start = 10, end = 1900000000;
    long long int sum = 0;
    auto startTime = high_resolution_clock::now();
    const size_t threadCount = 6;
    if (threadCount < 2) {
        funcSum(start, end, &sum);
    } else {
        // Summer has no default constructor, so the workers' state is
        // allocated individually; std::thread is default-constructible and
        // movable, so the thread objects can live directly in an array.
        Summer* summers[threadCount];
        std::thread threads[threadCount];
        // Start threads
        // Keep the arithmetic in long long: mixing in the unsigned size_t
        // would give std::min arguments of two different types.
        const long long int partitionSize =
            (end - start) / static_cast<long long int>(threadCount);
        long long int partitionStart = start;
        for (size_t i = 0; i < threadCount; ++i) {
            // Each partition holds partitionSize + 1 values; the last ones
            // are clamped to `end` (and may be empty).
            const long long int partitionEnd = std::min(partitionStart + partitionSize, end);
            summers[i] = new Summer(partitionStart, partitionEnd);
            partitionStart = partitionEnd + 1;
            SummerFunctor functor (*summers[i]);
            threads[i] = std::thread(functor);
        }
        // Join threads
        for (size_t i = 0; i < threadCount; ++i) {
            threads[i].join();
            sum += summers[i]->sum;
            delete summers[i];
        }
    }
    auto stopTime = high_resolution_clock::now();
    auto duration = duration_cast<milliseconds>(stopTime - startTime);
    cout << threadCount << " threads, sum = " << sum << ", " << duration.count() << " ms" << std::endl;
    return 0;
}
I had to wrap the Summer object with a functor because std::thread insists on making a copy of a functor handed to it, that we can't access later. The execution gets better when more threads are used (running times see comments). Possible reasons for this:
The CPU has to synchronize access to the memory pages even though the threads use separate variables here because the variables likely lie in the same page
If there is only one thread running on a CPU, that thread may run at higher CPU frequency, but several threads may run only at normal CPU frequency
CPU cores often share arithmetic units
Without threads, the compiler can make optimizations that are not possible with threads. In theory, the compiler could unroll the loop and directly print the result.

std::vector.push_back() C++

I want to initialize a vector with alphabetical letters using push_back function. Is it the right way to do it?
// Appends 26 characters to v, starting at 'A' and increasing the character code by one each time.
vector<char> v;
char letter = 'A';
for(int i=0; i<26; i++)
{
// letter + i is computed as an int (integral promotion) and converted back to char by push_back.
v.push_back(letter+i);
}
It works. I am just wondering whether I should cast letter to an int before adding i to it?
Or is there a more efficient way?
Note that your code relies on a character encoding scheme that encodes the letters contiguously, like e.g. ASCII.
If that assumption holds, you could create the vector using the correct size initially, and use std::iota to initialize all elements:
std::vector<char> v(26); // Create a vector of 26 (default-initialized) elements
std::iota(begin(v), end(v), 'A'); // Assign a letter to each element in the vector
If you want your code to be portable to systems where letters aren't contiguously encoded (like a system which uses EBCDIC), then you're better off creating a string that lists the letters explicitly:
std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; // Thanks Nathan Oliver :)
And if you have a string with all the letters, then perhaps you won't need the vector even.
Looks pretty good!
I guess maybe std::array() would be an option too, as compared to std::vector() for similar tasks:
#include <iostream>
#include <array>
#include <vector>
#include <chrono>
// Benchmark subject: fills a std::vector<char> with the 26 values
// 'A' + 0 ... 'A' + 25 using push_back, then discards the vector.
void function1() {
    std::vector<char> letters;
    unsigned int offset = 0;
    while (offset < 26) {
        letters.push_back(offset + 'A');
        ++offset;
    }
}
// Benchmark subject: fills a std::vector<char> with the 26 values
// 'A' + 0 ... 'A' + 25 using emplace_back, then discards the vector.
void function2() {
    std::vector<char> letters;
    unsigned int offset = 0;
    while (offset < 26) {
        letters.emplace_back(offset + 'A');
        ++offset;
    }
}
// Benchmark subject: fills a fixed-size std::array<char, 26> with the values
// 'A' + 0 ... 'A' + 25 by index, then discards the array.
void function3() {
    std::array<char, 26> letters;
    unsigned int offset = 0;
    while (offset < 26) {
        letters[offset] = offset + 'A';
        ++offset;
    }
}
int main() {
const auto t1 = std::chrono::high_resolution_clock::now();
for (std::size_t i = 0; i < 1000000; ++i) {
function1();
}
const auto t2 = std::chrono::high_resolution_clock::now();
const auto duration = std::chrono::duration_cast<std::chrono::microseconds>( t2 - t1 ).count();
std::cout << duration <<
" is the rough runtime of std::vector function with push_back\t๐Ÿ’™๐Ÿ’™๐Ÿ’™\t๐Ÿ˜ณ\n\n";
const auto t3 = std::chrono::high_resolution_clock::now();
for (std::size_t i = 0; i < 1000000; ++i) {
function2();
}
const auto t4 = std::chrono::high_resolution_clock::now();
const auto duration2 = std::chrono::duration_cast<std::chrono::microseconds>( t4 - t3 ).count();
std::cout << duration2 <<
" is the rough runtime of std::vector function with emplace_back\t๐Ÿ’™๐Ÿ’™๐Ÿ’™\t๐Ÿ˜ณ\n\n";
const auto t5 = std::chrono::high_resolution_clock::now();
for (std::size_t i = 0; i < 1000000; ++i) {
function3();
}
const auto t6 = std::chrono::high_resolution_clock::now();
const auto duration3 = std::chrono::duration_cast<std::chrono::microseconds>( t6 - t5 ).count();
std::cout << duration3 << " is the rough runtime of std::array function\t๐Ÿ’™๐Ÿ’™๐Ÿ’™\t๐Ÿ˜ณ\n\n";
return 0;
};

Why is reverse sorting faster than re-sorting?

I wrote such a test -
// Build a vector of 100,000,000 ints in ascending order (0, 1, 2, ...).
std::vector<int> test_vector;
for (int i = 0; i < 100000000; ++i) {
test_vector.push_back(i);
}
QElapsedTimer timer;
timer.start();
// Ascending comparator applied to data that is already in ascending order.
std::sort(test_vector.begin(),test_vector.end(), [](int a, int b) { return a < b; });
// Report the time since timer.start(), first in milliseconds and then in nanoseconds.
qDebug() << "The slow operation took" << timer.elapsed() << "milliseconds";
qDebug() << "The slow operation took" << timer.nsecsElapsed() << "nanoseconds";
Here I re-sort the already sorted vector, and the result is:
The slow operation took 4091 milliseconds
The slow operation took 4091842000 nanoseconds
but when I changed
std::sort(test_vector.begin(),test_vector.end(), [](int a, int b) { return a > b; });
result
The slow operation took 2867 milliseconds
The slow operation took 2867591800 nanoseconds
I tested on Qt_5_12_3_MinGW_64_bit-Release and can't understand why reverse sorting is faster than re-sorting.
Resolved!
I tested the same example on Qt_5_12_3_MSVC2017_64bit and the issue is resolved, the problem was in MinGW_64
However, I still have a question: why is sorting so much faster if I fill the vector so that every element is 10?
#include <chrono>
#include<iostream>
#include <vector>
#include <algorithm>
// Times std::sort on a vector of 100,000,000 equal elements and prints the
// elapsed time in milliseconds.
int main() {
    std::vector<int> test_vector;
    for (int i = 0; i < 100000000; ++i) {
        test_vector.push_back(10);
    }
    // The file has no using-directive, so chrono and endl must be qualified with std::.
    auto begin = std::chrono::high_resolution_clock::now();
    std::sort(test_vector.begin(), test_vector.end(), [](int a, int b) { return a < b; });
    auto end = std::chrono::high_resolution_clock::now();
    auto dur = end - begin;
    auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(dur).count();
    std::cout << ms << std::endl;
    return 0;
}
result 167 milliseconds,
and re-sorting 2553 milliseconds
for (int i = 0; i < 100000000; ++i) {
test_vector.push_back(i);
}

Why is my function that returns by value slower than function that using pass_by_reference?

I understand that the C++ Core Guidelines specify that std::vector should be returned by value (in order for RVO/NRVO/move semantics to take place) as opposed to a pass by reference operation. When I tested this, however, with the benchmark code below, it appears that the pass-by-reference function is much faster than the function that returns by value. Why is my PassByReferenceMultiply function so much faster than my RVOMultiply function?
I am using clang 5.0.2.
My compile line is clang++ -std=c++17 RVO_PassByReference.cpp -o RVO_PassByReference -O3 -march=native
#include <array>
#include <vector>
#include <chrono>
#include <iostream>
using namespace std;
using namespace std::chrono;
/// Multiplies two vectors element by element and returns the products by value.
///
/// @param v1  Left-hand operands.
/// @param v2  Right-hand operands.
/// @return    A new vector whose i-th element is v1[i] * v2[i]. Its length is
///            that of the shorter input, so a size mismatch never reads past
///            the end of either operand.
std::vector<double> RVOMultiply(const std::vector<double>& v1, const std::vector<double>& v2)
{
    const size_t count = v1.size() < v2.size() ? v1.size() : v2.size();
    std::vector<double> ResultVector;
    // One allocation up front; emplace_back below never has to grow the buffer.
    ResultVector.reserve(count);
    for (size_t i {0}; i < count; ++i)
    {
        ResultVector.emplace_back(v1[i] * v2[i]);
    }
    return ResultVector;
}
/// Multiplies two vectors element by element into a caller-provided vector.
///
/// @param v1      Left-hand operands.
/// @param v2      Right-hand operands.
/// @param Result  Receives v1[i] * v2[i] at index i. It is never resized:
///                only the first min(Result.size(), v1.size(), v2.size())
///                elements are written, and any further elements keep their
///                previous values.
void PassByReferenceMultiply(const std::vector<double>& v1, const std::vector<double>& v2, std::vector<double>& Result)
{
    // Bound the loop by the shortest of the three vectors so that a size
    // mismatch never reads past the end of an operand.
    size_t count = Result.size();
    if (v1.size() < count) count = v1.size();
    if (v2.size() < count) count = v2.size();

    for (size_t i {0}; i < count; ++i)
    {
        Result[i] = v1[i] * v2[i];
    }
}
// Benchmark driver: times RVOMultiply and PassByReferenceMultiply 100,000
// times each on 10,000-element operands and prints the average nanoseconds per call.
int main ()
{
vector<double> ReferenceVector(10000);
vector<double> Operand1Vector(10000);
vector<double> Operand2Vector(10000);
// Operand1Vector holds 0, 1, 2, ... and Operand2Vector holds 1, 2, 3, ...
for (size_t i {0}; i < Operand1Vector.size(); ++i)
{
Operand1Vector[i] = i;
Operand2Vector[i] = i+1;
}
high_resolution_clock::time_point t1 = high_resolution_clock::now();
high_resolution_clock::time_point t2 = high_resolution_clock::now();
// NOTE: both accumulators start from the gap between the two now() calls above, not from zero.
auto duration1 = duration_cast<nanoseconds>(t2 - t1).count();
auto duration2 = duration_cast<nanoseconds>(t2 - t1).count();
// The loop counter is a double that takes the values 0, 1, ..., 99999.
for (double z {0}; z < 100000; ++z)
{
// Timed region 1: build a new result vector and return it by value.
t1 = high_resolution_clock::now();
vector<double> RVOVector = RVOMultiply(Operand1Vector, Operand2Vector);
t2 = high_resolution_clock::now();
// Outside the timed region: on every pass except the last, swap with an
// empty temporary so that RVOVector gives up its storage.
if (z != 99999)
vector<double>().swap(RVOVector);
duration1 += duration_cast<nanoseconds>(t2 - t1).count();
// Timed region 2: overwrite the preallocated ReferenceVector in place.
t1 = high_resolution_clock::now();
PassByReferenceMultiply(Operand1Vector, Operand2Vector, ReferenceVector);
t2 = high_resolution_clock::now();
duration2 += duration_cast<nanoseconds>(t2 - t1).count();
}
// Integer division: averages are truncated to whole nanoseconds.
duration1 /= 100000;
duration2 /= 100000;
cout << "RVOVector Duration Average was: " << duration1 << endl;
// duration2 is the accumulated time of the PassByReferenceMultiply calls above.
cout << "ReferenceVector push_back Duration Average was: " << duration2 << endl;
}
My output on my system is
RVOVector Duration Average was: 11901
ReferenceVector push_back Duration Average was: 3634

std::chrono - fixed time step loop

I'm trying to make fixed time step loop with using < chrono >.
This is my code:
#include <iostream>
#include <chrono>
// Question code: attempts a fixed 60 Hz update loop by accumulating measured
// elapsed time and consuming it in timePerFrame-sized steps.
int main()
{
std::chrono::steady_clock::time_point start;
const double timePerFrame = 1.0 / 60.0; // seconds per update step
double accumulator = 0.0; // measured time, in seconds, not yet consumed by update steps
int i = 0;
while(true)
{
// The reference point is reset at the top of every pass.
start = std::chrono::steady_clock::now();
// Run one update step for every whole timePerFrame stored in the accumulator.
while(accumulator >= timePerFrame)
{
accumulator -= timePerFrame;
std::cout << ++i << std::endl;
//update();
}
// Only the time from `start` to this now() call is added; the time between
// this call and the next `start = now()` is never accumulated.
accumulator += std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - start).count();
//render();
}
// Not reached: the loop above has no exit.
return 0;
}
The value of the variable "i" is printed fewer than 60 times a second. The same thing happens when I change "timePerFrame" to "1.0". What is wrong with it?
#include <iostream>
#include <chrono>
#include <thread>
// Fixed-timestep loop: prints the frame counter, then sleeps until the next
// deadline, where deadlines are spaced exactly 1/60 of a second apart.
int main()
{
    namespace chr = std::chrono;
    // One tick of this duration type is 1/60 of a second.
    using Frame = chr::duration<chr::steady_clock::rep, std::ratio<1, 60>>;

    const Frame oneFrame{1};
    auto deadline = chr::steady_clock::now() + oneFrame;
    int frameNumber = 0;
    for (;;)
    {
        ++frameNumber;
        std::cout << frameNumber << std::endl;
        //update();
        std::this_thread::sleep_until(deadline);
        // Advance from the previous deadline, not from "now", so that timing
        // errors do not accumulate.
        deadline += oneFrame;
        //render();
    }
    return 0;
}
Here's the same thing with a busy loop:
// Fixed-timestep loop with a busy wait: prints the frame counter, then spins
// until the next deadline, where deadlines are spaced 1/60 of a second apart.
int main()
{
    namespace chr = std::chrono;
    // One tick of this duration type is 1/60 of a second.
    using Frame = chr::duration<chr::steady_clock::rep, std::ratio<1, 60>>;

    const Frame oneFrame{1};
    auto deadline = chr::steady_clock::now() + oneFrame;
    int frameNumber = 0;
    for (;;)
    {
        ++frameNumber;
        std::cout << frameNumber << std::endl;
        //update();
        // Spin on the clock until the deadline has been reached.
        for (; chr::steady_clock::now() < deadline; )
        {
        }
        deadline += oneFrame;
        //render();
    }
    return 0;
}