Why calling via weak_ptr is so slow? - c++

I have read the question What's the performance penalty of weak_ptr? but my own tests show different results.
I'm making delegates with smart pointers. The simple code below shows reproduces the performance issues with weak_ptr. Can anybody tell me why?
#include <chrono>
#include <functional>
#include <iostream>
#include <memory>
#include <stdint.h>
#include <string>
#include <utility>
struct Foo
{
Foo() : counter(0) { incrStep = 1;}
void bar()
{
counter += incrStep;
}
virtual ~Foo()
{
std::cout << "End " << counter << std::endl;
}
private:
uint64_t counter;
uint64_t incrStep;
};
void pf(const std::string &md, const std::function<void()> &g)
{
const auto st = std::chrono::high_resolution_clock::now();
g();
const auto ft = std::chrono::high_resolution_clock::now();
const auto del = std::chrono::duration_cast<std::chrono::milliseconds>(ft - st);
std::cout << md << " \t: \t" << del.count() << std::endl;
}
And the test:
int main(int , char** )
{
volatile size_t l = 1000000000ULL;
size_t maxCounter = l;
auto a = std::make_shared<Foo>();
std::weak_ptr<Foo> wp = a;
pf("call via raw ptr ", [=](){
for (size_t i = 0; i < maxCounter; ++i)
{
auto p = a.get();
if (p)
{
p->bar();
}
}
});
pf("call via shared_ptr ", [=](){
for (size_t i = 0; i < maxCounter; ++i)
{
if (a)
{
a->bar();
}
}
});
pf("call via weak_ptr ", [=](){
std::shared_ptr<Foo> p;
for (size_t i = 0; i < maxCounter; ++i)
{
p = wp.lock();
if (p)
{
p->bar();
}
}
});
pf("call via shared_ptr copy", [=](){
volatile std::shared_ptr<Foo> p1 = a;
std::shared_ptr<Foo> p;
for (size_t i = 0; i < maxCounter; ++i)
{
p = const_cast<std::shared_ptr<Foo>& >(p1);
if (p)
{
p->bar();
}
}
});
pf("call via mem_fn ", [=](){
auto fff = std::mem_fn(&Foo::bar);
for (size_t i = 0; i < maxCounter; ++i)
{
fff(a.get());
}
});
return 0;
}
Results:
$ ./test
call via raw ptr : 369
call via shared_ptr : 302
call via weak_ptr : 22663
call via shared_ptr copy : 2171
call via mem_fn : 2124
End 5000000000
As you can see, weak_ptr is 10 times slower than shared_ptr with copying and std::mem_fn and 60 times slower than using raw ptr or shared_ptr.get()

In trying to reproduce your test I realised that the optimizer might be eliminating more than it should. What I did was to utilize random numbers to defeat over-optimization and these results seem realistic with std::weak_ptr being nearly three times slower than the std::shared_ptr or its raw pointer.
I calculate a checksum in each test to ensure they are all doing the same work:
#include <chrono>
#include <memory>
#include <random>
#include <vector>
#include <iomanip>
#include <iostream>
#define OUT(m) do{std::cout << m << '\n';}while(0)
class Timer
{
using clock = std::chrono::steady_clock;
using microseconds = std::chrono::microseconds;
clock::time_point tsb;
clock::time_point tse;
public:
void start() { tsb = clock::now(); }
void stop() { tse = clock::now(); }
void clear() { tsb = tse; }
friend std::ostream& operator<<(std::ostream& o, const Timer& timer)
{
return o << timer.secs();
}
// return time difference in seconds
double secs() const
{
if(tse <= tsb)
return 0.0;
auto d = std::chrono::duration_cast<microseconds>(tse - tsb);
return double(d.count()) / 1000000.0;
}
};
constexpr auto N = 100000000U;
int main()
{
std::mt19937 rnd{std::random_device{}()};
std::uniform_int_distribution<int> pick{0, 100};
std::vector<int> random_ints;
for(auto i = 0U; i < 1024; ++i)
random_ints.push_back(pick(rnd));
std::shared_ptr<int> sptr = std::make_shared<int>(std::rand() % 100);
int* rptr = sptr.get();
std::weak_ptr<int> wptr = sptr;
Timer timer;
unsigned sum = 0;
sum = 0;
timer.start();
for(auto i = 0U; i < N; ++i)
{
sum += random_ints[i % random_ints.size()] * *sptr;
}
timer.stop();
OUT("sptr: " << sum << " " << timer);
sum = 0;
timer.start();
for(auto i = 0U; i < N; ++i)
{
sum += random_ints[i % random_ints.size()] * *rptr;
}
timer.stop();
OUT("rptr: " << sum << " " << timer);
sum = 0;
timer.start();
for(auto i = 0U; i < N; ++i)
{
sum += random_ints[i % random_ints.size()] * *wptr.lock();
}
timer.stop();
OUT("wptr: " << sum << " " << timer);
}
Compiler flags:
g++ -std=c++14 -O3 -g0 -D NDEBUG -o bin/timecpp src/timecpp.cpp
Example Output:
sptr: 1367265700 1.26869 // shared pointer
rptr: 1367265700 1.26435 // raw pointer
wptr: 1367265700 2.99008 // weak pointer

Related

Gtest Text Fixture tests return almost nothing

I have two classes MultiContainert and MultiContainerTest with Fixture:
My main problem is that some of my tests work well and return results, but after the "Remove" test, the terminal does not stop and display no message and or results, so I presume it blocks. My question is that I do not see where the problems would come from? I have no message to help me or guide me.
#include <boost/multi_index_container.hpp>
#include <boost/multi_index/sequenced_index.hpp>
#include <boost/multi_index/hashed_index.hpp>
#include <boost/multi_index/key.hpp>
#include <iostream>
#include <mutex>
#include <memory>
template <typename Tkey, typename Tval>
class MultiContainer
{
private:
using value_type = std::pair<Tkey, Tval>;
unsigned int capacity;
boost::multi_index_container<
value_type,
boost::multi_index::indexed_by<
boost::multi_index::sequenced<>,
boost::multi_index::hashed_unique<boost::multi_index::key<&value_type::first>>>>
container;
std::mutex mtx;
static std::unique_ptr<MultiContainer> instance;
MultiContainer(unsigned int icapacity) : capacity(icapacity) {}
public:
static MultiContainer &getInstance(unsigned int capacity)
{
if (!instance)
instance.reset(new MultiContainer(capacity));
return *instance;
}
int size()
{
std::lock_guard<std::mutex> lock(mtx);
return container.size();
}
bool isCapacityReached()
{
std::lock_guard<std::mutex> lock(mtx);
return container.size() >= capacity;
}
void removeLRU()
{
std::lock_guard<std::mutex> lock(mtx);
if (!container.empty())
container.erase(container.template get<0>().begin());
}
/*
void removeMRU()
{
std::lock_guardstd::mutex lock(mtx);
if (!container.empty())
container.erase(container.template get<0>().end() - 1);
}
*/
bool empty()
{
std::lock_guard<std::mutex> lock(mtx);
return container.empty();
}
void clear()
{
std::lock_guard<std::mutex> lock(mtx);
container.clear();
}
bool contains(const Tkey &key)
{
std::lock_guard<std::mutex> lock(mtx);
const auto &lookup = container.template get<1>();
return lookup.find(key) != lookup.end();
}
void remove(const Tkey &key)
{
std::lock_guard<std::mutex> lock(mtx);
auto &lookup = container.template get<1>();
lookup.erase(key);
}
void put(const Tkey &key, const Tval &val)
{
if (isCapacityReached())
{
std::lock_guard<std::mutex> lock(mtx);
removeLRU();
}
auto &lookup = container.template get<1>();
auto it = lookup.find(key);
if (it != lookup.end())
{
container.relocate(container.template get<0>().begin(), container.template project<0>(it));
lookup.modify(it, [&](value_type &x)
{ x.second = val; });
}
else
{
std::lock_guard<std::mutex> lock(mtx);
it = lookup.emplace(value_type(key, val)).first;
}
}
Tval get(const Tkey &key, const Tval &default_val = Tval())
{
std::lock_guard<std::mutex> lock(mtx);
const auto &lookup = container.template get<1>();
const auto it = lookup.find(key);
if (it == lookup.end())
{
return default_val;
}
container.relocate(container.template get<0>().begin(), container.template project<0>(it));
return it->second;
}
};
template <typename Tkey, typename Tval>
std::unique_ptr<MultiContainer<Tkey, Tval>> MultiContainer<Tkey, Tval>::instance = nullptr;
int main() {
MultiContainer<std::string, std::string> &container = MultiContainer<std::string, std::string>::getInstance(3);
container.put("key1", "value1");
container.put("key2", "value2");
container.put("key3", "value3");
std::cout << "Size : " << container.size() << std::endl;
std::cout << "Capacity reached : " << container.isCapacityReached() << std::endl;
std::cout << "Contains key1 : " << container.contains("key1") << std::endl;
std::cout << "Value for key1 : " << container.get("key1") << std::endl;
container.remove("key1");
std::cout << "Size : " << container.size() << std::endl;
std::cout << "Contains key1 : " << container.contains("key1") << std::endl;
container.clear();
std::cout << "Size : " << container.size() << std::endl;
std::cout << "Empty : " << container.empty() << std::endl;
return 0;
}
Classe MultiContainerTest:
#include <../googletest/include/gtest/gtest.h>
#include "multiContainer.hpp"
#include <chrono>
#include <thread>
class MultiContainerTest : public ::testing::Test
{
protected:
MultiContainer<int, std::string> &container = MultiContainer<int, std::string>::getInstance(3);
MultiContainer<int, std::string> &container2 = MultiContainer<int, std::string>::getInstance(5);
unsigned int cache_size = 1000;
MultiContainer<int, int> &cache = MultiContainer<int, int>::getInstance(cache_size);
virtual void TearDown()
{
cache.clear();
}
~MultiContainerTest() { } // remove exception specification
};
TEST_F(MultiContainerTest, CreateInstance)
{
EXPECT_EQ(container.size(), 0);
EXPECT_FALSE(container.isCapacityReached());
}
TEST_F(MultiContainerTest, Size)
{
container.put(1, "hello");
container.put(2, "world");
EXPECT_EQ(container.size(), 2);
}
TEST_F(MultiContainerTest, CapacityReached)
{
container.put(1, "hello");
container.put(2, "world");
container.put(3, "!");
EXPECT_TRUE(container.isCapacityReached());
}
TEST_F(MultiContainerTest, RemoveLRU)
{
container.put(1, "hello");
container.put(2, "world");
container.put(3, "!");
container.removeLRU();
EXPECT_FALSE(container.contains(1));
EXPECT_EQ(container.size(), 2);
}
TEST_F(MultiContainerTest, Empty)
{
EXPECT_TRUE(container.empty());
container.put(1, "hello");
EXPECT_FALSE(container.empty());
}
TEST_F(MultiContainerTest, Clear)
{
container.put(1, "hello");
container.put(2, "world");
container.clear();
EXPECT_TRUE(container.empty());
}
TEST_F(MultiContainerTest, Contains)
{
container.put(1, "hello");
container.put(2, "world");
EXPECT_TRUE(container.contains(1));
EXPECT_FALSE(container.contains(3));
}
TEST_F(MultiContainerTest, Remove)
{
container.put(1, "hello");
container.put(2, "world");
container.remove(1);
EXPECT_FALSE(container.contains(1));
EXPECT_EQ(container.size(), 1);
}
TEST_F(MultiContainerTest, CheckSingleton)
{
EXPECT_EQ(&container, &container2);
EXPECT_EQ(container.size(), container2.size());
container.put(1, "hello");
EXPECT_EQ(container2.get(1), "hello");
}
// Hit rate: the number of times an item is found in the cache divided by the total number of lookups
TEST_F(MultiContainerTest, HitRateTest)
{
int total_lookups = 1000000;
int hits = 0;
for (int i = 0; i < total_lookups; i++)
{
int key = rand() % cache_size;
int val = rand();
cache.put(key, val);
if (cache.contains(key))
{
hits++;
}
}
double hit_rate = (double)hits / total_lookups;
EXPECT_GT(hit_rate, 0.8);
}
TEST_F(MultiContainerTest, ConcurrencyTest)
{
std::vector<std::thread> threads;
for (int i = 0; i < 10; i++)
{
threads.push_back(std::thread([&]()
{
for (int j = 0; j < 100; j++)
{
int key = rand() % cache_size;
int val = rand();
cache.put(key, val);
} }));
}
for (auto &t : threads)
{
t.join();
}
// check that all elements are in the cache
for (int i = 0; i < cache_size; i++)
{
EXPECT_TRUE(cache.contains(i));
}
}
TEST_F(MultiContainerTest, TimeComplexityTest)
{
int num_elements = 100000;
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < num_elements; i++)
{
cache.put(i, i);
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// check that the time taken to insert the elements is below a certain threshold
EXPECT_LT(duration.count(), 2000000);
}
TEST_F(MultiContainerTest, TimeComplexityGet)
{
int num_elements = 100000;
for (int i = 0; i < num_elements; i++)
{
cache.put(i, i);
}
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < num_elements; i++)
{
cache.get(i);
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// check that the time taken to get the elements is below a certain threshold
EXPECT_LT(duration.count(), 2000000);
}
TEST_F(MultiContainerTest, TimeComplexityRemove)
{
int num_elements = 100000;
for (int i = 0; i < num_elements; i++)
{
cache.put(i, i);
}
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < num_elements; i++)
{
cache.remove(i);
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// check that the time taken to remove the elements is below a certain threshold
EXPECT_LT(duration.count(), 2000000);
}
TEST_F(MultiContainerTest, TimeComplexityRemoveLRU)
{
int num_elements = 100000;
for (int i = 0; i < num_elements; i++)
{
cache.put(i, i);
}
auto start = std::chrono::high_resolution_clock::now();
for (int i = 0; i < num_elements; i++)
{
cache.removeLRU();
}
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
// check that the time taken to remove the LRU elements is below a certain threshold
EXPECT_LT(duration.count(), 2000000);
}
int main(int argc, char* argv[]) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}

Why does my boost::circular_buffer crash when accessed using an iterator?

In the following program, when I access elements from the front of the circular buffer using *(begin_iterator + index) I get a crash. But if I access the same elements using buffer[index], the crash is eliminated. (See the two commented lines below). Why does this happen?
#include <boost/circular_buffer.hpp>
#include <thread>
auto buffer = boost::circular_buffer<int>(5000);
void f1()
{
const auto N = 500;
while (true) {
// Make sure the buffer is never empty
if (buffer.size() < N+1) {
continue;
}
auto front = buffer.begin();
volatile auto j = int{};
for (auto i = 0; i < N; ++i) {
// j = *(front + i); // un-commenting this causes a crash
// j = buffer[i]; // un-commenting this doesn't
}
buffer.erase_begin(N);
}
}
void f2()
{
while (true) {
// Make sure the buffer is at most half-full.
if (buffer.size() > buffer.capacity() / 2) {
continue;
}
static auto k = 0;
buffer.push_back(k++);
}
}
int main()
{
auto t1 = std::thread{f1};
auto t2 = std::thread{f2};
t1.join();
}
You're experiencing undefined behavior because you're reading and modifying the same object unsynchronized from multiple threads.
Coding it one way or another might eliminate the crash, but the program is still wrong.
If we add a mutex, then there's no crash anymore:
#include <boost/circular_buffer.hpp>
#include <thread>
#include <mutex>
boost::circular_buffer<int> buffer(5000);
std::mutex mtx;
void f1()
{
const auto N = 500;
while (true) {
std::lock_guard<std::mutex> lock(mtx);
// Make sure the buffer is never empty
if (buffer.size() < N + 1) {
continue;
}
auto front = buffer.begin();
volatile auto j = int{};
for (auto i = 0; i < N; ++i) {
j = *(front + i); // is OK now
// j = buffer[i];
}
buffer.erase_begin(N);
}
}
void f2()
{
while (true) {
std::lock_guard<std::mutex> lock(mtx);
// Make sure the buffer is at most half-full.
if (buffer.size() > buffer.capacity() / 2) {
continue;
}
static auto k = 0;
buffer.push_back(k++);
}
}
int main()
{
auto t1 = std::thread{ f1 };
auto t2 = std::thread{ f2 };
t1.join();
}

How do I create a proper Memory Poolfor a Multithreaded vector (LFSV)?

Below is an old exercise for a class that is no longer being taught at my university (Parallel Processing). The goal is to create and use a Memory Bank to speed up the Lock-Free Sorted Vector implementation. I implemented the Memory Bank myself and the goal is to set aside enough memory to use so I do not have to use new or delete in the LFSV. I believe I need a Get() function that returns the address of the memory (not sure how keep track of the unused memory) and Store should free the memory (somehow mark it as unused).
Inside LFSV (which worked perfectly fine before my intervention), the exercise explains that I should replace the new and delete with new replacement and Store(memory we want freed). How do I go about creating the Get(if this is incorrect) or the Store function to perform like a proper memory bank? I will also take any reference or memory bank examples online that you may know of because I am having trouble finding good resources related to memory banks and multithreading.
There are no errors in this program, but it returns as a "FAIL" since I did not properly manage the memory bank.
#include <algorithm>//copy, random_shuffle
#include <ctime> //std::time (NULL) to seed srand
#include <iostream> // std::cout
#include <atomic> // std::atomic
#include <thread> // std::thread
#include <vector> // std::vector
#include <mutex> // std::mutex
#include <deque> // std::deque
class MemoryBank
{
std::deque< std::vector<int>* > slots;
public:
MemoryBank() : slots(10000)
{
for (int i = 0; i<10000; ++i)
{
slots[i] = reinterpret_cast<std::vector<int>*>(new char[sizeof(std::vector<int>)]);
}
}
~MemoryBank()
{
for (unsigned int i = 0; i < slots.size(); ++i)
{
delete slots[i];
}
slots.clear();
}
void * Get()
{
return &slots;
}
void Store(std::vector<int *> freeMemory)
{
return;
}
};
class LFSV {
std::atomic< std::vector<int>* > pdata;
std::mutex wr_mutex;
MemoryBank mb;
public:
LFSV() : mb(), pdata( new (mb.Get()) std::vector<int> ) {}
~LFSV()
{
mb.~MemoryBank();
}
void Insert( int const & v ) {
std::vector<int> *pdata_new = nullptr, *pdata_old;
int attempt = 0;
do {
++attempt;
delete pdata_new;
pdata_old = pdata;
pdata_new = new (mb.Get())std::vector<int>( *pdata_old );
std::vector<int>::iterator b = pdata_new->begin();
std::vector<int>::iterator e = pdata_new->end();
if ( b==e || v>=pdata_new->back() ) { pdata_new->push_back( v ); } //first in empty or last element
else {
for ( ; b!=e; ++b ) {
if ( *b >= v ) {
pdata_new->insert( b, v );
break;
}
}
}
// std::lock_guard< std::mutex > write_lock( wr_mutex );
// std::cout << "insert " << v << "(attempt " << attempt << ")" << std::endl;
} while ( !(this->pdata).compare_exchange_weak( pdata_old, pdata_new ));
// LEAKing pdata_old since "delete pdata_old;" will cause errors
// std::lock_guard< std::mutex > write_lock( wr_mutex );
// std::vector<int> * pdata_current = pdata;
// std::vector<int>::iterator b = pdata_current->begin();
// std::vector<int>::iterator e = pdata_current->end();
// for ( ; b!=e; ++b ) {
// std::cout << *b << ' ';
// }
// std::cout << "Size " << pdata_current->size() << " after inserting " << v << std::endl;
}
int const& operator[] ( int pos ) const {
return (*pdata)[ pos ];
}
};
LFSV lfsv;
void insert_range( int b, int e ) {
int * range = new int [e-b];
for ( int i=b; i<e; ++i ) {
range[i-b] = i;
}
std::srand( static_cast<unsigned int>(std::time (NULL)) );
std::random_shuffle( range, range+e-b );
for ( int i=0; i<e-b; ++i ) {
lfsv.Insert( range[i] );
}
delete [] range;
}
int reader( int pos, int how_many_times ) {
int j = 0;
for ( int i=1; i<how_many_times; ++i ) {
j = lfsv[pos];
}
return j;
}
std::atomic<bool> doread( true );
void read_position_0() {
int c = 0;
while ( doread.load() ) {
std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
if ( lfsv[0] != -1 ) {
std::cout << "not -1 on iteration " << c << "\n"; // see main - all element are non-negative, so index 0 should always be -1
}
++c;
}
}
void test( int num_threads, int num_per_thread )
{
std::vector<std::thread> threads;
lfsv.Insert( -1 );
std::thread reader = std::thread( read_position_0 );
for (int i=0; i<num_threads; ++i) {
threads.push_back( std::thread( insert_range, i*num_per_thread, (i+1)*num_per_thread ) );
}
for (auto& th : threads) th.join();
doread.store( false );
reader.join();
for (int i=0; i<num_threads*num_per_thread; ++i) {
// std::cout << lfsv[i] << ' ';
if ( lfsv[i] != i-1 ) {
std::cout << "Error\n";
return;
}
}
std::cout << "All good\n";
}
void test0() { test( 1, 100 ); }
void test1() { test( 2, 100 ); }
void test2() { test( 8, 100 ); }
void test3() { test( 100, 100 ); }
void (*pTests[])() = {
test0,test1,test2,test3//,test4,test5,test6,test7
};
#include <cstdio> /* sscanf */
int main( int argc, char ** argv ) {
if (argc==2) { //use test[ argv[1] ]
int test = 0;
std::sscanf(argv[1],"%i",&test);
try {
pTests[test]();
} catch( const char* msg) {
std::cerr << msg << std::endl;
}
return 0;
}
}
reinterpret_cast is really a "I know what I'm doing, trust me" cast. The compiler will - if possible - believe you.
However, in this case it's entirely wrong. new char[] does not return a vector<int>*.

C++ lambda, undefined behavior

can someone tell me, why the following program fails?
#include <iostream>
#include <functional>
#include <vector>
#include <type_traits>
using namespace std;
template <bool is_enabled = true>
class Aggregator {
public:
Aggregator(std::function<char*()> data) : data(data) { }
void fun() {
for (uint32_t i = 0; i < 500; i++)
std::cout << *((int *)data() + i) << std::endl;
funInternal();
}
private:
template<bool enabled = is_enabled>
void funInternal(typename std::enable_if<enabled>::type* = 0) {
std::cout << "Feature is enabled!" << std::endl;
}
template<bool enabled = is_enabled>
void funInternal(typename std::enable_if<!enabled>::type* = 0) {
std::cout << "Feature is disabled!" << std::endl;
}
std::function<char*()> data;
};
template <typename T>
class Multivalue : public Aggregator<true> {
public:
Multivalue() : Aggregator<true>([&]() { return (char *) v.data(); }) { }
void add(T a) { v.push_back(a); }
private:
std::vector<T> v;
};
std::vector<Multivalue<int>> m;
int main() {
for (uint32_t i = 0; i < 100; i++)
m.push_back(Multivalue<int>());
for (uint32_t i = 0; i < 100; i++)
for (uint32_t j = 0; j < 1000; j++)
m[i].add(j);
for (uint32_t i = 0; i < 100; i++)
m[i].fun();
return 0;
}
I know that the code is not really beautiful, but I want to understand the problem there. Somehow, the lambda expression seems to make trouble.
Thanks,
Moo

Odd performance issue with nested for loops

Below is the full source code you can just copy paste into Visual Studio for easy repro.
#include <Windows.h>
#include <algorithm>
#include <vector>
#include <iostream>
#include <sstream>
LARGE_INTEGER gFreq;
struct CProfileData;
// Yes, we map the pointer itself not the string, for performance reasons
std::vector<CProfileData*> gProfileData;
// simulate a draw buffer access to avoid CBlock::Draw being optimized away
float gDrawBuffer = 0;
struct CTimer
{
CTimer()
{
Reset();
}
size_t GetElapsedMicro()
{
LARGE_INTEGER now;
::QueryPerformanceCounter(&now);
return (1000000 * (now.QuadPart - m_timer.QuadPart)) / gFreq.QuadPart;
}
inline void Reset()
{
::QueryPerformanceCounter(&m_timer);
}
LARGE_INTEGER m_timer;
};
struct CProfileData
{
CProfileData() : m_hitCount(0), m_totalTime(0), m_minTime(-1),
m_maxTime(0), m_name(NULL)
{
gProfileData.push_back(this);
}
size_t m_totalTime;
size_t m_minTime;
size_t m_maxTime;
size_t m_hitCount;
const char * m_name;
};
class CSimpleProfiler
{
public:
CSimpleProfiler(const char * aLocationName, CProfileData * aData)
: m_location(aLocationName), m_data(aData)
{
::QueryPerformanceCounter(&m_clock);
}
~CSimpleProfiler()
{
CProfileData & data = *m_data;
data.m_name = m_location;
++data.m_hitCount;
LARGE_INTEGER now;
::QueryPerformanceCounter(&now);
size_t elapsed = (1000000 * (now.QuadPart - m_clock.QuadPart)) / gFreq.QuadPart;
data.m_totalTime += elapsed;
elapsed < data.m_minTime ? data.m_minTime = elapsed : true;
elapsed > data.m_maxTime ? data.m_maxTime = elapsed : true;
}
static void PrintAll()
{
std::stringstream str;
str.width(20);
str << "Location";
str.width(15);
str << "Total time";
str.width(15);
str << "Average time";
str.width(15);
str << "Hit count";
str.width(15);
str << "Min";
str.width(15);
str << "Max" << std::endl;
::OutputDebugStringA(str.str().c_str());
for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
{
CProfileData & data = **i;
std::stringstream str;
str.width(20);
str << data.m_name;
str.width(15);
str << data.m_totalTime;
str.width(15);
str << data.m_totalTime / (float)data.m_hitCount;
str.width(15);
str << data.m_hitCount;
str.width(15);
str << data.m_minTime;
str.width(15);
str << data.m_maxTime << std::endl;
::OutputDebugStringA(str.str().c_str());
}
}
static void Clear()
{
for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
{
(*i)->m_totalTime = 0;
(*i)->m_minTime = 0;
(*i)->m_maxTime = 0;
(*i)->m_hitCount = 0;
}
}
private:
LARGE_INTEGER m_clock;
const char * m_location;
CProfileData * m_data;
};
#define PROFILING_ENABLED
#ifdef PROFILING_ENABLED
#define SIMPLE_PROFILE \
static CProfileData pdata ## __LINE__; \
CSimpleProfiler p ## __LINE__(__FUNCTION__, & pdata ## __LINE__)
#define SIMPLE_PROFILE_WITH_NAME(Name) \
static CProfileData pdata ## __LINE__; \
CSimpleProfiler p ## __LINE__(Name, & pdata ## __LINE__)
#else
#define SIMPLE_PROFILE __noop
#define SIMPLE_PROFILE_WITH_NAME(Name) __noop
#endif
void InvalidateL1Cache()
{
const int size = 256 * 1024;
static char *c = (char *)malloc(size);
for (int i = 0; i < 0x0fff; i++)
for (int j = 0; j < size; j++)
c[j] = i*j;
}
int _tmain(int argc, _TCHAR* argv[])
{
::QueryPerformanceFrequency(&gFreq);
LARGE_INTEGER pc;
::QueryPerformanceCounter(&pc);
struct CBlock
{
float x;
float y;
void Draw(float aBlend)
{
for (size_t i = 0; i < 100; ++i )
gDrawBuffer += aBlend;
}
};
typedef std::vector<std::vector<CBlock>> Layer;
typedef std::vector<Layer> Layers;
Layers mBlocks;
// populate with dummy data;
mBlocks.push_back(Layer());
Layer & layer = mBlocks.back();
layer.resize(109);
srand(0); // for reprodicibility (determinism)
for (auto i = layer.begin(); i != layer.end(); ++i)
{
i->resize(25 + rand() % 10 - 5);
}
// end populating dummy data
while (1)
{
CSimpleProfiler::Clear();
float aBlend = 1.f / (rand() % 100);
{
for (auto i = mBlocks.begin(); i != mBlocks.end(); ++i)
{
for (auto j = i->begin(); j != i->end(); ++j)
{
CTimer t;
{
SIMPLE_PROFILE_WITH_NAME("Main_Draw_3");
for (auto blockIt = j->begin(); blockIt != j->end();)
{
CBlock * b = nullptr;
{
b = &*blockIt;
}
{
b->Draw(aBlend);
}
{
++blockIt;
}
}
}
if (t.GetElapsedMicro() > 1000)
{
::OutputDebugStringA("SLOWDOWN!\n");
CSimpleProfiler::PrintAll();
}
}
}
}
}
return 0;
}
I get the following profiling from time to time, expressed in microseconds:
SLOWDOWN!
Location Total time Average time Hit count Min Max
Main_Draw_3 2047 36.5536 56 0 1040
This spikes from time to time. Normally, it takes 100 microseconds for Main_Draw_3 block to finish, but it spikes to 1000 (the Max column) from time to time. What causes this?
I'm aware cache misses could play a role, but is it really that in this case?... What is happening here and how can I mitigate this?
More info:
compiler VS 2013, compiled with Maximize Speed (/O2)
I think there might be two issues:
Are you compiling with optimizations on? What are the flags?
Maybe you could increase the sample size (by doing for instance ten (or hundred, or thousand etc) runs of this code in one profiling run). The reason is that if the sample size is small, the standard deviation is very high