Why does std::atomic_flag cause a deadlock here? - c++

Why does this ping-pong example with C++20 std::atomic_flag often result in a deadlock? I am using GCC 11.1 to compile.
#include <atomic>
#include <iostream>
#include <thread>
constexpr auto count_limit = 10'000;
auto atomic_flag = std::atomic_flag{};
auto counter = std::atomic<int>{};
void ping() {
while (counter < count_limit) {
atomic_flag.wait(true);
++counter;
atomic_flag.test_and_set();
atomic_flag.notify_one();
}
}
void pong() {
while (counter < count_limit) {
atomic_flag.wait(false);
atomic_flag.clear();
atomic_flag.notify_one();
}
}
int main() {
atomic_flag.test_and_set();
{
auto const t1 = std::jthread{ping};
auto const t2 = std::jthread{pong};
}
std::cout << "Finished\n";
}
Update: The "deadlock" does not occur on the Linux machine on godbolt.org: https://godbolt.org/z/zPb8d1bca. It does not happen on my own Linux machine either. It does happen on my Windows machine, so this might be a GCC bug that is specific to Windows.

Related

Concurrent program compiled with clang runs fine, but hangs with gcc

I wrote a class to share a limited number of resources (for instance network interfaces) between a larger number of threads. The resources are pooled and, if not in use, they are borrowed out to the requesting thread, which otherwise waits on a condition_variable.
Nothing really exotic: apart from the fancy scoped_lock, which requires C++17, it should be good old C++11.
Both gcc 10.2 and clang 11 compile the test main fine, but while the latter produces an executable that does pretty much what is expected, the former hangs without consuming CPU (a deadlock?).
With the help of https://godbolt.org/ I tried older versions of gcc and also icc (passing options -O3 -std=c++17 -pthread), all reproducing the bad result, while clang confirms the proper behavior even there.
I wonder whether I made a mistake or whether the code triggers some compiler misbehavior, and in either case how to work around it.
#include <iostream>
#include <vector>
#include <stdexcept>
#include <mutex>
#include <condition_variable>
template <typename T>
class Pool {
///////////////////////////
class Borrowed {
friend class Pool<T>;
Pool<T>& pool;
const size_t id;
T * val;
public:
Borrowed(Pool & p, size_t i, T& v): pool(p), id(i), val(&v) {}
~Borrowed() { release(); }
T& get() const {
if (!val) throw std::runtime_error("Borrowed::get() this resource was collected back by the pool");
return *val;
}
void release() { pool.collect(*this); }
};
///////////////////////////
struct Resource {
T val;
bool available = true;
Resource(T v): val(std::move(v)) {}
};
///////////////////////////
std::vector<Resource> vres;
size_t hint = 0;
std::condition_variable cv;
std::mutex mtx;
size_t available_cnt;
public:
Pool(std::initializer_list<T> l): available_cnt(l.size()) {
vres.reserve(l.size());
for (T t: l) {
vres.emplace_back(std::move(t));
}
std::cout << "Pool has size " << vres.size() << std::endl;
}
~Pool() {
for ( auto & res: vres ) {
if ( ! res.available ) {
std::cerr << "WARNING Pool::~Pool resources are still in use\n";
}
}
}
Borrowed borrow() {
std::unique_lock<std::mutex> lk(mtx);
cv.wait(lk, [&](){return available_cnt > 0;});
if ( vres[hint].available ) {
// quick path, if hint points to an available resource
std::cout << "hint good" << std::endl;
vres[hint].available = false;
--available_cnt;
Borrowed b(*this, hint, vres[hint].val);
if ( hint + 1 < vres.size() ) ++hint;
return b; // <--- gcc seems to hang here
} else {
// full scan to find the available resource
std::cout << "hint bad" << std::endl;
for ( hint = 0; hint < vres.size(); ++hint ) {
if ( vres[hint].available ) {
vres[hint].available = false;
--available_cnt;
return Borrowed(*this, hint, vres[hint].val);
}
}
}
throw std::runtime_error("Pool::borrow() no resource is available - internal logic error");
}
void collect(Borrowed & b) {
if ( &(b.pool) != this )
throw std::runtime_error("Pool::collect() trying to collect resource owned by another pool!");
if ( b.val ) {
b.val = nullptr;
{
std::scoped_lock<std::mutex> lk(mtx);
hint = b.id;
vres[hint].available = true;
++available_cnt;
}
cv.notify_one();
}
}
};
///////////////////////////////////////////////////////////////////
#include <thread>
#include <chrono>
int main() {
Pool<std::string> pool{"hello","world"};
std::vector<std::thread> vt;
for (int i = 10; i > 0; --i) {
vt.emplace_back( [&pool, i]()
{
auto res = pool.borrow();
std::this_thread::sleep_for(std::chrono::milliseconds(i*300));
std::cout << res.get() << std::endl;
}
);
}
for (auto & t: vt) t.join();
return 0;
}
You're running into undefined behavior since you effectively relock an already acquired lock. With MSVC I obtained a helpful call stack that pinpoints this. Here is a working, fixed example (it works for me now; see the changes within the borrow() method. It might be redesigned further, since locking inside a destructor is questionable):
#include <iostream>
#include <vector>
#include <stdexcept>
#include <mutex>
#include <condition_variable>
template <typename T>
class Pool {
///////////////////////////
class Borrowed {
friend class Pool<T>;
Pool<T>& pool;
const size_t id;
T * val;
public:
Borrowed(Pool & p, size_t i, T& v) : pool(p), id(i), val(&v) {}
~Borrowed() { release(); }
T& get() const {
if (!val) throw std::runtime_error("Borrowed::get() this resource was collected back by the pool");
return *val;
}
void release() { pool.collect(*this); }
};
///////////////////////////
struct Resource {
T val;
bool available = true;
Resource(T v) : val(std::move(v)) {}
};
///////////////////////////
std::vector<Resource> vres;
size_t hint = 0;
std::condition_variable cv;
std::mutex mtx;
size_t available_cnt;
public:
Pool(std::initializer_list<T> l) : available_cnt(l.size()) {
vres.reserve(l.size());
for (T t : l) {
vres.emplace_back(std::move(t));
}
std::cout << "Pool has size " << vres.size() << std::endl;
}
~Pool() {
for (auto & res : vres) {
if (!res.available) {
std::cerr << "WARNING Pool::~Pool resources are still in use\n";
}
}
}
Borrowed borrow() {
std::unique_lock<std::mutex> lk(mtx);
while (available_cnt == 0) cv.wait(lk);
if (vres[hint].available) {
// quick path, if hint points to an available resource
std::cout << "hint good" << std::endl;
vres[hint].available = false;
--available_cnt;
Borrowed b(*this, hint, vres[hint].val);
if (hint + 1 < vres.size()) ++hint;
lk.unlock();
return b; // <--- gcc seems to hang here
}
else {
// full scan to find the available resource
std::cout << "hint bad" << std::endl;
for (hint = 0; hint < vres.size(); ++hint) {
if (vres[hint].available) {
vres[hint].available = false;
--available_cnt;
lk.unlock();
return Borrowed(*this, hint, vres[hint].val);
}
}
}
throw std::runtime_error("Pool::borrow() no resource is available - internal logic error");
}
void collect(Borrowed & b) {
if (&(b.pool) != this)
throw std::runtime_error("Pool::collect() trying to collect resource owned by another pool!");
if (b.val) {
b.val = nullptr;
{
std::scoped_lock<std::mutex> lk(mtx);
hint = b.id;
vres[hint].available = true;
++available_cnt;
cv.notify_one();
}
}
}
};
///////////////////////////////////////////////////////////////////
#include <thread>
#include <chrono>
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
int main()
{
try
{
Pool<std::string> pool{ "hello","world" };
std::vector<std::thread> vt;
for (int i = 10; i > 0; --i) {
vt.emplace_back([&pool, i]()
{
auto res = pool.borrow();
std::this_thread::sleep_for(std::chrono::milliseconds(i * 300));
std::cout << res.get() << std::endl;
}
);
}
for (auto & t : vt) t.join();
return 0;
}
catch(const std::exception& e)
{
std::cout << "exception occurred: " << e.what();
}
return 0;
}
A locking destructor coupled with missed NRVO caused the issue (credit to Secundi for pointing this out in the comments).
If the compiler skips NRVO, returning from the lines quoted below calls the destructor of b while the unique_lock still holds the mutex. The destructor then tries to acquire the same mutex again, resulting in a deadlock.
Borrowed b(*this, hint, vres[hint].val);
if ( hint + 1 < vres.size() ) ++hint;
return b; // <--- gcc seems to hang here
It is of crucial importance here to avoid destroying b. In fact, even though manually releasing the unique_lock before returning would avoid the deadlock, the destructor of b would still mark the pooled resource as available while it is just being borrowed out, making the code wrong.
A possible fix consists in replacing the lines above with:
const auto tmp = hint;
if ( hint + 1 < vres.size() ) ++hint;
return Borrowed(*this, tmp, vres[tmp].val);
Another possibility (which does not exclude the former) is to delete the (evil) copy ctor of Borrowed and only provide a move ctor:
Borrowed(const Borrowed &) = delete;
Borrowed(Borrowed && b): pool(b.pool), id(b.id), val(b.val) { b.val = nullptr; }
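To see why the move constructor fixes the non-NRVO path, here is a self-contained sketch (simplified, not the Pool from the question): the moved-from object ends up with a null pointer, so its destructor releases nothing and never needs the mutex.
#include <iostream>
struct Handle {
    int* res;
    explicit Handle(int* r) : res(r) {}
    Handle(const Handle&) = delete;
    Handle(Handle&& other) noexcept : res(other.res) { other.res = nullptr; }
    ~Handle() {
        if (res) std::cout << "releasing resource\n"; // runs only for the final owner
    }
};
Handle make_handle(int& slot) {
    Handle h(&slot);
    return h; // if NRVO is skipped, h is moved; the moved-from h then releases nothing
}
int main() {
    int slot = 42;
    Handle h = make_handle(slot);
} // prints "releasing resource" exactly once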

Strange atomic_flag example for c++20 at cppreference.com

The std::atomic_flag page on cppreference.com lists two spinlock examples, one for pre-C++20 and one for C++20. The last-modified date is 21 July 2020, at 12:58.
Pre-C++20:
#include <thread>
#include <vector>
#include <iostream>
#include <atomic>
std::atomic_flag lock = ATOMIC_FLAG_INIT;
void f(int n)
{
for (int cnt = 0; cnt < 100; ++cnt) {
while (lock.test_and_set(std::memory_order_acquire)) // acquire lock
; // spin
std::cout << "Output from thread " << n << '\n';
lock.clear(std::memory_order_release); // release lock
}
}
int main()
{
std::vector<std::thread> v;
for (int n = 0; n < 10; ++n) {
v.emplace_back(f, n);
}
for (auto& t : v) {
t.join();
}
}
C++20:
#include <thread>
#include <vector>
#include <iostream>
#include <atomic>
std::atomic_flag lock = ATOMIC_FLAG_INIT;
void f(int n)
{
for (int cnt = 0; cnt < 100; ++cnt) {
while (lock.test_and_set(std::memory_order_acquire)) // acquire lock
while (lock.test(std::memory_order_relaxed)) // test lock
; // spin
std::cout << "Output from thread " << n << '\n';
lock.clear(std::memory_order_release); // release lock
}
}
int main()
{
std::vector<std::thread> v;
for (int n = 0; n < 10; ++n) {
v.emplace_back(f, n);
}
for (auto& t : v) {
t.join();
}
}
Two things bother me in the C++20 example:
(1) ATOMIC_FLAG_INIT is deprecated in C++20, and the default constructor already initializes the value to false for us.
(2) The "optimization" of introducing while (lock.test(std::memory_order_relaxed)) after the flag has been set to true does not make sense to me. Shouldn't while (lock.test(std::memory_order_relaxed)) always return immediately in that case? Why is it an optimization over the pre-C++20 example, then?
Edit:
C++20 introduced test() for atomic_flag, which simply checks whether the flag is set by doing an atomic load. It is placed in the inner loop so that, after test_and_set() has failed, the thread spins on the cheap read-only test() (which, unlike a contended read-modify-write, does not keep stealing the cache line from the lock holder) and only goes back to test_and_set() once it has observed the flag cleared.
See the comment:
that edit came from stackoverflow.com/q/62318642/2945027 – Cubbi
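Putting the two points together, a sketch of the example written purely against C++20 (no ATOMIC_FLAG_INIT; the default-constructed flag starts clear, and the inner test() loop keeps waiters on a read-only spin) might look like this:
#include <atomic>
#include <iostream>
#include <thread>
#include <vector>
std::atomic_flag lock; // C++20: default constructor initializes to clear (false)
void f(int n)
{
    for (int cnt = 0; cnt < 100; ++cnt) {
        // test-and-test-and-set: retry the read-modify-write only after the
        // read-only test() has observed the flag cleared
        while (lock.test_and_set(std::memory_order_acquire))
            while (lock.test(std::memory_order_relaxed))
                ; // spin on a shared read
        std::cout << "Output from thread " << n << '\n';
        lock.clear(std::memory_order_release);
    }
}
int main()
{
    std::vector<std::thread> v;
    for (int n = 0; n < 10; ++n)
        v.emplace_back(f, n);
    for (auto& t : v)
        t.join();
}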

How to wait until all threads from the pool ends their work?

I am trying to implement simple thread pool using boost library.
Here is code:
//boost::asio::io_service ioService;
//boost::thread_group pool;
//boost::asio::io_service::work* worker;
ThreadPool::ThreadPool(int poolSize /*= boost::thread::hardware_concurrency()*/)
{
if (poolSize >= 1 && poolSize <= boost::thread::hardware_concurrency())
threadAmount = poolSize;
else
threadAmount = 1;
worker = NULL;
}
ThreadPool::~ThreadPool()
{
if (worker != NULL && !ioService.stopped())
{
_shutdown();
delete worker;
worker = NULL;
}
}
void ThreadPool::start()
{
if (worker != NULL)
{
return;
}
worker = new boost::asio::io_service::work(ioService);
for (int i = 0; i < threadAmount; ++i)
{
pool.create_thread(boost::bind(&boost::asio::io_service::run, &ioService));
}
}
template<class F, class...Args>
void ThreadPool::execute(F f, Args&&... args)
{
ioService.post(boost::bind(f, std::forward<Args>(args)...));
}
void ThreadPool::shutdown()
{
pool.interrupt_all();
_shutdown();
}
void ThreadPool::join_all()
{
// wait for all threads before continue
// in other words - barier for all threads when they finished all jobs
// and to be able re-use them in futur.
}
void ThreadPool::_shutdown()
{
ioService.reset();
ioService.stop();
}
In my program I post some tasks to the thread pool and carry on with the main thread. At some point I need to wait for all threads to finish all their tasks before I can proceed with the calculations. Is there any way to do this?
Thanks a lot.
As others have pointed out, the main culprit is the work instance.
I'd simplify the interface considerably: there's really no reason to split shutdown across shutdown, _shutdown, join_all and some extra logic in the destructor as well. That just makes it hard to know which responsibility lives where.
The interface should be a Pit Of Success: easy to use right, hard to use wrong.
At the same time that makes it much easier to implement correctly.
Here's a first stab:
Live On Coliru
#include <boost/asio.hpp>
#include <boost/thread.hpp>
namespace ba = boost::asio;
struct ThreadPool {
ThreadPool(unsigned poolSize = boost::thread::hardware_concurrency());
~ThreadPool();
void start();
template <typename F, typename... Args>
void execute(F f, Args&&... args) {
ioService.post(std::bind(f, std::forward<Args>(args)...));
}
private:
unsigned threadAmount;
ba::io_service ioService;
boost::thread_group pool;
std::unique_ptr<ba::io_service::work> work;
void shutdown();
};
ThreadPool::ThreadPool(
unsigned poolSize /*= boost::thread::hardware_concurrency()*/) {
// clamp to the range [1, hardware_concurrency()]
threadAmount = std::max(1u, std::min(poolSize, boost::thread::hardware_concurrency()));
}
ThreadPool::~ThreadPool() {
shutdown();
}
void ThreadPool::start() {
if (!work) {
work = std::make_unique<ba::io_service::work>(ioService);
for (unsigned i = 0; i < threadAmount; ++i) {
pool.create_thread(
boost::bind(&ba::io_service::run, &ioService));
}
}
}
void ThreadPool::shutdown() {
work.reset();
pool.interrupt_all();
ioService.stop();
pool.join_all();
ioService.reset();
}
#include <iostream>
using namespace std::chrono_literals;
int main() {
auto now = std::chrono::high_resolution_clock::now;
auto s = now();
{
ThreadPool p(10);
p.start();
p.execute([] { std::this_thread::sleep_for(1s); });
p.execute([] { std::this_thread::sleep_for(600ms); });
p.execute([] { std::this_thread::sleep_for(400ms); });
p.execute([] { std::this_thread::sleep_for(200ms); });
p.execute([] { std::this_thread::sleep_for(10ms); });
}
std::cout << "Total elapsed: " << (now() - s) / 1.0s << "s\n";
}
On most multi-core systems this will print something like it does on mine:
Total elapsed: 1.00064s
It looks like you had an error in calculating threadAmount: you'd end up with 1 thread whenever poolSize was larger than hardware_concurrency().
To be honest, why have the bind in the implementation at all? It really doesn't add much; you can leave it up to the caller, who can choose whether to use bind and, if so, whether it's boost::bind, std::bind or some other way of composing callables:
template <typename F>
void execute(F f) { ioService.post(f); }
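Callers then compose callables however they like, for example (do_work here is just an illustrative function, not part of the original code):
p.execute([] { do_work(1, 2); });    // plain lambda
p.execute(std::bind(do_work, 1, 2)); // std::bind, if preferred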
You're missing exception handling around the io_service::run calls (see "Should the exception thrown by boost::asio::io_service::run() be caught?").
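A sketch of how start() could guard run() so an exception thrown by a posted task does not silently kill the worker thread (the std::cerr logging is illustrative and assumes <iostream> is included):
void ThreadPool::start() {
    if (!work) {
        work = std::make_unique<ba::io_service::work>(ioService);
        for (unsigned i = 0; i < threadAmount; ++i) {
            pool.create_thread([this] {
                for (;;) {
                    try {
                        ioService.run();
                        break; // run() returned normally: no more work
                    } catch (std::exception const& e) {
                        std::cerr << "worker caught: " << e.what() << "\n";
                    }
                }
            });
        }
    }
}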
If you're using a recent Boost version, you can use the newer io_context and thread_pool interfaces, which greatly simplify things:
Live On Coliru
#include <boost/asio.hpp>
struct ThreadPool {
ThreadPool(unsigned poolSize)
: pool(std::clamp(poolSize, 1u, std::thread::hardware_concurrency()))
{ }
template <typename F>
void execute(F f) { post(pool, f); }
private:
boost::asio::thread_pool pool;
};
This still has 99% of the functionality¹, but in 10 LoC.
In fact, the class has become a trivial wrapper, so we could just write:
Live On Coliru
#include <boost/asio.hpp>
#include <iostream>
using namespace std::chrono_literals;
using C = std::chrono::high_resolution_clock;
static void sleep_for(C::duration d) { std::this_thread::sleep_for(d); }
int main() {
auto s = C::now();
{
boost::asio::thread_pool pool;
post(pool, [] { sleep_for(1s); });
post(pool, [] { sleep_for(600ms); });
// still can bind if you want
post(pool, std::bind(sleep_for, 400ms));
post(pool, std::bind(sleep_for, 200ms));
post(pool, std::bind(sleep_for, 10ms));
//pool.join(); // implicit in destructor
}
std::cout << "Total elapsed: " << (C::now() - s) / 1.0s << "s\n";
}
The main difference is the default pool size: it is 2 * hardware concurrency (and it is also calculated more defensively, because not all platforms report a reliable hardware_concurrency(); it can be zero, for example). A small sketch of choosing the size yourself follows the footnote below.
¹ It doesn't currently exercise interruption points
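If you do want to pick the pool size yourself, a small sketch that guards against a zero report (the fallback of 2 is arbitrary):
#include <boost/asio.hpp>
#include <thread>
int main() {
    unsigned n = std::thread::hardware_concurrency(); // may be 0 on some platforms
    boost::asio::thread_pool pool(n ? n : 2u);        // fall back to 2 threads if unknown
    pool.join();
}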

Why cannot my c++ thread pool accelerate my program?

I tried to implement a c++ thread pool according to some notes made by others, the code is like this:
#include <vector>
#include <queue>
#include <functional>
#include <future>
#include <atomic>
#include <condition_variable>
#include <thread>
#include <mutex>
#include <memory>
#include <glog/logging.h>
#include <iostream>
#include <chrono>
using std::cout;
using std::endl;
class ThreadPool {
public:
ThreadPool(const ThreadPool&) = delete;
ThreadPool(ThreadPool&&) = delete;
ThreadPool& operator=(const ThreadPool&) = delete;
ThreadPool& operator=(ThreadPool&&) = delete;
ThreadPool(uint32_t capacity=std::thread::hardware_concurrency(),
uint32_t n_threads=std::thread::hardware_concurrency()
): capacity(capacity), n_threads(n_threads) {
init(capacity, n_threads);
}
~ThreadPool() noexcept {
shutdown();
}
void init(uint32_t capacity, uint32_t n_threads) {
CHECK_GT(capacity, 0) << "task queue capacity should be greater than 0";
CHECK_GT(n_threads, 0) << "thread pool capacity should be greater than 0";
for (int i{0}; i < n_threads; ++i) {
pool.emplace_back(std::thread([this] {
std::function<void(void)> task;
while (!this->stop) {
{
std::unique_lock<std::mutex> lock(this->q_mutex);
task_q_empty.wait(lock, [&] {return this->stop | !task_q.empty();});
if (this->stop) break;
task = this->task_q.front();
this->task_q.pop();
task_q_full.notify_one();
}
// auto id = std::this_thread::get_id();
// std::cout << "thread id is: " << id << std::endl;
task();
}
}));
}
}
void shutdown() {
stop = true;
task_q_empty.notify_all();
task_q_full.notify_all();
for (auto& thread : pool) {
if (thread.joinable()) {
thread.join();
}
}
}
template<typename F, typename...Args>
auto submit(F&& f, Args&&... args) -> std::future<decltype(f(args...))> {
using res_type = decltype(f(args...));
std::function<res_type(void)> func = std::bind(std::forward<F>(f), std::forward<Args>(args)...);
auto task_ptr = std::make_shared<std::packaged_task<res_type()>>(func);
{
std::unique_lock<std::mutex> lock(q_mutex);
task_q_full.wait(lock, [&] {return this->stop | task_q.size() <= capacity;});
CHECK (this->stop == false) << "should not add task to stopped queue\n";
task_q.emplace([task_ptr]{(*task_ptr)();});
}
task_q_empty.notify_one();
return task_ptr->get_future();
}
private:
std::vector<std::thread> pool;
std::queue<std::function<void(void)>> task_q;
std::condition_variable task_q_full;
std::condition_variable task_q_empty;
std::atomic<bool> stop{false};
std::mutex q_mutex;
uint32_t capacity;
uint32_t n_threads;
};
int add(int a, int b) {return a + b;}
int main() {
auto t1 = std::chrono::steady_clock::now();
int n_threads = 1;
ThreadPool tp;
tp.init(n_threads, 1024);
std::vector<std::future<int>> res;
for (int i{0}; i < 1000000; ++i) {
res.push_back(tp.submit(add, i, i+1));
}
auto t2 = std::chrono::steady_clock::now();
for (auto &el : res) {
el.get();
// cout << el.get() << endl;
}
tp.shutdown();
cout << "processing: "
<< std::chrono::duration<double, std::milli>(t2 - t1).count()
<< endl;
return 0;
}
The problem is that when I set n_threads=1 the program takes the same amount of time as when I set n_threads=4. My CPU has 72 logical cores (according to htop), so I expected the 4-thread setting to be faster than the 1-thread setting. What is wrong with this implementation of the thread pool?
I found a few issues (a consolidated sketch of the worker loop with fixes 1-3 applied follows point 4):
1) Use logical OR rather than the bitwise operator in both condition-variable waits.
Replace this: `task_q_empty.wait(lock, [&] {return this->stop | !task_q.empty();});`
with: `task_q_empty.wait(lock, [&] {return this->stop || !task_q.empty();});`
2) Use notify_all() in place of notify_one() in init() and submit().
3) Two condition variables are unnecessary here; use only task_q_empty.
4) Your use case is not ideal: the overhead of waking and switching threads can outweigh adding two integers, so more threads may even mean a longer execution time. Test in optimized mode, and try a scenario like this to simulate a longer task:
int add(int a, int b) { std::this_thread::sleep_for(std::chrono::milliseconds(200)); return a + b; }
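For reference, here is a sketch of the worker lambda body with fixes 1-3 folded in (member names are taken from the question's ThreadPool; this is not a tested drop-in replacement):
while (!this->stop) {
    std::function<void(void)> task;
    {
        std::unique_lock<std::mutex> lock(this->q_mutex);
        // logical OR, and a single condition variable for both producers and workers
        task_q_empty.wait(lock, [&] { return this->stop || !task_q.empty(); });
        if (this->stop) break;
        task = std::move(this->task_q.front());
        this->task_q.pop();
    }
    // wake any producer waiting for queue space (and other workers, if needed)
    task_q_empty.notify_all();
    task();
}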

Increasing allocation performance for strings

I ported a Java GC test program to C++ (see the code below) as well as to Python. The Java and Python versions perform much better than the C++ one, and I was thinking this was due to all the calls to new needed to create the strings each time. I've tried using Boost's fast_pool_allocator, but that actually worsened performance from 700 ms to 1200 ms. Am I using the allocator wrong, or is there something else I should be doing?
EDIT: Compiled with g++ -O3 -march=native --std=c++11 garbage.cpp -lboost_system. g++ is version 4.8.1.
One iteration takes about 300 ms in Python and about 50 ms in Java. std::allocator gives about 700 ms and boost::fast_pool_allocator about 1200 ms.
#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <boost/pool/pool_alloc.hpp>
#include <memory>
//#include <gc/gc_allocator.h>
using namespace std;
#include <sstream>
typedef boost::fast_pool_allocator<char> c_allocator;
//typedef std::allocator<char> c_allocator;
typedef basic_string<char, char_traits<char>, c_allocator> pool_string;
namespace patch {
template <typename T> pool_string to_string(const T& in) {
std::basic_stringstream<char, char_traits<char>, c_allocator> stm;
stm << in;
return stm.str();
}
}
#include "mytime.hpp"
class Garbage {
public:
vector<pool_string> outer;
vector<pool_string> old;
const int nThreads = 1;
//static auto time = chrono::high_resolution_clock();
void go() {
// outer.resize(1000000);
//old.reserve(1000000);
auto tt = mytime::msecs();
for (int i = 0; i < 10; ++i) {
if (i % 100 == 0) {
cout << "DOING AN OLD" << endl;
doOld();
tt = mytime::msecs();
}
for (int j = 0; j < 1000000/nThreads; ++j)
outer.push_back(patch::to_string(j));
outer.clear();
auto t = mytime::msecs();
cout << (t - tt) << endl;
tt = t;
}
}
void doOld() {
old.clear();
for (int i = 0; i < 1000000/nThreads; ++i)
old.push_back(patch::to_string(i));
}
};
int main() {
Garbage().go();
}
The problem is that you're constructing a new string stream each time just to convert an integer.
Fix it:
namespace patch {
template <typename T> pool_string to_string(const T& in) {
return boost::lexical_cast<pool_string>(in);
}
}
Now the timings are:
DOING AN OLD
0.175462
0.0670085
0.0669926
0.0687969
0.0692518
0.0669318
0.0669196
0.0669187
0.0668962
0.0669185
real 0m0.801s
user 0m0.784s
sys 0m0.016s
See it Live On Coliru
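As an aside, if you wanted to keep the stream-based conversion, the construction cost could also be avoided by reusing one stream per thread. This is only a sketch, reusing the pool_string and c_allocator typedefs from the question, and is not benchmarked here:
namespace patch {
template <typename T> pool_string to_string(const T& in) {
    // one stream per thread, constructed once and reused
    static thread_local std::basic_stringstream<char, std::char_traits<char>, c_allocator> stm;
    stm.str(pool_string()); // drop previous contents
    stm.clear();            // reset error/eof flags
    stm << in;
    return stm.str();
}
}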
Full code of the lexical_cast version, for reference:
#include <boost/pool/pool_alloc.hpp>
#include <chrono>
#include <iostream>
#include <list>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <boost/lexical_cast.hpp>
//#include <gc/gc_allocator.h>
using string = std::string;
namespace patch {
template <typename T> string to_string(const T& in) {
return boost::lexical_cast<string>(in);
}
}
class Timer
{
typedef std::chrono::high_resolution_clock clock;
clock::time_point _start;
public:
Timer() { reset(); }
void reset() { _start = now(); }
double elapsed()
{
using namespace std::chrono;
auto e = now() - _start;
return duration_cast<nanoseconds>(e).count()*1.0e-9;
}
clock::time_point now()
{
return clock::now();
}
};
class Garbage {
public:
std::vector<string> outer;
std::vector<string> old;
const int nThreads = 1;
void go() {
outer.resize(1000000);
//old.reserve(1000000);
Timer timer;
for (int i = 0; i < 10; ++i) {
if (i % 100 == 0) {
std::cout << "DOING AN OLD" << std::endl;
doOld();
}
for (int j = 0; j < 1000000/nThreads; ++j)
outer.push_back(patch::to_string(j));
outer.clear();
std::cout << timer.elapsed() << std::endl;
timer.reset();
}
}
void doOld() {
old.clear();
for (int i = 0; i < 1000000/nThreads; ++i)
old.push_back(patch::to_string(i));
}
};
int main() {
Garbage().go();
}
Since I don't use boost on my machine, I simplified the code to use standard C++11 to_string (thus accidentally "fixing" the problem sehe found), and got this:
#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <memory>
//#include <gc/gc_allocator.h>
#include <sstream>
using namespace std;
class Timer
{
typedef std::chrono::high_resolution_clock clock;
clock::time_point _start;
public:
Timer() { reset(); }
void reset() { _start = now(); }
double elapsed()
{
using namespace std::chrono;
auto e = now() - _start;
return duration_cast<nanoseconds>(e).count()*1.0e-9;
}
clock::time_point now()
{
return clock::now();
}
};
class Garbage {
public:
vector<string> outer;
vector<string> old;
const int nThreads = 1;
Timer timer;
void go() {
// outer.resize(1000000);
//old.reserve(1000000);
for (int i = 0; i < 10; ++i) {
if (i % 100 == 0) {
cout << "DOING AN OLD" << endl;
doOld();
}
for (int j = 0; j < 1000000/nThreads; ++j)
outer.push_back(to_string(j));
outer.clear();
cout << timer.elapsed() << endl;
timer.reset();
}
}
void doOld() {
old.clear();
for (int i = 0; i < 1000000/nThreads; ++i)
old.push_back(to_string(i));
}
};
int main() {
Garbage().go();
}
Compiling with:
$ g++ -O3 -std=c++11 gc.cpp
$ ./a.out
DOING AN OLD
0.414637
0.189082
0.189143
0.186336
0.184449
0.18504
0.186302
0.186055
0.183123
0.186835
A clang 3.5 build from source dated Friday 18 April 2014 gives similar results with the same compiler options.
My processor is an AMD Phenom(tm) II X4 965, running at 3.6 GHz (if I remember correctly).