I ported a Java GC test program to C++ (see the code below) as well as Python. The Java and Python versions are much faster than the C++ one, and I was thinking this was due to all the calls to new needed to create the strings each time. I've tried using Boost's fast_pool_allocator, but that actually worsened performance from 700 ms to 1200 ms. Am I using the allocator wrong, or is there something else I should be doing?
EDIT: Compiled with g++ -O3 -march=native --std=c++11 garbage.cpp -lboost_system. g++ is version 4.8.1.
One iteration takes about 300 ms in Python and about 50 ms in Java. std::allocator gives about 700 ms and boost::fast_pool_allocator gives about 1200 ms.
#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <boost/pool/pool_alloc.hpp>
#include <memory>
#include <sstream>
//#include <gc/gc_allocator.h>

using namespace std;

typedef boost::fast_pool_allocator<char> c_allocator;
//typedef std::allocator<char> c_allocator;
typedef basic_string<char, char_traits<char>, c_allocator> pool_string;

namespace patch {
    template <typename T> pool_string to_string(const T& in) {
        std::basic_stringstream<char, char_traits<char>, c_allocator> stm;
        stm << in;
        return stm.str();
    }
}

#include "mytime.hpp"

class Garbage {
public:
    vector<pool_string> outer;
    vector<pool_string> old;
    const int nThreads = 1;
    //static auto time = chrono::high_resolution_clock();

    void go() {
        //outer.resize(1000000);
        //old.reserve(1000000);
        auto tt = mytime::msecs();
        for (int i = 0; i < 10; ++i) {
            if (i % 100 == 0) {
                cout << "DOING AN OLD" << endl;
                doOld();
                tt = mytime::msecs();
            }
            for (int j = 0; j < 1000000/nThreads; ++j)
                outer.push_back(patch::to_string(j));
            outer.clear();
            auto t = mytime::msecs();
            cout << (t - tt) << endl;
            tt = t;
        }
    }

    void doOld() {
        old.clear();
        for (int i = 0; i < 1000000/nThreads; ++i)
            old.push_back(patch::to_string(i));
    }
};

int main() {
    Garbage().go();
}
The problem is that you're constructing a new string stream each time you convert an integer.
Fix it:
namespace patch {
    template <typename T> pool_string to_string(const T& in) {
        return boost::lexical_cast<pool_string>(in);
    }
}
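This avoids constructing and tearing down a stream (along with its locale machinery) on every single conversion, which is where the time was going.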
Now the timings are:
DOING AN OLD
0.175462
0.0670085
0.0669926
0.0687969
0.0692518
0.0669318
0.0669196
0.0669187
0.0668962
0.0669185
real 0m0.801s
user 0m0.784s
sys 0m0.016s
See it Live On Coliru
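If you'd rather not depend on boost::lexical_cast, reusing one stream instead of constructing a new one per call should recover much of the same win. A sketch, untested against the original harness, using C++11 thread_local:

namespace patch {
    template <typename T> pool_string to_string(const T& in) {
        // one stream per thread, reset between conversions instead of reconstructed
        thread_local std::basic_stringstream<char, char_traits<char>, c_allocator> stm;
        stm.str(pool_string()); // drop the previous contents
        stm.clear();            // clear any error/eof flags
        stm << in;
        return stm.str();
    }
}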
Full code for reference:
#include <boost/pool/pool_alloc.hpp>
#include <chrono>
#include <iostream>
#include <list>
#include <memory>
#include <sstream>
#include <string>
#include <vector>
#include <boost/lexical_cast.hpp>
//#include <gc/gc_allocator.h>

using string = std::string;

namespace patch {
    template <typename T> string to_string(const T& in) {
        return boost::lexical_cast<string>(in);
    }
}

class Timer
{
    typedef std::chrono::high_resolution_clock clock;
    clock::time_point _start;
public:
    Timer() { reset(); }
    void reset() { _start = now(); }
    double elapsed()
    {
        using namespace std::chrono;
        auto e = now() - _start;
        return duration_cast<nanoseconds>(e).count() * 1.0e-9;
    }
    clock::time_point now()
    {
        return clock::now();
    }
};

class Garbage {
public:
    std::vector<string> outer;
    std::vector<string> old;
    const int nThreads = 1;

    void go() {
        outer.resize(1000000);
        //old.reserve(1000000);
        Timer timer;
        for (int i = 0; i < 10; ++i) {
            if (i % 100 == 0) {
                std::cout << "DOING AN OLD" << std::endl;
                doOld();
            }
            for (int j = 0; j < 1000000/nThreads; ++j)
                outer.push_back(patch::to_string(j));
            outer.clear();
            std::cout << timer.elapsed() << std::endl;
            timer.reset();
        }
    }

    void doOld() {
        old.clear();
        for (int i = 0; i < 1000000/nThreads; ++i)
            old.push_back(patch::to_string(i));
    }
};

int main() {
    Garbage().go();
}
Since I don't use boost on my machine, I simplified the code to use standard C++11 to_string (thus accidentally "fixing" the problem sehe found), and got this:
#include <string>
#include <vector>
#include <chrono>
#include <list>
#include <iostream>
#include <memory>
#include <sstream>
//#include <gc/gc_allocator.h>

using namespace std;

class Timer
{
    typedef std::chrono::high_resolution_clock clock;
    clock::time_point _start;
public:
    Timer() { reset(); }
    void reset() { _start = now(); }
    double elapsed()
    {
        using namespace std::chrono;
        auto e = now() - _start;
        return duration_cast<nanoseconds>(e).count() * 1.0e-9;
    }
    clock::time_point now()
    {
        return clock::now();
    }
};

class Garbage {
public:
    vector<string> outer;
    vector<string> old;
    const int nThreads = 1;
    Timer timer;

    void go() {
        //outer.resize(1000000);
        //old.reserve(1000000);
        for (int i = 0; i < 10; ++i) {
            if (i % 100 == 0) {
                cout << "DOING AN OLD" << endl;
                doOld();
            }
            for (int j = 0; j < 1000000/nThreads; ++j)
                outer.push_back(to_string(j));
            outer.clear();
            cout << timer.elapsed() << endl;
            timer.reset();
        }
    }

    void doOld() {
        old.clear();
        for (int i = 0; i < 1000000/nThreads; ++i)
            old.push_back(to_string(i));
    }
};

int main() {
    Garbage().go();
}
Compiling with:
$ g++ -O3 -std=c++11 gc.cpp
$ ./a.out
DOING AN OLD
0.414637
0.189082
0.189143
0.186336
0.184449
0.18504
0.186302
0.186055
0.183123
0.186835
A clang 3.5 build (trunk source from Friday, 18 April 2014) gives similar results with the same compiler options.
My processor is an AMD Phenom(tm) II X4 965, running at 3.6 GHz (if I remember right).
Related
I created test code that computes one complex matrix in parallel, on the CPU.
I observed that it takes around 3 seconds to finish all the blocks.
Can someone explain why it takes so long?
Code
Utils.hpp
#pragma once
#include <chrono>
#include <armadillo>

namespace utils
{
    class watch : std::chrono::steady_clock {
        time_point start_ = now();
    public:
        auto elapsed_sec() const { return std::chrono::duration<double>(now() - start_).count(); }
    };

    // C := A * A^H via BLAS herk (upper triangle), then mirrored into the lower triangle
    void op_herk(arma::cx_mat && A, arma::cx_mat & C)
    {
        using blas_int = int;
        using T = double;
        const char uplo = 'U';
        const char trans_A = 'N';
        const auto n = blas_int(C.n_cols);
        const auto k = blas_int(A.n_cols);
        const T local_alpha = T(1);
        const T local_beta = T(0);
        const blas_int lda = n;
        arma::blas::herk<T>(&uplo, &trans_A, &n, &k, &local_alpha, A.mem, &lda, &local_beta, C.memptr(), &n);
        arma::herk_helper::inplace_conj_copy_upper_tri_to_lower_tri(C);
    }
}
ThreadPool.hpp
#pragma once
#include <boost/thread.hpp>
#include <boost/asio.hpp>
#include <boost/asio/thread_pool.hpp>

class ThreadPool {
public:
    explicit ThreadPool(size_t size = boost::thread::hardware_concurrency()) : threadPool(size)
    { }

    template<typename F>
    void addTask(F &&f)
    {
        boost::asio::post(threadPool, std::forward<F>(f));
    }

    void wait()
    {
        threadPool.wait();
    }

    ~ThreadPool()
    {
        threadPool.join();
    }

private:
    boost::asio::thread_pool threadPool;
};
main.cpp
#include <armadillo>
#include "Utils.hpp"
#include "ThreadPool.hpp"

int main() {
    ThreadPool threadPool;
    arma::cx_mat test(256, 30000, arma::fill::randu);
    arma::vec averageTime(30, arma::fill::zeros);
    std::vector<arma::cx_mat> results(30);
    for (auto &it : results)
        it.set_size(256, 256);
    {
        for (int i = 0; i < 30; ++i)
        {
            threadPool.addTask([i = i, &results, &averageTime, test = test.submat(arma::span::all, arma::span(0, 20000)), _ = utils::watch()]() {
                utils::op_herk(test, results[i]);
                arma::vec r = arma::sort(arma::eig_sym(results[i]), "descent");
                std::cout << _.elapsed_sec() << '\n';
                averageTime[i] = _.elapsed_sec();
            });
        }
        threadPool.wait();
        std::cout << "average " << arma::sum(averageTime)/averageTime.size() << std::endl;
    }
    return 0;
}
Parameters:
gcc 9.4
computer: Intel, 6 cores / 12 threads
armadillo 10.7.3
openblas 0.3.17
CMake parameters: set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -msse2 -O3 -mtune=native -flto")
My results:
1.16084
1.16434
1.16571
1.16601
1.17055
1.17118
1.17382
1.17511
1.1767
1.17981
1.18254
1.18537
2.40071
2.40225
2.4025
2.40511
2.40545
2.40565
2.40583
2.40941
2.40972
2.40974
2.41172
2.41291
3.23446
3.23592
3.23734
3.23972
3.24305
3.24484
3.24728
average 2.14871
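One thing to note about the harness before looking for a culprit: the utils::watch in the lambda's init-capture is constructed when the task is posted, not when it starts running, so elapsed_sec() includes the time a task spends waiting in the pool's queue. That is consistent with the three plateaus in the numbers (12, 12, and 6 tasks on a 12-thread pool). A sketch that times only the computation, assuming the same harness with the watch moved inside the task body:

threadPool.addTask([i, &results, &averageTime,
                    test = test.submat(arma::span::all, arma::span(0, 20000))]() {
    utils::watch w;  // constructed when the task actually begins running
    utils::op_herk(test, results[i]);
    arma::vec r = arma::sort(arma::eig_sym(results[i]), "descent");
    averageTime[i] = w.elapsed_sec();
    std::cout << w.elapsed_sec() << '\n';
});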
Is it possible that "Main end" could be displayed before all the result.get() calls have returned in the code snippet below (under any scenario)?
Or will "Main end" always be the last thing to appear?
#include <iostream>
#include <vector>
#include <future>
#include <chrono>

using namespace std::chrono;

std::vector<std::future<int>> doParallelProcessing()
{
    std::vector<std::future<int>> v;
    for (int i = 0; i < 10; i++)
    {
        auto ret = std::async(std::launch::async, [&]() {
            std::this_thread::sleep_for(seconds(i + 5));
            return 5;
        });
        v.push_back(std::move(ret));
    }
    return v;
}

int main() {
    std::vector<std::future<int>> results;
    results = doParallelProcessing();
    for (std::future<int>& result : results)
    {
        result.get();
    }
    std::cout << "Main end\n";
    return 0;
}
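A side note on the snippet itself, independent of the ordering question: the lambda captures i with [&], and doParallelProcessing returns (destroying i) long before the sleeping workers wake up and read it, which is a dangling reference and undefined behavior. A minimal sketch of the fixed loop body, capturing i by value:

auto ret = std::async(std::launch::async, [i]() {  // capture i by value, not by reference
    std::this_thread::sleep_for(seconds(i + 5));
    return 5;
});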
I'm doing a benchmark on boost::interprocess::vector and std::vector, since I'm going to use shared memory in my program and I'm concerned about potential performance issues.
My benchmark simply accesses a vector at random indices, and it turned out that std::vector is almost 2x faster than boost::interprocess::vector.
Note: in the benchmark I only have a single process, and I don't do any synchronization manually.
I don't know where the bottleneck is... I have three guesses:
shared memory makes it slower
boost's vector has a slower implementation
boost's shared memory and its containers have some overhead somehow, i.e., if I used mmap and did things in a plain way, it would be better
What further experiments should I do to figure this out? Or can something be tuned to make it faster? Any ideas?
Here is the benchmark code:
for boost::interprocess::vector
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/vector.hpp>
#include <boost/interprocess/managed_shared_memory.hpp>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <random>
#include <string>
#include <vector>

using namespace boost::interprocess;

typedef allocator<double, managed_shared_memory::segment_manager> ShmemAllocator;
typedef vector<double, ShmemAllocator> MyVector;

const int total_size = 2000 * 2000;
const int mem_size = 2000 * 2000 * 8 * 2;
const int query_size = 100000;

int main(int argc, char *argv[]) {
    std::uniform_real_distribution<double> unif(0, 10000);
    std::default_random_engine re;

    std::vector<double> data;
    data.reserve(total_size);
    for (int i = 0; i < total_size; ++i) {
        data.push_back(unif(re));
    }

    std::vector<int> query;
    query.reserve(query_size);
    for (int i = 0; i < query_size; ++i) {
        query.push_back(rand() % total_size);
    }

    struct shm_remove {
        shm_remove() { shared_memory_object::remove("MySharedMemory"); }
        ~shm_remove() { shared_memory_object::remove("MySharedMemory"); }
    } remover;

    managed_shared_memory segment(create_only, "MySharedMemory", mem_size);
    const ShmemAllocator alloc_inst(segment.get_segment_manager());
    MyVector *myvector = segment.construct<MyVector>("MyVector")(alloc_inst);
    myvector->reserve(total_size);
    for (auto d : data) myvector->push_back(d);

    auto t1 = std::chrono::high_resolution_clock::now();
    for (auto q : query) {
        double res = (*myvector)[q];
    }
    auto t2 = std::chrono::high_resolution_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << std::endl;
    return 0;
}
for std::vector
#include <boost/interprocess/allocators/allocator.hpp>
#include <boost/interprocess/containers/vector.hpp>
#include <boost/interprocess/managed_shared_memory.hpp>
#include <chrono>
#include <cstdlib> //std::system
#include <iostream>
#include <random>
#include <string>
#include <vector>

const int total_size = 2000 * 2000;
const int mem_size = 2000 * 2000 * 8 * 8;
const int query_size = 100000;

int main(int argc, char *argv[]) {
    std::uniform_real_distribution<double> unif(0, 10000);
    std::default_random_engine re;

    std::vector<double> data;
    data.reserve(total_size);
    for (int i = 0; i < total_size; ++i) {
        data.push_back(unif(re));
    }

    std::vector<int> query;
    query.reserve(query_size);
    for (int i = 0; i < query_size; ++i) {
        query.push_back(rand() % total_size);
    }

    std::vector<double> myvector;
    myvector.reserve(total_size);
    for (auto d : data) myvector.push_back(d);

    auto t1 = std::chrono::high_resolution_clock::now();
    for (auto q : query) {
        double res = myvector[q];
    }
    auto t2 = std::chrono::high_resolution_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count() << std::endl;
    return 0;
}
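One caveat that applies to both versions: the timed loop only reads into an unused local, so at -O3 the compiler is free to delete the loop entirely and you may be timing nothing. Accumulating the reads into a value that is printed afterwards keeps the accesses alive. A minimal sketch of the adjusted loop for the std::vector version (the boost version would use (*myvector)[q]):

double sum = 0;
auto t1 = std::chrono::high_resolution_clock::now();
for (auto q : query) {
    sum += myvector[q]; // the result is used below, so the loads can't be elided
}
auto t2 = std::chrono::high_resolution_clock::now();
std::cout << std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count()
          << " (checksum: " << sum << ")" << std::endl;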
Using packaged_task, I collected all the futures in a vector. After that, I push_back the values obtained with get(). However, I get the wrong answer. Can anyone help? Thank you very much.
#define BOOST_THREAD_PROVIDES_FUTURE
#include <boost/thread/future.hpp>
#include <vector>
#include <iostream>
using namespace std;

vector<int> subFun(int n) {
    vector<int> a{ 2 * n, 3 * n };
    return a;
}

int main() {
    vector<boost::future<vector<int>>> g;
    vector<vector<int>> x(10, vector<int>(2));
    int i;
    for (i = 0; i < 10; i++) {
        boost::packaged_task<vector<int>> task{ boost::bind(&subFun, i) };
        g.push_back(task.get_future());
        boost::thread t{ std::move(task) };
    }
    for (auto& m : g) {
        x.push_back(m.get());
    }
    cout << x[3][0] << endl; // should be 6, but prints 0
    return 0;
}
The real issue is that you push_back into x, but you already initialized it with ten elements here:
vector<vector<int>> x(10, vector<int>(2));
So you just append ten more elements instead of putting the results at indices 0..9. I'd suggest not pre-filling, as in @patrick's answer, or instead filling the designated slot:
#define BOOST_THREAD_PROVIDES_FUTURE
#include <boost/thread/future.hpp>
#include <vector>
#include <iostream>
using namespace std;

void subFun(int n, vector<int>& into) {
    into = { 2 * n, 3 * n };
}

int main() {
    vector<boost::future<void>> futures;
    vector<vector<int>> x(10, vector<int>(2));

    for (size_t i = 0; i < x.size(); i++) {
        boost::packaged_task<void> task{ boost::bind(&subFun, i, std::ref(x[i])) };
        futures.push_back(task.get_future());
        boost::thread(std::move(task)).detach();
    }

    for (auto& f : futures)
        f.wait();

    cout << x[3][0] << endl;
}
Of course you can get more elaborate, returning the index together with the data:
#define BOOST_THREAD_PROVIDES_FUTURE
#include <boost/thread/future.hpp>
#include <vector>
#include <iostream>

struct TaskResult {
    int index;
    std::vector<int> data;
};

TaskResult subFun(int n) {
    return { n, { 2 * n, 3 * n } };
}

int main() {
    std::vector<boost::future<TaskResult>> futures;
    std::vector<std::vector<int>> x(10, std::vector<int>(2));

    for (size_t i = 0; i < x.size(); i++) {
        boost::packaged_task<TaskResult> task{ boost::bind(&subFun, i) };
        futures.push_back(task.get_future());
        boost::thread(std::move(task)).detach();
    }

    for (auto& f : futures) {
        auto r = f.get();
        x[r.index] = r.data;
    }

    std::cout << x[3][0] << std::endl;
}
After much tinkering, I found this program works without abort traps (which I'm surprised you weren't getting):
#include <future>
#include <thread>
#include <functional>
#include <vector>
#include <iostream>

std::vector<int> subFun(int n) {
    std::vector<int> a{ 2 * n, 3 * n };
    return a;
}

int main() {
    std::vector<std::future<std::vector<int>>> g;
    std::vector<std::vector<int>> x;
    int i;
    for (i = 0; i < 10; i++) {
        std::packaged_task<std::vector<int>(int)> task{ subFun };
        g.push_back(task.get_future());
        std::thread{ std::move(task), i }.detach();
    }
    for (auto& m : g) {
        m.wait();
        x.push_back(m.get());
    }
    std::cout << x[3][0] << std::endl; // is now 6
    return 0;
}
Convert to boost as necessary. This answer was extremely helpful in finding a couple of key issues.
I want to measure the time of the quick_sort function in my code. I used clock() and it gives 0, and I also tried chrono and it still gives me 0.
I also tried making my array large.
I don't know if my code is wrong or not!
This is my code:
#include <iostream>
#include <chrono>
#include <ctime>
#include <ratio>
using namespace std;
using namespace chrono;

void quick_sort(int *arr, int left, int right) {
    int i = left, j = right;
    int pivot = arr[(left + right) / 2];
    while (i <= j) {
        while (arr[i] < pivot)
            i++;
        while (arr[j] > pivot)
            j--;
        if (i <= j) {
            swap(arr[i], arr[j]);
            i++;
            j--;
        }
    }
    if (left < j)
        quick_sort(arr, left, j);
    if (right > i)
        quick_sort(arr, i, right);
}

int main()
{
    int arr[30] = {4,2,5,3,8,9,7,10,54,23,65,78,10,44,56,91,75,79,42,81,10,57,23,651,78,100,47,50,71,715};
    high_resolution_clock::time_point t1 = high_resolution_clock::now();
    quick_sort(arr, 0, 29);
    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    duration<double> time_span = duration_cast<duration<double>>(t2 - t1);
    cout << "it takes " << time_span.count() << " seconds" << endl;
    return 0;
}
Sorting a 30-element array finishes in well under a microsecond, so a coarse clock will round the measurement down to 0; you need a much larger input (or many repetitions) and a steady clock. Simple implementation:
#include <iostream>
#include <iomanip>
#include <string>
// benchmark
#include <limits>
#include <random>
#include <chrono>
#include <algorithm>
#include <functional>

class Clock
{
    std::chrono::time_point<std::chrono::steady_clock> _start;
public:
    static inline std::chrono::time_point<std::chrono::steady_clock> now() { return std::chrono::steady_clock::now(); }

    Clock() : _start(now())
    {
    }

    template<class DurationUnit>
    std::size_t end()
    {
        return std::chrono::duration_cast<DurationUnit>(now() - _start).count();
    }
};
Usage:
int main()
{
    {
        Clock clock;
        business();
        const double unit_time = clock.end<std::chrono::nanoseconds>();
        std::cout << std::setw(40) << "business(): " << std::setprecision(3) << unit_time << " ns\n";
    }
}
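Applied to the question's code, a sketch that assumes the Clock class above and the question's quick_sort are both in scope (the array size is arbitrary, just big enough to measure):

#include <random>
#include <vector>

int main()
{
    // one million random ints so the sort takes a measurable amount of time
    std::vector<int> arr(1000000);
    std::mt19937 gen(42);
    std::uniform_int_distribution<int> dist(0, 999999);
    for (auto& v : arr) v = dist(gen);

    Clock clock;
    quick_sort(arr.data(), 0, static_cast<int>(arr.size()) - 1);
    const auto us = clock.end<std::chrono::microseconds>();
    std::cout << "quick_sort on 1000000 ints: " << us << " us\n";
}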