Why my program runs faster on 1 thread than on 8. C++

Why my program runs faster on 1 thread than on 8. C++ - c++

Please look at this code:
#include <iostream>
#include <thread>
#include <numeric>
#include <algorithm>
#include <vector>
#include <chrono>
template<typename Iterator, typename T>
struct accumulate_block
{
void operator()(Iterator begin, Iterator end, T& result)
{
result = std::accumulate(begin, end, result);
}
};
template<typename Iterator, typename T>
int accumulate_all(Iterator begin, Iterator end, T& init)
{
auto numOfThreads = std::thread::hardware_concurrency();
std::vector<std::thread> threads(numOfThreads);
auto step = std::distance(begin, end) / numOfThreads;
std::vector<int> results(numOfThreads,0);
for(int i=0; i<numOfThreads-1; ++i)
{
auto block_end = begin;
std::advance(block_end, step);
threads[i] = std::thread(accumulate_block<Iterator, T>(), begin, block_end, std::ref(results[i]));
begin = block_end;
}
threads[numOfThreads-1] = std::thread(accumulate_block<Iterator, T>(), begin, end, std::ref(results[numOfThreads-1]));
for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
return accumulate(results.begin(), results.end(), 0);
}
int main()
{
int x=0;
std::vector<int> V(20000000,1);
auto t1 = std::chrono::high_resolution_clock::now();
//std::accumulate(std::begin(V), std::end(V), x); singe threaded option
std::cout<<accumulate_all(std::begin(V), std::end(V), x);
auto t2 = std::chrono::high_resolution_clock::now();
std::cout << "process took: "
<< std::chrono::duration_cast<std::chrono::nanoseconds>(t2 - t1).count()
<< " nanoseconds\n";
return 0;
}
When I run on concurrent version (basically on 8 threads because my std::thread::hardware_concurrency(); returns 8)
the output is: process took: 8895404 nanoseconds.
But the single threaded options output is: process took: 124 nanoseconds
Can anyone explain this strange behavior??

The compiler removes the call to std::accumulate because it does not have side effects and the result is not used.
Fix:
auto sum = std::accumulate(std::begin(V), std::end(V), x); // singe threaded option
// At the very end.
std::cout << sum << '\n';

Related

thread doesnot update referenced variable

#include<iostream>
#include<vector>
#include<cstdlib>
#include<thread>
#include<array>
#include<iterator>
#include<algorithm>
#include<functional>
#include<numeric>//accumulate
int sum_of_digits(int num, int sum = 0) {
//calculate sum of digits recursively till the sum is single digit
if (num == 0) {
if (sum / 10 == 0)
return sum;
return sum_of_digits(sum);
}
return sum_of_digits(num / 10, sum + num % 10);
}
template<typename T, typename Iterator>
int sum_of_digits(Iterator begin, Iterator end) {
//not for array temp workout
T copy(std::distance(begin,end));
std::copy(begin, end, copy.begin());
std::for_each(copy.begin(), copy.end(), [](int& i) {i = sum_of_digits(i); });
return sum_of_digits(std::accumulate(begin, end, 0));
}
template<typename T, typename Iterator>
void sum_of_digits(Iterator begin, Iterator end, int& sum) {
sum = sum_of_digits<T, Iterator>(begin, end);
}
template<typename T>
int series_computation(const T& container) {
return sum_of_digits<T, typename T::const_iterator>(container.begin(), container.end());
}
#define MIN_THREAD 4
#define DEBUGG
template<typename T>
int parallel_computation(const T& container, const int min_el) {
if (container.size() < min_el) {
#ifdef DEBUGG
std::cout << "no multithreading" << std::endl;
#endif
return series_computation<T>(container);
}
const unsigned int total_el = container.size();
const unsigned int assumed_thread = total_el / min_el;
const unsigned hardwarethread = std::thread::hardware_concurrency();
const unsigned int thread_count = std::min<unsigned int>(hardwarethread == 0 ? MIN_THREAD : hardwarethread, assumed_thread) - 1;//one thread is main thread
const unsigned int el_per_thread = total_el / thread_count;
#ifdef DEBUGG
std::cout << "thread count: " << thread_count << " element per thread: " << el_per_thread << std::endl;
#endif
std::vector<std::thread> threads;
threads.reserve(thread_count);
using result_type = std::vector<int>;
result_type results(thread_count);
results.reserve(thread_count);
auto it_start = container.begin();
for (int i = 0; i < thread_count; i++) {
auto it_end = it_start;
std::advance(it_end, el_per_thread);
threads.push_back(std::thread([&]() {sum_of_digits<T, typename T::const_iterator>(it_start, it_end, std::ref(results[i])); }));
//threads.push_back( std::thread{ sum_of_digits<T,typename T::const_iterator>, it_start, it_end, std::ref(results[i]) });
it_start = it_end;
std::cout << "iterator " << i << std::endl;
}
results[thread_count - 1] = sum_of_digits<T, typename T::const_iterator>(it_start, container.end());
std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
return series_computation<T>(results);
}
#define SIZE 1000
int main() {
std::vector<int> array(SIZE);
for (auto& curr : array) {
curr = std::rand();
}
for (auto& curr : array) {
//std::cout <<curr<< std::endl;
}
int series_val = series_computation<std::vector<int>>(array);
std::cout << "series val: " << series_val << std::endl;
int parallel_val = parallel_computation<std::vector<int>>(array, 25);
std::cout << "parallel val: " << parallel_val << std::endl;
return 0;
}
I am trying to calculate the sum of digits(recursive) of a randomly generated vector using std::thread but in the results vector only the result of the last element(i.e main thread) is stored and the child thread doesnot update the referenced results[i].
What is causing this behaviour?
Also, inside the for loop of parallel_combination, this code works
threads.push_back(std::thread([&]() {sum_of_digits<T, typename T::const_iterator>(it_start, it_end, std::ref(results[i])); }));
but this doesnot and the error is: Error: '<function-style-cast>': cannot convert from 'initializer list' to 'std::thread'.
What's wrong with the below one?
threads.push_back( std::thread{ sum_of_digits<T,typename T::const_iterator>, it_start, it_end, std::ref(results[i]) });

Your code has a data race here:
threads.push_back(std::thread([&]() {sum_of_digits<T, typename T::const_iterator>(it_start, it_end, std::ref(results[i])); }));
it_start = it_end;
The lambda uses it_start and then without any synchronization you immediately modify it in the main thread. Capture it_start and it_end by-copy. Also std::ref is pointless there. You are not passing a reference to the std::thread constructor, but just to a normal direct function call. Also, as I mentioned in the comments, template arguments in a function call can be deduced:
threads.push_back(std::thread([it_start,it_end,&results](){
sum_of_digits(it_start, it_end, results[i]); }));
it_start = it_end;
threads.push_back( std::thread{ sum_of_digits<T,typename T::const_iterator>, it_start, it_end, std::ref(results[i]) });
fails, because there are two function templates of which sum_of_digits<T,typename T::const_iterator> could be a specialization. It is impossible to pass overload sets to functions and since there is no way to deduce which one you mean, it will fail.
There are some smaller issues in the code, e.g. pointless reserve after resizing/construction of vectors or
std::for_each(threads.begin(), threads.end(), std::mem_fn(&std::thread::join));
which is not guaranteed to work, because taking the address of a member function of a standard library function has unspecified behavior. Instead just use a loop:
for(auto& t : threads) {
t.join();
}
and possibly some others.

thread racing when working with vector of data in the thread function

Being an early stage c++/thread coder I am having some hard time with thread racing in one of my test functions and would truly appreciate some feedback.
My parent() function takes in as input a rather large vector of images (cv::Mat from openCV) and the task is to compute an operator on each one separately (e.g. dilation). I wrote a loop that creates threads using a worker() function and passes on each thread a subset of my input vector.
The result from each thread is to be stored on that input subset vector. My problem is that I cannot retrieve it back from within the parent().
As an alternative I passed the entire vector to worker() with start and end indices for each thread but then I run into some serious thread racing issues consuming more time than the serial approach.
Please see my code below.
std::vector<cv::Mat> worker(std::vector<cv::Mat>& ctn);
std::vector<cv::Mat> worker(std::vector<cv::Mat>& ctn) {
int erosion_type = cv::MORPH_RECT;
int erosion_size = 5;
cv::Mat element = cv::getStructuringElement( erosion_type,
cv::Size( 2*erosion_size + 1, 2*erosion_size+1 ),
cv::Point( erosion_size, erosion_size ) );
this_mutex.lock();
for(uint it=0; it<ctn.size(); ++it) {
cv::erode(ctn[it], ctn[it], element);
}
this_mutex.unlock();
return ctn;
}
void parent(std::vector<cv::Mat>& imageSet) {
auto start = std::chrono::steady_clock::now();
const auto processor_count = std::thread::hardware_concurrency();
std::vector<std::thread> threads;
const int grainsize = imageSet.size() / processor_count;
uint work_iter = 0;
std::vector<cv::Mat> target; // holds the output vector
// create the threads
for(uint it=0; it<processor_count-1; ++it) {
std::vector<cv::Mat> subvec(imageSet.begin() + work_iter, imageSet.begin() + work_iter + grainsize);
threads.emplace_back([&,it]() {
std::vector<cv::Mat> tmp = worker(subvec);
target.insert(target.end(), tmp.begin(), tmp.end());
});
work_iter += grainsize;
}
// create the last thread for the remainder of the vector elements
std::vector<cv::Mat> subvec(imageSet.begin() + work_iter, imageSet.end());
int it = processor_count-1;
threads.emplace_back([&,it]() {
std::vector<cv::Mat> tmp = worker(subvec);
target.insert(target.end(), tmp.begin(), tmp.end());
});
// join the threads
for(int i=0; i<threads.size(); ++i) {
threads[i].join();
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> elapsed_seconds = end-start;
std::cout << "elapsed time: " << elapsed_seconds.count() << "s\n";
// try to reconstruct the output
imageSet.clear();
for(int i=0; i<target.size(); ++i) {
imageSet.push_back(target[i]);
}
}
In this code the statement target.insert(target.end(), tmp.begin(), tmp.end()) is meant to concatenate the target[ ] vector with the result of each thread but it does not execute in time thus I get an empty target[] at the end.
Any ideas how to get target[] to collect all tmp[]s?

Where you thinking something like this?
This processes them all individually, but you can chunk it up however you want and return a vector from the lamda if you want.
Note: This is in C++11, since that is what you tagged. If you have access to 17, this becomes a whole lot simpler.
#include <vector>
#include <algorithm>
#include <numeric>
#include <future>
#include <iostream>
int main()
{
std::vector<int> input{0,1,2,3,4,5,6,7,8,9,10};
for(const auto& item : input)
{
std::cout << item << " ";
}
std::cout << std::endl;
std::vector<std::future<int>> threads{};
for(const auto& item : input)
{
threads.push_back(std::async(std::launch::async, [&item]{
return item * 100;
}));
}
std::vector<int> output{};
for(auto& thread : threads)
{
output.push_back(thread.get());
}
for(const auto& item : output)
{
std::cout << item << " ";
}
return 0;
}

One result (res) for each thread.
#include <iostream>
#include <thread>
#include <vector>
#include <algorithm>
#include <cassert>
void threadFunction (std::vector<int> &speeds, int start, int end, std::vector<int>& res);
int main()
{
std::vector<int> images (100000);
auto processor_count = std::thread::hardware_concurrency();
auto step = images.size() / processor_count;
auto startFrom = 0;
// one result vector (res) for each thread (t).
std::vector<std::thread>t;
std::vector<std::vector<int>>res (processor_count);
// Start the threads
for (auto i = 0; i < processor_count; ++i)
{
auto th = std::thread(threadFunction, std::ref(images), startFrom, startFrom+step, std::ref(res[i]));
t.push_back(std::move(th));
startFrom += step;
}
// Join
std::for_each(begin(t), end(t), [](std::thread &t)
{
assert(t.joinable());
t.join();
});
// Results here. Each thread puts the results in res[i];
return 0;
}
void threadFunction (std::vector<int> &images, int start, int end, std::vector<int>& res)
{
for (int i = start; i <= end; ++i)
res.push_back(images[i]);
}

Thread unable to join for_each parallel c++

I wrote a sample code to run parallel instances of for_each
I am unable to join the threads, in the below code. I am little early to concurrent programming so im not sure if i have done everything right.
template <typename Iterator, typename F>
class for_each_block
{
public :
void operator()(Iterator start, Iterator end, F f) {
cout << this_thread::get_id << endl;
this_thread::sleep_for(chrono::seconds(5));
for_each(start, end, [&](auto& x) { f(x); });
}
};
typedef unsigned const long int ucli;
template <typename Iterator, typename F>
void for_each_par(Iterator first, Iterator last, F f)
{
ucli size = distance(first, last);
if (!size)
return;
ucli min_per_thread = 4;
ucli max_threads = (size + min_per_thread - 1) / min_per_thread;
ucli hardware_threads = thread::hardware_concurrency();
ucli no_of_threads = min(max_threads, hardware_threads != 0 ? hardware_threads : 4);
ucli block_size = size / no_of_threads;
vector<thread> vf(no_of_threads);
Iterator block_start = first;
for (int i = 0; i < (no_of_threads - 1); i++)
{
Iterator end = first;
advance(end, block_size);
vf.push_back(std::move(thread(for_each_block<Iterator, F>(),first,end,f)));
first = end;
}
vf.push_back(std::move(thread(for_each_block<Iterator, F>(), first, last, f)));
cout << endl;
cout << vf.size() << endl;
for(auto& x: vf)
{
if (x.joinable())
x.join();
else
cout << "threads not joinable " << endl;
}
this_thread::sleep_for(chrono::seconds(100));
}
int main()
{
vector<int> v1 = { 1,8,12,5,4,9,20,30,40,50,10,21,34,33 };
for_each_par(v1.begin(), v1.end(), print_type<int>);
return 0;
}
In the above code i am getting threads not joinable. I have also tried with async futures still i get the same. Am i missing something here?
Any help is greatly appreciated ,
Thank you in advance ..

vector<thread> vf(no_of_threads);
This creates a vector with no_of_threads default-initialized threads. Since they're default initialized, none of them will be joinable. You probably meant to do:
vector<thread> vf;
vf.reserve(no_of_threads);
P.S.: std::move on a temporary is redundant :); consider changing this:
vf.push_back(std::move(thread(for_each_block<Iterator, F>(), first, last, f)));
to this:
vf.emplace_back(for_each_block<Iterator, F>(), first, last, f);

This may or may not be interesting. I had a go at refactoring the code to use what I think is a more idiomatic approach. I'm not saying that your approach is wrong, but since you're learning thread management I thought you may be interested in what else is possible.
Feel free to flame/question as appropriate. Comments inline:
#include <vector>
#include <chrono>
#include <thread>
#include <mutex>
#include <iomanip>
#include <future>
using namespace std;
//
// provide a means of serialising writing to a stream.
//
struct locker
{
locker() : _lock(mutex()) {}
static std::mutex& mutex() { static std::mutex m; return m; }
std::unique_lock<std::mutex> _lock;
};
std::ostream& operator<<(std::ostream& os, const locker& l) {
return os;
}
//
// fill in the missing work function
//
template<class T>
void print_type(const T& t) {
std::cout << locker() << hex << std::this_thread::get_id() << " : " << dec << t << std::endl;
}
// put this in your personable library.
// the standards committee really should have given us ranges by now...
template<class I1, class I2>
struct range_impl
{
range_impl(I1 i1, I2 i2) : _begin(i1), _end(i2) {};
auto begin() const { return _begin; }
auto end() const { return _end; }
I1 _begin;
I2 _end;
};
// distinct types because sometimes dissimilar iterators are comparable
template<class I1, class I2>
auto range(I1 i1, I2 i2) {
return range_impl<I1, I2>(i1, i2);
}
//
// lets make a helper function so we can auto-deduce template args
//
template<class Iterator, typename F>
auto make_for_each_block(Iterator start, Iterator end, F&& f)
{
// a lambda gives all the advantages of a function object with none
// of the boilerplate.
return [start, end, f = std::move(f)] {
cout << locker() << this_thread::get_id() << endl;
this_thread::sleep_for(chrono::seconds(1));
// let's keep loops simple. for_each is a bit old-skool.
for (auto& x : range(start, end)) {
f(x);
}
};
}
template <typename Iterator, typename F>
void for_each_par(Iterator first, Iterator last, F f)
{
if(auto size = distance(first, last))
{
std::size_t min_per_thread = 4;
std::size_t max_threads = (size + min_per_thread - 1) / min_per_thread;
std::size_t hardware_threads = thread::hardware_concurrency();
auto no_of_threads = min(max_threads, hardware_threads != 0 ? hardware_threads : 4);
auto block_size = size / no_of_threads;
// futures give us two benefits:
// 1. they automatically transmit exceptions
// 2. no need for if(joinable) join. get is sufficient
//
vector<future<void>> vf;
vf.reserve(no_of_threads - 1);
for (auto count = no_of_threads ; --count ; )
{
//
// I was thinking of refactoring this into std::generate_n but actually
// it was less readable.
//
auto end = std::next(first, block_size);
vf.push_back(async(launch::async, make_for_each_block(first, end, f)));
first = end;
}
cout << locker() << endl << "threads: " << vf.size() << " (+ main thread)" << endl;
//
// why spawn a thread for the remaining block? we may as well use this thread
//
/* auto partial_sum = */ make_for_each_block(first, last, f)();
// join the threads
// note that if the blocks returned a partial aggregate, we could combine them
// here by using the values in the futures.
for (auto& f : vf) f.get();
}
}
int main()
{
vector<int> v1 = { 1,8,12,5,4,9,20,30,40,50,10,21,34,33 };
for_each_par(v1.begin(), v1.end(), print_type<int>);
return 0;
}
sample output:
0x700000081000
0x700000104000
threads: 3 (+ main thread)
0x700000187000
0x100086000
0x700000081000 : 1
0x700000104000 : 5
0x700000187000 : 20
0x100086000 : 50
0x700000081000 : 8
0x700000104000 : 4
0x700000187000 : 30
0x100086000 : 10
0x700000081000 : 12
0x700000104000 : 9
0x700000187000 : 40
0x100086000 : 21
0x100086000 : 34
0x100086000 : 33
Program ended with exit code: 0
please explain std::move here: [start, end, f = std::move(f)] {...};
This is a welcome language feature that was made available in c++14. f = std::move(f) inside the capture block is equivalent to: decltype(f) new_f = std::move(f) except that the new variable is called f and not new_f. It allows us to std::move objects into lambdas rather than copy them.
For most function objects it won't matter - but some can large and this gives the compiler the opportunity to use a move rather than a copy if available.

Why would a parallel version of accumulate be so much slower?

Inspired by Antony Williams' "C++ Concurrency in Action" I took a closer look at his parallel version of std::accumulate. I copied its code from the book and added some output for debugging purposes and this is what I ended up with:
#include <algorithm>
#include <future>
#include <iostream>
#include <thread>
template <typename Iterator, typename T>
struct accumulate_block
{
T operator()(Iterator first, Iterator last)
{
return std::accumulate(first, last, T());
}
};
template <typename Iterator, typename T>
T parallel_accumulate(Iterator first, Iterator last, T init)
{
const unsigned long length = std::distance(first, last);
if (!length) return init;
const unsigned long min_per_thread = 25;
const unsigned long max_threads = (length) / min_per_thread;
const unsigned long hardware_conc = std::thread::hardware_concurrency();
const unsigned long num_threads = std::min(hardware_conc != 0 ? hardware_conc : 2, max_threads);
const unsigned long block_size = length / num_threads;
std::vector<std::future<T>> futures(num_threads - 1);
std::vector<std::thread> threads(num_threads - 1);
Iterator block_start = first;
for (unsigned long i = 0; i < (num_threads - 1); ++i)
{
Iterator block_end = block_start;
std::advance(block_end, block_size);
std::packaged_task<T(Iterator, Iterator)> task{accumulate_block<Iterator, T>()};
futures[i] = task.get_future();
threads[i] = std::thread(std::move(task), block_start, block_end);
block_start = block_end;
}
T last_result = accumulate_block<Iterator, T>()(block_start, last);
for (auto& t : threads) t.join();
T result = init;
for (unsigned long i = 0; i < (num_threads - 1); ++i) {
result += futures[i].get();
}
result += last_result;
return result;
}
template <typename TimeT = std::chrono::microseconds>
struct measure
{
template <typename F, typename... Args>
static typename TimeT::rep execution(F func, Args&&... args)
{
using namespace std::chrono;
auto start = system_clock::now();
func(std::forward<Args>(args)...);
auto duration = duration_cast<TimeT>(system_clock::now() - start);
return duration.count();
}
};
template <typename T>
T parallel(const std::vector<T>& v)
{
return parallel_accumulate(v.begin(), v.end(), 0);
}
template <typename T>
T stdaccumulate(const std::vector<T>& v)
{
return std::accumulate(v.begin(), v.end(), 0);
}
int main()
{
constexpr unsigned int COUNT = 200000000;
std::vector<int> v(COUNT);
// optional randomising vector contents - std::accumulate also gives 0us
// but custom parallel accumulate gives longer times with randomised input
std::mt19937 mersenne_engine;
std::uniform_int_distribution<int> dist(1, 100);
auto gen = std::bind(dist, mersenne_engine);
std::generate(v.begin(), v.end(), gen);
std::fill(v.begin(), v.end(), 1);
auto v2 = v; // copy to work on the same data
std::cout << "starting ... " << '\n';
std::cout << "std::accumulate : \t" << measure<>::execution(stdaccumulate<int>, v) << "us" << '\n';
std::cout << "parallel: \t" << measure<>::execution(parallel<int>, v2) << "us" << '\n';
}
What is most interesting here is that almost always I will get 0 length time from std::accumulate.
Exemplar output:
starting ...
std::accumulate : 0us
parallel:
inside1 54us
inside2 81830us
inside3 89082us
89770us
What is the problem here?
http://cpp.sh/6jbt

As is the usual case with micro-benchmarking, you need to make sure that your code is actually doing something. You're doing an accumulate, but you're not actually storing the result anywhere or doing anything with it. So do you really need to have done any of the work anyway? The compiler just snipped out all that logic in the normal case. That's why you get 0.
Just change your code to actually ensure that work needs to be done. For example:
int s, s2;
std::cout << "starting ... " << '\n';
std::cout << "std::accumulate : \t"
<< measure<>::execution([&]{s = std::accumulate(v.begin(), v.end(), 0);})
<< "us\n";
std::cout << "parallel: \t"
<< measure<>::execution([&]{s2 = parallel_accumulate(v2.begin(), v2.end(), 0);})
<< "us\n";
std::cout << s << ',' << s2 << std::endl;

What is the optimal way to concatenate two vectors whilst transforming elements of one vector?

Suppose I have
std::vector<T1> vec1 {/* filled with T1's */};
std::vector<T2> vec2 {/* filled with T2's */};
and some function T1 f(T2) which could of course be a lambda. What is the optimal way to concatenate vec1 and vec2 whilst applying f to each T2 in vec2?
The apparently obvious solution is std::transform, i.e.
vec1.reserve(vec1.size() + vec2.size());
std::transform(vec2.begin(), vec2.end(), std::back_inserter(vec1), f);
but I say this is not optimal as std::back_inserter must make an unnecessary capacity check on each inserted element. What would be optimal is something like
vec1.insert(vec1.end(), vec2.begin(), vec2.end(), f);
which could get away with a single capacity check. Sadly this is not valid C++. Essentially this is the same reason why std::vector::insert is optimal for vector concatenation, see this question and the comments in this question for further discussion on this point.
So:
Is std::transform the optimal method using the STL?
If so, can we do better?
Is there a good reason why the insert function described above was left out of the STL?
UPDATE
I've had a go at verifying if the multiple capacity checks do have any noticeable cost. To do this I basically just pass the id function (f(x) = x) to the std::transform and push_back methods discussed in the answers. The full code is:
#include <iostream>
#include <vector>
#include <iterator>
#include <algorithm>
#include <cstdint>
#include <chrono>
#include <numeric>
#include <random>
using std::size_t;
std::vector<int> generate_random_ints(size_t n)
{
std::default_random_engine generator;
auto seed1 = std::chrono::system_clock::now().time_since_epoch().count();
generator.seed((unsigned) seed1);
std::uniform_int_distribution<int> uniform {};
std::vector<int> v(n);
std::generate_n(v.begin(), n, [&] () { return uniform(generator); });
return v;
}
template <typename D=std::chrono::nanoseconds, typename F>
D benchmark(F f, unsigned num_tests)
{
D total {0};
for (unsigned i = 0; i < num_tests; ++i) {
auto start = std::chrono::system_clock::now();
f();
auto end = std::chrono::system_clock::now();
total += std::chrono::duration_cast<D>(end - start);
}
return D {total / num_tests};
}
template <typename T>
void std_insert(std::vector<T> vec1, const std::vector<T> &vec2)
{
vec1.insert(vec1.end(), vec2.begin(), vec2.end());
}
template <typename T1, typename T2, typename UnaryOperation>
void push_back_concat(std::vector<T1> vec1, const std::vector<T2> &vec2, UnaryOperation op)
{
vec1.reserve(vec1.size() + vec2.size());
for (const auto& x : vec2) {
vec1.push_back(op(x));
}
}
template <typename T1, typename T2, typename UnaryOperation>
void transform_concat(std::vector<T1> vec1, const std::vector<T2> &vec2, UnaryOperation op)
{
vec1.reserve(vec1.size() + vec2.size());
std::transform(vec2.begin(), vec2.end(), std::back_inserter(vec1), op);
}
int main(int argc, char **argv)
{
unsigned num_tests {1000};
size_t vec1_size {10000000};
size_t vec2_size {10000000};
auto vec1 = generate_random_ints(vec1_size);
auto vec2 = generate_random_ints(vec1_size);
auto f_std_insert = [&vec1, &vec2] () {
std_insert(vec1, vec2);
};
auto f_push_back_id = [&vec1, &vec2] () {
push_back_concat(vec1, vec2, [] (int i) { return i; });
};
auto f_transform_id = [&vec1, &vec2] () {
transform_concat(vec1, vec2, [] (int i) { return i; });
};
auto std_insert_time = benchmark<std::chrono::milliseconds>(f_std_insert, num_tests).count();
auto push_back_id_time = benchmark<std::chrono::milliseconds>(f_push_back_id, num_tests).count();
auto transform_id_time = benchmark<std::chrono::milliseconds>(f_transform_id, num_tests).count();
std::cout << "std_insert: " << std_insert_time << "ms" << std::endl;
std::cout << "push_back_id: " << push_back_id_time << "ms" << std::endl;
std::cout << "transform_id: " << transform_id_time << "ms" << std::endl;
return 0;
}
Compiled with:
g++ vector_insert_demo.cpp -std=c++11 -O3 -o vector_insert_demo
Output:
std_insert: 44ms
push_back_id: 61ms
transform_id: 61ms
The compiler will have inlined the lambda, so that cost can be safely be discounted. Unless anyone else has a viable explanation for these results (or is willing to check the assembly), I think it's reasonable to conclude there is a noticeable cost of the multiple capacity checks.

UPDATE: The performance difference is due to the reserve() calls, which, in libstdc++ at least, make the capacity be exactly what you request instead of using the exponential growth factor.
I did some timing tests, with interesting results. Using std::vector::insert along with boost::transform_iterator was the fastest way I found by a large margin:
Version 1:
void
appendTransformed1(
std::vector<int> &vec1,
const std::vector<float> &vec2
)
{
auto v2begin = boost::make_transform_iterator(vec2.begin(),f);
auto v2end = boost::make_transform_iterator(vec2.end(),f);
vec1.insert(vec1.end(),v2begin,v2end);
}
Version 2:
void
appendTransformed2(
std::vector<int> &vec1,
const std::vector<float> &vec2
)
{
vec1.reserve(vec1.size()+vec2.size());
for (auto x : vec2) {
vec1.push_back(f(x));
}
}
Version 3:
void
appendTransformed3(
std::vector<int> &vec1,
const std::vector<float> &vec2
)
{
vec1.reserve(vec1.size()+vec2.size());
std::transform(vec2.begin(),vec2.end(),std::inserter(vec1,vec1.end()),f);
}
Timing:
Version 1: 0.59s
Version 2: 8.22s
Version 3: 8.42s
main.cpp:
#include <algorithm>
#include <cassert>
#include <chrono>
#include <iterator>
#include <iostream>
#include <random>
#include <vector>
#include "appendtransformed.hpp"
using std::cerr;
template <typename Engine>
static std::vector<int> randomInts(Engine &engine,size_t n)
{
auto distribution = std::uniform_int_distribution<int>(0,999);
auto generator = [&]{return distribution(engine);};
auto vec = std::vector<int>();
std::generate_n(std::inserter(vec,vec.end()),n,generator);
return vec;
}
template <typename Engine>
static std::vector<float> randomFloats(Engine &engine,size_t n)
{
auto distribution = std::uniform_real_distribution<float>(0,1000);
auto generator = [&]{return distribution(engine);};
auto vec = std::vector<float>();
std::generate_n(std::inserter(vec,vec.end()),n,generator);
return vec;
}
static auto
appendTransformedFunction(int version) ->
void(*)(std::vector<int>&,const std::vector<float> &)
{
switch (version) {
case 1: return appendTransformed1;
case 2: return appendTransformed2;
case 3: return appendTransformed3;
default:
cerr << "Unknown version: " << version << "\n";
exit(EXIT_FAILURE);
}
return 0;
}
int main(int argc,char **argv)
{
if (argc!=2) {
cerr << "Usage: appendtest (1|2|3)\n";
exit(EXIT_FAILURE);
}
auto version = atoi(argv[1]);
auto engine = std::default_random_engine();
auto vec1_size = 1000000u;
auto vec2_size = 1000000u;
auto count = 100;
auto vec1 = randomInts(engine,vec1_size);
auto vec2 = randomFloats(engine,vec2_size);
namespace chrono = std::chrono;
using chrono::system_clock;
auto appendTransformed = appendTransformedFunction(version);
auto start_time = system_clock::now();
for (auto i=0; i!=count; ++i) {
appendTransformed(vec1,vec2);
}
auto end_time = system_clock::now();
assert(vec1.size() == vec1_size+count*vec2_size);
auto sum = std::accumulate(vec1.begin(),vec1.end(),0u);
auto elapsed_seconds = chrono::duration<float>(end_time-start_time).count();
cerr << "Using version " << version << ":\n";
cerr << " sum=" << sum << "\n";
cerr << " elapsed: " << elapsed_seconds << "s\n";
}
Compiler: g++ 4.9.1
Options: -std=c++11 -O2

Is std::transform the optimal method using the STL?
I can't say that. If you reserve space, the difference should be ephemeral because the check might be optimized out by either the compiler or the CPU. The only way to find out is to measure your real code.
If you don't have a particular need, you should go for std::transform.
If so, can we do better?
What you want to have:
Reduce length checks
Take advantage of move semantics when push'n_back
You might also want to create a binary function, if needed.
template <typename InputIt, typename OutputIt, typename UnaryCallable>
void move_append(InputIt first, InputIt last, OutputIt firstOut, OutputIt lastOut, UnaryCallable fn)
{
if (std::distance(first, last) < std::distance(firstOut, lastOut)
return;
while (first != last && firstOut != lastOut) {
*firstOut++ = std::move( fn(*first++) );
}
}
a call could be:
std::vector<T1> vec1 {/* filled with T1's */};
std::vector<T2> vec2 {/* filled with T2's */};
// ...
vec1.resize( vec1.size() + vec2.size() );
move_append( vec1.begin(), vec1.end(), vec2.begin(), vec2.end(), f );
I'm not sure you can do this with plain algorithms because back_inserter would call Container::push_back which will check in any case for reallocation. Also, the element won't be able to benefit from move semantics.
Note: the safety check depends on your usage, based on how you pass the elements to append. Also it should return a bool.
Some measurements here. I can't explain that big discrepancy.

I do not get the same results as #VaughnCato - although I do a slightly different test of std::string to int. According to my tests the push_back and std::transform methods are equally good, while the boost::transform method is slightly worse. Here is my full code:
UPDATE
I included another test case that instead of using reserve and back_inserter, just uses resize. This is essentially the same method as in #black's answer, and also the method suggested by #ChrisDrew in the question comments. I also performed the test 'both ways' that is std::string -> int, and int -> std::string.
#include <iostream>
#include <vector>
#include <iterator>
#include <algorithm>
#include <cstdint>
#include <chrono>
#include <numeric>
#include <random>
#include <boost/iterator/transform_iterator.hpp>
using std::size_t;
std::vector<int> generate_random_ints(size_t n)
{
std::default_random_engine generator;
auto seed1 = std::chrono::system_clock::now().time_since_epoch().count();
generator.seed((unsigned) seed1);
std::uniform_int_distribution<int> uniform {};
std::vector<int> v(n);
std::generate_n(v.begin(), n, [&] () { return uniform(generator); });
return v;
}
std::vector<std::string> generate_random_strings(size_t n)
{
std::default_random_engine generator;
auto seed1 = std::chrono::system_clock::now().time_since_epoch().count();
generator.seed((unsigned) seed1);
std::uniform_int_distribution<int> uniform {};
std::vector<std::string> v(n);
std::generate_n(v.begin(), n, [&] () { return std::to_string(uniform(generator)); });
return v;
}
template <typename D=std::chrono::nanoseconds, typename F>
D benchmark(F f, unsigned num_tests)
{
D total {0};
for (unsigned i = 0; i < num_tests; ++i) {
auto start = std::chrono::system_clock::now();
f();
auto end = std::chrono::system_clock::now();
total += std::chrono::duration_cast<D>(end - start);
}
return D {total / num_tests};
}
template <typename T1, typename T2, typename UnaryOperation>
void push_back_concat(std::vector<T1> vec1, const std::vector<T2> &vec2, UnaryOperation op)
{
vec1.reserve(vec1.size() + vec2.size());
for (const auto& x : vec2) {
vec1.push_back(op(x));
}
}
template <typename T1, typename T2, typename UnaryOperation>
void transform_concat_reserve(std::vector<T1> vec1, const std::vector<T2> &vec2, UnaryOperation op)
{
vec1.reserve(vec1.size() + vec2.size());
std::transform(vec2.begin(), vec2.end(), std::back_inserter(vec1), op);
}
template <typename T1, typename T2, typename UnaryOperation>
void transform_concat_resize(std::vector<T1> vec1, const std::vector<T2> &vec2, UnaryOperation op)
{
auto vec1_size = vec1.size();
vec1.resize(vec1.size() + vec2.size());
std::transform(vec2.begin(), vec2.end(), vec1.begin() + vec1_size, op);
}
template <typename T1, typename T2, typename UnaryOperation>
void boost_transform_concat(std::vector<T1> vec1, const std::vector<T2> &vec2, UnaryOperation op)
{
auto v2_begin = boost::make_transform_iterator(vec2.begin(), op);
auto v2_end = boost::make_transform_iterator(vec2.end(), op);
vec1.insert(vec1.end(), v2_begin, v2_end);
}
int main(int argc, char **argv)
{
unsigned num_tests {1000};
size_t vec1_size {1000000};
size_t vec2_size {1000000};
// Switch the variable names to inverse test
auto vec1 = generate_random_ints(vec1_size);
auto vec2 = generate_random_strings(vec2_size);
auto op = [] (const std::string& str) { return std::stoi(str); };
//auto op = [] (int i) { return std::to_string(i); };
auto f_push_back_concat = [&vec1, &vec2, &op] () {
push_back_concat(vec1, vec2, op);
};
auto f_transform_concat_reserve = [&vec1, &vec2, &op] () {
transform_concat_reserve(vec1, vec2, op);
};
auto f_transform_concat_resize = [&vec1, &vec2, &op] () {
transform_concat_resize(vec1, vec2, op);
};
auto f_boost_transform_concat = [&vec1, &vec2, &op] () {
boost_transform_concat(vec1, vec2, op);
};
auto push_back_concat_time = benchmark<std::chrono::milliseconds>(f_push_back_concat, num_tests).count();
auto transform_concat_reserve_time = benchmark<std::chrono::milliseconds>(f_transform_concat_reserve, num_tests).count();
auto transform_concat_resize_time = benchmark<std::chrono::milliseconds>(f_transform_concat_resize, num_tests).count();
auto boost_transform_concat_time = benchmark<std::chrono::milliseconds>(f_boost_transform_concat, num_tests).count();
std::cout << "push_back: " << push_back_concat_time << "ms" << std::endl;
std::cout << "transform_reserve: " << transform_concat_reserve_time << "ms" << std::endl;
std::cout << "transform_resize: " << transform_concat_resize_time << "ms" << std::endl;
std::cout << "boost_transform: " << boost_transform_concat_time << "ms" << std::endl;
return 0;
}
Compiled using:
g++ vector_concat.cpp -std=c++11 -O3 -o vector_concat_test
The results (mean user-times) are :
| Method | std::string -> int (ms) | int -> std::string (ms) |
|:------------------------:|:-----------------------:|:-----------------------:|
| push_back | 68 | 206 |
| std::transform (reserve) | 68 | 202 |
| std::transform (resize) | 67 | 218 |
| boost::transform | 70 | 238 |
PROVISIONAL CONCLUSION
The std::transform method using resize is likely optimal (using STL) for trivial to default-construct types.
The std::transform method using reserve and back_inserter is most likely the best we can do otherwise.

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

Why my program runs faster on 1 thread than on 8. C++ - c++

The compiler removes the call to std::accumulate because it does not have side effects and the result is not used. Fix: auto sum = std::accumulate(std::begin(V), std::end(V), x); // singe threaded option // At the very end. std::cout << sum << '\n';

Related

thread doesnot update referenced variable

thread racing when working with vector of data in the thread function

Thread unable to join for_each parallel c++

Why would a parallel version of accumulate be so much slower?

What is the optimal way to concatenate two vectors whilst transforming elements of one vector?

Categories

Resources