I want to use std::for_each to iterate over vector indexes in range [a, b) in parallel, calculate the value of the Weierstrass function and write it to the std::vector:
std::vector<std::array<float, 2>> values(1000);
auto range = /** equivalent of Pyhthon range(0, values.size()) **/;
std::for_each(std::execution::par, range.begin(), range.end(), [&](auto &&i) {
values[i][0] = static_cast<float>(i) / resolution;
values[i][1] = weierstrass(a, b, static_cast<float>(i) / resolution);
});
// a, b, and resolution are some constants defined before
// weierstrass() is the Weierstrass function
I have found some solutions in the internet, but all of them requires to include some third-party libraries or create my own range class. Is there any standard solution for this?
You can use std::views::iota(), its use is similar (but a bit different) to Python's range(). With help of std::ranges::for_each(). Both are available in C++20.
Try it online!
#include <algorithm>
#include <ranges>
#include <iostream>
int main() {
std::ranges::for_each(std::views::iota(1, 10), [](int i) {
std::cout << i << ' ';
});
}
Output:
1 2 3 4 5 6 7 8 9
As noted by #Afshin, in code mentioned above std::ranges::for_each() doesn't support std::execution::par for multi-threaded execution.
To overcome this issue you may use iota with regular std::for_each() as following:
Try it online!
#include <algorithm>
#include <ranges>
#include <iostream>
#include <execution>
int main() {
auto range = std::views::iota(1, 10);
std::for_each(std::execution::par, range.begin(), range.end(),
[](int i) {
std::cout << i << ' ';
});
}
Output:
1 2 3 4 5 6 7 8 9
I decided to implement Range class plus iterator from scratch, according to how it works in Python's range().
Similar to Python you can use it three ways: Range(stop), Range(start, stop), Range(start, stop, step). All three support any negative value.
To test correctness of implementation I filled two unordered sets, one containing all generated values, another containing all used thread ids (to show that it actually used multi-core CPU execution).
Although I marked my iterator as random access type, still it is missing some methods like -= or -- operators, these extra methods are for further improvements. But for usage of std::for_each() it has enough methods.
If I made some mistakes of implementation please add comments to my answer with explanation.
Try it online!
#include <limits>
#include <execution>
#include <algorithm>
#include <iostream>
#include <iterator>
#include <thread>
#include <unordered_set>
#include <string>
#include <sstream>
#include <mutex>
class Range {
public:
Range(ptrdiff_t start_stop, ptrdiff_t stop =
std::numeric_limits<ptrdiff_t>::max(), ptrdiff_t step = 1)
: step_(step) {
if (stop == std::numeric_limits<ptrdiff_t>::max()) {
start_ = 0;
stop_ = start_stop;
} else {
start_ = start_stop;
stop_ = stop;
}
if (step_ >= 0)
stop_ = std::max(start_, stop_);
else
stop_ = std::min(start_, stop_);
if (step_ >= 0)
stop_ = start_ + (stop_ - start_ + step_ - 1) / step_ * step_;
else
stop_ = start_ - (start_ - stop_ + step_ - 1) / (-step_) * (-step_);
}
class RangeIter {
public:
using iterator_category = std::random_access_iterator_tag;
using value_type = ptrdiff_t;
using difference_type = ptrdiff_t;
using pointer = ptrdiff_t const *;
using reference = ptrdiff_t const &;
RangeIter() {}
RangeIter(ptrdiff_t start, ptrdiff_t stop, ptrdiff_t step)
: cur_(start), stop_(stop), step_(step) {}
RangeIter & operator += (ptrdiff_t steps) {
cur_ += step_ * steps;
if (step_ >= 0)
cur_ = std::min(cur_, stop_);
else
cur_ = std::max(cur_, stop_);
return *this;
}
RangeIter operator + (ptrdiff_t steps) const {
auto it = *this;
it += steps;
return it;
}
ptrdiff_t operator [] (ptrdiff_t steps) const {
auto it = *this;
it += steps;
return *it;
}
ptrdiff_t operator - (RangeIter const & other) const {
return (cur_ - other.cur_) / step_;
}
RangeIter & operator ++ () {
*this += 1;
return *this;
}
ptrdiff_t const & operator * () const {
return cur_;
}
bool operator == (RangeIter const & other) const {
return cur_ == other.cur_;
}
bool operator != (RangeIter const & other) const {
return !(*this == other);
}
ptrdiff_t cur_ = 0, stop_ = 0, step_ = 0;
};
auto begin() const { return RangeIter(start_, stop_, step_); }
auto end() const { return RangeIter(stop_, stop_, step_); }
private:
ptrdiff_t start_ = 0, stop_ = 0, step_ = 0;
};
int main() {
ptrdiff_t start = 1, stop = 1000000, step = 2;
std::mutex mutex;
std::unordered_set<std::string> threads;
std::unordered_set<ptrdiff_t> values;
auto range = Range(start, stop, step);
std::for_each(std::execution::par, range.begin(), range.end(),
[&](int i) {
std::unique_lock<std::mutex> lock(mutex);
std::ostringstream ss;
ss << std::this_thread::get_id();
threads.insert(ss.str());
values.insert(i);
});
std::cout << "Threads:" << std::endl;
for (auto const & s: threads)
std::cout << s << std::endl;
{
bool correct = true;
size_t cnt = 0;
for (ptrdiff_t i = start; i < stop; i += step) {
++cnt;
if (!values.count(i)) {
correct = false;
std::cout << "No value: " << i << std::endl;
break;
}
}
if (values.size() != cnt)
std::cout << "Expected amount of values: " << cnt
<< ", actual " << values.size() << std::endl;
std::cout << "Correct values: " << std::boolalpha
<< (correct && (values.size() == cnt)) << std::endl;
}
}
Output:
Threads:
1628
9628
5408
2136
2168
8636
2880
6492
1100
Correct values: true
If the problem is in creating range similar to python's range() you can look through https://en.cppreference.com/w/cpp/iterator/iterator and use it's example:
#include <iostream>
#include <algorithm>
template<long FROM, long TO>
class Range {
public:
// member typedefs provided through inheriting from std::iterator
class iterator: public std::iterator<
std::input_iterator_tag, // iterator_category
long, // value_type
long, // difference_type
const long*, // pointer
long // reference
>{
long num = FROM;
public:
explicit iterator(long _num = 0) : num(_num) {}
iterator& operator++() {num = TO >= FROM ? num + 1: num - 1; return *this;}
iterator operator++(int) {iterator retval = *this; ++(*this); return retval;}
bool operator==(iterator other) const {return num == other.num;}
bool operator!=(iterator other) const {return !(*this == other);}
reference operator*() const {return num;}
};
iterator begin() {return iterator(FROM);}
iterator end() {return iterator(TO >= FROM? TO+1 : TO-1);}
};
int main() {
// std::find requires an input iterator
auto range = Range<15, 25>();
auto itr = std::find(range.begin(), range.end(), 18);
std::cout << *itr << '\n'; // 18
// Range::iterator also satisfies range-based for requirements
for(long l : Range<3, 5>()) {
std::cout << l << ' '; // 3 4 5
}
std::cout << '\n';
}
Just as an alternative, you could make each work package carry the necessary information by adding the index you need.
Example:
std::vector<std::pair<size_t, std::array<float, 2>>> values(1000);
for(size_t i = 0; i < values.size(); ++i) values[i].first = i;
std::for_each(std::execution::par, values.begin(), values.end(),
[resolution](auto& p) {
p.second[0] = static_cast<float>(p.first) / resolution;
p.second[1] = weierstrass(a, b, static_cast<float>(p.first) / resolution);
});
Not using indexing on values inside the threaded part like above may prevent false sharing and improve performance. You could also make each work package aligned to prevent false sharing to see if that has an effect on performance.
#include <new>
struct alignas(std::hardware_destructive_interference_size) workpackage {
size_t index;
std::array<float, 2> arr;
};
std::vector<workpackage> values(1000);
for(size_t i = 0; i < values.size(); ++i) values[i].index = i;
std::for_each(std::execution::par, values.begin(), values.end(),
[resolution](auto& wp) {
wp.arr[0] = static_cast<float>(wp.index) / resolution;
wp.arr[1] = weierstrass(a, b, static_cast<float>(wp.index) / resolution);
});
You can write your code in another way and drop any need for range at all like this:
std::vector<std::array<float, 2>> values(1000);
std::for_each(std::execution::par, values.begin(), values.end(), [&](std::array<float, 2>& val) {
auto i = std::distance(&values[0], &val);
val[0] = static_cast<float>(i) / resolution;
val[1] = weierstrass(a, b, static_cast<float>(i) / resolution);
});
I should say that this code is valid if and only if you are using std::for_each, because it is stated that:
Unlike the rest of the parallel algorithms, std::for_each is not allowed to make copies of the elements in the sequence even if they are trivially copyable.
Related
I need to update a 100M-element array and would like to do it in parallel. std::for_each(std::execution::par, ...) seems great for this, except that the update needs to access elements of other arrays depending on the index that I am updating. A minimal serial working example of the kind of thing I'm trying to parallelize might look like this:
for (size_t i = 0; i < 100'000'000; i++)
d[i] = combine(d[i], s[2*i], s[2*i+1]);
I could of course manually spawn threads, but that is a lot more code than std::for_each, so it would be great to find an elegant way to do this with the standard library. So far I have found some not very elegant ways of using for_each, for instance:
Compute the index by using pointer arithmetic on the address of the array element.
Implement my own bogus iterator in the spirit of boost's counting_range.
Is there a better way to do this?
std::ranges should be able to help if you have access to c++20, you can iterate over the indexes rather than your data:
#include <ranges>
#include <vector>
#include <algorithm>
#include <iostream>
int main() {
std::vector<int> d(100);
std::ranges::iota_view indexes((size_t)0, d.size());
std::for_each(std::execution::par, indexes.begin(), indexes.end(), [&d](size_t i)
{
std::cout << i << "," << d[i] << "\n";
});
return 0;
}
You should be able to iterate over the indexes rather than the items. I think C++20 std::ranges gives you an easy way to do this, or you can use one of the Boost range methods. I'm not sure why you would consider rolling your own in the spirit of Boost counting_range when you could just, well, use Boost :-)
Having said that, I've actually opted for that roll-your-own approach, simply to make the code self-contained with neither C++20 nor Boost: feel free to replace paxrange with one of the other methods depending on your needs:
#include <iostream>
#include <algorithm>
// Seriously, just use Boost :-)
class paxrange {
public:
class iterator {
friend class paxrange;
public:
long int operator *() const { return value; }
const iterator &operator ++() { ++value; return *this; }
iterator operator ++(int) { iterator copy(*this); ++value; return copy; }
bool operator ==(const iterator &other) const { return value == other.value; }
bool operator !=(const iterator &other) const { return value != other.value; }
protected:
iterator(long int start) : value (start) { }
private:
unsigned long value;
};
iterator begin() const { return beginVal; }
iterator end() const { return endVal; }
paxrange(long int begin, long int end) : beginVal(begin), endVal(end) {}
private:
iterator beginVal;
iterator endVal;
};
int main() {
// Create a source and destination collection.
std::vector<int> s;
s.push_back(42); s.push_back(77); s.push_back(144);
s.push_back(12); s.push_back(6);
std::vector<int> d(5);
// Shows how to use indexes with multiple collections sharing index.
auto process = [s, &d](const int idx) { d[idx] = s[idx] + idx; };
paxrange x(0, d.size());
std::for_each(x.begin(), x.end(), process); // add parallelism later.
// Debug output.
for (const auto &item: s) std::cout << "< " << item << '\n';
std::cout << "=====\n";
for (const auto &item: d) std::cout << "> " << item << '\n';
}
The "meat" of the solution is the three lines in the middle of main(), where you set up a function for call-backs, one that takes the index rather than the item itself.
Inside that function, you use that index plus as many collections as needed, to set up the destination collection, very similar to what you desire.
In my case, I simply wanted the output vector to be the input vector but with the index added to each element, as per the output:
< 42
< 77
< 144
< 12
< 6
=====
> 42
> 78
> 146
> 15
> 10
There is a simple header-only library in Github which might help you.
Your minimal example can be parallelized like this. However, presumably due to cache cooling, the runtime will not scale down linearly with the number of cores.
#include "Lazy.h"
double combine(double a, double b, double c)
{
if (b > 0.5 && c < 0.4)
return a + std::exp(b * c + 1);
else if (b*c < 0.2)
return a * 0.8 + (1-c) * (1-b);
else
return std::exp(1.0 / a) + b + c;
}
// Generate index split for parallel tasks
auto getIndexPairs(std::size_t N, std::size_t numSplits)
{
std::vector<std::pair<std::size_t, std::size_t>> vecPairs(numSplits);
double dFrom = 0, dTo = 0;
for (auto i = 0; i < numSplits; ++i) {
dFrom = dTo;
dTo += N / double(numSplits);
vecPairs[i] = {std::size_t(dFrom), std::min(std::size_t(dTo), N)};
}
vecPairs[numSplits-1].second = N;
return vecPairs;
}
int main(int argc, char** argv) {
const std::size_t N = 100000000;
const std::size_t C = std::thread::hardware_concurrency(); // Number of parallel finder threads
std::vector<double> d(N);
std::vector<double> s(2*N);
// Fill d and s with some values
for (std::size_t i = 0; i < N; ++i) {
s[i] = double(i) / N;
s[i + N] = double(i + N) / N;
d[i] = N - i;
}
// Run combine(...) in parallel in C threads
Lazy::runForAll(getIndexPairs(N, C), [&](auto pr) {
for (int i=pr.first; i<pr.second; ++i)
d[i] = combine(d[i], s[2*i], s[2*i+1]);
return nullptr; // Dummy return value
});
}
#Alan Birtles answer does not work with a parallel execution policy, as it errors out to "static_assert failed: 'Parallel algorithms require forward iterators or stronger.'".
A potential alternative is to make an iterator vector, but it won't be as space-efficient.
std::vector<std::size_t> indexes(d.size());
std::iota(indexes.begin(), indexes.end(), 0);
std::for_each(std::execution::par, indexes.begin(), indexes.end(), [&](size_t i) {
std::cout << i << ',' << d[i] << '\n';
}
Coming from a Python world, I find the function std::iota very limited. Why is the interface restricted to not take any UnaryFunction ?
For instance I can convert
>>> x = range(0, 10)
into
std::vector<int> x(10);
std::iota(std::begin(x), std::end(x), 0);
But how would one do:
>>> x = range(0,20,2)
or even
>>> x = range(10,0,-1)
I know this is trivial to write one such function or use Boost, but I figured that C++ committee must have picked this design with care. So clearly I am missing something from C++11.
how about std::generate?
int n = -2;
std::generate(x.begin(), x.end(), [&n]{ return n+=2; });
int n = 10;
std::generate(x.begin(), x.end(), [&n]{ return n--;});
But how would one do:
x = range(0,20,2)
Alternatively to std::generate() (see other answer), you can provide your own unary function to std::iota(), it just have to be called operator++():
#include <iostream>
#include <functional>
#include <numeric>
#include <vector>
template<class T>
struct IotaWrapper
{
typedef T type;
typedef std::function<type(const type&)> IncrFunction;
type value;
IncrFunction incrFunction;
IotaWrapper() = delete;
IotaWrapper(const type& n, const IncrFunction& incrFunction) : value(n), incrFunction(incrFunction) {};
operator type() { return value; }
IotaWrapper& operator++() { value = incrFunction(value); return *this; }
};
int main()
{
IotaWrapper<int> n(0, [](const int& n){ return n+2; });
std::vector<int> v(10);
std::iota(v.begin(), v.end(), n);
for (auto i : v)
std::cout << i << ' ';
std::cout << std::endl;
}
Output: 0 2 4 6 8 10 12 14 16 18
Demo
Here is an idea of how one could implement Range():
struct Range
{
template<class Value, class Incr>
std::vector<Value> operator()(const Value& first, const Value& last, const Incr& increment)
{
IotaWrapper<Value> iota(first, [=](const int& n){ return n+increment; });
std::vector<Value> result((last - first) / increment);
std::iota(result.begin(), result.end(), iota);
return result;
}
};
Demo
With C++20 ranges, you can write it like this:
static auto stepped_iota(int start, int step) {
return std::ranges::views::iota(0) |
std::ranges::views::transform([=](int x) { return x * step + start; });
}
void f() {
for (int x : stepped_iota(0, 2)) { ... }
}
https://godbolt.org/z/3G49rs
Or, if you want the range to be finite:
static auto stepped_iota(int start, int end, int step) {
return std::ranges::views::iota(0, (end - start + step - 1) / step) |
std::ranges::views::transform([=](int x) { return x * step + start; });
}
What is the most efficient and standard (C++11/14) way to find the max/min item of vector of vectors?
std::vector<std::vector<double>> some_values{{5,0,8},{3,1,9}};
the wanted max element is 9
the wanted min element is 0
Here's a multi-threaded solution that returns an iterator (or throws) to the maximum for general type T (assuming operator< is defined for T). Note the most important optimisation is to perform the inner max operations on the 'columns' to exploit C++'s column-major ordering.
#include <vector>
#include <algorithm>
template <typename T>
typename std::vector<T>::const_iterator max_element(const std::vector<std::vector<T>>& values)
{
if (values.empty()) throw std::runtime_error {"values cannot be empty"};
std::vector<std::pair<typename std::vector<T>::const_iterator, bool>> maxes(values.size());
threaded_transform(values.cbegin(), values.cend(), maxes.begin(),
[] (const auto& v) {
return std::make_pair(std::max_element(v.cbegin(), v.cend()), v.empty());
});
auto it = std::remove_if(maxes.begin(), maxes.end(), [] (auto p) { return p.second; });
if (it == maxes.begin()) throw std::runtime_error {"values cannot be empty"};
return std::max_element(maxes.begin(), it,
[] (auto lhs, auto rhs) {
return *lhs.first < *rhs.first;
})->first;
}
threaded_transform is not part of the standard library (yet), but here's an implementation you could use.
#include <vector>
#include <thread>
#include <algorithm>
#include <cstddef>
template <typename InputIterator, typename OutputIterator, typename UnaryOperation>
OutputIterator threaded_transform(InputIterator first, InputIterator last, OutputIterator result, UnaryOperation op, unsigned num_threads)
{
std::size_t num_values_per_threads = std::distance(first, last) / num_threads;
std::vector<std::thread> threads;
threads.reserve(num_threads);
for (int i = 1; i <= num_threads; ++i) {
if (i == num_threads) {
threads.push_back(std::thread(std::transform<InputIterator,
OutputIterator, UnaryOperation>,
first, last, result, op));
} else {
threads.push_back(std::thread(std::transform<InputIterator,
OutputIterator, UnaryOperation>,
first, first + num_values_per_threads,
result, op));
}
first += num_values_per_threads;
result += num_values_per_threads;
}
for (auto& thread : threads) thread.join();
return result;
}
template <typename InputIterator, typename OutputIterator, typename UnaryOperation>
OutputIterator threaded_transform(InputIterator first, InputIterator last, OutputIterator result, UnaryOperation op)
{
return threaded_transform<InputIterator, OutputIterator, UnaryOperation>(first, last, result, op, std::thread::hardware_concurrency());
}
If you used a boost::multi_array<double, 2> instead of a std::vector<std::vector<double>> it would be as simple as:
auto minmax = std::minmax_element(values.data(), values.data() + values.num_elements());
Live demo.
The plain for loop way:
T max_e = std::numeric_limits<T>::min();
for(const auto& v: vv) {
for(const auto& e: v) {
max_e = std::max(max_e, e);
}
}
You must at least look at every element, so, as Anony-mouse mentioned, complexity will be at least O(n^2).
#include <vector>
#include <limits>
#include <algorithm>
int main() {
std::vector<std::vector<double>> some_values;
double max = std::numeric_limits<double>::lowest();
for (const auto& v : some_values)
{
double current_max = *std::max_element(v.cbegin(), v.cend());
max = max < current_max ? current_max : max; // max = std::max(current_max, max);
}
}
You can do it pretty easily with Eric Niebler's range-v3 library (which obviously isn't standard yet, but hopefully will be in the not-too-distant future):
vector<vector<double>> some_values{{5,0,8},{3,1,9}};
auto joined = some_values | ranges::view::join;
auto p = std::minmax_element(joined.begin(), joined.end());
p.first is an iterator to the min element; p.second to the max.
(range-v3 does have an implementation of minmax_element, but unfortunately, it requires a ForwardRange and view::join only gives me an InputRange, so I can't use it.)
Any efficient way to calculate the maximum element in a 2-D array(or vector in your case) involves a complexity of O(n^2) irrespective of what you do, as the calculation involves a comparison between n*n elements.Best way in terms of ease of use is to use std::max_element on the vector of vectors.I will not delve into details.Here is the reference.
If you create a custom iterator to iterate over all double of your vector of vector, a simple std::minmax_element do the job
iterator is something like:
class MyIterator : public std::iterator<std::random_access_iterator_tag, double>
{
public:
MyIterator() : container(nullptr), i(0), j(0) {}
MyIterator(const std::vector<std::vector<double>>& container,
std::size_t i,
std::size_t j) : container(&container), i(i), j(j)
{
// Skip empty container
if (i < container.size() && container[i].empty())
{
j = 0;
++(*this);
}
}
MyIterator(const MyIterator& rhs) = default;
MyIterator& operator = (const MyIterator& rhs) = default;
MyIterator& operator ++() {
if (++j >= (*container)[i].size()) {
do {++i;} while (i < (*container).size() && (*container)[i].empty());
j = 0;
}
return *this;
}
MyIterator operator ++(int) { auto it = *this; ++(*this); return it; }
MyIterator& operator --() {
if (j-- == 0) {
do { --i; } while (i != 0 && (*container)[i].empty());
j = (*container)[i].size();
}
return *this;
}
MyIterator operator --(int) { auto it = *this; --(*this); return it; }
double operator *() const { return (*container)[i][j]; }
bool operator == (const MyIterator& rhs) const {
return container == rhs.container && i == rhs.i && j == rhs.j;
}
bool operator != (const MyIterator& rhs) const { return !(*this == rhs); }
private:
const std::vector<std::vector<double>>* container;
std::size_t i;
std::size_t j;
};
And usage may be
// Helper functions for begin/end
MyIterator MyIteratorBegin(const std::vector<std::vector<double>>& container)
{
return MyIterator(container, 0, 0);
}
MyIterator MyIteratorEnd(const std::vector<std::vector<double>>& container)
{
return MyIterator(container, container.size(), 0);
}
int main() {
std::vector<std::vector<double>> values = {{5,0,8}, {}, {3,1,9}};
auto b = MyIteratorBegin(values);
auto e = MyIteratorEnd(values);
auto p = std::minmax_element(b, e);
if (p.first != e) {
std::cout << "min is " << *p.first << " and max is " << *p.second << std::endl;
}
}
Live example
Using the accumulate function you could write:
#include <iostream>
#include <numeric>
#include <vector>
int main()
{
std::vector<std::vector<double>> m{ {5, 0, 8}, {3, 1, 9} };
double x = std::accumulate(m.begin(), m.end(), m[0][0],
[](double max, const std::vector<double> &v)
{
return std::max(max,
*std::max_element(v.begin(),
v.end()));
});
std::cout << x << '\n';
return 0;
}
but I'd prefer the good, old for-loop.
The example can be extended to find both the min and max values:
std::accumulate(m.begin(), m.end(),
std::make_pair(m[0][0], m[0][0]),
[](std::pair<double, double> minmax, const std::vector<double> &v)
{
auto tmp(std::minmax_element(v.begin(), v.end()));
return std::make_pair(
std::min(minmax.first, *tmp.first),
std::max(minmax.second, *tmp.second));
});
(in real code you have to handle the empty-vector case)
Unfortunately a vector of vector isn't stored contiguously in memory, so you haven't a single block containing all the values (this is one of the reasons why a vector of vector isn't a good model for a matrix).
You can take advantage of a vector of vector if it contains a lot of elements.
Since each sub-vector is autonomous, you could use std::async to fill asynchronously a vector of futures containing the max value of each sub-vector.
The simplest method would be to first have a function to determine the max/min elements of one vector, say a function called:
double getMaxInVector(const vector<double>& someVec){}
Passing by reference (for reading purposes only) in this case will be a lot more time and space efficient (you don't want your function copying an entire vector). Thus in your function to determine max/min element of a vector of vectors, you would have a nested loop, such as:
for(size_t x= 0; x < some_values.size(); x++){
for(size_t y = 0; y < x.size(); y++){
// y represents the vectors inside the vector of course
// current max/min = getMax(y)
// update max/min after inner loop finishes and x increments
// by comparing it with previous max/min
The problem with the above solution is its inefficiency. From my knowledge, this algorithm will generally run on O(n^2log(n)) efficiency, which is quite unimpressive. But of course, it is still a solution. Although there might be standard algorithms that can find the max/min of a vector for you, it's always more accomplishing to write your own, and using the given will usually do nothing in terms of improving efficiency because the algorithm will generally be the same (for small functions that determine max/min). In fact, theoretically, standard functions would run marginally slower since those functions are templates which have to determine the type it is dealing with at run-time.
Lets say we have a vector named some_values, as shown below
7 4 2 0
4 8 10 8
3 6 7 6
3 9 19* 14
define a one-dimensional vector as shown below
vector<int> oneDimVector;
for(int i = 0; i < 4; i++){
for(int j = 0; j < 4; j++){
oneDimVector.push_back(some_values[i][j]);
}
}
Then find out a maximum/minimum element in that one-dimensional vector as shown below
vector<int>::iterator maxElement = max_element(oneDimVector.begin(),oneDimVector.end());
vector<int>::iterator minElement = min_element(oneDimVector.begin(),oneDimVector.end());
Now you get the max/min elements as below
cout << "Max element is " << *maxElement << endl;
cout << "Min element is " << *minElement << endl;
vector<vector<int>> vv = { vector<int>{10,12,43,58}, vector<int>{10,14,23,18}, vector<int>{28,47,12,90} };
vector<vector<int>> vv1 = { vector<int>{22,24,43,58}, vector<int>{56,17,23,18}, vector<int>{11,12,12,90} };
int matrix1_elem_sum=0;
int matrix2_elem_sum = 0;
for (size_t i = 0; i < vv.size(); i++)
{
matrix1_elem_sum += std::accumulate(vv[i].begin(), vv[i].end(), 0);
matrix2_elem_sum += std::accumulate(vv1[i].begin(), vv1[i].end(), 0);
}
cout << matrix1_elem_sum <<endl;
cout << matrix2_elem_sum << endl;
int summ = matrix1_elem_sum + matrix2_elem_sum;
cout << summ << endl;
or optimazed variant:
vector<vector<int>> vv = { vector<int>{10,12,43,58}, vector<int>{10,14,23,18}, vector<int>{28,47,12,90} };
vector<vector<int>> vv1 = { vector<int>{22,24,43,58}, vector<int>{56,17,23,18}, vector<int>{11,12,12,90} };
int summ=0;
int matrix2_elem_sum = 0;
for (size_t i = 0; i < vv.size(); i++)
{
summ += std::accumulate(vv[i].begin(), vv[i].end(), 0)+ std::accumulate(vv1[i].begin(), vv1[i].end(), 0);
}
cout << summ << endl;
}
I have two lists of pointers to a data structure X, the algorithm is very simple:
It loops over the first list A and try to find the the first matching element in list B. The requirement is to have at least 50k elements in each list:
#include <iostream>
#include <memory>
#include <chrono>
#include <vector>
#include <algorithm>
#include <string>
struct X {
std::string field_1;
std::string field_2;
std::string field_3;
std::string field_4;
X(std::string f1, std::string f2, std::string f3, std::string f4)
: field_1(f1)
, field_2(f2)
, field_3(f3)
, field_4(f4)
{};
bool equal(const std::shared_ptr<X>& x) {
return (x->field_1 == field_1) &&
(x->field_2 == field_2) &&
(x->field_3 == field_3) &&
(x->field_4 == field_4);
};
X *match = nullptr;
};
typedef std::shared_ptr<X> X_ptr;
class Timer
{
public:
Timer(std::string name) : beg_(clock_::now()), name_(name) {}
~Timer() {
std::cout << "Elapsed(" << name_ << "): " << elapsed() << std::endl;
}
void reset() { beg_ = clock_::now(); }
double elapsed() const {
return std::chrono::duration_cast<second_>
(clock_::now() - beg_).count();
}
private:
typedef std::chrono::high_resolution_clock clock_;
typedef std::chrono::duration<double, std::ratio<1> > second_;
std::chrono::time_point<clock_> beg_;
std::string name_;
};
std::string random_string(size_t length)
{
auto randchar = []() -> char
{
const char charset[] =
"0123456789"
"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
const size_t max_index = (sizeof(charset) - 1);
return charset[rand() % max_index];
};
std::string str(length, 0);
std::generate_n(str.begin(), length, randchar);
return str;
}
int main()
{
Timer t("main");
std::vector <X_ptr> list_A;
std::vector <X_ptr> list_B;
const int MAX_ELEM = 50000;
list_A.reserve(MAX_ELEM);
list_B.reserve(MAX_ELEM);
{
Timer t("insert");
for (int i = 0; i < MAX_ELEM; i++) {
list_A.push_back(X_ptr(new X{ random_string(2), random_string(2), random_string(2), random_string(2) }));
list_B.push_back(X_ptr(new X{ random_string(2), random_string(2), random_string(2), random_string(2) }));
}
}
{
Timer t("match");
std::for_each(list_A.begin(), list_A.end(), [list_B](X_ptr& a) {
auto found_b = std::find_if(list_B.begin(), list_B.end(), [a](const X_ptr& b) {
return a->equal(b);
});
if (found_b != list_B.end()) {
a->match = found_b->get();
std::cout << "match OK \n";
}
});
}
}
on my machine the program is running extremly slow:
Elapsed(insert): 0.05566
Elapsed(match): 98.3739
Elapsed(main): 98.452
Would appreciate it if you can think of any other way to optimize it to run faster.
You are using vectors so each lookup into list_B takes O(n), where n is the number of elements in B. This means the total algorithm is O(m*n), if m is the number of elements in list_A. Thus if m and n a similar in size, you have a O(n^2) algorithm. That is too slow for any large n. To fix this, convert list_B into a unordered_map, (you can do this as part of this algorithm as the conversion is O(n)) where an element in the map's key is an element from list B and the value anything, say 0. You can then perform lookups into the map in O(1) time using find() on the map. Thus your algorithm becomes O(n), way better that O(n^2).
For example
std::unordered_map< X_ptr, int > value_map;
Time r t("match");
std::for_each(list_B.begin(), list_B.end(), [&](X_ptr& b) {
value_map[b] = 0;
});
std::for_each(list_A.begin(), list_A.end(), [value_map](X_ptr& a) {
auto found_b = value_map.find( a );
if ( found_b != value_map.end() )
{
a->match = found_b->first.get();
std::cout << "match OK \n";
}
});
}
Your Version:
Elapsed(insert): 0.0758608
Elapsed(match): 182.899
Elapsed(main): 182.991
New Version:
Elapsed(insert): 0.0719907
Elapsed(match): 0.0388562
Elapsed(main): 0.130884
You may use something like the following:
std::sort(list_B.begin(), list_B.end(), deref_less<X>);
{
Timer t("match");
for (const auto& a : list_A) {
auto it = std::lower_bound(list_B.begin(), list_B.end(), a, deref_less<X>);
if (it != list_B.end() && **it == *a) {
a->match = it->get();
std::cout << "match OK \n";
}
}
}
Live example.
I've written an indirect radix sort algorithm in C++ (by indirect, I mean it returns the indices of the items):
#include <algorithm>
#include <iterator>
#include <vector>
template<class It1, class It2>
void radix_ipass(
It1 begin, It1 const end,
It2 const a, size_t const i,
std::vector<std::vector<size_t> > &buckets)
{
size_t ncleared = 0;
for (It1 j = begin; j != end; ++j)
{
size_t const k = a[*j][i];
while (k >= ncleared && ncleared < buckets.size())
{ buckets[ncleared++].clear(); }
if (k >= buckets.size())
{
buckets.resize(k + 1);
ncleared = buckets.size();
}
buckets[k].push_back(size_t());
using std::swap; swap(buckets[k].back(), *j);
}
for (std::vector<std::vector<size_t> >::iterator
j = buckets.begin(); j != buckets.begin() + ncleared; j->clear(), ++j)
{
begin = std::swap_ranges(j->begin(), j->end(), begin);
}
}
template<class It, class It2>
void radix_isort(It const begin, It const end, It2 const items)
{
for (ptrdiff_t i = 0; i != end - begin; ++i) { items[i] = i; }
size_t smax = 0;
for (It i = begin; i != end; ++i)
{
size_t const n = i->size();
smax = n > smax ? n : smax;
}
std::vector<std::vector<size_t> > buckets;
for (size_t i = 0; i != smax; ++i)
{
radix_ipass(
items, items + (end - begin),
begin, smax - i - 1, buckets);
}
}
It seems to perform around 40% faster than std::sort when I test it with the following code (3920 ms compared to 6530 ms):
#include <functional>
template<class Key>
struct key_comp : public Key
{
explicit key_comp(Key const &key = Key()) : Key(key) { }
template<class T>
bool operator()(T const &a, T const &b) const
{ return this->Key::operator()(a) < this->Key::operator()(b); }
};
template<class Key>
key_comp<Key> make_key_comp(Key const &key) { return key_comp<Key>(key); }
template<class T1, class T2>
struct add : public std::binary_function<T1, T2, T1>
{ T1 operator()(T1 a, T2 const &b) const { return a += b; } };
template<class F>
struct deref : public F
{
deref(F const &f) : F(f) { }
typename std::iterator_traits<
typename F::result_type
>::value_type const
&operator()(typename F::argument_type const &a) const
{ return *this->F::operator()(a); }
};
template<class T> deref<T> make_deref(T const &t) { return deref<T>(t); }
size_t xorshf96(void) // random number generator
{
static size_t x = 123456789, y = 362436069, z = 521288629;
x ^= x << 16;
x ^= x >> 5;
x ^= x << 1;
size_t t = x;
x = y;
y = z;
z = t ^ x ^ y;
return z;
}
#include <stdio.h>
#include <time.h>
#include <array>
int main(void)
{
typedef std::vector<std::array<size_t, 3> > Items;
Items items(1 << 24);
std::vector<size_t> ranks(items.size() * 2);
for (size_t i = 0; i != items.size(); i++)
{
ranks[i] = i;
for (size_t j = 0; j != items[i].size(); j++)
{ items[i][j] = xorshf96() & 0xFFF; }
}
clock_t const start = clock();
if (1) { radix_isort(items.begin(), items.end(), ranks.begin()); }
else // STL sorting
{
std::sort(
ranks.begin(),
ranks.begin() + items.size(),
make_key_comp(make_deref(std::bind1st(
add<Items::const_iterator, ptrdiff_t>(),
items.begin()))));
}
printf("%u ms\n",
(unsigned)((clock() - start) * 1000 / CLOCKS_PER_SEC),
std::min(ranks.begin(), ranks.end()));
return 0;
}
Hmm, I guess that's the best I can do, I thought.
But after lots of banging my head against the wall, I realized that prefetching in the beginning of radix_ipass can help cut down the result to 1440 ms (!):
#include <xmmintrin.h>
...
for (It1 j = begin; j != end; ++j)
{
#if defined(_MM_TRANSPOSE4_PS) // should be defined if xmmintrin.h is included
enum { N = 8 };
if (end - j > N)
{ _mm_prefetch((char const *)(&a[j[N]][i]), _MM_HINT_T0); }
#endif
...
}
Clearly, the bottleneck is the memory bandwidth---the access pattern is unpredictable.
So now my question is: what else can I do to make it even faster on similar amounts of data?
Or is there not much room left for improvement?
(I'm hoping to avoid compromising the readability of the code if possible, so if the readability is harmed, the improvement should be significant.)
Using a more compact data structure that combines ranks and values can boost the performance of std::sort by a factor 2-3. Essentially, the sort now runs on a vector<pair<Value,Rank>>. The Value data type, std::array<integer_type, 3> has been replaced for this by a more compact pair<uint32_t, uint8_t> data structure. Only half a byte of it is unused, and the < comparison can by done in two steps, first using a presumably efficient comparison of uint32_ts (it's not clear if the loop used by std::array<..>::operator< can be optimized to a similarly fast code, but the replacement of std::array<integer_type,3> by this data structure yielded another performance boost).
Still, it doesn't get as efficient as the radix sort. (Maybe you could tweak a custom QuickSort with prefetches?)
Besides that additional sorting method, I've replaced the xorshf96 by a mt19937, because I know how to provide a seed for the latter ;)
The seed and the number of values can be changed via two command-line arguments: first the seed, then the count.
Compiled with g++ 4.9.0 20131022, using -std=c++11 -march=native -O3, for a 64-bit linux
Sample runs; important note running on a Core2Duo processor U9400 (old & slow!)
item count: 16000000
using std::sort
duration: 12260 ms
result sorted: true
seed: 5648
item count: 16000000
using std::sort
duration: 12230 ms
result sorted: true
seed: 5648
item count: 16000000
using std::sort
duration: 12230 ms
result sorted: true
seed: 5648
item count: 16000000
using std::sort with a packed data structure
duration: 4290 ms
result sorted: true
seed: 5648
item count: 16000000
using std::sort with a packed data structure
duration: 4270 ms
result sorted: true
seed: 5648
item count: 16000000
using std::sort with a packed data structure
duration: 4280 ms
result sorted: true
item count: 16000000
using radix sort
duration: 3790 ms
result sorted: true
seed: 5648
item count: 16000000
using radix sort
duration: 3820 ms
result sorted: true
seed: 5648
item count: 16000000
using radix sort
duration: 3780 ms
result sorted: true
New or changed code:
template<class It>
struct fun_obj
{
It beg;
bool operator()(ptrdiff_t lhs, ptrdiff_t rhs)
{
return beg[lhs] < beg[rhs];
}
};
template<class It>
fun_obj<It> make_fun_obj(It beg)
{
return fun_obj<It>{beg};
}
struct uint32p8_t
{
uint32_t m32;
uint8_t m8;
uint32p8_t(std::array<uint16_t, 3> const& a)
: m32( a[0]<<(32-3*4) | a[1]<<(32-2*3*4) | (a[2]&0xF00)>>8)
, m8( a[2]&0xFF )
{
}
operator std::array<size_t, 3>() const
{
return {{m32&0xFFF00000 >> (32-3*4), m32&0x000FFF0 >> (32-2*3*4),
(m32&0xF)<<8 | m8}};
}
friend bool operator<(uint32p8_t const& lhs, uint32p8_t const& rhs)
{
if(lhs.m32 < rhs.m32) return true;
if(lhs.m32 > rhs.m32) return false;
return lhs.m8 < rhs.m8;
}
};
#include <stdio.h>
#include <time.h>
#include <array>
#include <iostream>
#include <iomanip>
#include <utility>
#include <algorithm>
#include <cstdlib>
#include <iomanip>
#include <random>
int main(int argc, char* argv[])
{
std::cout.sync_with_stdio(false);
constexpr auto items_count_default = 2<<22;
constexpr auto seed_default = 42;
uint32_t const seed = argc > 1 ? std::atoll(argv[1]) : seed_default;
std::cout << "seed: " << seed << "\n";
size_t const items_count = argc > 2 ? std::atoll(argv[2])
: items_count_default;
std::cout << "item count: " << items_count << "\n";
using Items_array_value_t =
#ifdef RADIX_SORT
size_t
#elif defined(STDSORT)
uint16_t
#elif defined(STDSORT_PACKED)
uint16_t
#endif
;
typedef std::vector<std::array<Items_array_value_t, 3> > Items;
Items items(items_count);
auto const ranks_count =
#ifdef RADIX_SORT
items.size() * 2
#elif defined(STDSORT)
items.size()
#elif defined(STDSORT_PACKED)
items.size()
#endif
;
//auto prng = xorshf96;
std::mt19937 gen(seed);
std::uniform_int_distribution<> dist;
auto prng = [&dist, &gen]{return dist(gen);};
std::vector<size_t> ranks(ranks_count);
for (size_t i = 0; i != items.size(); i++)
{
ranks[i] = i;
for (size_t j = 0; j != items[i].size(); j++)
{ items[i][j] = prng() & 0xFFF; }
}
std::cout << "using ";
clock_t const start = clock();
#ifdef RADIX_SORT
std::cout << "radix sort\n";
radix_isort(items.begin(), items.end(), ranks.begin());
#elif defined(STDSORT)
std::cout << "std::sort\n";
std::sort(ranks.begin(), ranks.begin() + items.size(),
make_fun_obj(items.cbegin())
//make_key_comp(make_deref(std::bind1st(
// add<Items::const_iterator, ptrdiff_t>(),
// items.begin())))
);
#elif defined(STDSORT_PACKED)
std::cout << "std::sort with a packed data structure\n";
using Items_ranks = std::vector< std::pair<uint32p8_t,
decltype(ranks)::value_type> >;
Items_ranks items_ranks;
size_t i = 0;
for(auto iI = items.cbegin(); iI != items.cend(); ++iI, ++i)
{
items_ranks.emplace_back(*iI, i);
}
std::sort(begin(items_ranks), end(items_ranks),
[](Items_ranks::value_type const& lhs,
Items_ranks::value_type const& rhs)
{ return lhs.first < rhs.first; }
);
std::transform(items_ranks.cbegin(), items_ranks.cend(), begin(ranks),
[](Items_ranks::value_type const& e) { return e.second; }
);
#endif
auto const duration = (clock() - start) / (CLOCKS_PER_SEC / 1000);
bool const sorted = std::is_sorted(ranks.begin(), ranks.begin() + items.size(),
make_fun_obj(items.cbegin()));
std::cout << "duration: " << duration << " ms\n"
<< "result sorted: " << std::boolalpha << sorted << "\n";
return 0;
}
Full code:
#include <algorithm>
#include <iterator>
#include <vector>
#include <cstddef>
using std::size_t;
using std::ptrdiff_t;
#include <xmmintrin.h>
template<class It1, class It2>
void radix_ipass(
It1 begin, It1 const end,
It2 const a, size_t const i,
std::vector<std::vector<size_t> > &buckets)
{
size_t ncleared = 0;
for (It1 j = begin; j != end; ++j)
{
#if defined(_MM_TRANSPOSE4_PS)
constexpr auto N = 8;
if(end - j > N)
{ _mm_prefetch((char const *)(&a[j[N]][i]), _MM_HINT_T0); }
#else
#error SS intrinsic not found
#endif
size_t const k = a[*j][i];
while (k >= ncleared && ncleared < buckets.size())
{ buckets[ncleared++].clear(); }
if (k >= buckets.size())
{
buckets.resize(k + 1);
ncleared = buckets.size();
}
buckets[k].push_back(size_t());
using std::swap; swap(buckets[k].back(), *j);
}
for (std::vector<std::vector<size_t> >::iterator
j = buckets.begin(); j != buckets.begin() + ncleared; j->clear(), ++j)
{
begin = std::swap_ranges(j->begin(), j->end(), begin);
}
}
template<class It, class It2>
void radix_isort(It const begin, It const end, It2 const items)
{
for (ptrdiff_t i = 0; i != end - begin; ++i) { items[i] = i; }
size_t smax = 0;
for (It i = begin; i != end; ++i)
{
size_t const n = i->size();
smax = n > smax ? n : smax;
}
std::vector<std::vector<size_t> > buckets;
for (size_t i = 0; i != smax; ++i)
{
radix_ipass(
items, items + (end - begin),
begin, smax - i - 1, buckets);
}
}
#include <functional>
template<class Key>
struct key_comp : public Key
{
explicit key_comp(Key const &key = Key()) : Key(key) { }
template<class T>
bool operator()(T const &a, T const &b) const
{ return this->Key::operator()(a) < this->Key::operator()(b); }
};
template<class Key>
key_comp<Key> make_key_comp(Key const &key) { return key_comp<Key>(key); }
template<class T1, class T2>
struct add : public std::binary_function<T1, T2, T1>
{ T1 operator()(T1 a, T2 const &b) const { return a += b; } };
template<class F>
struct deref : public F
{
deref(F const &f) : F(f) { }
typename std::iterator_traits<
typename F::result_type
>::value_type const
&operator()(typename F::argument_type const &a) const
{ return *this->F::operator()(a); }
};
template<class T> deref<T> make_deref(T const &t) { return deref<T>(t); }
size_t xorshf96(void) // random number generator
{
static size_t x = 123456789, y = 362436069, z = 521288629;
x ^= x << 16;
x ^= x >> 5;
x ^= x << 1;
size_t t = x;
x = y;
y = z;
z = t ^ x ^ y;
return z;
}
template<class It>
struct fun_obj
{
It beg;
bool operator()(ptrdiff_t lhs, ptrdiff_t rhs)
{
return beg[lhs] < beg[rhs];
}
};
template<class It>
fun_obj<It> make_fun_obj(It beg)
{
return fun_obj<It>{beg};
}
struct uint32p8_t
{
uint32_t m32;
uint8_t m8;
uint32p8_t(std::array<uint16_t, 3> const& a)
: m32( a[0]<<(32-3*4) | a[1]<<(32-2*3*4) | (a[2]&0xF00)>>8)
, m8( a[2]&0xFF )
{
}
operator std::array<size_t, 3>() const
{
return {{m32&0xFFF00000 >> (32-3*4), m32&0x000FFF0 >> (32-2*3*4),
(m32&0xF)<<8 | m8}};
}
friend bool operator<(uint32p8_t const& lhs, uint32p8_t const& rhs)
{
if(lhs.m32 < rhs.m32) return true;
if(lhs.m32 > rhs.m32) return false;
return lhs.m8 < rhs.m8;
}
};
#include <stdio.h>
#include <time.h>
#include <array>
#include <iostream>
#include <iomanip>
#include <utility>
#include <algorithm>
#include <cstdlib>
#include <iomanip>
#include <random>
int main(int argc, char* argv[])
{
std::cout.sync_with_stdio(false);
constexpr auto items_count_default = 2<<22;
constexpr auto seed_default = 42;
uint32_t const seed = argc > 1 ? std::atoll(argv[1]) : seed_default;
std::cout << "seed: " << seed << "\n";
size_t const items_count = argc > 2 ? std::atoll(argv[2]) : items_count_default;
std::cout << "item count: " << items_count << "\n";
using Items_array_value_t =
#ifdef RADIX_SORT
size_t
#elif defined(STDSORT)
uint16_t
#elif defined(STDSORT_PACKED)
uint16_t
#endif
;
typedef std::vector<std::array<Items_array_value_t, 3> > Items;
Items items(items_count);
auto const ranks_count =
#ifdef RADIX_SORT
items.size() * 2
#elif defined(STDSORT)
items.size()
#elif defined(STDSORT_PACKED)
items.size()
#endif
;
//auto prng = xorshf96;
std::mt19937 gen(seed);
std::uniform_int_distribution<> dist;
auto prng = [&dist, &gen]{return dist(gen);};
std::vector<size_t> ranks(ranks_count);
for (size_t i = 0; i != items.size(); i++)
{
ranks[i] = i;
for (size_t j = 0; j != items[i].size(); j++)
{ items[i][j] = prng() & 0xFFF; }
}
std::cout << "using ";
clock_t const start = clock();
#ifdef RADIX_SORT
std::cout << "radix sort\n";
radix_isort(items.begin(), items.end(), ranks.begin());
#elif defined(STDSORT)
std::cout << "std::sort\n";
std::sort(ranks.begin(), ranks.begin() + items.size(),
make_fun_obj(items.cbegin())
//make_key_comp(make_deref(std::bind1st(
// add<Items::const_iterator, ptrdiff_t>(),
// items.begin())))
);
#elif defined(STDSORT_PACKED)
std::cout << "std::sort with a packed data structure\n";
using Items_ranks = std::vector< std::pair<uint32p8_t,
decltype(ranks)::value_type> >;
Items_ranks items_ranks;
size_t i = 0;
for(auto iI = items.cbegin(); iI != items.cend(); ++iI, ++i)
{
items_ranks.emplace_back(*iI, i);
}
std::sort(begin(items_ranks), end(items_ranks),
[](Items_ranks::value_type const& lhs,
Items_ranks::value_type const& rhs)
{ return lhs.first < rhs.first; }
);
std::transform(items_ranks.cbegin(), items_ranks.cend(), begin(ranks),
[](Items_ranks::value_type const& e) { return e.second; }
);
#endif
auto const duration = (clock() - start) / (CLOCKS_PER_SEC / 1000);
bool const sorted = std::is_sorted(ranks.begin(), ranks.begin() + items.size(),
make_fun_obj(items.cbegin()));
std::cout << "duration: " << duration << " ms\n"
<< "result sorted: " << std::boolalpha << sorted << "\n";
return 0;
}