Parallel memcpy in cpp - c++

I am trying to copy a matrix in parallel. Below is the code that I am working with. Currently, it works as expected with char, but it seg faults when I use shorts. I assume that the bug is in copying outside of the memory outside of the vector. I have tried to debug my assumption without success.
CMakeLists.txt
cmake_minimum_required(VERSION 3.0)
project(memcpy CXX)
find_package (Threads)
add_executable(memcpy main.cpp)
set_property(TARGET memcpy PROPERTY CXX_STANDARD 17)
target_link_libraries (memcpy ${CMAKE_THREAD_LIBS_INIT})
main.cpp
#include <cassert>
#include <condition_variable>
#include <cstring>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>
class Barrier {
public:
explicit Barrier(std::size_t const count) : m_threshold(count), m_remaining(count), m_generation(0) {}
void wait() {
auto local = std::unique_lock<std::mutex>{m_mutex};
auto current_generation = m_generation;
m_remaining--;
if (!m_remaining) {
m_generation++;
m_remaining = m_threshold;
m_condition.notify_all();
} else {
m_condition.wait(local, [this, current_generation] { return current_generation != m_generation; });
}
}
private:
std::mutex m_mutex;
std::condition_variable m_condition;
std::size_t m_threshold;
std::size_t m_remaining;
std::size_t m_generation;
};
template <typename T>
class Matrix {
using reference = typename std::vector<T>::reference;
using const_reference = typename std::vector<T>::const_reference;
public:
Matrix(std::size_t rows, std::size_t cols) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows) {}
Matrix(std::size_t rows, std::size_t cols, T const& default_val) : m_rows(rows), m_cols(cols), m_data(m_cols * m_rows, default_val) {}
constexpr std::size_t get_columns() const { return m_cols; }
constexpr std::size_t get_rows() const { return m_rows; }
constexpr std::size_t get_element_count() const {
assert(m_cols * m_rows == m_data.size());
return m_cols * m_rows;
}
T* data() { return m_data.data(); }
T const* data() const { return m_data.data(); }
reference operator()(std::size_t const column_x, std::size_t const row_y) {
assert(0 <= column_x);
assert(column_x < get_columns());
assert(0 <= row_y);
assert(row_y < get_rows());
return m_data[row_y * m_cols + column_x];
}
const_reference operator()(std::size_t const column_x, std::size_t const row_y) const {
assert(0 <= column_x);
assert(column_x < get_columns());
assert(0 <= row_y);
assert(row_y < get_rows());
return m_data[row_y * m_cols + column_x];
}
private:
std::size_t const m_rows;
std::size_t const m_cols;
std::vector<T> m_data;
};
static_assert(false, "FIX ME");
using T = char;
// using T = short;
// using T = int;
// using T = double;
void run(std::size_t const my_rank, std::size_t const num_threads, Barrier& barrier, Matrix<T> const& from_data, Matrix<T>& to_data) {
auto n = from_data.get_element_count();
std::string str;
if (my_rank == 0) {
std::cerr << "bytes to copy: " << (n * sizeof(T)) << '\n';
}
// initialization
std::size_t segment_size = n / num_threads;
std::size_t start = (my_rank * segment_size) * sizeof(T);
std::size_t end = ((my_rank + 1) * segment_size) * sizeof(T);
std::size_t distance = end - start;
str += " my_rank: " + std::to_string(my_rank);
str += " segment_size: " + std::to_string(segment_size);
str += " start: " + std::to_string(start);
str += " end: " + std::to_string(end);
str += " distance: " + std::to_string(distance);
str += " rank: " + std::to_string(my_rank);
str += " start: " + std::to_string(start);
str += " end: " + std::to_string(end);
str += " distance: " + std::to_string(distance);
str += " e: " + std::to_string(start + distance);
str += "\n";
std::cerr << str;
barrier.wait();
std::memcpy(to_data.data() + start, from_data.data() + start, distance);
barrier.wait();
if (my_rank == 0)
for (auto y = 0; y < from_data.get_rows(); y++) {
for (auto x = 0; x < from_data.get_columns(); x++) {
if (to_data(x, y) != from_data(x, y)) {
std::cerr << "x: " << x << '\t' << "y: " << y << "\t\t";
std::cerr << "to: " << to_data(x, y) << '\t' << "from: " << from_data(x, y) << '\n';
}
}
}
barrier.wait();
}
int main() {
auto const num_threads = 1;
// auto const num_threads = 4;
// auto const width = 64;
// auto const height = 64;
auto const width = 97;
auto const height = 101;
auto from_data = Matrix<T>(width, height, 70);
auto to_data = Matrix<T>(width, height, 84);
std::vector<std::thread> threads;
auto barrier = Barrier{num_threads};
for (auto i = 0; i < num_threads; i++) {
threads.emplace_back(run, i, num_threads, std::ref(barrier), std::ref(from_data), std::ref(to_data));
}
for (auto& thread : threads) {
thread.join();
}
}

std::memcpy(to_data.data() + start, from_data.data() + start, distance)
std::vector<T>::data() returns a T* so if you add an integral value foo to it, you effectively add foo * sizeof T bytes ... but you allready multiplied with sizeof(T) earlier when calculating start and end. Also, std::memcpy() won't work for Ts that are not PODs.
Better use std::copy().

Related

Move semantics for a custom vector and allocators

I'm implementing a basic std::vector by using an allocator, following PPP by Stroustrup, and in particular I'm sure that all the functions like resize, push_back, emplace_back, and so on are okay.
What is really puzzling me a bit is the copy and move semantics: it works now, but I find it a bit too hard-coded and, more importantly, it's leaking 1 byte in the move assignment, as can be seen from this
==7853== LEAK SUMMARY:
==7853== definitely lost: 1 bytes in 1 blocks
==7853== indirectly lost: 0 bytes in 0 blocks
==7853== possibly lost: 0 bytes in 0 blocks
In the following the whole code. I'd like to see especially how the move assignment should be implemented, because I believe that a problem is the fact that I have to move an allocator, and I'm totally new to this topic.
#include <iostream>
#include <memory>
#include <utility>
#include <initializer_list>
#include <string>
template <typename T, typename Allocator = std::allocator<T>>
class Vector
{
private:
Allocator allocator; //used to handle memory for the elements
T *elem;
std::size_t _size{};
std::size_t _capacity{};
void reserve(const std::size_t n)
{
if (_capacity < n) //if the capacity is smaller than what I want to allocate
{
T *tmp{std::allocator_traits<Allocator>::allocate(allocator, n)};
for (std::size_t i = 0; i < _size; ++i)
{
// allocator.construct(tmp + i, std::move(elem[i])); //tmp[i] = std::move(elem[i]);
std::allocator_traits<Allocator>::construct(allocator, tmp + i, elem[i]);
}
for (std::size_t i = 0; i < _size; ++i)
{
std::allocator_traits<Allocator>::destroy(allocator, elem + i);
}
std::allocator_traits<Allocator>::deallocate(allocator, elem, _capacity);
elem = tmp; //move the pointer
_capacity = n; //size increased!
}
}
void check_and_increase_capacity()
{
if (_capacity == 0)
{
reserve(8);
}
else if (_size == _capacity)
{
reserve(2 * _size);
}
}
template <typename O>
void _push_back(O &&x)
{
check_and_increase_capacity();
std::allocator_traits<Allocator>::construct(allocator, elem + _size, std::forward<O>(x));
++_size;
}
public:
explicit Vector(std::initializer_list<T> list) : elem{std::allocator_traits<Allocator>::allocate(allocator, list.size())}, _size{list.size()}, _capacity{list.size()}
{
std::uninitialized_copy(list.begin(), list.end(), begin());
std::cout << "custom cstr"
<< "\n";
}
~Vector() noexcept
{
for (std::size_t i = 0; i < _size; ++i)
{
std::allocator_traits<Allocator>::destroy(allocator, elem + i); //call the destructor
}
std::allocator_traits<Allocator>::deallocate(allocator, elem, _capacity); //deallocate memory
}
T *begin() { return elem; }
const T *begin() const { return elem; }
T *end() { return elem + _capacity; }
const T *end() const { return elem + _capacity; }
Vector(const Vector &v) : allocator{std::allocator_traits<Allocator>::select_on_container_copy_construction(v.allocator)}, elem{std::allocator_traits<Allocator>::allocate(allocator, v._capacity)}
{
T *tmp{std::allocator_traits<Allocator>::allocate(allocator, _capacity)};
_size = v._size;
_capacity = v._capacity;
for (std::size_t i = 0; i < _size; ++i)
{
std::allocator_traits<Allocator>::construct(allocator, tmp + i, std::move(v[i]));
}
std::uninitialized_copy(v.begin(), v.end(), begin()); //copy the elements
//destroy and deallocate tmp
for (std::size_t i = 0; i < _size; ++i)
{
std::allocator_traits<Allocator>::destroy(allocator, tmp + i);
}
std::allocator_traits<Allocator>::deallocate(allocator, tmp, _capacity);
}
Vector &operator=(const Vector &v)
{
T *tmp{std::allocator_traits<Allocator>::allocate(allocator, v._capacity)};
std::uninitialized_copy(v.begin(), v.end(), begin()); //copy the elements
for (std::size_t i = 0; i < v._size; ++i)
{
std::allocator_traits<Allocator>::destroy(allocator, elem + i);
}
std::allocator_traits<Allocator>::deallocate(allocator, elem, _capacity);
elem = tmp;
_size = v._size;
_capacity = v._capacity;
return *this;
}
Vector(Vector &&v) noexcept : allocator{std::move(v.allocator)}
{
elem = v.elem;
v.elem = nullptr;
_size = v._size;
v._size = 0;
_capacity = v._capacity;
v._capacity = 0;
std::cout << elem << "\n";
}
Vector &operator=(Vector &&v) noexcept
{
std::cout << "move assignment"
<< "\n";
allocator = std::move(v.allocator);
elem = v.elem;
v.elem = nullptr;
_size = v._size;
v._size = 0;
_capacity = v._capacity;
v._capacity = 0;
return *this;
}
void push_back(const T &x)
{
_push_back(x);
}
void push_back(T &&x)
{
_push_back(std::move(x));
}
template <typename... Types>
void emplace_back(Types &&...args)
{
check_and_increase_capacity();
std::allocator_traits<Allocator>::construct(allocator, elem + _size, std::forward<Types>(args)...);
}
T &operator[](const std::size_t i) noexcept { return elem[i]; }
const T &operator[](const std::size_t i) const noexcept { return elem[i]; }
friend std::ostream &operator<<(std::ostream &os, const Vector &v)
{
for (std::size_t i = 0; i < v._size; i++)
{
std::cout << v[i] << "\n";
}
return os;
}
void resize(const std::size_t newsize, T val = T{})
{
reserve(newsize);
for (std::size_t i = _size; i < newsize; ++i)
{
std::allocator_traits<Allocator>::construct(allocator, elem + i, val);
}
//destroy all the new extra elements
for (std::size_t i = newsize; i < _size; ++i)
{
std::allocator_traits<Allocator>::destroy(allocator, elem + i); //just destroy them, don't call release the memory!
}
_size = newsize;
}
};
struct Foo
{
std::string _s;
Foo()
{
std::cout << "foo cstr"
<< "\n";
};
explicit Foo(const std::string &s) : _s{s} {}
~Foo() = default;
};
int main()
{
Vector<int> v{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
std::cout << v << "\n";
v.push_back(11);
std::cout << "After push_back \n"
<< v << "\n";
Vector<Foo> w{{}, {}};
w.emplace_back();
v.resize(6);
std::cout << "After resize \n"
<< v << "\n";
// Copy/Move semantics tests
Vector<int> v1{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
Vector<int> v2{v1};
std::cout << "After copy cstr \n"
<< v2 << "\n";
v2.push_back(20);
std::cout << "after push_back \n"
<< v2 << "\n";
Vector<int> v3{};
v3 = v1;
std::cout << v3 << "and v1: \n"
<< v1 << "\n";
Vector<int> v4{std::move(v1)};
std::cout << v4 << "and v1: \n"
<< v1 << "\n";
Vector<int> v5{};
v5 = std::move(v4);
std::cout << v5 << "and v4: \n"
<< v4 << "\n";
return 0;
}

Array template class in C++

I can not get the idea of how to create Array template class properly in C++.
The problem is solely out of learning purposes.
Let me provide the code first.
Array.h :
//Developed by Trofimov Yaroslav on 30.03.2018
#ifndef _ARRAY_H_TROFIMOV_
#define _ARRAY_H_TROFIMOV_
#include <string>
template<const size_t n, typename T>
class Array {
static unsigned __freeId, __quantity;
unsigned _id;
T** _array;
const size_t _n;
public:
typedef const bool (* const BooleanResultDelegate)(const T&);
class ArrayError {
const std::string _reason;
const size_t _index;
const size_t _maxIndex;
public:
ArrayError(const size_t index, const size_t maxIndex,const std::string& reason = "")
: _index(index), _maxIndex(maxIndex), _reason(reason) {}
std::string explanation(void) {
std::string res += "Index: " + std::to_string(_index) + "\n";
res += "Max index: " + std::to_string(_maxIndex) + "\n";
res += "Reason: " + _reason + "\n";
return res;
}
};
explicit Array<n, T>(T* arrayFiller = 0)
: _n(n), _array(new T*[n]), _id(++__freeId) {
if(arrayFiller != 0) {
for(size_t i(0); i < length(); ++i) {
_array[i] = new T(*arrayFiller);
}
} else {
for(size_t i(0); i < length(); ++i) {
_array[i] = arrayFiller;
}
}
reportIfDebug<n, T>(*this, "created");
++__quantity;
}
explicit Array<n, T>(const T& arrayFiller)
: _n(n), _array(new T*[n]), _id(++__freeId) {
for(size_t i(0); i < length(); ++i) {
_array[i] = new T(arrayFiller);
}
reportIfDebug<n, T>(*this, "created");
++__quantity;
}
Array<n, T>(const Array<n, T>& that)
: _n(n), _array(new T[n]), _id(++__freeId) {
for(size_t i(0); i < length(); ++i) {
(*this)[i] = new T[that[i]];
}
reportIfDebug<n, T>(*this, "created");
++__quantity;
}
~Array<n, T>(void) {
removeAll();
delete [] _array; _array = 0;
reportIfDebug<n, T>(*this, "deleted", false);
--__quantity;
}
T* operator[](const size_t i) {
if(i > length()) {
throw ArrayError(i, _n, "out of bounds exception");
}
return _array[i];
}
const T* operator[](const size_t i) const {
if(i > length()) {
throw ArrayError(i, _n, "out of bounds exception");
}
return _array[i];
}
const size_t length() const {
return _n;
}
const unsigned getID() const {
return _id;
}
void removeAll(BooleanResultDelegate removeCondition = 0) {
for(size_t i(0); i < length(); ++i) {
if(removeCondition == 0 || removeCondition(*_array[i])) {
delete [] _array[i]; _array[i] = 0;
}
}
}
};
template<const size_t n, typename T>
unsigned Array<n, T>::__freeId(0);
template<const size_t n, typename T>
unsigned Array<n, T>::__quantity(0);
template<const size_t n, typename T>
void reportIfDebug(
const Array<n, T>& instance,
const char* const message,
const bool showContent = true) {
#ifndef NDEBUG
std::cout << "========================================" << std::endl;
std::cout << typeid(instance).name() << ' '
<< message << ' '
<< "id: " << instance.getID() << std::endl;
if(showContent) {
std::cout << instance;
}
std::cout << "========================================" << std::endl;
#endif
}
template<const size_t n, typename T>
std::ostream& operator<<(std::ostream& os, const Array<n, T>& instance) {
for(size_t i(0); i < instance.length(); ++i) {
if(instance[i] == 0) {
os << "[" << i << "]: " << instance[i] << "\n";
} else {
os << "[" << i << "]: " << *instance[i] << "\n";
}
}
return os;
}
#endif
Main.cpp :
//Developed by Trofimov Yaroslav on 30.03.2018
#include <iostream>
#include "Array.h"
int main(void) {
const Array<5, int> a(7);
std::cout << *a[2] << std::endl;
return 0;
}
What is the main problem right now - is that the client of my Array class would have to use [indirection operator *] and [0 value pointer check] to use the objects from array.
I do not want the client to do that. But, if I use reference instead of pointer as a return type of operator[] I will not be able to return types which do not have copy constructor and I will return trash if nothing was put there before.
It seems like in Java and C# the problem is not fixed as well. The user is getting a reference to an object there and definitely should check for null being returned. And [indirection operator *] is called automatically there.

Quick Sort applied to a vector of pointers to objects - infinite loop

I cannot figure out why the algorithm results in an infinite loop. I am trying to sort the vector according to the final price. The pivot always stays the same. Maybe the problem is with the swapping of the objects
Motorbike findPivotPricee(vector<Motorbike*>& available, int left, int right)
{
int center = (left + right) / 2;
if (available[center]->finalPrice() < available[left]->finalPrice())
{
swap(*available[left], *available[center]);
}
if (available[right]->finalPrice()< available[left]->finalPrice())
{
swap(*available[left], *available[right]);
}
if (available[right]->finalPrice() < available[center]->finalPrice())
{
swap(*available[center], *available[right]);
}
Motorbike pivot = *available[center];
swap(*available[center], *available[right - 1]);
return pivot;
}
void quickSortMotorbikeByPrice(vector<Motorbike*>& available, int left, int right)
{
int i = left;
int j = right-1;
Motorbike pivot = findPivotPricee(available, left, right);
cout << pivot.finalPrice() << endl;
while (i < j)
{
while (available[i]->finalPrice() < pivot.finalPrice())
{
i++;
}
while (available[j]->finalPrice() > pivot.finalPrice())
{
j--;
}
if (i <= j)
{
swap(*available[i], *available[j]);
i++;
j--;
}
else {
break;
}
}
swap(*available[i], *available[right - 1]);//restore the pivot
if (left < right) {
if (left<j) quickSortMotorbikeByPrice(available, left, j);
if (right>i) quickSortMotorbikeByPrice(available, i, right);
}
}
void quickSortMotorbikeByPrice(vector<Motorbike*>& available)
{
quickSortMotorbikeByPrice(available, 0, available.size() - 1);
}
Often, algorithms from wikipedia, e.g. the quicksort algorithms are hard to transform into a working implementation because they fail to point out the assumed programming model implied by the given algorithm. For example, if it is a 0 based or a 1 based array and that they assume that the indices used are signed not unsigned integers.
Then, people start trying to use those algorithms, here, for example in C++ and run into all sorts of problems. Quite a waste of time... And from looking at the code given in the question, I assume the author of the question tried to use info from wikipedia...
Since this is obviously homework, impress your teacher and use the code below. Quicksort is tricky to get right and it can take quite a while to find out if you should write lo = left + 1 or lo = left etc.
#include <cstdint>
#include <memory>
#include <vector>
#include <iostream>
#include <cassert>
template <class X>
void vswap(std::vector<X>& v, size_t i1, size_t i2)
{
X temp = v[i1];
v[i1] = v[i2];
v[i2] = temp;
}
template <typename X>
std::ostream& operator<<(std::ostream& stm, const std::vector<X>& v)
{
stm << "[|";
size_t i = 0;
for (auto& x : v)
{
if (0 == i)
stm << x;
else
stm << "; " << x;
i++;
}
stm << "|]";
return stm;
}
template <typename X>
std::ostream& operator<<(std::ostream& stm, const std::vector<X*>& v)
{
stm << "[|";
size_t i = 0;
for (auto& x : v)
{
if (0 == i)
if (nullptr == x) stm << "nullptr"; else stm << *x;
else
if (nullptr == x) stm << "; nullptr"; else stm << "; " << *x;
i++;
}
stm << "|]";
return stm;
}
template <class X, class Predicate>
size_t partition(std::vector<X> & v, Predicate p, size_t left, size_t right)
{
size_t boundary = left;
X x = v[boundary];
for (size_t i = left; i < right; i++)
{
if (p(v[i], x))
{
vswap(v, boundary, i);
boundary++;
}
}
return boundary;
}
template<class X, class Predicate>
void mysort(std::vector<X> & v, Predicate p, size_t left, size_t right)
{
//std::cout << "mysort: " << v << " " << left << " " << right << std::endl;
if ((right - left) > 1)
{
size_t boundary = partition(v, p, left, right);
//std::cout << "boundary = " << boundary << std::endl;
mysort(v, p, left, boundary);
mysort(v, p, boundary == left ? boundary + 1 : boundary, right);
}
}
class Motorbike
{
size_t m_id;
int32_t m_finalPrice;
public:
Motorbike()
: m_id(0)
, m_finalPrice(0)
{}
Motorbike(size_t id, int32_t finalPrice)
: m_id(id)
, m_finalPrice(finalPrice)
{}
void Id(size_t id)
{
m_id = id;
}
size_t Id() const
{
return m_id;
}
void Price(int32_t price)
{
m_finalPrice = price;
}
int32_t Price() const
{
return m_finalPrice;
}
};
std::ostream& operator<< (std::ostream& stm, const Motorbike& bike)
{
stm << "(" << bike.Id() << ", " << bike.Price() << ")";
return stm;
}
std::vector<Motorbike> randomBikes(size_t count, int32_t lowPrice = 100, int32_t highPrice = 1000)
{
std::vector<Motorbike> result;
result.resize(count);
for (size_t i = 0; i < count; i++)
{
result[i].Id(i);
result[i].Price(lowPrice + rand() * (highPrice - lowPrice) / RAND_MAX);
}
return result;
}
std::vector<Motorbike*> bikePointers(std::vector<Motorbike> & bikes)
{
std::vector<Motorbike*> result;
result.resize(bikes.size());
for (size_t i = 0; i < bikes.size(); i++)
{
result[i] = &bikes[i];
}
return result;
}
int main()
{
//_CrtSetDbgFlag(_CRTDBG_CHECK_ALWAYS_DF);
//_CrtDumpMemoryLeaks();
//{
//{
// std::vector<int32_t> data = { 3, 5, 1, 4, 2, 0 };
// std::cout << "original: " << data << std::endl;
// mysort(data, [](int32_t a, int32_t b) -> bool {return a < b;}, 0, data.size());
// std::cout << "sorted? " << data << std::endl;
//}
//std::cout << "--------------------------------------------------------" << std::endl;
//{
// std::vector<int32_t> data = { 3, 6, 1, 4, 2, 0, 5 };
// std::cout << "original: " << data << std::endl;
// mysort(data, [](int32_t a, int32_t b) -> bool {return a < b;}, 0, data.size());
// std::cout << "sorted? " << data << std::endl;
//}
for(size_t run = 0; run < 10; run++)
{
auto bikes = randomBikes(5+run%2);
auto bikes_p = bikePointers(bikes);
std::cout << "original: " << bikes_p << std::endl;
mysort(bikes_p, [](const Motorbike* m1, const Motorbike* m2)-> bool { return m1->Price() < m2->Price(); }, 0, bikes_p.size());
std::cout << "sorted? " << bikes_p << std::endl;
std::cout << "--------------------------------------------------------" << std::endl;
}
//}
//_CrtDumpMemoryLeaks();
return 0;
}

Different return and coordinate types in nanoflann radius search

I'm trying to use nanoflann in a project and am looking at the vector-of-vector and radius search examples.
I can't find a way to perform a radius search with a different data type than the coordinate type. For example, my coordinates are vectors of uint8_t; I am trying to input a radius of type uint32_t with little success.
I see in the source that the metric_L2 struct (which I am using for distance) uses the L2_Adaptor with two template parameters. L2_Adaptor itself takes three parameters, with the third defaulted to the first, which seems to be the problem if I am understanding the code correctly. However, trying to force use of the third always results in 0 matches in the radius search.
Is there a way to do this?
Edit: In the same code below, everything works. However, if I change the search_radius (and ret_matches) to uint32_t, the radiusSearch method doesn't work.
#include <iostream>
#include <Eigen/Dense>
#include <nanoflann.hpp>
typedef Eigen::Matrix<uint8_t, Eigen::Dynamic, 1> coord_t;
using namespace nanoflann;
struct Point
{
coord_t address;
Point() {}
Point(uint8_t coordinates) : address(coord_t::Random(coordinates)) {}
};
struct Container
{
std::vector<Point> points;
Container(uint8_t coordinates, uint32_t l)
: points(l)
{
for(auto& each_location: points)
{
each_location = Point(coordinates);
}
}
};
struct ContainerAdaptor
{
typedef ContainerAdaptor self_t;
typedef nanoflann::metric_L2::traits<uint8_t, self_t>::distance_t metric_t;
typedef KDTreeSingleIndexAdaptor<metric_t, self_t, -1, size_t> index_t;
index_t *index;
const Container &container;
ContainerAdaptor(const int dimensions, const Container &container, const int leaf_max_size = 10)
: container(container)
{
assert(container.points.size() != 0 && container.points[0].address.rows() != 0);
const size_t dims = container.points[0].address.rows();
index = new index_t(dims, *this, nanoflann::KDTreeSingleIndexAdaptorParams(leaf_max_size));
index->buildIndex();
}
~ContainerAdaptor()
{
delete index;
}
inline void query(const uint8_t *query_point, const size_t num_closest, size_t *out_indices, uint32_t *out_distances_sq, const int ignoreThis = 10) const
{
nanoflann::KNNResultSet<uint32_t, size_t, size_t> resultSet(num_closest);
resultSet.init(out_indices, out_distances_sq);
index->findNeighbors(resultSet, query_point, nanoflann::SearchParams());
}
const self_t& derived() const
{
return *this;
}
self_t& derived()
{
return *this;
}
inline size_t kdtree_get_point_count() const
{
return container.points.size();
}
inline size_t kdtree_distance(const uint8_t *p1, const size_t idx_p2, size_t size) const
{
size_t s = 0;
for (size_t i = 0; i < size; i++)
{
const uint8_t d = p1[i] - container.points[idx_p2].address[i];
s += d * d;
}
return s;
}
inline coord_t::Scalar kdtree_get_pt(const size_t idx, int dim) const
{
return container.points[idx].address[dim];
}
template <class BBOX>
bool kdtree_get_bbox(BBOX & bb) const
{
for(size_t i = 0; i < bb.size(); i++)
{
bb[i].low = 0;
bb[i].high = UINT8_MAX;
}
return true;
}
};
void container_demo(const size_t points, const size_t coordinates)
{
Container s(coordinates, points);
coord_t query_pt(coord_t::Random(coordinates));
typedef ContainerAdaptor my_kd_tree_t;
my_kd_tree_t mat_index(coordinates, s, 25);
mat_index.index->buildIndex();
const uint8_t search_radius = static_cast<uint8_t>(100);
std::vector<std::pair<size_t, uint8_t>> ret_matches;
nanoflann::SearchParams params;
const size_t nMatches = mat_index.index->radiusSearch(query_pt.data(), search_radius, ret_matches, params);
for (size_t i = 0; i < nMatches; i++)
{
std::cout << "idx[" << i << "]=" << +ret_matches[i].first << " dist[" << i << "]=" << +ret_matches[i].second << std::endl;
}
std::cout << std::endl;
std::cout << "radiusSearch(): radius=" << +search_radius << " -> " << +nMatches << " matches" << std::endl;
}
int main()
{
container_demo(1e6, 32);
return 0;
}
More info: so it seems that the distance type, which the third parameter of the L2_Adaptor, must be a signed type. Changing the metric_t typedef to the following solves the problem if search_radius and ret_matches are also changed to int64_t.
typedef L2_Adaptor<uint8_t, self_t, int64_t> metric_t;

How to optimize an indirect radix sort? (a.k.a. how to optimize unpredictable memory access patterns)

I've written an indirect radix sort algorithm in C++ (by indirect, I mean it returns the indices of the items):
#include <algorithm>
#include <iterator>
#include <vector>
template<class It1, class It2>
void radix_ipass(
It1 begin, It1 const end,
It2 const a, size_t const i,
std::vector<std::vector<size_t> > &buckets)
{
size_t ncleared = 0;
for (It1 j = begin; j != end; ++j)
{
size_t const k = a[*j][i];
while (k >= ncleared && ncleared < buckets.size())
{ buckets[ncleared++].clear(); }
if (k >= buckets.size())
{
buckets.resize(k + 1);
ncleared = buckets.size();
}
buckets[k].push_back(size_t());
using std::swap; swap(buckets[k].back(), *j);
}
for (std::vector<std::vector<size_t> >::iterator
j = buckets.begin(); j != buckets.begin() + ncleared; j->clear(), ++j)
{
begin = std::swap_ranges(j->begin(), j->end(), begin);
}
}
template<class It, class It2>
void radix_isort(It const begin, It const end, It2 const items)
{
for (ptrdiff_t i = 0; i != end - begin; ++i) { items[i] = i; }
size_t smax = 0;
for (It i = begin; i != end; ++i)
{
size_t const n = i->size();
smax = n > smax ? n : smax;
}
std::vector<std::vector<size_t> > buckets;
for (size_t i = 0; i != smax; ++i)
{
radix_ipass(
items, items + (end - begin),
begin, smax - i - 1, buckets);
}
}
It seems to perform around 40% faster than std::sort when I test it with the following code (3920 ms compared to 6530 ms):
#include <functional>
template<class Key>
struct key_comp : public Key
{
explicit key_comp(Key const &key = Key()) : Key(key) { }
template<class T>
bool operator()(T const &a, T const &b) const
{ return this->Key::operator()(a) < this->Key::operator()(b); }
};
template<class Key>
key_comp<Key> make_key_comp(Key const &key) { return key_comp<Key>(key); }
template<class T1, class T2>
struct add : public std::binary_function<T1, T2, T1>
{ T1 operator()(T1 a, T2 const &b) const { return a += b; } };
template<class F>
struct deref : public F
{
deref(F const &f) : F(f) { }
typename std::iterator_traits<
typename F::result_type
>::value_type const
&operator()(typename F::argument_type const &a) const
{ return *this->F::operator()(a); }
};
template<class T> deref<T> make_deref(T const &t) { return deref<T>(t); }
size_t xorshf96(void) // random number generator
{
static size_t x = 123456789, y = 362436069, z = 521288629;
x ^= x << 16;
x ^= x >> 5;
x ^= x << 1;
size_t t = x;
x = y;
y = z;
z = t ^ x ^ y;
return z;
}
#include <stdio.h>
#include <time.h>
#include <array>
int main(void)
{
typedef std::vector<std::array<size_t, 3> > Items;
Items items(1 << 24);
std::vector<size_t> ranks(items.size() * 2);
for (size_t i = 0; i != items.size(); i++)
{
ranks[i] = i;
for (size_t j = 0; j != items[i].size(); j++)
{ items[i][j] = xorshf96() & 0xFFF; }
}
clock_t const start = clock();
if (1) { radix_isort(items.begin(), items.end(), ranks.begin()); }
else // STL sorting
{
std::sort(
ranks.begin(),
ranks.begin() + items.size(),
make_key_comp(make_deref(std::bind1st(
add<Items::const_iterator, ptrdiff_t>(),
items.begin()))));
}
printf("%u ms\n",
(unsigned)((clock() - start) * 1000 / CLOCKS_PER_SEC),
std::min(ranks.begin(), ranks.end()));
return 0;
}
Hmm, I guess that's the best I can do, I thought.
But after lots of banging my head against the wall, I realized that prefetching in the beginning of radix_ipass can help cut down the result to 1440 ms (!):
#include <xmmintrin.h>
...
for (It1 j = begin; j != end; ++j)
{
#if defined(_MM_TRANSPOSE4_PS) // should be defined if xmmintrin.h is included
enum { N = 8 };
if (end - j > N)
{ _mm_prefetch((char const *)(&a[j[N]][i]), _MM_HINT_T0); }
#endif
...
}
Clearly, the bottleneck is the memory bandwidth---the access pattern is unpredictable.
So now my question is: what else can I do to make it even faster on similar amounts of data?
Or is there not much room left for improvement?
(I'm hoping to avoid compromising the readability of the code if possible, so if the readability is harmed, the improvement should be significant.)
Using a more compact data structure that combines ranks and values can boost the performance of std::sort by a factor 2-3. Essentially, the sort now runs on a vector<pair<Value,Rank>>. The Value data type, std::array<integer_type, 3> has been replaced for this by a more compact pair<uint32_t, uint8_t> data structure. Only half a byte of it is unused, and the < comparison can by done in two steps, first using a presumably efficient comparison of uint32_ts (it's not clear if the loop used by std::array<..>::operator< can be optimized to a similarly fast code, but the replacement of std::array<integer_type,3> by this data structure yielded another performance boost).
Still, it doesn't get as efficient as the radix sort. (Maybe you could tweak a custom QuickSort with prefetches?)
Besides that additional sorting method, I've replaced the xorshf96 by a mt19937, because I know how to provide a seed for the latter ;)
The seed and the number of values can be changed via two command-line arguments: first the seed, then the count.
Compiled with g++ 4.9.0 20131022, using -std=c++11 -march=native -O3, for a 64-bit linux
Sample runs; important note running on a Core2Duo processor U9400 (old & slow!)
item count: 16000000
using std::sort
duration: 12260 ms
result sorted: true
seed: 5648
item count: 16000000
using std::sort
duration: 12230 ms
result sorted: true
seed: 5648
item count: 16000000
using std::sort
duration: 12230 ms
result sorted: true
seed: 5648
item count: 16000000
using std::sort with a packed data structure
duration: 4290 ms
result sorted: true
seed: 5648
item count: 16000000
using std::sort with a packed data structure
duration: 4270 ms
result sorted: true
seed: 5648
item count: 16000000
using std::sort with a packed data structure
duration: 4280 ms
result sorted: true
item count: 16000000
using radix sort
duration: 3790 ms
result sorted: true
seed: 5648
item count: 16000000
using radix sort
duration: 3820 ms
result sorted: true
seed: 5648
item count: 16000000
using radix sort
duration: 3780 ms
result sorted: true
New or changed code:
template<class It>
struct fun_obj
{
It beg;
bool operator()(ptrdiff_t lhs, ptrdiff_t rhs)
{
return beg[lhs] < beg[rhs];
}
};
template<class It>
fun_obj<It> make_fun_obj(It beg)
{
return fun_obj<It>{beg};
}
struct uint32p8_t
{
uint32_t m32;
uint8_t m8;
uint32p8_t(std::array<uint16_t, 3> const& a)
: m32( a[0]<<(32-3*4) | a[1]<<(32-2*3*4) | (a[2]&0xF00)>>8)
, m8( a[2]&0xFF )
{
}
operator std::array<size_t, 3>() const
{
return {{m32&0xFFF00000 >> (32-3*4), m32&0x000FFF0 >> (32-2*3*4),
(m32&0xF)<<8 | m8}};
}
friend bool operator<(uint32p8_t const& lhs, uint32p8_t const& rhs)
{
if(lhs.m32 < rhs.m32) return true;
if(lhs.m32 > rhs.m32) return false;
return lhs.m8 < rhs.m8;
}
};
#include <stdio.h>
#include <time.h>
#include <array>
#include <iostream>
#include <iomanip>
#include <utility>
#include <algorithm>
#include <cstdlib>
#include <iomanip>
#include <random>
int main(int argc, char* argv[])
{
std::cout.sync_with_stdio(false);
constexpr auto items_count_default = 2<<22;
constexpr auto seed_default = 42;
uint32_t const seed = argc > 1 ? std::atoll(argv[1]) : seed_default;
std::cout << "seed: " << seed << "\n";
size_t const items_count = argc > 2 ? std::atoll(argv[2])
: items_count_default;
std::cout << "item count: " << items_count << "\n";
using Items_array_value_t =
#ifdef RADIX_SORT
size_t
#elif defined(STDSORT)
uint16_t
#elif defined(STDSORT_PACKED)
uint16_t
#endif
;
typedef std::vector<std::array<Items_array_value_t, 3> > Items;
Items items(items_count);
auto const ranks_count =
#ifdef RADIX_SORT
items.size() * 2
#elif defined(STDSORT)
items.size()
#elif defined(STDSORT_PACKED)
items.size()
#endif
;
//auto prng = xorshf96;
std::mt19937 gen(seed);
std::uniform_int_distribution<> dist;
auto prng = [&dist, &gen]{return dist(gen);};
std::vector<size_t> ranks(ranks_count);
for (size_t i = 0; i != items.size(); i++)
{
ranks[i] = i;
for (size_t j = 0; j != items[i].size(); j++)
{ items[i][j] = prng() & 0xFFF; }
}
std::cout << "using ";
clock_t const start = clock();
#ifdef RADIX_SORT
std::cout << "radix sort\n";
radix_isort(items.begin(), items.end(), ranks.begin());
#elif defined(STDSORT)
std::cout << "std::sort\n";
std::sort(ranks.begin(), ranks.begin() + items.size(),
make_fun_obj(items.cbegin())
//make_key_comp(make_deref(std::bind1st(
// add<Items::const_iterator, ptrdiff_t>(),
// items.begin())))
);
#elif defined(STDSORT_PACKED)
std::cout << "std::sort with a packed data structure\n";
using Items_ranks = std::vector< std::pair<uint32p8_t,
decltype(ranks)::value_type> >;
Items_ranks items_ranks;
size_t i = 0;
for(auto iI = items.cbegin(); iI != items.cend(); ++iI, ++i)
{
items_ranks.emplace_back(*iI, i);
}
std::sort(begin(items_ranks), end(items_ranks),
[](Items_ranks::value_type const& lhs,
Items_ranks::value_type const& rhs)
{ return lhs.first < rhs.first; }
);
std::transform(items_ranks.cbegin(), items_ranks.cend(), begin(ranks),
[](Items_ranks::value_type const& e) { return e.second; }
);
#endif
auto const duration = (clock() - start) / (CLOCKS_PER_SEC / 1000);
bool const sorted = std::is_sorted(ranks.begin(), ranks.begin() + items.size(),
make_fun_obj(items.cbegin()));
std::cout << "duration: " << duration << " ms\n"
<< "result sorted: " << std::boolalpha << sorted << "\n";
return 0;
}
Full code:
#include <algorithm>
#include <iterator>
#include <vector>
#include <cstddef>
using std::size_t;
using std::ptrdiff_t;
#include <xmmintrin.h>
template<class It1, class It2>
void radix_ipass(
It1 begin, It1 const end,
It2 const a, size_t const i,
std::vector<std::vector<size_t> > &buckets)
{
size_t ncleared = 0;
for (It1 j = begin; j != end; ++j)
{
#if defined(_MM_TRANSPOSE4_PS)
constexpr auto N = 8;
if(end - j > N)
{ _mm_prefetch((char const *)(&a[j[N]][i]), _MM_HINT_T0); }
#else
#error SS intrinsic not found
#endif
size_t const k = a[*j][i];
while (k >= ncleared && ncleared < buckets.size())
{ buckets[ncleared++].clear(); }
if (k >= buckets.size())
{
buckets.resize(k + 1);
ncleared = buckets.size();
}
buckets[k].push_back(size_t());
using std::swap; swap(buckets[k].back(), *j);
}
for (std::vector<std::vector<size_t> >::iterator
j = buckets.begin(); j != buckets.begin() + ncleared; j->clear(), ++j)
{
begin = std::swap_ranges(j->begin(), j->end(), begin);
}
}
template<class It, class It2>
void radix_isort(It const begin, It const end, It2 const items)
{
for (ptrdiff_t i = 0; i != end - begin; ++i) { items[i] = i; }
size_t smax = 0;
for (It i = begin; i != end; ++i)
{
size_t const n = i->size();
smax = n > smax ? n : smax;
}
std::vector<std::vector<size_t> > buckets;
for (size_t i = 0; i != smax; ++i)
{
radix_ipass(
items, items + (end - begin),
begin, smax - i - 1, buckets);
}
}
#include <functional>
template<class Key>
struct key_comp : public Key
{
explicit key_comp(Key const &key = Key()) : Key(key) { }
template<class T>
bool operator()(T const &a, T const &b) const
{ return this->Key::operator()(a) < this->Key::operator()(b); }
};
template<class Key>
key_comp<Key> make_key_comp(Key const &key) { return key_comp<Key>(key); }
template<class T1, class T2>
struct add : public std::binary_function<T1, T2, T1>
{ T1 operator()(T1 a, T2 const &b) const { return a += b; } };
template<class F>
struct deref : public F
{
deref(F const &f) : F(f) { }
typename std::iterator_traits<
typename F::result_type
>::value_type const
&operator()(typename F::argument_type const &a) const
{ return *this->F::operator()(a); }
};
template<class T> deref<T> make_deref(T const &t) { return deref<T>(t); }
size_t xorshf96(void) // random number generator
{
static size_t x = 123456789, y = 362436069, z = 521288629;
x ^= x << 16;
x ^= x >> 5;
x ^= x << 1;
size_t t = x;
x = y;
y = z;
z = t ^ x ^ y;
return z;
}
template<class It>
struct fun_obj
{
It beg;
bool operator()(ptrdiff_t lhs, ptrdiff_t rhs)
{
return beg[lhs] < beg[rhs];
}
};
template<class It>
fun_obj<It> make_fun_obj(It beg)
{
return fun_obj<It>{beg};
}
struct uint32p8_t
{
uint32_t m32;
uint8_t m8;
uint32p8_t(std::array<uint16_t, 3> const& a)
: m32( a[0]<<(32-3*4) | a[1]<<(32-2*3*4) | (a[2]&0xF00)>>8)
, m8( a[2]&0xFF )
{
}
operator std::array<size_t, 3>() const
{
return {{m32&0xFFF00000 >> (32-3*4), m32&0x000FFF0 >> (32-2*3*4),
(m32&0xF)<<8 | m8}};
}
friend bool operator<(uint32p8_t const& lhs, uint32p8_t const& rhs)
{
if(lhs.m32 < rhs.m32) return true;
if(lhs.m32 > rhs.m32) return false;
return lhs.m8 < rhs.m8;
}
};
#include <stdio.h>
#include <time.h>
#include <array>
#include <iostream>
#include <iomanip>
#include <utility>
#include <algorithm>
#include <cstdlib>
#include <iomanip>
#include <random>
int main(int argc, char* argv[])
{
std::cout.sync_with_stdio(false);
constexpr auto items_count_default = 2<<22;
constexpr auto seed_default = 42;
uint32_t const seed = argc > 1 ? std::atoll(argv[1]) : seed_default;
std::cout << "seed: " << seed << "\n";
size_t const items_count = argc > 2 ? std::atoll(argv[2]) : items_count_default;
std::cout << "item count: " << items_count << "\n";
using Items_array_value_t =
#ifdef RADIX_SORT
size_t
#elif defined(STDSORT)
uint16_t
#elif defined(STDSORT_PACKED)
uint16_t
#endif
;
typedef std::vector<std::array<Items_array_value_t, 3> > Items;
Items items(items_count);
auto const ranks_count =
#ifdef RADIX_SORT
items.size() * 2
#elif defined(STDSORT)
items.size()
#elif defined(STDSORT_PACKED)
items.size()
#endif
;
//auto prng = xorshf96;
std::mt19937 gen(seed);
std::uniform_int_distribution<> dist;
auto prng = [&dist, &gen]{return dist(gen);};
std::vector<size_t> ranks(ranks_count);
for (size_t i = 0; i != items.size(); i++)
{
ranks[i] = i;
for (size_t j = 0; j != items[i].size(); j++)
{ items[i][j] = prng() & 0xFFF; }
}
std::cout << "using ";
clock_t const start = clock();
#ifdef RADIX_SORT
std::cout << "radix sort\n";
radix_isort(items.begin(), items.end(), ranks.begin());
#elif defined(STDSORT)
std::cout << "std::sort\n";
std::sort(ranks.begin(), ranks.begin() + items.size(),
make_fun_obj(items.cbegin())
//make_key_comp(make_deref(std::bind1st(
// add<Items::const_iterator, ptrdiff_t>(),
// items.begin())))
);
#elif defined(STDSORT_PACKED)
std::cout << "std::sort with a packed data structure\n";
using Items_ranks = std::vector< std::pair<uint32p8_t,
decltype(ranks)::value_type> >;
Items_ranks items_ranks;
size_t i = 0;
for(auto iI = items.cbegin(); iI != items.cend(); ++iI, ++i)
{
items_ranks.emplace_back(*iI, i);
}
std::sort(begin(items_ranks), end(items_ranks),
[](Items_ranks::value_type const& lhs,
Items_ranks::value_type const& rhs)
{ return lhs.first < rhs.first; }
);
std::transform(items_ranks.cbegin(), items_ranks.cend(), begin(ranks),
[](Items_ranks::value_type const& e) { return e.second; }
);
#endif
auto const duration = (clock() - start) / (CLOCKS_PER_SEC / 1000);
bool const sorted = std::is_sorted(ranks.begin(), ranks.begin() + items.size(),
make_fun_obj(items.cbegin()));
std::cout << "duration: " << duration << " ms\n"
<< "result sorted: " << std::boolalpha << sorted << "\n";
return 0;
}