Why is std::vector<bool> faster? - c++

As I was implementing the Sieve of Eratosthenes I ran into an issue with std::vector<bool> : there is no access to the raw data.
So I decided to use a custom minimalistic implementation where I would have access to the data pointer.
#ifndef LIB_BITS_T_H
#define LIB_BITS_T_H
#include <algorithm>
template <typename B>
class bits_t{
public:
typedef B block_t;
static const size_t block_size = sizeof(block_t) * 8;
block_t* data;
size_t size;
size_t blocks;
class bit_ref{
public:
block_t* const block;
const block_t mask;
bit_ref(block_t& block, const block_t mask) noexcept : block(&block), mask(mask){}
inline void operator=(bool v) const noexcept{
if(v) *block |= mask;
else *block &= ~mask;
}
inline operator bool() const noexcept{
return (bool)(*block & mask);
}
};
bits_t() noexcept : data(nullptr){}
void resize(const size_t n, const bool v) noexcept{
block_t fill = v ? ~block_t(0) : block_t(0);
size = n;
blocks = (n + block_size - 1) / block_size;
data = new block_t[blocks];
std::fill(data, data + blocks, fill);
}
inline block_t& block_at_index(const size_t i) const noexcept{
return data[i / block_size];
}
inline size_t index_in_block(const size_t i) const noexcept{
return i % block_size;
}
inline bit_ref operator[](const size_t i) noexcept{
return bit_ref(block_at_index(i), block_t(1) << index_in_block(i));
}
~bits_t(){
delete[] data;
}
};
#endif // LIB_BITS_T_H
The code is nearly the same than the one in /usr/include/c++/4.7/bits/stl_bvector.h but is slower.
I tried an optimization,
#ifndef LIB_BITS_T_H
#define LIB_BITS_T_H
#include <algorithm>
template <typename B>
class bits_t{
const B mask[64] = {
0b0000000000000000000000000000000000000000000000000000000000000001,
0b0000000000000000000000000000000000000000000000000000000000000010,
0b0000000000000000000000000000000000000000000000000000000000000100,
0b0000000000000000000000000000000000000000000000000000000000001000,
0b0000000000000000000000000000000000000000000000000000000000010000,
0b0000000000000000000000000000000000000000000000000000000000100000,
0b0000000000000000000000000000000000000000000000000000000001000000,
0b0000000000000000000000000000000000000000000000000000000010000000,
0b0000000000000000000000000000000000000000000000000000000100000000,
0b0000000000000000000000000000000000000000000000000000001000000000,
0b0000000000000000000000000000000000000000000000000000010000000000,
0b0000000000000000000000000000000000000000000000000000100000000000,
0b0000000000000000000000000000000000000000000000000001000000000000,
0b0000000000000000000000000000000000000000000000000010000000000000,
0b0000000000000000000000000000000000000000000000000100000000000000,
0b0000000000000000000000000000000000000000000000001000000000000000,
0b0000000000000000000000000000000000000000000000010000000000000000,
0b0000000000000000000000000000000000000000000000100000000000000000,
0b0000000000000000000000000000000000000000000001000000000000000000,
0b0000000000000000000000000000000000000000000010000000000000000000,
0b0000000000000000000000000000000000000000000100000000000000000000,
0b0000000000000000000000000000000000000000001000000000000000000000,
0b0000000000000000000000000000000000000000010000000000000000000000,
0b0000000000000000000000000000000000000000100000000000000000000000,
0b0000000000000000000000000000000000000001000000000000000000000000,
0b0000000000000000000000000000000000000010000000000000000000000000,
0b0000000000000000000000000000000000000100000000000000000000000000,
0b0000000000000000000000000000000000001000000000000000000000000000,
0b0000000000000000000000000000000000010000000000000000000000000000,
0b0000000000000000000000000000000000100000000000000000000000000000,
0b0000000000000000000000000000000001000000000000000000000000000000,
0b0000000000000000000000000000000010000000000000000000000000000000,
0b0000000000000000000000000000000100000000000000000000000000000000,
0b0000000000000000000000000000001000000000000000000000000000000000,
0b0000000000000000000000000000010000000000000000000000000000000000,
0b0000000000000000000000000000100000000000000000000000000000000000,
0b0000000000000000000000000001000000000000000000000000000000000000,
0b0000000000000000000000000010000000000000000000000000000000000000,
0b0000000000000000000000000100000000000000000000000000000000000000,
0b0000000000000000000000001000000000000000000000000000000000000000,
0b0000000000000000000000010000000000000000000000000000000000000000,
0b0000000000000000000000100000000000000000000000000000000000000000,
0b0000000000000000000001000000000000000000000000000000000000000000,
0b0000000000000000000010000000000000000000000000000000000000000000,
0b0000000000000000000100000000000000000000000000000000000000000000,
0b0000000000000000001000000000000000000000000000000000000000000000,
0b0000000000000000010000000000000000000000000000000000000000000000,
0b0000000000000000100000000000000000000000000000000000000000000000,
0b0000000000000001000000000000000000000000000000000000000000000000,
0b0000000000000010000000000000000000000000000000000000000000000000,
0b0000000000000100000000000000000000000000000000000000000000000000,
0b0000000000001000000000000000000000000000000000000000000000000000,
0b0000000000010000000000000000000000000000000000000000000000000000,
0b0000000000100000000000000000000000000000000000000000000000000000,
0b0000000001000000000000000000000000000000000000000000000000000000,
0b0000000010000000000000000000000000000000000000000000000000000000,
0b0000000100000000000000000000000000000000000000000000000000000000,
0b0000001000000000000000000000000000000000000000000000000000000000,
0b0000010000000000000000000000000000000000000000000000000000000000,
0b0000100000000000000000000000000000000000000000000000000000000000,
0b0001000000000000000000000000000000000000000000000000000000000000,
0b0010000000000000000000000000000000000000000000000000000000000000,
0b0100000000000000000000000000000000000000000000000000000000000000,
0b1000000000000000000000000000000000000000000000000000000000000000
};
public:
typedef B block_t;
static const size_t block_size = sizeof(block_t) * 8;
block_t* data;
size_t size;
size_t blocks;
class bit_ref{
public:
block_t* const block;
const block_t mask;
bit_ref(block_t& block, const block_t mask) noexcept : block(&block), mask(mask){}
inline void operator=(bool v) const noexcept{
if(v) *block |= mask;
else *block &= ~mask;
}
inline operator bool() const noexcept{
return (bool)(*block & mask);
}
};
bits_t() noexcept : data(nullptr){}
void resize(const size_t n, const bool v) noexcept{
block_t fill = v ? ~block_t(0) : block_t(0);
size = n;
blocks = (n + block_size - 1) / block_size;
data = new block_t[blocks];
std::fill(data, data + blocks, fill);
}
inline block_t& block_at_index(const size_t i) const noexcept{
return data[i / block_size];
}
inline size_t index_in_block(const size_t i) const noexcept{
return i % block_size;
}
inline bit_ref operator[](const size_t i) noexcept{
return bit_ref(block_at_index(i), mask[index_in_block(i)]);
}
~bits_t(){
delete[] data;
}
};
#endif // LIB_BITS_T_H
(Compiling with g++4.7 -O3)
Eratosthenes sieve algorithm (33.333.333 bits)
std::vector<bool> 19.1s
bits_t<size_t> 19.9s
bits_t<size_t> (with lookup table) 19.7s
ctor + resize(33.333.333 bits) + dtor
std::vector<bool> 120ms
bits_t<size_t> 150ms
QUESTION : Where does the slowdown come from?

Outside of all the problems as pointed out by some other users, your resize is allocating more memory each time the current block limit is reached to add ONE block. The std::vector will double the size of the buffer (so if you already had 16 blocks, now you have 32 blocks). In other words, they will do less new than you.
This being said, you do not do the necessary delete & copy and that could have a "positive" impact in your version... ("positive" impact speed wise, it is not positive that you do not delete the old data, nor copy it in your new buffer.)
Also, the std::vector will properly enlarge the buffer and thus copy data that is likely already in your CPU cache. With your version, that cache is lost since you just ignore the old buffer on each resize().
Also when a class handles a memory buffer it is customary to implement the copy and assignment operators, for some reasons... and you could look into using a shared_ptr<>() too. The delete is then hidden and the class is a template so it is very fast (it does not add any code that you would not already have in your own version.)
=== Update
There is one other thing. You're operator [] implementation:
inline bit_ref operator[](const size_t i) noexcept{
return bit_ref(block_at_index(i), mask[index_in_block(i)]);
}
(side note: the inline is not required since the fact that you write the code within the class already means you okayed the inline capability already.)
You only offer a non-const version which "is slow" because it creates a sub-class. You should try implementing a const version that returns bool and see whether that accounts for the ~3% difference you see.
bool operator[](const size_t i) const noexcept
{
return (block_at_index(i) & mask[index_in_block(i)]) != 0;
}
Also, using a mask[] array can also slow down things. (1LL << (index & 0x3F)) should be faster (2 CPU instructions with 0 memory access).

Apparently, the wrapping of i % block_size in a function was the culprit
inline size_t index_in_block ( const size_t i ) const noexcept {
return i % block_size;
}
inline bit_ref operator[] ( const size_t i ) noexcept {
return bit_ref( block_at_index( i ), block_t( 1 ) << index_in_block( i ) );
}
so replacing the above code with
inline bit_ref operator[] ( const size_t i ) noexcept {
return bit_ref( block_at_index( i ), block_t( 1 ) << ( i % block_size ) );
}
solves the issue. However, I still don't know why it is. My best guess is that I didn't get the signature of index_in_block right and that the optimizer is thus not able to inline this function in a similar way to the manual inlining way.
Here is the new code.
#ifndef LIB_BITS_2_T_H
#define LIB_BITS_2_T_H
#include <algorithm>
template <typename B>
class bits_2_t {
public:
typedef B block_t;
static const int block_size = sizeof( block_t ) * __CHAR_BIT__;
private:
block_t* _data;
size_t _size;
size_t _blocks;
public:
class bit_ref {
public:
block_t* const block;
const block_t mask;
bit_ref ( block_t& block, const block_t mask) noexcept
: block( &block ), mask( mask ) {}
inline bool operator= ( const bool v ) const noexcept {
if ( v ) *block |= mask;
else *block &= ~mask;
return v;
}
inline operator bool() const noexcept {
return (bool)( *block & mask );
}
};
bits_2_t () noexcept : _data( nullptr ), _size( 0 ), _blocks( 0 ) {}
bits_2_t ( const size_t n ) noexcept : _data( nullptr ), _size( n ) {
_blocks = number_of_blocks_needed( n );
_data = new block_t[_blocks];
const block_t fill( 0 );
std::fill( _data, _data + _blocks, fill );
}
bits_2_t ( const size_t n, const bool v ) noexcept : _data( nullptr ), _size( n ) {
_blocks = number_of_blocks_needed( n );
_data = new block_t[_blocks];
const block_t fill = v ? ~block_t( 0 ) : block_t( 0 );
std::fill( _data, _data + _blocks, fill );
}
void resize ( const size_t n ) noexcept {
resize( n, false );
}
void resize ( const size_t n, const bool v ) noexcept {
const size_t tmpblocks = number_of_blocks_needed( n );
const size_t copysize = std::min( _blocks, tmpblocks );
block_t* tmpdata = new block_t[tmpblocks];
std::copy( _data, _data + copysize, tmpdata );
const block_t fill = v ? ~block_t( 0 ) : block_t( 0 );
std::fill( tmpdata + copysize, tmpdata + tmpblocks, fill );
delete[] _data;
_data = tmpdata;
_blocks = tmpblocks;
_size = n;
}
inline size_t number_of_blocks_needed ( const size_t n ) const noexcept {
return ( n + block_size - 1 ) / block_size;
}
inline block_t& block_at_index ( const size_t i ) const noexcept {
return _data[i / block_size];
}
inline bit_ref operator[] ( const size_t i ) noexcept {
return bit_ref( block_at_index( i ), block_t( 1 ) << ( i % block_size ) );
}
inline bool operator[] ( const size_t i ) const noexcept {
return (bool)( block_at_index( i ) & ( block_t( 1 ) << ( i % block_size ) ) );
}
inline block_t* data () {
return _data;
}
inline const block_t* data () const {
return _data;
}
inline size_t size () const {
return _size;
}
void clear () noexcept {
delete[] _data;
_size = 0;
_blocks = 0;
_data = nullptr;
}
~bits_2_t () {
clear();
}
};
#endif // LIB_BITS_2_T_H
Here are the results for this new code on my amd64 machine for primes up to 1.000.000.000 (best of 3 runs, real time).
Sieve of Eratosthenes with 1 memory unit per number ( not skipping multiples of 2 ).
bits_t<uint8_t>
real 0m23.614s user 0m23.493s sys 0m0.092s
bits_t<uint16_t>
real 0m24.399s user 0m24.294s sys 0m0.084s
bits_t<uint32_t>
real 0m23.501s user 0m23.372s sys 0m0.108s <-- best
bits_t<uint64_t>
real 0m24.393s user 0m24.304s sys 0m0.068s
std::vector<bool>
real 0m24.362s user 0m24.276s sys 0m0.056s
std::vector<uint8_t>
real 0m38.303s user 0m37.570s sys 0m0.683s
Here is the code of the sieve (where (...) should be replaced by the bit array of your choice).
#include <iostream>
typedef (...) array_t;
int main ( int argc, char const *argv[] ) {
if ( argc != 2 ) {
std::cout << "#0 missing" << std::endl;
return 1;
}
const size_t count = std::stoull( argv[1] );
array_t prime( count, true );
prime[0] = prime[1] = false;
for ( size_t k = 2 ; k * k < count ; ++k ) {
if ( prime[k] ) {
for ( size_t i = k * k ; i < count ; i += k ) {
prime[i] = false;
}
}
}
return 0;
}

Related

How to define a RandomAccessIterator over a pointer to a vector of chars?

I am implementing a kind of dataframe and I want to define a RandomAccessIterator over it, in order to execute the different std algorithms, such as the sorting one. The dataframe of the example contains two column "a" and "b":
a; b;
20; 21;
20; 19;
10; 11;
40; 41;
10; 11;
After sorting with a trivial selection sort this is the result:
a; b;
10; 11;
10; 11;
20; 19;
20; 21;
40; 41;
The problem that I am facing is that the std::sort does not work properly. And I don't know weather the implementation of the iterator is sound or not.
This is the code.
File: dataframe.hpp
#pragma once
#include <iostream>
#include <charconv>
#include <vector>
#include <memory>
#include <cstring>
#include <numeric>
#include "iterator.hpp"
namespace df
{
class Record;
class Column;
class Dataframe;
namespace types
{
enum class Base : char
{
CHAR = 'A',
UNSIGNED = 'U',
// Other types..
};
class Dtype
{
public:
Dtype(types::Base base, std::size_t size) : m_base_dtype{base}, m_size{size} {}
[[nodiscard]] auto name() const
{
return std::string{static_cast<char>(m_base_dtype)} + std::to_string(m_size);
}
[[nodiscard]] auto base() const { return m_base_dtype; }
[[nodiscard]] auto size() const { return m_size; }
[[nodiscard]] auto is_primitive() const
{
switch (base())
{
case types::Base::CHAR:
return size() == 1;
case types::Base::UNSIGNED:
return size() == 1 or size() == 2 or size() == 4 or size() == 8;
}
return false;
}
private:
types::Base m_base_dtype;
std::size_t m_size;
};
[[nodiscard]] static auto CHAR(const std::size_t size) { return Dtype(types::Base::CHAR, size); }
[[nodiscard]] static auto UNSIGNED(const std::size_t size) { return Dtype(types::Base::UNSIGNED, size); }
}
class Column
{
public:
Column(std::vector<char> &raw, const types::Dtype dtype) : m_raw{std::move(raw)}, m_dtype{dtype} {}
Column &operator=(Column &&c) = default; // Move constructor
[[nodiscard]] const auto &dtype() const { return m_dtype; }
[[nodiscard]] auto &raw() { return m_raw; }
[[nodiscard]] const auto &raw() const { return m_raw; }
[[nodiscard]] auto *data() { return m_raw.data(); }
[[nodiscard]] const auto *data() const { return m_raw.data(); }
private:
std::vector<char> m_raw;
types::Dtype m_dtype;
};
class Dataframe
{
public:
Dataframe(std::vector<char> &raw, std::vector<std::string> names, std::vector<types::Dtype> dtypes)
{
m_raw = std::move(raw);
m_column_dtypes = dtypes;
m_column_names = names;
m_record_size = 0;
for (const auto dt : dtypes)
{
m_column_offsets.emplace_back(m_record_size);
m_record_size += dt.size();
}
m_record_count = m_raw.size() / m_record_size;
}
Dataframe(std::vector<char> &raw, std::vector<types::Dtype> dtypes) : Dataframe(raw, {}, dtypes) {}
Dataframe &operator=(Dataframe &&c) = default; // Move constructor
[[nodiscard]] auto &raw() { return m_raw; }
[[nodiscard]] const auto &raw() const { return m_raw; }
[[nodiscard]] auto *data() { return m_raw.data(); }
[[nodiscard]] const auto *data() const { return m_raw.data(); }
// Iterators
[[nodiscard]] df::Iterator begin()
{
return df::Iterator{m_raw.data(), m_record_size};
}
[[nodiscard]] df::Iterator end()
{
return df::Iterator{m_raw.data() + m_raw.size(), m_record_size};
}
[[nodiscard]] auto shape() const { return std::make_pair(m_record_count, m_column_dtypes.size()); }
[[nodiscard]] auto record_count() const { return m_record_count; }
[[nodiscard]] auto record_size() const { return m_record_size; }
[[nodiscard]] const auto &names() const { return m_column_names; }
[[nodiscard]] const auto &dtypes() const { return m_column_dtypes; }
[[nodiscard]] const auto &offsets() const { return m_column_offsets; }
void print() { print(m_record_count); }
void print(const std::size_t initial_records)
{
// Print header
for (auto column_name : m_column_names)
{
std::cout << column_name << "; ";
}
std::cout << std::endl;
// Print rows
std::size_t records_to_print = std::min(initial_records, m_record_count);
for (std::size_t i = 0; i < records_to_print; i++)
{
const auto start_p = i * record_size();
auto start_field = 0;
auto end_field = 0;
for (auto field : m_column_dtypes)
{
end_field += field.size();
switch (field.base())
{
case types::Base::UNSIGNED:
{
std::uint64_t uint_value = 0;
memcpy(&uint_value, m_raw.data() + start_p + start_field, field.size());
std::cout << uint_value;
break;
}
case types::Base::CHAR:
{
std::string str_value = std::string(m_raw.data() + start_p + start_field, field.size());
std::cout << str_value;
break;
}
}
start_field = end_field;
// New column
std::cout << "; ";
}
// New row
std::cout << std::endl;
}
}
std::shared_ptr<Dataframe> copy() const
{
auto x = std::vector<char>(m_raw);
return std::make_shared<Dataframe>(x, std::vector<std::string>(m_column_names), std::vector<types::Dtype>(m_column_dtypes));
}
private:
std::vector<char> m_raw = {};
std::vector<std::string> m_column_names = {};
std::vector<types::Dtype> m_column_dtypes = {};
std::vector<std::size_t> m_column_offsets = {};
std::size_t m_record_size = {};
std::size_t m_record_count = {};
};
using namespace types;
static std::shared_ptr<Dataframe> read_from_vector(const std::vector<std::vector<std::string>> values, const std::vector<std::string> names, const std::vector<Dtype> dtypes)
{
const auto record_size = std::accumulate(dtypes.begin(), dtypes.end(), std::size_t{0},
[](std::size_t accum, const auto &m)
{ return accum + m.size(); });
const auto total_size = values.size() * record_size;
const std::size_t INCR_RECORDS = std::max(total_size / (10 * record_size), std::size_t{65536});
auto raw = std::vector<char>{};
std::size_t written_records = 0;
auto offsets = std::vector<std::size_t>{};
for (int offset = 0; const auto &kd : dtypes)
{
offsets.push_back(offset);
offset += kd.size();
}
for (auto value : values)
{
if (written_records >= raw.size() / record_size)
{
raw.resize(raw.size() + INCR_RECORDS * record_size, char{' '});
}
for (int i = 0; i < names.size(); i++)
{
const auto name = names[i];
const auto dtype = dtypes[i];
const auto offset = offsets[i];
const auto pos = written_records * record_size + offset;
switch (dtype.base())
{
case df::Base::CHAR:
{
const auto v = value[i];
const auto byte_to_copy = std::min(v.size(), dtype.size());
std::memcpy(raw.data() + pos,
v.data() + v.size() - byte_to_copy, byte_to_copy); // Prendo gli ultimi byte
break;
}
case df::Base::UNSIGNED:
{
const auto v = std::stoull(value[i]);
const auto byte_to_copy = dtype.size();
std::memcpy(raw.data() + pos, &v, byte_to_copy); // Prendo gli ultimi byte
break;
}
default:
throw std::runtime_error("ColumnType non riconosciuto");
}
}
written_records++;
}
raw.resize(written_records * record_size);
raw.shrink_to_fit();
return std::make_shared<Dataframe>(raw, names, dtypes);
}
}
File: iterator.hpp
#pragma once
#include <iostream>
#include <cstring>
namespace df
{
class Iterator
{
std::size_t size;
char *ptr;
public:
struct record_reference;
struct record_value
{
std::size_t size;
char *ptr;
record_value(const record_reference &t) : record_value(t.size, t.ptr){};
record_value(const std::size_t m_size, char *m_ptr)
{
this->size = m_size;
this->ptr = new char[this->size];
std::memcpy(ptr, m_ptr, this->size);
}
~record_value()
{
delete[] this->ptr;
}
};
struct record_reference
{
std::size_t size;
char *ptr;
record_reference(const std::size_t m_size, char *m_ptr)
{
this->size = m_size;
this->ptr = m_ptr;
}
record_reference(const record_reference &t)
{
this->size = t.size;
this->ptr = t.ptr;
}
// record_reference(const record_value &t) : record_reference(t.size, t.ptr) {};
record_reference &operator=(const record_value &t)
{
std::memcpy(ptr, t.ptr, size);
return *this;
}
record_reference &operator=(const record_reference &t)
{
std::memcpy(ptr, t.ptr, size);
return *this;
}
record_reference &operator=(char *t)
{
std::memcpy(ptr, t, size);
return *this;
}
operator char *()
{
return ptr;
}
operator const char *() const { return ptr; }
};
using iterator_category = std::random_access_iterator_tag;
using value_type = record_value;
using reference = record_reference;
using difference_type = std::ptrdiff_t;
// default constructible
Iterator() : size(0), ptr(nullptr)
{
}
// copy assignable
Iterator &operator=(const Iterator &t)
{
size = t.size;
ptr = t.ptr;
return *this;
}
Iterator(char *ptr, const std::size_t size) : size{size}, ptr(ptr)
{
}
record_reference operator*() const
{
return {size, ptr};
}
// Prefix
Iterator &operator++()
{
ptr += size;
return *this;
}
// Postfix
Iterator operator++(int)
{
auto tmp = *this;
++*this;
return tmp;
}
Iterator &operator--()
{
ptr -= size;
return *this;
}
difference_type operator-(const Iterator &it) const
{
return (this->ptr - it.ptr) / size;
}
Iterator operator+(const difference_type &offset) const
{
return Iterator(ptr + offset * size, size);
}
friend Iterator operator+(const difference_type &diff, const Iterator &it)
{
return it + diff;
}
Iterator operator-(const difference_type &diff) const
{
return Iterator(ptr - diff * size, size);
}
reference operator[](const difference_type &offset) const
{
return {size, ptr + offset * size};
}
bool operator==(const Iterator &it) const
{
return this->ptr == it.ptr;
}
bool operator!=(const Iterator &it) const
{
return !(*this == it);
}
bool operator<(const Iterator &it) const
{
return this->ptr < it.ptr;
}
bool operator>=(const Iterator &it) const
{
return this->ptr >= it.ptr;
}
bool operator>(const Iterator &it) const
{
return this->ptr > it.ptr;
}
bool operator<=(const Iterator &it) const
{
return this->ptr <= it.ptr;
}
Iterator &operator+=(const difference_type &diff)
{
ptr += diff * size;
return *this;
}
operator Iterator() const
{
return Iterator(ptr, size);
}
};
void swap(df::Iterator::record_reference a, df::Iterator::record_reference b)
{
unsigned char *p;
unsigned char *q;
unsigned char *const sentry = (unsigned char *)a.ptr + a.size;
for (p = (unsigned char *)a.ptr, q = (unsigned char *)b.ptr; p < sentry; ++p, ++q)
{
const unsigned char t = *p;
*p = *q;
*q = t;
}
}
}
File: comparator.hpp
#pragma once
#include <memory>
#include <functional>
#include "dataframe.hpp"
#include "iterator.hpp"
namespace compare
{
using comparator_fn = std::function<int(const df::Iterator::record_reference, const df::Iterator::record_reference)>;
template <typename T, std::size_t offset = 0, std::size_t size = sizeof(T)>
static inline comparator_fn make_comparator()
{
if constexpr (size == 3 or size == 5 or size == 7 or size > 8)
return [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
{ return std::memcmp(a + offset, b + offset, size); };
return [](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
{ return *(T *)(a + offset) < *(T *)(b + offset) ? -1 : *(T *)(b + offset) < *(T *)(a + offset) ? +1
: 0; };
}
template <typename T>
static inline comparator_fn make_comparator(const std::size_t offset)
{
return [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
{ return *(T *)(a + offset) < *(T *)(b + offset) ? -1 : *(T *)(b + offset) < *(T *)(a + offset) ? +1
: 0; };
}
static inline comparator_fn make_column_comparator(const df::Dtype dtype, const std::size_t offset)
{
switch (dtype.base())
{
case df::Base::CHAR:
{
if (dtype.size() == 1)
return make_comparator<std::uint8_t>(offset);
else if (dtype.size() == 2)
return [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
{ return std::memcmp(a + offset, b + offset, 2); }; // C'� qualche beneficio a fissare il 2? o conviene trattarlo come uno unsigned short?
return [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
{ return std::memcmp(a + offset, b + offset, dtype.size()); };
}
case df::Base::UNSIGNED:
{
return [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
{
std::uint64_t uint_value_a = 0;
std::uint64_t uint_value_b = 0;
std::memcpy(&uint_value_a, a + offset, dtype.size());
std::memcpy(&uint_value_b, b + offset, dtype.size());
return (uint_value_a < uint_value_b ? -1 : uint_value_a > uint_value_b ? +1
: 0);
};
}
default:
throw std::runtime_error("Unsupported dtype");
break;
}
}
static inline comparator_fn make_composite_two_way_comparator(const std::shared_ptr<df::Dataframe> &T)
{
const auto K = T->dtypes().size();
std::vector<comparator_fn> F;
for (int i = 0; i < K; i++)
{
F.emplace_back(make_column_comparator(T->dtypes()[i], T->offsets()[i]));
}
const auto comparator = [=](const df::Iterator::record_reference a, const df::Iterator::record_reference b)
{
for (int i = 0; i < K; i++)
{
// If equal go to the next column, otherwise return the result
// The return value is true if the first argument is less than the second
// and false otherwise
if (const auto result = F[i](a, b); result != 0)
return result < 0;
}
return false;
};
return comparator;
}
}
File: main.cpp
#include <iostream>
#include <vector>
#include "dataframe.hpp"
#include "comparator.hpp"
template <typename RandomAccessIterator, typename Comparator>
static void selection_sort(RandomAccessIterator first, RandomAccessIterator last, Comparator comp)
{
for (auto i = first; i != last; ++i)
{
auto min = i;
for (auto j = i + 1; j != last; ++j)
{
if (comp(*j, *min))
min = j;
}
df::Iterator::value_type temp = *i;
*i = *min;
*min = temp;
// Alternative
// std::iter_swap(i, min);
}
}
int main(int argc, char const *argv[])
{
std::vector<std::string> values{"20", "21", "20", "19", "10", "11", "40", "41", "10", "11"};
// Create a vector that contains values grouped by 2
std::vector<std::vector<std::string>> v;
for (int i = 0; i < values.size(); i += 2)
{
std::vector<std::string> temp;
temp.push_back(values[i]);
temp.push_back(values[i + 1]);
v.push_back(temp);
}
std::vector<std::string> column_names = {"a", "b"};
df::Dtype d = df::Dtype(df::Base::UNSIGNED, 4);
std::vector dtypes = {d, d};
// Create a dataframe
std::shared_ptr<df::Dataframe> df = df::read_from_vector(v, column_names, dtypes);
std::cout << "Before sorting" << std::endl;
df->print();
// This comparator sorts the dataframe first by column a and then by column b in ascending order
auto comparator = compare::make_composite_two_way_comparator(df);
selection_sort(df->begin(), df->end(), comparator);
std::cout << "\nAfter sorting" << std::endl;
df->print();
// With the std::sort it does not work
std::sort(df->begin(), df->end(), comparator);
return 0;
}
Your type is not a C++17 RandomAccessIterator, because it isn't a C++17 ForwardIterator, because reference is an object type, not a reference type.
The type It satisfies ForwardIterator if
Let T be the value type of It. The type std::iterator_traits<It>::reference must be either
T& or T&& if It satisfies OutputIterator (It is mutable), or
const T& or const T&& otherwise (It is constant),
(Other requirements elided)
You will be able to satisfy the C++20 concept std::random_access_iterator, because that relaxes the requirement on It::reference.
In C++17, the reference type of an iterator must be precisely value_type& in order for that iterator to be random access. Only input iterators can have the reference type be something other than value_type&. So in C++17, proxy iterators are limited to input iterators. And every algorithm written against C++17 has this expectation.
The C++20 ranges library adds the ability to have random access proxy iterators. And the C++20 algorithms that use those range concepts will respect them.

Why is a hashmap slower when I edit it to be able to look up pointers as keys?

Here's a hashmap which I tried to make faster for something:
https://github.com/martinus/robin-hood-hashing/blob/master/src/include/robin_hood.h
I tried to make lookups faster by making it so it could use std::unique_ptr<wchar_t[]> keys and look up keys by raw pointers, and I did this by including cwchar on line 51, and adding the code from lines 770-809, lines 1726-1778, and lines 2628-2630:
https://pastebin.com/zbhxEj7B
// lines 770-809
template <class T>
struct ptr_comparer
{
bool operator() (const std::unique_ptr<T[]>& arr1, const std::unique_ptr<T[]>& arr2) const
{
#ifdef _WIN32
return std::wcscmp(arr1.get(), arr2.get()) == 0;
#else
return std::strcmp(arr1.get(), arr2.get()) == 0;
#endif
}
bool operator() (const std::unique_ptr<T[]>& arr1, const T* arr2) const
{
#ifdef _WIN32
return std::wcscmp(arr1.get(), arr2) == 0;
#else
return std::strcmp(arr1.get(), arr2) == 0;
#endif
}
};
template <class T>
struct ptr_hasher
{
size_t operator()(std::unique_ptr<T[]> const& arr) const noexcept
{
#ifdef _WIN32
return hash_bytes(arr.get(), sizeof(T) * (std::wcslen(arr.get()) + 1));
#else
return hash_bytes(arr.get(), sizeof(T) * (std::strlen(arr.get()) + 1));
#endif
}
// used with findWithSize
size_t operator()(const T* arr, int size) const noexcept
{
return hash_bytes(arr, sizeof(T) * size);
}
};
// lines 1726-1778
// same as keyToIdx, but takes size as argument and calls WHash::operator() with size argument
void keyToIdxWithSize(const char_type* key, size_t* idx, InfoType* info, int size) const {
// In addition to whatever hash is used, add another mul & shift so we get better hashing.
// This serves as a bad hash prevention, if the given data is
// badly mixed.
auto h = static_cast<uint64_t>(WHash::operator()(key, size));
h *= mHashMultiplier;
h ^= h >> 33U;
// the lower InitialInfoNumBits are reserved for info.
*info = mInfoInc + static_cast<InfoType>((h & InfoMask) >> mInfoHashShift);
*idx = (static_cast<size_t>(h) >> InitialInfoNumBits) & mMask;
}
// same as findIdx, but takes size as argument and calls keyToIdxWithSize
ROBIN_HOOD(NODISCARD)
size_t findIdxWithSize(const char_type* arr, int size) const
{
size_t idx{};
InfoType info{};
keyToIdxWithSize(arr, &idx, &info, size);
do {
// unrolling this twice gives a bit of a speedup. More unrolling did not help.
if (info == mInfo[idx] &&
ROBIN_HOOD_LIKELY(WKeyEqual::operator()(mKeyVals[idx].getFirst(), arr))) {
return idx;
}
next(&info, &idx);
if (info == mInfo[idx] &&
ROBIN_HOOD_LIKELY(WKeyEqual::operator()(mKeyVals[idx].getFirst(), arr))) {
return idx;
}
next(&info, &idx);
} while (info <= mInfo[idx]);
// nothing found!
return mMask == 0 ? 0
: static_cast<size_t>(std::distance(
mKeyVals, reinterpret_cast_no_cast_align_warning<Node*>(mInfo)));
}
// same as iterator find(const key_type& key), but takes size as argument and calls findIdxWithSize
// size passed as int because the c string won't be larger than INT_MAX
iterator findWithSize(const char_type* arr, int size)
{
ROBIN_HOOD_TRACE(this)
const size_t idx = findIdxWithSize(arr, size);
return iterator{ mKeyVals + idx, mInfo + idx };
}
// lines 2628-2630
template <typename Key, typename T, typename Hash = ptr_hasher<char_type>,
typename KeyEqual = ptr_comparer<char_type>, size_t MaxLoadFactor100 = 80>
using ptr_map = detail::Table<false, MaxLoadFactor100, Key, T, Hash, KeyEqual>;
The problem is that this ends up being about 100 nanoseconds slower than just using std::wstring_view keys. Why is it so much slower? In the program I'm using this for, I assumed it would be faster because I'm doing basically the same thing except for needing to make a std::wstring_view object from a wchar_t pointer. But it's slower in spite of that.
This is the code I'm using to check the speed:
static NTSTATUS WINAPI NtCreateFileHook(
PHANDLE FileHandle,
ACCESS_MASK DesiredAccess,
POBJECT_ATTRIBUTES ObjectAttributes,
PIO_STATUS_BLOCK IoStatusBlock,
PLARGE_INTEGER AllocationSize,
ULONG FileAttributes,
ULONG ShareAccess,
ULONG CreateDisposition,
ULONG CreateOptions,
PVOID EaBuffer,
ULONG EaLength)
{
auto start = std::chrono::high_resolution_clock::now();
wchar_t* file_path = (wchar_t*)(ObjectAttributes->ObjectName->Buffer);
wchar_t* file_name = file_path + ((ObjectAttributes->ObjectName->Length) / sizeof(wchar_t));
int file_name_size = 0; // will always increase to at least 0 due to there always being a null terminator
for (; file_name >= file_path && *file_name != L'\\'; file_name--, file_name_size++) {}
robin_hood::ptr_map<std::unique_ptr<wchar_t[]>, std::unique_ptr<int[]>>::iterator idk_iter = idk.findWithSize(file_name + 1, file_name_size);
if (idk_iter != idk.end()) // this will be false almost always, and times when it's true aren't counted
{
delay_file(idk, idk_iter->second, delay_array_mutex);
}
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start);
long long int n = duration.count();
char buf[21] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
snprintf(buf, sizeof(buf), "%llu\n", n);
fwrite(buf, sizeof(char), strlen(buf), f);
return NtCreateFile(
FileHandle,
DesiredAccess,
ObjectAttributes,
IoStatusBlock,
AllocationSize,
FileAttributes,
ShareAccess,
CreateDisposition,
CreateOptions,
EaBuffer,
EaLength
);
}
This version takes about 550 nanoseconds on average, but if I instead don't edit the hashmap at all, and use std::wstring_view keys and create a std::wstring_view object in the function to pass to the hashmap, it only takes about 450 nanoseconds on average. Why is that?
I compiled on Windows 10 in MSVC with O2 optimization.
UPDATE: I still don't know why it's slower. Someone suggested it could be because std::wstring_view objects will compare lengths when using operator==. I was pretty sure this wouldn't matter, but I tested it anyways by storing the size in the first sizeof(int) bytes of the wchar_t strings and changing lines 770-809 to this:
template <class T>
struct ptr_comparer
{
bool operator() (const std::unique_ptr<T[]>& arr1, const std::unique_ptr<T[]>& arr2) const
{
#ifdef _WIN32
return std::wcscmp(arr1.get() + (sizeof(int) / sizeof(T)), arr2.get() + (sizeof(int) / sizeof(T))) == 0;
#else
return std::strcmp(arr1.get() + (sizeof(int) / sizeof(T)), arr2.get() + (sizeof(int) / sizeof(T))) == 0;
#endif
}
bool operator() (const std::unique_ptr<T[]>& arr1, const T* arr2, int arr2size) const
{
T* arr1ptr = arr1.get();
int n = 0;
std::memcpy(&n, arr1ptr, sizeof(int));
arr1ptr += sizeof(int) / sizeof(T);
#ifdef _WIN32
return n == arr2size && std::wcscmp(arr1ptr, arr2) == 0;
#else
return n == arr2size && std::strcmp(arr1ptr, arr2) == 0;
#endif
}
};
template <class T>
struct ptr_hasher
{
size_t operator()(std::unique_ptr<T[]> const& arr) const noexcept
{
T* arrptr = arr.get() + (sizeof(int) / sizeof(T));
#ifdef _WIN32
return hash_bytes(arrptr, sizeof(T) * (std::wcslen(arrptr) + 1));
#else
return hash_bytes(arrptr, sizeof(T) * (std::strlen(arrptr) + 1));
#endif
}
// used with findWithSize
size_t operator()(const T* arr, int size) const noexcept
{
return hash_bytes(arr, sizeof(T) * size);
}
};
The results are no different, it's still about 550 nanoseconds.
UPDATE 2: I figured out that if I call hash_bytes in keyToIdx instead of calling WHash::operator(), it makes it slower by about 100ns. Why, and how do I fix that so I can put std::unique_ptr<wchar_t[]> in the hashmap without that speed loss?

Custom allocator for STL fails to compile in release mode only

I have written a custom allocate which i'm using with std::vector. The code compiles and works when in debug mode, but it fails to compile in release mode with a strange error.
Here is my allocator :
template< class T >
class AllocPowOf2
{
public:
typedef size_t size_type;
typedef ptrdiff_t difference_type;
typedef T * pointer;
typedef const T * const_pointer;
typedef T & reference;
typedef const T & const_reference;
typedef T value_type;
private:
size_type m_nMinNbBytes;
public:
template< class U >
struct rebind
{
typedef AllocPowOf2< U > other;
};
inline pointer address( reference value ) const
{
return & value;
}
inline const_pointer address( const_reference value ) const
{
return & value;
}
inline AllocPowOf2( size_type nMinNbBytes = 32 )
: m_nMinNbBytes( nMinNbBytes ) { }
inline AllocPowOf2( const AllocPowOf2 & oAlloc )
: m_nMinNbBytes( oAlloc.m_nMinNbBytes ) { }
template< class U >
inline AllocPowOf2( const AllocPowOf2< U > & oAlloc )
: m_nMinNbBytes( oAlloc.m_nMinNbBytes ) { }
inline ~AllocPowOf2() { }
inline bool operator != ( const AllocPowOf2< T > & oAlloc )
{
return m_nMinNbBytes != oAlloc.m_nMinNbBytes;
}
inline size_type max_size() const
{
return size_type( -1 ) / sizeof( value_type );
}
static size_type OptimizeNbBytes( size_type nNbBytes, size_type nMin )
{
if( nNbBytes < nMin )
{
nNbBytes = nMin;
}
else
{
size_type j = nNbBytes;
j |= (j >> 1);
j |= (j >> 2);
j |= (j >> 4);
j |= (j >> 8);
#if ENV_32BITS || ENV_64BITS
j |= (j >> 16);
#endif
#if ENV_64BITS
j |= (j >> 32);
#endif
++j; // Least power of two greater than nNbBytes and nMin
if( j > nNbBytes )
{
nNbBytes = j;
}
}
return nNbBytes;
}
pointer allocate( size_type nNum )
{
return new value_type[ OptimizeNbBytes( nNum * sizeof( value_type ), 32 ) ]; // ERROR HERE, line 97
}
void construct( pointer p, const value_type & value )
{
new ((void *) p) value_type( value );
}
void destroy( pointer p )
{
p->~T();
}
void deallocate( pointer p, size_type nNum )
{
(void) nNum;
delete[] p;
}
};
Here is the error :
Error 1 error C2512: 'std::_Aux_cont' : no appropriate default constructor available c:\XXX\AllocPowOf2.h 97
The code compiles correctly in debug mode in both Windows with VS2008 and Android with the Android NDK and eclipse.
Any idea ?
return new value_type[ OptimizeNbBytes( nNum * sizeof( value_type ), 32 ) ];
Ignoring OptimizeNbBytes for now, you are newing up nNum * sizeof(value_type) value_types, which also calls value_type's constructor that many times.
In other words, asked to allocate memory for 16 ints, you would allocate enough for 64 ints instead; not only that, but you were asked for raw memory, and instead ran constructors all over them, creating objects that will be overwritten by the container without being destroyed - and then the delete[] in deallocate will result in double destruction.
allocate should allocate raw memory:
return pointer(::operator new(OptimizeNbBytes( nNum * sizeof( value_type ), 32 )));
and deallocate should deallocate the memory without running any destructor:
::operator delete((void*)p);

Copy constructor used in a "for" loop, but where?

I'm writing an UTF-8 string class and it's two const and non-const iterator classes. I'm encountering a const problem. Here are the classes :
class Utf8String
{
public:
class ConstIter;
class Iter
{
friend class ConstIter;
private:
Iter();
private:
Utf8String * m_pStr;
utf8::iterator< char * > m_oIter;
public:
Iter( const Iter & );
inline explicit Iter( Utf8String * pStr )
: m_pStr( pStr )
, m_oIter( m_pStr->m_sBuf, m_pStr->m_sBuf, m_pStr->m_sBuf + m_pStr->m_nSize )
{ }
inline Iter & operator = ( const Iter & oIter )
{
m_pStr = oIter.m_pStr;
m_oIter = utf8::iterator< char * >(
m_pStr->m_sBuf,
m_pStr->m_sBuf,
m_pStr->m_sBuf + m_pStr->m_nSize );
return *this;
}
inline operator const char * () const
{
return m_oIter.base();
}
inline uchar32_t operator * () const
{
return *m_oIter;
}
inline Iter & operator ++ ()
{
++m_oIter;
return *this;
}
inline Iter & operator -- ()
{
--m_oIter;
return *this;
}
inline bool operator == ( const Iter & oIter )
{
return m_oIter == oIter.m_oIter;
}
inline bool operator != ( const Iter & oIter )
{
return m_oIter != oIter.m_oIter;
}
};
class ConstIter
{
private:
ConstIter();
private:
const Utf8String * m_pStr;
utf8::iterator< const char * > m_oIter;
public:
ConstIter( const ConstIter & );
inline ConstIter( const Iter & oIter )
: m_pStr( oIter.m_pStr )
, m_oIter( m_pStr->m_sBuf, m_pStr->m_sBuf, m_pStr->m_sBuf + m_pStr->m_nSize )
{ }
inline ConstIter( const Utf8String * pStr )
: m_pStr( pStr )
, m_oIter( m_pStr->m_sBuf, m_pStr->m_sBuf, m_pStr->m_sBuf + m_pStr->m_nSize )
{ }
inline operator const char * () const
{
return m_oIter.base();
}
inline ConstIter & operator = ( const ConstIter & oIter )
{
m_pStr = oIter.m_pStr;
m_oIter = utf8::iterator< const char * >(
oIter.m_pStr->m_sBuf,
oIter.m_pStr->m_sBuf,
oIter.m_pStr->m_sBuf + oIter.m_pStr->m_nSize );
return *this;
}
inline ConstIter & operator = ( const Iter & oIter )
{
m_pStr = oIter.m_pStr;
m_oIter = utf8::iterator< const char * >(
m_pStr->m_sBuf,
m_pStr->m_sBuf,
m_pStr->m_sBuf + m_pStr->m_nSize );
return *this;
}
inline uchar32_t operator * () const
{
return *m_oIter;
}
inline ConstIter & operator ++ ()
{
++m_oIter;
return *this;
}
inline ConstIter & operator -- ()
{
--m_oIter;
return *this;
}
inline bool operator == ( const ConstIter & oIter )
{
return m_oIter == oIter.m_oIter;
}
inline bool operator != ( const ConstIter & oIter )
{
return m_oIter != oIter.m_oIter;
}
};
// More stuff
};
Which i'm using as follows :
Utf8String sStr = "not const";
for( Utf8String::Iter i = sStr.Begin(); i != sStr.End(); ++i )
{
}
// 2) Iterating over a const UTF-8 string :
const Utf8String sConstStr = "const";
for( Utf8String::ConstIter i = sConstStr.Begin(); i != sConstStr.End(); ++i )
{
}
// 3) Const interators can also iterate over a non-const string :
for( Utf8String::ConstIter i = sStr.Begin(); i != sStr.End(); ++i )
{
}
The problem is that, if the copy constructor of the iterator classes are not declared public, i'm getting the following error, despite that copy constructor not being explicitly used :
Error 1 error C2248: 'core::Utf8String::Iter::Iter' : cannot access private member declared in class 'core::Utf8String::Iter' c:\xxx\main.cpp 20
Declaring these copy constructors public solve the problem.
What happens ? Is the compiler optimizing Utf8String::ConstIter i = sStr.Begin() into Utf8String::ConstIter i( sStr.Begin() ) or doing some other implicit optimization ?
Thanks for your help. :)
EDIT: Using VS2005 and no C++11.
Utf8String::ConstIter i = sStr.Begin(); is a declaration together with an initialization. It is not an assignment. This initialization is done using the copy constructor.

Unable to insert more than 256 nodes into a custom tree

I've been stuck on this for quite some time now and have even tested the issue between a 64-bit version of gcc on Ubuntu as welll as a 32-bit gcc on Windows (MinGW).
Any time I insert more than 256 nodes into a binary-tree(?), it stops counting the number of nodes. I can still access all of my data. I have a feeling that it has something to do with the way I have my structure setup, by using chars to acquire each bit of each byte, but I have no idea how to fix it.
In this header, I have a structure and some functions setup which allows me to acquire an individual bit of an object.
This is the actual tree implementation. In order to find where to store each object, the tree iterates through each byte of a key, then iterates again through each bit of those bytes. The "iterate" function is what is giving me the most difficulty though; I have no idea why, but once 256 nodes become filled with data, my structure stops counting further, then begins to replace all previous data. I believe this has something to do with the fact that a single char can only hold 0-256, but I can't see where this would be an issue. Since the location of each node is determined by the individual bits of the key, it's hard to determine why only 256 items can be placed into the tree.
The URL to my test program is at the bottom of the post. SO won't let me post more than 2 at the moment. I would like to get this done soon, so any help would be greatly appreciated.
Edit:
Just to make things easier, this is the structure that gives me the individual bit of a byte, as well as a helper function:
struct bitMask {
char b1 : 1;
char b2 : 1;
char b3 : 1;
char b4 : 1;
char b5 : 1;
char b6 : 1;
char b7 : 1;
char b8 : 1;
char operator[] ( unsigned i ) const {
switch( i ) {
case 0 : return b1;
case 1 : return b2;
case 2 : return b3;
case 3 : return b4;
case 4 : return b5;
case 5 : return b6;
case 6 : return b7;
case 7 : return b8;
}
return 0; // Avoiding a compiler error
}
};
/******************************************************************************
* Functions shared between tree-type objects
******************************************************************************/
namespace treeShared {
// Function to retrieve the next set of bits at the pointer "key"
template <typename key_t>
inline const bitMask* getKeyByte( const key_t* key, unsigned iter );
/* template specializations */
template <>
inline const bitMask* getKeyByte( const char*, unsigned );
template <>
inline const bitMask* getKeyByte( const wchar_t*, unsigned );
template <>
inline const bitMask* getKeyByte( const char16_t*, unsigned );
template <>
inline const bitMask* getKeyByte( const char32_t*, unsigned );
} // end treeShared namespace
/*
* Tree Bit Mask Function
*/
template <typename key_t>
inline const bitMask* treeShared::getKeyByte( const key_t* k, unsigned iter ) {
return (iter < sizeof( key_t ))
? reinterpret_cast< const bitMask* >( k+iter )
: nullptr;
}
/*
* Tree Bit Mask Specializations
*/
template <>
inline const bitMask* treeShared::getKeyByte( const char* str, unsigned iter ) {
return (str[ iter ] != '\0')
? reinterpret_cast< const bitMask* >( str+iter )
: nullptr;
}
template <>
inline const bitMask* treeShared::getKeyByte( const wchar_t* str, unsigned iter ) {
return (str[ iter ] != '\0')
? reinterpret_cast< const bitMask* >( str+iter )
: nullptr;
}
template <>
inline const bitMask* treeShared::getKeyByte( const char16_t* str, unsigned iter ) {
return (str[ iter ] != '\0')
? reinterpret_cast< const bitMask* >( str+iter )
: nullptr;
}
template <>
inline const bitMask* treeShared::getKeyByte( const char32_t* str, unsigned iter ) {
return (str[ iter ] != '\0')
? reinterpret_cast< const bitMask* >( str+iter )
: nullptr;
}
And here is the tree class:
template <typename data_t>
struct bTreeNode {
data_t* data = nullptr;
bTreeNode* subNodes = nullptr;
~bTreeNode() {
delete data;
delete [] subNodes;
data = nullptr;
subNodes = nullptr;
}
};
/******************************************************************************
* Binary-Tree Structure Setup
******************************************************************************/
template <typename key_t, typename data_t>
class bTree {
enum node_dir : unsigned {
BNODE_LEFT = 0,
BNODE_RIGHT = 1,
BNODE_MAX
};
protected:
bTreeNode<data_t> head;
unsigned numNodes = 0;
private:
bTreeNode<data_t>* iterate( const key_t* k, bool createNodes );
public:
~bTree() {}
// STL-Map behavior
data_t& operator [] ( const key_t& k );
void push ( const key_t& k, const data_t& d );
void pop ( const key_t& k );
bool hasData ( const key_t& k );
const data_t* getData ( const key_t& k );
unsigned size () const { return numNodes; }
void clear ();
};
/*
* Binary-Tree -- Element iteration
*/
template <typename key_t, typename data_t>
bTreeNode<data_t>* bTree<key_t, data_t>::iterate( const key_t* k, bool createNodes ) {
node_dir dir;
unsigned bytePos = 0;
bTreeNode<data_t>* bNodeIter = &head;
const bitMask* byteIter = nullptr;
while ( byteIter = treeShared::getKeyByte< key_t >( k, bytePos++ ) ) {
for ( int currBit = 0; currBit < HL_BITS_PER_BYTE; ++currBit ) {
// compare the bits of each byte in k
dir = byteIter->operator []( currBit ) ? BNODE_LEFT : BNODE_RIGHT;
// check to see if a new bTreeNode needs to be made
if ( !bNodeIter->subNodes ) {
if ( createNodes ) {
// create and initialize the upcoming sub bTreeNode
bNodeIter->subNodes = new bTreeNode<data_t>[ BNODE_MAX ];
}
else {
return nullptr;
}
}
// move to the next bTreeNode
bNodeIter = &(bNodeIter->subNodes[ dir ]);
}
}
return bNodeIter;
}
/*
* Binary-Tree -- Destructor
*/
template <typename key_t, typename data_t>
void bTree<key_t, data_t>::clear() {
delete head.data;
delete [] head.subNodes;
head.data = nullptr;
head.subNodes = nullptr;
numNodes = 0;
}
/*
* Binary-Tree -- Array Subscript operators
*/
template <typename key_t, typename data_t>
data_t& bTree<key_t, data_t>::operator []( const key_t& k ) {
bTreeNode<data_t>* iter = iterate( &k, true );
if ( !iter->data ) {
iter->data = new data_t();
++numNodes;
}
return *iter->data;
}
/*
* Binary-Tree -- Push
* Push a data element to the tree using a key
*/
template <typename key_t, typename data_t>
void bTree<key_t, data_t>::push( const key_t& k, const data_t& d ) {
bTreeNode<data_t>* iter = iterate( &k, true );
if ( !iter->data ) {
iter->data = new data_t( d );
++numNodes;
}
else {
*iter->data = d;
}
}
/*
* Binary-Tree -- Pop
* Remove whichever element lies at the key
*/
template <typename key_t, typename data_t>
void bTree<key_t, data_t>::pop( const key_t& k ) {
bTreeNode<data_t>* iter = iterate( &k, false );
if ( !iter || !iter->data )
return;
delete iter->data;
iter->data = nullptr;
--numNodes;
}
/*
* Binary-Tree -- Has Data
* Return true if there is a data element at the key
*/
template <typename key_t, typename data_t>
bool bTree<key_t, data_t>::hasData( const key_t& k ) {
bTreeNode<data_t>* iter = iterate( &k, false );
return iter && ( iter->data != nullptr );
}
/*
* Binary-Tree -- Push
* Return a pointer to the data that lies at a key
* Returns a nullptr if no data exists
*/
template <typename key_t, typename data_t>
const data_t* bTree<key_t, data_t>::getData( const key_t& k ) {
bTreeNode<data_t>* iter = iterate( &k, false );
if ( !iter )
return nullptr;
return iter->data;
}
pastebin.com/8MZ0TMpj
template <typename key_t>
inline const bitMask* treeShared::getKeyByte( const key_t* k, unsigned iter ) {
return (iter < sizeof( key_t ))
? reinterpret_cast< const bitMask* >( k+iter )
: nullptr;
}
This doesn't do what you seem to think it does. (k+iter) doesn't retrieve the iter'th byte of k, but the iter'th element of the key_t[] array pointed to by k. In other words, k+iter advances the pointer by iter*sizeof(key_t) bytes, not by iter bytes.
Formally, this code exhibits undefined behavior, by overrunning array bounds. Practically speaking, your program uses just a single byte of the key, and then sizeof(key_t)-1 random bytes that just happen to sit in memory above that key. That's why you are effectively limited to 8 bits of state.
In addition, your reinterpret_cast also exhibits undefined behavior, formally speaking. The only legal use for a pointer obtained with reinterpret_cast is to reinterpret_cast it right back to the original type. This is not the immediate cause of your problem though.