C++ lock-free stack is corrupt

C++ lock-free stack is corrupt - c++

i am trying to implement a lockfree stack to be usable with external managed memory from a bounded plain c array. I know reference implementations (like from Anthony Williams: Concurrency in Action) and other books and blogs/article around the web.
The implementation follows those references and avoids the ABA problem, because external memory locations are addressed using unique indexes, rather than recycled pointers. Therefore it does not need to deal with mem managment at all and is simple.
I wrote some tests that execute pop and push operations on that stack under high load and contention (stress tests) and single threaded. The former fail with strange problems, that I do not understand and to me look obscure.
Maybe someone has an idea ?
Problem: Pushing an already popped node back to the stack fails, because precondition is violated that node has no successor (next).
BOOST_ASSERT(!m_aData.m_aNodes[nNode-1].next);
Reproduction setup: At least 3 threads and a capacity of ~16. Around 500 passes. Then push op fails.
Problem: Number of elements popped by all threads and number of elements left in stack after join do not match capacity (nodes lost in transition).
BOOST_ASSERT(aNodes.size()+nPopped == nCapacity);
Reproduction setup: 2 threads and capacity 2. Requires a lot of passes to occur, for me at least 700. After that head of stack is 0, but only one node is present in popped container. Node {2,0} is dangling.
I compiled with vs2005, vs2013 and vs2015. All have the same problem (vs2005 is also the reason that code looks C++03 like).
Here is the basic code for node+stack
template <typename sizeT> struct node
{
sizeT cur; //!< construction invariant
atomic<sizeT> next;
atomic<sizeT> data;
explicit node() // invalid node
: cur(0), next(0), data(0)
{}
explicit node(sizeT const& nCur, sizeT const& nNext, sizeT const& nData)
: cur(nCur), next(nNext), data(nData)
{}
node& operator=(node const& rhs)
{
cur = rhs.cur;
next.store(rhs.next.load(memory_order_relaxed));
data.store(rhs.data.load(memory_order_relaxed));
return *this;
}
};
template <typename sizeT> struct stack
{
private:
static memory_order const relaxed = memory_order_relaxed;
atomic<sizeT> m_aHead;
public:
explicit stack(sizeT const& nHead) : m_aHead(nHead) {}
template <typename tagT, typename T, std::size_t N>
typename enable_if<is_same<tagT,Synchronized>,sizeT>::type
pop(T (&aNodes)[N])
{
sizeT nOldHead = m_aHead.load();
for(;;)
{
if(!nOldHead) return 0;
BOOST_ASSERT(nOldHead <= N);
T& aOldHead = aNodes[nOldHead-1];
sizeT const nNewHead = aOldHead.next.load(/*relaxed*/);
BOOST_ASSERT(nNewHead <= N);
sizeT const nExpected = nOldHead;
if(m_aHead.compare_exchange_weak(nOldHead,nNewHead
/*,std::memory_order_acquire,std::memory_order_relaxed*/))
{
BOOST_ASSERT(nExpected == nOldHead);
// <--- from here on aOldHead is thread local ---> //
aOldHead.next.store(0 /*,relaxed*/);
return nOldHead;
}
// TODO: add back-off strategy under contention (use loop var)
}
}
template <typename tagT, typename T, std::size_t N>
typename enable_if<is_same<tagT,Synchronized>,void>::type
push(T (&aNodes)[N], sizeT const& nNewHead)
{
#ifndef NDEBUG
{
BOOST_ASSERT(0 < nNewHead && nNewHead <= N);
sizeT const nNext = aNodes[nNewHead-1].next;
BOOST_ASSERT(!nNext);
}
#endif
sizeT nOldHead = m_aHead.load(/*relaxed*/);
for(;;)
{
aNodes[nNewHead-1].next.store(nOldHead /*,relaxed*/);
sizeT const nExpected = nOldHead;
BOOST_ASSERT(nOldHead <= N);
if(m_aHead.compare_exchange_weak(nOldHead,nNewHead
/*,std::memory_order_release,std::memory_order_relaxed*/))
{
BOOST_ASSERT(nExpected == nOldHead);
return;
}
// TODO: add back-off strategy under contention (use loop var)
}
}
};
and the quite noisy test class
class StackTest
{
private:
typedef boost::mpl::size_t<64> Capacity;
//typedef boost::uint_t<static_log2_ceil<Capacity::value>::value>::least size_type;
typedef std::size_t size_type;
static size_type const nCapacity = Capacity::value;
static size_type const nNodes = Capacity::value;
typedef node<size_type> Node;
typedef stack<size_type> Stack;
typedef mt19937 Twister;
typedef random::uniform_int_distribution<std::size_t> Distribution;
typedef variate_generator<Twister,Distribution> Die;
struct Data //!< shared along threads
{
Node m_aNodes[nNodes];
Stack m_aStack;
explicit Data() : m_aStack(nNodes)
{
m_aNodes[0] = Node(1,0,0); // tail of stack
for(size_type i=1; i<nNodes; ++i)
{
m_aNodes[i] = Node(static_cast<size_type>(i+1),i,0);
}
}
template <typename syncT>
void Run(
uuids::random_generator& aUUIDGen,
std::size_t const& nPasses,
std::size_t const& nThreads)
{
std::vector<ThreadLocalData> aThreadLocalDatas(nThreads,ThreadLocalData(*this));
{
static std::size_t const N = 100000;
Die aRepetition(Twister(hash_value(aUUIDGen())),Distribution(0,N));
Die aAction(Twister(hash_value(aUUIDGen())),Distribution(0,1));
for(std::size_t i=0; i<nThreads; ++i)
{
std::vector<bool>& aActions = aThreadLocalDatas[i].m_aActions;
std::size_t const nRepetition = aRepetition();
aActions.reserve(nRepetition);
for(std::size_t k=0; k<nRepetition; ++k)
{
aActions.push_back(static_cast<bool>(aAction()));
}
}
}
std::size_t nPopped = 0;
if(nThreads == 1)
{
std::size_t const i = 0;
aThreadLocalDatas[i].Run<syncT>(i);
nPopped += aThreadLocalDatas[i].m_aPopped.size();
}
else
{
std::vector<boost::shared_ptr<thread> > aThreads;
aThreads.reserve(nThreads);
for(std::size_t i=0; i<nThreads; ++i)
{
aThreads.push_back(boost::make_shared<thread>(boost::bind(&ThreadLocalData::Run<syncT>,&aThreadLocalDatas[i],i)));
}
for(std::size_t i=0; i<nThreads; ++i)
{
aThreads[i]->join();
nPopped += aThreadLocalDatas[i].m_aPopped.size();
}
}
std::vector<size_type> aNodes;
aNodes.reserve(nCapacity);
while(size_type const nNode = m_aStack.pop<syncT>(m_aNodes))
{
aNodes.push_back(nNode);
}
std::clog << dump(m_aNodes,4) << std::endl;
BOOST_ASSERT(aNodes.size()+nPopped == nCapacity);
}
};
struct ThreadLocalData //!< local to each thread
{
Data& m_aData; //!< shared along threads
std::vector<bool> m_aActions; //!< either pop or push
std::vector<size_type> m_aPopped; //!< popp'ed nodes
explicit ThreadLocalData(Data& aData)
: m_aData(aData), m_aActions(), m_aPopped()
{
m_aPopped.reserve(nNodes);
}
template <typename syncT>
void Run(std::size_t const& k)
{
BOOST_FOREACH(bool const& aAction, m_aActions)
{
if(aAction)
{
if(size_type const nNode = m_aData.m_aStack.pop<syncT>(m_aData.m_aNodes))
{
BOOST_ASSERT(!m_aData.m_aNodes[nNode-1].next);
m_aPopped.push_back(nNode);
}
}
else
{
if(!m_aPopped.empty())
{
size_type const nNode = m_aPopped.back();
size_type const nNext = m_aData.m_aNodes[nNode-1].next;
ASSERT_IF(!nNext,"nNext=" << nNext << " for " << m_aData.m_aNodes[nNode-1] << "\n\n" << dump(m_aData.m_aNodes));
m_aData.m_aStack.push<syncT>(m_aData.m_aNodes,nNode);
m_aPopped.pop_back();
}
}
}
}
};
template <typename syncT>
static void PushPop(
uuids::random_generator& aUUIDGen,
std::size_t const& nPasses,
std::size_t const& nThreads)
{
BOOST_ASSERT(nThreads > 0);
BOOST_ASSERT(nThreads == 1 || (is_same<syncT,Synchronized>::value));
std::clog << BOOST_CURRENT_FUNCTION << " with threads=" << nThreads << std::endl;
for(std::size_t nPass=0; nPass<nPasses; ++nPass)
{
std::ostringstream s;
s << " " << nPass << "/" << nPasses << ": ...";
std::clog << s.str() << std::endl;
Data().Run<syncT>(aUUIDGen,nPass,nThreads);
}
}
public:
static void Run()
{
typedef StackTest self_t;
uuids::random_generator aUUIDGen;
static std::size_t const nMaxPasses = 1000;
Die aPasses(Twister(hash_value(aUUIDGen())),Distribution(0,nMaxPasses));
{
//std::size_t const nThreads = 2; // thread::hardware_concurrency()+1;
std::size_t const nThreads = thread::hardware_concurrency()+1;
self_t().PushPop<Synchronized>(aUUIDGen,aPasses(),nThreads);
}
}
};
Here is a link to download all required files.

Both problems are just another facet of the ABA problem.
stack: {2,1},{1,0}
Thread A
pop
new_head=1
... time slice exceeded
Thread B
pop
stack: {1,0}, popped: {2,0}
pop
stack: {}, popped: {2,0}, {1,0}
push({2,0})
stack: {2,0}
Thread A
pop continued
cmp_exch succeeds, because head is 2
stack: {}, head=1 --- WRONG, 0 would be correct
Any problems may arise, because access to nodes is not thread local anymore. This includes unexpected modifications of next for popped nodes (problem 1) or lost nodes (problem 2).
head+next need to be modified in one cmp_exch to avoid that problem.

Related

How can I optimise edge creation and vertex deletion in an Adjacency set representation of a graph?

I have a graph represented with an Adjacency set similar to:
struct Vertex {
int x;
bool operator==(const Vertex& b) {
return x==b.x;
}
};
template<> struct std::hash<Vertex> {
std::size_t operator()(Vertex const& v) const noexcept {
return std::hash<int>()(v.x);
}
};
struct Edge {
std::shared_ptr<Vertex> fr;
std::shared_ptr<Vertex> to;
double weight;
Edge(std::shared_ptr<Vertex> fr_in, std::shared_ptr<Vertex> to_in) : fr(fr_in), to(to_in) {};
};
class Graph{
public:
std::shared_ptr<Vertex> addVertex() {
auto new_vertex = std::make_shared<Vertex>();
mAdjacencySet[new_vertex] = {};
return new_vertex;
}
std::shared_ptr<Edge> addEdge(std::shared_ptr<Vertex> fr, std::shared_ptr<Vertex> to) {
auto edge = std::make_shared<Edge>(fr, to);
mAdjacencySet[fr][to] = edge;
return edge;
}
void deleteVertex(std::shared_ptr<Vertex> v) {
mAdjacencySet.erase(v);
for (auto& [key, val] : mAdjacencySet) {
val.erase(v);
}
};
private:
std::unordered_map<
std::shared_ptr<Vertex>,
std::unordered_map<
std::shared_ptr<Vertex>,
std::shared_ptr<Edge>,
Deref::Hash,
Deref::Compare
>,
Deref::Hash,
Deref::Compare
> mAdjacencySet;
};
After I build my graph, I need to prune as many edges as possible because they are expensive to calculate.
One of the strategies to do so, is to delete any vertices with out degree of zero, EXCEPT for the destination vertex. This is very slow to do, relative to the rest of my program.
I wrote a script to time the relative parts of the complexity of each part:
int main() {
Timer wholeProgram;
wholeProgram.start();
Graph g;
auto v1 = g.addVertex();
auto v2 = g.addVertex();
auto e = g.addEdge(v1, v2);
Timer makingVertices;
makingVertices.start();
size_t n = 1e3;
std::vector<std::shared_ptr<Vertex>> vertices(n);
for (size_t i=0; i<n; ++i) {
vertices[i] = g.addVertex();
vertices[i]->x = i;
}
makingVertices.stop();
Timer makingEdges;
makingEdges.start();
for (auto v1 : vertices) {
for (auto v2: vertices) {
if (v1!=v2) {
g.addEdge(v1, v2);
}
}
}
makingEdges.stop();
Timer deletingVertices;
deletingVertices.start();
for (auto vert : vertices) {
g.deleteVertex(vert);
}
deletingVertices.stop();
wholeProgram.stop();
std::cout << "Making Verts: " << makingVertices.elapsedMilliseconds() << std::endl;
std::cout << "Making edges: " << makingEdges.elapsedMilliseconds() << std::endl;
std::cout << "Deleting verts: " << deletingVertices.elapsedMilliseconds() << std::endl;
std::cout << "Whole program: " << wholeProgram.elapsedMilliseconds() << std::endl;
return 0;
}
And the timings (with '-O3') are:
Making Verts: 0
Making edges: 270
Deleting verts: 188
Whole program: 458
(In my actual code base, the deleting of the vertices is actually around 90% of the time to create the graph).
How can I optimize this code to reduce the time to delete vertices (And also I guess optimize the creation of edges, as this is also slow)?
The full code to run this example is:
#include <functional>
#include <memory>
#include <chrono>
#include <iostream>
class Timer
{
public:
void start()
{
m_StartTime = std::chrono::system_clock::now();
m_bRunning = true;
}
void stop()
{
m_EndTime = std::chrono::system_clock::now();
m_bRunning = false;
}
double elapsedMilliseconds()
{
std::chrono::time_point<std::chrono::system_clock> endTime;
if(m_bRunning)
{
endTime = std::chrono::system_clock::now();
}
else
{
endTime = m_EndTime;
}
return std::chrono::duration_cast<std::chrono::milliseconds>(endTime - m_StartTime).count();
}
double elapsedSeconds()
{
return elapsedMilliseconds() / 1000.0;
}
private:
std::chrono::time_point<std::chrono::system_clock> m_StartTime;
std::chrono::time_point<std::chrono::system_clock> m_EndTime;
bool m_bRunning = false;
};
struct Deref {
/**
* #brief Function to dereference the pointer when hashing elements in a hashmap of shared pointers
*
*/
struct Hash {
template <typename T> std::size_t operator()(std::shared_ptr<T> const& p) const
{
return std::hash<T>()(*p);
}
template <typename T> std::size_t operator()(T const & p) const
{
return std::hash<T>(p);
}
};
/**
* #brief Function to dereference the pointer when comparing elements in a hashmap of shared pointers
*
*/
struct Compare {
template <typename T> bool operator()(std::shared_ptr<T> const& a, std::shared_ptr<T> const& b) const
{
return *a == *b;
}
template <typename T> bool operator()(T const& a, T const& b) const
{
return a == b;
}
};
};
struct Vertex {
int x;
bool operator==(const Vertex& b) {
return x==b.x;
}
};
template<> struct std::hash<Vertex> {
std::size_t operator()(Vertex const& v) const noexcept {
return std::hash<int>()(v.x);
}
};
struct Edge {
std::shared_ptr<Vertex> fr;
std::shared_ptr<Vertex> to;
double weight;
Edge(std::shared_ptr<Vertex> fr_in, std::shared_ptr<Vertex> to_in) : fr(fr_in), to(to_in) {};
};
class Graph{
public:
std::shared_ptr<Vertex> addVertex() {
auto new_vertex = std::make_shared<Vertex>();
mAdjacencyList[new_vertex] = {};
return new_vertex;
}
std::shared_ptr<Edge> addEdge(std::shared_ptr<Vertex> fr, std::shared_ptr<Vertex> to) {
auto edge = std::make_shared<Edge>(fr, to);
mAdjacencyList[fr][to] = edge;
return edge;
}
void deleteVertex(std::shared_ptr<Vertex> v) {
mAdjacencyList.erase(v);
for (auto& [key, val] : mAdjacencyList) {
val.erase(v);
}
};
private:
std::unordered_map<
std::shared_ptr<Vertex>,
std::unordered_map<
std::shared_ptr<Vertex>,
std::shared_ptr<Edge>,
Deref::Hash,
Deref::Compare
>,
Deref::Hash,
Deref::Compare
> mAdjacencyList;
};
int main() {
Timer wholeProgram;
wholeProgram.start();
Graph g;
auto v1 = g.addVertex();
auto v2 = g.addVertex();
auto e = g.addEdge(v1, v2);
Timer makingVertices;
makingVertices.start();
size_t n = 1e3;
std::vector<std::shared_ptr<Vertex>> vertices(n);
for (size_t i=0; i<n; ++i) {
vertices[i] = g.addVertex();
vertices[i]->x = i;
}
makingVertices.stop();
Timer makingEdges;
makingEdges.start();
for (auto v1 : vertices) {
for (auto v2: vertices) {
if (v1!=v2) {
g.addEdge(v1, v2);
}
}
}
makingEdges.stop();
Timer deletingVertices;
deletingVertices.start();
for (auto vert : vertices) {
g.deleteVertex(vert);
}
deletingVertices.stop();
wholeProgram.stop();
std::cout << "Making Verts: " << makingVertices.elapsedMilliseconds() << std::endl;
std::cout << "Making edges: " << makingEdges.elapsedMilliseconds() << std::endl;
std::cout << "Deleting verts: " << deletingVertices.elapsedMilliseconds() << std::endl;
std::cout << "Whole program: " << wholeProgram.elapsedMilliseconds() << std::endl;
return 0;
}
And to run it, you can view it online

What's the suited container to push values on top, remove at whatever index and avoid memory reallocation?

I need to build a sort of stack where I can push values on top:
5 // (size 1)
5 3 // (size 2)
5 3 8 // (size 3)
than remove them by value, such as removing 3:
5 8 // (size 2)
than be able to always get the last value (i.e. 8 in the example), when I need it).
I can push max 32 values, so I know the whole size (avoiding heap?).
I think to std::vector with:
initial reserve(32)
.push_back() for insert
vector.erase(std::remove(vector.begin(), vector.end(), value), vector.end()) for remove by value
vector[vector.size() - 1] to retrieve the last element
But maybe there are some stl container better for this kind of process? Not sure if vector are always in the stack and will do further memory reallocation under the hood...

You can write an allocator that contains your 32 values, and refuses to allocate any amount other than 32
template <typename T, std::size_t N = 32>
struct static_allocator
{
T* allocate(std::size_t n) { if (n != N) throw std::bad_alloc(); return arr; }
void deallocate(T *, std::size_t) {}
using pointer = T*;
using const_pointer = const T*;
using void_pointer = void*;
using const_void_pointer = const void*;
using value_type = T;
using size_type = std::size_t;
using difference_type = std::ptrdiff_t;
template <typename U>
struct rebind
{
using other = static_allocator<U, N>;
};
static_allocator select_on_container_copy_construction() { return {}; }
using propagate_on_container_copy_assignment = std::true_type;
using propagate_on_container_move_assignment = std::true_type;
using propagate_on_container_swap = std::true_type;
private:
T arr[N];
};
Then a std::vector<T, static_allocator<T>> will have it's elements as subobjects.
I don't think it's possible to avoid dynamic allocation and have sublinear random-access remove.

if size is limited to 32 elements
why not use a circular buffer of 32 elements, and roll the elements when they are 32 ?
There may be some bugs (don't use last() or remove () on an empty container, don't remove an element not inserted...) ,but it works for the functions you wanted. Here is the idea (heap is avoided)
#include <iostream>
template <typename T>
class Container {
public :
static const int smax_ = 32;
void erase () {
T* pt ((T*) val_);
for (int i (0); i != smax_; ++i, ++pt) *pt = 0;
size_ = 0;
}
Container () : size_ (0) { erase ();}
~Container () {}
void copy (const Container& c) {
size_ = c.size_;
T* pt ((T*) val_);
const T* qt ((const T*) c.val_);
for (int i (0); i != size_; ++i, ++pt, ++qt) *pt++ = *qt++;
}
Container (const Container& c) {
copy (c);
}
void push_back (const T& t) {
if (size_ == smax_) {
T* pt ((T*) val_);
const T* qt ((const T*) val_);
++qt;
for (int i (0); i != size_ -1; ++i, ++pt, ++qt) {
*pt = *qt;
}
*pt = t;
}
else {
val_ [size_] = t;
++size_;
}
}
int size () const {
return size_;
}
void remove (const T& t) {
if (!size_) return;
int i (0);
T* pt ((T*)val_);
while ((i < smax_) && (*pt != t)) {
++pt; ++i;
}
if (i != smax_) {
T* qt (pt);
++qt;
for (; i != size_ -1; ++i, ++pt, ++qt) {
*pt = *qt;
}
}
--size_;
}
void write (std::ostream& os) const {
const T* pt ((const T*) val_);
for (int i (0); i != size_; ++i, ++pt) os << *pt << " ";
}
bool operator == (const Container& c) const {
if (size_ != c.size_) return false;
const T* pt ((const T*) val_), *qt ((const T*) c.val_);
for (int i (0); i != size_; ++i, ++pt, ++qt) if (*pt != *qt) return false;
return true;
}
bool operator != (const Container& c) const {
return !operator == (c);
}
T& operator = (const Container& c) {
copy (c);
return *this;
}
T last () const {
return val_ [size_ -1];
}
T val_ [smax_];
int size_;
};
Test Program
int main (int argc, char* argv []) {
Container<int> c;
std::cout << "pushing back 5..." << std::endl;
c.push_back (5);
c.write (std::cout);
std::cout << std::endl;
std::cout << "c.last == " << c.last () << std::endl;
std::cout << "pushing back 3..." << std::endl;
c.push_back (3);
c.write (std::cout);
std::cout << std::endl;
std::cout << "c.last == " << c.last () << std::endl;
std::cout << "pushing back 8..." << std::endl;
c.push_back (8);
c.write (std::cout);
std::cout << std::endl;
std::cout << "c.last == " << c.last () << std::endl;
std::cout << "erasing 3..." << std::endl;
c.remove (3);
c.write (std::cout);
std::cout << std::endl;
std::cout << "c.last == " << c.last () << std::endl;
}
and the results :
pushing back 5...
5
c.last == 5
pushing back 3...
5 3
c.last == 3
pushing back 8...
5 3 8
c.last == 8
erasing 3...
5 8
c.last == 8

if you dont want memory reallocation then you can also use list container i.e linked list ..as it has mostly same properties to the vector..just it do not support random access or []operator ...else vector is perfect:)

Why don't I get an segmentation fault when I overflow a vector with my custom allocator?

I've written a custom LinearAllocator that I'm using with a vector. The Allocator can take another Allocator as a template/constructor parameter to allocate its initial storage from. Right now it's making it's initial allocation with the std::allocator just fine, but when I overflow the vector it doesn't give an error. I understand this is undefined behavior so I can't guarantee a segfault, but it consistently allows me to modify 1037600 elements from the beginning of the vector (of size 64).
I printed out and modified each element until it gave me an error:
template <class T, size_t N, class Alloc = std::allocator<uint8_t>>
class LinearAllocator
{
public:
typedef T value_type;
LinearAllocator()
:m_memory{m_alloc.allocate(N)}, m_head{m_memory}
{
}
template<class U>
LinearAllocator(LinearAllocator<U, N> const& other) noexcept
:m_memory{other.m_memory}, m_head{other.m_head}
{}
template<class U>
bool operator==(LinearAllocator<U, N> const& other) const noexcept
{
return m_memory == other.m_memory && m_head == other.m_head;
}
template<class U>
bool operator!=(LinearAllocator<U, N> const& other) const noexcept
{
return !(*this == other);
}
T* allocate(const size_t n)
{
uint8_t* memory = static_cast<uint8_t*>(m_memory);
uint8_t* head = static_cast<uint8_t*>(m_head);
if(n == 0)
{
return nullptr;
}
if(n > static_cast<size_t>(-1) / sizeof(T))
{
throw std::bad_array_new_length();
}
if(n > N) { throw std::bad_alloc(); }
if(memory + N < head + n) { head = memory; }
void* pv = m_head;
head += n;
m_head = static_cast<void*>(head);
return static_cast<T*>(pv);
}
void deallocate(T* const p, size_t) const noexcept
{}
private:
Alloc m_alloc = Alloc();
void* m_memory = nullptr;
void* m_head = nullptr;
};
int main()
{
std::vector<uint8_t, LinearAllocator<uint8_t, 64>> vec(64, 1);
//std::vector<uint8_t> vec(65, 1);
std::cout << (void*)vec.data() << std::endl;
for(size_t i = 0; i < vec.size()+10000000; ++i)
{
std::cout << i << " " << (int)vec[i]++ << " " << (int)vec[i]<< "\n";
}
}
I expected this to fail upon overflow at element 64, since I thought this was heap allocated memory. But it seems to fail at the same point, element 1037663, way past where I expected.
Specifically:
$ ./run
0 1 2
1 1 2
...
1037662 0 1
1037663 0 1
Segmentation fault: 11

The most likely cause is that when you allocated space from std::allocator, it grabbed enough memory from the OS to hold 1037663 elements. The segmentation fault occurs when the OS notices that you've asked to look at a memory address that it hasn't already given your program permission to access. The std::allocator has asked for the other memory from the OS and so the OS will not notice when you exceed the bounds of the vector until you go beyond the bounds of memory provided to the std::allocator.

Memory overwrite in my own Vector class

For my Algorithm course project we can't use STL stuff like std::vector and so I'm trying to implement my own version of it (with templates).
It seems it works but when I declare a Vector< Vector< int > >
the .push() method starts to overwrite memory.
More specifically, with this code:
Vector<Vector<int>> v(3);
cout << v[0].push(0) << "\n";
cout << v[0].push(55) << "\n";
cout << v[0].push(4) << "\n";
cout << v[1].push(12) << "\n";
cout << v[1].push(3) << "\n";
cout << v[2].push(1) << "\n";
The output is this (.push() returns the address of where the element is inserted):
0x561328b0bc20
0x561328b0bc24
0x561328b0bc28
0x561328b0bc20
0x561328b0bc24
0x561328b0bc20
Any suggestion of why this happens?
Here is the code for my Vector class:
#include <iostream>
#include <bits/stdc++.h>
using namespace std;
template<class T>
class Vector {
private:
size_t _size;
size_t _capacity;
char* _buffer; //char* for performance
void _realloc(size_t);
public:
Vector(size_t s=0, T def=T());
T* push(T);
T& operator[](int);
size_t size() { return _size; }
};
template<class T>
void Vector<T>:: _realloc(size_t ncap)
{
_capacity = ncap;
char* nbuf = _buffer;
_buffer = new char[_capacity];
for(size_t i=0; i<_size * sizeof(T); ++i)
_buffer[i] = nbuf[i];
delete[] nbuf;
}
/*
* s -> size
* def -> default value
*/
template<class T>
Vector<T>:: Vector(size_t s, T def) : _size(s)
{
_capacity = 32;
while(_capacity < _size)
_capacity *= 2;
_buffer = new char[_capacity * sizeof(T)];
for(size_t i=0; i<_size; ++i)
((T*)_buffer)[i] = def;
}
/*
* check capacity, reallocs if necessary and push the element
* then return the addres (used only for debug)
*/
template<class T>
T* Vector<T>:: push(T ele)
{
if(_capacity == _size)
_realloc(2 * _capacity);
((T*)_buffer)[_size++] = ele;
return &((T*)_buffer)[_size-1];
}
template<class T>
T& Vector<T>:: operator[](int i)
{
if(i<0 or i>=(int)_size) {
cerr << "Out of bounds!\n";
abort();
}else
return ((T*)_buffer)[i];
}
template<class T>
ostream& operator<<(ostream& out, Vector<T>& v)
{
out << "{";
if(v.size() > 0) {
out << v[0];
for(size_t i=1; i<v.size(); ++i)
out << ", " << v[i];
}
out << "}";
return out;
}
Thanks!
PS: I know it's not a good use of C++ :P

Your operator= implicitly defined does the wrong thing. You use it in your constructor.
So, follow the rule of 0/3/5: Implement copy/move construtors/assignment and destructors, as this is an owning memory-management type. (Non-resource management types should follow the rule of 0; copyable resource management types the rule of 5.)
See rule of three. Implement all of destructor/copy constructor/move constructor/copy assign/move assign, or none of them.
Don't copy the data byte-wise when you realloc. std::move the Ts from the source to the dest.
In the copy construct/assign you'll want to copy the source Ts, not the underlying bytes.

Algorithm to unify contiguous chunks in a collection of chunks

I'm creating a pre-allocator with dynamic memory chunk size, and I need to unify contiguous memory chunks.
struct Chunk // Chunk of memory
{
Ptr begin, end; // [begin, end) range
}
struct PreAlloc
{
std::vector<Chunk> chunks; // I need to unify contiguous chunks here
...
}
I tried a naive solution, that, after sorting the chunks based on their begin, basically did a pass through the vector checking if the next chunk's begin was equal to the current chunk's end. I'm sure it could be improved.
Is there a good algorithm to unify contiguous ranges?
Information:
Chunks can never "overlap".
Chunks can have any size greater than 0.
Performance is the most important factor.

NOTE: there was an error in my original algorithm, where I only considered blocks to the left of the current block.
Use two associative tables (e.g. unordered_map), one mapping the begin address to the Chunk, another mapping the end to the Chunk. This lets you find the neighbouring blocks quickly. Alternatively, you can change the Chunk struct to store a pointer/id/whatever to the neighbouring Chunk, plus a flag to mark to tell if it's free.
The algorithm consists of scanning the vector of chunks once, while maintaining the invariant: if there is a neighbour to the left, you merge them; if there is a neighbour to the right, you merge them. At the end, just collect the remaining chunks.
Here's the code:
void unify(vector<Chunk>& chunks)
{
unordered_map<Ptr, Chunk> begins(chunks.size() * 1.25); // tweak this
unordered_map<Ptr, Chunk> ends(chunks.size() * 1.25); // tweak this
for (Chunk c : chunks) {
// check the left
auto left = ends.find(c.begin);
if (left != ends.end()) { // found something to the left
Chunk neighbour = left->second;
c.begin = neighbour.begin;
begins.erase(neighbour.begin);
ends.erase(left);
}
// check the right
auto right = begins.find(c.end);
if (right != begins.end()) { // found something to the right
Chunk neighbour = right->second;
c.end = neighbour.end;
begins.erase(right);
ends.erase(neighbour.end);
}
begins[c.begin] = c;
ends[c.end] = c;
}
chunks.clear();
for (auto x : begins)
chunks.push_back(x.second);
}
The algorithm has O(n) complexity assuming constant time access to the begins and ends tables (which is nearly what you get if you don't trigger rehashing, hence the "tweak this" comments). There are quite a few options to implement associative tables, make sure to try a few different alternatives; as pointed out in the comment by Ben Jackson, a hash table doesn't always make good use of cache, so even a sorted vector with binary searches might be faster.
If you can change the Chunk structure to store left/right pointers, you get a guaranteed O(1) lookup/insert/remove. Assuming you are doing this to consolidate free chunks of memory, the left/right checking can be done in O(1) during the free() call, so there is no need to consolidate it afterwards.

I think you can not do better then N log(N) - the naive approach. The idea using an unordered associative container I dislike - the hashing will degenerate performance. An improvement might be: keep the chunks sorted at each insert, making 'unify' O(N).
It seems you are writing some allocator, hence I dig up some old code of mine (with some adjustment regarding C++ 11 and without any warranty). The allocator is for small objects having a size <= 32 * sizeof(void*).
Code:
// Copyright (c) 1999, Dieter Lucking.
//
// Permission is hereby granted, free of charge, to any person or organization
// obtaining a copy of the software and accompanying documentation covered by
// this license (the "Software") to use, reproduce, display, distribute,
// execute, and transmit the Software, and to prepare derivative works of the
// Software, and to permit third-parties to whom the Software is furnished to
// do so, all subject to the following:
//
// The copyright notices in the Software and this entire statement, including
// the above license grant, this restriction and the following disclaimer,
// must be included in all copies of the Software, in whole or in part, and
// all derivative works of the Software, unless such copies or derivative
// works are solely in the form of machine-executable object code generated by
// a source language processor.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
// DEALINGS IN THE SOFTWARE.
//
#include <limits>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>
// raw_allocator
// =============================================================================
class raw_allocator
{
// Types
// =====
public:
typedef std::size_t size_type;
typedef std::ptrdiff_t difference_type;
typedef void value_type;
typedef void* pointer;
typedef const void* const_pointer;
typedef unsigned char byte_type;
typedef byte_type* byte_pointer;
typedef const unsigned char* const_byte_pointer;
// Information
// ===========
public:
static size_type max_size() noexcept {
return std::numeric_limits<size_type>::max();
}
static size_type mem_size(size_type) noexcept;
// Allocation.System
// =================
public:
static pointer system_allocate(size_type) noexcept;
static void system_allocate(size_type, pointer&, size_type&) noexcept;
static void system_deallocate(pointer) noexcept;
// Allocation
// ==========
public:
static void allocate(size_type, pointer& result, size_type& capacity) noexcept;
static pointer allocate(size_type n) noexcept {
pointer result;
allocate(n, result, n);
return result;
}
static void deallocate(pointer p, size_type n) noexcept;
// Allocation.Temporary:
//======================
public:
static void allocate_temporary(size_type, pointer& result,
size_type& capacity) noexcept;
static pointer allocate_temporary(size_type n) noexcept {
pointer result;
allocate_temporary(n, result, n);
return result;
}
static void deallocate_temporary(pointer, size_type) noexcept;
// Logging
// =======
public:
static void log(std::ostream& stream);
};
// static_allocator
// =============================================================================
template<class T> class static_allocator;
template<>
class static_allocator<void>
{
public:
typedef void value_type;
typedef void* pointer;
typedef const void* const_pointer;
template<class U> struct rebind
{
typedef static_allocator<U> other;
};
};
template<class T>
class static_allocator
{
// Types
// =====
public:
typedef raw_allocator::size_type size_type;
typedef raw_allocator::difference_type difference_type;
typedef T value_type;
typedef T& reference;
typedef const T& const_reference;
typedef T* pointer;
typedef const T* const_pointer;
template<class U> struct rebind
{
typedef static_allocator<U> other;
};
// Construction/Destruction
// ========================
public:
static_allocator() noexcept {};
static_allocator(const static_allocator&) noexcept {};
~static_allocator() noexcept {};
// Information
// ===========
public:
static size_type max_size() noexcept {
return raw_allocator::max_size() / sizeof(T);
}
static size_type mem_size(size_type n) noexcept {
return raw_allocator::mem_size(n * sizeof(T)) / sizeof(T);
}
static pointer address(reference x) {
return &x;
}
static const_pointer address(const_reference x) {
return &x;
}
// Construct/Destroy
//==================
public:
static void construct(pointer p, const T& value) {
new ((void*) p) T(value);
}
static void destroy(pointer p) {
((T*) p)->~T();
}
// Allocation
//===========
public:
static pointer allocate(size_type n) noexcept {
return (pointer)raw_allocator::allocate(n * sizeof(value_type));
}
static void allocate(size_type n, pointer& result, size_type& capacity) noexcept
{
raw_allocator::pointer p;
raw_allocator::allocate(n * sizeof(value_type), p, capacity);
result = (pointer)(p);
capacity /= sizeof(value_type);
}
static void deallocate(pointer p, size_type n) noexcept {
raw_allocator::deallocate(p, n * sizeof(value_type));
}
// Allocation.Temporary
// ====================
static pointer allocate_temporary(size_type n) noexcept {
return (pointer)raw_allocator::allocate_temporary(n * sizeof(value_type));
}
static void allocate_temporary(size_type n, pointer& result,
size_type& capacity) noexcept
{
raw_allocator::pointer p;
raw_allocator::allocate_temporary(n * sizeof(value_type), p, capacity);
result = (pointer)(p);
capacity /= sizeof(value_type);
}
static void deallocate_temporary(pointer p, size_type n) noexcept {
raw_allocator::deallocate_temporary(p, n);
}
// Logging
// =======
public:
static void log(std::ostream& stream) {
raw_allocator::log(stream);
}
};
template <class T1, class T2>
inline bool operator ==(const static_allocator<T1>&,
const static_allocator<T2>&) noexcept {
return true;
}
template <class T1, class T2>
inline bool operator !=(const static_allocator<T1>&,
const static_allocator<T2>&) noexcept {
return false;
}
// allocator:
// =============================================================================
template<class T> class allocator;
template<>
class allocator<void>
{
public:
typedef static_allocator<void>::value_type value_type;
typedef static_allocator<void>::pointer pointer;
typedef static_allocator<void>::const_pointer const_pointer;
template<class U> struct rebind
{
typedef allocator<U> other;
};
};
template<class T>
class allocator
{
// Types
// =====
public:
typedef typename static_allocator<T>::size_type size_type;
typedef typename static_allocator<T>::difference_type difference_type;
typedef typename static_allocator<T>::value_type value_type;
typedef typename static_allocator<T>::reference reference;
typedef typename static_allocator<T>::const_reference const_reference;
typedef typename static_allocator<T>::pointer pointer;
typedef typename static_allocator<T>::const_pointer const_pointer;
template<class U> struct rebind
{
typedef allocator<U> other;
};
// Constructor/Destructor
// ======================
public:
template <class U>
allocator(const allocator<U>&) noexcept {}
allocator() noexcept {};
allocator(const allocator&) noexcept {};
~allocator() noexcept {};
// Information
// ===========
public:
size_type max_size() const noexcept {
return static_allocator<T>::max_size();
}
pointer address(reference x) const {
return static_allocator<T>::address(x);
}
const_pointer address(const_reference x) const {
return static_allocator<T>::address(x);
}
// Construct/Destroy
// =================
public:
void construct(pointer p, const T& value) {
static_allocator<T>::construct(p, value);
}
void destroy(pointer p) {
static_allocator<T>::destroy(p);
}
// Allocation
// ==========
public:
pointer allocate(size_type n, typename allocator<void>::const_pointer = 0) {
return static_allocator<T>::allocate(n);
}
void deallocate(pointer p, size_type n) {
static_allocator<T>::deallocate(p, n);
}
// Logging
// =======
public:
static void log(std::ostream& stream) {
raw_allocator::log(stream);
}
};
template <class T1, class T2>
inline bool operator ==(const allocator<T1>&, const allocator<T2>&) noexcept {
return true;
}
template <class T1, class T2>
inline bool operator !=(const allocator<T1>&, const allocator<T2>&) noexcept {
return false;
}
// Types
// =============================================================================
typedef raw_allocator::size_type size_type;
typedef raw_allocator::byte_pointer BytePointer;
struct LinkType
{
LinkType* Link;
};
struct FreelistType
{
LinkType* Link;
};
// const
// =============================================================================
// Memory layout:
// ==============
//
// Freelist
// Index Request Alignment
// =============================================================================
// [ 0 ... 7] [ 0 * align ... 8 * align] every 1 * align bytes
// [ 8 ... 11] ( 8 * align ... 16 * align] every 2 * align bytes
// [12 ... 13] ( 16 * align ... 24 * align] every 4 * align bytes
// [14] ( 24 * align ... 32 * align] 8 * align bytes
//
// temporary memory:
// [15] [ 0 * align ... 256 * align] 256 * align
static const unsigned FreeListArraySize = 16;
static const size_type FreelistInitSize = 16;
static const size_type MinAlign =
(8 < 2 * sizeof(void*)) ? 2 * sizeof(void*) : 8;
static const size_type MaxAlign = 32 * MinAlign;
static const size_type MaxIndex = 14;
static const size_type TmpIndex = 15;
static const size_type TmpAlign = 256 * MinAlign;
static const size_type IndexTable[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10,
10, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14 };
static_assert(sizeof(IndexTable) / sizeof(size_type) == MaxAlign / MinAlign, "Invalid Index Table");
inline size_type get_index(size_type n) {
return IndexTable[long(n - 1) / MinAlign];
}
static const size_type AlignTable[] = { MinAlign * 1, MinAlign * 2, MinAlign
* 3, MinAlign * 4, MinAlign * 5, MinAlign * 6, MinAlign * 7, MinAlign * 8,
MinAlign * 10, MinAlign * 12, MinAlign * 14, MinAlign * 16, MinAlign * 20,
MinAlign * 24, MinAlign * 32, TmpAlign, };
static_assert(sizeof(AlignTable) / sizeof(size_type) == TmpIndex + 1, "Invalid Align Table");
inline size_type get_align(size_type i) {
return AlignTable[i];
}
// Thread
// ============================================================================
static LinkType* Freelist[FreeListArraySize];
static BytePointer HeapBeg;
static BytePointer HeapEnd;
static size_type TotalHeapSize;
static std::mutex FreelistMutex[FreeListArraySize] = { };
inline void lock_free_list(size_type i) {
FreelistMutex[i].lock();
}
inline void unlock_free_list(size_type i) {
FreelistMutex[i].unlock();
}
// Allocation
// ============================================================================
// Requiers: freelist[index] is locked
LinkType* allocate_free_list(size_type index) noexcept {
static std::mutex mutex;
const size_type page_size = 4096; // FIXME some system_page_size();
std::lock_guard<std::mutex> guard(mutex);
size_type heap_size = HeapEnd - HeapBeg;
size_type align = get_align(index);
if(heap_size < align) {
LinkType* new_list = (LinkType*)(HeapBeg);
// If a temporary list:
if(MaxAlign <= heap_size) {
LinkType* current = new_list;
LinkType* next;
while(2*MaxAlign <= heap_size) {
next = (LinkType*)(BytePointer(current) + MaxAlign);
current->Link = next;
current = next;
heap_size -= MaxAlign;
}
if(index != MaxIndex) lock_free_list(MaxIndex);
current->Link = Freelist[MaxIndex];
Freelist[MaxIndex] = new_list;
if(index != MaxIndex) unlock_free_list(MaxIndex);
new_list = (LinkType*)(BytePointer(current) + MaxAlign);
heap_size -= MaxAlign;
}
if(MinAlign <= heap_size) {
std::cout << "heap_size: " << heap_size << std::endl;
size_type i = get_index(heap_size);
if(heap_size < get_align(i)) --i;
if(index != i) lock_free_list(i);
new_list->Link = Freelist[i];
Freelist[i] = new_list;
if(index != i) unlock_free_list(i);
}
heap_size = FreelistInitSize * align + TotalHeapSize / FreelistInitSize;
heap_size = (((heap_size - 1) / page_size) + 1) * page_size;
HeapBeg = BytePointer(raw_allocator::system_allocate(heap_size));
if(HeapBeg) {
HeapEnd = HeapBeg + heap_size;
TotalHeapSize += heap_size;
}
else {
HeapEnd = 0;
size_type i = FreeListArraySize;
while(HeapBeg == 0) {
--i;
if(i <= index) return 0;
lock_free_list(i);
if(Freelist[i]) {
heap_size = get_align(i);
HeapBeg = (BytePointer)(Freelist[i]);
HeapEnd = HeapBeg + heap_size;
Freelist[i] = Freelist[i]->Link;
}
unlock_free_list(i);
}
}
}
size_type size = FreelistInitSize * align;
size_type count = FreelistInitSize;
if(heap_size < size) {
count = heap_size / align;
size = align * count;
}
LinkType* beg_list = (LinkType*)(HeapBeg);
LinkType* end_list = beg_list;
while(--count) {
LinkType* init = (LinkType*)(BytePointer(end_list) + align);
end_list->Link = init;
end_list = init;
}
LinkType*& freelist = Freelist[index];
end_list->Link = freelist;
freelist = beg_list;
HeapBeg += size;
return freelist;
}
// raw_allocator
// ============================================================================
// size
// ====
raw_allocator::size_type
raw_allocator::mem_size(size_type n) noexcept {
if( ! n) return 0;
else {
if(n <= MaxAlign) return get_align(get_index(n));
else return ((difference_type(n) - 1) / difference_type(MaxAlign)) * MaxAlign
+ MaxAlign;
}
}
// allocation.system
// =================
raw_allocator::pointer raw_allocator::system_allocate(size_type n) noexcept
{
return ::malloc(n);
}
void raw_allocator::system_allocate(size_type n, pointer& p, size_type& capacity) noexcept
{
capacity = mem_size(n);
p = ::malloc(capacity);
if(p == 0) capacity = 0;
}
void raw_allocator::system_deallocate(pointer p) noexcept {
::free(p);
}
// allocation
// ==========
void raw_allocator::allocate(size_type n, pointer& p, size_type& capacity) noexcept
{
if(n == 0 || MaxAlign < n) system_allocate(n, p, capacity);
else {
p = 0;
capacity = 0;
size_type index = get_index(n);
lock_free_list(index);
LinkType*& freelist = Freelist[index];
if(freelist == 0) {
freelist = allocate_free_list(index);
}
if(freelist != 0) {
p = freelist;
capacity = get_align(index);
freelist = freelist->Link;
}
unlock_free_list(index);
}
}
void raw_allocator::deallocate(pointer p, size_type n) noexcept {
if(p) {
if(n == 0 || MaxAlign < n) system_deallocate(p);
else {
size_type index = get_index(n);
lock_free_list(index);
LinkType*& freelist = Freelist[index];
LinkType* new_list = ((LinkType*)(p));
new_list->Link = freelist;
freelist = new_list;
unlock_free_list(index);
}
}
}
// allocation.temporary
// ====================
void raw_allocator::allocate_temporary(size_type n, pointer& p,
size_type& capacity) noexcept
{
if(n == 0 || size_type(TmpAlign) < n) system_allocate(n, p, capacity);
else {
p = 0;
capacity = 0;
lock_free_list(TmpIndex);
LinkType*& freelist = Freelist[TmpIndex];
if(freelist == 0) freelist = allocate_free_list(TmpIndex);
if(freelist != 0) {
p = freelist;
freelist = freelist->Link;
capacity = TmpAlign;
}
unlock_free_list(TmpIndex);
}
}
void raw_allocator::deallocate_temporary(pointer p, size_type n) noexcept {
if(p) {
if(n == 0 || size_type(TmpAlign) < n) system_deallocate(p);
else {
lock_free_list(TmpIndex);
LinkType*& freelist = Freelist[TmpIndex];
LinkType* new_list = ((LinkType*)(p));
new_list->Link = freelist;
freelist = new_list;
unlock_free_list(TmpIndex);
}
}
}
void raw_allocator::log(std::ostream& stream) {
stream << " Heap Size: " << TotalHeapSize << '\n';
size_type total_size = 0;
for (unsigned i = 0; i < FreeListArraySize; ++i) {
size_type align = get_align(i);
size_type size = 0;
size_type count = 0;
lock_free_list(i);
LinkType* freelist = Freelist[i];
while (freelist) {
size += align;
++count;
freelist = freelist->Link;
}
total_size += size;
unlock_free_list(i);
stream << " Freelist: " << std::setw(4) << align << ": " << size
<< " [" << count << ']' << '\n';
}
size_type heap_size = HeapEnd - HeapBeg;
stream << " Freelists: " << total_size << '\n';
stream << " Free Heap: " << heap_size << '\n';
stream << " Allocated: " << TotalHeapSize - total_size - heap_size
<< '\n';
}
int main() {
const unsigned sample_count = 100000;
std::vector<char*> std_allocate_pointers;
std::vector<char*> allocate_pointers;
std::vector<unsigned> sample_sizes;
typedef std::chrono::nanoseconds duration;
duration std_allocate_duration;
duration std_deallocate_duration;
duration allocate_duration;
duration deallocate_duration;
std::allocator<char> std_allocator;
allocator<char> allocator;
for (unsigned i = 0; i < sample_count; ++i) {
if (std::rand() % 2) {
unsigned size = unsigned(std::rand()) % MaxAlign;
//std::cout << " Allocate: " << size << std::endl;
sample_sizes.push_back(size);
{
auto start = std::chrono::high_resolution_clock::now();
auto p = std_allocator.allocate(size);
auto end = std::chrono::high_resolution_clock::now();
std_allocate_pointers.push_back(p);
std_allocate_duration += std::chrono::duration_cast<duration>(
end - start);
}
{
auto start = std::chrono::high_resolution_clock::now();
auto p = allocator.allocate(size);
auto end = std::chrono::high_resolution_clock::now();
allocate_pointers.push_back(p);
allocate_duration += std::chrono::duration_cast<duration>(
end - start);
}
}
else {
if (!sample_sizes.empty()) {
char* std_p = std_allocate_pointers.back();
char* p = allocate_pointers.back();
unsigned size = sample_sizes.back();
//std::cout << "Deallocate: " << size << std::endl;
{
auto start = std::chrono::high_resolution_clock::now();
std_allocator.deallocate(std_p, size);
auto end = std::chrono::high_resolution_clock::now();
std_deallocate_duration += std::chrono::duration_cast<
duration>(end - start);
}
{
auto start = std::chrono::high_resolution_clock::now();
allocator.deallocate(p, size);
auto end = std::chrono::high_resolution_clock::now();
deallocate_duration += std::chrono::duration_cast<duration>(
end - start);
}
std_allocate_pointers.pop_back();
allocate_pointers.pop_back();
sample_sizes.pop_back();
}
}
}
for (unsigned i = 0; i < sample_sizes.size(); ++i) {
unsigned size = sample_sizes[i];
std_allocator.deallocate(std_allocate_pointers[i], size);
allocator.deallocate(allocate_pointers[i], size);
}
std::cout << "std_allocator: "
<< (std_allocate_duration + std_deallocate_duration).count() << " "
<< std_allocate_duration.count() << " "
<< std_deallocate_duration.count() << std::endl;
std::cout << " allocator: "
<< (allocate_duration + deallocate_duration).count() << " "
<< allocate_duration.count() << " " << deallocate_duration.count()
<< std::endl;
raw_allocator::log(std::cout);
return 0;
}
Note: The raw allocator never release memory to the system (That
might be a bug).
Note: Without optimizations enabled the performance
is lousy (g++ -std=c++11 -O3 ...)
Result:
std_allocator: 11645000 7416000 4229000
allocator: 5155000 2758000 2397000
Heap Size: 94208
Freelist: 16: 256 [16]
Freelist: 32: 640 [20]
Freelist: 48: 768 [16]
Freelist: 64: 1024 [16]
Freelist: 80: 1280 [16]
Freelist: 96: 1536 [16]
Freelist: 112: 1792 [16]
Freelist: 128: 2176 [17]
Freelist: 160: 5760 [36]
Freelist: 192: 6144 [32]
Freelist: 224: 3584 [16]
Freelist: 256: 7936 [31]
Freelist: 320: 10240 [32]
Freelist: 384: 14208 [37]
Freelist: 512: 34304 [67]
Freelist: 4096: 0 [0]
Freelists: 91648
Free Heap: 2560
Allocated: 0

It seemed like an interesting problem so I invested some time in it. The aproach you took is far from being naive. Actually it has pretty good results. It can definetly be optimized further though. I will assume the list of chunks is not already sorted because your algo is probably optimal then.
To optimize it my aproach was to optimize the sort itself eliminating the chunks that can be combined during the sort, thus making the sort faster for the remaining elements.
The code below is basically a modified version of bubble-sort. I also implemented your solution using std::sort just for comparison.
The results are suprisingly good using my also. For a data set of 10 million chunks the combined sort with the merge of chunks performs 20 times faster.
The output of the code is (algo1 is std::sort followed by merging consecutive elements, algo 2 is the sort optimized with removing the chunks that can be merged):
generating input took: 00:00:19.655999
algo 1 took 00:00:00.968738
initial chunks count: 10000000, output chunks count: 3332578
algo 2 took 00:00:00.046875
initial chunks count: 10000000, output chunks count: 3332578
You can probably improve it further using a better sort algo like introsort.
full code:
#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <boost\date_time.hpp>
#define CHUNK_COUNT 10000000
struct Chunk // Chunk of memory
{
char *begin, *end; // [begin, end) range
bool operator<(const Chunk& rhs) const
{
return begin < rhs.begin;
}
};
std::vector<Chunk> in;
void generate_input_data()
{
std::multimap<int, Chunk> input_data;
Chunk chunk;
chunk.begin = 0;
chunk.end = 0;
for (int i = 0; i < CHUNK_COUNT; ++i)
{
int continuous = rand() % 3; // 66% chance of a chunk being continuous
if (continuous)
chunk.begin = chunk.end;
else
chunk.begin = chunk.end + rand() % 100 + 1;
int chunk_size = rand() % 100 + 1;
chunk.end = chunk.begin + chunk_size;
input_data.insert(std::multimap<int, Chunk>::value_type(rand(), chunk));
}
// now we have the chunks randomly ordered in the map
// will copy them in the input vector
for (std::multimap<int, Chunk>::const_iterator it = input_data.begin(); it != input_data.end(); ++it)
in.push_back(it->second);
}
void merge_chunks_sorted(std::vector<Chunk>& chunks)
{
if (in.empty())
return;
std::vector<Chunk> res;
Chunk ch = in[0];
for (size_t i = 1; i < in.size(); ++i)
{
if (in[i].begin == ch.end)
{
ch.end = in[i].end;
} else
{
res.push_back(ch);
ch = in[i];
}
}
res.push_back(ch);
chunks = res;
}
void merge_chunks_orig_algo(std::vector<Chunk>& chunks)
{
std::sort(in.begin(), in.end());
merge_chunks_sorted(chunks);
}
void merge_chunks_new_algo(std::vector<Chunk>& chunks)
{
size_t new_last_n = 0;
Chunk temp;
do {
int last_n = new_last_n;
new_last_n = chunks.size() - 1;
for (int i = chunks.size() - 2; i >= last_n; --i)
{
if (chunks[i].begin > chunks[i + 1].begin)
{
if (chunks[i].begin == chunks[i + 1].end)
{
chunks[i].begin = chunks[i + 1].begin;
if (i + 1 != chunks.size() - 1)
chunks[i + 1] = chunks[chunks.size() - 1];
chunks.pop_back();
} else
{
temp = chunks[i];
chunks[i] = chunks[i + 1];
chunks[i + 1] = temp;
}
new_last_n = i + 1;
} else
{
if (chunks[i].end == chunks[i + 1].begin)
{
chunks[i].end = chunks[i + 1].end;
if (i + 1 != chunks.size() - 1)
chunks[i + 1] = chunks[chunks.size() - 1];
chunks.pop_back();
}
}
}
} while (new_last_n < chunks.size() - 1);
}
void run_algo(void (*algo)(std::vector<Chunk>&))
{
static int count = 1;
// allowing the algo to modify the input vector is intentional
std::vector<Chunk> chunks = in;
size_t in_count = chunks.size();
boost::posix_time::ptime start = boost::posix_time::microsec_clock::local_time();
algo(chunks);
boost::posix_time::ptime stop = boost::posix_time::microsec_clock::local_time();
std::cout<<"algo "<<count++<<" took "<<stop - start<<std::endl;
// if all went ok, statistically we should have around 33% of the original chunks count in the output vector
std::cout<<" initial chunks count: "<<in_count<<", output chunks count: "<<chunks.size()<<std::endl;
}
int main()
{
boost::posix_time::ptime start = boost::posix_time::microsec_clock::local_time();
generate_input_data();
boost::posix_time::ptime stop = boost::posix_time::microsec_clock::local_time();
std::cout<<"generating input took:\t"<<stop - start<<std::endl;
run_algo(merge_chunks_orig_algo);
run_algo(merge_chunks_new_algo);
return 0;
}
I've seen below you mention n is not that high. so I rerun the test with 1000 chunks, 1000000 runs to make the times significant. The modified bubble sort still performs 5 times better. Basically for 1000 chunks total run time is 3 microseconds. Numbers below.
generating input took: 00:00:00
algo 1 took 00:00:15.343456, for 1000000 runs
initial chunks count: 1000, output chunks count: 355
algo 2 took 00:00:03.374935, for 1000000 runs
initial chunks count: 1000, output chunks count: 355

Add pointers to the chunk struct for previous and next adjacent chunk in contiguous memory, if such exists, null otherwise. When a chunk is released you check if adjacent chunks are free, and if they are you merge them and update prev->next and next->prev pointers. This procedure is O(1) and you do it each time a chunk is released.
Some memory allocators put the size of current and previous chunk at the memory position immediately before the address returned by malloc. It is then possible calculate the offset to adjacent chunks without explicit pointers.

The following doesn't require sorted input or provide sorted output. Treat the input as a stack. Pop a chunk off and check if it is adjacent to a member of the initially empty output set. If not, add it to the output set. If it is adjacent, remove the adjacent chunk from the output set and push the new combined chunk onto the input stack. Repeat until input is empty.
vector<Chunk> unify_contiguous(vector<Chunk> input)
{
vector<Chunk> output;
unordered_set<Ptr, Chunk> begins;
unordered_set<Ptr, Chunk> ends;
while (!input.empty())
{
// pop chunk from input
auto chunk = input.back();
input.pop_back();
// chunk end adjacent to an output begin?
auto it = begins.find(chunk.end);
if (it != begins.end())
{
auto end = it->second.end;
Chunk combined{chunk.begin, end};
ends.erase(end);
begins.erase(it);
input.push_back(combined);
continue;
}
// chunk begin adjacent to an output end?
it = ends.find(chunk.begin);
if (it != ends.end())
{
auto begin = it->second.begin;
Chunk combined{begin, chunk.end};
begins.erase(begin);
ends.erase(it);
input.push_back(combined);
continue;
}
// if not add chunk to output
begins[chunk.begin] = chunk;
ends[chunk.end] = chunk;
}
// collect output
for (auto kv : begins)
output.push_back(kv.second);
return output;
}

We Keep Coding

c++ django amazon-web-services regex python-2.7 google-cloud-platform list unit-testing opengl ember.js

C++ lock-free stack is corrupt - c++

Related

How can I optimise edge creation and vertex deletion in an Adjacency set representation of a graph?

What's the suited container to push values on top, remove at whatever index and avoid memory reallocation?

Why don't I get an segmentation fault when I overflow a vector with my custom allocator?

Memory overwrite in my own Vector class

Algorithm to unify contiguous chunks in a collection of chunks

Categories

Resources