I was looking at the sample code for a lock-free queue at:
http://drdobbs.com/high-performance-computing/210604448?pgno=2
(Also reference in many SO questions such as Is there a production ready lock-free queue or hash implementation in C++)
This looks like it should work for a single producer/consumer, although there are a number of typos in the code. I've updated the code to read as shown below, but it's crashing on me. Anybody have suggestions why?
In particular, should divider and last be declared as something like:
atomic<Node *> divider, last; // shared
I don't have a compiler supporting C++0x on this machine, so perhaps that's all I need...
// Implementation from http://drdobbs.com/high-performance-computing/210604448
// Note that the code in that article (10/26/11) is broken.
// The attempted fixed version is below.
template <typename T>
class LockFreeQueue {
private:
struct Node {
Node( T val ) : value(val), next(0) { }
T value;
Node* next;
};
Node *first, // for producer only
*divider, *last; // shared
public:
LockFreeQueue()
{
first = divider = last = new Node(T()); // add dummy separator
}
~LockFreeQueue()
{
while( first != 0 ) // release the list
{
Node* tmp = first;
first = tmp->next;
delete tmp;
}
}
void Produce( const T& t )
{
last->next = new Node(t); // add the new item
last = last->next; // publish it
while (first != divider) // trim unused nodes
{
Node* tmp = first;
first = first->next;
delete tmp;
}
}
bool Consume( T& result )
{
if (divider != last) // if queue is nonempty
{
result = divider->next->value; // C: copy it back
divider = divider->next; // D: publish that we took it
return true; // and report success
}
return false; // else report empty
}
};
I wrote the following code to test this. Main (not shown) just calls TestQ().
#include "LockFreeQueue.h"
const int numThreads = 1;
std::vector<LockFreeQueue<int> > q(numThreads);
void *Solver(void *whichID)
{
int id = (long)whichID;
printf("Thread %d initialized\n", id);
int result = 0;
do {
if (q[id].Consume(result))
{
int y = 0;
for (int x = 0; x < result; x++)
{ y++; }
y = 0;
}
} while (result != -1);
return 0;
}
void TestQ()
{
std::vector<pthread_t> threads;
for (int x = 0; x < numThreads; x++)
{
pthread_t thread;
pthread_create(&thread, NULL, Solver, (void *)x);
threads.push_back(thread);
}
for (int y = 0; y < 1000000; y++)
{
for (unsigned int x = 0; x < threads.size(); x++)
{
q[x].Produce(y);
}
}
for (unsigned int x = 0; x < threads.size(); x++)
{
q[x].Produce(-1);
}
for (unsigned int x = 0; x < threads.size(); x++)
pthread_join(threads[x], 0);
}
Update: It ends up that the crash is being caused by the queue declaration:
std::vector<LockFreeQueue<int> > q(numThreads);
When I change this to be a simple array, it runs fine. (I implemented a version with locks and it was crashing too.) I see that the destructor is being called immediate after the constructor, resulting in doubly-freed memory. But, does anyone know WHY the destructor would be called immediately with a std::vector?
You'll need to make several of the pointers std::atomic, as you note, and you'll need to use compare_exchange_weak in a loop to update them atomically. Otherwise, multiple consumers might consume the same node and multiple producers might corrupt the list.
It's critically important that these writes (just one example from your code) occur in order:
last->next = new Node(t); // add the new item
last = last->next; // publish it
That's not guaranteed by C++ -- the optimizer can rearrange things however it likes, as long as the current thread always acts as-if the program ran exactly the way you wrote it. And then the CPU cache can come along and reorder things further.
You need memory fences. Making the pointers use the atomic type should have that effect.
This could be totally off the mark, but I can't help but wonder whether you're having some sort of static initialization related issue... For laughs, try declaring q as a pointer to a vector of lock-free queues and allocating it on the heap in main().
Related
Bear with me, I'm new to C++. I'm trying to update a value which is stored in a vector, but I'm getting this error:
non-const lvalue reference to type 'Node'
I'm using a simple wrapper around std::vector so I can share methods like contains and others (similar to how the ArrayList is in Java).
#include <vector>
using namespace std;
template <class T> class NewFrames {
public:
// truncated ...
bool contains(T data) {
for(int i = 0; i < this->vec->size(); i++) {
if(this->vec->at(i) == data) {
return true;
}
}
return false;
}
int indexOf(T data) {
for(int i = 0; i < this->vec->size(); i++) {
if(this->vec->at(i) == data) {
return i;
}
}
return -1;
}
T get(int index) {
if(index > this->vec->size()) {
throw std::out_of_range("Cannot get index that exceeds the capacity");
}
return this->vec->at(index);
}
private:
vector<T> *vec;
};
#endif // A2_NEWFRAMES_H
The class which utilizes this wrapper is defined as follows:
#include "Page.h"
#include "NewFrames.h"
class Algo {
private:
typedef struct Node {
unsigned reference:1;
int data;
unsigned long _time;
Node() { }
Node(int data) {
this->data = data;
this->reference = 0;
this->_time = (unsigned long) time(NULL);
}
} Node;
unsigned _faults;
Page page;
NewFrames<Node> *frames;
};
I'm at a point where I need to reference one of the Node objects inside of the vector, but I need to be able to change reference to a different value. From what I've found on SO, I need to do this:
const Node &n = this->frames->get(this->frames->indexOf(data));
I've tried just using:
Node n = this->frames->get(this->frames->indexOf(data));
n.reference = 1;
and then viewing the data in the debugger, but the value is not updated when I check later on. Consider this:
const int data = this->page.pages[i];
const bool contains = this->frames->contains(Node(data));
Node node = this->frames->get(index);
for(unsigned i = 0; i < this->page.pages.size(); i++) {
if(node == NULL && !contains) {
// add node
} else if(contains) {
Node n = this->frames->get(this->frames->indexOf(data));
if(n.reference == 0) {
n.reference = 1;
} else {
n.reference = 0;
}
} else {
// do other stuff
}
}
With subsequent passes of the loop, the node with that particular data value is somehow different.
But if I attempt to change n.reference, I'll get an error because const is preventing the object from changing. Is there a way I can get this node so I can change it? I'm coming from the friendly Java world where something like this would work, but I want to know/understand why this doesn't work in C++.
Node n = this->frames->get(this->frames->indexOf(data));
n.reference = 1;
This copies the Node from frames and stores the copy as the object n. Modifying the copy does not change the original node.
The simplest "fix" is to use a reference. That means changing the return type of get from T to T&, and changing the previous two lines to
Node& n = this->frames->get(this->frames->indexOf(data));
n.reference = 1;
That should get the code to work. But there is so much indirection in the code that there are likely to be other problems that haven't shown up yet. As #nwp said in a comment, using vector<T> instead of vector<T>* will save you many headaches.
And while I'm giving style advice, get rid of those this->s; they're just noise. And simplify the belt-and-suspenders validity checks: when you loop from 0 to vec.size() you don't need to check that the index is okay when you access the element; change vec.at(i) to vec[i]. And in get, note that vec.at(index) will throw an exception if index is out of bounds, so you can either skip the initial range check or keep the check (after fixing it so that it checks the actual range) and, again, use vec[index] instead of vec.at(index).
I'm trying to implement a lock free multiple producer, multiple consumer queue in C++11. I'm doing this as a learning exercise, so I'm well aware that I could just use an existing open source implementation, but I'd really like to find out why my code doesn't work. The data is stored in a ringbuffer, apparently it is a "bounded MPMC queue".
I've modelled it pretty closely to what I've read of Disruptor. The thing I've noticed is that it works absolutely fine with a single consumer and single/multiple producers, it's just multiple consumers which seems to break it.
Here's the queue:
template <typename T>
class Queue : public IQueue<T>
{
public:
explicit Queue( int capacity );
~Queue();
bool try_push( T value );
bool try_pop( T& value );
private:
typedef struct
{
bool readable;
T value;
} Item;
std::atomic<int> m_head;
std::atomic<int> m_tail;
int m_capacity;
Item* m_items;
};
template <typename T>
Queue<T>::Queue( int capacity ) :
m_head( 0 ),
m_tail( 0 ),
m_capacity(capacity),
m_items( new Item[capacity] )
{
for( int i = 0; i < capacity; ++i )
{
m_items[i].readable = false;
}
}
template <typename T>
Queue<T>::~Queue()
{
delete[] m_items;
}
template <typename T>
bool Queue<T>::try_push( T value )
{
while( true )
{
// See that there's room
int tail = m_tail.load(std::memory_order_acquire);
int new_tail = ( tail + 1 );
int head = m_head.load(std::memory_order_acquire);
if( ( new_tail - head ) >= m_capacity )
{
return false;
}
if( m_tail.compare_exchange_weak( tail, new_tail, std::memory_order_acq_rel ) )
{
// In try_pop, m_head is incremented before the reading of the value has completed,
// so though we've acquired this slot, a consumer thread may be in the middle of reading
tail %= m_capacity;
std::atomic_thread_fence( std::memory_order_acquire );
while( m_items[tail].readable )
{
}
m_items[tail].value = value;
std::atomic_thread_fence( std::memory_order_release );
m_items[tail].readable = true;
return true;
}
}
}
template <typename T>
bool Queue<T>::try_pop( T& value )
{
while( true )
{
int head = m_head.load(std::memory_order_acquire);
int tail = m_tail.load(std::memory_order_acquire);
if( head == tail )
{
return false;
}
int new_head = ( head + 1 );
if( m_head.compare_exchange_weak( head, new_head, std::memory_order_acq_rel ) )
{
head %= m_capacity;
std::atomic_thread_fence( std::memory_order_acquire );
while( !m_items[head].readable )
{
}
value = m_items[head].value;
std::atomic_thread_fence( std::memory_order_release );
m_items[head].readable = false;
return true;
}
}
}
And here's the test I'm using:
void Test( std::string name, Queue<int>& queue )
{
const int NUM_PRODUCERS = 64;
const int NUM_CONSUMERS = 2;
const int NUM_ITERATIONS = 512;
bool table[NUM_PRODUCERS*NUM_ITERATIONS];
memset(table, 0, NUM_PRODUCERS*NUM_ITERATIONS*sizeof(bool));
std::vector<std::thread> threads(NUM_PRODUCERS+NUM_CONSUMERS);
std::chrono::system_clock::time_point start, end;
start = std::chrono::system_clock::now();
std::atomic<int> pop_count (NUM_PRODUCERS * NUM_ITERATIONS);
std::atomic<int> push_count (0);
for( int thread_id = 0; thread_id < NUM_PRODUCERS; ++thread_id )
{
threads[thread_id] = std::thread([&queue,thread_id,&push_count]()
{
int base = thread_id * NUM_ITERATIONS;
for( int i = 0; i < NUM_ITERATIONS; ++i )
{
while( !queue.try_push( base + i ) ){};
push_count.fetch_add(1);
}
});
}
for( int thread_id = 0; thread_id < ( NUM_CONSUMERS ); ++thread_id )
{
threads[thread_id+NUM_PRODUCERS] = std::thread([&]()
{
int v;
while( pop_count.load() > 0 )
{
if( queue.try_pop( v ) )
{
if( table[v] )
{
std::cout << v << " already set" << std::endl;
}
table[v] = true;
pop_count.fetch_sub(1);
}
}
});
}
for( int i = 0; i < ( NUM_PRODUCERS + NUM_CONSUMERS ); ++i )
{
threads[i].join();
}
end = std::chrono::system_clock::now();
std::chrono::duration<double> duration = end - start;
std::cout << name << " " << duration.count() << std::endl;
std::atomic_thread_fence( std::memory_order_acq_rel );
bool result = true;
for( int i = 0; i < NUM_PRODUCERS * NUM_ITERATIONS; ++i )
{
if( !table[i] )
{
std::cout << "failed at " << i << std::endl;
result = false;
}
}
std::cout << name << " " << ( result? "success" : "fail" ) << std::endl;
}
Any nudging in the right direction would be greatly appreciated. I'm pretty new to memory fences rather than just using a mutex for everything, so I'm probably just fundamentally misunderstanding something.
Cheers
J
I'd give a look to Moody Camel's implementation.
It is a fast general purpose lock-free queue for C++ entirely written in C++11. Documentation seems to be rather good along with a few performance tests.
Among all other interesting things (they're worth a read anyway), it's all contained in a single header, and available under the simplified BSD license. Just drop it in your project and enjoy!
The simplest approach uses a circular buffer. That is it's like an array of 256 elements and you use uint8_t as index so it wraps around and starts at beginning when you overflow it.
The simplest primitive you can build upon is when you have single producer, single consumer thread.
The buffer has two heads:
Write head: It points the element which will be written next.
Read head: It points to the element which will be read next.
Operation of the producer:
If write Head + 1 == read head, the buffer is full, return buffer full error.
Write content to the element.
Insert memory barrier to sync CPU cores.
Move the write head forward.
At the buffer full case there is still 1 room left, but we reserve that, to distinguish from the buffer empty case.
Operation of the consumer:
If read head == write head, the buffer is empty, return buffer empty error.
Read content of the element.
Insert memory barrier to sync CPU cores.
Move the read head forward.
The producer owns the write head, the consumer owns the read head, there is no concurrency on those. Also the heads are updated when the operation is completed, this ensure the consumer leaves finished elements behind, and the consumes leaves behind fully consumed empty cells.
Create 2 of these pipes in both directions whenever you fork off a new thread and you can have bidirectional communication with your threads.
Given that we are talking about lock freeness it also means none of the threads are blocked, when there is nothing to do the threads are spinning empty, you may want to detect this and add some sleep when it happens.
How about this lock free queue
It is memory ordering lock free queue, but this need to pre-set number of current thread when init the queue.
For example:-
int* ret;
int max_concurrent_thread = 16;
lfqueue_t my_queue;
lfqueue_init(&my_queue, max_concurrent_thread );
/** Wrap This scope in other threads **/
int_data = (int*) malloc(sizeof(int));
assert(int_data != NULL);
*int_data = i++;
/*Enqueue*/
while (lfqueue_enq(&my_queue, int_data) == -1) {
printf("ENQ Full ?\n");
}
/** Wrap This scope in other threads **/
/*Dequeue*/
while ( (int_data = lfqueue_deq(&my_queue)) == NULL) {
printf("DEQ EMPTY ..\n");
}
// printf("%d\n", *(int*) ret );
free(ret);
/** End **/
lfqueue_destroy(&my_queue);
On another similar question, I presented a solution to this problem. I believe that it the smallest found so far.
I will not put same answer here, but the repository has a fully functional C++ implementation of the lock free queue you desire.
EDIT: Thanks to code review from #PeterCordes, I've found a bug on the solution when using 64 bit templates, but now it's working perfectly.
This is the output I receive when running the tests
Creating 4 producers & 4 consumers
to flow 10.000.000 items trough the queue.
Produced: 10.743.668.245.000.000
Consumed: 5.554.289.678.184.004
Produced: 10.743.668.245.000.000
Consumed: 15.217.833.969.059.643
Produced: 10.743.668.245.000.000
Consumed: 7.380.542.769.600.801
Produced: 10.743.668.245.000.000
Consumed: 14.822.006.563.155.552
Checksum: 0 (it must be zero)
I am investigating data structures to satisfy O(1) get operations and came across a structure called Trie.
I have implemented the below simple Trie structure to hold numbers (digits only).
Ignore the memory leak - it is not the topic here :)
The actual storage in the Data class is not related as well.
#include <sstream>
#include <string>
struct Data
{
Data(): m_nData(0){}
int m_nData;
};
struct Node
{
Node(): m_pData(NULL)
{
for (size_t n = 0; n < 10; n++)
{
digits[n] = NULL;
}
}
void m_zAddPartialNumber(std::string sNumber)
{
if (sNumber.empty() == true) // last digit
{
m_pData = new Data;
m_pData->m_nData = 1;
}
else
{
size_t nDigit = *(sNumber.begin()) - '0';
if (digits[nDigit] == NULL)
{
digits[nDigit] = new Node;
}
digits[nDigit]->m_zAddPartialNumber(sNumber.substr(1, sNumber.length() - 1));
}
}
Data* m_pData;
Node* digits[10];
};
struct DB
{
DB() : root(NULL){}
void m_zAddNumber(std::string sNumber)
{
if (root == NULL)
{
root = new Node;
}
root->m_zAddPartialNumber(sNumber);
}
Node* root;
};
int main()
{
DB oDB;
for (size_t nNumber = 0; nNumber <= 10000; nNumber++)
{
std::ostringstream convert;
convert << nNumber;
std::string sNumber = convert.str();
oDB.m_zAddNumber(sNumber);
}
return 0;
}
My main function is simply inserting numbers into the data structure.
I've examined the overall memory allocated using Windows task manager and came across an interesting feature i can't explain and am seeking your advice.
I've re-executed my simple program with different numbers inserted to the structure (altering the for loop stop condition) - here is a table of the experiment results:
Plotting the numbers in a logarithmic scaled graph reveals:
As you can see, the graph is not linear.
My question is why?
I would expect the allocation to behave linear across the range.
A linear relation of y on x is of the form y=a+bx. This is a straight line in a y vs x plot, but not in a log(y) vs log(x) plot, unless the constant a=0. So, I conjecture that your relation may still be (nearly) linear with a~340 kB.
I am debugging a code dealing with vectors and iterators. I get an assertion error when I am clicking the "New" button on my GUI. The assertion error is that in the title, with the addition of /vector Line 251.
I have traced the problem to a part of the code attempting to remove an element from a vector. I will post the entire function and then the line that bugs:
int VsuCNTreeNodeManager::deleteTreeNode(RWCString & CNNameToDelete, RWTValSlist<VsuDeletedCN> & deletedCNList)
{
RWCString childName, parentName;
VsuCNTreeNode *pNode;
int i;
int size;
if (!nodeList.contains(CNNameToDelete))
return 1; // Means that CNNameToDelete doest not exist.
pNode = ordCNList[nodeList[CNNameToDelete]];
travForName.reset();
travForName.processElement(pNode);
const RWTValSlist<RWCString> & childNameList = travForName.getNameList();
size = childNameList.entries();
// If it is the Top node that is deleted then
// the VsuCNTreeNodeManager's top node pointer is reset.
if ( pNode == pTopCNTreeNode )
{
pTopCNTreeNode = NULL;
}
for ( i = 0; i < size; i++)
{
//******* How would it possible to have a name not contained in the nodeList
//******* since it has been extracted from the nodeList ?????????????
childName = childNameList.at(i);
if (nodeList.contains(childName))
{
//******* Process that get the Parent List of each deleted Tree Node
//******* The following code unref all the Tree Nodes that was referencing any deleted Tree Node
pNode = ordCNList[nodeList[childName]]; // Get the Tree Node to be deleted
// Fill the deletedCNList
deletedCNList.insert( VsuDeletedCN(childName, pNode->getCN()->hasType()) );
VsuDependencyRemoverVisitor visitor( *pNode );
for (unsigned int k = 0; k < pNode->getParentList().entries(); k++)
{
parentName = pNode->getParentList().at(k)->getCN()->getName();
if ( nodeList.contains(parentName) ) // Check if the parent is not deleted
{
//*** Remove the reference of the deleted tree node from that parent
RWBoolean status;
status = ordCNList[nodeList[parentName]]->removeElem(childName); // Removing the reference that pNode(parent) had on key(Child)
}
}
//******* Remove references on this object from observers.
pNode->resetObserverFlags();
pNode->updateAllObservers(&visitor);
//******* Process that delete all the Tree Nodes in the parentList
nodeList.remove(childName);
}
}
//*****************update Lists********************
size = ordCNList.entries();
int index = 0;
RWTValHashDictionary<RWCString, int> tmpNodeList(rwhash);
//nodeList.clear();
RWTPtrOrderedVector<VsuCNTreeNode> nodeToDelete(childNameList.entries());
for(i = 0; i < size; i++)
{
pNode = ordCNList[index];
childName = pNode->getCN()->getName();
if (!childNameList.contains(childName))
{
tmpNodeList.insertKeyAndValue(childName, index);
index++;
}
else
{
ordCNList.remove(pNode);
typeList[pNode->getCN()->hasType()].treeNodeList.remove(pNode);
// Decrement type counter and if it reach 0 then
// the entry is removed.
if( !typeList[pNode->getCN()->hasType()].treeNodeList.entries() )
typeList.remove(pNode->getCN()->hasType());
nodeToDelete.insert(pNode);
}
}
nodeList.clear();
nodeList = tmpNodeList;
ordCNList.resize(index);
if (!index)
pTopCNTreeNode = NULL;
for( unsigned int j=0; j < nodeToDelete.entries(); j++)
{
delete nodeToDelete[j];
}
return 0;
}
Now the line that bugs is:
RWBoolean status;
status = ordCNList[nodeList[parentName]]->removeElem(childName);
The definition of the removeElem function is:
RWBoolean VsuVE_Collection::removeElem(const RWCString & data)
{
VsuVE_Moveable *pMyObj = elementList.at(nameList[data]);
return removeElem1(pMyObj);
}
The definition of removeElem1 is:
RWBoolean VsuVE_Collection::removeElem1(VsuVE_Moveable *elem)
{
if (elementList.remove(elem) == FALSE) // THE ASSERTION ERROR HAPPENS RIGHT HERE
return FALSE;
//**** Reordering the nameList
nameList.clear();
int size = elementList.entries();
for (int i = 0; i < size; i++)
{
nameList.insertKeyAndValue(elementList.at(i)->name, i);
}
return TRUE;
}
My guess is that the removeElem function is attempting to remove a vector element that isn't there or that is out of the index range, but I am unable to figure out where exactly I can fix this. Any help is appreciated.
Thanks in advance
It's not obvious which particular framework you're using here (Rogue Wave?), but I think it may be possible to deduce the problem.
The key to decoding this assertion is understanding what incompatible iterators means. In general it means that you're trying to do an operation on a pair of items that don't refer to the same thing. For instance: (with standard library containers)
std::vector<int> v1, v2;
for (auto it=v1.begin(); it!=v2.end(); it++) { // <=== iterator incompatible
}
std::vector<int>::iterator it1=v1.begin();
v2.erase(v1); // <==== iterator incompatible
If you dig into the definition of the iterator types you have, then you should find that when the iterator is created it stores a reference back to the container it was created from. If you then perform an operation on two iterators (as in the first case above) then it can detect that they refer to different containers and hence aren't compatible. In the second case you have an operation on a container and an iterator, and so again it will assert that the iterator refers to that container.
In your case it appears that you're trying to remove an element from a container. The framework is asserting that the item isn't in the container (and in fact is probably in another). I suspect that you're deleting an item from the wrong container.
I was trying to implement exponential tree from documentation, but here is one place in the code which is not clear for me how to implement it:
#include<iostream>
using namespace std;
struct node
{
int level;
int count;
node **child;
int data[];
};
int binary_search(node *ptr,int element)
{
if(element>ptr->data[ptr->count-1]) return ptr->count;
int start=0;
int end=ptr->count-1;
int mid=start+(end-start)/2;
while(start<end)
{
if(element>ptr->data[mid]) { start=mid+1;}
else
{
end=mid;
}
mid=start+(end-start)/2;
}
return mid;
}
void insert(node *root,int element)
{
node *ptr=root,*parent=NULL;
int i=0;
while(ptr!=NULL)
{
int level=ptr->level,count=ptr->count;
i=binary_search(ptr,element);
if(count<level){
for(int j=count;j<=i-1;j--)
ptr->data[j]=ptr->data[j-1];
}
ptr->data[i]=element;
ptr->count=count+1;
return ;
}
parent=ptr,ptr=ptr->child[i];
//Create a new Exponential Node at ith child of parent and
//insert element in that
return ;
}
int main()
{
return 0;
}
Here is a link for the paper I'm referring to:
http://www.ijcaonline.org/volume24/number3/pxc3873876.pdf
This place is in comment, how can I create a new exponential node at level i? Like this?
parent->child[i]=new node;
insert(parent,element);
The presence of the empty array at the end of the structure indicates this is C style code rather than C++ (it's a C Hack for flexible arrays). I'll continue with C style code as idiomatic C++ code would prefer use of standard containers for the child and data members.
Some notes and comments on the following code:
There were a number of issues with the pseudo-code in the linked paper to a point where it is better to ignore it and develop the code from scratch. The indentation levels are unclear where loops end, all the loop indexes are not correct, the check for finding an insertion point is incorrect, etc....
I didn't include any code for deleting the allocated memory so the code will leak as is.
Zero-sized arrays may not be supported by all compilers (I believe it is a C99 feature). For example VS2010 gives me warning C4200 saying it will not generate the default copy/assignment methods.
I added the createNode() function which gives the answer to your original question of how to allocate a node at a given level.
A very basic test was added and appears to work but more thorough tests are needed before I would be comfortable with the code.
Besides the incorrect pseudo-code the paper has a number of other errors or at least questionable content. For example, concerning Figure 2 it says "which clearly depicts that the slope of graph is linear" where as the graph is clearly not linear. Even if the author meant "approaching linear" it is at least stretching the truth. I would also be interested in the set of integers they used for testing which doesn't appear to be mentioned at all. I assumed they used a random set but I would like to see at least several sets of random numbers used as well as several predefined sets such as an already sorted or inversely sorted set.
.
int binary_search(node *ptr, int element)
{
if (ptr->count == 0) return 0;
if (element > ptr->data[ptr->count-1]) return ptr->count;
int start = 0;
int end = ptr->count - 1;
int mid = start + (end - start)/2;
while (start < end)
{
if (element > ptr->data[mid])
start = mid + 1;
else
end = mid;
mid = start + (end - start)/2;
}
return mid;
}
node* createNode (const int level)
{
if (level <= 0) return NULL;
/* Allocate node with 2**(level-1) integers */
node* pNewNode = (node *) malloc(sizeof(node) + sizeof(int)*(1 << (level - 1)));
memset(pNewNode->data, 0, sizeof(int) * (1 << (level - 1 )));
/* Allocate 2**level child node pointers */
pNewNode->child = (node **) malloc(sizeof(node *)* (1 << level));
memset(pNewNode->child, 0, sizeof(int) * (1 << level));
pNewNode->count = 0;
pNewNode->level = level;
return pNewNode;
}
void insert(node *root, int element)
{
node *ptr = root;
node *parent = NULL;
int i = 0;
while (ptr != NULL)
{
int level = ptr->level;
int count = ptr->count;
i = binary_search(ptr, element);
if (count < (1 << (level-1)))
{
for(int j = count; j >= i+1; --j)
ptr->data[j] = ptr->data[j-1];
ptr->data[i] = element;
++ptr->count;
return;
}
parent = ptr;
ptr = ptr->child[i];
}
parent->child[i] = createNode(parent->level + 1);
insert(parent->child[i], element);
}
void InOrderTrace(node *root)
{
if (root == NULL) return;
for (int i = 0; i < root->count; ++i)
{
if (root->child[i]) InOrderTrace(root->child[i]);
printf ("%d\n", root->data[i]);
}
if (root->child[root->count]) InOrderTrace(root->child[root->count]);
}
void testdata (void)
{
node* pRoot = createNode(1);
for (int i = 0; i < 10000; ++i)
{
insert(pRoot, rand());
}
InOrderTrace(pRoot);
}